diff options
Diffstat (limited to 'block')
41 files changed, 1669 insertions, 1039 deletions
diff --git a/block/Makefile b/block/Makefile index b31b05390749..46ada9dc8bbf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -9,7 +9,7 @@ obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ - disk-events.o blk-ia-ranges.o + disk-events.o blk-ia-ranges.o early-lookup.o obj-$(CONFIG_BOUNCE) += bounce.o obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o diff --git a/block/bdev.c b/block/bdev.c index 21c63bfef323..979e28a46b98 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -93,7 +93,7 @@ EXPORT_SYMBOL(invalidate_bdev); * Drop all buffers & page cache for given bdev range. This function bails * with error if bdev has other exclusive owner (such as filesystem). */ -int truncate_bdev_range(struct block_device *bdev, fmode_t mode, +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, loff_t lstart, loff_t lend) { /* @@ -101,14 +101,14 @@ int truncate_bdev_range(struct block_device *bdev, fmode_t mode, * while we discard the buffer cache to avoid discarding buffers * under live filesystem. */ - if (!(mode & FMODE_EXCL)) { - int err = bd_prepare_to_claim(bdev, truncate_bdev_range); + if (!(mode & BLK_OPEN_EXCL)) { + int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); if (err) goto invalidate; } truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(bdev, truncate_bdev_range); return 0; @@ -308,7 +308,7 @@ EXPORT_SYMBOL(thaw_bdev); * pseudo-fs */ -static __cacheline_aligned_in_smp DEFINE_SPINLOCK(bdev_lock); +static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); static struct kmem_cache * bdev_cachep __read_mostly; static struct inode *bdev_alloc_inode(struct super_block *sb) @@ -415,6 +415,7 @@ struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) bdev = I_BDEV(inode); mutex_init(&bdev->bd_fsfreeze_mutex); spin_lock_init(&bdev->bd_size_lock); + mutex_init(&bdev->bd_holder_lock); bdev->bd_partno = partno; bdev->bd_inode = inode; bdev->bd_queue = disk->queue; @@ -463,39 +464,48 @@ long nr_blockdev_pages(void) /** * bd_may_claim - test whether a block device can be claimed * @bdev: block device of interest - * @whole: whole block device containing @bdev, may equal @bdev * @holder: holder trying to claim @bdev + * @hops: holder ops * * Test whether @bdev can be claimed by @holder. * - * CONTEXT: - * spin_lock(&bdev_lock). - * * RETURNS: * %true if @bdev can be claimed, %false otherwise. */ -static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, - void *holder) +static bool bd_may_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { - if (bdev->bd_holder == holder) - return true; /* already a holder */ - else if (bdev->bd_holder != NULL) - return false; /* held by someone else */ - else if (whole == bdev) - return true; /* is a whole device which isn't held */ - - else if (whole->bd_holder == bd_may_claim) - return true; /* is a partition of a device that is being partitioned */ - else if (whole->bd_holder != NULL) - return false; /* is a partition of a held device */ - else - return true; /* is a partition of an un-held device */ + struct block_device *whole = bdev_whole(bdev); + + lockdep_assert_held(&bdev_lock); + + if (bdev->bd_holder) { + /* + * The same holder can always re-claim. + */ + if (bdev->bd_holder == holder) { + if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) + return false; + return true; + } + return false; + } + + /* + * If the whole devices holder is set to bd_may_claim, a partition on + * the device is claimed, but not the whole device. + */ + if (whole != bdev && + whole->bd_holder && whole->bd_holder != bd_may_claim) + return false; + return true; } /** * bd_prepare_to_claim - claim a block device * @bdev: block device of interest * @holder: holder trying to claim @bdev + * @hops: holder ops. * * Claim @bdev. This function fails if @bdev is already claimed by another * holder and waits if another claiming is in progress. return, the caller @@ -504,17 +514,18 @@ static bool bd_may_claim(struct block_device *bdev, struct block_device *whole, * RETURNS: * 0 if @bdev can be claimed, -EBUSY otherwise. */ -int bd_prepare_to_claim(struct block_device *bdev, void *holder) +int bd_prepare_to_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); if (WARN_ON_ONCE(!holder)) return -EINVAL; retry: - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); /* if someone else claimed, fail */ - if (!bd_may_claim(bdev, whole, holder)) { - spin_unlock(&bdev_lock); + if (!bd_may_claim(bdev, holder, hops)) { + mutex_unlock(&bdev_lock); return -EBUSY; } @@ -524,7 +535,7 @@ retry: DEFINE_WAIT(wait); prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); schedule(); finish_wait(wq, &wait); goto retry; @@ -532,7 +543,7 @@ retry: /* yay, all mine */ whole->bd_claiming = holder; - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); return 0; } EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ @@ -550,16 +561,18 @@ static void bd_clear_claiming(struct block_device *whole, void *holder) * bd_finish_claiming - finish claiming of a block device * @bdev: block device of interest * @holder: holder that has claimed @bdev + * @hops: block device holder operations * * Finish exclusive open of a block device. Mark the device as exlusively * open by the holder and wake up all waiters for exclusive open to finish. */ -static void bd_finish_claiming(struct block_device *bdev, void *holder) +static void bd_finish_claiming(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) { struct block_device *whole = bdev_whole(bdev); - spin_lock(&bdev_lock); - BUG_ON(!bd_may_claim(bdev, whole, holder)); + mutex_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, holder, hops)); /* * Note that for a whole device bd_holders will be incremented twice, * and bd_holder will be set to bd_may_claim before being set to holder @@ -567,9 +580,12 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) whole->bd_holders++; whole->bd_holder = bd_may_claim; bdev->bd_holders++; + mutex_lock(&bdev->bd_holder_lock); bdev->bd_holder = holder; + bdev->bd_holder_ops = hops; + mutex_unlock(&bdev->bd_holder_lock); bd_clear_claiming(whole, holder); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); } /** @@ -583,12 +599,47 @@ static void bd_finish_claiming(struct block_device *bdev, void *holder) */ void bd_abort_claiming(struct block_device *bdev, void *holder) { - spin_lock(&bdev_lock); + mutex_lock(&bdev_lock); bd_clear_claiming(bdev_whole(bdev), holder); - spin_unlock(&bdev_lock); + mutex_unlock(&bdev_lock); } EXPORT_SYMBOL(bd_abort_claiming); +static void bd_end_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev_whole(bdev); + bool unblock = false; + + /* + * Release a claim on the device. The holder fields are protected with + * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. + */ + mutex_lock(&bdev_lock); + WARN_ON_ONCE(bdev->bd_holder != holder); + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); + if (!bdev->bd_holders) { + mutex_lock(&bdev->bd_holder_lock); + bdev->bd_holder = NULL; + bdev->bd_holder_ops = NULL; + mutex_unlock(&bdev->bd_holder_lock); + if (bdev->bd_write_holder) + unblock = true; + } + if (!whole->bd_holders) + whole->bd_holder = NULL; + mutex_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and unblock evpoll if + * it was a write holder. + */ + if (unblock) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } +} + static void blkdev_flush_mapping(struct block_device *bdev) { WARN_ON_ONCE(bdev->bd_holders); @@ -597,13 +648,13 @@ static void blkdev_flush_mapping(struct block_device *bdev) bdev_write_inode(bdev); } -static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) +static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) { struct gendisk *disk = bdev->bd_disk; int ret; if (disk->fops->open) { - ret = disk->fops->open(bdev, mode); + ret = disk->fops->open(disk, mode); if (ret) { /* avoid ghost partitions on a removed medium */ if (ret == -ENOMEDIUM && @@ -621,22 +672,19 @@ static int blkdev_get_whole(struct block_device *bdev, fmode_t mode) return 0; } -static void blkdev_put_whole(struct block_device *bdev, fmode_t mode) +static void blkdev_put_whole(struct block_device *bdev) { if (atomic_dec_and_test(&bdev->bd_openers)) blkdev_flush_mapping(bdev); if (bdev->bd_disk->fops->release) - bdev->bd_disk->fops->release(bdev->bd_disk, mode); + bdev->bd_disk->fops->release(bdev->bd_disk); } -static int blkdev_get_part(struct block_device *part, fmode_t mode) +static int blkdev_get_part(struct block_device *part, blk_mode_t mode) { struct gendisk *disk = part->bd_disk; int ret; - if (atomic_read(&part->bd_openers)) - goto done; - ret = blkdev_get_whole(bdev_whole(part), mode); if (ret) return ret; @@ -645,26 +693,27 @@ static int blkdev_get_part(struct block_device *part, fmode_t mode) if (!bdev_nr_sectors(part)) goto out_blkdev_put; - disk->open_partitions++; - set_init_blocksize(part); -done: + if (!atomic_read(&part->bd_openers)) { + disk->open_partitions++; + set_init_blocksize(part); + } atomic_inc(&part->bd_openers); return 0; out_blkdev_put: - blkdev_put_whole(bdev_whole(part), mode); + blkdev_put_whole(bdev_whole(part)); return ret; } -static void blkdev_put_part(struct block_device *part, fmode_t mode) +static void blkdev_put_part(struct block_device *part) { struct block_device *whole = bdev_whole(part); - if (!atomic_dec_and_test(&part->bd_openers)) - return; - blkdev_flush_mapping(part); - whole->bd_disk->open_partitions--; - blkdev_put_whole(whole, mode); + if (atomic_dec_and_test(&part->bd_openers)) { + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + } + blkdev_put_whole(whole); } struct block_device *blkdev_get_no_open(dev_t dev) @@ -695,17 +744,17 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + /** * blkdev_get_by_dev - open a block device by device number * @dev: device number of block device to open - * @mode: FMODE_* mask + * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier + * @hops: holder operations * - * Open the block device described by device number @dev. If @mode includes - * %FMODE_EXCL, the block device is opened with exclusive access. Specifying - * %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may nest for - * the same @holder. + * Open the block device described by device number @dev. If @holder is not + * %NULL, the block device is opened with exclusive access. Exclusive opens may + * nest for the same @holder. * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device @@ -717,7 +766,8 @@ void blkdev_put_no_open(struct block_device *bdev) * RETURNS: * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) +struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) { bool unblock_events = true; struct block_device *bdev; @@ -726,8 +776,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), - ((mode & FMODE_READ) ? DEVCG_ACC_READ : 0) | - ((mode & FMODE_WRITE) ? DEVCG_ACC_WRITE : 0)); + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) return ERR_PTR(ret); @@ -736,10 +786,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) return ERR_PTR(-ENXIO); disk = bdev->bd_disk; - if (mode & FMODE_EXCL) { - ret = bd_prepare_to_claim(bdev, holder); + if (holder) { + mode |= BLK_OPEN_EXCL; + ret = bd_prepare_to_claim(bdev, holder, hops); if (ret) goto put_blkdev; + } else { + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { + ret = -EIO; + goto put_blkdev; + } } disk_block_events(disk); @@ -756,8 +812,8 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; - if (mode & FMODE_EXCL) { - bd_finish_claiming(bdev, holder); + if (holder) { + bd_finish_claiming(bdev, holder, hops); /* * Block event polling for write claims if requested. Any write @@ -766,7 +822,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) * writeable reference is too fragile given the way @mode is * used in blkdev_get/put(). */ - if ((mode & FMODE_WRITE) && !bdev->bd_write_holder && + if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder && (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { bdev->bd_write_holder = true; unblock_events = false; @@ -780,7 +836,7 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) put_module: module_put(disk->fops->owner); abort_claiming: - if (mode & FMODE_EXCL) + if (holder) bd_abort_claiming(bdev, holder); mutex_unlock(&disk->open_mutex); disk_unblock_events(disk); @@ -793,13 +849,13 @@ EXPORT_SYMBOL(blkdev_get_by_dev); /** * blkdev_get_by_path - open a block device by name * @path: path to the block device to open - * @mode: FMODE_* mask + * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier + * @hops: holder operations * - * Open the block device described by the device file at @path. If @mode - * includes %FMODE_EXCL, the block device is opened with exclusive access. - * Specifying %FMODE_EXCL with a %NULL @holder is invalid. Exclusive opens may - * nest for the same @holder. + * Open the block device described by the device file at @path. If @holder is + * not %NULL, the block device is opened with exclusive access. Exclusive opens + * may nest for the same @holder. * * CONTEXT: * Might sleep. @@ -807,8 +863,8 @@ EXPORT_SYMBOL(blkdev_get_by_dev); * RETURNS: * Reference to the block_device on success, ERR_PTR(-errno) on failure. */ -struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, - void *holder) +struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops) { struct block_device *bdev; dev_t dev; @@ -818,9 +874,9 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, if (error) return ERR_PTR(error); - bdev = blkdev_get_by_dev(dev, mode, holder); - if (!IS_ERR(bdev) && (mode & FMODE_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, mode); + bdev = blkdev_get_by_dev(dev, mode, holder, hops); + if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { + blkdev_put(bdev, holder); return ERR_PTR(-EACCES); } @@ -828,7 +884,7 @@ struct block_device *blkdev_get_by_path(const char *path, fmode_t mode, } EXPORT_SYMBOL(blkdev_get_by_path); -void blkdev_put(struct block_device *bdev, fmode_t mode) +void blkdev_put(struct block_device *bdev, void *holder) { struct gendisk *disk = bdev->bd_disk; @@ -843,36 +899,8 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (mode & FMODE_EXCL) { - struct block_device *whole = bdev_whole(bdev); - bool bdev_free; - - /* - * Release a claim on the device. The holder fields - * are protected with bdev_lock. open_mutex is to - * synchronize disk_holder unlinking. - */ - spin_lock(&bdev_lock); - - WARN_ON_ONCE(--bdev->bd_holders < 0); - WARN_ON_ONCE(--whole->bd_holders < 0); - - if ((bdev_free = !bdev->bd_holders)) - bdev->bd_holder = NULL; - if (!whole->bd_holders) - whole->bd_holder = NULL; - - spin_unlock(&bdev_lock); - - /* - * If this was the last claim, remove holder link and - * unblock evpoll if it was a write holder. - */ - if (bdev_free && bdev->bd_write_holder) { - disk_unblock_events(disk); - bdev->bd_write_holder = false; - } - } + if (holder) + bd_end_claim(bdev, holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -882,9 +910,9 @@ void blkdev_put(struct block_device *bdev, fmode_t mode) disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); if (bdev_is_partition(bdev)) - blkdev_put_part(bdev, mode); + blkdev_put_part(bdev); else - blkdev_put_whole(bdev, mode); + blkdev_put_whole(bdev); mutex_unlock(&disk->open_mutex); module_put(disk->fops->owner); diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 3164e3177965..3cce6de464a7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -5403,6 +5403,10 @@ void bfq_put_queue(struct bfq_queue *bfqq) if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; + WARN_ON_ONCE(!list_empty(&bfqq->fifo)); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list)); + WARN_ON_ONCE(bfqq->dispatched); + kmem_cache_free(bfq_pool, bfqq); bfqg_and_blkg_put(bfqg); } @@ -5524,16 +5528,16 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; break; } @@ -5830,7 +5834,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bfq_io_cq *bic, bool respawn) { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; struct bfq_queue *bfqq; @@ -7135,6 +7139,7 @@ static void bfq_exit_queue(struct elevator_queue *e) { struct bfq_data *bfqd = e->elevator_data; struct bfq_queue *bfqq, *n; + unsigned int actuator; hrtimer_cancel(&bfqd->idle_slice_timer); @@ -7143,6 +7148,10 @@ static void bfq_exit_queue(struct elevator_queue *e) bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(&bfqd->lock); + for (actuator = 0; actuator < bfqd->num_actuators; actuator++) + WARN_ON_ONCE(bfqd->rq_in_driver[actuator]); + WARN_ON_ONCE(bfqd->tot_rq_in_driver); + hrtimer_cancel(&bfqd->idle_slice_timer); /* release oom-queue reference to root group */ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 4533eb491661..6f81c10757fb 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -123,17 +123,34 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - if (bip->bip_vcnt && - bvec_gap_to_prev(&bdev_get_queue(bio->bi_bdev)->limits, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; + + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; diff --git a/block/bio.c b/block/bio.c index e07bea077db8..39c6acdd777d 100644 --- a/block/bio.c +++ b/block/bio.c @@ -942,9 +942,8 @@ static inline bool bio_full(struct bio *bio, unsigned len) return false; } -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { size_t bv_end = bv->bv_offset + bv->bv_len; phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; @@ -958,49 +957,15 @@ static inline bool page_is_mergeable(const struct bio_vec *bv, return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (*same_page) - return true; - else if (IS_ENABLED(CONFIG_KMSAN)) - return false; - return (bv->bv_page + bv_end / PAGE_SIZE) == (page + off / PAGE_SIZE); -} - -/** - * __bio_try_merge_page - try appending data to an existing bvec. - * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * useful optimisation for file systems with a block size smaller than the - * page size. - * - * Warn if (@len, @off) crosses pages in case that @same_page is true. - * - * Return %true on success or %false on failure. - */ -static bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) -{ - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; - - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; - - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) { - *same_page = false; - return false; - } - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; } - return false; + + bv->bv_len += len; + return true; } /* @@ -1008,11 +973,10 @@ static bool __bio_try_merge_page(struct bio *bio, struct page *page, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; @@ -1021,7 +985,7 @@ static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, return false; if (bv->bv_len + len > queue_max_segment_size(q)) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -1041,8 +1005,6 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; @@ -1050,15 +1012,19 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. */ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(&q->limits, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } @@ -1168,15 +1134,33 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return 0; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + return len; } + + if (bio_full(bio, len)) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); +void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, + size_t off) +{ + WARN_ON_ONCE(len > UINT_MAX); + WARN_ON_ONCE(off > UINT_MAX); + __bio_add_page(bio, &folio->page, len, off); +} + /** * bio_add_folio - Attempt to add part of a folio to a bio. * @bio: BIO to add to. @@ -1208,7 +1192,7 @@ void __bio_release_pages(struct bio *bio, bool mark_dirty) bio_for_each_segment_all(bvec, bio, iter_all) { if (mark_dirty && !PageCompound(bvec->bv_page)) set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); + bio_release_page(bio, bvec->bv_page); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1230,7 +1214,6 @@ void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) bio->bi_io_vec = (struct bio_vec *)iter->bvec; bio->bi_iter.bi_bvec_done = iter->iov_offset; bio->bi_iter.bi_size = size; - bio_set_flag(bio, BIO_NO_PAGE_REF); bio_set_flag(bio, BIO_CLONED); } @@ -1239,13 +1222,18 @@ static int bio_iov_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - __bio_add_page(bio, page, len, offset); + if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + if (same_page) + bio_release_page(bio, page); return 0; } - - if (same_page) - put_page(page); + __bio_add_page(bio, page, len, offset); return 0; } @@ -1259,7 +1247,7 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; if (same_page) - put_page(page); + bio_release_page(bio, page); return 0; } @@ -1270,10 +1258,10 @@ static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * next non-empty segment of the iov iterator. + * Extracts pages from *iter and appends them to @bio's bvec array. The pages + * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. + * For a multi-segment *iter, this function only adds pages from the next + * non-empty segment of the iov iterator. */ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { @@ -1305,9 +1293,9 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) * result to ensure the bio's total size is correct. The remainder of * the iov data will be picked up in the next bio iteration. */ - size = iov_iter_get_pages(iter, pages, - UINT_MAX - bio->bi_iter.bi_size, - nr_pages, &offset, extraction_flags); + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? size : -EFAULT; @@ -1340,7 +1328,7 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) iov_iter_revert(iter, left); out: while (i < nr_pages) - put_page(pages[i++]); + bio_release_page(bio, pages[i++]); return ret; } @@ -1369,12 +1357,17 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { int ret = 0; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + if (iov_iter_is_bvec(iter)) { bio_iov_bvec_set(bio, iter); iov_iter_advance(iter, bio->bi_iter.bi_size); return 0; } + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); do { ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); @@ -1528,8 +1521,8 @@ void bio_set_pages_dirty(struct bio *bio) * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * here on. It will unpin each page and will run one bio_put() against the + * BIO. */ static void bio_dirty_fn(struct work_struct *work); diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c index 842e5e1c0f3c..3ec21333f393 100644 --- a/block/blk-cgroup-fc-appid.c +++ b/block/blk-cgroup-fc-appid.c @@ -34,7 +34,7 @@ int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) * the vmid from the fabric. * Adding the overhead of a lock is not necessary. */ - strlcpy(blkcg->fc_app_id, app_id, app_id_len); + strscpy(blkcg->fc_app_id, app_id, app_id_len); css_put(css); out_cgrp_put: cgroup_put(cgrp); diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index dce1548a7a0c..4a42ea2972ad 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -136,7 +136,9 @@ static void blkg_free_workfn(struct work_struct *work) blkcg_policy[i]->pd_free_fn(blkg->pd[i]); if (blkg->parent) blkg_put(blkg->parent); + spin_lock_irq(&q->queue_lock); list_del_init(&blkg->q_node); + spin_unlock_irq(&q->queue_lock); mutex_unlock(&q->blkcg_mutex); blk_put_queue(q); @@ -624,8 +626,13 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkg_iostat_set *bis = per_cpu_ptr(blkg->iostat_cpu, cpu); memset(bis, 0, sizeof(*bis)); + + /* Re-initialize the cleared blkg_iostat_set */ + u64_stats_init(&bis->sync); + bis->blkg = blkg; } memset(&blkg->iostat, 0, sizeof(blkg->iostat)); + u64_stats_init(&blkg->iostat.sync); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -762,6 +769,13 @@ int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) return -ENODEV; } + mutex_lock(&bdev->bd_queue->rq_qos_mutex); + if (!disk_live(bdev->bd_disk)) { + blkdev_put_no_open(bdev); + mutex_unlock(&bdev->bd_queue->rq_qos_mutex); + return -ENODEV; + } + ctx->body = input; ctx->bdev = bdev; return 0; @@ -906,6 +920,7 @@ EXPORT_SYMBOL_GPL(blkg_conf_prep); */ void blkg_conf_exit(struct blkg_conf_ctx *ctx) __releases(&ctx->bdev->bd_queue->queue_lock) + __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { if (ctx->blkg) { spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); @@ -913,6 +928,7 @@ void blkg_conf_exit(struct blkg_conf_ctx *ctx) } if (ctx->bdev) { + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); blkdev_put_no_open(ctx->bdev); ctx->body = NULL; ctx->bdev = NULL; @@ -1495,7 +1511,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1533,21 +1549,20 @@ retry: goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - pd->online = false; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); - list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { if (pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[pol->plid]); - blkg->pd[pol->plid]->online = true; + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); } __set_bit(pol->plid, q->blkcg_pols); @@ -1564,14 +1579,19 @@ out: return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; spin_lock(&blkcg->lock); - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } spin_unlock(&blkcg->lock); @@ -2072,6 +2092,9 @@ void blk_cgroup_bio_start(struct bio *bio) struct blkg_iostat_set *bis; unsigned long flags; + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + /* Root-level stats are sourced from system-wide IO stats */ if (!cgroup_parent(blkcg->css.cgroup)) return; @@ -2102,8 +2125,7 @@ void blk_cgroup_bio_start(struct bio *bio) } u64_stats_update_end_irqrestore(&bis->sync, flags); - if (cgroup_subsys_on_dfl(io_cgrp_subsys)) - cgroup_rstat_updated(blkcg->css.cgroup, cpu); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); put_cpu(); } diff --git a/block/blk-core.c b/block/blk-core.c index 1da77e7d6289..9866468c72a2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -155,7 +155,7 @@ static const struct { [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, - [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, @@ -170,6 +170,9 @@ static const struct { [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + /* Command duration limit device-side timeout */ + [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -420,6 +423,7 @@ struct request_queue *blk_alloc_queue(int node_id) mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); @@ -718,14 +722,9 @@ void submit_bio_noacct(struct bio *bio) struct block_device *bdev = bio->bi_bdev; struct request_queue *q = bdev_get_queue(bdev); blk_status_t status = BLK_STS_IOERR; - struct blk_plug *plug; might_sleep(); - plug = blk_mq_plug(bio); - if (plug && plug->nowait) - bio->bi_opf |= REQ_NOWAIT; - /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP * if queue does not support NOWAIT. @@ -1055,7 +1054,6 @@ void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) plug->rq_count = 0; plug->multiple_queues = false; plug->has_elevator = false; - plug->nowait = false; INIT_LIST_HEAD(&plug->cb_list); /* @@ -1140,8 +1138,7 @@ void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { if (!list_empty(&plug->cb_list)) flush_plug_callbacks(plug, from_schedule); - if (!rq_list_empty(plug->mq_list)) - blk_mq_flush_plug_list(plug, from_schedule); + blk_mq_flush_plug_list(plug, from_schedule); /* * Unconditionally flush out cached requests, even if the unplug * event came from schedule. Since we know hold references to the diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index ad9844c5b40c..e6468eab2681 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -78,7 +78,7 @@ static struct blk_crypto_fallback_keyslot { struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_crypto_profile blk_crypto_fallback_profile; +static struct blk_crypto_profile *blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; static struct bio_set crypto_bio_split; @@ -292,7 +292,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for * this bio's algorithm and key. */ - blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; @@ -395,7 +395,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for * this bio's algorithm and key. */ - blk_st = blk_crypto_get_keyslot(&blk_crypto_fallback_profile, + blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; @@ -499,7 +499,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!__blk_crypto_cfg_supported(&blk_crypto_fallback_profile, + if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile, &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; @@ -526,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return __blk_crypto_evict_key(&blk_crypto_fallback_profile, key); + return __blk_crypto_evict_key(blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -534,7 +534,6 @@ static int blk_crypto_fallback_init(void) { int i; int err; - struct blk_crypto_profile *profile = &blk_crypto_fallback_profile; if (blk_crypto_fallback_inited) return 0; @@ -545,18 +544,27 @@ static int blk_crypto_fallback_init(void) if (err) goto out; - err = blk_crypto_profile_init(profile, blk_crypto_num_keyslots); - if (err) + /* Dynamic allocation is needed because of lockdep_register_key(). */ + blk_crypto_fallback_profile = + kzalloc(sizeof(*blk_crypto_fallback_profile), GFP_KERNEL); + if (!blk_crypto_fallback_profile) { + err = -ENOMEM; goto fail_free_bioset; + } + + err = blk_crypto_profile_init(blk_crypto_fallback_profile, + blk_crypto_num_keyslots); + if (err) + goto fail_free_profile; err = -ENOMEM; - profile->ll_ops = blk_crypto_fallback_ll_ops; - profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; + blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; /* All blk-crypto modes have a crypto API fallback. */ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - profile->modes_supported[i] = 0xFFFFFFFF; - profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + blk_crypto_fallback_profile->modes_supported[i] = 0xFFFFFFFF; + blk_crypto_fallback_profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | @@ -597,7 +605,9 @@ fail_free_keyslots: fail_free_wq: destroy_workqueue(blk_crypto_wq); fail_destroy_profile: - blk_crypto_profile_destroy(profile); + blk_crypto_profile_destroy(blk_crypto_fallback_profile); +fail_free_profile: + kfree(blk_crypto_fallback_profile); fail_free_bioset: bioset_exit(&crypto_bio_split); out: diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c index 2a67d3fb63e5..7fabc883e39f 100644 --- a/block/blk-crypto-profile.c +++ b/block/blk-crypto-profile.c @@ -79,7 +79,14 @@ int blk_crypto_profile_init(struct blk_crypto_profile *profile, unsigned int slot_hashtable_size; memset(profile, 0, sizeof(*profile)); - init_rwsem(&profile->lock); + + /* + * profile->lock of an underlying device can nest inside profile->lock + * of a device-mapper device, so use a dynamic lock class to avoid + * false-positive lockdep reports. + */ + lockdep_register_key(&profile->lockdep_key); + __init_rwsem(&profile->lock, "&profile->lock", &profile->lockdep_key); if (num_slots == 0) return 0; @@ -89,7 +96,7 @@ int blk_crypto_profile_init(struct blk_crypto_profile *profile, profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), GFP_KERNEL); if (!profile->slots) - return -ENOMEM; + goto err_destroy; profile->num_slots = num_slots; @@ -435,6 +442,7 @@ void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) { if (!profile) return; + lockdep_unregister_key(&profile->lockdep_key); kvfree(profile->slot_hashtable); kvfree_sensitive(profile->slots, sizeof(profile->slots[0]) * profile->num_slots); diff --git a/block/blk-flush.c b/block/blk-flush.c index 04698ed9bcd4..fdc489e0ea16 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -188,7 +188,9 @@ static void blk_flush_complete_seq(struct request *rq, case REQ_FSEQ_DATA: list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + spin_lock(&q->requeue_lock); + list_add(&rq->queuelist, &q->requeue_list); + spin_unlock(&q->requeue_lock); blk_mq_kick_requeue_list(q); break; @@ -346,7 +348,10 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, smp_wmb(); req_ref_set(flush_rq, 1); - blk_mq_add_to_requeue_list(flush_rq, 0); + spin_lock(&q->requeue_lock); + list_add_tail(&flush_rq->queuelist, &q->flush_list); + spin_unlock(&q->requeue_lock); + blk_mq_kick_requeue_list(q); } @@ -376,22 +381,29 @@ static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, return RQ_END_IO_NONE; } -/** - * blk_insert_flush - insert a new PREFLUSH/FUA request - * @rq: request to insert - * - * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. - * or __blk_mq_run_hw_queue() to dispatch request. - * @rq is being submitted. Analyze what needs to be done and put it on the - * right queue. +static void blk_rq_init_flush(struct request *rq) +{ + rq->flush.seq = 0; + INIT_LIST_HEAD(&rq->flush.list); + rq->rq_flags |= RQF_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + rq->end_io = mq_flush_data_end_io; +} + +/* + * Insert a PREFLUSH/FUA request into the flush state machine. + * Returns true if the request has been consumed by the flush state machine, + * or false if the caller should continue to process it. */ -void blk_insert_flush(struct request *rq) +bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + /* FLUSH/FUA request must never be merged */ + WARN_ON_ONCE(rq->bio != rq->biotail); /* * @policy now records what operations need to be done. Adjust @@ -408,45 +420,45 @@ void blk_insert_flush(struct request *rq) */ rq->cmd_flags |= REQ_SYNC; - /* - * An empty flush handed down from a stacking driver may - * translate into nothing if the underlying device does not - * advertise a write-back cache. In this case, simply - * complete the request. - */ - if (!policy) { + switch (policy) { + case 0: + /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. + */ blk_mq_end_request(rq, 0); - return; - } - - BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ - - /* - * If there's data but flush is not necessary, the request can be - * processed directly without going through flush machinery. Queue - * for normal execution. - */ - if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, 0); - blk_mq_run_hw_queue(hctx, false); - return; + return true; + case REQ_FSEQ_DATA: + /* + * If there's data, but no flush is necessary, the request can + * be processed directly without going through flush machinery. + * Queue for normal execution. + */ + return false; + case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: + /* + * Initialize the flush fields and completion handler to trigger + * the post flush, and then just pass the command on. + */ + blk_rq_init_flush(rq); + rq->flush.seq |= REQ_FSEQ_PREFLUSH; + spin_lock_irq(&fq->mq_flush_lock); + list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); + spin_unlock_irq(&fq->mq_flush_lock); + return false; + default: + /* + * Mark the request as part of a flush sequence and submit it + * for further processing to the flush state machine. + */ + blk_rq_init_flush(rq); + spin_lock_irq(&fq->mq_flush_lock); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); + return true; } - - /* - * @rq should go through flush machinery. Mark it part of flush - * sequence and submit for further processing. - */ - memset(&rq->flush, 0, sizeof(rq->flush)); - INIT_LIST_HEAD(&rq->flush.list); - rq->rq_flags |= RQF_FLUSH_SEQ; - rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ - - rq->end_io = mq_flush_data_end_io; - - spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&fq->mq_flush_lock); } /** diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 63fc02042408..25dd4db11121 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -77,6 +77,10 @@ static void ioc_destroy_icq(struct io_cq *icq) struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); + lockdep_assert_held(&q->queue_lock); + + if (icq->flags & ICQ_DESTROYED) + return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); @@ -128,12 +132,7 @@ static void ioc_release_fn(struct work_struct *work) spin_lock(&q->queue_lock); spin_lock(&ioc->lock); - /* - * The icq may have been destroyed when the ioc lock - * was released. - */ - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); + ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); rcu_read_unlock(); @@ -171,23 +170,20 @@ static bool ioc_delay_free(struct io_context *ioc) */ void ioc_clear_queue(struct request_queue *q) { - LIST_HEAD(icq_list); - spin_lock_irq(&q->queue_lock); - list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); - - rcu_read_lock(); - while (!list_empty(&icq_list)) { + while (!list_empty(&q->icq_list)) { struct io_cq *icq = - list_entry(icq_list.next, struct io_cq, q_node); - - spin_lock_irq(&icq->ioc->lock); - if (!(icq->flags & ICQ_DESTROYED)) - ioc_destroy_icq(icq); - spin_unlock_irq(&icq->ioc->lock); + list_first_entry(&q->icq_list, struct io_cq, q_node); + + /* + * Other context won't hold ioc lock to wait for queue_lock, see + * details in ioc_release_fn(). + */ + spin_lock(&icq->ioc->lock); + ioc_destroy_icq(icq); + spin_unlock(&icq->ioc->lock); } - rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); } #else /* CONFIG_BLK_ICQ */ static inline void ioc_exit_icqs(struct io_context *ioc) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 285ced3467ab..089fcb9cfce3 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -2455,6 +2455,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, u32 hwi, adj_step; s64 margin; u64 cost, new_inuse; + unsigned long flags; current_hweight(iocg, NULL, &hwi); old_hwi = hwi; @@ -2473,11 +2474,11 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, iocg->inuse == iocg->active) return cost; - spin_lock_irq(&ioc->lock); + spin_lock_irqsave(&ioc->lock, flags); /* we own inuse only when @iocg is in the normal active state */ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); return cost; } @@ -2498,7 +2499,7 @@ static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, } while (time_after64(vtime + cost, now->vnow) && iocg->inuse != iocg->active); - spin_unlock_irq(&ioc->lock); + spin_unlock_irqrestore(&ioc->lock, flags); TRACE_IOCG_PATH(inuse_adjust, iocg, now, old_inuse, iocg->inuse, old_hwi, hwi); @@ -2515,6 +2516,10 @@ static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, u64 seek_pages = 0; u64 cost = 0; + /* Can't calculate cost for empty bio */ + if (!bio->bi_iter.bi_size) + goto out; + switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; @@ -3296,15 +3301,14 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; - if (enable) { + if (enable && !ioc->enabled) { blk_stat_enable_accounting(disk->queue); blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - wbt_disable_default(disk); - } else { + } else if (!enable && ioc->enabled) { + blk_stat_disable_accounting(disk->queue); blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; - wbt_enable_default(disk); } if (user) { @@ -3317,6 +3321,11 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); + if (enable) + wbt_disable_default(disk); + else + wbt_enable_default(disk); + blk_mq_unquiesce_queue(disk->queue); blk_mq_unfreeze_queue(disk->queue); diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c index 055529b9b92b..4051fada01f1 100644 --- a/block/blk-ioprio.c +++ b/block/blk-ioprio.c @@ -23,25 +23,28 @@ /** * enum prio_policy - I/O priority class policy. * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. - * @POLICY_NONE_TO_RT: modify IOPRIO_CLASS_NONE into IOPRIO_CLASS_RT. + * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into * IOPRIO_CLASS_BE. * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. + * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. * * See also <linux/ioprio.h>. */ enum prio_policy { POLICY_NO_CHANGE = 0, - POLICY_NONE_TO_RT = 1, + POLICY_PROMOTE_TO_RT = 1, POLICY_RESTRICT_TO_BE = 2, POLICY_ALL_TO_IDLE = 3, + POLICY_NONE_TO_RT = 4, }; static const char *policy_name[] = { [POLICY_NO_CHANGE] = "no-change", - [POLICY_NONE_TO_RT] = "none-to-rt", + [POLICY_PROMOTE_TO_RT] = "promote-to-rt", [POLICY_RESTRICT_TO_BE] = "restrict-to-be", [POLICY_ALL_TO_IDLE] = "idle", + [POLICY_NONE_TO_RT] = "none-to-rt", }; static struct blkcg_policy ioprio_policy; @@ -189,6 +192,20 @@ void blkcg_set_ioprio(struct bio *bio) if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) return; + if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || + blkcg->prio_policy == POLICY_NONE_TO_RT) { + /* + * For RT threads, the default priority level is 4 because + * task_nice is 0. By promoting non-RT io-priority to RT-class + * and default level 4, those requests that are already + * RT-class but need a higher io-priority can use ioprio_set() + * to achieve this. + */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); + return; + } + /* * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers * correspond to a lower priority. Hence, the max_t() below selects diff --git a/block/blk-map.c b/block/blk-map.c index 46eed2e627c3..8584babf3ea0 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -281,21 +281,21 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, if (blk_queue_pci_p2pdma(rq->q)) extraction_flags |= ITER_ALLOW_P2PDMA; + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); while (iov_iter_count(iter)) { - struct page **pages, *stack_pages[UIO_FASTIOV]; + struct page *stack_pages[UIO_FASTIOV]; + struct page **pages = stack_pages; ssize_t bytes; size_t offs; int npages; - if (nr_vecs <= ARRAY_SIZE(stack_pages)) { - pages = stack_pages; - bytes = iov_iter_get_pages(iter, pages, LONG_MAX, - nr_vecs, &offs, extraction_flags); - } else { - bytes = iov_iter_get_pages_alloc(iter, &pages, - LONG_MAX, &offs, extraction_flags); - } + if (nr_vecs > ARRAY_SIZE(stack_pages)) + pages = NULL; + + bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, + nr_vecs, extraction_flags, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? bytes : -EFAULT; goto out_unmap; @@ -315,12 +315,11 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, n = bytes; if (!bio_add_hw_page(rq->q, bio, page, n, offs, - max_sectors, &same_page)) { - if (same_page) - put_page(page); + max_sectors, &same_page)) break; - } + if (same_page) + bio_release_page(bio, page); bytes -= n; offs = 0; } @@ -329,7 +328,7 @@ static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, * release the pages we didn't map into the bio, if any */ while (j < npages) - put_page(pages[j++]); + bio_release_page(bio, pages[j++]); if (pages != stack_pages) kvfree(pages); /* couldn't stuff something into bio? */ diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index d23a8554ec4a..c3b5930106b2 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -88,6 +88,7 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(IO_STAT), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), + QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), QUEUE_FLAG_NAME(INIT_DONE), QUEUE_FLAG_NAME(STABLE_WRITES), @@ -103,6 +104,8 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(RQ_ALLOC_TIME), QUEUE_FLAG_NAME(HCTX_ACTIVE), QUEUE_FLAG_NAME(NOWAIT), + QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME @@ -241,14 +244,14 @@ static const char *const cmd_flag_name[] = { #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { RQF_NAME(STARTED), - RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), + RQF_NAME(SCHED_TAGS), + RQF_NAME(USE_SCHED), RQF_NAME(FAILED), RQF_NAME(QUIET), - RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), RQF_NAME(PM), RQF_NAME(HASHED), @@ -256,7 +259,6 @@ static const char *const rqf_name[] = { RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), RQF_NAME(TIMED_OUT), - RQF_NAME(ELV), RQF_NAME(RESV), }; #undef RQF_NAME @@ -399,7 +401,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", - atomic_read(&tags->active_queues)); + READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 7c3cbad17f30..1326526bb733 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -37,7 +37,7 @@ static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = q->elevator; if (e->type->ops.allow_merge) @@ -48,7 +48,7 @@ blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = rq->q->elevator; if (e->type->ops.completed_request) @@ -58,11 +58,11 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) static inline void blk_mq_sched_requeue_request(struct request *rq) { - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; - if ((rq->rq_flags & RQF_ELVPRIV) && e->type->ops.requeue_request) + if (e->type->ops.requeue_request) e->type->ops.requeue_request(rq); } } diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index dfd81cab5788..cc57e2dd9a0b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -38,6 +38,7 @@ static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { unsigned int users; + struct blk_mq_tags *tags = hctx->tags; /* * calling test_bit() prior to test_and_set_bit() is intentional, @@ -55,9 +56,11 @@ void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) return; } - users = atomic_inc_return(&hctx->tags->active_queues); - - blk_mq_update_wake_batch(hctx->tags, users); + spin_lock_irq(&tags->lock); + users = tags->active_queues + 1; + WRITE_ONCE(tags->active_queues, users); + blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); } /* @@ -90,9 +93,11 @@ void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) return; } - users = atomic_dec_return(&tags->active_queues); - + spin_lock_irq(&tags->lock); + users = tags->active_queues - 1; + WRITE_ONCE(tags->active_queues, users); blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); blk_mq_tag_wakeup_all(tags, false); } diff --git a/block/blk-mq.c b/block/blk-mq.c index 850bfb844ed2..c21bc81a790f 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -45,19 +45,12 @@ static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); +static void blk_mq_request_bypass_insert(struct request *rq, + blk_insert_t flags); static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list); - -static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, - blk_qc_t qc) -{ - return xa_load(&q->hctx_table, qc); -} - -static inline blk_qc_t blk_rq_to_qc(struct request *rq) -{ - return rq->mq_hctx->queue_num; -} +static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator @@ -335,8 +328,24 @@ void blk_rq_init(struct request_queue *q, struct request *rq) } EXPORT_SYMBOL(blk_rq_init); +/* Set start and alloc time when the allocated request is actually used */ +static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) +{ + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; + +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + if (blk_queue_rq_alloc_time(rq->q)) + rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + else + rq->alloc_time_ns = 0; +#endif +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - struct blk_mq_tags *tags, unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag) { struct blk_mq_ctx *ctx = data->ctx; struct blk_mq_hw_ctx *hctx = data->hctx; @@ -354,23 +363,16 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, data->rq_flags |= RQF_IO_STAT; rq->rq_flags = data->rq_flags; - if (!(data->rq_flags & RQF_ELV)) { - rq->tag = tag; - rq->internal_tag = BLK_MQ_NO_TAG; - } else { + if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; + } else { + rq->tag = tag; + rq->internal_tag = BLK_MQ_NO_TAG; } rq->timeout = 0; - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); - else - rq->start_time_ns = 0; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - rq->alloc_time_ns = alloc_time_ns; -#endif rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; @@ -386,25 +388,21 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, WRITE_ONCE(rq->deadline, 0); req_ref_set(rq, 1); - if (rq->rq_flags & RQF_ELV) { + if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; INIT_HLIST_NODE(&rq->hash); RB_CLEAR_NODE(&rq->rb_node); - if (!op_is_flush(data->cmd_flags) && - e->type->ops.prepare_request) { + if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); - rq->rq_flags |= RQF_ELVPRIV; - } } return rq; } static inline struct request * -__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, - u64 alloc_time_ns) +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) { unsigned int tag, tag_offset; struct blk_mq_tags *tags; @@ -423,7 +421,7 @@ __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, tag = tag_offset + i; prefetch(tags->static_rqs[tag]); tag_mask &= ~(1UL << i); - rq = blk_mq_rq_ctx_init(data, tags, tag, alloc_time_ns); + rq = blk_mq_rq_ctx_init(data, tags, tag); rq_list_add(data->cached_rq, rq); nr++; } @@ -449,26 +447,32 @@ static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) data->flags |= BLK_MQ_REQ_NOWAIT; if (q->elevator) { - struct elevator_queue *e = q->elevator; - - data->rq_flags |= RQF_ELV; + /* + * All requests use scheduler tags when an I/O scheduler is + * enabled for the queue. + */ + data->rq_flags |= RQF_SCHED_TAGS; /* * Flush/passthrough requests are special and go directly to the - * dispatch list. Don't include reserved tags in the - * limiting, as it isn't useful. + * dispatch list. */ - if (!op_is_flush(data->cmd_flags) && - !blk_op_is_passthrough(data->cmd_flags) && - e->type->ops.limit_depth && - !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.limit_depth(data->cmd_flags, data); + if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && + !blk_op_is_passthrough(data->cmd_flags)) { + struct elevator_mq_ops *ops = &q->elevator->type->ops; + + WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); + + data->rq_flags |= RQF_USE_SCHED; + if (ops->limit_depth) + ops->limit_depth(data->cmd_flags, data); + } } retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->rq_flags & RQF_ELV)) + if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_tag_busy(data->hctx); if (data->flags & BLK_MQ_REQ_RESERVED) @@ -478,9 +482,11 @@ retry: * Try batched alloc if we want more than 1 tag. */ if (data->nr_tags > 1) { - rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); - if (rq) + rq = __blk_mq_alloc_requests_batch(data); + if (rq) { + blk_mq_rq_time_init(rq, alloc_time_ns); return rq; + } data->nr_tags = 1; } @@ -503,8 +509,9 @@ retry: goto retry; } - return blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag, - alloc_time_ns); + rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); + return rq; } static struct request *blk_mq_rq_cache_fill(struct request_queue *q, @@ -559,6 +566,7 @@ static struct request *blk_mq_alloc_cached_request(struct request_queue *q, return NULL; plug->cached_rq = rq_list_next(rq); + blk_mq_rq_time_init(rq, 0); } rq->cmd_flags = opf; @@ -648,10 +656,10 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); - if (!q->elevator) - blk_mq_tag_busy(data.hctx); + if (q->elevator) + data.rq_flags |= RQF_SCHED_TAGS; else - data.rq_flags |= RQF_ELV; + blk_mq_tag_busy(data.hctx); if (flags & BLK_MQ_REQ_RESERVED) data.rq_flags |= RQF_RESV; @@ -660,8 +668,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag, - alloc_time_ns); + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -673,6 +681,21 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); +static void blk_mq_finish_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (rq->rq_flags & RQF_USE_SCHED) { + q->elevator->type->ops.finish_request(rq); + /* + * For postflush request that may need to be + * completed twice, we should clear this flag + * to avoid double finish_request() on the rq. + */ + rq->rq_flags &= ~RQF_USE_SCHED; + } +} + static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; @@ -699,9 +722,7 @@ void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; - if ((rq->rq_flags & RQF_ELVPRIV) && - q->elevator->type->ops.finish_request) - q->elevator->type->ops.finish_request(rq); + blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) laptop_io_completion(q->disk->bdi); @@ -957,6 +978,8 @@ EXPORT_SYMBOL_GPL(blk_update_request); static inline void blk_account_io_done(struct request *req, u64 now) { + trace_block_io_done(req); + /* * Account IO completion. flush_rq isn't accounted as a * normal IO on queueing nor completion. Accounting the @@ -976,6 +999,8 @@ static inline void blk_account_io_done(struct request *req, u64 now) static inline void blk_account_io_start(struct request *req) { + trace_block_io_start(req); + if (blk_do_io_stat(req)) { /* * All non-passthrough requests are created from a bio with one @@ -1008,6 +1033,8 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error) if (blk_mq_need_time_stamp(rq)) __blk_mq_end_request_acct(rq, ktime_get_ns()); + blk_mq_finish_request(rq); + if (rq->end_io) { rq_qos_done(rq->q, rq); if (rq->end_io(rq, error) == RQ_END_IO_FREE) @@ -1062,6 +1089,8 @@ void blk_mq_end_request_batch(struct io_comp_batch *iob) if (iob->need_ts) __blk_mq_end_request_acct(rq, now); + blk_mq_finish_request(rq); + rq_qos_done(rq->q, rq); /* @@ -1176,8 +1205,9 @@ bool blk_mq_complete_request_remote(struct request *rq) * or a polled request, always complete locally, * it's pointless to redirect the completion. */ - if (rq->mq_hctx->nr_ctx == 1 || - rq->cmd_flags & REQ_POLLED) + if ((rq->mq_hctx->nr_ctx == 1 && + rq->mq_ctx->cpu == raw_smp_processor_id()) || + rq->cmd_flags & REQ_POLLED) return false; if (blk_mq_complete_need_ipi(rq)) { @@ -1238,7 +1268,7 @@ void blk_mq_start_request(struct request *rq) q->integrity.profile->prepare_fn(rq); #endif if (rq->bio && rq->bio->bi_opf & REQ_POLLED) - WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); + WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); @@ -1270,7 +1300,11 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) if (!plug->multiple_queues && last && last->q != rq->q) plug->multiple_queues = true; - if (!plug->has_elevator && (rq->rq_flags & RQF_ELV)) + /* + * Any request allocated from sched tags can't be issued to + * ->queue_rqs() directly + */ + if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) plug->has_elevator = true; rq->rq_next = NULL; rq_list_add(&plug->mq_list, rq); @@ -1340,7 +1374,7 @@ EXPORT_SYMBOL_GPL(blk_rq_is_poll); static void blk_rq_poll_completion(struct request *rq, struct completion *wait) { do { - blk_mq_poll(rq->q, blk_rq_to_qc(rq), NULL, 0); + blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); cond_resched(); } while (!completion_done(wait)); } @@ -1411,13 +1445,16 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { struct request_queue *q = rq->q; + unsigned long flags; __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - blk_mq_add_to_requeue_list(rq, BLK_MQ_INSERT_AT_HEAD); + spin_lock_irqsave(&q->requeue_lock, flags); + list_add_tail(&rq->queuelist, &q->requeue_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); if (kick_requeue_list) blk_mq_kick_requeue_list(q); @@ -1429,13 +1466,16 @@ static void blk_mq_requeue_work(struct work_struct *work) struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); - struct request *rq, *next; + LIST_HEAD(flush_list); + struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); + list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); - list_for_each_entry_safe(rq, next, &rq_list, queuelist) { + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); /* * If RQF_DONTPREP ist set, the request has been started by the * driver already and might have driver-specific data allocated @@ -1443,18 +1483,16 @@ static void blk_mq_requeue_work(struct work_struct *work) * block layer merges for the request. */ if (rq->rq_flags & RQF_DONTPREP) { - rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); blk_mq_request_bypass_insert(rq, 0); - } else if (rq->rq_flags & RQF_SOFTBARRIER) { - rq->rq_flags &= ~RQF_SOFTBARRIER; + } else { list_del_init(&rq->queuelist); blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); } } - while (!list_empty(&rq_list)) { - rq = list_entry(rq_list.next, struct request, queuelist); + while (!list_empty(&flush_list)) { + rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); blk_mq_insert_request(rq, 0); } @@ -1462,27 +1500,6 @@ static void blk_mq_requeue_work(struct work_struct *work) blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - /* - * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertion from the workqueue. - */ - BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); - - spin_lock_irqsave(&q->requeue_lock, flags); - if (insert_flags & BLK_MQ_INSERT_AT_HEAD) { - rq->rq_flags |= RQF_SOFTBARRIER; - list_add(&rq->queuelist, &q->requeue_list); - } else { - list_add_tail(&rq->queuelist, &q->requeue_list); - } - spin_unlock_irqrestore(&q->requeue_lock, flags); -} - void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); @@ -2427,7 +2444,7 @@ static void blk_mq_run_work_fn(struct work_struct *work) * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. */ -void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) +static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -2492,7 +2509,7 @@ static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) * dispatch it given we prioritize requests in hctx->dispatch. */ blk_mq_request_bypass_insert(rq, flags); - } else if (rq->rq_flags & RQF_FLUSH_SEQ) { + } else if (req_op(rq) == REQ_OP_FLUSH) { /* * Firstly normal IO request is inserted to scheduler queue or * sw queue, meantime we add flush request to dispatch queue( @@ -2622,7 +2639,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, return; } - if ((rq->rq_flags & RQF_ELV) || !blk_mq_get_budget_and_tag(rq)) { + if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, false); return; @@ -2711,6 +2728,7 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) struct request *requeue_list = NULL; struct request **requeue_lastp = &requeue_list; unsigned int depth = 0; + bool is_passthrough = false; LIST_HEAD(list); do { @@ -2719,7 +2737,9 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) if (!this_hctx) { this_hctx = rq->mq_hctx; this_ctx = rq->mq_ctx; - } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx) { + is_passthrough = blk_rq_is_passthrough(rq); + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || + is_passthrough != blk_rq_is_passthrough(rq)) { rq_list_add_tail(&requeue_lastp, rq); continue; } @@ -2731,7 +2751,13 @@ static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) trace_block_unplug(this_hctx->queue, depth, !from_sched); percpu_ref_get(&this_hctx->queue->q_usage_counter); - if (this_hctx->queue->elevator) { + /* passthrough requests should never be issued to the I/O scheduler */ + if (is_passthrough) { + spin_lock(&this_hctx->lock); + list_splice_tail_init(&list, &this_hctx->dispatch); + spin_unlock(&this_hctx->lock); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else if (this_hctx->queue->elevator) { this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, &list, 0); blk_mq_run_hw_queue(this_hctx, from_sched); @@ -2745,7 +2771,14 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { struct request *rq; - if (rq_list_empty(plug->mq_list)) + /* + * We may have been called recursively midway through handling + * plug->mq_list via a schedule() in the driver's queue_rq() callback. + * To avoid mq_list changing under our feet, clear rq_count early and + * bail out specifically if rq_count is 0 rather than checking + * whether the mq_list is empty. + */ + if (plug->rq_count == 0) return; plug->rq_count = 0; @@ -2899,6 +2932,7 @@ static inline struct request *blk_mq_get_cached_request(struct request_queue *q, plug->cached_rq = rq_list_next(rq); rq_qos_throttle(q, *bio); + blk_mq_rq_time_init(rq, 0); rq->cmd_flags = (*bio)->bi_opf; INIT_LIST_HEAD(&rq->queuelist); return rq; @@ -2970,10 +3004,8 @@ void blk_mq_submit_bio(struct bio *bio) return; } - if (op_is_flush(bio->bi_opf)) { - blk_insert_flush(rq); + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) return; - } if (plug) { blk_add_rq_to_plug(plug, rq); @@ -2981,7 +3013,7 @@ void blk_mq_submit_bio(struct bio *bio) } hctx = rq->mq_hctx; - if ((rq->rq_flags & RQF_ELV) || + if ((rq->rq_flags & RQF_USE_SCHED) || (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { blk_mq_insert_request(rq, 0); blk_mq_run_hw_queue(hctx, true); @@ -4232,6 +4264,7 @@ int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); @@ -4369,6 +4402,7 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, int new_nr_hw_queues) { struct blk_mq_tags **new_tags; + int i; if (set->nr_hw_queues >= new_nr_hw_queues) goto done; @@ -4383,6 +4417,16 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; + + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + done: set->nr_hw_queues = new_nr_hw_queues; return 0; @@ -4608,9 +4652,6 @@ static bool blk_mq_elv_switch_none(struct list_head *head, { struct blk_mq_qe_pair *qe; - if (!q->elevator) - return true; - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!qe) return false; @@ -4618,6 +4659,12 @@ static bool blk_mq_elv_switch_none(struct list_head *head, /* q->elevator needs protection from ->sysfs_lock */ mutex_lock(&q->sysfs_lock); + /* the check has to be done with holding sysfs_lock */ + if (!q->elevator) { + kfree(qe); + goto unlock; + } + INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; @@ -4625,6 +4672,7 @@ static bool blk_mq_elv_switch_none(struct list_head *head, __elevator_get(qe->type); list_add(&qe->node, head); elevator_disable(q); +unlock: mutex_unlock(&q->sysfs_lock); return true; @@ -4667,7 +4715,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, { struct request_queue *q; LIST_HEAD(head); - int prev_nr_hw_queues; + int prev_nr_hw_queues = set->nr_hw_queues; + int i; lockdep_assert_held(&set->tag_list_lock); @@ -4694,7 +4743,6 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, blk_mq_sysfs_unregister_hctxs(q); } - prev_nr_hw_queues = set->nr_hw_queues; if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; @@ -4730,6 +4778,10 @@ switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); + + /* Free the excess tags when nr_hw_queues shrink. */ + for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) @@ -4740,10 +4792,9 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, - unsigned int flags) +static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags) { - struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); long state = get_current_state(); int ret; @@ -4768,6 +4819,32 @@ int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch * return 0; } +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) +{ + struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); + + return blk_hctx_poll(q, hctx, iob, flags); +} + +int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, + unsigned int poll_flags) +{ + struct request_queue *q = rq->q; + int ret; + + if (!blk_rq_is_poll(rq)) + return 0; + if (!percpu_ref_tryget(&q->q_usage_counter)) + return 0; + + ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); + blk_queue_exit(q); + + return ret; +} +EXPORT_SYMBOL_GPL(blk_rq_poll); + unsigned int blk_mq_rq_cpu(struct request *rq) { return rq->mq_ctx->cpu; diff --git a/block/blk-mq.h b/block/blk-mq.h index e876584d3516..1743857e0b01 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -47,7 +47,6 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, unsigned int); -void blk_mq_add_to_requeue_list(struct request *rq, blk_insert_t insert_flags); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); @@ -64,10 +63,6 @@ struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); -/* - * Internal helpers for request insertion into sw queues - */ -void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags); /* * CPU -> queue mappings @@ -226,9 +221,9 @@ static inline bool blk_mq_is_shared_tags(unsigned int flags) static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (!(data->rq_flags & RQF_ELV)) - return data->hctx->tags; - return data->hctx->sched_tags; + if (data->rq_flags & RQF_SCHED_TAGS) + return data->hctx->sched_tags; + return data->hctx->tags; } static inline bool blk_mq_hctx_stopped(struct blk_mq_hw_ctx *hctx) @@ -417,8 +412,7 @@ static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, return true; } - users = atomic_read(&hctx->tags->active_queues); - + users = READ_ONCE(hctx->tags->active_queues); if (!users) return true; diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index d8cc820a365e..167be74df4ee 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -288,11 +288,13 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { + mutex_lock(&q->rq_qos_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); } + mutex_unlock(&q->rq_qos_mutex); } int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, @@ -300,6 +302,8 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, { struct request_queue *q = disk->queue; + lockdep_assert_held(&q->rq_qos_mutex); + rqos->disk = disk; rqos->id = id; rqos->ops = ops; @@ -307,18 +311,13 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, /* * No IO can be in-flight when adding rqos, so freeze queue, which * is fine since we only support rq_qos for blk-mq queue. - * - * Reuse ->queue_lock for protecting against other concurrent - * rq_qos adding/deleting */ blk_mq_freeze_queue(q); - spin_lock_irq(&q->queue_lock); if (rq_qos_id(q, rqos->id)) goto ebusy; rqos->next = q->rq_qos; q->rq_qos = rqos; - spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); @@ -330,7 +329,6 @@ int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, return 0; ebusy: - spin_unlock_irq(&q->queue_lock); blk_mq_unfreeze_queue(q); return -EBUSY; } @@ -340,21 +338,15 @@ void rq_qos_del(struct rq_qos *rqos) struct request_queue *q = rqos->disk->queue; struct rq_qos **cur; - /* - * See comment in rq_qos_add() about freezing queue & using - * ->queue_lock. - */ - blk_mq_freeze_queue(q); + lockdep_assert_held(&q->rq_qos_mutex); - spin_lock_irq(&q->queue_lock); + blk_mq_freeze_queue(q); for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { if (*cur == rqos) { *cur = rqos->next; break; } } - spin_unlock_irq(&q->queue_lock); - blk_mq_unfreeze_queue(q); mutex_lock(&q->debugfs_mutex); diff --git a/block/blk-settings.c b/block/blk-settings.c index 4dd59059b788..0046b447268f 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -830,10 +830,13 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index a64208583853..63e481262336 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -47,19 +47,6 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_var_store64(s64 *var, const char *page) -{ - int err; - s64 v; - - err = kstrtos64(page, 10, &v); - if (err < 0) - return err; - - *var = v; - return 0; -} - static ssize_t queue_requests_show(struct request_queue *q, char *page) { return queue_var_show(q->nr_requests, page); @@ -451,61 +438,6 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, return count; } -static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) -{ - if (!wbt_rq_qos(q)) - return -EINVAL; - - if (wbt_disabled(q)) - return sprintf(page, "0\n"); - - return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); -} - -static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, - size_t count) -{ - struct rq_qos *rqos; - ssize_t ret; - s64 val; - - ret = queue_var_store64(&val, page); - if (ret < 0) - return ret; - if (val < -1) - return -EINVAL; - - rqos = wbt_rq_qos(q); - if (!rqos) { - ret = wbt_init(q->disk); - if (ret) - return ret; - } - - if (val == -1) - val = wbt_default_latency_nsec(q); - else if (val >= 0) - val *= 1000ULL; - - if (wbt_get_min_lat(q) == val) - return count; - - /* - * Ensure that the queue is idled, in case the latency update - * ends up either enabling or disabling wbt completely. We can't - * have IO inflight if that happens. - */ - blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); - - wbt_set_min_lat(q, val); - - blk_mq_unquiesce_queue(q); - blk_mq_unfreeze_queue(q); - - return count; -} - static ssize_t queue_wc_show(struct request_queue *q, char *page) { if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) @@ -517,21 +449,16 @@ static ssize_t queue_wc_show(struct request_queue *q, char *page) static ssize_t queue_wc_store(struct request_queue *q, const char *page, size_t count) { - int set = -1; - - if (!strncmp(page, "write back", 10)) - set = 1; - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } else { + return -EINVAL; + } return count; } @@ -598,7 +525,6 @@ QUEUE_RW_ENTRY(queue_wc, "write_cache"); QUEUE_RO_ENTRY(queue_fua, "fua"); QUEUE_RO_ENTRY(queue_dax, "dax"); QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); -QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); @@ -617,8 +543,79 @@ QUEUE_RW_ENTRY(queue_iostats, "iostats"); QUEUE_RW_ENTRY(queue_random, "add_random"); QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); +#ifdef CONFIG_BLK_WBT +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + +static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) +{ + if (!wbt_rq_qos(q)) + return -EINVAL; + + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); +} + +static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, + size_t count) +{ + struct rq_qos *rqos; + ssize_t ret; + s64 val; + + ret = queue_var_store64(&val, page); + if (ret < 0) + return ret; + if (val < -1) + return -EINVAL; + + rqos = wbt_rq_qos(q); + if (!rqos) { + ret = wbt_init(q->disk); + if (ret) + return ret; + } + + if (val == -1) + val = wbt_default_latency_nsec(q); + else if (val >= 0) + val *= 1000ULL; + + if (wbt_get_min_lat(q) == val) + return count; + + /* + * Ensure that the queue is idled, in case the latency update + * ends up either enabling or disabling wbt completely. We can't + * have IO inflight if that happens. + */ + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + wbt_set_min_lat(q, val); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + return count; +} + +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); +#endif + static struct attribute *queue_attrs[] = { - &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -626,7 +623,6 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, - &elv_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -647,7 +643,6 @@ static struct attribute *queue_attrs[] = { &queue_max_open_zones_entry.attr, &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, - &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, &queue_stable_writes_entry.attr, &queue_random_entry.attr, @@ -655,9 +650,7 @@ static struct attribute *queue_attrs[] = { &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, - &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW &blk_throtl_sample_time_entry.attr, #endif @@ -666,16 +659,23 @@ static struct attribute *queue_attrs[] = { NULL, }; +static struct attribute *blk_mq_queue_attrs[] = { + &queue_requests_entry.attr, + &elv_iosched_entry.attr, + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, +#endif + NULL, +}; + static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); struct request_queue *q = disk->queue; - if (attr == &queue_io_timeout_entry.attr && - (!q->mq_ops || !q->mq_ops->timeout)) - return 0; - if ((attr == &queue_max_open_zones_entry.attr || attr == &queue_max_active_zones_entry.attr) && !blk_queue_is_zoned(q)) @@ -684,11 +684,30 @@ static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, return attr->mode; } +static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; + + if (!queue_is_mq(q)) + return 0; + + if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) + return 0; + + return attr->mode; +} + static struct attribute_group queue_attr_group = { .attrs = queue_attrs, .is_visible = queue_attr_visible, }; +static struct attribute_group blk_mq_queue_attr_group = { + .attrs = blk_mq_queue_attrs, + .is_visible = blk_mq_queue_attr_visible, +}; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) @@ -733,6 +752,7 @@ static const struct sysfs_ops queue_sysfs_ops = { static const struct attribute_group *blk_queue_attr_groups[] = { &queue_attr_group, + &blk_mq_queue_attr_group, NULL }; diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 9d010d867fbf..e78bc3b65ec8 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -697,11 +697,41 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } +static unsigned int calculate_io_allowed(u32 iops_limit, + unsigned long jiffy_elapsed) +{ + unsigned int io_allowed; + u64 tmp; + + /* + * jiffy_elapsed should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. + */ + + tmp = (u64)iops_limit * jiffy_elapsed; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + return io_allowed; +} + +static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) +{ + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); +} + /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, time_elapsed, io_trim; - u64 bytes_trim, tmp; + unsigned long time_elapsed; + long long bytes_trim; + int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -723,67 +753,38 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); - time_elapsed = jiffies - tg->slice_start[rw]; - - nr_slices = time_elapsed / tg->td->throtl_slice; - - if (!nr_slices) + time_elapsed = rounddown(jiffies - tg->slice_start[rw], + tg->td->throtl_slice); + if (!time_elapsed) return; - tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; - do_div(tmp, HZ); - bytes_trim = tmp; - - io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / - HZ; - if (!bytes_trim && !io_trim) + bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), + time_elapsed) + + tg->carryover_bytes[rw]; + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + + tg->carryover_ios[rw]; + if (bytes_trim <= 0 && io_trim <= 0) return; - if (tg->bytes_disp[rw] >= bytes_trim) + tg->carryover_bytes[rw] = 0; + if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - if (tg->io_disp[rw] >= io_trim) + tg->carryover_ios[rw] = 0; + if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; - tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; + tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, - "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, - tg->slice_start[rw], tg->slice_end[rw], jiffies); -} - -static unsigned int calculate_io_allowed(u32 iops_limit, - unsigned long jiffy_elapsed) -{ - unsigned int io_allowed; - u64 tmp; - - /* - * jiffy_elapsed should not be a big value as minimum iops can be - * 1 then at max jiffy elapsed should be equivalent of 1 second as we - * will allow dispatch after 1 second and after that slice should - * have been trimmed. - */ - - tmp = (u64)iops_limit * jiffy_elapsed; - do_div(tmp, HZ); - - if (tmp > UINT_MAX) - io_allowed = UINT_MAX; - else - io_allowed = tmp; - - return io_allowed; -} - -static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) -{ - return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); + "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", + rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice, + bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], + jiffies); } static void __tg_update_carryover(struct throtl_grp *tg, bool rw) @@ -2178,12 +2179,6 @@ bool __blk_throtl_bio(struct bio *bio) rcu_read_lock(); - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); - } - spin_lock_irq(&q->queue_lock); throtl_update_latency_buckets(td); diff --git a/block/blk-throttle.h b/block/blk-throttle.h index ef4b7a4de987..d1ccbfe9f797 100644 --- a/block/blk-throttle.h +++ b/block/blk-throttle.h @@ -185,6 +185,15 @@ static inline bool blk_should_throtl(struct bio *bio) struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); int rw = bio_data_dir(bio); + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + } + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + /* iops limit is always counted */ if (tg->has_rules_iops[rw]) return true; diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 9ec2a2f1eda3..0bb613139bec 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -146,7 +146,7 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && - rwb->wb_normal != 0; + rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -201,15 +201,6 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, inflight = atomic_dec_return(&rqw->inflight); /* - * wbt got disabled with IO in flight. Wake up any potential - * waiters, we don't have to do more than that. - */ - if (unlikely(!rwb_enabled(rwb))) { - rwb_wake_all(rwb); - return; - } - - /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. @@ -503,8 +494,7 @@ bool wbt_disabled(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); - return !rqos || RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT || - RQWB(rqos)->enable_state == WBT_STATE_OFF_MANUAL; + return !rqos || !rwb_enabled(RQWB(rqos)); } u64 wbt_get_min_lat(struct request_queue *q) @@ -545,13 +535,6 @@ static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; - /* - * If we got disabled, just return UINT_MAX. This ensures that - * we'll properly inc a new IO, and dec+wakeup at the end. - */ - if (!rwb_enabled(rwb)) - return UINT_MAX; - if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; @@ -944,7 +927,9 @@ int wbt_init(struct gendisk *disk) /* * Assign rwb and add the stats callback. */ + mutex_lock(&q->rq_qos_mutex); ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); + mutex_unlock(&q->rq_qos_mutex); if (ret) goto err_free; diff --git a/block/blk-wbt.h b/block/blk-wbt.h index ba6cca5849a6..8a029e138f7a 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -18,10 +18,6 @@ u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline int wbt_init(struct gendisk *disk) -{ - return -EINVAL; -} static inline void wbt_disable_default(struct gendisk *disk) { } @@ -31,21 +27,6 @@ static inline void wbt_enable_default(struct gendisk *disk) static inline void wbt_set_write_cache(struct request_queue *q, bool wc) { } -static inline u64 wbt_get_min_lat(struct request_queue *q) -{ - return 0; -} -static inline void wbt_set_min_lat(struct request_queue *q, u64 val) -{ -} -static inline u64 wbt_default_latency_nsec(struct request_queue *q) -{ - return 0; -} -static inline bool wbt_disabled(struct request_queue *q) -{ - return true; -} #endif /* CONFIG_BLK_WBT */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index fce9082384d6..619ee41a51cc 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -57,16 +57,10 @@ EXPORT_SYMBOL_GPL(blk_zone_cond_str); */ bool blk_req_needs_zone_write_lock(struct request *rq) { - if (blk_rq_is_passthrough(rq)) - return false; - if (!rq->q->disk->seq_zones_wlock) return false; - if (bdev_op_is_zoned_write(rq->q->disk->part0, req_op(rq))) - return blk_rq_zone_is_seq(rq); - - return false; + return blk_rq_is_seq_zoned_write(rq); } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); @@ -329,8 +323,8 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg) { void __user *argp = (void __user *)arg; struct zone_report_args args; @@ -362,8 +356,8 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, - const struct blk_zone_range *zrange) +static int blkdev_truncate_zone_range(struct block_device *bdev, + blk_mode_t mode, const struct blk_zone_range *zrange) { loff_t start, end; @@ -382,7 +376,7 @@ static int blkdev_truncate_zone_range(struct block_device *bdev, fmode_t mode, * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -396,7 +390,7 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!bdev_is_zoned(bdev)) return -ENOTTY; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) @@ -448,7 +442,6 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int nr_zones; - sector_t zone_sectors; sector_t sector; }; @@ -462,38 +455,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = args->disk; struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); + sector_t zone_sectors = q->limits.chunk_sectors; + + /* Check for bad zones and holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + if (zone->start >= capacity || !zone->len) { + pr_warn("%s: Invalid zone start %llu, length %llu\n", + disk->disk_name, zone->start, zone->len); + return -ENODEV; + } /* * All zones must have the same size, with the exception on an eventual * smaller last zone. */ - if (zone->start == 0) { - if (zone->len == 0 || !is_power_of_2(zone->len)) { - pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", - disk->disk_name, zone->len); - return -ENODEV; - } - - args->zone_sectors = zone->len; - args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); - } else if (zone->start + args->zone_sectors < capacity) { - if (zone->len != args->zone_sectors) { + if (zone->start + zone->len < capacity) { + if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); return -ENODEV; } - } else { - if (zone->len > args->zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; - } - } - - /* Check for holes in the zone report */ - if (zone->start != args->sector) { - pr_warn("%s: Zone gap at sectors %llu..%llu\n", - disk->disk_name, args->sector, zone->start); + } else if (zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); return -ENODEV; } @@ -532,11 +521,13 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * @disk: Target disk * @update_driver_data: Callback to update driver data on the frozen disk * - * Helper function for low-level device drivers to (re) allocate and initialize - * a disk request queue zone bitmaps. This functions should normally be called - * within the disk ->revalidate method for blk-mq based drivers. For BIO based - * drivers only q->nr_zones needs to be updated so that the sysfs exposed value - * is correct. + * Helper function for low-level device drivers to check and (re) allocate and + * initialize a disk request queue zone bitmaps. This functions should normally + * be called within the disk ->revalidate method for blk-mq based drivers. + * Before calling this function, the device driver must already have set the + * device zone size (chunk_sector limit) and the max zone append limit. + * For BIO based drivers, this function cannot be used. BIO based device drivers + * only need to set disk->nr_zones so that the sysfs exposed value is correct. * If the @update_driver_data callback function is not NULL, the callback is * executed with the device request queue frozen after all zones have been * checked. @@ -545,9 +536,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; - struct blk_revalidate_zone_args args = { - .disk = disk, - }; + sector_t zone_sectors = q->limits.chunk_sectors; + sector_t capacity = get_capacity(disk); + struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; int ret; @@ -556,13 +547,31 @@ int blk_revalidate_disk_zones(struct gendisk *disk, if (WARN_ON_ONCE(!queue_is_mq(q))) return -EIO; - if (!get_capacity(disk)) - return -EIO; + if (!capacity) + return -ENODEV; + + /* + * Checks that the device driver indicated a valid zone size and that + * the max zone append limit is set. + */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) { + pr_warn("%s: Invalid non power of two zone size (%llu)\n", + disk->disk_name, zone_sectors); + return -ENODEV; + } + + if (!q->limits.max_zone_append_sectors) { + pr_warn("%s: Invalid 0 maximum zone append limit\n", + disk->disk_name); + return -ENODEV; + } /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ + args.disk = disk; + args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); @@ -576,7 +585,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk, * If zones where reported, make sure that the entire disk capacity * has been checked. */ - if (ret > 0 && args.sector != get_capacity(disk)) { + if (ret > 0 && args.sector != capacity) { pr_warn("%s: Missing zones from sector %llu\n", disk->disk_name, args.sector); ret = -ENODEV; @@ -589,7 +598,6 @@ int blk_revalidate_disk_zones(struct gendisk *disk, */ blk_mq_freeze_queue(q); if (ret > 0) { - blk_queue_chunk_sectors(q, args.zone_sectors); disk->nr_zones = args.nr_zones; swap(disk->seq_zones_wlock, args.seq_zones_wlock); swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); diff --git a/block/blk.h b/block/blk.h index 45547bcf1119..b0dbbc405596 100644 --- a/block/blk.h +++ b/block/blk.h @@ -76,6 +76,10 @@ struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, gfp_t gfp_mask); void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); + static inline bool biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) { @@ -269,7 +273,7 @@ bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -void blk_insert_flush(struct request *rq); +bool blk_insert_flush(struct request *rq); int elevator_switch(struct request_queue *q, struct elevator_type *new_e); void elevator_disable(struct request_queue *q); @@ -394,10 +398,27 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); void disk_clear_zone_settings(struct gendisk *disk); -#else +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg); +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long arg); +#else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} static inline void disk_clear_zone_settings(struct gendisk *disk) {} -#endif +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + blk_mode_t mode, unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +#endif /* CONFIG_BLK_DEV_ZONED */ + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); int blk_alloc_ext_minor(void); void blk_free_ext_minor(unsigned int minor); @@ -409,7 +430,7 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, int bdev_del_partition(struct gendisk *disk, int partno); int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, sector_t length); -void blk_drop_partitions(struct gendisk *disk); +void drop_partition(struct block_device *part); void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); @@ -420,9 +441,19 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page); +/* + * Clean up a page appropriately, where the page may be pinned, may have a + * ref taken on it or neither. + */ +static inline void bio_release_page(struct bio *bio, struct page *page) +{ + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(page); +} + struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode); +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); @@ -437,6 +468,9 @@ extern struct device_attribute dev_attr_events_poll_msecs; extern struct attribute_group blk_trace_attr_group; +blk_mode_t file_to_blk_mode(struct file *file); +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, + loff_t lstart, loff_t lend); long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 435c32373cd6..b3acdbdb6e7e 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -26,7 +26,7 @@ struct bsg_set { }; static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, - fmode_t mode, unsigned int timeout) + bool open_for_write, unsigned int timeout) { struct bsg_job *job; struct request *rq; diff --git a/block/bsg.c b/block/bsg.c index 7eca43f33d7f..72157a59b788 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -36,10 +36,10 @@ static inline struct bsg_device *to_bsg_device(struct inode *inode) } #define BSG_DEFAULT_CMDS 64 -#define BSG_MAX_DEVS 32768 +#define BSG_MAX_DEVS (1 << MINORBITS) static DEFINE_IDA(bsg_minor_ida); -static struct class *bsg_class; +static const struct class bsg_class; static int bsg_major; static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) @@ -54,7 +54,8 @@ static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT); } -static int bsg_sg_io(struct bsg_device *bd, fmode_t mode, void __user *uarg) +static int bsg_sg_io(struct bsg_device *bd, bool open_for_write, + void __user *uarg) { struct sg_io_v4 hdr; int ret; @@ -63,7 +64,8 @@ static int bsg_sg_io(struct bsg_device *bd, fmode_t mode, void __user *uarg) return -EFAULT; if (hdr.guard != 'Q') return -EINVAL; - ret = bd->sg_io_fn(bd->queue, &hdr, mode, bsg_timeout(bd, &hdr)); + ret = bd->sg_io_fn(bd->queue, &hdr, open_for_write, + bsg_timeout(bd, &hdr)); if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr))) return -EFAULT; return ret; @@ -146,7 +148,7 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case SG_EMULATED_HOST: return put_user(1, intp); case SG_IO: - return bsg_sg_io(bd, file->f_mode, uarg); + return bsg_sg_io(bd, file->f_mode & FMODE_WRITE, uarg); case SCSI_IOCTL_SEND_COMMAND: pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", current->comm); @@ -206,7 +208,7 @@ struct bsg_device *bsg_register_queue(struct request_queue *q, return ERR_PTR(ret); } bd->device.devt = MKDEV(bsg_major, ret); - bd->device.class = bsg_class; + bd->device.class = &bsg_class; bd->device.parent = parent; bd->device.release = bsg_device_release; dev_set_name(&bd->device, "%s", name); @@ -240,15 +242,19 @@ static char *bsg_devnode(const struct device *dev, umode_t *mode) return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } +static const struct class bsg_class = { + .name = "bsg", + .devnode = bsg_devnode, +}; + static int __init bsg_init(void) { dev_t devid; int ret; - bsg_class = class_create("bsg"); - if (IS_ERR(bsg_class)) - return PTR_ERR(bsg_class); - bsg_class->devnode = bsg_devnode; + ret = class_register(&bsg_class); + if (ret) + return ret; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) @@ -260,7 +266,7 @@ static int __init bsg_init(void) return 0; destroy_bsg_class: - class_destroy(bsg_class); + class_unregister(&bsg_class); return ret; } diff --git a/block/disk-events.c b/block/disk-events.c index aee25a7e1ab7..0cfac464e6d1 100644 --- a/block/disk-events.c +++ b/block/disk-events.c @@ -263,31 +263,31 @@ static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) } /** - * bdev_check_media_change - check if a removable media has been changed - * @bdev: block device to check + * disk_check_media_change - check if a removable media has been changed + * @disk: gendisk to check * * Check whether a removable media has been changed, and attempt to free all * dentries and inodes and invalidates all block device page cache entries in * that case. * - * Returns %true if the block device changed, or %false if not. + * Returns %true if the media has changed, or %false if not. */ -bool bdev_check_media_change(struct block_device *bdev) +bool disk_check_media_change(struct gendisk *disk) { unsigned int events; - events = disk_clear_events(bdev->bd_disk, DISK_EVENT_MEDIA_CHANGE | + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST); if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; - if (__invalidate_device(bdev, true)) + if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", - bdev->bd_disk->disk_name); - set_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + disk->disk_name); + set_bit(GD_NEED_PART_SCAN, &disk->state); return true; } -EXPORT_SYMBOL(bdev_check_media_change); +EXPORT_SYMBOL(disk_check_media_change); /** * disk_force_media_change - force a media change event @@ -307,6 +307,7 @@ bool disk_force_media_change(struct gendisk *disk, unsigned int events) if (!(events & DISK_EVENT_MEDIA_CHANGE)) return false; + inc_diskseq(disk); if (__invalidate_device(disk->part0, true)) pr_warn("VFS: busy inodes on changed media %s\n", disk->disk_name); diff --git a/block/early-lookup.c b/block/early-lookup.c new file mode 100644 index 000000000000..3effbd0d35e9 --- /dev/null +++ b/block/early-lookup.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Code for looking up block devices in the early boot code before mounting the + * root file system. + */ +#include <linux/blkdev.h> +#include <linux/ctype.h> + +struct uuidcmp { + const char *uuid; + int len; +}; + +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int __init match_dev_by_uuid(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const struct uuidcmp *cmp = data; + + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) + return 0; + return 1; +} + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * @devt: dev_t result + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns 0 on success or a negative error code on failure. + */ +static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt) +{ + struct uuidcmp cmp; + struct device *dev = NULL; + int offset = 0; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) + goto out_invalid; + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); + } + + if (!cmp.len) + goto out_invalid; + + dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); + if (!dev) + return -ENODEV; + + if (offset) { + /* + * Attempt to find the requested partition by adding an offset + * to the partition number found by UUID. + */ + *devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); + } else { + *devt = dev->devt; + } + + put_device(dev); + return 0; + +out_invalid: + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + return -EINVAL; +} + +/** + * match_dev_by_label - callback for finding a partition using its label + * @dev: device passed in by the caller + * @data: opaque pointer to the label to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int __init match_dev_by_label(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const char *label = data; + + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) + return 0; + return 1; +} + +static int __init devt_from_partlabel(const char *label, dev_t *devt) +{ + struct device *dev; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (!dev) + return -ENODEV; + *devt = dev->devt; + put_device(dev); + return 0; +} + +static dev_t __init blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. + */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + } else { + devt = part_devt(disk, partno); + if (devt) + break; + } + } + class_dev_iter_exit(&iter); + return devt; +} + +static int __init devt_from_devname(const char *name, dev_t *devt) +{ + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return -EINVAL; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + *devt = blk_lookup_devt(s, 0); + if (*devt) + return 0; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return -ENODEV; + + /* try disk name without <part number> */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + + /* try disk name without p<part number> */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return -ENODEV; + p[-1] = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + return -ENODEV; +} + +static int __init devt_from_devnum(const char *name, dev_t *devt) +{ + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + *devt = MKDEV(maj, min); + if (maj != MAJOR(*devt) || min != MINOR(*devt)) + return -EINVAL; + } else { + *devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return -EINVAL; + } + + return 0; +} + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) <hex_major><hex_minor> device number in hexadecimal represents itself + * no leading 0x, for example b302. + * 3) /dev/<disk_name> represents the device number of disk + * 4) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 8) <major>:<minor> major and minor number of the device separated by + * a colon. + * 9) PARTLABEL=<name> with name being the GPT partition label. + * MSDOS partitions do not support labels! + * + * If name doesn't have fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. + */ +int __init early_lookup_bdev(const char *name, dev_t *devt) +{ + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9, devt); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10, devt); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5, devt); + return devt_from_devnum(name, devt); +} + +static char __init *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct block_device *part; + char devt_buf[BDEVT_SIZE]; + unsigned long idx; + + /* + * Don't show empty devices or things that have been + * suppressed + */ + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. + */ + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %pg %s", + bdev_is_partition(part) ? " " : "", + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, part, + part->bd_meta_info ? + part->bd_meta_info->uuid : ""); + if (bdev_is_partition(part)) + printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); + } + rcu_read_unlock(); + } + class_dev_iter_exit(&iter); +} diff --git a/block/elevator.c b/block/elevator.c index 24909069f872..5ff093cb3cf8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -499,6 +499,9 @@ void elv_unregister_queue(struct request_queue *q) int elv_register(struct elevator_type *e) { + /* finish request is mandatory */ + if (WARN_ON_ONCE(!e->ops.finish_request)) + return -EINVAL; /* insert_requests and dispatch_request are mandatory */ if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) return -EINVAL; @@ -751,7 +754,7 @@ ssize_t elv_iosched_store(struct request_queue *q, const char *buf, if (!elv_support_iosched(q)) return count; - strlcpy(elevator_name, buf, sizeof(elevator_name)); + strscpy(elevator_name, buf, sizeof(elevator_name)); ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; diff --git a/block/fops.c b/block/fops.c index 58d0aebc7313..838ffada5341 100644 --- a/block/fops.c +++ b/block/fops.c @@ -54,7 +54,7 @@ static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; loff_t pos = iocb->ki_pos; bool should_dirty = false; @@ -170,7 +170,7 @@ static void blkdev_bio_end_io(struct bio *bio) static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct blk_plug plug; struct blkdev_dio *dio; struct bio *bio; @@ -310,7 +310,7 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, struct iov_iter *iter, unsigned int nr_pages) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); bool is_read = iov_iter_rw(iter) == READ; blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); struct blkdev_dio *dio; @@ -358,13 +358,14 @@ static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + if (iocb->ki_flags & IOCB_HIPRI) { - bio->bi_opf |= REQ_POLLED | REQ_NOWAIT; + bio->bi_opf |= REQ_POLLED; submit_bio(bio); WRITE_ONCE(iocb->private, bio); } else { - if (iocb->ki_flags & IOCB_NOWAIT) - bio->bi_opf |= REQ_NOWAIT; submit_bio(bio); } return -EIOCBQUEUED; @@ -451,7 +452,7 @@ static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, int datasync) { - struct block_device *bdev = filp->private_data; + struct block_device *bdev = I_BDEV(filp->f_mapping->host); int error; error = file_write_and_wait_range(filp, start, end); @@ -470,6 +471,30 @@ static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, return error; } +blk_mode_t file_to_blk_mode(struct file *file) +{ + blk_mode_t mode = 0; + + if (file->f_mode & FMODE_READ) + mode |= BLK_OPEN_READ; + if (file->f_mode & FMODE_WRITE) + mode |= BLK_OPEN_WRITE; + if (file->private_data) + mode |= BLK_OPEN_EXCL; + if (file->f_flags & O_NDELAY) + mode |= BLK_OPEN_NDELAY; + + /* + * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy + * driver has historically allowed ioctls as if the file was opened for + * writing, but does not allow and actual reads or writes. + */ + if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) + mode |= BLK_OPEN_WRITE_IOCTL; + + return mode; +} + static int blkdev_open(struct inode *inode, struct file *filp) { struct block_device *bdev; @@ -481,30 +506,31 @@ static int blkdev_open(struct inode *inode, struct file *filp) * during an unstable branch. */ filp->f_flags |= O_LARGEFILE; - filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC; + filp->f_mode |= FMODE_BUF_RASYNC; - if (filp->f_flags & O_NDELAY) - filp->f_mode |= FMODE_NDELAY; + /* + * Use the file private data to store the holder for exclusive openes. + * file_to_blk_mode relies on it being present to set BLK_OPEN_EXCL. + */ if (filp->f_flags & O_EXCL) - filp->f_mode |= FMODE_EXCL; - if ((filp->f_flags & O_ACCMODE) == 3) - filp->f_mode |= FMODE_WRITE_IOCTL; + filp->private_data = filp; - bdev = blkdev_get_by_dev(inode->i_rdev, filp->f_mode, filp); + bdev = blkdev_get_by_dev(inode->i_rdev, file_to_blk_mode(filp), + filp->private_data, NULL); if (IS_ERR(bdev)) return PTR_ERR(bdev); - filp->private_data = bdev; + if (bdev_nowait(bdev)) + filp->f_mode |= FMODE_NOWAIT; + filp->f_mapping = bdev->bd_inode->i_mapping; filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); return 0; } -static int blkdev_close(struct inode *inode, struct file *filp) +static int blkdev_release(struct inode *inode, struct file *filp) { - struct block_device *bdev = filp->private_data; - - blkdev_put(bdev, filp->f_mode); + blkdev_put(I_BDEV(filp->f_mapping->host), filp->private_data); return 0; } @@ -517,10 +543,9 @@ static int blkdev_close(struct inode *inode, struct file *filp) */ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); struct inode *bd_inode = bdev->bd_inode; loff_t size = bdev_nr_bytes(bdev); - struct blk_plug plug; size_t shorted = 0; ssize_t ret; @@ -545,18 +570,16 @@ static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) iov_iter_truncate(from, size); } - blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); iov_iter_reexpand(from, iov_iter_count(from) + shorted); - blk_finish_plug(&plug); return ret; } static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) { - struct block_device *bdev = iocb->ki_filp->private_data; + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); loff_t size = bdev_nr_bytes(bdev); loff_t pos = iocb->ki_pos; size_t shorted = 0; @@ -576,21 +599,9 @@ static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) goto reexpand; /* skip atime */ if (iocb->ki_flags & IOCB_DIRECT) { - struct address_space *mapping = iocb->ki_filp->f_mapping; - - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_needs_writeback(mapping, pos, - pos + count - 1)) { - ret = -EAGAIN; - goto reexpand; - } - } else { - ret = filemap_write_and_wait_range(mapping, pos, - pos + count - 1); - if (ret < 0) - goto reexpand; - } - + ret = kiocb_write_and_wait(iocb, count); + if (ret < 0) + goto reexpand; file_accessed(iocb->ki_filp); ret = blkdev_direct_IO(iocb, to); @@ -649,7 +660,7 @@ static long blkdev_fallocate(struct file *file, int mode, loff_t start, filemap_invalidate_lock(inode->i_mapping); /* Invalidate the page cache, including dirty pages. */ - error = truncate_bdev_range(bdev, file->f_mode, start, end); + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); if (error) goto fail; @@ -690,7 +701,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) const struct file_operations def_blk_fops = { .open = blkdev_open, - .release = blkdev_close, + .release = blkdev_release, .llseek = blkdev_llseek, .read_iter = blkdev_read_iter, .write_iter = blkdev_write_iter, @@ -701,7 +712,7 @@ const struct file_operations def_blk_fops = { #ifdef CONFIG_COMPAT .compat_ioctl = compat_blkdev_ioctl, #endif - .splice_read = generic_file_splice_read, + .splice_read = filemap_splice_read, .splice_write = iter_file_splice_write, .fallocate = blkdev_fallocate, }; diff --git a/block/genhd.c b/block/genhd.c index 1cb489b927d5..3d287b32d50d 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -25,8 +25,9 @@ #include <linux/pm_runtime.h> #include <linux/badblocks.h> #include <linux/part_stat.h> -#include "blk-throttle.h" +#include <linux/blktrace_api.h> +#include "blk-throttle.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" @@ -253,7 +254,7 @@ int __register_blkdev(unsigned int major, const char *name, #ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD p->probe = probe; #endif - strlcpy(p->name, name, sizeof(p->name)); + strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); @@ -318,18 +319,6 @@ void blk_free_ext_minor(unsigned int minor) ida_free(&ext_devt_ida, minor); } -static char *bdevt_str(dev_t devt, char *buf) -{ - if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { - char tbuf[BDEVT_SIZE]; - snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); - snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); - } else - snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); - - return buf; -} - void disk_uevent(struct gendisk *disk, enum kobject_action action) { struct block_device *part; @@ -351,7 +340,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode) +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { struct block_device *bdev; int ret = 0; @@ -369,18 +358,20 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) * synchronize with other exclusive openers and other partition * scanners. */ - if (!(mode & FMODE_EXCL)) { - ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions); + if (!(mode & BLK_OPEN_EXCL)) { + ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, + NULL); if (ret) return ret; } set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL); + bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, + NULL); if (IS_ERR(bdev)) ret = PTR_ERR(bdev); else - blkdev_put(bdev, mode & ~FMODE_EXCL); + blkdev_put(bdev, NULL); /* * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, @@ -388,7 +379,7 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) * creat partition for underlying disk. */ clear_bit(GD_NEED_PART_SCAN, &disk->state); - if (!(mode & FMODE_EXCL)) + if (!(mode & BLK_OPEN_EXCL)) bd_abort_claiming(disk->part0, disk_scan_partitions); return ret; } @@ -516,7 +507,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ); + disk_scan_partitions(disk, BLK_OPEN_READ); /* * Announce the disk and partitions after all partitions are @@ -563,6 +554,28 @@ out_exit_elevator: } EXPORT_SYMBOL(device_add_disk); +static void blk_report_disk_dead(struct gendisk *disk) +{ + struct block_device *bdev; + unsigned long idx; + + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, bdev) { + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + continue; + rcu_read_unlock(); + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) + bdev->bd_holder_ops->mark_dead(bdev); + mutex_unlock(&bdev->bd_holder_lock); + + put_device(&bdev->bd_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * blk_mark_disk_dead - mark a disk as dead * @disk: disk to mark as dead @@ -572,13 +585,26 @@ EXPORT_SYMBOL(device_add_disk); */ void blk_mark_disk_dead(struct gendisk *disk) { - set_bit(GD_DEAD, &disk->state); - blk_queue_start_drain(disk->queue); + /* + * Fail any new I/O. + */ + if (test_and_set_bit(GD_DEAD, &disk->state)) + return; + + if (test_bit(GD_OWNS_QUEUE, &disk->state)) + blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* * Stop buffered writers from dirtying pages that can't be written out. */ - set_capacity_and_notify(disk, 0); + set_capacity(disk, 0); + + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(disk->queue); + + blk_report_disk_dead(disk); } EXPORT_SYMBOL_GPL(blk_mark_disk_dead); @@ -604,6 +630,8 @@ EXPORT_SYMBOL_GPL(blk_mark_disk_dead); void del_gendisk(struct gendisk *disk) { struct request_queue *q = disk->queue; + struct block_device *part; + unsigned long idx; might_sleep(); @@ -612,26 +640,27 @@ void del_gendisk(struct gendisk *disk) disk_del_events(disk); + /* + * Prevent new openers by unlinked the bdev inode, and write out + * dirty data before marking the disk dead and stopping all I/O. + */ mutex_lock(&disk->open_mutex); - remove_inode_hash(disk->part0->bd_inode); - blk_drop_partitions(disk); + xa_for_each(&disk->part_tbl, idx, part) { + remove_inode_hash(part->bd_inode); + fsync_bdev(part); + __invalidate_device(part, true); + } mutex_unlock(&disk->open_mutex); - fsync_bdev(disk->part0); - __invalidate_device(disk->part0, true); + blk_mark_disk_dead(disk); /* - * Fail any new I/O. + * Drop all partitions now that the disk is marked dead. */ - set_bit(GD_DEAD, &disk->state); - if (test_bit(GD_OWNS_QUEUE, &disk->state)) - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - set_capacity(disk, 0); - - /* - * Prevent new I/O from crossing bio_queue_enter(). - */ - blk_queue_start_drain(q); + mutex_lock(&disk->open_mutex); + xa_for_each_start(&disk->part_tbl, idx, part, 1) + drop_partition(part); + mutex_unlock(&disk->open_mutex); if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); @@ -755,57 +784,6 @@ void blk_request_module(dev_t devt) } #endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ -/* - * print a full list of all partitions - intended for places where the root - * filesystem can't be mounted and thus to give the victim some idea of what - * went wrong - */ -void __init printk_all_partitions(void) -{ - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct block_device *part; - char devt_buf[BDEVT_SIZE]; - unsigned long idx; - - /* - * Don't show empty devices or things that have been - * suppressed - */ - if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) - continue; - - /* - * Note, unlike /proc/partitions, I am showing the numbers in - * hex - the same format as the root= option takes. - */ - rcu_read_lock(); - xa_for_each(&disk->part_tbl, idx, part) { - if (!bdev_nr_sectors(part)) - continue; - printk("%s%s %10llu %pg %s", - bdev_is_partition(part) ? " " : "", - bdevt_str(part->bd_dev, devt_buf), - bdev_nr_sectors(part) >> 1, part, - part->bd_meta_info ? - part->bd_meta_info->uuid : ""); - if (bdev_is_partition(part)) - printk("\n"); - else if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } - rcu_read_unlock(); - } - class_dev_iter_exit(&iter); -} - #ifdef CONFIG_PROC_FS /* iterator */ static void *disk_seqf_start(struct seq_file *seqf, loff_t *pos) @@ -1171,6 +1149,8 @@ static void disk_release(struct device *dev) might_sleep(); WARN_ON_ONCE(disk_live(disk)); + blk_trace_remove(disk->queue); + /* * To undo the all initialization from blk_mq_init_allocated_queue in * case of a probe failure where add_disk is never called we have to @@ -1339,35 +1319,6 @@ dev_t part_devt(struct gendisk *disk, u8 partno) return devt; } -dev_t blk_lookup_devt(const char *name, int partno) -{ - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - - if (strcmp(dev_name(dev), name)) - continue; - - if (partno < disk->minors) { - /* We need to return the right devno, even - * if the partition doesn't exist yet. - */ - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + partno); - } else { - devt = part_devt(disk, partno); - if (devt) - break; - } - } - class_dev_iter_exit(&iter); - return devt; -} - struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, struct lock_class_key *lkclass) { diff --git a/block/ioctl.c b/block/ioctl.c index 9c5f637ff153..9fcddd847937 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -20,6 +20,8 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition p; long long start, length; + if (disk->flags & GENHD_FL_NO_PART) + return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) @@ -82,7 +84,7 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; @@ -90,7 +92,7 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, struct inode *inode = bdev->bd_inode; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_discard_sectors(bdev)) @@ -120,14 +122,14 @@ fail: return err; } -static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, void __user *argp) { uint64_t start, len; uint64_t range[2]; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (!bdev_max_secure_erase_sectors(bdev)) return -EOPNOTSUPP; @@ -151,7 +153,7 @@ static int blk_ioctl_secure_erase(struct block_device *bdev, fmode_t mode, } -static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, +static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; @@ -159,7 +161,7 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, struct inode *inode = bdev->bd_inode; int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) @@ -240,7 +242,7 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ -int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; @@ -254,13 +256,28 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif -static int blkdev_pr_register(struct block_device *bdev, +static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) +{ + /* no sense to make reservations for partitions */ + if (bdev_is_partition(bdev)) + return false; + + if (capable(CAP_SYS_ADMIN)) + return true; + /* + * Only allow unprivileged reservations if the file descriptor is open + * for writing. + */ + return mode & BLK_OPEN_WRITE; +} + +static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; @@ -272,13 +289,13 @@ static int blkdev_pr_register(struct block_device *bdev, return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } -static int blkdev_pr_reserve(struct block_device *bdev, +static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; @@ -290,13 +307,13 @@ static int blkdev_pr_reserve(struct block_device *bdev, return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } -static int blkdev_pr_release(struct block_device *bdev, +static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; @@ -308,13 +325,13 @@ static int blkdev_pr_release(struct block_device *bdev, return ops->pr_release(bdev, rsv.key, rsv.type); } -static int blkdev_pr_preempt(struct block_device *bdev, +static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; @@ -326,13 +343,13 @@ static int blkdev_pr_preempt(struct block_device *bdev, return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } -static int blkdev_pr_clear(struct block_device *bdev, +static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; @@ -344,8 +361,8 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, + unsigned long arg) { if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -354,8 +371,8 @@ static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, return 0; } -static int blkdev_roset(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_roset(struct block_device *bdev, unsigned cmd, + unsigned long arg) { int ret, n; @@ -439,7 +456,7 @@ static int compat_hdio_getgeo(struct block_device *bdev, #endif /* set the logical block size */ -static int blkdev_bszset(struct block_device *bdev, fmode_t mode, +static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; @@ -451,13 +468,13 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (get_user(n, argp)) return -EFAULT; - if (mode & FMODE_EXCL) + if (mode & BLK_OPEN_EXCL) return set_blocksize(bdev, n); - if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode | FMODE_EXCL, &bdev))) + if (IS_ERR(blkdev_get_by_dev(bdev->bd_dev, mode, &bdev, NULL))) return -EBUSY; ret = set_blocksize(bdev, n); - blkdev_put(bdev, mode | FMODE_EXCL); + blkdev_put(bdev, &bdev); return ret; } @@ -467,7 +484,7 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, +static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg, void __user *argp) { @@ -475,9 +492,9 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKFLSBUF: - return blkdev_flushbuf(bdev, mode, cmd, arg); + return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: - return blkdev_roset(bdev, mode, cmd, arg); + return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: @@ -487,7 +504,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKGETDISKSEQ: return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: - return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: @@ -534,17 +551,17 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: - return blkdev_pr_register(bdev, argp); + return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: - return blkdev_pr_reserve(bdev, argp); + return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: - return blkdev_pr_release(bdev, argp); + return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: - return blkdev_pr_preempt(bdev, argp, false); + return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: - return blkdev_pr_preempt(bdev, argp, true); + return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: - return blkdev_pr_clear(bdev, argp); + return blkdev_pr_clear(bdev, mode, argp); default: return -ENOIOCTLCMD; } @@ -560,18 +577,9 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; - fmode_t mode = file->f_mode; + blk_mode_t mode = file_to_blk_mode(file); int ret; - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; - switch (cmd) { /* These need separate implementations for the data structure */ case HDIO_GETGEO: @@ -630,16 +638,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) void __user *argp = compat_ptr(arg); struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; - fmode_t mode = file->f_mode; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; + blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ diff --git a/block/ioprio.c b/block/ioprio.c index 32a456b45804..b5a942519a79 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -33,7 +33,7 @@ int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); + int level = IOPRIO_PRIO_LEVEL(ioprio); switch (class) { case IOPRIO_CLASS_RT: @@ -49,15 +49,16 @@ int ioprio_check_cap(int ioprio) fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_NR_LEVELS || data < 0) + if (level >= IOPRIO_NR_LEVELS) return -EINVAL; break; case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: - if (data) + if (level) return -EINVAL; break; + case IOPRIO_CLASS_INVALID: default: return -EINVAL; } diff --git a/block/mq-deadline.c b/block/mq-deadline.c index 5839a027e0f0..f958e79277b8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -74,8 +74,8 @@ struct dd_per_prio { struct list_head dispatch; struct rb_root sort_list[DD_DIR_COUNT]; struct list_head fifo_list[DD_DIR_COUNT]; - /* Next request in FIFO order. Read, write or both are NULL. */ - struct request *next_rq[DD_DIR_COUNT]; + /* Position of the most recently dispatched request. */ + sector_t latest_pos[DD_DIR_COUNT]; struct io_stats_per_prio stats; }; @@ -156,6 +156,40 @@ deadline_latter_request(struct request *rq) return NULL; } +/* + * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, + * return the first request after the start of the zone containing @pos. + */ +static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir, sector_t pos) +{ + struct rb_node *node = per_prio->sort_list[data_dir].rb_node; + struct request *rq, *res = NULL; + + if (!node) + return NULL; + + rq = rb_entry_rq(node); + /* + * A zoned write may have been requeued with a starting position that + * is below that of the most recently dispatched request. Hence, for + * zoned writes, start searching from the start of a zone. + */ + if (blk_rq_is_seq_zoned_write(rq)) + pos = round_down(pos, rq->q->limits.chunk_sectors); + + while (node) { + rq = rb_entry_rq(node); + if (blk_rq_pos(rq) >= pos) { + res = rq; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return res; +} + static void deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { @@ -167,11 +201,6 @@ deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) static inline void deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - const enum dd_data_dir data_dir = rq_data_dir(rq); - - if (per_prio->next_rq[data_dir] == rq) - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - elv_rb_del(deadline_rb_root(per_prio, rq), rq); } @@ -251,10 +280,6 @@ static void deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq) { - const enum dd_data_dir data_dir = rq_data_dir(rq); - - per_prio->next_rq[data_dir] = deadline_latter_request(rq); - /* * take it off the sort and fifo list */ @@ -272,21 +297,15 @@ static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) } /* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + * deadline_check_fifo returns true if and only if there are expired requests + * in the FIFO list. Requires !list_empty(&dd->fifo_list[data_dir]). */ -static inline int deadline_check_fifo(struct dd_per_prio *per_prio, - enum dd_data_dir data_dir) +static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; - - return 0; + return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); } /* @@ -310,14 +329,11 @@ static struct request *deadline_skip_seq_writes(struct deadline_data *dd, struct request *rq) { sector_t pos = blk_rq_pos(rq); - sector_t skipped_sectors = 0; - while (rq) { - if (blk_rq_pos(rq) != pos + skipped_sectors) - break; - skipped_sectors += blk_rq_sectors(rq); + do { + pos += blk_rq_sectors(rq); rq = deadline_latter_request(rq); - } + } while (rq && blk_rq_pos(rq) == pos); return rq; } @@ -330,7 +346,7 @@ static struct request * deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, enum dd_data_dir data_dir) { - struct request *rq; + struct request *rq, *rb_rq, *next; unsigned long flags; if (list_empty(&per_prio->fifo_list[data_dir])) @@ -348,7 +364,12 @@ deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) { + list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], + queuelist) { + /* Check whether a prior request exists for the same zone. */ + rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); + if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) + rq = rb_rq; if (blk_req_can_dispatch_to_zone(rq) && (blk_queue_nonrot(rq->q) || !deadline_is_seq_write(dd, rq))) @@ -372,7 +393,8 @@ deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, struct request *rq; unsigned long flags; - rq = per_prio->next_rq[data_dir]; + rq = deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); if (!rq) return NULL; @@ -435,6 +457,7 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, if (started_after(dd, rq, latest_start)) return NULL; list_del_init(&rq->queuelist); + data_dir = rq_data_dir(rq); goto done; } @@ -442,9 +465,11 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd, * batches are currently reads XOR writes */ rq = deadline_next_request(dd, per_prio, dd->last_dir); - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ + if (rq && dd->batching < dd->fifo_batch) { + /* we have a next request and are still entitled to batch */ + data_dir = rq_data_dir(rq); goto dispatch_request; + } /* * at this point we are not running a batch. select the appropriate @@ -522,6 +547,7 @@ dispatch_request: done: ioprio_class = dd_rq_ioclass(rq); prio = ioprio_class_to_prio[ioprio_class]; + dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); dd->per_prio[prio].stats.dispatched++; /* * If the request needs its target zone locked, do it. @@ -620,8 +646,9 @@ static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; - dd->async_depth = max(1UL, 3 * q->nr_requests / 4); + dd->async_depth = max(1U, 3 * (1U << shift) / 4); sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); } @@ -766,7 +793,7 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_insert_t flags) + blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; @@ -775,7 +802,6 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); struct dd_per_prio *per_prio; enum dd_prio prio; - LIST_HEAD(free); lockdep_assert_held(&dd->lock); @@ -792,10 +818,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, rq->elv.priv[0] = (void *)(uintptr_t)1; } - if (blk_mq_sched_try_insert_merge(q, rq, &free)) { - blk_mq_free_requests(&free); + if (blk_mq_sched_try_insert_merge(q, rq, free)) return; - } trace_block_rq_insert(rq); @@ -803,6 +827,8 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, list_add(&rq->queuelist, &per_prio->dispatch); rq->fifo_time = jiffies; } else { + struct list_head *insert_before; + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { @@ -815,7 +841,20 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]); + insert_before = &per_prio->fifo_list[data_dir]; +#ifdef CONFIG_BLK_DEV_ZONED + /* + * Insert zoned writes such that requests are sorted by + * position per zone. + */ + if (blk_rq_is_seq_zoned_write(rq)) { + struct request *rq2 = deadline_latter_request(rq); + + if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) + insert_before = &rq2->queuelist; + } +#endif + list_add_tail(&rq->queuelist, insert_before); } } @@ -828,6 +867,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; + LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { @@ -835,9 +875,11 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, flags); + dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); + + blk_mq_free_requests(&free); } /* Callback from inside blk_mq_rq_ctx_init(). */ @@ -1035,8 +1077,10 @@ static int deadline_##name##_next_rq_show(void *data, \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ - struct request *rq = per_prio->next_rq[data_dir]; \ + struct request *rq; \ \ + rq = deadline_from_pos(per_prio, data_dir, \ + per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 5c8624e26a54..506921095412 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -11,10 +11,18 @@ #define pr_fmt(fmt) fmt #include <linux/types.h> +#include <linux/mm_types.h> +#include <linux/overflow.h> #include <linux/affs_hardblocks.h> #include "check.h" +/* magic offsets in partition DosEnvVec */ +#define NR_HD 3 +#define NR_SECT 5 +#define LO_CYL 9 +#define HI_CYL 10 + static __inline__ u32 checksum_block(__be32 *m, int size) { @@ -31,8 +39,12 @@ int amiga_partition(struct parsed_partitions *state) unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; - int start_sect, nr_sects, blk, part, res = 0; - int blksize = 1; /* Multiplier for disk block size */ + u64 start_sect, nr_sects; + sector_t blk, end_sect; + u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ + u32 nr_hd, nr_sect, lo_cyl, hi_cyl; + int part, res = 0; + unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; for (blk = 0; ; blk++, put_dev_sector(sect)) { @@ -40,7 +52,7 @@ int amiga_partition(struct parsed_partitions *state) goto rdb_done; data = read_part_sector(state, blk, §); if (!data) { - pr_err("Dev %s: unable to read RDB block %d\n", + pr_err("Dev %s: unable to read RDB block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; @@ -57,12 +69,12 @@ int amiga_partition(struct parsed_partitions *state) *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n", + pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } - pr_err("Dev %s: RDB in block %d has bad checksum\n", + pr_err("Dev %s: RDB in block %llu has bad checksum\n", state->disk->disk_name, blk); } @@ -78,11 +90,16 @@ int amiga_partition(struct parsed_partitions *state) } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { - blk *= blksize; /* Read in terms partition table understands */ + for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { + /* Read in terms partition table understands */ + if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { + pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", + state->disk->disk_name, blk, part); + break; + } data = read_part_sector(state, blk, §); if (!data) { - pr_err("Dev %s: unable to read partition block %d\n", + pr_err("Dev %s: unable to read partition block %llu\n", state->disk->disk_name, blk); res = -1; goto rdb_done; @@ -94,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state) if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; - /* Tell Kernel about it */ + /* RDB gives us more than enough rope to hang ourselves with, + * many times over (2^128 bytes if all fields max out). + * Some careful checks are in order, so check for potential + * overflows. + * We are multiplying four 32 bit numbers to one sector_t! + */ + + nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); + nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); + + /* CylBlocks is total number of blocks per cylinder */ + if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { + pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", + state->disk->disk_name, cylblk); + continue; + } + + /* check for consistency with RDB defined CylBlocks */ + if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { + pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", + state->disk->disk_name, cylblk, + be32_to_cpu(rdb->rdb_CylBlocks)); + } + + /* RDB allows for variable logical block size - + * normalize to 512 byte blocks and check result. + */ + + if (check_mul_overflow(cylblk, blksize, &cylblk)) { + pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", + state->disk->disk_name, part); + continue; + } + + /* Calculate partition start and end. Limit of 32 bit on cylblk + * guarantees no overflow occurs if LBD support is enabled. + */ + + lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); + start_sect = ((u64) lo_cyl * cylblk); + + hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); + nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); - nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; if (!nr_sects) continue; - start_sect = be32_to_cpu(pb->pb_Environment[9]) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; + + /* Warn user if partition end overflows u32 (AmigaDOS limit) */ + + if ((start_sect + nr_sects) > UINT_MAX) { + pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", + state->disk->disk_name, part, + start_sect, start_sect + nr_sects); + } + + if (check_add_overflow(start_sect, nr_sects, &end_sect)) { + pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", + state->disk->disk_name, part, + start_sect, end_sect); + continue; + } + + /* Tell Kernel about it */ + put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ diff --git a/block/partitions/core.c b/block/partitions/core.c index 49e0496ff23c..13a7341299a9 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -12,7 +12,7 @@ #include <linux/raid/detect.h> #include "check.h" -static int (*check_part[])(struct parsed_partitions *) = { +static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -228,7 +228,7 @@ static struct attribute *part_attrs[] = { NULL }; -static struct attribute_group part_attr_group = { +static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; @@ -256,31 +256,36 @@ static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) return 0; } -struct device_type part_type = { +const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; -static void delete_partition(struct block_device *part) +void drop_partition(struct block_device *part) { lockdep_assert_held(&part->bd_disk->open_mutex); - fsync_bdev(part); - __invalidate_device(part, true); - xa_erase(&part->bd_disk->part_tbl, part->bd_partno); kobject_put(part->bd_holder_dir); + device_del(&part->bd_device); + put_device(&part->bd_device); +} +static void delete_partition(struct block_device *part) +{ /* * Remove the block device from the inode hash, so that it cannot be * looked up any more even when openers still hold references. */ remove_inode_hash(part->bd_inode); - put_device(&part->bd_device); + fsync_bdev(part); + __invalidate_device(part, true); + + drop_partition(part); } static ssize_t whole_disk_show(struct device *dev, @@ -288,7 +293,7 @@ static ssize_t whole_disk_show(struct device *dev, { return 0; } -static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); +static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* * Must be called either with open_mutex held, before a disk can be opened or @@ -436,10 +441,21 @@ static bool partition_overlaps(struct gendisk *disk, sector_t start, int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, sector_t length) { + sector_t capacity = get_capacity(disk), end; struct block_device *part; int ret; mutex_lock(&disk->open_mutex); + if (check_add_overflow(start, length, &end)) { + ret = -EINVAL; + goto out; + } + + if (start >= capacity || end > capacity) { + ret = -EINVAL; + goto out; + } + if (!disk_live(disk)) { ret = -ENXIO; goto out; @@ -519,17 +535,6 @@ static bool disk_unlock_native_capacity(struct gendisk *disk) return true; } -void blk_drop_partitions(struct gendisk *disk) -{ - struct block_device *part; - unsigned long idx; - - lockdep_assert_held(&disk->open_mutex); - - xa_for_each_start(&disk->part_tbl, idx, part, 1) - delete_partition(part); -} - static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { @@ -646,6 +651,8 @@ out_free_state: int bdev_disk_changed(struct gendisk *disk, bool invalidate) { + struct block_device *part; + unsigned long idx; int ret = 0; lockdep_assert_held(&disk->open_mutex); @@ -658,8 +665,9 @@ rescan: return -EBUSY; sync_blockdev(disk->part0); invalidate_bdev(disk->part0); - blk_drop_partitions(disk); + xa_for_each_start(&disk->part_tbl, idx, part, 1) + delete_partition(part); clear_bit(GD_NEED_PART_SCAN, &disk->state); /* |