diff options
Diffstat (limited to 'block')
-rw-r--r-- | block/Kconfig | 20 | ||||
-rw-r--r-- | block/bdev.c | 258 | ||||
-rw-r--r-- | block/bio-integrity.c | 218 | ||||
-rw-r--r-- | block/bio.c | 53 | ||||
-rw-r--r-- | block/blk-cgroup.c | 9 | ||||
-rw-r--r-- | block/blk-cgroup.h | 3 | ||||
-rw-r--r-- | block/blk-core.c | 36 | ||||
-rw-r--r-- | block/blk-iocost.c | 9 | ||||
-rw-r--r-- | block/blk-map.c | 13 | ||||
-rw-r--r-- | block/blk-merge.c | 6 | ||||
-rw-r--r-- | block/blk-mq-debugfs.c | 18 | ||||
-rw-r--r-- | block/blk-mq-sched.c | 2 | ||||
-rw-r--r-- | block/blk-mq.c | 63 | ||||
-rw-r--r-- | block/blk-rq-qos.h | 2 | ||||
-rw-r--r-- | block/blk-settings.c | 107 | ||||
-rw-r--r-- | block/blk-sysfs.c | 11 | ||||
-rw-r--r-- | block/blk-wbt.c | 17 | ||||
-rw-r--r-- | block/blk-wbt.h | 5 | ||||
-rw-r--r-- | block/blk-zoned.c | 21 | ||||
-rw-r--r-- | block/blk.h | 2 | ||||
-rw-r--r-- | block/fops.c | 23 | ||||
-rw-r--r-- | block/genhd.c | 5 | ||||
-rw-r--r-- | block/ioctl.c | 13 | ||||
-rw-r--r-- | block/ioprio.c | 26 | ||||
-rw-r--r-- | block/opal_proto.h | 1 | ||||
-rw-r--r-- | block/partitions/core.c | 21 | ||||
-rw-r--r-- | block/sed-opal.c | 6 |
27 files changed, 593 insertions, 375 deletions
diff --git a/block/Kconfig b/block/Kconfig index 55ae2286a4de..1de4682d48cc 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10 select CRC_T10DIF select CRC64_ROCKSOFT +config BLK_DEV_WRITE_MOUNTED + bool "Allow writing to mounted block devices" + default y + help + When a block device is mounted, writing to its buffer cache is very + likely going to cause filesystem corruption. It is also rather easy to + crash the kernel in this way since the filesystem has no practical way + of detecting these writes to buffer cache and verifying its metadata + integrity. However there are some setups that need this capability + like running fsck on read-only mounted root device, modifying some + features on mounted ext4 filesystem, and similar. If you say N, the + kernel will prevent processes from writing to block devices that are + mounted by filesystems which provides some more protection from runaway + privileged processes and generally makes it much harder to crash + filesystem drivers. Note however that this does not prevent + underlying device(s) from being modified by other means, e.g. by + directly submitting SCSI commands or through access to lower layers of + storage stack. If in doubt, say Y. The configuration can be overridden + with the bdev_allow_write_mounted boot option. + config BLK_DEV_ZONED bool "Zoned block device support" select MQ_IOSCHED_DEADLINE diff --git a/block/bdev.c b/block/bdev.c index 750aec178b6a..e9f1b12bd75c 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -30,6 +30,9 @@ #include "../fs/internal.h" #include "blk.h" +/* Should we allow writing to mounted block devices? */ +static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); + struct bdev_inode { struct block_device bdev; struct inode vfs_inode; @@ -207,85 +210,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) EXPORT_SYMBOL(sync_blockdev_range); /** - * freeze_bdev - lock a filesystem and force it into a consistent state + * bdev_freeze - lock a filesystem and force it into a consistent state * @bdev: blockdevice to lock * * If a superblock is found on this device, we take the s_umount semaphore * on it to make sure nobody unmounts until the snapshot creation is done. * The reference counter (bd_fsfreeze_count) guarantees that only the last * unfreeze process can unfreeze the frozen filesystem actually when multiple - * freeze requests arrive simultaneously. It counts up in freeze_bdev() and - * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze + * freeze requests arrive simultaneously. It counts up in bdev_freeze() and + * count down in bdev_thaw(). When it becomes 0, thaw_bdev() will unfreeze * actually. + * + * Return: On success zero is returned, negative error code on failure. */ -int freeze_bdev(struct block_device *bdev) +int bdev_freeze(struct block_device *bdev) { - struct super_block *sb; int error = 0; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (++bdev->bd_fsfreeze_count > 1) - goto done; - - sb = get_active_super(bdev); - if (!sb) - goto sync; - if (sb->s_op->freeze_super) - error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); - deactivate_super(sb); - if (error) { - bdev->bd_fsfreeze_count--; - goto done; + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; } - bdev->bd_fsfreeze_sb = sb; -sync: - sync_blockdev(bdev); -done: + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); + } + + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(freeze_bdev); +EXPORT_SYMBOL(bdev_freeze); /** - * thaw_bdev - unlock filesystem + * bdev_thaw - unlock filesystem * @bdev: blockdevice to unlock * - * Unlocks the filesystem and marks it writeable again after freeze_bdev(). + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. */ -int thaw_bdev(struct block_device *bdev) +int bdev_thaw(struct block_device *bdev) { - struct super_block *sb; - int error = -EINVAL; + int error = -EINVAL, nr_freeze; mutex_lock(&bdev->bd_fsfreeze_mutex); - if (!bdev->bd_fsfreeze_count) + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. + */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) goto out; error = 0; - if (--bdev->bd_fsfreeze_count > 0) + if (nr_freeze > 0) goto out; - sb = bdev->bd_fsfreeze_sb; - if (!sb) - goto out; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } - if (sb->s_op->thaw_super) - error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - else - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); if (error) - bdev->bd_fsfreeze_count++; - else - bdev->bd_fsfreeze_sb = NULL; + atomic_inc(&bdev->bd_fsfreeze_count); out: mutex_unlock(&bdev->bd_fsfreeze_mutex); return error; } -EXPORT_SYMBOL(thaw_bdev); +EXPORT_SYMBOL(bdev_thaw); /* * pseudo-fs @@ -729,9 +735,60 @@ void blkdev_put_no_open(struct block_device *bdev) { put_device(&bdev->bd_device); } - + +static bool bdev_writes_blocked(struct block_device *bdev) +{ + return bdev->bd_writers == -1; +} + +static void bdev_block_writes(struct block_device *bdev) +{ + bdev->bd_writers = -1; +} + +static void bdev_unblock_writes(struct block_device *bdev) +{ + bdev->bd_writers = 0; +} + +static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return true; + /* Writes blocked? */ + if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) + return false; + if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) + return false; + return true; +} + +static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Claim exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_block_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers++; +} + +static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Yield exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_unblock_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers--; +} + /** - * blkdev_get_by_dev - open a block device by device number + * bdev_open_by_dev - open a block device by device number * @dev: device number of block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -743,32 +800,46 @@ void blkdev_put_no_open(struct block_device *bdev) * * Use this interface ONLY if you really do not have anything better - i.e. when * you are behind a truly sucky interface and all you are given is a device - * number. Everything else should use blkdev_get_by_path(). + * number. Everything else should use bdev_open_by_path(). * * CONTEXT: * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) { - bool unblock_events = true; + struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), + GFP_KERNEL); struct block_device *bdev; + bool unblock_events = true; struct gendisk *disk; int ret; + if (!handle) + return ERR_PTR(-ENOMEM); + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, MAJOR(dev), MINOR(dev), ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0)); if (ret) - return ERR_PTR(ret); + goto free_handle; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { + ret = -EINVAL; + goto free_handle; + } bdev = blkdev_get_no_open(dev); - if (!bdev) - return ERR_PTR(-ENXIO); + if (!bdev) { + ret = -ENXIO; + goto free_handle; + } disk = bdev->bd_disk; if (holder) { @@ -791,12 +862,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, goto abort_claiming; if (!try_module_get(disk->fops->owner)) goto abort_claiming; + ret = -EBUSY; + if (!bdev_may_open(bdev, mode)) + goto abort_claiming; if (bdev_is_partition(bdev)) ret = blkdev_get_part(bdev, mode); else ret = blkdev_get_whole(bdev, mode); if (ret) goto put_module; + bdev_claim_write_access(bdev, mode); if (holder) { bd_finish_claiming(bdev, holder, hops); @@ -817,7 +892,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder, if (unblock_events) disk_unblock_events(disk); - return bdev; + handle->bdev = bdev; + handle->holder = holder; + handle->mode = mode; + return handle; put_module: module_put(disk->fops->owner); abort_claiming: @@ -827,34 +905,14 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); +free_handle: + kfree(handle); return ERR_PTR(ret); } -EXPORT_SYMBOL(blkdev_get_by_dev); - -struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, - const struct blk_holder_ops *hops) -{ - struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL); - struct block_device *bdev; - - if (!handle) - return ERR_PTR(-ENOMEM); - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (IS_ERR(bdev)) { - kfree(handle); - return ERR_CAST(bdev); - } - handle->bdev = bdev; - handle->holder = holder; - if (holder) - mode |= BLK_OPEN_EXCL; - handle->mode = mode; - return handle; -} EXPORT_SYMBOL(bdev_open_by_dev); /** - * blkdev_get_by_path - open a block device by name + * bdev_open_by_path - open a block device by name * @path: path to the block device to open * @mode: open mode (BLK_OPEN_*) * @holder: exclusive holder identifier @@ -868,29 +926,9 @@ EXPORT_SYMBOL(bdev_open_by_dev); * Might sleep. * * RETURNS: - * Reference to the block_device on success, ERR_PTR(-errno) on failure. + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. */ -struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode, - void *holder, const struct blk_holder_ops *hops) -{ - struct block_device *bdev; - dev_t dev; - int error; - - error = lookup_bdev(path, &dev); - if (error) - return ERR_PTR(error); - - bdev = blkdev_get_by_dev(dev, mode, holder, hops); - if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) { - blkdev_put(bdev, holder); - return ERR_PTR(-EACCES); - } - - return bdev; -} -EXPORT_SYMBOL(blkdev_get_by_path); - struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, void *holder, const struct blk_holder_ops *hops) { @@ -913,8 +951,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, } EXPORT_SYMBOL(bdev_open_by_path); -void blkdev_put(struct block_device *bdev, void *holder) +void bdev_release(struct bdev_handle *handle) { + struct block_device *bdev = handle->bdev; struct gendisk *disk = bdev->bd_disk; /* @@ -928,8 +967,10 @@ void blkdev_put(struct block_device *bdev, void *holder) sync_blockdev(bdev); mutex_lock(&disk->open_mutex); - if (holder) - bd_end_claim(bdev, holder); + bdev_yield_write_access(bdev, handle->mode); + + if (handle->holder) + bd_end_claim(bdev, handle->holder); /* * Trigger event checking and tell drivers to flush MEDIA_CHANGE @@ -946,12 +987,6 @@ void blkdev_put(struct block_device *bdev, void *holder) module_put(disk->fops->owner); blkdev_put_no_open(bdev); -} -EXPORT_SYMBOL(blkdev_put); - -void bdev_release(struct bdev_handle *handle) -{ - blkdev_put(handle->bdev, handle->holder); kfree(handle); } EXPORT_SYMBOL(bdev_release); @@ -1102,3 +1137,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) blkdev_put_no_open(bdev); } + +static int __init setup_bdev_allow_write_mounted(char *str) +{ + if (kstrtobool(str, &bdev_allow_write_mounted)) + pr_warn("Invalid option string for bdev_allow_write_mounted:" + " '%s'\n", str); + return 1; +} +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); diff --git a/block/bio-integrity.c b/block/bio-integrity.c index ec8ac8cf6e1b..c9a16fba58b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); + /* always report as many vecs as asked explicitly, not inline vecs */ + bip->bip_max_vcnt = nr_vecs; if (nr_vecs > inline_vecs) { - bip->bip_max_vcnt = nr_vecs; bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; } else { bip->bip_vec = bip->bip_inline_vecs; - bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; @@ -91,6 +91,47 @@ err: } EXPORT_SYMBOL(bio_integrity_alloc); +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, + bool dirty) +{ + int i; + + for (i = 0; i < nr_vecs; i++) { + if (dirty && !PageCompound(bv[i].bv_page)) + set_page_dirty_lock(bv[i].bv_page); + unpin_user_page(bv[i].bv_page); + } +} + +static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) +{ + unsigned short nr_vecs = bip->bip_max_vcnt - 1; + struct bio_vec *copy = &bip->bip_vec[1]; + size_t bytes = bip->bip_iter.bi_size; + struct iov_iter iter; + int ret; + + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + WARN_ON_ONCE(ret != bytes); + + bio_integrity_unpin_bvec(copy, nr_vecs, true); +} + +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) +{ + bool dirty = bio_data_dir(bip->bip_bio) == READ; + + if (bip->bip_flags & BIP_COPY_USER) { + if (dirty) + bio_integrity_uncopy_user(bip); + kfree(bvec_virt(bip->bip_vec)); + return; + } + + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio) if (bip->bip_flags & BIP_BLOCK_INTEGRITY) kfree(bvec_virt(bip->bip_vec)); + else if (bip->bip_flags & BIP_INTEGRITY_USER) + bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page, } EXPORT_SYMBOL(bio_integrity_add_page); +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, + unsigned int direction, u32 seed) +{ + bool write = direction == ITER_SOURCE; + struct bio_integrity_payload *bip; + struct iov_iter iter; + void *buf; + int ret; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (write) { + iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); + if (!copy_from_iter_full(buf, len, &iter)) { + ret = -EFAULT; + goto free_buf; + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + } else { + memset(buf, 0, len); + + /* + * We need to preserve the original bvec and the number of vecs + * in it for completion handling + */ + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1); + } + + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto free_buf; + } + + if (write) + bio_integrity_unpin_bvec(bvec, nr_vecs, false); + else + memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); + + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret != len) { + ret = -ENOMEM; + goto free_bip; + } + + bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; + bip->bip_iter.bi_sector = seed; + return 0; +free_bip: + bio_integrity_free(bio); +free_buf: + kfree(buf); + return ret; +} + +static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, u32 seed) +{ + struct bio_integrity_payload *bip; + + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); + bip->bip_flags |= BIP_INTEGRITY_USER; + bip->bip_iter.bi_sector = seed; + bip->bip_iter.bi_size = len; + return 0; +} + +static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, + int nr_vecs, ssize_t bytes, ssize_t offset) +{ + unsigned int nr_bvecs = 0; + int i, j; + + for (i = 0; i < nr_vecs; i = j) { + size_t size = min_t(size_t, bytes, PAGE_SIZE - offset); + struct folio *folio = page_folio(pages[i]); + + bytes -= size; + for (j = i + 1; j < nr_vecs; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) + break; + unpin_user_page(pages[j]); + size += next; + bytes -= next; + } + + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); + offset = 0; + nr_bvecs++; + } + + return nr_bvecs; +} + +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, + u32 seed) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; + struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + unsigned int direction, nr_bvecs; + struct iov_iter iter; + int ret, nr_vecs; + size_t offset; + bool copy; + + if (bio_integrity(bio)) + return -EINVAL; + if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) + return -E2BIG; + + if (bio_data_dir(bio) == READ) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + + iov_iter_ubuf(&iter, direction, ubuf, bytes); + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + if (nr_vecs > BIO_MAX_VECS) + return -E2BIG; + if (nr_vecs > UIO_FASTIOV) { + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + return -ENOMEM; + pages = NULL; + } + + copy = !iov_iter_is_aligned(&iter, align, align); + ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + if (unlikely(ret < 0)) + goto free_bvec; + + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + if (pages != stack_pages) + kvfree(pages); + if (nr_bvecs > queue_max_integrity_segments(q)) + copy = true; + + if (copy) + ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, + direction, seed); + else + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + if (ret) + goto release_pages; + if (bvec != stack_vec) + kfree(bvec); + + return 0; + +release_pages: + bio_integrity_unpin_bvec(bvec, nr_bvecs, false); +free_bvec: + if (bvec != stack_vec) + kfree(bvec); + return ret; +} +EXPORT_SYMBOL_GPL(bio_integrity_map_user); + /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for diff --git a/block/bio.c b/block/bio.c index 816d412c06e9..b9642a41f286 100644 --- a/block/bio.c +++ b/block/bio.c @@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, if ((addr1 | mask) != (addr2 | mask)) return false; - if (bv->bv_len + len > queue_max_segment_size(q)) + if (len > queue_max_segment_size(q) - bv->bv_len) return false; return bvec_try_merge_page(bv, page, len, offset, same_page); } @@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { + unsigned int max_size = max_sectors << SECTOR_SHIFT; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors) + len = min3(len, max_size, queue_max_segment_size(q)); + if (len > max_size - bio->bi_iter.bi_size) return 0; if (bio->bi_vcnt > 0) { @@ -1145,13 +1148,22 @@ EXPORT_SYMBOL(bio_add_folio); void __bio_release_pages(struct bio *bio, bool mark_dirty) { - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (mark_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - bio_release_page(bio, bvec->bv_page); + bio_for_each_folio_all(fi, bio) { + struct page *page; + size_t done = 0; + + if (mark_dirty) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); + } + page = folio_page(fi.folio, fi.offset / PAGE_SIZE); + do { + bio_release_page(bio, page++); + done += PAGE_SIZE; + } while (done < fi.length); } } EXPORT_SYMBOL_GPL(__bio_release_pages); @@ -1439,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages); * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * - * The problem is that we cannot run set_page_dirty() from interrupt context + * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. If not, redirty them * in process context. * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. @@ -1466,12 +1472,12 @@ EXPORT_SYMBOL(bio_free_pages); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); + bio_for_each_folio_all(fi, bio) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); } } EXPORT_SYMBOL_GPL(bio_set_pages_dirty); @@ -1515,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; + struct folio_iter fi; unsigned long flags; - struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + bio_for_each_folio_all(fi, bio) { + if (!folio_test_dirty(fi.folio)) goto defer; } diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 4b48c2c44098..ff93c385ba5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. */ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) @@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg) static void blkg_destroy_all(struct gendisk *disk) { struct request_queue *q = disk->queue; - struct blkcg_gq *blkg, *n; + struct blkcg_gq *blkg; int count = BLKG_DESTROY_BATCH_SIZE; int i; restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; if (hlist_unhashed(&blkg->blkcg_node)) @@ -2064,6 +2064,9 @@ void bio_associate_blkg(struct bio *bio) { struct cgroup_subsys_state *css; + if (blk_op_is_passthrough(bio->bi_opf)) + return; + rcu_read_lock(); if (bio->bi_blkg) diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h index fd482439afbc..b927a4a0ad03 100644 --- a/block/blk-cgroup.h +++ b/block/blk-cgroup.h @@ -252,7 +252,8 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, if (blkcg == &blkcg_root) return q->root_blkg; - blkg = rcu_dereference(blkcg->blkg_hint); + blkg = rcu_dereference_check(blkcg->blkg_hint, + lockdep_is_held(&q->queue_lock)); if (blkg && blkg->q == q) return blkg; diff --git a/block/blk-core.c b/block/blk-core.c index 2eca76ccf4ee..de771093b526 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -49,6 +49,7 @@ #include "blk-pm.h" #include "blk-cgroup.h" #include "blk-throttle.h" +#include "blk-ioprio.h" struct dentry *blk_debugfs_root; @@ -772,6 +773,15 @@ void submit_bio_noacct(struct bio *bio) bio_clear_polled(bio); switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + break; + case REQ_OP_FLUSH: + /* + * REQ_OP_FLUSH can't be submitted through bios, it is only + * synthetized in struct request by the flush state machine. + */ + goto not_supported; case REQ_OP_DISCARD: if (!bdev_max_discard_sectors(bdev)) goto not_supported; @@ -785,6 +795,10 @@ void submit_bio_noacct(struct bio *bio) if (status != BLK_STS_OK) goto end_io; break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: @@ -796,12 +810,15 @@ void submit_bio_noacct(struct bio *bio) if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) goto not_supported; break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) - goto not_supported; - break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Driver private operations are only used with passthrough + * requests. + */ + fallthrough; default: - break; + goto not_supported; } if (blk_throtl_bio(bio)) @@ -817,6 +834,14 @@ end_io: } EXPORT_SYMBOL(submit_bio_noacct); +static void bio_set_ioprio(struct bio *bio) +{ + /* Nobody set ioprio so far? Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); +} + /** * submit_bio - submit a bio to the block device layer for I/O * @bio: The &struct bio which describes the I/O @@ -839,6 +864,7 @@ void submit_bio(struct bio *bio) count_vm_events(PGPGOUT, bio_sectors(bio)); } + bio_set_ioprio(bio); submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 089fcb9cfce3..04d44f0bcbc8 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period; + u64 __maybe_unused last_period, cur_period; u64 vtime, vtarget; int i; @@ -1353,6 +1353,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) lockdep_assert_held(&iocg->waitq.lock); + /* + * If the delay is set by another CPU, we may be in the past. No need to + * change anything if so. This avoids decay calculation underflow. + */ + if (time_before64(now->now, iocg->delay_at)) + return false; + /* calculate the current delay in effect - 1/2 every second */ tdelta = now->now - iocg->delay_at; if (iocg->delay) diff --git a/block/blk-map.c b/block/blk-map.c index 8584babf3ea0..71210cdb3442 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -205,12 +205,19 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, /* * success */ - if ((iov_iter_rw(iter) == WRITE && - (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { + if (iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) { ret = bio_copy_from_iter(bio, iter); if (ret) goto cleanup; + } else if (map_data && map_data->from_user) { + struct iov_iter iter2 = *iter; + + /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */ + iter2.data_source = ITER_SOURCE; + ret = bio_copy_from_iter(bio, &iter2); + if (ret) + goto cleanup; } else { if (bmd->is_our_pages) zero_fill_bio(bio); diff --git a/block/blk-merge.c b/block/blk-merge.c index 65e75efa9bd3..2d470cf2173e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio, *nsegs = 1; - /* Zero-sector (unknown) and one-sector granularities are the same. */ granularity = max(lim->discard_granularity >> 9, 1U); max_discard_sectors = min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; - - if (unlikely(!max_discard_sectors)) { - /* XXX: warn */ + if (unlikely(!max_discard_sectors)) return NULL; - } if (bio_sectors(bio) <= max_discard_sectors) return NULL; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index 5cbeb9344f2f..94668e72ab09 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -479,23 +479,6 @@ out: return res; } -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; @@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index 67c95f31b15b..451a2c1f1f32 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. diff --git a/block/blk-mq.c b/block/blk-mq.c index ac18f802c027..2dc01551e27c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -40,7 +40,6 @@ #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" -#include "blk-ioprio.h" static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); @@ -772,11 +771,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio, /* * Partial zone append completions cannot be supported as the * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. */ - if (bio->bi_iter.bi_size != nbytes) + if (bio->bi_iter.bi_size != nbytes) { bio->bi_status = BLK_STS_IOERR; - else + nbytes = bio->bi_iter.bi_size; + } else { bio->bi_iter.bi_sector = rq->__sector; + } } bio_advance(bio, nbytes); @@ -1248,7 +1252,8 @@ void blk_mq_start_request(struct request *rq) trace_block_rq_issue(rq); - if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && + !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = ktime_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; @@ -1859,6 +1864,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, __add_wait_queue(wq, wait); /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag. + * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. @@ -2890,8 +2911,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q, return NULL; } -/* return true if this @rq can be used for @bio */ -static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used. + */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, struct bio *bio) { enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); @@ -2919,14 +2943,6 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug, return true; } -static void bio_set_ioprio(struct bio *bio) -{ - /* Nobody set ioprio so far? Initialize it based on task's nice value */ - if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) - bio->bi_ioprio = get_current_ioprio(); - blkcg_set_ioprio(bio); -} - /** * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. @@ -2951,13 +2967,6 @@ void blk_mq_submit_bio(struct bio *bio) blk_status_t ret; bio = blk_queue_bounce(bio, q); - if (bio_may_exceed_limits(bio, &q->limits)) { - bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); - if (!bio) - return; - } - - bio_set_ioprio(bio); if (plug) { rq = rq_list_peek(&plug->cached_rq); @@ -2965,16 +2974,26 @@ void blk_mq_submit_bio(struct bio *bio) rq = NULL; } if (rq) { + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } if (!bio_integrity_prep(bio)) return; if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) return; - if (blk_mq_can_use_cached_rq(rq, plug, bio)) + if (blk_mq_use_cached_rq(rq, plug, bio)) goto done; percpu_ref_get(&q->q_usage_counter); } else { if (unlikely(bio_queue_enter(bio))) return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto fail; + } if (!bio_integrity_prep(bio)) goto fail; } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index f48ee150d667..37245c97ee61 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } diff --git a/block/blk-settings.c b/block/blk-settings.c index 0046b447268f..06ea91e51b8b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -48,7 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; lim->max_secure_erase_sectors = 0; - lim->discard_granularity = 0; + lim->discard_granularity = 512; lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; @@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim) lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->zoned = BLK_ZONED_NONE; + lim->zoned = false; lim->zone_write_granularity = 0; lim->dma_alignment = 511; } @@ -127,8 +127,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if ((max_hw_sectors << 9) < PAGE_SIZE) { max_hw_sectors = 1 << (PAGE_SHIFT - 9); - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_hw_sectors); + pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors); } max_hw_sectors = round_down(max_hw_sectors, @@ -140,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if (limits->max_user_sectors) max_sectors = min(max_sectors, limits->max_user_sectors); else - max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS); + max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP); max_sectors = round_down(max_sectors, limits->logical_block_size >> SECTOR_SHIFT); @@ -248,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments { if (!max_segments) { max_segments = 1; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_segments); + pr_info("%s: set to minimum %u\n", __func__, max_segments); } q->limits.max_segments = max_segments; @@ -285,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { if (max_size < PAGE_SIZE) { max_size = PAGE_SIZE; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_size); + pr_info("%s: set to minimum %u\n", __func__, max_size); } /* see blk_queue_virt_boundary() for the explanation */ @@ -312,6 +309,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) limits->logical_block_size = size; + if (limits->discard_granularity < limits->logical_block_size) + limits->discard_granularity = limits->logical_block_size; + if (limits->physical_block_size < size) limits->physical_block_size = size; @@ -342,6 +342,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) if (q->limits.physical_block_size < q->limits.logical_block_size) q->limits.physical_block_size = q->limits.logical_block_size; + if (q->limits.discard_granularity < q->limits.physical_block_size) + q->limits.discard_granularity = q->limits.physical_block_size; + if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size; } @@ -740,8 +743,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { if (mask < PAGE_SIZE - 1) { mask = PAGE_SIZE - 1; - printk(KERN_INFO "%s: set to minimum %lx\n", - __func__, mask); + pr_info("%s: set to minimum %lx\n", __func__, mask); } q->limits.seg_boundary_mask = mask; @@ -841,8 +843,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); - - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); @@ -884,81 +884,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); -static bool disk_has_partitions(struct gendisk *disk) -{ - unsigned long idx; - struct block_device *part; - bool ret = false; - - rcu_read_lock(); - xa_for_each(&disk->part_tbl, idx, part) { - if (bdev_is_partition(part)) { - ret = true; - break; - } - } - rcu_read_unlock(); - - return ret; -} - /** - * disk_set_zoned - configure the zoned model for a disk - * @disk: the gendisk of the queue to configure - * @model: the zoned model to set - * - * Set the zoned model of @disk to @model. - * - * When @model is BLK_ZONED_HM (host managed), this should be called only - * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option). - * If @model specifies BLK_ZONED_HA (host aware), the effective model used - * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions - * on the disk. + * disk_set_zoned - inidicate a zoned device + * @disk: gendisk to configure */ -void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model) +void disk_set_zoned(struct gendisk *disk) { struct request_queue *q = disk->queue; - unsigned int old_model = q->limits.zoned; - - switch (model) { - case BLK_ZONED_HM: - /* - * Host managed devices are supported only if - * CONFIG_BLK_DEV_ZONED is enabled. - */ - WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); - break; - case BLK_ZONED_HA: - /* - * Host aware devices can be treated either as regular block - * devices (similar to drive managed devices) or as zoned block - * devices to take advantage of the zone command set, similarly - * to host managed devices. We try the latter if there are no - * partitions and zoned block device support is enabled, else - * we do nothing special as far as the block layer is concerned. - */ - if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) || - disk_has_partitions(disk)) - model = BLK_ZONED_NONE; - break; - case BLK_ZONED_NONE: - default: - if (WARN_ON_ONCE(model != BLK_ZONED_NONE)) - model = BLK_ZONED_NONE; - break; - } - q->limits.zoned = model; - if (model != BLK_ZONED_NONE) { - /* - * Set the zone write granularity to the device logical block - * size by default. The driver can change this value if needed. - */ - blk_queue_zone_write_granularity(q, - queue_logical_block_size(q)); - } else if (old_model != BLK_ZONED_NONE) { - disk_clear_zone_settings(disk); - } + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); + + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. + */ + q->limits.zoned = true; + blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); } EXPORT_SYMBOL_GPL(disk_set_zoned); diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 0b2d04766324..6b2429cad81a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) if (max_sectors_kb == 0) { q->limits.max_user_sectors = 0; max_sectors_kb = min(max_hw_sectors_kb, - BLK_DEF_MAX_SECTORS >> 1); + BLK_DEF_MAX_SECTORS_CAP >> 1); } else { if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) @@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); static ssize_t queue_zoned_show(struct request_queue *q, char *page) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - return sprintf(page, "host-aware\n"); - case BLK_ZONED_HM: + if (blk_queue_is_zoned(q)) return sprintf(page, "host-managed\n"); - default: - return sprintf(page, "none\n"); - } + return sprintf(page, "none\n"); } static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0bb613139bec..0c0e270a8265 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -84,8 +84,6 @@ struct rq_wb { u64 sync_issue; void *sync_cookie; - unsigned int wc; - unsigned long last_issue; /* last non-throttled issue */ unsigned long last_comp; /* last non-throttled comp */ unsigned long min_lat_nsec; @@ -165,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, @@ -207,7 +205,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, */ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (rwb->wc && !wb_recent_wait(rwb)) + else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) - RQWB(rqos)->wc = write_cache_on; -} - /* * Enable wbt if defaults are configured that way */ @@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk) rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags); rwb->rq_depth.default_depth = RWB_DEF_DEPTH; rwb->min_lat_nsec = wbt_default_latency_nsec(q); rwb->rq_depth.queue_depth = blk_queue_depth(q); diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 8a029e138f7a..e5fc653b9b76 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); bool wbt_disabled(struct request_queue *); -void wbt_set_write_cache(struct request_queue *, bool); - u64 wbt_default_latency_nsec(struct request_queue *); #else @@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk) static inline void wbt_enable_default(struct gendisk *disk) { } -static inline void wbt_set_write_cache(struct request_queue *q, bool wc) -{ -} #endif /* CONFIG_BLK_WBT */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 619ee41a51cc..d343e5756a9c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -498,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!args->seq_zones_wlock) { args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -506,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENOMEM; } break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); @@ -615,22 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk, return ret; } EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones); - -void disk_clear_zone_settings(struct gendisk *disk) -{ - struct request_queue *q = disk->queue; - - blk_mq_freeze_queue(q); - - disk_free_zone_bitmaps(disk); - blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q); - q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE; - disk->nr_zones = 0; - disk->max_open_zones = 0; - disk->max_active_zones = 0; - q->limits.chunk_sectors = 0; - q->limits.zone_write_granularity = 0; - q->limits.max_zone_append_sectors = 0; - - blk_mq_unfreeze_queue(q); -} diff --git a/block/blk.h b/block/blk.h index 08a358bc0919..1ef920f72e0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio, #ifdef CONFIG_BLK_DEV_ZONED void disk_free_zone_bitmaps(struct gendisk *disk); -void disk_clear_zone_settings(struct gendisk *disk); int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg); int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg); #else /* CONFIG_BLK_DEV_ZONED */ static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} -static inline void disk_clear_zone_settings(struct gendisk *disk) {} static inline int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, unsigned long arg) { diff --git a/block/fops.c b/block/fops.c index 0abaac705daf..0cf8cf72cdfa 100644 --- a/block/fops.c +++ b/block/fops.c @@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock, return 0; } -static int blkdev_writepage(struct page *page, struct writeback_control *wbc) +/* + * We cannot call mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. + */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - return block_write_full_page(page, blkdev_get_block, wbc); + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; } static int blkdev_read_folio(struct file *file, struct folio *folio) @@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = { .invalidate_folio = block_invalidate_folio, .read_folio = blkdev_read_folio, .readahead = blkdev_readahead, - .writepage = blkdev_writepage, + .writepages = blkdev_writepages, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, .migrate_folio = buffer_migrate_folio_norefs, @@ -500,7 +515,7 @@ const struct address_space_operations def_blk_aops = { .readahead = blkdev_readahead, .writepages = blkdev_writepages, .is_partially_uptodate = iomap_is_partially_uptodate, - .error_remove_page = generic_error_remove_page, + .error_remove_folio = generic_error_remove_folio, .migrate_folio = filemap_migrate_folio, }; #endif /* CONFIG_BUFFER_HEAD */ diff --git a/block/genhd.c b/block/genhd.c index c9d06f72c587..d74fb5b4ae68 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -432,7 +432,9 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, DISK_MAX_PARTS); disk->minors = DISK_MAX_PARTS; } - if (disk->first_minor + disk->minors > MINORMASK + 1) + if (disk->first_minor > MINORMASK || + disk->minors > MINORMASK + 1 || + disk->first_minor + disk->minors > MINORMASK + 1) goto out_exit_elevator; } else { if (WARN_ON(disk->minors)) @@ -542,6 +544,7 @@ out_put_holder_dir: kobject_put(disk->part0->bd_holder_dir); out_del_block_link: sysfs_remove_link(block_depr, dev_name(ddev)); + pm_runtime_set_memalloc_noio(ddev, false); out_device_del: device_del(ddev); out_free_ext_minor: diff --git a/block/ioctl.c b/block/ioctl.c index 4160f4e6bd5b..438f79c564cf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -18,10 +18,8 @@ static int blkpg_do_ioctl(struct block_device *bdev, { struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; - long long start, length; + sector_t start, length; - if (disk->flags & GENHD_FL_NO_PART) - return -EINVAL; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) @@ -35,14 +33,17 @@ static int blkpg_do_ioctl(struct block_device *bdev, if (op == BLKPG_DEL_PARTITION) return bdev_del_partition(disk, p.pno); + if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + return -EINVAL; + /* Check that the partition is aligned to the block size */ + if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) + return -EINVAL; + start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; switch (op) { case BLKPG_ADD_PARTITION: - /* check if partition is aligned to blocksize */ - if (p.start & (bdev_logical_block_size(bdev) - 1)) - return -EINVAL; return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: return bdev_resize_partition(disk, p.pno, start, length); diff --git a/block/ioprio.c b/block/ioprio.c index b5a942519a79..73301a261429 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -139,32 +139,6 @@ out: return ret; } -/* - * If the task has set an I/O priority, use that. Otherwise, return - * the default I/O priority. - * - * Expected to be called for current task or with task_lock() held to keep - * io_context stable. - */ -int __get_task_ioprio(struct task_struct *p) -{ - struct io_context *ioc = p->io_context; - int prio; - - if (p != current) - lockdep_assert_held(&p->alloc_lock); - if (ioc) - prio = ioc->ioprio; - else - prio = IOPRIO_DEFAULT; - - if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE) - prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p), - task_nice_ioprio(p)); - return prio; -} -EXPORT_SYMBOL_GPL(__get_task_ioprio); - static int get_task_ioprio(struct task_struct *p) { int ret; diff --git a/block/opal_proto.h b/block/opal_proto.h index dec7ce3a3edb..d247a457bf6e 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -71,6 +71,7 @@ enum opal_response_token { #define SHORT_ATOM_BYTE 0xBF #define MEDIUM_ATOM_BYTE 0xDF #define LONG_ATOM_BYTE 0xE3 +#define EMPTY_ATOM_BYTE 0xFF #define OPAL_INVAL_PARAM 12 #define OPAL_MANUFACTURED_INACTIVE 0x08 diff --git a/block/partitions/core.c b/block/partitions/core.c index f47ffcfdfcec..5f5ed5c75f04 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno, * Partitions are not supported on zoned block devices that are used as * such. */ - switch (disk->queue->limits.zoned) { - case BLK_ZONED_HM: + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); - case BLK_ZONED_HA: - pr_info("%s: disabling host aware zoned block device support due to partitions\n", - disk->disk_name); - disk_set_zoned(disk, BLK_ZONED_NONE); - break; - case BLK_ZONED_NONE: - break; } if (xa_load(&disk->part_tbl, partno)) @@ -447,6 +439,11 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, goto out; } + if (disk->flags & GENHD_FL_NO_PART) { + ret = -EINVAL; + goto out; + } + if (partition_overlaps(disk, start, length, -1)) { ret = -EBUSY; goto out; @@ -570,8 +567,8 @@ static bool blk_add_partition(struct gendisk *disk, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } @@ -613,7 +610,7 @@ static int blk_add_partitions(struct gendisk *disk) /* * Partitions are not supported on host managed zoned block devices. */ - if (disk->queue->limits.zoned == BLK_ZONED_HM) { + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; diff --git a/block/sed-opal.c b/block/sed-opal.c index 3d9e9cd250bd..fa4dba5d8531 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -1056,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length, token_length = response_parse_medium(iter, pos); else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ token_length = response_parse_long(iter, pos); + else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */ + token_length = 1; else /* TOKEN */ token_length = response_parse_token(iter, pos); if (token_length < 0) return token_length; + if (pos[0] != EMPTY_ATOM_BYTE) + num_entries++; + pos += token_length; total -= token_length; iter++; - num_entries++; } resp->num = num_entries; |