Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig            |  20
-rw-r--r--  block/bdev.c             | 258
-rw-r--r--  block/bio-integrity.c    | 218
-rw-r--r--  block/bio.c              |  53
-rw-r--r--  block/blk-cgroup.c       |   9
-rw-r--r--  block/blk-cgroup.h       |   3
-rw-r--r--  block/blk-core.c         |  36
-rw-r--r--  block/blk-iocost.c       |   9
-rw-r--r--  block/blk-map.c          |  13
-rw-r--r--  block/blk-merge.c        |   6
-rw-r--r--  block/blk-mq-debugfs.c   |  18
-rw-r--r--  block/blk-mq-sched.c     |   2
-rw-r--r--  block/blk-mq.c           |  63
-rw-r--r--  block/blk-rq-qos.h       |   2
-rw-r--r--  block/blk-settings.c     | 107
-rw-r--r--  block/blk-sysfs.c        |  11
-rw-r--r--  block/blk-wbt.c          |  17
-rw-r--r--  block/blk-wbt.h          |   5
-rw-r--r--  block/blk-zoned.c        |  21
-rw-r--r--  block/blk.h              |   2
-rw-r--r--  block/fops.c             |  23
-rw-r--r--  block/genhd.c            |   5
-rw-r--r--  block/ioctl.c            |  13
-rw-r--r--  block/ioprio.c           |  26
-rw-r--r--  block/opal_proto.h       |   1
-rw-r--r--  block/partitions/core.c  |  21
-rw-r--r--  block/sed-opal.c         |   6
27 files changed, 593 insertions, 375 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 55ae2286a4de..1de4682d48cc 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -78,6 +78,26 @@ config BLK_DEV_INTEGRITY_T10
select CRC_T10DIF
select CRC64_ROCKSOFT
+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache is very
+ likely going to cause filesystem corruption. It is also rather easy to
+ crash the kernel in this way since the filesystem has no practical way
+ of detecting these writes to buffer cache and verifying its metadata
+ integrity. However there are some setups that need this capability
+ like running fsck on read-only mounted root device, modifying some
+ features on mounted ext4 filesystem, and similar. If you say N, the
+ kernel will prevent processes from writing to block devices that are
+	  mounted by filesystems which provide some more protection from runaway
+ privileged processes and generally makes it much harder to crash
+ filesystem drivers. Note however that this does not prevent
+ underlying device(s) from being modified by other means, e.g. by
+ directly submitting SCSI commands or through access to lower layers of
+ storage stack. If in doubt, say Y. The configuration can be overridden
+ with the bdev_allow_write_mounted boot option.
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
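
A usage note, not part of the patch: the Kconfig default can be flipped at boot
time through the __setup() handler added to block/bdev.c further down; the value
is parsed with kstrtobool(), so the usual boolean spellings work. For example,
booting with

    bdev_allow_write_mounted=0

blocks writes to mounted block devices even on a kernel built with
CONFIG_BLK_DEV_WRITE_MOUNTED=y, while bdev_allow_write_mounted=1 re-enables them
on a kernel built with the option disabled.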
diff --git a/block/bdev.c b/block/bdev.c
index 750aec178b6a..e9f1b12bd75c 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -30,6 +30,9 @@
#include "../fs/internal.h"
#include "blk.h"
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
struct bdev_inode {
struct block_device bdev;
struct inode vfs_inode;
@@ -207,85 +210,88 @@ int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
EXPORT_SYMBOL(sync_blockdev_range);
/**
- * freeze_bdev - lock a filesystem and force it into a consistent state
+ * bdev_freeze - lock a filesystem and force it into a consistent state
* @bdev: blockdevice to lock
*
* If a superblock is found on this device, we take the s_umount semaphore
* on it to make sure nobody unmounts until the snapshot creation is done.
* The reference counter (bd_fsfreeze_count) guarantees that only the last
* unfreeze process can unfreeze the frozen filesystem actually when multiple
- * freeze requests arrive simultaneously. It counts up in freeze_bdev() and
- * count down in thaw_bdev(). When it becomes 0, thaw_bdev() will unfreeze
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * count down in bdev_thaw(). When it becomes 0, bdev_thaw() will unfreeze
* actually.
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int freeze_bdev(struct block_device *bdev)
+int bdev_freeze(struct block_device *bdev)
{
- struct super_block *sb;
int error = 0;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (++bdev->bd_fsfreeze_count > 1)
- goto done;
-
- sb = get_active_super(bdev);
- if (!sb)
- goto sync;
- if (sb->s_op->freeze_super)
- error = sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = freeze_super(sb, FREEZE_HOLDER_USERSPACE);
- deactivate_super(sb);
- if (error) {
- bdev->bd_fsfreeze_count--;
- goto done;
+ if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return 0;
}
- bdev->bd_fsfreeze_sb = sb;
-sync:
- sync_blockdev(bdev);
-done:
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
+ error = bdev->bd_holder_ops->freeze(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ error = sync_blockdev(bdev);
+ }
+
+ if (error)
+ atomic_dec(&bdev->bd_fsfreeze_count);
+
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(freeze_bdev);
+EXPORT_SYMBOL(bdev_freeze);
/**
- * thaw_bdev - unlock filesystem
+ * bdev_thaw - unlock filesystem
* @bdev: blockdevice to unlock
*
- * Unlocks the filesystem and marks it writeable again after freeze_bdev().
+ * Unlocks the filesystem and marks it writeable again after bdev_freeze().
+ *
+ * Return: On success zero is returned, negative error code on failure.
*/
-int thaw_bdev(struct block_device *bdev)
+int bdev_thaw(struct block_device *bdev)
{
- struct super_block *sb;
- int error = -EINVAL;
+ int error = -EINVAL, nr_freeze;
mutex_lock(&bdev->bd_fsfreeze_mutex);
- if (!bdev->bd_fsfreeze_count)
+
+ /*
+ * If this returns < 0 it means that @bd_fsfreeze_count was
+ * already 0 and no decrement was performed.
+ */
+ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
+ if (nr_freeze < 0)
goto out;
error = 0;
- if (--bdev->bd_fsfreeze_count > 0)
+ if (nr_freeze > 0)
goto out;
- sb = bdev->bd_fsfreeze_sb;
- if (!sb)
- goto out;
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
+ error = bdev->bd_holder_ops->thaw(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ }
- if (sb->s_op->thaw_super)
- error = sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE);
- else
- error = thaw_super(sb, FREEZE_HOLDER_USERSPACE);
if (error)
- bdev->bd_fsfreeze_count++;
- else
- bdev->bd_fsfreeze_sb = NULL;
+ atomic_inc(&bdev->bd_fsfreeze_count);
out:
mutex_unlock(&bdev->bd_fsfreeze_mutex);
return error;
}
-EXPORT_SYMBOL(thaw_bdev);
+EXPORT_SYMBOL(bdev_thaw);
/*
* pseudo-fs
@@ -729,9 +735,60 @@ void blkdev_put_no_open(struct block_device *bdev)
{
put_device(&bdev->bd_device);
}
-
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = 0;
+}
+
+static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return true;
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+ return false;
+ return true;
+}
+
+static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Claim exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+}
+
+static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Yield exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+}
+
/**
- * blkdev_get_by_dev - open a block device by device number
+ * bdev_open_by_dev - open a block device by device number
* @dev: device number of block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -743,32 +800,46 @@ void blkdev_put_no_open(struct block_device *bdev)
*
* Use this interface ONLY if you really do not have anything better - i.e. when
* you are behind a truly sucky interface and all you are given is a device
- * number. Everything else should use blkdev_get_by_path().
+ * number. Everything else should use bdev_open_by_path().
*
* CONTEXT:
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
{
- bool unblock_events = true;
+ struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+ GFP_KERNEL);
struct block_device *bdev;
+ bool unblock_events = true;
struct gendisk *disk;
int ret;
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
MAJOR(dev), MINOR(dev),
((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
if (ret)
- return ERR_PTR(ret);
+ goto free_handle;
+
+ /* Blocking writes requires exclusive opener */
+ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
+ ret = -EINVAL;
+ goto free_handle;
+ }
bdev = blkdev_get_no_open(dev);
- if (!bdev)
- return ERR_PTR(-ENXIO);
+ if (!bdev) {
+ ret = -ENXIO;
+ goto free_handle;
+ }
disk = bdev->bd_disk;
if (holder) {
@@ -791,12 +862,16 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
goto abort_claiming;
if (!try_module_get(disk->fops->owner))
goto abort_claiming;
+ ret = -EBUSY;
+ if (!bdev_may_open(bdev, mode))
+ goto abort_claiming;
if (bdev_is_partition(bdev))
ret = blkdev_get_part(bdev, mode);
else
ret = blkdev_get_whole(bdev, mode);
if (ret)
goto put_module;
+ bdev_claim_write_access(bdev, mode);
if (holder) {
bd_finish_claiming(bdev, holder, hops);
@@ -817,7 +892,10 @@ struct block_device *blkdev_get_by_dev(dev_t dev, blk_mode_t mode, void *holder,
if (unblock_events)
disk_unblock_events(disk);
- return bdev;
+ handle->bdev = bdev;
+ handle->holder = holder;
+ handle->mode = mode;
+ return handle;
put_module:
module_put(disk->fops->owner);
abort_claiming:
@@ -827,34 +905,14 @@ abort_claiming:
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
+free_handle:
+ kfree(handle);
return ERR_PTR(ret);
}
-EXPORT_SYMBOL(blkdev_get_by_dev);
-
-struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
- const struct blk_holder_ops *hops)
-{
- struct bdev_handle *handle = kmalloc(sizeof(*handle), GFP_KERNEL);
- struct block_device *bdev;
-
- if (!handle)
- return ERR_PTR(-ENOMEM);
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (IS_ERR(bdev)) {
- kfree(handle);
- return ERR_CAST(bdev);
- }
- handle->bdev = bdev;
- handle->holder = holder;
- if (holder)
- mode |= BLK_OPEN_EXCL;
- handle->mode = mode;
- return handle;
-}
EXPORT_SYMBOL(bdev_open_by_dev);
/**
- * blkdev_get_by_path - open a block device by name
+ * bdev_open_by_path - open a block device by name
* @path: path to the block device to open
* @mode: open mode (BLK_OPEN_*)
* @holder: exclusive holder identifier
@@ -868,29 +926,9 @@ EXPORT_SYMBOL(bdev_open_by_dev);
* Might sleep.
*
* RETURNS:
- * Reference to the block_device on success, ERR_PTR(-errno) on failure.
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
*/
-struct block_device *blkdev_get_by_path(const char *path, blk_mode_t mode,
- void *holder, const struct blk_holder_ops *hops)
-{
- struct block_device *bdev;
- dev_t dev;
- int error;
-
- error = lookup_bdev(path, &dev);
- if (error)
- return ERR_PTR(error);
-
- bdev = blkdev_get_by_dev(dev, mode, holder, hops);
- if (!IS_ERR(bdev) && (mode & BLK_OPEN_WRITE) && bdev_read_only(bdev)) {
- blkdev_put(bdev, holder);
- return ERR_PTR(-EACCES);
- }
-
- return bdev;
-}
-EXPORT_SYMBOL(blkdev_get_by_path);
-
struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
void *holder, const struct blk_holder_ops *hops)
{
@@ -913,8 +951,9 @@ struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
}
EXPORT_SYMBOL(bdev_open_by_path);
-void blkdev_put(struct block_device *bdev, void *holder)
+void bdev_release(struct bdev_handle *handle)
{
+ struct block_device *bdev = handle->bdev;
struct gendisk *disk = bdev->bd_disk;
/*
@@ -928,8 +967,10 @@ void blkdev_put(struct block_device *bdev, void *holder)
sync_blockdev(bdev);
mutex_lock(&disk->open_mutex);
- if (holder)
- bd_end_claim(bdev, holder);
+ bdev_yield_write_access(bdev, handle->mode);
+
+ if (handle->holder)
+ bd_end_claim(bdev, handle->holder);
/*
* Trigger event checking and tell drivers to flush MEDIA_CHANGE
@@ -946,12 +987,6 @@ void blkdev_put(struct block_device *bdev, void *holder)
module_put(disk->fops->owner);
blkdev_put_no_open(bdev);
-}
-EXPORT_SYMBOL(blkdev_put);
-
-void bdev_release(struct bdev_handle *handle)
-{
- blkdev_put(handle->bdev, handle->holder);
kfree(handle);
}
EXPORT_SYMBOL(bdev_release);
@@ -1102,3 +1137,12 @@ void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
blkdev_put_no_open(bdev);
}
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
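
A minimal sketch, not from this patch, of how a caller adapts to the handle-based
open API and the renamed freeze helpers; the dev_t source, the open mode, the NULL
holder and the function name are assumptions for illustration only:

#include <linux/blkdev.h>

/* Illustrative only: open by device number, freeze/thaw, then release. */
static int example_quiesce(dev_t dev)
{
	struct bdev_handle *handle;
	int ret;

	/* was: bdev = blkdev_get_by_dev(dev, mode, holder, hops); */
	handle = bdev_open_by_dev(dev, BLK_OPEN_READ | BLK_OPEN_WRITE,
				  NULL, NULL);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ret = bdev_freeze(handle->bdev);	/* counted: only the first call freezes */
	if (!ret)
		ret = bdev_thaw(handle->bdev);	/* last matching thaw actually unfreezes */

	bdev_release(handle);			/* was: blkdev_put(bdev, holder); also frees handle */
	return ret;
}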
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index ec8ac8cf6e1b..c9a16fba58b9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -69,15 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
memset(bip, 0, sizeof(*bip));
+ /* always report as many vecs as asked explicitly, not inline vecs */
+ bip->bip_max_vcnt = nr_vecs;
if (nr_vecs > inline_vecs) {
- bip->bip_max_vcnt = nr_vecs;
bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
&bip->bip_max_vcnt, gfp_mask);
if (!bip->bip_vec)
goto err;
} else {
bip->bip_vec = bip->bip_inline_vecs;
- bip->bip_max_vcnt = inline_vecs;
}
bip->bip_bio = bio;
@@ -91,6 +91,47 @@ err:
}
EXPORT_SYMBOL(bio_integrity_alloc);
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
+ bool dirty)
+{
+ int i;
+
+ for (i = 0; i < nr_vecs; i++) {
+ if (dirty && !PageCompound(bv[i].bv_page))
+ set_page_dirty_lock(bv[i].bv_page);
+ unpin_user_page(bv[i].bv_page);
+ }
+}
+
+static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
+{
+ unsigned short nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *copy = &bip->bip_vec[1];
+ size_t bytes = bip->bip_iter.bi_size;
+ struct iov_iter iter;
+ int ret;
+
+ iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ WARN_ON_ONCE(ret != bytes);
+
+ bio_integrity_unpin_bvec(copy, nr_vecs, true);
+}
+
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+ bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+ if (bip->bip_flags & BIP_COPY_USER) {
+ if (dirty)
+ bio_integrity_uncopy_user(bip);
+ kfree(bvec_virt(bip->bip_vec));
+ return;
+ }
+
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+}
+
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
@@ -105,6 +146,8 @@ void bio_integrity_free(struct bio *bio)
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
kfree(bvec_virt(bip->bip_vec));
+ else if (bip->bip_flags & BIP_INTEGRITY_USER)
+ bio_integrity_unmap_user(bip);
__bio_integrity_free(bs, bip);
bio->bi_integrity = NULL;
@@ -160,6 +203,177 @@ int bio_integrity_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_integrity_add_page);
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len,
+ unsigned int direction, u32 seed)
+{
+ bool write = direction == ITER_SOURCE;
+ struct bio_integrity_payload *bip;
+ struct iov_iter iter;
+ void *buf;
+ int ret;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (write) {
+ iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+ if (!copy_from_iter_full(buf, len, &iter)) {
+ ret = -EFAULT;
+ goto free_buf;
+ }
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+ } else {
+ memset(buf, 0, len);
+
+ /*
+ * We need to preserve the original bvec and the number of vecs
+ * in it for completion handling
+ */
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+ }
+
+ if (IS_ERR(bip)) {
+ ret = PTR_ERR(bip);
+ goto free_buf;
+ }
+
+ if (write)
+ bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ else
+ memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf));
+ if (ret != len) {
+ ret = -ENOMEM;
+ goto free_bip;
+ }
+
+ bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+ bip->bip_iter.bi_sector = seed;
+ return 0;
+free_bip:
+ bio_integrity_free(bio);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len, u32 seed)
+{
+ struct bio_integrity_payload *bip;
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
+ bip->bip_flags |= BIP_INTEGRITY_USER;
+ bip->bip_iter.bi_sector = seed;
+ bip->bip_iter.bi_size = len;
+ return 0;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+ int nr_vecs, ssize_t bytes, ssize_t offset)
+{
+ unsigned int nr_bvecs = 0;
+ int i, j;
+
+ for (i = 0; i < nr_vecs; i = j) {
+ size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+ struct folio *folio = page_folio(pages[i]);
+
+ bytes -= size;
+ for (j = i + 1; j < nr_vecs; j++) {
+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+ if (page_folio(pages[j]) != folio ||
+ pages[j] != pages[j - 1] + 1)
+ break;
+ unpin_user_page(pages[j]);
+ size += next;
+ bytes -= next;
+ }
+
+ bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+ offset = 0;
+ nr_bvecs++;
+ }
+
+ return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+ u32 seed)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+ struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+ struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ unsigned int direction, nr_bvecs;
+ struct iov_iter iter;
+ int ret, nr_vecs;
+ size_t offset;
+ bool copy;
+
+ if (bio_integrity(bio))
+ return -EINVAL;
+ if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+ return -E2BIG;
+
+ if (bio_data_dir(bio) == READ)
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ if (nr_vecs > BIO_MAX_VECS)
+ return -E2BIG;
+ if (nr_vecs > UIO_FASTIOV) {
+ bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ return -ENOMEM;
+ pages = NULL;
+ }
+
+ copy = !iov_iter_is_aligned(&iter, align, align);
+ ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ if (unlikely(ret < 0))
+ goto free_bvec;
+
+ nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+ if (pages != stack_pages)
+ kvfree(pages);
+ if (nr_bvecs > queue_max_integrity_segments(q))
+ copy = true;
+
+ if (copy)
+ ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+ direction, seed);
+ else
+ ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+ if (ret)
+ goto release_pages;
+ if (bvec != stack_vec)
+ kfree(bvec);
+
+ return 0;
+
+release_pages:
+ bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+free_bvec:
+ if (bvec != stack_vec)
+ kfree(bvec);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
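
A hedged sketch of how a passthrough-style submitter might use the newly exported
bio_integrity_map_user(); the bio is assumed to already have its data pages mapped
and bi_bdev set, and the wrapper name and parameters are illustrative:

/* Illustrative only: attach user-space PI metadata to a prepared bio. */
static int example_attach_user_pi(struct bio *bio, void __user *meta,
				  ssize_t meta_len, u32 seed)
{
	int ret;

	ret = bio_integrity_map_user(bio, meta, meta_len, seed);
	if (ret)
		return ret;	/* caller unwinds its own data mapping */

	/*
	 * On completion bio_integrity_free() sees BIP_INTEGRITY_USER (and
	 * possibly BIP_COPY_USER) and unpins, or copies back, the user pages.
	 */
	return 0;
}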
diff --git a/block/bio.c b/block/bio.c
index 816d412c06e9..b9642a41f286 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -944,7 +944,7 @@ bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
if ((addr1 | mask) != (addr2 | mask))
return false;
- if (bv->bv_len + len > queue_max_segment_size(q))
+ if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
return bvec_try_merge_page(bv, page, len, offset, same_page);
}
@@ -966,10 +966,13 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page)
{
+ unsigned int max_size = max_sectors << SECTOR_SHIFT;
+
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> SECTOR_SHIFT) > max_sectors)
+ len = min3(len, max_size, queue_max_segment_size(q));
+ if (len > max_size - bio->bi_iter.bi_size)
return 0;
if (bio->bi_vcnt > 0) {
@@ -1145,13 +1148,22 @@ EXPORT_SYMBOL(bio_add_folio);
void __bio_release_pages(struct bio *bio, bool mark_dirty)
{
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (mark_dirty && !PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
- bio_release_page(bio, bvec->bv_page);
+ bio_for_each_folio_all(fi, bio) {
+ struct page *page;
+ size_t done = 0;
+
+ if (mark_dirty) {
+ folio_lock(fi.folio);
+ folio_mark_dirty(fi.folio);
+ folio_unlock(fi.folio);
+ }
+ page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+ do {
+ bio_release_page(bio, page++);
+ done += PAGE_SIZE;
+ } while (done < fi.length);
}
}
EXPORT_SYMBOL_GPL(__bio_release_pages);
@@ -1439,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages);
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
* for performing direct-IO in BIOs.
*
- * The problem is that we cannot run set_page_dirty() from interrupt context
+ * The problem is that we cannot run folio_mark_dirty() from interrupt context
* because the required locks are not interrupt-safe. So what we can do is to
* mark the pages dirty _before_ performing IO. And in interrupt context,
* check that the pages are still dirty. If so, fine. If not, redirty them
* in process context.
*
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
* Note that this code is very hard to test under normal circumstances because
* direct-io pins the pages with get_user_pages(). This makes
* is_page_cache_freeable return false, and the VM will not clean the pages.
@@ -1466,12 +1472,12 @@ EXPORT_SYMBOL(bio_free_pages);
*/
void bio_set_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (!PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
+ bio_for_each_folio_all(fi, bio) {
+ folio_lock(fi.folio);
+ folio_mark_dirty(fi.folio);
+ folio_unlock(fi.folio);
}
}
EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
@@ -1515,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work)
void bio_check_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec;
+ struct folio_iter fi;
unsigned long flags;
- struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+ bio_for_each_folio_all(fi, bio) {
+ if (!folio_test_dirty(fi.folio))
goto defer;
}
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 4b48c2c44098..ff93c385ba5a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -300,7 +300,7 @@ static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
* @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use
*
- * Allocate a new blkg assocating @blkcg and @q.
+ * Allocate a new blkg associating @blkcg and @disk.
*/
static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask)
@@ -575,13 +575,13 @@ static void blkg_destroy(struct blkcg_gq *blkg)
static void blkg_destroy_all(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- struct blkcg_gq *blkg, *n;
+ struct blkcg_gq *blkg;
int count = BLKG_DESTROY_BATCH_SIZE;
int i;
restart:
spin_lock_irq(&q->queue_lock);
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
struct blkcg *blkcg = blkg->blkcg;
if (hlist_unhashed(&blkg->blkcg_node))
@@ -2064,6 +2064,9 @@ void bio_associate_blkg(struct bio *bio)
{
struct cgroup_subsys_state *css;
+ if (blk_op_is_passthrough(bio->bi_opf))
+ return;
+
rcu_read_lock();
if (bio->bi_blkg)
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
index fd482439afbc..b927a4a0ad03 100644
--- a/block/blk-cgroup.h
+++ b/block/blk-cgroup.h
@@ -252,7 +252,8 @@ static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
if (blkcg == &blkcg_root)
return q->root_blkg;
- blkg = rcu_dereference(blkcg->blkg_hint);
+ blkg = rcu_dereference_check(blkcg->blkg_hint,
+ lockdep_is_held(&q->queue_lock));
if (blkg && blkg->q == q)
return blkg;
diff --git a/block/blk-core.c b/block/blk-core.c
index 2eca76ccf4ee..de771093b526 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -49,6 +49,7 @@
#include "blk-pm.h"
#include "blk-cgroup.h"
#include "blk-throttle.h"
+#include "blk-ioprio.h"
struct dentry *blk_debugfs_root;
@@ -772,6 +773,15 @@ void submit_bio_noacct(struct bio *bio)
bio_clear_polled(bio);
switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ break;
+ case REQ_OP_FLUSH:
+ /*
+ * REQ_OP_FLUSH can't be submitted through bios, it is only
+		 * synthesized in struct request by the flush state machine.
+ */
+ goto not_supported;
case REQ_OP_DISCARD:
if (!bdev_max_discard_sectors(bdev))
goto not_supported;
@@ -785,6 +795,10 @@ void submit_bio_noacct(struct bio *bio)
if (status != BLK_STS_OK)
goto end_io;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!q->limits.max_write_zeroes_sectors)
+ goto not_supported;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
@@ -796,12 +810,15 @@ void submit_bio_noacct(struct bio *bio)
if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
- case REQ_OP_WRITE_ZEROES:
- if (!q->limits.max_write_zeroes_sectors)
- goto not_supported;
- break;
+ case REQ_OP_DRV_IN:
+ case REQ_OP_DRV_OUT:
+ /*
+ * Driver private operations are only used with passthrough
+ * requests.
+ */
+ fallthrough;
default:
- break;
+ goto not_supported;
}
if (blk_throtl_bio(bio))
@@ -817,6 +834,14 @@ end_io:
}
EXPORT_SYMBOL(submit_bio_noacct);
+static void bio_set_ioprio(struct bio *bio)
+{
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
+}
+
/**
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
@@ -839,6 +864,7 @@ void submit_bio(struct bio *bio)
count_vm_events(PGPGOUT, bio_sectors(bio));
}
+ bio_set_ioprio(bio);
submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
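
Since bio_set_ioprio() now runs for every bio in submit_bio() instead of only in
blk_mq_submit_bio(), a submitter that wants a specific priority simply sets
bi_ioprio before submitting; a minimal sketch (the function name and priority
level are assumptions):

/* Illustrative only: an explicit class keeps bio_set_ioprio() from
 * overwriting it with the task's I/O context value. */
static void example_submit_best_effort(struct bio *bio)
{
	bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4);
	submit_bio(bio);
}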
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 089fcb9cfce3..04d44f0bcbc8 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -1261,7 +1261,7 @@ static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period;
+ u64 __maybe_unused last_period, cur_period;
u64 vtime, vtarget;
int i;
@@ -1353,6 +1353,13 @@ static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
lockdep_assert_held(&iocg->waitq.lock);
+ /*
+ * If the delay is set by another CPU, we may be in the past. No need to
+ * change anything if so. This avoids decay calculation underflow.
+ */
+ if (time_before64(now->now, iocg->delay_at))
+ return false;
+
/* calculate the current delay in effect - 1/2 every second */
tdelta = now->now - iocg->delay_at;
if (iocg->delay)
diff --git a/block/blk-map.c b/block/blk-map.c
index 8584babf3ea0..71210cdb3442 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -205,12 +205,19 @@ static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
/*
* success
*/
- if ((iov_iter_rw(iter) == WRITE &&
- (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
+ if (iov_iter_rw(iter) == WRITE &&
+ (!map_data || !map_data->null_mapped)) {
ret = bio_copy_from_iter(bio, iter);
if (ret)
goto cleanup;
+ } else if (map_data && map_data->from_user) {
+ struct iov_iter iter2 = *iter;
+
+ /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */
+ iter2.data_source = ITER_SOURCE;
+ ret = bio_copy_from_iter(bio, &iter2);
+ if (ret)
+ goto cleanup;
} else {
if (bmd->is_our_pages)
zero_fill_bio(bio);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 65e75efa9bd3..2d470cf2173e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -115,17 +115,13 @@ static struct bio *bio_split_discard(struct bio *bio,
*nsegs = 1;
- /* Zero-sector (unknown) and one-sector granularities are the same. */
granularity = max(lim->discard_granularity >> 9, 1U);
max_discard_sectors =
min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
-
- if (unlikely(!max_discard_sectors)) {
- /* XXX: warn */
+ if (unlikely(!max_discard_sectors))
return NULL;
- }
if (bio_sectors(bio) <= max_discard_sectors)
return NULL;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 5cbeb9344f2f..94668e72ab09 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -479,23 +479,6 @@ out:
return res;
}
-static int hctx_run_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->run);
- return 0;
-}
-
-static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->run = 0;
- return count;
-}
-
static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
@@ -624,7 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
- {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
{"type", 0400, hctx_type_show},
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 67c95f31b15b..451a2c1f1f32 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -324,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
return;
- hctx->run++;
-
/*
* A return of -EAGAIN is an indication that hctx->dispatch is not
* empty and we must run again in order to avoid starving flushes.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ac18f802c027..2dc01551e27c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -40,7 +40,6 @@
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-#include "blk-ioprio.h"
static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
@@ -772,11 +771,16 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
/*
* Partial zone append completions cannot be supported as the
* BIO fragments may end up not being written sequentially.
+ * For such case, force the completed nbytes to be equal to
+ * the BIO size so that bio_advance() sets the BIO remaining
+ * size to 0 and we end up calling bio_endio() before returning.
*/
- if (bio->bi_iter.bi_size != nbytes)
+ if (bio->bi_iter.bi_size != nbytes) {
bio->bi_status = BLK_STS_IOERR;
- else
+ nbytes = bio->bi_iter.bi_size;
+ } else {
bio->bi_iter.bi_sector = rq->__sector;
+ }
}
bio_advance(bio, nbytes);
@@ -1248,7 +1252,8 @@ void blk_mq_start_request(struct request *rq)
trace_block_rq_issue(rq);
- if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
+ !blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
@@ -1859,6 +1864,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
__add_wait_queue(wq, wait);
/*
+ * Add one explicit barrier since blk_mq_get_driver_tag() may
+ * not imply barrier in case of failure.
+ *
+ * Order adding us to wait queue and allocating driver tag.
+ *
+ * The pair is the one implied in sbitmap_queue_wake_up() which
+ * orders clearing sbitmap tag bits and waitqueue_active() in
+ * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+ *
+ * Otherwise, re-order of adding wait queue and getting driver tag
+ * may cause __sbitmap_queue_wake_up() to wake up nothing because
+ * the waitqueue_active() may not observe us in wait queue.
+ */
+ smp_mb();
+
+ /*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
@@ -2890,8 +2911,11 @@ static struct request *blk_mq_get_new_requests(struct request_queue *q,
return NULL;
}
-/* return true if this @rq can be used for @bio */
-static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
+/*
+ * Check if we can use the passed on request for submitting the passed in bio,
+ * and remove it from the request list if it can be used.
+ */
+static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
struct bio *bio)
{
enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
@@ -2919,14 +2943,6 @@ static bool blk_mq_can_use_cached_rq(struct request *rq, struct blk_plug *plug,
return true;
}
-static void bio_set_ioprio(struct bio *bio)
-{
- /* Nobody set ioprio so far? Initialize it based on task's nice value */
- if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
- bio->bi_ioprio = get_current_ioprio();
- blkcg_set_ioprio(bio);
-}
-
/**
* blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
@@ -2951,13 +2967,6 @@ void blk_mq_submit_bio(struct bio *bio)
blk_status_t ret;
bio = blk_queue_bounce(bio, q);
- if (bio_may_exceed_limits(bio, &q->limits)) {
- bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
- if (!bio)
- return;
- }
-
- bio_set_ioprio(bio);
if (plug) {
rq = rq_list_peek(&plug->cached_rq);
@@ -2965,16 +2974,26 @@ void blk_mq_submit_bio(struct bio *bio)
rq = NULL;
}
if (rq) {
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ return;
+ }
if (!bio_integrity_prep(bio))
return;
if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
return;
- if (blk_mq_can_use_cached_rq(rq, plug, bio))
+ if (blk_mq_use_cached_rq(rq, plug, bio))
goto done;
percpu_ref_get(&q->q_usage_counter);
} else {
if (unlikely(bio_queue_enter(bio)))
return;
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto fail;
+ }
if (!bio_integrity_prep(bio))
goto fail;
}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index f48ee150d667..37245c97ee61 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -118,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 0046b447268f..06ea91e51b8b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -48,7 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->max_secure_erase_sectors = 0;
- lim->discard_granularity = 0;
+ lim->discard_granularity = 512;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
@@ -56,7 +56,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
- lim->zoned = BLK_ZONED_NONE;
+ lim->zoned = false;
lim->zone_write_granularity = 0;
lim->dma_alignment = 511;
}
@@ -127,8 +127,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
if ((max_hw_sectors << 9) < PAGE_SIZE) {
max_hw_sectors = 1 << (PAGE_SHIFT - 9);
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_hw_sectors);
+ pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
}
max_hw_sectors = round_down(max_hw_sectors,
@@ -140,7 +139,7 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
if (limits->max_user_sectors)
max_sectors = min(max_sectors, limits->max_user_sectors);
else
- max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS);
+ max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
max_sectors = round_down(max_sectors,
limits->logical_block_size >> SECTOR_SHIFT);
@@ -248,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments
{
if (!max_segments) {
max_segments = 1;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_segments);
+ pr_info("%s: set to minimum %u\n", __func__, max_segments);
}
q->limits.max_segments = max_segments;
@@ -285,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_SIZE) {
max_size = PAGE_SIZE;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_size);
+ pr_info("%s: set to minimum %u\n", __func__, max_size);
}
/* see blk_queue_virt_boundary() for the explanation */
@@ -312,6 +309,9 @@ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
limits->logical_block_size = size;
+ if (limits->discard_granularity < limits->logical_block_size)
+ limits->discard_granularity = limits->logical_block_size;
+
if (limits->physical_block_size < size)
limits->physical_block_size = size;
@@ -342,6 +342,9 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size;
+ if (q->limits.discard_granularity < q->limits.physical_block_size)
+ q->limits.discard_granularity = q->limits.physical_block_size;
+
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
@@ -740,8 +743,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_SIZE - 1) {
mask = PAGE_SIZE - 1;
- printk(KERN_INFO "%s: set to minimum %lx\n",
- __func__, mask);
+ pr_info("%s: set to minimum %lx\n", __func__, mask);
}
q->limits.seg_boundary_mask = mask;
@@ -841,8 +843,6 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
else
blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
-
- wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
@@ -884,81 +884,22 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
-static bool disk_has_partitions(struct gendisk *disk)
-{
- unsigned long idx;
- struct block_device *part;
- bool ret = false;
-
- rcu_read_lock();
- xa_for_each(&disk->part_tbl, idx, part) {
- if (bdev_is_partition(part)) {
- ret = true;
- break;
- }
- }
- rcu_read_unlock();
-
- return ret;
-}
-
/**
- * disk_set_zoned - configure the zoned model for a disk
- * @disk: the gendisk of the queue to configure
- * @model: the zoned model to set
- *
- * Set the zoned model of @disk to @model.
- *
- * When @model is BLK_ZONED_HM (host managed), this should be called only
- * if zoned block device support is enabled (CONFIG_BLK_DEV_ZONED option).
- * If @model specifies BLK_ZONED_HA (host aware), the effective model used
- * depends on CONFIG_BLK_DEV_ZONED settings and on the existence of partitions
- * on the disk.
+ * disk_set_zoned - indicate a zoned device
+ * @disk: gendisk to configure
*/
-void disk_set_zoned(struct gendisk *disk, enum blk_zoned_model model)
+void disk_set_zoned(struct gendisk *disk)
{
struct request_queue *q = disk->queue;
- unsigned int old_model = q->limits.zoned;
-
- switch (model) {
- case BLK_ZONED_HM:
- /*
- * Host managed devices are supported only if
- * CONFIG_BLK_DEV_ZONED is enabled.
- */
- WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
- break;
- case BLK_ZONED_HA:
- /*
- * Host aware devices can be treated either as regular block
- * devices (similar to drive managed devices) or as zoned block
- * devices to take advantage of the zone command set, similarly
- * to host managed devices. We try the latter if there are no
- * partitions and zoned block device support is enabled, else
- * we do nothing special as far as the block layer is concerned.
- */
- if (!IS_ENABLED(CONFIG_BLK_DEV_ZONED) ||
- disk_has_partitions(disk))
- model = BLK_ZONED_NONE;
- break;
- case BLK_ZONED_NONE:
- default:
- if (WARN_ON_ONCE(model != BLK_ZONED_NONE))
- model = BLK_ZONED_NONE;
- break;
- }
- q->limits.zoned = model;
- if (model != BLK_ZONED_NONE) {
- /*
- * Set the zone write granularity to the device logical block
- * size by default. The driver can change this value if needed.
- */
- blk_queue_zone_write_granularity(q,
- queue_logical_block_size(q));
- } else if (old_model != BLK_ZONED_NONE) {
- disk_clear_zone_settings(disk);
- }
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
+
+ /*
+ * Set the zone write granularity to the device logical block
+ * size by default. The driver can change this value if needed.
+ */
+ q->limits.zoned = true;
+ blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
}
EXPORT_SYMBOL_GPL(disk_set_zoned);
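
For drivers the call site shrinks accordingly; a hedged before/after sketch under
the assumption of a host-managed device (the probe helper name is made up):

static void example_probe_zones(struct gendisk *disk)
{
	/* before: disk_set_zoned(disk, BLK_ZONED_HM); host-aware could fall back */
	disk_set_zoned(disk);	/* after: limits.zoned = true, host-managed only;
				 * zone write granularity defaults to the
				 * logical block size */
}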
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 0b2d04766324..6b2429cad81a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -241,7 +241,7 @@ queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
if (max_sectors_kb == 0) {
q->limits.max_user_sectors = 0;
max_sectors_kb = min(max_hw_sectors_kb,
- BLK_DEF_MAX_SECTORS >> 1);
+ BLK_DEF_MAX_SECTORS_CAP >> 1);
} else {
if (max_sectors_kb > max_hw_sectors_kb ||
max_sectors_kb < page_kb)
@@ -309,14 +309,9 @@ QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
{
- switch (blk_queue_zoned_model(q)) {
- case BLK_ZONED_HA:
- return sprintf(page, "host-aware\n");
- case BLK_ZONED_HM:
+ if (blk_queue_is_zoned(q))
return sprintf(page, "host-managed\n");
- default:
- return sprintf(page, "none\n");
- }
+ return sprintf(page, "none\n");
}
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0bb613139bec..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -84,8 +84,6 @@ struct rq_wb {
u64 sync_issue;
void *sync_cookie;
- unsigned int wc;
-
unsigned long last_issue; /* last non-throttled issue */
unsigned long last_comp; /* last non-throttled comp */
unsigned long min_lat_nsec;
@@ -165,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.disk->bdi->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
@@ -207,7 +205,8 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
*/
if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background;
- else if (rwb->wc && !wb_recent_wait(rwb))
+ else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
+ !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -699,13 +698,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos)
- RQWB(rqos)->wc = write_cache_on;
-}
-
/*
* Enable wbt if defaults are configured that way
*/
@@ -918,7 +910,6 @@ int wbt_init(struct gendisk *disk)
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
rwb->rq_depth.queue_depth = blk_queue_depth(q);
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 8a029e138f7a..e5fc653b9b76 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -12,8 +12,6 @@ u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
bool wbt_disabled(struct request_queue *);
-void wbt_set_write_cache(struct request_queue *, bool);
-
u64 wbt_default_latency_nsec(struct request_queue *);
#else
@@ -24,9 +22,6 @@ static inline void wbt_disable_default(struct gendisk *disk)
static inline void wbt_enable_default(struct gendisk *disk)
{
}
-static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
-{
-}
#endif /* CONFIG_BLK_WBT */
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 619ee41a51cc..d343e5756a9c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -498,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
set_bit(idx, args->conv_zones_bitmap);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
if (!args->seq_zones_wlock) {
args->seq_zones_wlock =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
@@ -506,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENOMEM;
}
break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
@@ -615,22 +615,3 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
-void disk_clear_zone_settings(struct gendisk *disk)
-{
- struct request_queue *q = disk->queue;
-
- blk_mq_freeze_queue(q);
-
- disk_free_zone_bitmaps(disk);
- blk_queue_flag_clear(QUEUE_FLAG_ZONE_RESETALL, q);
- q->required_elevator_features &= ~ELEVATOR_F_ZBD_SEQ_WRITE;
- disk->nr_zones = 0;
- disk->max_open_zones = 0;
- disk->max_active_zones = 0;
- q->limits.chunk_sectors = 0;
- q->limits.zone_write_granularity = 0;
- q->limits.max_zone_append_sectors = 0;
-
- blk_mq_unfreeze_queue(q);
-}
diff --git a/block/blk.h b/block/blk.h
index 08a358bc0919..1ef920f72e0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -395,14 +395,12 @@ static inline struct bio *blk_queue_bounce(struct bio *bio,
#ifdef CONFIG_BLK_DEV_ZONED
void disk_free_zone_bitmaps(struct gendisk *disk);
-void disk_clear_zone_settings(struct gendisk *disk);
int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
unsigned long arg);
int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg);
#else /* CONFIG_BLK_DEV_ZONED */
static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
-static inline void disk_clear_zone_settings(struct gendisk *disk) {}
static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
unsigned int cmd, unsigned long arg)
{
diff --git a/block/fops.c b/block/fops.c
index 0abaac705daf..0cf8cf72cdfa 100644
--- a/block/fops.c
+++ b/block/fops.c
@@ -410,9 +410,24 @@ static int blkdev_get_block(struct inode *inode, sector_t iblock,
return 0;
}
-static int blkdev_writepage(struct page *page, struct writeback_control *wbc)
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly which holds the buffer
+ * lock. The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
{
- return block_write_full_page(page, blkdev_get_block, wbc);
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ err = write_cache_pages(mapping, wbc, block_write_full_folio,
+ blkdev_get_block);
+ blk_finish_plug(&plug);
+
+ return err;
}
static int blkdev_read_folio(struct file *file, struct folio *folio)
@@ -449,7 +464,7 @@ const struct address_space_operations def_blk_aops = {
.invalidate_folio = block_invalidate_folio,
.read_folio = blkdev_read_folio,
.readahead = blkdev_readahead,
- .writepage = blkdev_writepage,
+ .writepages = blkdev_writepages,
.write_begin = blkdev_write_begin,
.write_end = blkdev_write_end,
.migrate_folio = buffer_migrate_folio_norefs,
@@ -500,7 +515,7 @@ const struct address_space_operations def_blk_aops = {
.readahead = blkdev_readahead,
.writepages = blkdev_writepages,
.is_partially_uptodate = iomap_is_partially_uptodate,
- .error_remove_page = generic_error_remove_page,
+ .error_remove_folio = generic_error_remove_folio,
.migrate_folio = filemap_migrate_folio,
};
#endif /* CONFIG_BUFFER_HEAD */
diff --git a/block/genhd.c b/block/genhd.c
index c9d06f72c587..d74fb5b4ae68 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -432,7 +432,9 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
DISK_MAX_PARTS);
disk->minors = DISK_MAX_PARTS;
}
- if (disk->first_minor + disk->minors > MINORMASK + 1)
+ if (disk->first_minor > MINORMASK ||
+ disk->minors > MINORMASK + 1 ||
+ disk->first_minor + disk->minors > MINORMASK + 1)
goto out_exit_elevator;
} else {
if (WARN_ON(disk->minors))
@@ -542,6 +544,7 @@ out_put_holder_dir:
kobject_put(disk->part0->bd_holder_dir);
out_del_block_link:
sysfs_remove_link(block_depr, dev_name(ddev));
+ pm_runtime_set_memalloc_noio(ddev, false);
out_device_del:
device_del(ddev);
out_free_ext_minor:
diff --git a/block/ioctl.c b/block/ioctl.c
index 4160f4e6bd5b..438f79c564cf 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -18,10 +18,8 @@ static int blkpg_do_ioctl(struct block_device *bdev,
{
struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p;
- long long start, length;
+ sector_t start, length;
- if (disk->flags & GENHD_FL_NO_PART)
- return -EINVAL;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&p, upart, sizeof(struct blkpg_partition)))
@@ -35,14 +33,17 @@ static int blkpg_do_ioctl(struct block_device *bdev,
if (op == BLKPG_DEL_PARTITION)
return bdev_del_partition(disk, p.pno);
+ if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+ return -EINVAL;
+ /* Check that the partition is aligned to the block size */
+ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
+ return -EINVAL;
+
start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT;
switch (op) {
case BLKPG_ADD_PARTITION:
- /* check if partition is aligned to blocksize */
- if (p.start & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
return bdev_add_partition(disk, p.pno, start, length);
case BLKPG_RESIZE_PARTITION:
return bdev_resize_partition(disk, p.pno, start, length);
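
The reworked checks validate the byte values before shifting them to sectors. A
hedged userspace sketch (device, values and helper name are assumptions) of a
request that the new alignment check rejects, where previously only the start was
checked:

#include <sys/ioctl.h>
#include <linux/blkpg.h>

/*
 * Illustrative only: on a disk with 4096-byte logical blocks this
 * add-partition request now fails with EINVAL because the length is not
 * logical-block aligned ("fd" is an open whole-disk file descriptor).
 */
static int example_add_partition(int fd)
{
	struct blkpg_partition p = {
		.pno    = 1,
		.start  = 4096,		/* bytes, aligned */
		.length = 2048,		/* bytes, unaligned -> -EINVAL */
	};
	struct blkpg_ioctl_arg arg = {
		.op      = BLKPG_ADD_PARTITION,
		.datalen = sizeof(p),
		.data    = &p,
	};

	return ioctl(fd, BLKPG, &arg);	/* returns -1 with errno == EINVAL */
}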
diff --git a/block/ioprio.c b/block/ioprio.c
index b5a942519a79..73301a261429 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -139,32 +139,6 @@ out:
return ret;
}
-/*
- * If the task has set an I/O priority, use that. Otherwise, return
- * the default I/O priority.
- *
- * Expected to be called for current task or with task_lock() held to keep
- * io_context stable.
- */
-int __get_task_ioprio(struct task_struct *p)
-{
- struct io_context *ioc = p->io_context;
- int prio;
-
- if (p != current)
- lockdep_assert_held(&p->alloc_lock);
- if (ioc)
- prio = ioc->ioprio;
- else
- prio = IOPRIO_DEFAULT;
-
- if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
- prio = IOPRIO_PRIO_VALUE(task_nice_ioclass(p),
- task_nice_ioprio(p));
- return prio;
-}
-EXPORT_SYMBOL_GPL(__get_task_ioprio);
-
static int get_task_ioprio(struct task_struct *p)
{
int ret;
diff --git a/block/opal_proto.h b/block/opal_proto.h
index dec7ce3a3edb..d247a457bf6e 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -71,6 +71,7 @@ enum opal_response_token {
#define SHORT_ATOM_BYTE 0xBF
#define MEDIUM_ATOM_BYTE 0xDF
#define LONG_ATOM_BYTE 0xE3
+#define EMPTY_ATOM_BYTE 0xFF
#define OPAL_INVAL_PARAM 12
#define OPAL_MANUFACTURED_INACTIVE 0x08
diff --git a/block/partitions/core.c b/block/partitions/core.c
index f47ffcfdfcec..5f5ed5c75f04 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -305,18 +305,10 @@ static struct block_device *add_partition(struct gendisk *disk, int partno,
* Partitions are not supported on zoned block devices that are used as
* such.
*/
- switch (disk->queue->limits.zoned) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: partitions not supported on host managed zoned block device\n",
disk->disk_name);
return ERR_PTR(-ENXIO);
- case BLK_ZONED_HA:
- pr_info("%s: disabling host aware zoned block device support due to partitions\n",
- disk->disk_name);
- disk_set_zoned(disk, BLK_ZONED_NONE);
- break;
- case BLK_ZONED_NONE:
- break;
}
if (xa_load(&disk->part_tbl, partno))
@@ -447,6 +439,11 @@ int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
goto out;
}
+ if (disk->flags & GENHD_FL_NO_PART) {
+ ret = -EINVAL;
+ goto out;
+ }
+
if (partition_overlaps(disk, start, length, -1)) {
ret = -EBUSY;
goto out;
@@ -570,8 +567,8 @@ static bool blk_add_partition(struct gendisk *disk,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %ld\n",
- disk->disk_name, p, -PTR_ERR(part));
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
return true;
}
@@ -613,7 +610,7 @@ static int blk_add_partitions(struct gendisk *disk)
/*
* Partitions are not supported on host managed zoned block devices.
*/
- if (disk->queue->limits.zoned == BLK_ZONED_HM) {
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: ignoring partition table on host managed zoned block device\n",
disk->disk_name);
ret = 0;
diff --git a/block/sed-opal.c b/block/sed-opal.c
index 3d9e9cd250bd..fa4dba5d8531 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -1056,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length,
token_length = response_parse_medium(iter, pos);
else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
token_length = response_parse_long(iter, pos);
+ else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */
+ token_length = 1;
else /* TOKEN */
token_length = response_parse_token(iter, pos);
if (token_length < 0)
return token_length;
+ if (pos[0] != EMPTY_ATOM_BYTE)
+ num_entries++;
+
pos += token_length;
total -= token_length;
iter++;
- num_entries++;
}
resp->num = num_entries;