Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig | 65
-rw-r--r--  block/Kconfig.iosched | 18
-rw-r--r--  block/Makefile | 11
-rw-r--r--  block/bfq-cgroup.c | 534
-rw-r--r--  block/bfq-iosched.c | 1135
-rw-r--r--  block/bfq-iosched.h | 66
-rw-r--r--  block/bfq-wf2q.c | 30
-rw-r--r--  block/bio-integrity.c | 36
-rw-r--r--  block/bio.c | 936
-rw-r--r--  block/blk-cgroup-rwstat.c | 129
-rw-r--r--  block/blk-cgroup-rwstat.h | 149
-rw-r--r--  block/blk-cgroup.c | 653
-rw-r--r--  block/blk-core.c | 556
-rw-r--r--  block/blk-crypto-fallback.c | 657
-rw-r--r--  block/blk-crypto-internal.h | 201
-rw-r--r--  block/blk-crypto.c | 404
-rw-r--r--  block/blk-exec.c | 2
-rw-r--r--  block/blk-flush.c | 74
-rw-r--r--  block/blk-integrity.c | 20
-rw-r--r--  block/blk-ioc.c | 7
-rw-r--r--  block/blk-iocost.c | 2538
-rw-r--r--  block/blk-iolatency.c | 62
-rw-r--r--  block/blk-map.c | 531
-rw-r--r--  block/blk-merge.c | 373
-rw-r--r--  block/blk-mq-cpumap.c | 29
-rw-r--r--  block/blk-mq-debugfs.c | 48
-rw-r--r--  block/blk-mq-sched.c | 159
-rw-r--r--  block/blk-mq-sched.h | 19
-rw-r--r--  block/blk-mq-sysfs.c | 54
-rw-r--r--  block/blk-mq-tag.c | 159
-rw-r--r--  block/blk-mq-tag.h | 19
-rw-r--r--  block/blk-mq-virtio.c | 2
-rw-r--r--  block/blk-mq.c | 939
-rw-r--r--  block/blk-mq.h | 58
-rw-r--r--  block/blk-pm.c | 12
-rw-r--r--  block/blk-rq-qos.c | 39
-rw-r--r--  block/blk-rq-qos.h | 45
-rw-r--r--  block/blk-settings.c | 153
-rw-r--r--  block/blk-softirq.c | 4
-rw-r--r--  block/blk-stat.c | 7
-rw-r--r--  block/blk-sysfs.c | 64
-rw-r--r--  block/blk-throttle.c | 152
-rw-r--r--  block/blk-wbt.c | 44
-rw-r--r--  block/blk-wbt.h | 8
-rw-r--r--  block/blk-zoned.c | 543
-rw-r--r--  block/blk.h | 223
-rw-r--r--  block/bounce.c | 2
-rw-r--r--  block/bsg-lib.c | 10
-rw-r--r--  block/bsg.c | 1
-rw-r--r--  block/compat_ioctl.c | 411
-rw-r--r--  block/elevator.c | 196
-rw-r--r--  block/genhd.c | 368
-rw-r--r--  block/ioctl.c | 508
-rw-r--r--  block/ioprio.c | 2
-rw-r--r--  block/keyslot-manager.c | 396
-rw-r--r--  block/kyber-iosched.c | 8
-rw-r--r--  block/mq-deadline.c | 29
-rw-r--r--  block/opal_proto.h | 26
-rw-r--r--  block/partition-generic.c | 680
-rw-r--r--  block/partitions/Kconfig | 8
-rw-r--r--  block/partitions/Makefile | 3
-rw-r--r--  block/partitions/acorn.c | 1
-rw-r--r--  block/partitions/acorn.h | 15
-rw-r--r--  block/partitions/aix.c | 1
-rw-r--r--  block/partitions/aix.h | 2
-rw-r--r--  block/partitions/amiga.c | 11
-rw-r--r--  block/partitions/amiga.h | 7
-rw-r--r--  block/partitions/atari.h | 1
-rw-r--r--  block/partitions/check.c | 198
-rw-r--r--  block/partitions/check.h | 41
-rw-r--r--  block/partitions/cmdline.c | 3
-rw-r--r--  block/partitions/cmdline.h | 3
-rw-r--r--  block/partitions/core.c | 787
-rw-r--r--  block/partitions/efi.c | 35
-rw-r--r--  block/partitions/efi.h | 5
-rw-r--r--  block/partitions/ibm.c | 25
-rw-r--r--  block/partitions/ibm.h | 2
-rw-r--r--  block/partitions/karma.c | 3
-rw-r--r--  block/partitions/karma.h | 9
-rw-r--r--  block/partitions/ldm.c | 10
-rw-r--r--  block/partitions/ldm.h | 4
-rw-r--r--  block/partitions/mac.h | 1
-rw-r--r--  block/partitions/msdos.c | 172
-rw-r--r--  block/partitions/msdos.h | 9
-rw-r--r--  block/partitions/osf.c | 2
-rw-r--r--  block/partitions/osf.h | 8
-rw-r--r--  block/partitions/sgi.c | 7
-rw-r--r--  block/partitions/sgi.h | 9
-rw-r--r--  block/partitions/sun.c | 9
-rw-r--r--  block/partitions/sun.h | 9
-rw-r--r--  block/partitions/sysv68.c | 1
-rw-r--r--  block/partitions/sysv68.h | 2
-rw-r--r--  block/partitions/ultrix.c | 1
-rw-r--r--  block/partitions/ultrix.h | 6
-rw-r--r--  block/scsi_ioctl.c | 294
-rw-r--r--  block/sed-opal.c | 442
-rw-r--r--  block/t10-pi.c | 168
97 files changed, 11798 insertions, 5120 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 2466dcc3ef1d..9357d7302398 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,9 +26,15 @@ menuconfig BLOCK
if BLOCK
+config BLK_RQ_ALLOC_TIME
+ bool
+
config BLK_SCSI_REQUEST
bool
+config BLK_CGROUP_RWSTAT
+ bool
+
config BLK_DEV_BSG
bool "Block layer SG support v4"
default y
@@ -60,8 +66,7 @@ config BLK_DEV_BSGLIB
config BLK_DEV_INTEGRITY
bool "Block layer data integrity support"
- select CRC_T10DIF if BLK_DEV_INTEGRITY
- ---help---
+ help
Some storage devices allow extra information to be
stored/retrieved to help protect the data. The block layer
data integrity option provides hooks which can be used by
@@ -71,10 +76,15 @@ config BLK_DEV_INTEGRITY
T10/SCSI Data Integrity Field or the T13/ATA External Path
Protection. If in doubt, say N.
+config BLK_DEV_INTEGRITY_T10
+ tristate
+ depends on BLK_DEV_INTEGRITY
+ select CRC_T10DIF
+
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
- ---help---
+ help
Block layer zoned block device support. This option enables
support for ZAC/ZBC host-managed and host-aware zoned block devices.
@@ -83,18 +93,19 @@ config BLK_DEV_ZONED
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
depends on BLK_CGROUP=y
- ---help---
+ select BLK_CGROUP_RWSTAT
+ help
Block layer bio throttling support. It can be used to limit
the IO rate to a device. IO rate policies are per cgroup and
one needs to mount and use blkio cgroup controller for creating
cgroups and specifying per device IO rate policies.
- See Documentation/cgroup-v1/blkio-controller.txt for more information.
+ See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
config BLK_DEV_THROTTLING_LOW
bool "Block throttling .low limit interface support (EXPERIMENTAL)"
depends on BLK_DEV_THROTTLING
- ---help---
+ help
Add .low limit interface for block throttling. The low limit is a best
effort limit to prioritize cgroups. Depending on the setting, the limit
can be used to protect cgroups in terms of bandwidth/iops and better
@@ -104,17 +115,17 @@ config BLK_DEV_THROTTLING_LOW
config BLK_CMDLINE_PARSER
bool "Block device command line partition parser"
- ---help---
+ help
Enabling this option allows you to specify the partition layout from
the kernel boot args. This is typically of use for embedded devices
which don't otherwise have any standardized method for listing the
partitions on a block device.
- See Documentation/block/cmdline-partition.txt for more information.
+ See Documentation/block/cmdline-partition.rst for more information.
config BLK_WBT
bool "Enable support for block device writeback throttling"
- ---help---
+ help
Enabling this option enables the block layer to throttle buffered
background writeback from the VM, making it more smooth and having
less impact on foreground operations. The throttling is done
@@ -124,7 +135,7 @@ config BLK_WBT
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
depends on BLK_CGROUP=y
- ---help---
+ help
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
the configured latency target, throttling anybody with a higher latency
@@ -132,11 +143,22 @@ config BLK_CGROUP_IOLATENCY
Note, this is an experimental interface and could be changed someday.
+config BLK_CGROUP_IOCOST
+ bool "Enable support for cost model based cgroup IO controller"
+ depends on BLK_CGROUP=y
+ select BLK_RQ_IO_DATA_LEN
+ select BLK_RQ_ALLOC_TIME
+ help
+ Enabling this option enables the .weight interface for cost
+ model based proportional IO control. The IO controller
+ distributes IO capacity between different groups based on
+ their share of the overall weight distribution.
+
config BLK_WBT_MQ
bool "Multiqueue writeback throttling"
default y
depends on BLK_WBT
- ---help---
+ help
Enable writeback throttling by default on multiqueue devices.
Multiqueue currently doesn't have support for IO scheduling,
enabling this option is recommended.
@@ -145,7 +167,7 @@ config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
default y
depends on DEBUG_FS
- ---help---
+ help
Include block layer debugging information in debugfs. This information
is mostly useful for kernel developers, but it doesn't incur any cost
at runtime.
@@ -159,11 +181,28 @@ config BLK_DEBUG_FS_ZONED
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
- ---help---
+ help
Builds Logic for interfacing with Opal enabled controllers.
Enabling this option enables users to setup/unlock/lock
Locking ranges for SED devices using the Opal protocol.
+config BLK_INLINE_ENCRYPTION
+ bool "Enable inline encryption support in block layer"
+ help
+ Build the blk-crypto subsystem. Enabling this lets the
+ block layer handle encryption, so users can take
+ advantage of inline encryption hardware if present.
+
+config BLK_INLINE_ENCRYPTION_FALLBACK
+ bool "Enable crypto API fallback for blk-crypto"
+ depends on BLK_INLINE_ENCRYPTION
+ select CRYPTO
+ select CRYPTO_SKCIPHER
+ help
+ Enabling this lets the block layer handle inline encryption
+ by falling back to the kernel crypto API when inline
+ encryption hardware is not present.
+
menu "Partition Types"
source "block/partitions/Kconfig"
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 4626b88b2d5a..2f2158e05a91 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -6,13 +6,13 @@ menu "IO Schedulers"
config MQ_IOSCHED_DEADLINE
tristate "MQ deadline I/O scheduler"
default y
- ---help---
+ help
MQ version of the deadline IO scheduler.
config MQ_IOSCHED_KYBER
tristate "Kyber I/O scheduler"
default y
- ---help---
+ help
The Kyber I/O scheduler is a low-overhead scheduler suitable for
multiqueue and other fast devices. Given target latencies for reads and
synchronous writes, it will self-tune queue depths to achieve that
@@ -20,22 +20,30 @@ config MQ_IOSCHED_KYBER
config IOSCHED_BFQ
tristate "BFQ I/O scheduler"
- ---help---
+ help
BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
	  the device among all processes according to their weights,
regardless of the device parameters and with any workload. It
also guarantees a low latency to interactive and soft
real-time applications. Details in
- Documentation/block/bfq-iosched.txt
+ Documentation/block/bfq-iosched.rst
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
depends on IOSCHED_BFQ && BLK_CGROUP
- ---help---
+ select BLK_CGROUP_RWSTAT
+ help
Enable hierarchical scheduling in BFQ, using the blkio
(cgroups-v1) or io (cgroups-v2) controller.
+config BFQ_CGROUP_DEBUG
+ bool "BFQ IO controller debugging"
+ depends on BFQ_GROUP_IOSCHED
+ help
+ Enable some debugging help. Currently it exports additional stat
+ files in a cgroup which can be useful for debugging.
+
endmenu
endif
diff --git a/block/Makefile b/block/Makefile
index eee1b4ceecf9..78719169fb2a 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -8,24 +8,25 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
- genhd.o partition-generic.o ioprio.o \
- badblocks.o partitions/ blk-rq-qos.o
+ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
obj-$(CONFIG_BOUNCE) += bounce.o
obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
+obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
+obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o
obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
-obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o t10-pi.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
+obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o
@@ -35,3 +36,5 @@ obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..68882b9b8f11 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -15,7 +15,83 @@
#include "bfq-iosched.h"
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
+{
+ int ret;
+
+ ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+ if (ret)
+ return ret;
+
+ atomic64_set(&stat->aux_cnt, 0);
+ return 0;
+}
+
+static void bfq_stat_exit(struct bfq_stat *stat)
+{
+ percpu_counter_destroy(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_add - add a value to a bfq_stat
+ * @stat: target bfq_stat
+ * @val: value to add
+ *
+ * Add @val to @stat. The caller must ensure that IRQs on the same CPU
+ * don't re-enter this function for the same counter.
+ */
+static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
+{
+ percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * bfq_stat_read - read the current value of a bfq_stat
+ * @stat: bfq_stat to read
+ */
+static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
+{
+ return percpu_counter_sum_positive(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_reset - reset a bfq_stat
+ * @stat: bfq_stat to reset
+ */
+static inline void bfq_stat_reset(struct bfq_stat *stat)
+{
+ percpu_counter_set(&stat->cpu_cnt, 0);
+ atomic64_set(&stat->aux_cnt, 0);
+}
+
+/**
+ * bfq_stat_add_aux - add a bfq_stat into another's aux count
+ * @to: the destination bfq_stat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void bfq_stat_add_aux(struct bfq_stat *to,
+ struct bfq_stat *from)
+{
+ atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
+ &to->aux_cnt);
+}
+
+/**
+ * blkg_prfill_stat - prfill callback for bfq_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the bfq_stat in @pd
+ *
+ * prfill callback for printing a bfq_stat.
+ */
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
+}
/* bfqg stats flags */
enum bfqg_stats_flags {
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_group_wait_time)
- blkg_stat_add(&stats->group_wait_time,
+ bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
bfqg_stats_clear_waiting(stats);
}
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_empty_time)
- blkg_stat_add(&stats->empty_time,
+ bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
bfqg_stats_clear_empty(stats);
}
void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
- blkg_stat_add(&bfqg->stats.dequeue, 1);
+ bfq_stat_add(&bfqg->stats.dequeue, 1);
}
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
u64 now = ktime_get_ns();
if (now > stats->start_idle_time)
- blkg_stat_add(&stats->idle_time,
+ bfq_stat_add(&stats->idle_time,
now - stats->start_idle_time);
bfqg_stats_clear_idling(stats);
}
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
- blkg_stat_add(&stats->avg_queue_size_sum,
+ bfq_stat_add(&stats->avg_queue_size_sum,
blkg_rwstat_total(&stats->queued));
- blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfq_stat_add(&stats->avg_queue_size_samples, 1);
bfqg_stats_update_group_wait_time(stats);
}
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
io_start_time_ns - start_time_ns);
}
-#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op) { }
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -256,7 +332,7 @@ static void bfqg_put(struct bfq_group *bfqg)
kfree(bfqg);
}
-static void bfqg_and_blkg_get(struct bfq_group *bfqg)
+void bfqg_and_blkg_get(struct bfq_group *bfqg)
{
/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
bfqg_get(bfqg);
@@ -271,21 +347,32 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
bfqg_put(bfqg);
}
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq)
+{
+ struct bfq_group *bfqg = blkg_to_bfqg(rq->bio->bi_blkg);
+
+ if (!bfqg)
+ return;
+
+ blkg_rwstat_add(&bfqg->stats.bytes, rq->cmd_flags, blk_rq_bytes(rq));
+ blkg_rwstat_add(&bfqg->stats.ios, rq->cmd_flags, 1);
+}
+
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
- blkg_stat_reset(&stats->time);
- blkg_stat_reset(&stats->avg_queue_size_sum);
- blkg_stat_reset(&stats->avg_queue_size_samples);
- blkg_stat_reset(&stats->dequeue);
- blkg_stat_reset(&stats->group_wait_time);
- blkg_stat_reset(&stats->idle_time);
- blkg_stat_reset(&stats->empty_time);
+ bfq_stat_reset(&stats->time);
+ bfq_stat_reset(&stats->avg_queue_size_sum);
+ bfq_stat_reset(&stats->avg_queue_size_samples);
+ bfq_stat_reset(&stats->dequeue);
+ bfq_stat_reset(&stats->group_wait_time);
+ bfq_stat_reset(&stats->idle_time);
+ bfq_stat_reset(&stats->empty_time);
#endif
}
@@ -295,19 +382,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
if (!to || !from)
return;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_add_aux(&to->merged, &from->merged);
blkg_rwstat_add_aux(&to->service_time, &from->service_time);
blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
- blkg_stat_add_aux(&from->time, &from->time);
- blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_add_aux(&to->avg_queue_size_samples,
+ bfq_stat_add_aux(&from->time, &from->time);
+ bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ bfq_stat_add_aux(&to->avg_queue_size_samples,
&from->avg_queue_size_samples);
- blkg_stat_add_aux(&to->dequeue, &from->dequeue);
- blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_add_aux(&to->idle_time, &from->idle_time);
- blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+ bfq_stat_add_aux(&to->dequeue, &from->dequeue);
+ bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ bfq_stat_add_aux(&to->idle_time, &from->idle_time);
+ bfq_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
@@ -355,35 +442,41 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+ blkg_rwstat_exit(&stats->bytes);
+ blkg_rwstat_exit(&stats->ios);
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
blkg_rwstat_exit(&stats->wait_time);
blkg_rwstat_exit(&stats->queued);
- blkg_stat_exit(&stats->time);
- blkg_stat_exit(&stats->avg_queue_size_sum);
- blkg_stat_exit(&stats->avg_queue_size_samples);
- blkg_stat_exit(&stats->dequeue);
- blkg_stat_exit(&stats->group_wait_time);
- blkg_stat_exit(&stats->idle_time);
- blkg_stat_exit(&stats->empty_time);
+ bfq_stat_exit(&stats->time);
+ bfq_stat_exit(&stats->avg_queue_size_sum);
+ bfq_stat_exit(&stats->avg_queue_size_samples);
+ bfq_stat_exit(&stats->dequeue);
+ bfq_stat_exit(&stats->group_wait_time);
+ bfq_stat_exit(&stats->idle_time);
+ bfq_stat_exit(&stats->empty_time);
#endif
}
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+ if (blkg_rwstat_init(&stats->bytes, gfp) ||
+ blkg_rwstat_init(&stats->ios, gfp))
+ return -ENOMEM;
+
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
blkg_rwstat_init(&stats->wait_time, gfp) ||
blkg_rwstat_init(&stats->queued, gfp) ||
- blkg_stat_init(&stats->time, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
- blkg_stat_init(&stats->dequeue, gfp) ||
- blkg_stat_init(&stats->group_wait_time, gfp) ||
- blkg_stat_init(&stats->idle_time, gfp) ||
- blkg_stat_init(&stats->empty_time, gfp)) {
+ bfq_stat_init(&stats->time, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ bfq_stat_init(&stats->dequeue, gfp) ||
+ bfq_stat_init(&stats->group_wait_time, gfp) ||
+ bfq_stat_init(&stats->idle_time, gfp) ||
+ bfq_stat_init(&stats->empty_time, gfp)) {
bfqg_stats_exit(stats);
return -ENOMEM;
}
@@ -425,11 +518,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd)
kfree(cpd_to_bfqgd(cpd));
}
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
+ struct blkcg *blkcg)
{
struct bfq_group *bfqg;
- bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node);
if (!bfqg)
return NULL;
@@ -516,12 +610,13 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
*/
entity = &bfqg->entity;
for_each_entity(entity) {
- bfqg = container_of(entity, struct bfq_group, entity);
- if (bfqg != bfqd->root_group) {
- parent = bfqg_parent(bfqg);
+ struct bfq_group *curr_bfqg = container_of(entity,
+ struct bfq_group, entity);
+ if (curr_bfqg != bfqd->root_group) {
+ parent = bfqg_parent(curr_bfqg);
if (!parent)
parent = bfqd->root_group;
- bfq_group_set_parent(bfqg, parent);
+ bfq_group_set_parent(curr_bfqg, parent);
}
}
@@ -547,6 +642,12 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
{
struct bfq_entity *entity = &bfqq->entity;
+ /*
+ * Get extra reference to prevent bfqq from being freed in
+ * next possible expire or deactivate.
+ */
+ bfqq->ref++;
+
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@@ -559,7 +660,7 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfq_bfqq_busy(bfqq))
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
- else if (entity->on_st)
+ else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
bfqg_and_blkg_put(bfqq_group(bfqq));
@@ -576,6 +677,8 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
bfq_schedule_dispatch(bfqd);
+ /* release extra ref taken above, bfqq may happen to be freed now */
+ bfq_put_queue(bfqq);
}
/**
@@ -611,10 +714,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
if (entity->sched_data != &bfqg->sched_data) {
bic_set_bfqq(bic, NULL, 0);
- bfq_log_bfqq(bfqd, async_bfqq,
- "bic_change_group: %p %d",
- async_bfqq, async_bfqq->ref);
- bfq_put_queue(async_bfqq);
+ bfq_release_process_ref(bfqd, async_bfqq);
}
}
@@ -715,39 +815,53 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
/**
* bfq_reparent_leaf_entity - move leaf entity to the root_group.
* @bfqd: the device data structure with the root group.
- * @entity: the entity to move.
+ * @entity: the entity to move, if entity is a leaf; or the parent entity
+ * of an active leaf entity to move, if entity is not a leaf.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
- struct bfq_entity *entity)
+ struct bfq_entity *entity,
+ int ioprio_class)
{
- struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
+ struct bfq_queue *bfqq;
+ struct bfq_entity *child_entity = entity;
+
+ while (child_entity->my_sched_data) { /* leaf not reached yet */
+ struct bfq_sched_data *child_sd = child_entity->my_sched_data;
+ struct bfq_service_tree *child_st = child_sd->service_tree +
+ ioprio_class;
+ struct rb_root *child_active = &child_st->active;
+
+ child_entity = bfq_entity_of(rb_first(child_active));
+ if (!child_entity)
+ child_entity = child_sd->in_service_entity;
+ }
+
+ bfqq = bfq_entity_to_bfqq(child_entity);
bfq_bfqq_move(bfqd, bfqq, bfqd->root_group);
}
/**
- * bfq_reparent_active_entities - move to the root group all active
- * entities.
+ * bfq_reparent_active_queues - move to the root group all active queues.
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
- * @st: the service tree with the entities.
+ * @st: the service tree to start the search from.
*/
-static void bfq_reparent_active_entities(struct bfq_data *bfqd,
- struct bfq_group *bfqg,
- struct bfq_service_tree *st)
+static void bfq_reparent_active_queues(struct bfq_data *bfqd,
+ struct bfq_group *bfqg,
+ struct bfq_service_tree *st,
+ int ioprio_class)
{
struct rb_root *active = &st->active;
- struct bfq_entity *entity = NULL;
-
- if (!RB_EMPTY_ROOT(&st->active))
- entity = bfq_entity_of(rb_first(active));
+ struct bfq_entity *entity;
- for (; entity ; entity = bfq_entity_of(rb_first(active)))
- bfq_reparent_leaf_entity(bfqd, entity);
+ while ((entity = bfq_entity_of(rb_first(active))))
+ bfq_reparent_leaf_entity(bfqd, entity, ioprio_class);
if (bfqg->sched_data.in_service_entity)
bfq_reparent_leaf_entity(bfqd,
- bfqg->sched_data.in_service_entity);
+ bfqg->sched_data.in_service_entity,
+ ioprio_class);
}
/**
@@ -780,13 +894,6 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
st = bfqg->sched_data.service_tree + i;
/*
- * The idle tree may still contain bfq_queues belonging
- * to exited task because they never migrated to a different
- * cgroup from the one being destroyed now.
- */
- bfq_flush_idle_tree(st);
-
- /*
* It may happen that some queues are still active
* (busy) upon group destruction (if the corresponding
* processes have been forced to terminate). We move
@@ -798,7 +905,20 @@ static void bfq_pd_offline(struct blkg_policy_data *pd)
* There is no need to put the sync queues, as the
* scheduler has taken no reference.
*/
- bfq_reparent_active_entities(bfqd, bfqg, st);
+ bfq_reparent_active_queues(bfqd, bfqg, st, i);
+
+ /*
+ * The idle tree may still contain bfq_queues
+ * belonging to exited task because they never
+ * migrated to a different cgroup from the one being
+ * destroyed now. In addition, even
+ * bfq_reparent_active_queues() may happen to add some
+ * entities to the idle tree. It happens if, in some
+ * of the calls to bfq_bfqq_move() performed by
+ * bfq_reparent_active_queues(), the queue to move is
+ * empty and gets expired.
+ */
+ bfq_flush_idle_tree(st);
}
__bfq_deactivate_entity(entity, false);
@@ -828,7 +948,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
-static int bfq_io_show_weight(struct seq_file *sf, void *v)
+static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
@@ -842,6 +962,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v)
return 0;
}
+static u64 bfqg_prfill_weight_device(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ if (!bfqg->entity.dev_weight)
+ return 0;
+ return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+
+ seq_printf(sf, "default %u\n", bfqgd->weight);
+ blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device,
+ &blkcg_policy_bfq, 0, false);
+ return 0;
+}
+
+static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight)
+{
+ weight = dev_weight ?: weight;
+
+ bfqg->entity.dev_weight = dev_weight;
+ /*
+ * Setting the prio_changed flag of the entity
+ * to 1 with new_weight == weight would re-set
+ * the value of the weight to its ioprio mapping.
+ * Set the flag only if necessary.
+ */
+ if ((unsigned short)weight != bfqg->entity.new_weight) {
+ bfqg->entity.new_weight = (unsigned short)weight;
+ /*
+ * Make sure that the above new value has been
+ * stored in bfqg->entity.new_weight before
+ * setting the prio_changed flag. In fact,
+ * this flag may be read asynchronously (in
+ * critical sections protected by a different
+ * lock than that held here), and finding this
+ * flag set may cause the execution of the code
+ * for updating parameters whose value may
+ * depend also on bfqg->entity.new_weight (in
+ * __bfq_entity_update_weight_prio).
+ * This barrier makes sure that the new value
+ * of bfqg->entity.new_weight is correctly
+ * seen in that code.
+ */
+ smp_wmb();
+ bfqg->entity.prio_changed = 1;
+ }
+}
+
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
struct cftype *cftype,
u64 val)
@@ -860,61 +1034,70 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- if (!bfqg)
- continue;
- /*
- * Setting the prio_changed flag of the entity
- * to 1 with new_weight == weight would re-set
- * the value of the weight to its ioprio mapping.
- * Set the flag only if necessary.
- */
- if ((unsigned short)val != bfqg->entity.new_weight) {
- bfqg->entity.new_weight = (unsigned short)val;
- /*
- * Make sure that the above new value has been
- * stored in bfqg->entity.new_weight before
- * setting the prio_changed flag. In fact,
- * this flag may be read asynchronously (in
- * critical sections protected by a different
- * lock than that held here), and finding this
- * flag set may cause the execution of the code
- * for updating parameters whose value may
- * depend also on bfqg->entity.new_weight (in
- * __bfq_entity_update_weight_prio).
- * This barrier makes sure that the new value
- * of bfqg->entity.new_weight is correctly
- * seen in that code.
- */
- smp_wmb();
- bfqg->entity.prio_changed = 1;
- }
+ if (bfqg)
+ bfq_group_set_weight(bfqg, val, 0);
}
spin_unlock_irq(&blkcg->lock);
return ret;
}
-static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
+static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- u64 weight;
- /* First unsigned long found in the file is used */
- int ret = kstrtoull(strim(buf), 0, &weight);
+ int ret;
+ struct blkg_conf_ctx ctx;
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct bfq_group *bfqg;
+ u64 v;
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx);
if (ret)
return ret;
- ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+ if (sscanf(ctx.body, "%llu", &v) == 1) {
+ /* require "default" on dfl */
+ ret = -ERANGE;
+ if (!v)
+ goto out;
+ } else if (!strcmp(strim(ctx.body), "default")) {
+ v = 0;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ bfqg = blkg_to_bfqg(ctx.blkg);
+
+ ret = -ERANGE;
+ if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) {
+ bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
+ ret = 0;
+ }
+out:
+ blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
-#ifdef CONFIG_DEBUG_BLK_CGROUP
-static int bfqg_print_stat(struct seq_file *sf, void *v)
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
- &blkcg_policy_bfq, seq_cft(sf)->private, false);
- return 0;
+ char *endp;
+ int ret;
+ u64 v;
+
+ buf = strim(buf);
+
+ /* "WEIGHT" or "default WEIGHT" sets the default weight */
+ v = simple_strtoull(buf, &endp, 0);
+ if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+ ret = bfq_io_set_weight_legacy(of_css(of), NULL, v);
+ return ret ?: nbytes;
+ }
+
+ return bfq_io_set_device_weight(of, buf, nbytes, off);
}
static int bfqg_print_rwstat(struct seq_file *sf, void *v)
@@ -924,43 +1107,69 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
return 0;
}
-static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq, off);
- return __blkg_prfill_u64(sf, pd, sum);
-}
-
static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq,
- off);
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
-static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, false);
+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, true);
return 0;
}
-static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfqg_print_stat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
+ &blkcg_policy_bfq, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkcg_gq *blkg = pd_to_blkg(pd);
+ struct blkcg_gq *pos_blkg;
+ struct cgroup_subsys_state *pos_css;
+ u64 sum = 0;
+
+ lockdep_assert_held(&blkg->q->queue_lock);
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct bfq_stat *stat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off;
+ sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
+ }
+ rcu_read_unlock();
+
+ return __blkg_prfill_u64(sf, pd, sum);
+}
+
+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq,
- seq_cft(sf)->private, true);
+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq,
+ seq_cft(sf)->private, false);
return 0;
}
static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes);
+ struct bfq_group *bfqg = blkg_to_bfqg(pd->blkg);
+ u64 sum = blkg_rwstat_total(&bfqg->stats.bytes);
return __blkg_prfill_u64(sf, pd, sum >> 9);
}
@@ -975,12 +1184,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+ struct blkg_rwstat_sample tmp;
- return __blkg_prfill_u64(sf, pd, sum >> 9);
+ blkg_rwstat_recursive_sum(pd->blkg, &blkcg_policy_bfq,
+ offsetof(struct bfq_group, stats.bytes), &tmp);
+
+ return __blkg_prfill_u64(sf, pd,
+ (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
}
static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
@@ -995,11 +1205,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct bfq_group *bfqg = pd_to_bfqg(pd);
- u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+ u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
u64 v = 0;
if (samples) {
- v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+ v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
v = div64_u64(v, samples);
}
__blkg_prfill_u64(sf, pd, v);
@@ -1014,7 +1224,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
0, false);
return 0;
}
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
@@ -1047,22 +1257,28 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
.name = "bfq.weight",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = bfq_io_show_weight,
+ .seq_show = bfq_io_show_weight_legacy,
.write_u64 = bfq_io_set_weight_legacy,
},
+ {
+ .name = "bfq.weight_device",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write = bfq_io_set_weight,
+ },
/* statistics, covers only the tasks in the bfqg */
{
.name = "bfq.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat,
},
{
.name = "bfq.io_serviced",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time",
.private = offsetof(struct bfq_group, stats.time),
@@ -1092,20 +1308,20 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.queued),
.seq_show = bfqg_print_rwstat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
/* the same statistics which cover the bfqg and its descendants */
{
.name = "bfq.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_bytes_recursive,
+ .private = offsetof(struct bfq_group, stats.bytes),
+ .seq_show = bfqg_print_rwstat_recursive,
},
{
.name = "bfq.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_bfq,
- .seq_show = blkg_print_stat_ios_recursive,
+ .private = offsetof(struct bfq_group, stats.ios),
+ .seq_show = bfqg_print_rwstat_recursive,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time_recursive",
.private = offsetof(struct bfq_group, stats.time),
@@ -1159,7 +1375,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.dequeue),
.seq_show = bfqg_print_stat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
{ } /* terminate */
};
@@ -1208,6 +1424,10 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
+void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
+
+void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
+
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
struct bfq_group *bfqg;
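
[Editor's note: the hunks above replace the single write_u64 weight handler with bfq_io_set_weight(), which accepts either a default weight or a per-device "MAJ:MIN WEIGHT" pair (delegated to bfq_io_set_device_weight()). A rough userspace sketch of both syntaxes follows; the /sys/fs/cgroup/blkio mount point, the group name and the "blkio." prefix of the legacy file names are assumptions for illustration only.]

/* Hedged sketch: exercise the new BFQ weight interface on a legacy
 * (cgroup-v1) blkio hierarchy.  Mount point, group name and the
 * "blkio." file-name prefix are assumptions, not part of this patch.
 */
#include <stdio.h>

static int write_str(const char *path, const char *val)
{
	FILE *f = fopen(path, "w");

	if (!f)
		return -1;
	if (fputs(val, f) == EOF) {
		fclose(f);
		return -1;
	}
	return fclose(f);
}

int main(void)
{
	const char *grp = "/sys/fs/cgroup/blkio/grp";
	char path[256];

	snprintf(path, sizeof(path), "%s/blkio.bfq.weight_device", grp);

	/* "default WEIGHT" (or a bare number) sets the group default */
	if (write_str(path, "default 300\n"))
		return 1;

	/* "MAJ:MIN WEIGHT" overrides the weight for one device only */
	if (write_str(path, "8:16 100\n"))
		return 1;

	return 0;
}
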
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index e5db3856b194..50c8f034c01c 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -17,7 +17,7 @@
* low-latency capabilities. BFQ also supports full hierarchical
* scheduling through cgroups. Next paragraphs provide an introduction
* on BFQ inner workings. Details on BFQ benefits, usage and
- * limitations can be found in Documentation/block/bfq-iosched.txt.
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
*
* BFQ is a proportional-share storage-I/O scheduling algorithm based
* on the slice-by-slice service scheme of CFQ. But BFQ assigns
@@ -123,6 +123,7 @@
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
+#include <linux/backing-dev.h>
#include "blk.h"
#include "blk-mq.h"
@@ -157,6 +158,7 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS \
/* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -426,7 +428,6 @@ void bfq_schedule_dispatch(struct bfq_data *bfqd)
}
#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE)
-#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT)
#define bfq_sample_valid(samples) ((samples) > 80)
@@ -613,6 +614,10 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->pos_root = NULL;
}
+ /* oom_bfqq does not participate in queue merging */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+
/*
* bfqq cannot be merged any longer (see comments in
* bfq_setup_cooperator): no point in adding bfqq into the
@@ -1055,7 +1060,7 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st -
+ return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
(bfqq->weight_counter != NULL);
}
@@ -1427,17 +1432,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
* mechanism may be re-designed in such a way to make it possible to
* know whether preemption is needed without needing to update service
* trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
+ * I/O, which may in turn cause loss of throughput. Finally, there may
+ * even be no in-service queue when the next function is invoked (so,
+ * no queue to compare timestamps with). Because of these facts, the
+ * next function adopts the following simple scheme to avoid costly
+ * operations, too frequent preemptions and too many dependencies on
+ * the state of the scheduler: it requests the expiration of the
+ * in-service queue (unconditionally) only for queues that need to
+ * recover a hole. Then it delegates to other parts of the code the
+ * responsibility of handling the above case 2.
*/
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
- bool arrived_in_time,
- bool wr_or_deserves_wr)
+ bool arrived_in_time)
{
struct bfq_entity *entity = &bfqq->entity;
@@ -1492,7 +1499,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
- return wr_or_deserves_wr;
+ return false;
}
/*
@@ -1610,6 +1617,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
bfqd->bfq_wr_min_idle_time);
}
+
+/*
+ * Return true if bfqq is in a higher priority class, or has a higher
+ * weight than the in-service queue.
+ */
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
+ struct bfq_queue *in_serv_bfqq)
+{
+ int bfqq_weight, in_serv_weight;
+
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
+ return true;
+
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
+ bfqq_weight = bfqq->entity.weight;
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ } else {
+ if (bfqq->entity.parent)
+ bfqq_weight = bfqq->entity.parent->weight;
+ else
+ bfqq_weight = bfqq->entity.weight;
+ if (in_serv_bfqq->entity.parent)
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
+ else
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ }
+
+ return bfqq_weight > in_serv_weight;
+}
+
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
int old_wr_coeff,
@@ -1654,8 +1691,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
*/
bfqq_wants_to_preempt =
bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
- arrived_in_time,
- wr_or_deserves_wr);
+ arrived_in_time);
/*
* If bfqq happened to be activated in a burst, but has been
@@ -1720,21 +1756,111 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
/*
* Expire in-service queue only if preemption may be needed
- * for guarantees. In this respect, the function
- * next_queue_may_preempt just checks a simple, necessary
- * condition, and not a sufficient condition based on
- * timestamps. In fact, for the latter condition to be
- * evaluated, timestamps would need first to be updated, and
- * this operation is quite costly (see the comments on the
- * function bfq_bfqq_update_budg_for_activation).
+ * for guarantees. In particular, we care only about two
+ * cases. The first is that bfqq has to recover a service
+ * hole, as explained in the comments on
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
+ * carry time-critical I/O, then bfqq's bandwidth is less
+ * important than that of queues that carry time-critical I/O.
+ * So, as a further constraint, we consider this case only if
+ * bfqq is at least as weight-raised, i.e., at least as time
+ * critical, as the in-service queue.
+ *
+ * The second case is that bfqq is in a higher priority class,
+ * or has a higher weight than the in-service queue. If this
+ * condition does not hold, we don't care because, even if
+ * bfqq does not start to be served immediately, the resulting
+ * delay for bfqq's I/O is however lower or much lower than
+ * the ideal completion time to be guaranteed to bfqq's I/O.
+ *
+ * In both cases, preemption is needed only if, according to
+ * the timestamps of both bfqq and of the in-service queue,
+ * bfqq actually is the next queue to serve. So, to reduce
+ * useless preemptions, the return value of
+ * next_queue_may_preempt() is considered in the next compound
+ * condition too. Yet next_queue_may_preempt() just checks a
+ * simple, necessary condition for bfqq to be the next queue
+ * to serve. In fact, to evaluate a sufficient condition, the
+ * timestamps of the in-service queue would need to be
+ * updated, and this operation is quite costly (see the
+ * comments on bfq_bfqq_update_budg_for_activation()).
*/
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ if (bfqd->in_service_queue &&
+ ((bfqq_wants_to_preempt &&
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
next_queue_may_preempt(bfqd))
bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
false, BFQQE_PREEMPTED);
}
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ /* invalidate baseline total service time */
+ bfqq->last_serv_time_ns = 0;
+
+ /*
+ * Reset pointer in case we are waiting for
+ * some request completion.
+ */
+ bfqd->waited_rq = NULL;
+
+ /*
+ * If bfqq has a short think time, then start by setting the
+ * inject limit to 0 prudentially, because the service time of
+ * an injected I/O request may be higher than the think time
+ * of bfqq, and therefore, if one request was injected when
+ * bfqq remains empty, this injected request might delay the
+ * service of the next I/O request for bfqq significantly. In
+ * case bfqq can actually tolerate some injection, then the
+ * adaptive update will however raise the limit soon. This
+ * lucky circumstance holds exactly because bfqq has a short
+ * think time, and thus, after remaining empty, is likely to
+ * get new I/O enqueued---and then completed---before being
+ * expired. This is the very pattern that gives the
+ * limit-update algorithm the chance to measure the effect of
+ * injection on request service times, and then to update the
+ * limit accordingly.
+ *
+ * However, in the following special case, the inject limit is
+ * left to 1 even if the think time is short: bfqq's I/O is
+ * synchronized with that of some other queue, i.e., bfqq may
+ * receive new I/O only after the I/O of the other queue is
+ * completed. Keeping the inject limit to 1 allows the
+ * blocking I/O to be served while bfqq is in service. And
+ * this is very convenient both for bfqq and for overall
+ * throughput, as explained in detail in the comments in
+ * bfq_update_has_short_ttime().
+ *
+ * On the opposite end, if bfqq has a long think time, then
+ * start directly by 1, because:
+ * a) on the bright side, keeping at most one request in
+ * service in the drive is unlikely to cause any harm to the
+ * latency of bfqq's requests, as the service time of a single
+ * request is likely to be lower than the think time of bfqq;
+ * b) on the downside, after becoming empty, bfqq is likely to
+ * expire before getting its next request. With this request
+ * arrival pattern, it is very hard to sample total service
+ * times and update the inject limit accordingly (see comments
+ * on bfq_update_inject_limit()). So the limit is likely to be
+ * never, or at least seldom, updated. As a consequence, by
+ * setting the limit to 1, we avoid that no injection ever
+ * occurs with bfqq. On the downside, this proactive step
+ * further reduces chances to actually compute the baseline
+ * total service time. Thus it reduces chances to execute the
+ * limit-update algorithm and possibly raise the limit to more
+ * than 1.
+ */
+ if (bfq_bfqq_has_short_ttime(bfqq))
+ bfqq->inject_limit = 0;
+ else
+ bfqq->inject_limit = 1;
+
+ bfqq->decrease_time_jif = jiffies;
+}
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1749,77 +1875,120 @@ static void bfq_add_request(struct request *rq)
if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
/*
+ * Detect whether bfqq's I/O seems synchronized with
+ * that of some other queue, i.e., whether bfqq, after
+ * remaining empty, happens to receive new I/O only
+ * right after some I/O request of the other queue has
+ * been completed. We call waker queue the other
+ * queue, and we assume, for simplicity, that bfqq may
+ * have at most one waker queue.
+ *
+ * A remarkable throughput boost can be reached by
+ * unconditionally injecting the I/O of the waker
+ * queue, every time a new bfq_dispatch_request
+ * happens to be invoked while I/O is being plugged
+ * for bfqq. In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth
+ * and latency for bfqq. Note that these same results
+ * may be achieved with the general injection
+ * mechanism, but less effectively. For details on
+ * this aspect, see the comments on the choice of the
+ * queue for injection in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a
+ * queue Q is deemed as a waker queue for bfqq if, for
+ * two consecutive times, bfqq happens to become non
+ * empty right after a request of Q has been
+ * completed. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while
+ * on the second time, the flag
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+ * is a waker queue for bfqq. These detection steps
+ * are performed only if bfqq has a long think time,
+ * so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This
+ * last filter, plus the above two-times requirement,
+ * make false positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner
+ * throughput can be boosted by injecting I/O from the
+ * waker queue. Fortunately, detection is likely to be
+ * actually fast, for the following reasons. While
+ * blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at
+ * least equal to 1 (see the comments in
+ * bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served
+ * during the very first I/O-plugging time interval
+ * for bfqq. This triggers the first step of the
+ * detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be
+ * confirmed no later than during the next
+ * I/O-plugging interval for bfqq.
+ */
+ if (bfqd->last_completed_rq_bfqq &&
+ !bfq_bfqq_has_short_ttime(bfqq) &&
+ ktime_get_ns() - bfqd->last_completion <
+ 200 * NSEC_PER_USEC) {
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
+ bfqd->last_completed_rq_bfqq !=
+ bfqq->waker_bfqq) {
+ /*
+ * First synchronization detected with
+ * a candidate waker queue, or with a
+ * different candidate waker queue
+ * from the current one.
+ */
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+ /*
+ * If the waker queue disappears, then
+ * bfqq->waker_bfqq must be reset. To
+ * this goal, we maintain in each
+ * waker queue a list, woken_list, of
+ * all the queues that reference the
+ * waker queue through their
+ * waker_bfqq pointer. When the waker
+ * queue exits, the waker_bfqq pointer
+ * of all the queues in the woken_list
+ * is reset.
+ *
+ * In addition, if bfqq is already in
+ * the woken_list of a waker queue,
+ * then, before being inserted into
+ * the woken_list of a new waker
+ * queue, bfqq must be removed from
+ * the woken_list of the old waker
+ * queue.
+ */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqd->last_completed_rq_bfqq->woken_list);
+
+ bfq_clear_bfqq_has_waker(bfqq);
+ } else if (bfqd->last_completed_rq_bfqq ==
+ bfqq->waker_bfqq &&
+ !bfq_bfqq_has_waker(bfqq)) {
+ /*
+ * synchronization with waker_bfqq
+ * seen for the second time
+ */
+ bfq_mark_bfqq_has_waker(bfqq);
+ }
+ }
+
+ /*
* Periodically reset inject limit, to make sure that
* the latter eventually drops in case workload
* changes, see step (3) in the comments on
* bfq_update_inject_limit().
*/
if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
- msecs_to_jiffies(1000))) {
- /* invalidate baseline total service time */
- bfqq->last_serv_time_ns = 0;
-
- /*
- * Reset pointer in case we are waiting for
- * some request completion.
- */
- bfqd->waited_rq = NULL;
-
- /*
- * If bfqq has a short think time, then start
- * by setting the inject limit to 0
- * prudentially, because the service time of
- * an injected I/O request may be higher than
- * the think time of bfqq, and therefore, if
- * one request was injected when bfqq remains
- * empty, this injected request might delay
- * the service of the next I/O request for
- * bfqq significantly. In case bfqq can
- * actually tolerate some injection, then the
- * adaptive update will however raise the
- * limit soon. This lucky circumstance holds
- * exactly because bfqq has a short think
- * time, and thus, after remaining empty, is
- * likely to get new I/O enqueued---and then
- * completed---before being expired. This is
- * the very pattern that gives the
- * limit-update algorithm the chance to
- * measure the effect of injection on request
- * service times, and then to update the limit
- * accordingly.
- *
- * On the opposite end, if bfqq has a long
- * think time, then start directly by 1,
- * because:
- * a) on the bright side, keeping at most one
- * request in service in the drive is unlikely
- * to cause any harm to the latency of bfqq's
- * requests, as the service time of a single
- * request is likely to be lower than the
- * think time of bfqq;
- * b) on the downside, after becoming empty,
- * bfqq is likely to expire before getting its
- * next request. With this request arrival
- * pattern, it is very hard to sample total
- * service times and update the inject limit
- * accordingly (see comments on
- * bfq_update_inject_limit()). So the limit is
- * likely to be never, or at least seldom,
- * updated. As a consequence, by setting the
- * limit to 1, we avoid that no injection ever
- * occurs with bfqq. On the downside, this
- * proactive step further reduces chances to
- * actually compute the baseline total service
- * time. Thus it reduces chances to execute the
- * limit-update algorithm and possibly raise the
- * limit to more than 1.
- */
- if (bfq_bfqq_has_short_ttime(bfqq))
- bfqq->inject_limit = 0;
- else
- bfqq->inject_limit = 1;
- bfqq->decrease_time_jif = jiffies;
- }
+ msecs_to_jiffies(1000)))
+ bfq_reset_inject_limit(bfqd, bfqq);
/*
* The following conditions must hold to setup a new
@@ -1851,7 +2020,7 @@ static void bfq_add_request(struct request *rq)
(bfqq->last_serv_time_ns > 0 &&
bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
- msecs_to_jiffies(100))) {
+ msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
/*
* Start the state machine for measuring the
@@ -1860,7 +2029,21 @@ static void bfq_add_request(struct request *rq)
* be set when rq will be dispatched.
*/
bfqd->wait_dispatch = true;
- bfqd->rqs_injected = false;
+ /*
+ * If there is no I/O in service in the drive,
+ * then possible injection occurred before the
+ * arrival of rq will not affect the total
+ * service time of rq. So the injection limit
+ * must not be updated as a function of such
+ * total service time, unless new injection
+ * occurs before rq is completed. To have the
+ * injection limit updated only in the latter
+ * case, reset rqs_injected here (rqs_injected
+ * will be set in case injection is performed
+ * on bfqq before rq is completed).
+ */
+ if (bfqd->rq_in_driver == 0)
+ bfqd->rqs_injected = false;
}
}
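
The logic deleted above is what presumably moves into the new bfq_reset_inject_limit() helper now called from bfq_add_request(); its body lies outside this hunk, so the following is only a sketch reconstructed from the removed lines:

static void bfq_reset_inject_limit(struct bfq_data *bfqd,
				   struct bfq_queue *bfqq)
{
	/* invalidate baseline total service time */
	bfqq->last_serv_time_ns = 0;

	/* reset pointer in case we are waiting for some request completion */
	bfqd->waited_rq = NULL;

	/*
	 * Start from 0 if bfqq's think time is short (an injected request
	 * could delay bfqq's own I/O), from 1 otherwise; see the removed
	 * comment above for the full rationale.
	 */
	if (bfq_bfqq_has_short_ttime(bfqq))
		bfqq->inject_limit = 0;
	else
		bfqq->inject_limit = 1;

	bfqq->decrease_time_jif = jiffies;
}
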
@@ -2027,7 +2210,8 @@ static void bfq_remove_request(struct request_queue *q,
}
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ unsigned int nr_segs)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -2050,7 +2234,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
bfqd->bio_bfqq = NULL;
bfqd->bio_bic = bic;
- ret = blk_mq_sched_try_merge(q, bio, &free);
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
if (free)
blk_mq_free_request(free);
@@ -2085,9 +2269,14 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
struct bfq_queue *bfqq = bfq_init_rq(req);
- struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_data *bfqd;
struct request *prev, *next_rq;
+ if (!bfqq)
+ return;
+
+ bfqd = bfqq->bfqd;
+
/* Reposition request in its sort_list */
elv_rb_del(&bfqq->sort_list, req);
elv_rb_add(&bfqq->sort_list, req);
@@ -2134,6 +2323,9 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct bfq_queue *bfqq = bfq_init_rq(rq),
*next_bfqq = bfq_init_rq(next);
+ if (!bfqq)
+ return;
+
/*
* If next and rq belong to the same bfq_queue and next is older
* than rq, then reposition rq in the fifo (by substituting next
@@ -2513,6 +2705,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
* to enjoy weight raising if split soon.
*/
bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
bic->saved_last_wr_start_finish = jiffies;
} else {
@@ -2524,6 +2717,26 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
}
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /*
+ * To prevent bfqq's service guarantees from being violated,
+ * bfqq may be left busy, i.e., queued for service, even if
+ * empty (see comments in __bfq_bfqq_expire() for
+ * details). But, if no process will send requests to bfqq any
+ * longer, then there is no point in keeping bfqq queued for
+ * service. In addition, keeping bfqq queued for service, but
+ * with no process ref any longer, may have caused bfqq to be
+ * freed when dequeued from service. But this is assumed to
+ * never happen.
+ */
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfqq != bfqd->in_service_queue)
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
+
+ bfq_put_queue(bfqq);
+}
+
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
@@ -2594,8 +2807,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
*/
new_bfqq->pid = -1;
bfqq->bic = NULL;
- /* release process reference to bfqq */
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqd, bfqq);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -3045,7 +3257,209 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
bfq_remove_request(q, rq);
}
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+/*
+ * There is a case where idling does not have to be performed for
+ * throughput concerns, but to preserve the throughput share of
+ * the process associated with bfqq.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired throughput
+ * distribution only in a completely symmetric, or favorably
+ * skewed scenario where:
+ * (i-a) each of these processes must get the same throughput as
+ * the others,
+ * (i-b) in case (i-a) does not hold, it holds that the process
+ * associated with bfqq must receive a lower or equal
+ * throughput than any of the other processes;
+ * (ii) the I/O of each process has the same properties, in
+ * terms of locality (sequential or random), direction
+ * (reads or writes), request sizes, greediness
+ * (from I/O-bound to sporadic), and so on;
+ *
+ * In fact, in such a scenario, the drive tends to treat the requests
+ * of each process in about the same way as the requests of the
+ * others, and thus to provide each of these processes with about the
+ * same throughput. This is exactly the desired throughput
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
+ * even more convenient distribution for (the process associated with)
+ * bfqq.
+ *
+ * In contrast, in any asymmetric or unfavorable scenario, device
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
+ * that bfqq receives its assigned fraction of the device throughput
+ * (see [1] for details).
+ *
+ * The problem is that idling may significantly reduce throughput with
+ * certain combinations of types of I/O and devices. An important
+ * example is sync random I/O on flash storage with command
+ * queueing. So, unless bfqq falls in cases where idling also boosts
+ * throughput, it is important to check conditions (i-a), (i-b) and
+ * (ii) accurately, so as to avoid idling when not strictly needed for
+ * service guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly check
+ * condition (ii). And, in case there are active groups, it becomes
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
+ * become false 'indirectly', it is enough that an active group
+ * contains more active processes or sub-groups than some other active
+ * group. More precisely, for conditions (i-a) or (i-b) to become
+ * false because of such a group, it is not even necessary that the
+ * group is (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still have
+ * some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed their
+ * share of the throughput even after being dispatched. In this
+ * respect, it is easy to show that, if a group frequently becomes
+ * inactive while still having in-flight requests, and if, when this
+ * happens, the group is not considered in the calculation of whether
+ * the scenario is asymmetric, then the group may fail to be
+ * guaranteed its fair share of the throughput (basically because
+ * idling may not be performed for the descendant processes of the
+ * group, but it had to be). We address this issue with the following
+ * bi-modal behavior, implemented in the function
+ * bfq_asymmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests waiting
+ * for completion, then only conditions (i-a) and (i-b) are actually
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
+ * idling is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
+ * hold, then idling is allowed, and the device tends to be prevented
+ * from queueing many requests, possibly of several processes. Since
+ * there are no groups with requests waiting for completion, then, to
+ * control conditions (i-a) and (i-b) it is enough to check just
+ * whether all the queues with requests waiting for completion also
+ * have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the two sub-conditions in the
+ * additional compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain the first
+ * sub-condition, we need to add that the function
+ * bfq_asymmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see comments on
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
+ * is checked explicitly here. More precisely, the compound condition
+ * below takes into account also the fact that, even if bfqq is being
+ * weight-raised, the scenario is still symmetric if all queues with
+ * requests waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise here, and
+ * differentiate between interactive weight raising and soft real-time
+ * weight raising.
+ *
+ * The second sub-condition checked in the compound condition is
+ * whether there is a fair amount of already in-flight I/O not
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
+ * following reason. The drive may decide to serve in-flight
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
+ * basically uncontrolled amount of I/O from other queues may be
+ * dispatched too, possibly causing the service of bfqq's I/O to be
+ * delayed even longer in the drive. This problem gets more and more
+ * serious as the speed and the queue depth of the drive grow,
+ * because, as these two quantities grow, the probability to find no
+ * queue busy but many requests in flight grows too. By contrast,
+ * plugging I/O dispatching minimizes the delay induced by already
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
+ * lose because of this delay.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the following
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
+ * in a time period during which all symmetry sub-conditions hold, and
+ * therefore the device is allowed to enqueue many requests, but at
+ * some later point in time some sub-condition stops to hold, then it
+ * may become impossible to make requests be served in the desired
+ * order until all the requests already queued in the device have been
+ * served. The last sub-condition commented above somewhat mitigates
+ * this problem for weight-raised queues.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ /* No point in idling for bfqq if it won't get requests any longer */
+ if (unlikely(!bfqq_process_refs(bfqq)))
+ return false;
+
+ return (bfqq->wr_coeff > 1 &&
+ (bfqd->wr_busy_queues <
+ bfq_tot_busy_queues(bfqd) ||
+ bfqd->rq_in_driver >=
+ bfqq->dispatched + 4)) ||
+ bfq_asymmetric_scenario(bfqd, bfqq);
+}
+
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ enum bfqq_expiration reason)
{
/*
* If this bfqq is shared between multiple processes, check
@@ -3056,7 +3470,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
bfq_mark_bfqq_split_coop(bfqq);
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * Consider queues with a higher finish virtual time than
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
+ * true, then bfqq's bandwidth would be violated if an
+ * uncontrolled amount of I/O from these queues were
+ * dispatched while bfqq is waiting for its new I/O to
+ * arrive. This is exactly what may happen if this is a forced
+ * expiration caused by a preemption attempt, and if bfqq is
+ * not re-scheduled. To prevent this from happening, re-queue
+ * bfqq if it needs I/O-dispatch plugging, even if it is
+ * empty. By doing so, bfqq is guaranteed to be served before the
+ * above queues (provided that bfqq is of course eligible).
+ */
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ !(reason == BFQQE_PREEMPTED &&
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
if (bfqq->dispatched == 0)
/*
* Overloading budget_timeout field to store
@@ -3073,7 +3502,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* Resort priority tree of potential close cooperators.
* See comments on bfq_pos_tree_add_move() for the unlikely().
*/
- if (unlikely(!bfqd->nonrot_with_queueing))
+ if (unlikely(!bfqd->nonrot_with_queueing &&
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
bfq_pos_tree_add_move(bfqd, bfqq);
}
@@ -3574,7 +4004,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* reason.
*/
__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
- if (__bfq_bfqq_expire(bfqd, bfqq))
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
/* bfqq is gone, no more actions on it */
return;
@@ -3653,6 +4083,10 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
bfqq_sequential_and_IO_bound,
idling_boosts_thr;
+ /* No point in idling for bfqq if it won't get requests any longer */
+ if (unlikely(!bfqq_process_refs(bfqq)))
+ return false;
+
bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) &&
bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq);
@@ -3721,184 +4155,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
}
/*
- * There is a case where idling does not have to be performed for
- * throughput concerns, but to preserve the throughput share of
- * the process associated with bfqq.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired throughput
- * distribution only in a completely symmetric, or favorably
- * skewed scenario where:
- * (i-a) each of these processes must get the same throughput as
- * the others,
- * (i-b) in case (i-a) does not hold, it holds that the process
- * associated with bfqq must receive a lower or equal
- * throughput than any of the other processes;
- * (ii) the I/O of each process has the same properties, in
- * terms of locality (sequential or random), direction
- * (reads or writes), request sizes, greediness
- * (from I/O-bound to sporadic), and so on;
-
- * In fact, in such a scenario, the drive tends to treat the requests
- * of each process in about the same way as the requests of the
- * others, and thus to provide each of these processes with about the
- * same throughput. This is exactly the desired throughput
- * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
- * even more convenient distribution for (the process associated with)
- * bfqq.
- *
- * In contrast, in any asymmetric or unfavorable scenario, device
- * idling (I/O-dispatch plugging) is certainly needed to guarantee
- * that bfqq receives its assigned fraction of the device throughput
- * (see [1] for details).
- *
- * The problem is that idling may significantly reduce throughput with
- * certain combinations of types of I/O and devices. An important
- * example is sync random I/O on flash storage with command
- * queueing. So, unless bfqq falls in cases where idling also boosts
- * throughput, it is important to check conditions (i-a), i(-b) and
- * (ii) accurately, so as to avoid idling when not strictly needed for
- * service guarantees.
- *
- * Unfortunately, it is extremely difficult to thoroughly check
- * condition (ii). And, in case there are active groups, it becomes
- * very difficult to check conditions (i-a) and (i-b) too. In fact,
- * if there are active groups, then, for conditions (i-a) or (i-b) to
- * become false 'indirectly', it is enough that an active group
- * contains more active processes or sub-groups than some other active
- * group. More precisely, for conditions (i-a) or (i-b) to become
- * false because of such a group, it is not even necessary that the
- * group is (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still have
- * some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed their
- * share of the throughput even after being dispatched. In this
- * respect, it is easy to show that, if a group frequently becomes
- * inactive while still having in-flight requests, and if, when this
- * happens, the group is not considered in the calculation of whether
- * the scenario is asymmetric, then the group may fail to be
- * guaranteed its fair share of the throughput (basically because
- * idling may not be performed for the descendant processes of the
- * group, but it had to be). We address this issue with the following
- * bi-modal behavior, implemented in the function
- * bfq_asymmetric_scenario().
- *
- * If there are groups with requests waiting for completion
- * (as commented above, some of these groups may even be
- * already inactive), then the scenario is tagged as
- * asymmetric, conservatively, without checking any of the
- * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
- * This behavior matches also the fact that groups are created
- * exactly if controlling I/O is a primary concern (to
- * preserve bandwidth and latency guarantees).
- *
- * On the opposite end, if there are no groups with requests waiting
- * for completion, then only conditions (i-a) and (i-b) are actually
- * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
- * idling is not performed, regardless of whether condition (ii)
- * holds. In other words, only if conditions (i-a) and (i-b) do not
- * hold, then idling is allowed, and the device tends to be prevented
- * from queueing many requests, possibly of several processes. Since
- * there are no groups with requests waiting for completion, then, to
- * control conditions (i-a) and (i-b) it is enough to check just
- * whether all the queues with requests waiting for completion also
- * have the same weight.
- *
- * Not checking condition (ii) evidently exposes bfqq to the
- * risk of getting less throughput than its fair share.
- * However, for queues with the same weight, a further
- * mechanism, preemption, mitigates or even eliminates this
- * problem. And it does so without consequences on overall
- * throughput. This mechanism and its benefits are explained
- * in the next three paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternatively, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * The motivation for using preemption instead of idling (for
- * queues with the same weight) is that, by not idling,
- * service guarantees are preserved (completely or at least in
- * part) without minimally sacrificing throughput. And, if
- * there is no active group, then the primary expectation for
- * this device is probably a high throughput.
- *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
- * bfq_asymmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
-static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- return (bfqq->wr_coeff > 1 &&
- bfqd->wr_busy_queues <
- bfq_tot_busy_queues(bfqd)) ||
- bfq_asymmetric_scenario(bfqd, bfqq);
-}
-
-/*
* For a queue that becomes empty, device idling is allowed only if
* this function returns true for that queue. As a consequence, since
* device idling plays a critical role for both throughput boosting
@@ -3924,6 +4180,10 @@ static bool bfq_better_to_idle(struct bfq_queue *bfqq)
struct bfq_data *bfqd = bfqq->bfqd;
bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar;
+ /* No point in idling for bfqq if it won't get requests any longer */
+ if (unlikely(!bfqq_process_refs(bfqq)))
+ return false;
+
if (unlikely(bfqd->strict_guarantees))
return true;
@@ -4156,22 +4416,95 @@ check_queue:
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
struct bfq_queue *async_bfqq =
bfqq->bic && bfqq->bic->bfqq[0] &&
- bfq_bfqq_busy(bfqq->bic->bfqq[0]) ?
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
+ bfqq->bic->bfqq[0]->next_rq ?
bfqq->bic->bfqq[0] : NULL;
/*
- * If the process associated with bfqq has also async
- * I/O pending, then inject it
- * unconditionally. Injecting I/O from the same
- * process can cause no harm to the process. On the
- * contrary, it can only increase bandwidth and reduce
- * latency for the process.
+ * The next three mutually-exclusive ifs decide
+ * whether to try injection, and choose the queue to
+ * pick an I/O request from.
+ *
+ * The first if checks whether the process associated
+ * with bfqq has also async I/O pending. If so, it
+ * injects such I/O unconditionally. Injecting async
+ * I/O from the same process can cause no harm to the
+ * process. On the contrary, it can only increase
+ * bandwidth and reduce latency for the process.
+ *
+ * The second if checks whether there happens to be a
+ * non-empty waker queue for bfqq, i.e., a queue whose
+ * I/O needs to be completed for bfqq to receive new
+ * I/O. This happens, e.g., if bfqq is associated with
+ * a process that does some sync. A sync generates
+ * extra blocking I/O, which must be completed before
+ * the process associated with bfqq can go on with its
+ * I/O. If the I/O of the waker queue is not served,
+ * then bfqq remains empty, and no I/O is dispatched,
+ * until the idle timeout fires for bfqq. This is
+ * likely to result in lower bandwidth and higher
+ * latencies for bfqq, and in a severe loss of total
+ * throughput. The best action to take is therefore to
+ * serve the waker queue as soon as possible. So do it
+ * (without relying on the third alternative below for
+ * eventually serving waker_bfqq's I/O; see the last
+ * paragraph for further details). This systematic
+ * injection of I/O from the waker queue does not
+ * cause any delay to bfqq's I/O. On the contrary,
+ * bfqq's next I/O is brought forward dramatically,
+ * for it is not blocked for milliseconds.
+ *
+ * The third if checks whether bfqq is a queue for
+ * which it is better to avoid injection. It is so if
+ * bfqq delivers more throughput when served without
+ * any further I/O from other queues in the middle, or
+ * if the service times of bfqq's I/O requests both
+ * count more than overall throughput, and may be
+ * easily increased by injection (this happens if bfqq
+ * has a short think time). If none of these
+ * conditions holds, then a candidate queue for
+ * injection is looked for through
+ * bfq_choose_bfqq_for_injection(). Note that the
+ * latter may return NULL (for example if the inject
+ * limit for bfqq is currently 0).
+ *
+ * NOTE: motivation for the second alternative
+ *
+ * Thanks to the way the inject limit is updated in
+ * bfq_update_has_short_ttime(), it is rather likely
+ * that, if I/O is being plugged for bfqq and the
+ * waker queue has pending I/O requests that are
+ * blocking bfqq's I/O, then the third alternative
+ * above lets the waker queue get served before the
+ * I/O-plugging timeout fires. So one may deem the
+ * second alternative superfluous. It is not, because
+ * the third alternative may be way less effective in
+ * case of a synchronization. For two main
+ * reasons. First, throughput may be low because the
+ * inject limit may be too low to guarantee the same
+ * amount of injected I/O, from the waker queue or
+ * other queues, that the second alternative
+ * guarantees (the second alternative unconditionally
+ * injects a pending I/O request of the waker queue
+ * for each bfq_dispatch_request()). Second, with the
+ * third alternative, the duration of the plugging,
+ * i.e., the time before bfqq finally receives new I/O,
+ * may not be minimized, because the waker queue may
+ * happen to be served only after other queues.
*/
if (async_bfqq &&
icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
bfq_bfqq_budget_left(async_bfqq))
bfqq = bfqq->bic->bfqq[0];
+ else if (bfq_bfqq_has_waker(bfqq) &&
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
+ bfqq->next_rq &&
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
+ bfqq->waker_bfqq) <=
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
+ )
+ bfqq = bfqq->waker_bfqq;
else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
(bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
!bfq_bfqq_has_short_ttime(bfqq)))
@@ -4403,7 +4736,7 @@ exit:
return rq;
}
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static void bfq_update_dispatch_stats(struct request_queue *q,
struct request *rq,
struct bfq_queue *in_serv_queue,
@@ -4453,7 +4786,7 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
struct request *rq,
struct bfq_queue *in_serv_queue,
bool idle_timer_disabled) {}
-#endif
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -4489,9 +4822,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
*/
void bfq_put_queue(struct bfq_queue *bfqq)
{
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_queue *item;
+ struct hlist_node *n;
struct bfq_group *bfqg = bfqq_group(bfqq);
-#endif
if (bfqq->bfqd)
bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
@@ -4533,10 +4866,38 @@ void bfq_put_queue(struct bfq_queue *bfqq)
bfqq->bfqd->burst_size--;
}
+ /*
+ * bfqq does not exist any longer, so it cannot be woken by
+ * any other queue, and cannot wake any other queue. Then bfqq
+ * must be removed from the woken list of its possible waker
+ * queue, and all queues in the woken list of bfqq must stop
+ * having a waker queue. Strictly speaking, these updates
+ * should be performed when bfqq remains with no I/O source
+ * attached to it, which happens before bfqq gets freed. In
+ * particular, this happens when the last process associated
+ * with bfqq exits or gets associated with a different
+ * queue. However, both events lead to bfqq being freed soon,
+ * and dangling references would come out only after bfqq gets
+ * freed. So these updates are done here, as a simple and safe
+ * way to handle all cases.
+ */
+ /* remove bfqq from woken list */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+
+ /* reset waker for all queues in woken list */
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+ woken_list_node) {
+ item->waker_bfqq = NULL;
+ bfq_clear_bfqq_has_waker(item);
+ hlist_del_init(&item->woken_list_node);
+ }
+
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
+
kmem_cache_free(bfq_pool, bfqq);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqg_and_blkg_put(bfqg);
-#endif
}
static void bfq_put_cooperator(struct bfq_queue *bfqq)
@@ -4561,7 +4922,7 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
if (bfqq == bfqd->in_service_queue) {
- __bfq_bfqq_expire(bfqd, bfqq);
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
bfq_schedule_dispatch(bfqd);
}
@@ -4569,7 +4930,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_cooperator(bfqq);
- bfq_put_queue(bfqq); /* release process reference */
+ bfq_release_process_ref(bfqd, bfqq);
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
@@ -4616,8 +4977,9 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
switch (ioprio_class) {
default:
- dev_err(bfqq->bfqd->queue->backing_dev_info->dev,
- "bfq: bad prio class %d\n", ioprio_class);
+ pr_err("bdi %s: bfq: bad prio class %d\n",
+ bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
+ ioprio_class);
/* fall through */
case IOPRIO_CLASS_NONE:
/*
@@ -4671,8 +5033,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
- /* release process reference on this queue */
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqd, bfqq);
bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
bic_set_bfqq(bic, bfqq, false);
}
@@ -4688,6 +5049,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
INIT_HLIST_NODE(&bfqq->burst_list_node);
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
+ INIT_HLIST_HEAD(&bfqq->woken_list);
bfqq->ref = 0;
bfqq->bfqd = bfqd;
@@ -4855,7 +5218,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
struct bfq_io_cq *bic)
{
- bool has_short_ttime = true;
+ bool has_short_ttime = true, state_changed;
/*
* No need to update has_short_ttime if bfqq is async or in
@@ -4880,13 +5243,102 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
has_short_ttime = false;
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
- has_short_ttime);
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
if (has_short_ttime)
bfq_mark_bfqq_has_short_ttime(bfqq);
else
bfq_clear_bfqq_has_short_ttime(bfqq);
+
+ /*
+ * Until the base value for the total service time gets
+ * finally computed for bfqq, the inject limit does depend on
+ * the think-time state (short|long). In particular, the limit
+ * is 0 or 1 if the think time is deemed, respectively, as
+ * short or long (details in the comments in
+ * bfq_update_inject_limit()). Accordingly, the next
+ * instructions reset the inject limit if the think-time state
+ * has changed and the above base value is still to be
+ * computed.
+ *
+ * However, the reset is performed only if more than 100 ms
+ * have elapsed since the last update of the inject limit, or
+ * (inclusive) if the change is from short to long think
+ * time. The reason for this waiting is as follows.
+ *
+ * bfqq may have a long think time because of a
+ * synchronization with some other queue, i.e., because the
+ * I/O of some other queue may need to be completed for bfqq
+ * to receive new I/O. Details in the comments on the choice
+ * of the queue for injection in bfq_select_queue().
+ *
+ * As stressed in those comments, if such a synchronization is
+ * actually in place, then, without injection on bfqq, the
+ * blocking I/O cannot happen to be served while bfqq is in
+ * service. As a consequence, if bfqq is granted
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
+ * is dispatched, until the idle timeout fires. This is likely
+ * to result in lower bandwidth and higher latencies for bfqq,
+ * and in a severe loss of total throughput.
+ *
+ * On the opposite end, a non-zero inject limit may allow the
+ * I/O that blocks bfqq to be executed soon, and therefore
+ * bfqq to receive new I/O soon.
+ *
+ * But, if the blocking gets actually eliminated, then the
+ * next think-time sample for bfqq may be very low. This in
+ * turn may cause bfqq's think time to be deemed
+ * short. Without the 100 ms barrier, this new state change
+ * would cause the body of the next if to be executed
+ * immediately. But this would set the inject limit to 0.
+ * Without injection, the blocking I/O would cause the
+ * think time of bfqq to become long again, and therefore the
+ * inject limit to be raised again, and so on. The only effect
+ * of such a steady oscillation between the two think-time
+ * states would be to prevent effective injection on bfqq.
+ *
+ * In contrast, if the inject limit is not reset during such a
+ * long time interval as 100 ms, then the number of short
+ * think time samples can grow significantly before the reset
+ * is performed. As a consequence, the think time state can
+ * become stable before the reset. Therefore there will be no
+ * state change when the 100 ms elapse, and no reset of the
+ * inject limit. The inject limit remains steadily equal to 1
+ * both during and after the 100 ms. So injection can be
+ * performed at all times, and throughput gets boosted.
+ *
+ * An inject limit equal to 1 is however in conflict, in
+ * general, with the fact that the think time of bfqq is
+ * short, because injection may be likely to delay bfqq's I/O
+ * (as explained in the comments in
+ * bfq_update_inject_limit()). But this does not happen in
+ * this special case, because bfqq's low think time is due to
+ * an effective handling of a synchronization, through
+ * injection. In this special case, bfqq's I/O does not get
+ * delayed by injection; on the contrary, bfqq's I/O is
+ * brought forward, because it is not blocked for
+ * milliseconds.
+ *
+ * In addition, serving the blocking I/O much sooner, and much
+ * more frequently than once per I/O-plugging timeout, makes
+ * it much quicker to detect a waker queue (the concept of
+ * waker queue is defined in the comments in
+ * bfq_add_request()). This makes it possible to start sooner
+ * to boost throughput more effectively, by injecting the I/O
+ * of the waker queue unconditionally on every
+ * bfq_dispatch_request().
+ *
+ * One last, important benefit of not resetting the inject
+ * limit before 100 ms is that, during this time interval, the
+ * base value for the total service time is likely to get
+ * finally computed for bfqq, freeing the inject limit from
+ * its relation with the think time.
+ */
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
+ msecs_to_jiffies(100)) ||
+ !has_short_ttime))
+ bfq_reset_inject_limit(bfqd, bfqq);
}
/*
@@ -4896,19 +5348,9 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
- struct bfq_io_cq *bic = RQ_BIC(rq);
-
if (rq->cmd_flags & REQ_META)
bfqq->meta_pending++;
- bfq_update_io_thinktime(bfqd, bfqq);
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
- bfq_update_io_seektime(bfqd, bfqq, rq);
-
- bfq_log_bfqq(bfqd, bfqq,
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
@@ -4996,6 +5438,10 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfqq = new_bfqq;
}
+ bfq_update_io_thinktime(bfqd, bfqq);
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+
waiting = bfqq && bfq_bfqq_wait_request(bfqq);
bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
@@ -5008,7 +5454,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
return idle_timer_disabled;
}
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
@@ -5038,7 +5484,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
unsigned int cmd_flags) {}
-#endif
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
@@ -5049,6 +5495,10 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool idle_timer_disabled = false;
unsigned int cmd_flags;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
+ bfqg_stats_update_legacy_io(q, rq);
+#endif
spin_lock_irq(&bfqd->lock);
if (blk_mq_sched_try_insert_merge(q, rq)) {
spin_unlock_irq(&bfqd->lock);
@@ -5061,12 +5511,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
- if (at_head || blk_rq_is_passthrough(rq)) {
+ if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
- } else { /* bfqq is assumed to be non null here */
+ } else {
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
@@ -5201,6 +5651,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
1UL<<(BFQ_RATE_SHIFT - 10))
bfq_update_rate_reset(bfqd, NULL);
bfqd->last_completion = now_ns;
+ bfqd->last_completed_rq_bfqq = bfqq;
/*
* If we are waiting to discover whether the request pattern
@@ -5382,14 +5833,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit;
- if (bfqq->last_serv_time_ns > 0) {
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
if (tot_time_ns >= threshold && old_limit > 0) {
bfqq->inject_limit--;
bfqq->decrease_time_jif = jiffies;
} else if (tot_time_ns < threshold &&
- old_limit < bfqd->max_rq_in_driver<<1)
+ old_limit <= bfqd->max_rq_in_driver)
bfqq->inject_limit++;
}
@@ -5398,19 +5849,39 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
* total service time, and there seem to be the right
* conditions to do it, or we can lower the last base value
* computed.
+ *
+ * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
+ * request in flight other than the one being completed, because
+ * this function is in the code path that handles the completion
+ * of a request of bfqq and, in particular, is executed before
+ * bfqd->rq_in_driver is decremented in such a code path.
*/
- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) ||
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
tot_time_ns < bfqq->last_serv_time_ns) {
+ if (bfqq->last_serv_time_ns == 0) {
+ /*
+ * Now we certainly have a base value: make sure we
+ * start trying injection.
+ */
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
+ }
bfqq->last_serv_time_ns = tot_time_ns;
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
/*
- * Now we certainly have a base value: make sure we
- * start trying injection.
+ * No I/O injected and no request still in service in
+ * the drive: these are the exact conditions for
+ * computing the base value of the total service time
+ * for bfqq. So let's update this value, because it is
+ * rather variable. For example, it varies if the size
+ * or the spatial locality of the I/O requests in bfqq
+ * change.
*/
- bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
- }
+ bfqq->last_serv_time_ns = tot_time_ns;
+
/* update complete, not waiting for any request completion any longer */
bfqd->waited_rq = NULL;
+ bfqd->rqs_injected = false;
}
/*
@@ -5508,6 +5979,8 @@ static void bfq_finish_requeue_request(struct request *rq)
}
/*
+ * Removes the association between the current task and bfqq, assuming
+ * that bic points to the bfq iocontext of the task.
* Returns NULL if a new bfqq should be allocated, or the old bfqq if this
* was the last process referring to that bfqq.
*/
@@ -5527,7 +6000,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
bfq_put_cooperator(bfqq);
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
return NULL;
}
@@ -5600,7 +6073,7 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
* comments on bfq_init_rq for the reason behind this delayed
* preparation.
*/
-static void bfq_prepare_request(struct request *rq, struct bio *bio)
+static void bfq_prepare_request(struct request *rq)
{
/*
* Regardless of whether we have an icq attached, we have to
@@ -5742,20 +6215,28 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
return bfqq;
}
-static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq)
+static void
+bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
- struct bfq_data *bfqd = bfqq->bfqd;
enum bfqq_expiration reason;
unsigned long flags;
spin_lock_irqsave(&bfqd->lock, flags);
- bfq_clear_bfqq_wait_request(bfqq);
+ /*
+ * Considering that bfqq may be in a race, we should first check
+ * whether bfqq is still in service before doing anything to it. If
+ * the racing bfqq is not in service, it has already been expired
+ * through __bfq_bfqq_expire(), and its wait_request flag has
+ * been cleared in __bfq_bfqd_reset_in_service().
+ */
if (bfqq != bfqd->in_service_queue) {
spin_unlock_irqrestore(&bfqd->lock, flags);
return;
}
+ bfq_clear_bfqq_wait_request(bfqq);
+
if (bfq_bfqq_budget_timeout(bfqq))
/*
* Also here the queue can be safely expired
@@ -5800,7 +6281,7 @@ static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer)
* early.
*/
if (bfqq)
- bfq_idle_slice_timer_body(bfqq);
+ bfq_idle_slice_timer_body(bfqd, bfqq);
return HRTIMER_NORESTART;
}
@@ -5915,10 +6396,10 @@ static void bfq_exit_queue(struct elevator_queue *e)
hrtimer_cancel(&bfqd->idle_slice_timer);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* release oom-queue reference to root group */
bfqg_and_blkg_put(bfqd->root_group);
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
#else
spin_lock_irq(&bfqd->lock);
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index c2faa77824f8..cd224aaf9f52 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -10,6 +10,8 @@
#include <linux/hrtimer.h>
#include <linux/blk-cgroup.h>
+#include "blk-cgroup-rwstat.h"
+
#define BFQ_IOPRIO_CLASSES 3
#define BFQ_CL_IDLE_TIMEOUT (HZ/5)
@@ -148,7 +150,7 @@ struct bfq_entity {
* Flag, true if the entity is on a tree (either the active or
* the idle one of its service_tree) or is in service.
*/
- bool on_st;
+ bool on_st_or_in_serv;
/* B-WF2Q+ start and finish timestamps [sectors/weight] */
u64 start, finish;
@@ -168,6 +170,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
+ /* device weight; if non-zero, it overrides the default weight of
+ * the bfq_group_data */
+ int dev_weight;
/* weight of the queue */
int weight;
/* next weight if a change is in progress */
@@ -357,6 +362,24 @@ struct bfq_queue {
/* max service rate measured so far */
u32 max_service_rate;
+
+ /*
+ * Pointer to the waker queue for this queue, i.e., to the
+ * queue Q such that this queue happens to get new I/O right
+ * after some I/O request of Q is completed. For details, see
+ * the comments on the choice of the queue for injection in
+ * bfq_select_queue().
+ */
+ struct bfq_queue *waker_bfqq;
+ /* node for woken_list, see below */
+ struct hlist_node woken_list_node;
+ /*
+ * Head of the list of the woken queues for this queue, i.e.,
+ * of the list of the queues for which this queue is a waker
+ * queue. This list is used to reset the waker_bfqq pointer in
+ * the woken queues when this queue exits.
+ */
+ struct hlist_head woken_list;
};
/**
@@ -533,6 +556,9 @@ struct bfq_data {
/* time of last request completion (ns) */
u64 last_completion;
+ /* bfqq owning the last completed rq */
+ struct bfq_queue *last_completed_rq_bfqq;
+
/* time of last transition from empty to non-empty (ns) */
u64 last_empty_occupied_ns;
@@ -743,7 +769,8 @@ enum bfqq_state_flags {
* update
*/
BFQQF_coop, /* bfqq is shared */
- BFQQF_split_coop /* shared bfqq will be split */
+ BFQQF_split_coop, /* shared bfqq will be split */
+ BFQQF_has_waker /* bfqq has a waker queue */
};
#define BFQ_BFQQ_FNS(name) \
@@ -763,6 +790,7 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS
/* Expiration reasons. */
@@ -777,8 +805,16 @@ enum bfqq_expiration {
BFQQE_PREEMPTED /* preemption in progress */
};
+struct bfq_stat {
+ struct percpu_counter cpu_cnt;
+ atomic64_t aux_cnt;
+};
+
struct bfqg_stats {
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+ /* basic stats */
+ struct blkg_rwstat bytes;
+ struct blkg_rwstat ios;
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -788,25 +824,25 @@ struct bfqg_stats {
/* number of IOs queued up */
struct blkg_rwstat queued;
/* total disk time and nr sectors dispatched by this group */
- struct blkg_stat time;
+ struct bfq_stat time;
/* sum of number of ios queued across all samples */
- struct blkg_stat avg_queue_size_sum;
+ struct bfq_stat avg_queue_size_sum;
/* count of samples taken for average */
- struct blkg_stat avg_queue_size_samples;
+ struct bfq_stat avg_queue_size_samples;
/* how many times this group has been removed from service tree */
- struct blkg_stat dequeue;
+ struct bfq_stat dequeue;
/* total time spent waiting for it to be assigned a timeslice. */
- struct blkg_stat group_wait_time;
+ struct bfq_stat group_wait_time;
/* time spent idling for this blkcg_gq */
- struct blkg_stat idle_time;
+ struct bfq_stat idle_time;
/* total time with empty current active q with other requests queued */
- struct blkg_stat empty_time;
+ struct bfq_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
u64 start_group_wait_time;
u64 start_idle_time;
u64 start_empty_time;
uint16_t flags;
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
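
The bfq_stat pair introduced above couples a per-cpu counter for the update hot path with an atomic64 auxiliary counter into which old samples can be folded on reset. A minimal sketch, assuming the standard percpu_counter API and the BLKG_STAT_CPU_BATCH batch size from blk-cgroup.h, of how such a counter is typically bumped and read (helper names are illustrative, not necessarily the ones the patch adds elsewhere):

static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
{
	/* cheap per-cpu update on the I/O fast path */
	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
}

static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
{
	/* slow-path readout; reset paths would fold cpu_cnt into aux_cnt */
	return percpu_counter_sum_positive(&stat->cpu_cnt);
}
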
@@ -885,6 +921,7 @@ struct bfq_group {
#else
struct bfq_group {
+ struct bfq_entity entity;
struct bfq_sched_data sched_data;
struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
@@ -918,6 +955,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
@@ -925,6 +963,7 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */
+void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op);
void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
@@ -947,6 +986,7 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
+void bfqg_and_blkg_get(struct bfq_group *bfqg);
void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -1031,6 +1071,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
@@ -1047,6 +1089,8 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
char pid_str[MAX_PID_STR_LENGTH]; \
+ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
+ break; \
bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
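
The two bfq_log_bfqq() hunks above add an early bail-out so that the pid formatting and trace call are skipped entirely when blktrace messages are not enabled; the break works because the macro body is a do { ... } while (0) block. A reduced sketch of the pattern, with the cgroup variant's extra arguments left out:

#define example_bfq_log(bfqd, fmt, args...) do {			\
	/* "break" leaves the do-while, skipping the work below */	\
	if (likely(!blk_trace_note_message_enabled((bfqd)->queue)))	\
		break;							\
	blk_add_trace_msg((bfqd)->queue, fmt, ##args);			\
} while (0)
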
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index c9ba225081ce..eb0e2a6daabe 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -277,10 +277,7 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity)
*/
static u64 bfq_delta(unsigned long service, unsigned long weight)
{
- u64 d = (u64)service << WFQ_SERVICE_SHIFT;
-
- do_div(d, weight);
- return d;
+ return div64_ul((u64)service << WFQ_SERVICE_SHIFT, weight);
}
/**
@@ -536,7 +533,9 @@ static void bfq_get_entity(struct bfq_entity *entity)
bfqq->ref++;
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, bfqq->ref);
- }
+ } else
+ bfqg_and_blkg_get(container_of(entity, struct bfq_group,
+ entity));
}
/**
@@ -648,10 +647,16 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
- entity->on_st = false;
+ entity->on_st_or_in_serv = false;
st->wsum -= entity->weight;
- if (bfqq && !is_in_service)
+ if (is_in_service)
+ return;
+
+ if (bfqq)
bfq_put_queue(bfqq);
+ else
+ bfqg_and_blkg_put(container_of(entity, struct bfq_group,
+ entity));
}
/**
@@ -744,6 +749,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
}
#endif
+ /* Matches the smp_wmb() in bfq_group_set_weight. */
+ smp_rmb();
old_st->wsum -= entity->weight;
if (entity->new_weight != entity->orig_weight) {
@@ -1000,7 +1007,7 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
*/
bfq_get_entity(entity);
- entity->on_st = true;
+ entity->on_st_or_in_serv = true;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -1166,7 +1173,10 @@ bool __bfq_deactivate_entity(struct bfq_entity *entity, bool ins_into_idle_tree)
struct bfq_service_tree *st;
bool is_in_service;
- if (!entity->on_st) /* entity never activated, or already inactive */
+ if (!entity->on_st_or_in_serv) /*
+ * entity never activated, or
+ * already inactive
+ */
return false;
/*
@@ -1621,7 +1631,7 @@ bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd)
* service tree either, then release the service reference to
* the queue it represents (taken with bfq_get_entity).
*/
- if (!in_serv_entity->on_st) {
+ if (!in_serv_entity->on_st_or_in_serv) {
/*
* If no process is referencing in_serv_bfqq any
* longer, then the service reference may be the only
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..9ffd7e289554 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -24,6 +24,19 @@ void blk_flush_integrity(void)
flush_workqueue(kintegrityd_wq);
}
+static void __bio_integrity_free(struct bio_set *bs,
+ struct bio_integrity_payload *bip)
+{
+ if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
+ if (bip->bip_vec)
+ bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
+ bip->bip_slab);
+ mempool_free(bip, &bs->bio_integrity_pool);
+ } else {
+ kfree(bip);
+ }
+}
+
/**
* bio_integrity_alloc - Allocate integrity payload and attach it to bio
* @bio: bio to attach integrity metadata to
@@ -42,6 +55,9 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
struct bio_set *bs = bio->bi_pool;
unsigned inline_vecs;
+ if (WARN_ON_ONCE(bio_has_crypt_ctx(bio)))
+ return ERR_PTR(-EOPNOTSUPP);
+
if (!bs || !mempool_initialized(&bs->bio_integrity_pool)) {
bip = kmalloc(struct_size(bip, bip_inline_vecs, nr_vecs), gfp_mask);
inline_vecs = nr_vecs;
@@ -75,7 +91,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
return bip;
err:
- mempool_free(bip, &bs->bio_integrity_pool);
+ __bio_integrity_free(bs, bip);
return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(bio_integrity_alloc);
@@ -87,7 +103,7 @@ EXPORT_SYMBOL(bio_integrity_alloc);
* Description: Used to free the integrity portion of a bio. Usually
* called from bio_free().
*/
-static void bio_integrity_free(struct bio *bio)
+void bio_integrity_free(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_set *bs = bio->bi_pool;
@@ -96,14 +112,7 @@ static void bio_integrity_free(struct bio *bio)
kfree(page_address(bip->bip_vec->bv_page) +
bip->bip_vec->bv_offset);
- if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
- bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, bip->bip_slab);
-
- mempool_free(bip, &bs->bio_integrity_pool);
- } else {
- kfree(bip);
- }
-
+ __bio_integrity_free(bs, bip);
bio->bi_integrity = NULL;
bio->bi_opf &= ~REQ_INTEGRITY;
}
@@ -276,8 +285,11 @@ bool bio_integrity_prep(struct bio *bio)
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);
- if (ret == 0)
- return false;
+ if (ret == 0) {
+ printk(KERN_ERR "could not attach integrity payload\n");
+ status = BLK_STS_RESOURCE;
+ goto err_end_io;
+ }
if (ret < bytes)
break;
diff --git a/block/bio.c b/block/bio.c
index 67bba12d273b..496aff366767 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,9 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
+#include <linux/highmem.h>
+#include <linux/sched/sysctl.h>
+#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -232,6 +235,11 @@ fallback:
void bio_uninit(struct bio *bio)
{
bio_disassociate_blkg(bio);
+
+ if (bio_integrity(bio))
+ bio_integrity_free(bio);
+
+ bio_crypt_free_ctx(bio);
}
EXPORT_SYMBOL(bio_uninit);
@@ -534,6 +542,137 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
}
EXPORT_SYMBOL(zero_fill_bio_iter);
+
+/**
+ * bio_truncate - truncate the bio to the smaller size @new_size
+ * @bio: the bio to be truncated
+ * @new_size: new size for truncating the bio
+ *
+ * Description:
+ * Truncate the bio to the new size @new_size. If bio_op(bio) is
+ * REQ_OP_READ, zero the truncated part. This function should only
+ * be used for handling corner cases, such as bio eod.
+ */
+void bio_truncate(struct bio *bio, unsigned new_size)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned int done = 0;
+ bool truncated = false;
+
+ if (new_size >= bio->bi_iter.bi_size)
+ return;
+
+ if (bio_op(bio) != REQ_OP_READ)
+ goto exit;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (done + bv.bv_len > new_size) {
+ unsigned offset;
+
+ if (!truncated)
+ offset = new_size - done;
+ else
+ offset = 0;
+ zero_user(bv.bv_page, offset, bv.bv_len - offset);
+ truncated = true;
+ }
+ done += bv.bv_len;
+ }
+
+ exit:
+ /*
+ * Don't touch the bvec table here; keep it effectively immutable, since
+ * the fs bio user has to retrieve all pages via bio_for_each_segment_all
+ * in its .bi_end_io() callback.
+ *
+ * Truncating the bio by updating .bi_size is enough, since drivers can
+ * build a correct bvec from the updated .bi_size.
+ */
+ bio->bi_iter.bi_size = new_size;
+}
+
+/**
+ * guard_bio_eod - truncate a BIO to fit the block device
+ * @bio: bio to truncate
+ *
+ * This allows us to do IO even on the odd last sectors of a device, even if the
+ * block size is some multiple of the physical sector size.
+ *
+ * We'll just truncate the bio to the size of the device, and clear the end of
+ * the buffer head manually. Truly out-of-range accesses will turn into actual
+ * I/O errors; this only handles the "we need to be able to do I/O at the final
+ * sector" case.
+ */
+void guard_bio_eod(struct bio *bio)
+{
+ sector_t maxsector;
+ struct hd_struct *part;
+
+ rcu_read_lock();
+ part = __disk_get_part(bio->bi_disk, bio->bi_partno);
+ if (part)
+ maxsector = part_nr_sects_read(part);
+ else
+ maxsector = get_capacity(bio->bi_disk);
+ rcu_read_unlock();
+
+ if (!maxsector)
+ return;
+
+ /*
+ * If the *whole* IO is past the end of the device,
+ * let it through, and the IO layer will turn it into
+ * an EIO.
+ */
+ if (unlikely(bio->bi_iter.bi_sector >= maxsector))
+ return;
+
+ maxsector -= bio->bi_iter.bi_sector;
+ if (likely((bio->bi_iter.bi_size >> 9) <= maxsector))
+ return;
+
+ bio_truncate(bio, maxsector << 9);
+}
+
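
A hedged sketch of the clamping arithmetic guard_bio_eod() performs, with the device capacity, start sector and request size as assumed example values:

	/* Illustration only: with capacity 1000, start sector 992 and a 64-sector
	 * read, only 8 sectors remain, so the bio is truncated to 8 << 9 bytes. */
	static unsigned int example_clamp_to_eod(sector_t capacity, sector_t start,
						 unsigned int bytes)
	{
		sector_t remaining = capacity - start;

		if ((bytes >> 9) > remaining)
			bytes = remaining << 9;	/* what bio_truncate() is asked to do */
		return bytes;
	}
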
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -558,14 +697,6 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_phys_segments;
-}
-
/**
* __bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: destination bio
@@ -619,6 +750,8 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
__bio_clone_fast(b, bio);
+ bio_crypt_clone(b, bio, gfp_mask);
+
if (bio_integrity(bio)) {
int ret;
@@ -634,6 +767,12 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
}
EXPORT_SYMBOL(bio_clone_fast);
+const char *bio_devname(struct bio *bio, char *buf)
+{
+ return disk_name(bio->bi_disk, bio->bi_partno, buf);
+}
+EXPORT_SYMBOL(bio_devname);
+
static inline bool page_is_mergeable(const struct bio_vec *bv,
struct page *page, unsigned int len, unsigned int off,
bool *same_page)
@@ -654,87 +793,68 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
}
/*
- * Check if the @page can be added to the current segment(@bv), and make
- * sure to call it only if page_is_mergeable(@bv, @page) is true
+ * Try to merge a page into a segment, while obeying the hardware segment
+ * size limit. This is not for normal read/write bios, but for passthrough
+ * or Zone Append operations that we can't split.
*/
-static bool can_add_page_to_seg(struct request_queue *q,
- struct bio_vec *bv, struct page *page, unsigned len,
- unsigned offset)
+static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
+ struct page *page, unsigned len,
+ unsigned offset, bool *same_page)
{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
-
if (bv->bv_len + len > queue_max_segment_size(q))
return false;
-
- return true;
+ return __bio_try_merge_page(bio, page, len, offset, same_page);
}
/**
- * __bio_add_pc_page - attempt to add page to passthrough bio
- * @q: the target queue
- * @bio: destination bio
- * @page: page to add
- * @len: vec entry length
- * @offset: vec entry offset
- * @put_same_page: put the page if it is same with last added page
- *
- * Attempt to add a page to the bio_vec maplist. This can fail for a
- * number of reasons, such as the bio being full or target block device
- * limitations. The target block device must allow bio's up to PAGE_SIZE,
- * so it is always possible to add a single page to an empty bio.
+ * bio_add_hw_page - attempt to add a page to a bio with hw constraints
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ * @max_sectors: maximum number of sectors that can be added
+ * @same_page: return if the segment has been merged inside the same page
*
- * This should only be used by passthrough bios.
+ * Add a page to a bio while respecting the hardware max_sectors, max_segment
+ * and gap limitations.
*/
-static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
+int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
- bool put_same_page)
+ unsigned int max_sectors, bool *same_page)
{
struct bio_vec *bvec;
- bool same_page = false;
- /*
- * cloned bio must not modify vec list
- */
- if (unlikely(bio_flagged(bio, BIO_CLONED)))
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > queue_max_hw_sectors(q))
+ if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
return 0;
if (bio->bi_vcnt > 0) {
- bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == bvec->bv_page &&
- offset == bvec->bv_offset + bvec->bv_len) {
- if (put_same_page)
- put_page(page);
- bvec->bv_len += len;
- goto done;
- }
+ if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
+ return len;
/*
- * If the queue doesn't support SG gaps and adding this
- * offset would create a gap, disallow it.
+ * If the queue doesn't support SG gaps and adding this segment
+ * would create a gap, disallow it.
*/
+ bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bvec_gap_to_prev(q, bvec, offset))
return 0;
-
- if (page_is_mergeable(bvec, page, len, offset, &same_page) &&
- can_add_page_to_seg(q, bvec, page, len, offset)) {
- bvec->bv_len += len;
- goto done;
- }
}
if (bio_full(bio, len))
return 0;
- if (bio->bi_phys_segments >= queue_max_segments(q))
+ if (bio->bi_vcnt >= queue_max_segments(q))
return 0;
bvec = &bio->bi_io_vec[bio->bi_vcnt];
@@ -742,17 +862,31 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
- done:
bio->bi_iter.bi_size += len;
- bio->bi_phys_segments = bio->bi_vcnt;
- bio_set_flag(bio, BIO_SEG_VALID);
return len;
}
+/**
+ * bio_add_pc_page - attempt to add page to passthrough bio
+ * @q: the target queue
+ * @bio: destination bio
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
+ *
+ * Attempt to add a page to the bio_vec maplist. This can fail for a
+ * number of reasons, such as the bio being full or target block device
+ * limitations. The target block device must allow bio's up to PAGE_SIZE,
+ * so it is always possible to add a single page to an empty bio.
+ *
+ * This should only be used by passthrough bios.
+ */
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset)
{
- return __bio_add_pc_page(q, bio, page, len, offset, false);
+ bool same_page = false;
+ return bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_hw_sectors(q), &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);
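
A hedged usage sketch for the passthrough helper above; the queue, bio and page are assumed to come from the caller, and the error code is a hypothetical choice. The key point from the kernel-doc is that a return value shorter than len means a queue limit was hit:

	static int example_add_passthrough_page(struct request_queue *q,
						struct bio *bio, struct page *page,
						unsigned int len)
	{
		/* bio_add_pc_page() returns the number of bytes actually added */
		if (bio_add_pc_page(q, bio, page, len, 0) < len)
			return -EIO;	/* hypothetical error choice */
		return 0;
	}
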
@@ -782,6 +916,8 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return false;
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
@@ -815,6 +951,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
+
+ if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
+ bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
@@ -842,23 +981,21 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
-static void bio_get_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
- bio_for_each_segment_all(bvec, bio, iter_all)
- get_page(bvec->bv_page);
-}
-
-static void bio_release_pages(struct bio *bio)
-{
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ if (bio_flagged(bio, BIO_NO_PAGE_REF))
+ return;
- bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ if (mark_dirty && !PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
+ }
}
+EXPORT_SYMBOL_GPL(bio_release_pages);
static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
{
@@ -933,6 +1070,50 @@ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return 0;
}
+static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
+{
+ unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
+ unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
+ struct request_queue *q = bio->bi_disk->queue;
+ unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
+ struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
+ struct page **pages = (struct page **)bv;
+ ssize_t size, left;
+ unsigned len, i;
+ size_t offset;
+
+ if (WARN_ON_ONCE(!max_append_sectors))
+ return 0;
+
+ /*
+ * Move page array up in the allocated memory for the bio vecs as far as
+ * possible so that we can start filling biovecs from the beginning
+ * without overwriting the temporary page array.
+ */
+ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
+ pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
+
+ size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ if (unlikely(size <= 0))
+ return size ? size : -EFAULT;
+
+ for (left = size, i = 0; left > 0; left -= len, i++) {
+ struct page *page = pages[i];
+ bool same_page = false;
+
+ len = min_t(size_t, PAGE_SIZE - offset, left);
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ max_append_sectors, &same_page) != len)
+ return -EINVAL;
+ if (same_page)
+ put_page(page);
+ offset = 0;
+ }
+
+ iov_iter_advance(iter, size);
+ return 0;
+}
+
/**
* bio_iov_iter_get_pages - add user or kernel pages to a bio
* @bio: bio to add pages to
@@ -962,19 +1143,23 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
return -EINVAL;
do {
- if (is_bvec)
- ret = __bio_iov_bvec_add_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ if (WARN_ON_ONCE(is_bvec))
+ return -EINVAL;
+ ret = __bio_iov_append_get_pages(bio, iter);
+ } else {
+ if (is_bvec)
+ ret = __bio_iov_bvec_add_pages(bio, iter);
+ else
+ ret = __bio_iov_iter_get_pages(bio, iter);
+ }
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- if (iov_iter_bvec_no_ref(iter))
+ if (is_bvec)
bio_set_flag(bio, BIO_NO_PAGE_REF);
- else if (is_bvec)
- bio_get_pages(bio);
-
return bio->bi_vcnt ? 0 : ret;
}
+EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
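
A hedged sketch of a direct-I/O style caller now that the symbol is exported; bio allocation, submission and iteration over a large iov_iter are assumed to happen elsewhere:

	static int example_fill_bio_from_iter(struct bio *bio, struct iov_iter *iter)
	{
		int ret = bio_iov_iter_get_pages(bio, iter);

		if (ret)
			return ret;	/* nothing could be added to the bio */

		/* the bio now holds page references (or BIO_NO_PAGE_REF is set) */
		return 0;
	}
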
static void submit_bio_wait_endio(struct bio *bio)
{
@@ -995,12 +1180,21 @@ static void submit_bio_wait_endio(struct bio *bio)
int submit_bio_wait(struct bio *bio)
{
DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
+ unsigned long hang_check;
bio->bi_private = &done;
bio->bi_end_io = submit_bio_wait_endio;
bio->bi_opf |= REQ_SYNC;
submit_bio(bio);
- wait_for_completion_io(&done);
+
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ hang_check = sysctl_hung_task_timeout_secs;
+ if (hang_check)
+ while (!wait_for_completion_io_timeout(&done,
+ hang_check * (HZ/2)))
+ ;
+ else
+ wait_for_completion_io(&done);
return blk_status_to_errno(bio->bi_status);
}
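
A hedged caller-side sketch of submit_bio_wait(): synchronously reading one page with an on-stack bio. The block_device, sector and page are assumptions, and the call sleeps, so it must not be made from atomic context:

	static int example_sync_read_page(struct block_device *bdev, sector_t sector,
					  struct page *page)
	{
		struct bio bio;
		struct bio_vec bvec;

		bio_init(&bio, &bvec, 1);
		bio_set_dev(&bio, bdev);
		bio.bi_iter.bi_sector = sector;
		bio.bi_opf = REQ_OP_READ;
		bio_add_page(&bio, page, PAGE_SIZE, 0);

		return submit_bio_wait(&bio);	/* blocks until bi_end_io fires */
	}
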
@@ -1022,6 +1216,7 @@ void bio_advance(struct bio *bio, unsigned bytes)
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
+ bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
EXPORT_SYMBOL(bio_advance);
@@ -1111,91 +1306,6 @@ void bio_list_copy_data(struct bio *dst, struct bio *src)
}
EXPORT_SYMBOL(bio_list_copy_data);
-struct bio_map_data {
- int is_our_pages;
- struct iov_iter iter;
- struct iovec iov[];
-};
-
-static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
- gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
- if (data->nr_segs > UIO_MAXIOV)
- return NULL;
-
- bmd = kmalloc(sizeof(struct bio_map_data) +
- sizeof(struct iovec) * data->nr_segs, gfp_mask);
- if (!bmd)
- return NULL;
- memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
- bmd->iter = *data;
- bmd->iter.iov = bmd->iov;
- return bmd;
-}
-
-/**
- * bio_copy_from_iter - copy all pages from iov_iter to bio
- * @bio: The &struct bio which describes the I/O as destination
- * @iter: iov_iter as source
- *
- * Copy all pages from iov_iter to bio.
- * Returns 0 on success, or error on failure.
- */
-static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
-{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- ssize_t ret;
-
- ret = copy_page_from_iter(bvec->bv_page,
- bvec->bv_offset,
- bvec->bv_len,
- iter);
-
- if (!iov_iter_count(iter))
- break;
-
- if (ret < bvec->bv_len)
- return -EFAULT;
- }
-
- return 0;
-}
-
-/**
- * bio_copy_to_iter - copy all pages from bio to iov_iter
- * @bio: The &struct bio which describes the I/O as source
- * @iter: iov_iter as destination
- *
- * Copy all pages from bio to iov_iter.
- * Returns 0 on success, or error on failure.
- */
-static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
-{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- ssize_t ret;
-
- ret = copy_page_to_iter(bvec->bv_page,
- bvec->bv_offset,
- bvec->bv_len,
- &iter);
-
- if (!iov_iter_count(&iter))
- break;
-
- if (ret < bvec->bv_len)
- return -EFAULT;
- }
-
- return 0;
-}
-
void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
@@ -1206,423 +1316,6 @@ void bio_free_pages(struct bio *bio)
}
EXPORT_SYMBOL(bio_free_pages);
-/**
- * bio_uncopy_user - finish previously mapped bio
- * @bio: bio being terminated
- *
- * Free pages allocated from bio_copy_user_iov() and write back data
- * to user space in case of a read.
- */
-int bio_uncopy_user(struct bio *bio)
-{
- struct bio_map_data *bmd = bio->bi_private;
- int ret = 0;
-
- if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
- /*
- * if we're in a workqueue, the request is orphaned, so
- * don't copy into a random user address space, just free
- * and return -EINTR so user space doesn't expect any data.
- */
- if (!current->mm)
- ret = -EINTR;
- else if (bio_data_dir(bio) == READ)
- ret = bio_copy_to_iter(bio, bmd->iter);
- if (bmd->is_our_pages)
- bio_free_pages(bio);
- }
- kfree(bmd);
- bio_put(bio);
- return ret;
-}
-
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data,
- struct iov_iter *iter,
- gfp_t gfp_mask)
-{
- struct bio_map_data *bmd;
- struct page *page;
- struct bio *bio;
- int i = 0, ret;
- int nr_pages;
- unsigned int len = iter->count;
- unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
-
- bmd = bio_alloc_map_data(iter, gfp_mask);
- if (!bmd)
- return ERR_PTR(-ENOMEM);
-
- /*
- * We need to do a deep copy of the iov_iter including the iovecs.
- * The caller provided iov might point to an on-stack or otherwise
- * shortlived one.
- */
- bmd->is_our_pages = map_data ? 0 : 1;
-
- nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- if (nr_pages > BIO_MAX_PAGES)
- nr_pages = BIO_MAX_PAGES;
-
- ret = -ENOMEM;
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- goto out_bmd;
-
- ret = 0;
-
- if (map_data) {
- nr_pages = 1 << map_data->page_order;
- i = map_data->offset / PAGE_SIZE;
- }
- while (len) {
- unsigned int bytes = PAGE_SIZE;
-
- bytes -= offset;
-
- if (bytes > len)
- bytes = len;
-
- if (map_data) {
- if (i == map_data->nr_entries * nr_pages) {
- ret = -ENOMEM;
- break;
- }
-
- page = map_data->pages[i / nr_pages];
- page += (i % nr_pages);
-
- i++;
- } else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page) {
- ret = -ENOMEM;
- break;
- }
- }
-
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
- if (!map_data)
- __free_page(page);
- break;
- }
-
- len -= bytes;
- offset = 0;
- }
-
- if (ret)
- goto cleanup;
-
- if (map_data)
- map_data->offset += bio->bi_iter.bi_size;
-
- /*
- * success
- */
- if ((iov_iter_rw(iter) == WRITE && (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
- ret = bio_copy_from_iter(bio, iter);
- if (ret)
- goto cleanup;
- } else {
- if (bmd->is_our_pages)
- zero_fill_bio(bio);
- iov_iter_advance(iter, bio->bi_iter.bi_size);
- }
-
- bio->bi_private = bmd;
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
- return bio;
-cleanup:
- if (!map_data)
- bio_free_pages(bio);
- bio_put(bio);
-out_bmd:
- kfree(bmd);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_map_user_iov - map user iovec into bio
- * @q: the struct request_queue for the bio
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_user_iov(struct request_queue *q,
- struct iov_iter *iter,
- gfp_t gfp_mask)
-{
- int j;
- struct bio *bio;
- int ret;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- if (!iov_iter_count(iter))
- return ERR_PTR(-EINVAL);
-
- bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- while (iov_iter_count(iter)) {
- struct page **pages;
- ssize_t bytes;
- size_t offs, added = 0;
- int npages;
-
- bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
- if (unlikely(bytes <= 0)) {
- ret = bytes ? bytes : -EFAULT;
- goto out_unmap;
- }
-
- npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
-
- if (unlikely(offs & queue_dma_alignment(q))) {
- ret = -EINVAL;
- j = 0;
- } else {
- for (j = 0; j < npages; j++) {
- struct page *page = pages[j];
- unsigned int n = PAGE_SIZE - offs;
-
- if (n > bytes)
- n = bytes;
-
- if (!__bio_add_pc_page(q, bio, page, n, offs,
- true))
- break;
-
- added += n;
- bytes -= n;
- offs = 0;
- }
- iov_iter_advance(iter, added);
- }
- /*
- * release the pages we didn't map into the bio, if any
- */
- while (j < npages)
- put_page(pages[j++]);
- kvfree(pages);
- /* couldn't stuff something into bio? */
- if (bytes)
- break;
- }
-
- bio_set_flag(bio, BIO_USER_MAPPED);
-
- /*
- * subtle -- if bio_map_user_iov() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
- */
- bio_get(bio);
- return bio;
-
- out_unmap:
- bio_for_each_segment_all(bvec, bio, iter_all) {
- put_page(bvec->bv_page);
- }
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-static void __bio_unmap_user(struct bio *bio)
-{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * make sure we dirty pages we wrote to
- */
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (bio_data_dir(bio) == READ)
- set_page_dirty_lock(bvec->bv_page);
-
- put_page(bvec->bv_page);
- }
-
- bio_put(bio);
-}
-
-/**
- * bio_unmap_user - unmap a bio
- * @bio: the bio being unmapped
- *
- * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
- * process context.
- *
- * bio_unmap_user() may sleep.
- */
-void bio_unmap_user(struct bio *bio)
-{
- __bio_unmap_user(bio);
- bio_put(bio);
-}
-
-static void bio_map_kern_endio(struct bio *bio)
-{
- bio_put(bio);
-}
-
-/**
- * bio_map_kern - map kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to map
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio allocation
- *
- * Map the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- const int nr_pages = end - start;
- int offset, i;
- struct bio *bio;
-
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- offset = offset_in_page(kaddr);
- for (i = 0; i < nr_pages; i++) {
- unsigned int bytes = PAGE_SIZE - offset;
-
- if (len <= 0)
- break;
-
- if (bytes > len)
- bytes = len;
-
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
- offset) < bytes) {
- /* we don't support partial mappings */
- bio_put(bio);
- return ERR_PTR(-EINVAL);
- }
-
- data += bytes;
- len -= bytes;
- offset = 0;
- }
-
- bio->bi_end_io = bio_map_kern_endio;
- return bio;
-}
-EXPORT_SYMBOL(bio_map_kern);
-
-static void bio_copy_kern_endio(struct bio *bio)
-{
- bio_free_pages(bio);
- bio_put(bio);
-}
-
-static void bio_copy_kern_endio_read(struct bio *bio)
-{
- char *p = bio->bi_private;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- bio_for_each_segment_all(bvec, bio, iter_all) {
- memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
- p += bvec->bv_len;
- }
-
- bio_copy_kern_endio(bio);
-}
-
-/**
- * bio_copy_kern - copy kernel address into bio
- * @q: the struct request_queue for the bio
- * @data: pointer to buffer to copy
- * @len: length in bytes
- * @gfp_mask: allocation flags for bio and page allocation
- * @reading: data direction is READ
- *
- * copy the kernel address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-struct bio *bio_copy_kern(struct request_queue *q, void *data, unsigned int len,
- gfp_t gfp_mask, int reading)
-{
- unsigned long kaddr = (unsigned long)data;
- unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
- unsigned long start = kaddr >> PAGE_SHIFT;
- struct bio *bio;
- void *p = data;
- int nr_pages = 0;
-
- /*
- * Overflow, abort
- */
- if (end < start)
- return ERR_PTR(-EINVAL);
-
- nr_pages = end - start;
- bio = bio_kmalloc(gfp_mask, nr_pages);
- if (!bio)
- return ERR_PTR(-ENOMEM);
-
- while (len) {
- struct page *page;
- unsigned int bytes = PAGE_SIZE;
-
- if (bytes > len)
- bytes = len;
-
- page = alloc_page(q->bounce_gfp | gfp_mask);
- if (!page)
- goto cleanup;
-
- if (!reading)
- memcpy(page_address(page), p, bytes);
-
- if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
- break;
-
- len -= bytes;
- p += bytes;
- }
-
- if (reading) {
- bio->bi_end_io = bio_copy_kern_endio_read;
- bio->bi_private = data;
- } else {
- bio->bi_end_io = bio_copy_kern_endio;
- }
-
- return bio;
-
-cleanup:
- bio_free_pages(bio);
- bio_put(bio);
- return ERR_PTR(-ENOMEM);
-}
-
/*
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
* for performing direct-IO in BIOs.
@@ -1695,9 +1388,7 @@ static void bio_dirty_fn(struct work_struct *work)
while ((bio = next) != NULL) {
next = bio->bi_private;
- bio_set_pages_dirty(bio);
- if (!bio_flagged(bio, BIO_NO_PAGE_REF))
- bio_release_pages(bio);
+ bio_release_pages(bio, true);
bio_put(bio);
}
}
@@ -1713,8 +1404,7 @@ void bio_check_pages_dirty(struct bio *bio)
goto defer;
}
- if (!bio_flagged(bio, BIO_NO_PAGE_REF))
- bio_release_pages(bio);
+ bio_release_pages(bio, false);
bio_put(bio);
return;
defer:
@@ -1725,68 +1415,6 @@ defer:
schedule_work(&bio_dirty_work);
}
-void update_io_ticks(struct hd_struct *part, unsigned long now)
-{
- unsigned long stamp;
-again:
- stamp = READ_ONCE(part->stamp);
- if (unlikely(stamp != now)) {
- if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) {
- __part_stat_add(part, io_ticks, 1);
- }
- }
- if (part->partno) {
- part = &part_to_disk(part)->part0;
- goto again;
- }
-}
-
-void generic_start_io_acct(struct request_queue *q, int op,
- unsigned long sectors, struct hd_struct *part)
-{
- const int sgrp = op_stat_group(op);
-
- part_stat_lock();
-
- update_io_ticks(part, jiffies);
- part_stat_inc(part, ios[sgrp]);
- part_stat_add(part, sectors[sgrp], sectors);
- part_inc_in_flight(q, part, op_is_write(op));
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_start_io_acct);
-
-void generic_end_io_acct(struct request_queue *q, int req_op,
- struct hd_struct *part, unsigned long start_time)
-{
- unsigned long now = jiffies;
- unsigned long duration = now - start_time;
- const int sgrp = op_stat_group(req_op);
-
- part_stat_lock();
-
- update_io_ticks(part, now);
- part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_stat_add(part, time_in_queue, duration);
- part_dec_in_flight(q, part, op_is_write(req_op));
-
- part_stat_unlock();
-}
-EXPORT_SYMBOL(generic_end_io_acct);
-
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
- struct bio_vec bvec;
- struct bvec_iter iter;
-
- bio_for_each_segment(bvec, bi, iter)
- flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1845,8 +1473,7 @@ again:
}
if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_complete(bio->bi_disk->queue, bio,
- blk_status_to_errno(bio->bi_status));
+ trace_block_bio_complete(bio->bi_disk->queue, bio);
bio_clear_flag(bio, BIO_TRACE_COMPLETION);
}
@@ -1869,8 +1496,8 @@ EXPORT_SYMBOL(bio_endio);
* @bio, and updates @bio to represent the remaining sectors.
*
* Unless this is a discard request the newly allocated bio will point
- * to @bio's bi_io_vec; it is the caller's responsibility to ensure that
- * @bio is not freed before the split.
+ * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
+ * neither @bio nor @bs are freed before the split bio.
*/
struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
@@ -1880,6 +1507,10 @@ struct bio *bio_split(struct bio *bio, int sectors,
BUG_ON(sectors <= 0);
BUG_ON(sectors >= bio_sectors(bio));
+ /* Zone append commands cannot be split */
+ if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
+ return NULL;
+
split = bio_clone_fast(bio, gfp, bs);
if (!split)
return NULL;
@@ -1914,10 +1545,7 @@ void bio_trim(struct bio *bio, int offset, int size)
if (offset == 0 && size == bio->bi_iter.bi_size)
return;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
bio_advance(bio, offset << 9);
-
bio->bi_iter.bi_size = size;
if (bio_integrity(bio))
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
new file mode 100644
index 000000000000..85d5790ac49b
--- /dev/null
+++ b/block/blk-cgroup-rwstat.c
@@ -0,0 +1,129 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#include "blk-cgroup-rwstat.h"
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
+{
+ int i, ret;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
+ if (ret) {
+ while (--i >= 0)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+ return ret;
+ }
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+ return 0;
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_init);
+
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ percpu_counter_destroy(&rwstat->cpu_cnt[i]);
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_exit);
+
+/**
+ * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @rwstat: rwstat to print
+ *
+ * Print @rwstat to @sf for the device associated with @pd.
+ */
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat)
+{
+ static const char *rwstr[] = {
+ [BLKG_RWSTAT_READ] = "Read",
+ [BLKG_RWSTAT_WRITE] = "Write",
+ [BLKG_RWSTAT_SYNC] = "Sync",
+ [BLKG_RWSTAT_ASYNC] = "Async",
+ [BLKG_RWSTAT_DISCARD] = "Discard",
+ };
+ const char *dname = blkg_dev_name(pd->blkg);
+ u64 v;
+ int i;
+
+ if (!dname)
+ return 0;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
+ rwstat->cnt[i]);
+
+ v = rwstat->cnt[BLKG_RWSTAT_READ] +
+ rwstat->cnt[BLKG_RWSTAT_WRITE] +
+ rwstat->cnt[BLKG_RWSTAT_DISCARD];
+ seq_printf(sf, "%s Total %llu\n", dname, v);
+ return v;
+}
+EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
+
+/**
+ * blkg_prfill_rwstat - prfill callback for blkg_rwstat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the blkg_rwstat in @pd
+ *
+ * prfill callback for printing a blkg_rwstat.
+ */
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ struct blkg_rwstat_sample rwstat = { };
+
+ blkg_rwstat_read((void *)pd + off, &rwstat);
+ return __blkg_prfill_rwstat(sf, pd, &rwstat);
+}
+EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
+
+/**
+ * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
+ * @blkg: blkg of interest
+ * @pol: blkcg_policy which contains the blkg_rwstat
+ * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * @sum: blkg_rwstat_sample structure containing the results
+ *
+ * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
+ * online descendants and their aux counts. The caller must be holding the
+ * queue lock for online tests.
+ *
+ * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
+ * is at @off bytes into @blkg's blkg_policy_data of the policy.
+ */
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum)
+{
+ struct blkcg_gq *pos_blkg;
+ struct cgroup_subsys_state *pos_css;
+ unsigned int i;
+
+ lockdep_assert_held(&blkg->q->queue_lock);
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct blkg_rwstat *rwstat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ if (pol)
+ rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
+ else
+ rwstat = (void *)pos_blkg + off;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
+ }
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
new file mode 100644
index 000000000000..ee746919c41f
--- /dev/null
+++ b/block/blk-cgroup-rwstat.h
@@ -0,0 +1,149 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * Legacy blkg rwstat helpers enabled by CONFIG_BLK_CGROUP_RWSTAT.
+ * Do not use in new code.
+ */
+#ifndef _BLK_CGROUP_RWSTAT_H
+#define _BLK_CGROUP_RWSTAT_H
+
+#include <linux/blk-cgroup.h>
+
+enum blkg_rwstat_type {
+ BLKG_RWSTAT_READ,
+ BLKG_RWSTAT_WRITE,
+ BLKG_RWSTAT_SYNC,
+ BLKG_RWSTAT_ASYNC,
+ BLKG_RWSTAT_DISCARD,
+
+ BLKG_RWSTAT_NR,
+ BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
+};
+
+/*
+ * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
+ * recursive. Used to carry stats of dead children.
+ */
+struct blkg_rwstat {
+ struct percpu_counter cpu_cnt[BLKG_RWSTAT_NR];
+ atomic64_t aux_cnt[BLKG_RWSTAT_NR];
+};
+
+struct blkg_rwstat_sample {
+ u64 cnt[BLKG_RWSTAT_NR];
+};
+
+static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
+ unsigned int idx)
+{
+ return atomic64_read(&rwstat->aux_cnt[idx]) +
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
+}
+
+int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp);
+void blkg_rwstat_exit(struct blkg_rwstat *rwstat);
+u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ const struct blkg_rwstat_sample *rwstat);
+u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off);
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum);
+
+
+/**
+ * blkg_rwstat_add - add a value to a blkg_rwstat
+ * @rwstat: target blkg_rwstat
+ * @op: REQ_OP and flags
+ * @val: value to add
+ *
+ * Add @val to @rwstat. The counters are chosen according to @op. The
+ * caller is responsible for synchronizing calls to this function.
+ */
+static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
+ unsigned int op, uint64_t val)
+{
+ struct percpu_counter *cnt;
+
+ if (op_is_discard(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
+ else if (op_is_write(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+
+ if (op_is_sync(op))
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
+ else
+ cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
+
+ percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
+}
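
A hedged example of the accounting call documented above, charging one bio to a pair of legacy rwstat counters; example_stats is a hypothetical per-blkg structure a CONFIG_BLK_CGROUP_RWSTAT policy might keep:

	struct example_stats {
		struct blkg_rwstat	bytes;
		struct blkg_rwstat	ios;
	};

	static void example_account_bio(struct example_stats *stats, struct bio *bio)
	{
		/* counters are picked from bi_opf: read/write/discard plus sync/async */
		blkg_rwstat_add(&stats->bytes, bio->bi_opf, bio->bi_iter.bi_size);
		blkg_rwstat_add(&stats->ios, bio->bi_opf, 1);
	}
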
+
+/**
+ * blkg_rwstat_read - read the current values of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ * @result: where to store the current snapshot
+ *
+ * Read the current snapshot of @rwstat into @result.
+ */
+static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
+ struct blkg_rwstat_sample *result)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ result->cnt[i] =
+ percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
+}
+
+/**
+ * blkg_rwstat_total - read the total count of a blkg_rwstat
+ * @rwstat: blkg_rwstat to read
+ *
+ * Return the total count of @rwstat regardless of the IO direction. This
+ * function can be called without synchronization and takes care of u64
+ * atomicity.
+ */
+static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
+{
+ struct blkg_rwstat_sample tmp = { };
+
+ blkg_rwstat_read(rwstat, &tmp);
+ return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
+}
+
+/**
+ * blkg_rwstat_reset - reset a blkg_rwstat
+ * @rwstat: blkg_rwstat to reset
+ */
+static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
+{
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++) {
+ percpu_counter_set(&rwstat->cpu_cnt[i], 0);
+ atomic64_set(&rwstat->aux_cnt[i], 0);
+ }
+}
+
+/**
+ * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
+ * @to: the destination blkg_rwstat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
+ struct blkg_rwstat *from)
+{
+ u64 sum[BLKG_RWSTAT_NR];
+ int i;
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
+
+ for (i = 0; i < BLKG_RWSTAT_NR; i++)
+ atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
+ &to->aux_cnt[i]);
+}
+#endif /* _BLK_CGROUP_RWSTAT_H */
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1f7127b03490..0ecc897b225c 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
-static bool blkcg_debug_stats = false;
+bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -77,8 +80,8 @@ static void blkg_free(struct blkcg_gq *blkg)
if (blkg->pd[i])
blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
- blkg_rwstat_exit(&blkg->stat_ios);
- blkg_rwstat_exit(&blkg->stat_bytes);
+ free_percpu(blkg->iostat_cpu);
+ percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
}
@@ -86,7 +89,7 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
- percpu_ref_exit(&blkg->refcnt);
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -113,6 +116,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ async_bio_work);
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ /* as long as there are pending bios, @blkg can't go away */
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_merge(&bios, &blkg->async_bios);
+ bio_list_init(&blkg->async_bios);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ while ((bio = bio_list_pop(&bios)))
+ submit_bio(bio);
+}
+
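
The worker above drains bios that were punted to the blkg; a hedged sketch of the matching producer side, mirroring the lock/list/queue_work pattern (the helper name is hypothetical, the fields are the ones initialized in blkg_alloc() below):

	static void example_punt_bio(struct blkcg_gq *blkg, struct bio *bio)
	{
		spin_lock_bh(&blkg->async_bio_lock);
		bio_list_add(&blkg->async_bios, bio);
		spin_unlock_bh(&blkg->async_bio_lock);

		queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
	}
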
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -125,21 +145,31 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
- int i;
+ int i, cpu;
/* alloc and init base part */
blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
if (!blkg)
return NULL;
- if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
- blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
+ if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
+ goto err_free;
+
+ blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
+ if (!blkg->iostat_cpu)
goto err_free;
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
+ spin_lock_init(&blkg->async_bio_lock);
+ bio_list_init(&blkg->async_bios);
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
+ u64_stats_init(&blkg->iostat.sync);
+ for_each_possible_cpu(cpu)
+ u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
@@ -148,7 +178,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
continue;
/* alloc per-policy data and attach it to blkg */
- pd = pol->pd_alloc_fn(gfp_mask, q->node);
+ pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
if (!pd)
goto err_free;
@@ -244,11 +274,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_get(blkg->parent);
}
- ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
- GFP_NOWAIT | __GFP_NOWARN);
- if (ret)
- goto err_cancel_ref;
-
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -281,8 +306,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_cancel_ref:
- percpu_ref_exit(&blkg->refcnt);
err_put_congested:
wb_congested_put(wb_congested);
err_put_css:
@@ -373,7 +396,6 @@ struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
static void blkg_destroy(struct blkcg_gq *blkg)
{
struct blkcg *blkcg = blkg->blkcg;
- struct blkcg_gq *parent = blkg->parent;
int i;
lockdep_assert_held(&blkg->q->queue_lock);
@@ -390,11 +412,6 @@ static void blkg_destroy(struct blkcg_gq *blkg)
pol->pd_offline_fn(blkg->pd[i]);
}
- if (parent) {
- blkg_rwstat_add_aux(&parent->stat_bytes, &blkg->stat_bytes);
- blkg_rwstat_add_aux(&parent->stat_ios, &blkg->stat_ios);
- }
-
blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
@@ -444,7 +461,7 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
{
struct blkcg *blkcg = css_to_blkcg(css);
struct blkcg_gq *blkg;
- int i;
+ int i, cpu;
mutex_lock(&blkcg_pol_mutex);
spin_lock_irq(&blkcg->lock);
@@ -455,8 +472,12 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
* anyway. If you get hit by a race, retry.
*/
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
- blkg_rwstat_reset(&blkg->stat_bytes);
- blkg_rwstat_reset(&blkg->stat_ios);
+ for_each_possible_cpu(cpu) {
+ struct blkg_iostat_set *bis =
+ per_cpu_ptr(blkg->iostat_cpu, cpu);
+ memset(bis, 0, sizeof(*bis));
+ }
+ memset(&blkg->iostat, 0, sizeof(blkg->iostat));
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -475,7 +496,7 @@ const char *blkg_dev_name(struct blkcg_gq *blkg)
{
/* some drivers (floppy) instantiate a queue w/o disk registered */
if (blkg->q->backing_dev_info->dev)
- return dev_name(blkg->q->backing_dev_info->dev);
+ return bdi_dev_name(blkg->q->backing_dev_info);
return NULL;
}
@@ -540,254 +561,55 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
-/**
- * __blkg_prfill_rwstat - prfill helper for a blkg_rwstat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @rwstat: rwstat to print
- *
- * Print @rwstat to @sf for the device assocaited with @pd.
- */
-u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- const struct blkg_rwstat *rwstat)
-{
- static const char *rwstr[] = {
- [BLKG_RWSTAT_READ] = "Read",
- [BLKG_RWSTAT_WRITE] = "Write",
- [BLKG_RWSTAT_SYNC] = "Sync",
- [BLKG_RWSTAT_ASYNC] = "Async",
- [BLKG_RWSTAT_DISCARD] = "Discard",
- };
- const char *dname = blkg_dev_name(pd->blkg);
- u64 v;
- int i;
-
- if (!dname)
- return 0;
-
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
- (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
-
- v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
- seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
- return v;
-}
-EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
-
-/**
- * blkg_prfill_stat - prfill callback for blkg_stat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
- *
- * prfill callback for printing a blkg_stat.
- */
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
-{
- return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
-}
-EXPORT_SYMBOL_GPL(blkg_prfill_stat);
-
-/**
- * blkg_prfill_rwstat - prfill callback for blkg_rwstat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @off: offset to the blkg_rwstat in @pd
- *
- * prfill callback for printing a blkg_rwstat.
- */
-u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
-
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
-
-static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
- struct blkg_policy_data *pd, int off)
-{
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
-
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-/**
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_bytes
- * @sf: seq_file to print to
- * @v: unused
- *
- * To be used as cftype->seq_show to print blkg->stat_bytes.
- * cftype->private must be set to the blkcg_policy.
- */
-int blkg_print_stat_bytes(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_bytes), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes);
-
-/**
- * blkg_print_stat_bytes - seq_show callback for blkg->stat_ios
- * @sf: seq_file to print to
- * @v: unused
- *
- * To be used as cftype->seq_show to print blkg->stat_ios. cftype->private
- * must be set to the blkcg_policy.
- */
-int blkg_print_stat_ios(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field, (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_ios), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios);
-
-static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
- struct blkg_policy_data *pd,
- int off)
-{
- struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
- NULL, off);
- return __blkg_prfill_rwstat(sf, pd, &rwstat);
-}
-
-/**
- * blkg_print_stat_bytes_recursive - recursive version of blkg_print_stat_bytes
- * @sf: seq_file to print to
- * @v: unused
- */
-int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field_recursive,
- (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_bytes), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_bytes_recursive);
-
-/**
- * blkg_print_stat_ios_recursive - recursive version of blkg_print_stat_ios
- * @sf: seq_file to print to
- * @v: unused
- */
-int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
-{
- blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
- blkg_prfill_rwstat_field_recursive,
- (void *)seq_cft(sf)->private,
- offsetof(struct blkcg_gq, stat_ios), true);
- return 0;
-}
-EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
-
-/**
- * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @blkg: blkg of interest
- * @pol: blkcg_policy which contains the blkg_stat
- * @off: offset to the blkg_stat in blkg_policy_data or @blkg
- *
- * Collect the blkg_stat specified by @blkg, @pol and @off and all its
- * online descendants and their aux counts. The caller must be holding the
- * queue lock for online tests.
- *
- * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
- * at @off bytes into @blkg's blkg_policy_data of the policy.
- */
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
- struct blkcg_policy *pol, int off)
+/* Performs queue bypass and policy enabled checks then looks up blkg. */
+static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
+ const struct blkcg_policy *pol,
+ struct request_queue *q)
{
- struct blkcg_gq *pos_blkg;
- struct cgroup_subsys_state *pos_css;
- u64 sum = 0;
-
- lockdep_assert_held(&blkg->q->queue_lock);
-
- rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
- struct blkg_stat *stat;
-
- if (!pos_blkg->online)
- continue;
-
- if (pol)
- stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
- else
- stat = (void *)blkg + off;
-
- sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
- }
- rcu_read_unlock();
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ lockdep_assert_held(&q->queue_lock);
- return sum;
+ if (!blkcg_policy_enabled(q, pol))
+ return ERR_PTR(-EOPNOTSUPP);
+ return __blkg_lookup(blkcg, q, true /* update_hint */);
}
-EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
/**
- * blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
- * @blkg: blkg of interest
- * @pol: blkcg_policy which contains the blkg_rwstat
- * @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * blkg_conf_prep - parse and prepare for per-blkg config update
+ * @inputp: input string pointer
*
- * Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
- * online descendants and their aux counts. The caller must be holding the
- * queue lock for online tests.
+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
+ * from @input and get and return the matching gendisk. *@inputp is
+ * updated to point past the device node prefix. Returns an ERR_PTR()
+ * value on error.
*
- * If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
- * is at @off bytes into @blkg's blkg_policy_data of the policy.
+ * Use this function only if blkg_conf_prep() can't be used for some reason.
*/
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
- struct blkcg_policy *pol, int off)
+struct gendisk *blkcg_conf_get_disk(char **inputp)
{
- struct blkcg_gq *pos_blkg;
- struct cgroup_subsys_state *pos_css;
- struct blkg_rwstat sum = { };
- int i;
-
- lockdep_assert_held(&blkg->q->queue_lock);
-
- rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
- struct blkg_rwstat *rwstat;
+ char *input = *inputp;
+ unsigned int major, minor;
+ struct gendisk *disk;
+ int key_len, part;
- if (!pos_blkg->online)
- continue;
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
+ return ERR_PTR(-EINVAL);
- if (pol)
- rwstat = (void *)blkg_to_pd(pos_blkg, pol) + off;
- else
- rwstat = (void *)pos_blkg + off;
+ input += key_len;
+ if (!isspace(*input))
+ return ERR_PTR(-EINVAL);
+ input = skip_spaces(input);
- for (i = 0; i < BLKG_RWSTAT_NR; i++)
- atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
- percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
- &sum.aux_cnt[i]);
+ disk = get_gendisk(MKDEV(major, minor), &part);
+ if (!disk)
+ return ERR_PTR(-ENODEV);
+ if (part) {
+ put_disk_and_module(disk);
+ return ERR_PTR(-ENODEV);
}
- rcu_read_unlock();
-
- return sum;
-}
-EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
-
-/* Performs queue bypass and policy enabled checks then looks up blkg. */
-static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
- const struct blkcg_policy *pol,
- struct request_queue *q)
-{
- WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
- if (!blkcg_policy_enabled(q, pol))
- return ERR_PTR(-EOPNOTSUPP);
- return __blkg_lookup(blkcg, q, true /* update_hint */);
+ *inputp = input;
+ return disk;
}
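
A hedged sketch of how a policy's config handler might use the helper above to strip the MAJ:MIN prefix before parsing its own body; the input format and the u64 body are illustrative:

	/* e.g. input == "8:16 1000": the helper consumes "8:16" and returns the disk */
	static int example_parse_conf(char *input)
	{
		struct gendisk *disk;
		u64 limit;

		disk = blkcg_conf_get_disk(&input);
		if (IS_ERR(disk))
			return PTR_ERR(disk);

		if (kstrtou64(input, 10, &limit))	/* policy-specific body */
			limit = 0;

		put_disk_and_module(disk);	/* drop the reference the lookup took */
		return 0;
	}
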
/**
@@ -809,25 +631,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
- unsigned int major, minor;
- int key_len, part, ret;
- char *body;
-
- if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
- return -EINVAL;
-
- body = input + key_len;
- if (!isspace(*body))
- return -EINVAL;
- body = skip_spaces(body);
+ int ret;
- disk = get_gendisk(MKDEV(major, minor), &part);
- if (!disk)
- return -ENODEV;
- if (part) {
- ret = -ENODEV;
- goto fail;
- }
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
q = disk->queue;
@@ -893,7 +701,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
success:
ctx->disk = disk;
ctx->blkg = blkg;
- ctx->body = body;
+ ctx->body = input;
return 0;
fail_unlock:
@@ -913,6 +721,7 @@ fail:
}
return ret;
}
+EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
@@ -928,26 +737,34 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
rcu_read_unlock();
put_disk_and_module(ctx->disk);
}
+EXPORT_SYMBOL_GPL(blkg_conf_finish);
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct blkcg_gq *blkg;
+ cgroup_rstat_flush(blkcg->css.cgroup);
rcu_read_lock();
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct blkg_iostat_set *bis = &blkg->iostat;
const char *dname;
char *buf;
- struct blkg_rwstat rwstat;
u64 rbytes, wbytes, rios, wios, dbytes, dios;
size_t size = seq_get_buf(sf, &buf), off = 0;
int i;
bool has_stats = false;
+ unsigned seq;
+
+ spin_lock_irq(&blkg->q->queue_lock);
+
+ if (!blkg->online)
+ goto skip;
dname = blkg_dev_name(blkg);
if (!dname)
- continue;
+ goto skip;
/*
* Hooray string manipulation, count is the size written NOT
@@ -957,21 +774,16 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
*/
off += scnprintf(buf+off, size-off, "%s ", dname);
- spin_lock_irq(&blkg->q->queue_lock);
-
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
- wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
- dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
+ do {
+ seq = u64_stats_fetch_begin(&bis->sync);
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
- offsetof(struct blkcg_gq, stat_ios));
- rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
- wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
- dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
-
- spin_unlock_irq(&blkg->q->queue_lock);
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
if (rbytes || wbytes || rios || wios) {
has_stats = true;
@@ -981,10 +793,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
dbytes, dios);
}
- if (!blkcg_debug_stats)
- goto next;
-
- if (atomic_read(&blkg->use_delay)) {
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
has_stats = true;
off += scnprintf(buf+off, size-off,
" use_delay=%d delay_nsec=%llu",
@@ -1004,11 +813,17 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
has_stats = true;
off += written;
}
-next:
+
if (has_stats) {
- off += scnprintf(buf+off, size-off, "\n");
- seq_commit(sf, off);
+ if (off < size - 1) {
+ off += scnprintf(buf+off, size-off, "\n");
+ seq_commit(sf, off);
+ } else {
+ seq_commit(sf, -1);
+ }
}
+ skip:
+ spin_unlock_irq(&blkg->q->queue_lock);
}
rcu_read_unlock();
@@ -1068,8 +883,8 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
/* this prevents anyone from attaching or migrating to this blkcg */
wb_blkcg_offline(blkcg);
- /* put the base cgwb reference allowing step 2 to be triggered */
- blkcg_cgwb_put(blkcg);
+ /* put the base online pin allowing step 2 to be triggered */
+ blkcg_unpin_online(blkcg);
}
/**
@@ -1168,11 +983,11 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
}
spin_lock_init(&blkcg->lock);
+ refcount_set(&blkcg->online_pin, 1);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
- refcount_set(&blkcg->cgwb_refcnt, 1);
#endif
list_add_tail(&blkcg->all_blkcgs_node, &all_blkcgs);
@@ -1191,11 +1006,26 @@ unlock:
return ret;
}
+static int blkcg_css_online(struct cgroup_subsys_state *css)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg *parent = blkcg_parent(blkcg);
+
+ /*
+ * blkcg_pin_online() is used to delay blkcg offline so that blkgs
+ * don't go offline while cgwbs are still active on them. Pin the
+ * parent so that offline always happens towards the root.
+ */
+ if (parent)
+ blkcg_pin_online(parent);
+ return 0;
+}
+
/**
* blkcg_init_queue - initialize blkcg part of request queue
* @q: request_queue to initialize
*
- * Called from blk_alloc_queue_node(). Responsible for initializing blkcg
+ * Called from __blk_alloc_queue(). Responsible for initializing blkcg
* part of new request_queue @q.
*
* RETURNS:
@@ -1247,26 +1077,6 @@ err_unlock:
}
/**
- * blkcg_drain_queue - drain blkcg part of request_queue
- * @q: request_queue to drain
- *
- * Called from blk_drain_queue(). Responsible for draining blkcg part.
- */
-void blkcg_drain_queue(struct request_queue *q)
-{
- lockdep_assert_held(&q->queue_lock);
-
- /*
- * @q could be exiting and already have destroyed all blkgs as
- * indicated by NULL root_blkg. If so, don't confuse policies.
- */
- if (!q->root_blkg)
- return;
-
- blk_throtl_drain(q);
-}
-
-/**
* blkcg_exit_queue - exit and release blkcg part of request_queue
* @q: request_queue being released
*
@@ -1304,6 +1114,77 @@ static int blkcg_can_attach(struct cgroup_taskset *tset)
return ret;
}
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
+
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] += src->bytes[i];
+ dst->ios[i] += src->ios[i];
+ }
+}
+
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
+
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] -= src->bytes[i];
+ dst->ios[i] -= src->ios[i];
+ }
+}
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ struct blkcg *blkcg = css_to_blkcg(css);
+ struct blkcg_gq *blkg;
+
+ rcu_read_lock();
+
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct blkcg_gq *parent = blkg->parent;
+ struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
+ struct blkg_iostat cur, delta;
+ unsigned seq;
+
+ /* fetch the current per-cpu values */
+ do {
+ seq = u64_stats_fetch_begin(&bisc->sync);
+ blkg_iostat_set(&cur, &bisc->cur);
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
+
+ /* propagate percpu delta to global */
+ u64_stats_update_begin(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, &cur);
+ blkg_iostat_sub(&delta, &bisc->last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(&bisc->last, &delta);
+ u64_stats_update_end(&blkg->iostat.sync);
+
+ /* propagate global delta to parent */
+ if (parent) {
+ u64_stats_update_begin(&parent->iostat.sync);
+ blkg_iostat_set(&delta, &blkg->iostat.cur);
+ blkg_iostat_sub(&delta, &blkg->iostat.last);
+ blkg_iostat_add(&parent->iostat.cur, &delta);
+ blkg_iostat_add(&blkg->iostat.last, &delta);
+ u64_stats_update_end(&parent->iostat.sync);
+ }
+ }
+
+ rcu_read_unlock();
+}
+
static void blkcg_bind(struct cgroup_subsys_state *root_css)
{
int i;
@@ -1333,9 +1214,11 @@ static void blkcg_exit(struct task_struct *tsk)
struct cgroup_subsys io_cgrp_subsys = {
.css_alloc = blkcg_css_alloc,
+ .css_online = blkcg_css_online,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
.can_attach = blkcg_can_attach,
+ .css_rstat_flush = blkcg_rstat_flush,
.bind = blkcg_bind,
.dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
@@ -1372,7 +1255,7 @@ int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg;
+ struct blkcg_gq *blkg, *pinned_blkg = NULL;
int ret;
if (blkcg_policy_enabled(q, pol))
@@ -1380,48 +1263,82 @@ int blkcg_activate_policy(struct request_queue *q,
if (queue_is_mq(q))
blk_mq_freeze_queue(q);
-pd_prealloc:
- if (!pd_prealloc) {
- pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
- if (!pd_prealloc) {
- ret = -ENOMEM;
- goto out_bypass_end;
- }
- }
-
+retry:
spin_lock_irq(&q->queue_lock);
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ /* blkg_list is pushed at the head, reverse walk to allocate parents first */
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
if (blkg->pd[pol->plid])
continue;
- pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
- if (!pd)
- swap(pd, pd_prealloc);
+ /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
+ if (blkg == pinned_blkg) {
+ pd = pd_prealloc;
+ pd_prealloc = NULL;
+ } else {
+ pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
+ blkg->blkcg);
+ }
+
if (!pd) {
+ /*
+ * GFP_NOWAIT failed. Free the existing one and
+ * prealloc for @blkg w/ GFP_KERNEL.
+ */
+ if (pinned_blkg)
+ blkg_put(pinned_blkg);
+ blkg_get(blkg);
+ pinned_blkg = blkg;
+
spin_unlock_irq(&q->queue_lock);
- goto pd_prealloc;
+
+ if (pd_prealloc)
+ pol->pd_free_fn(pd_prealloc);
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
+ blkg->blkcg);
+ if (pd_prealloc)
+ goto retry;
+ else
+ goto enomem;
}
blkg->pd[pol->plid] = pd;
pd->blkg = blkg;
pd->plid = pol->plid;
- if (pol->pd_init_fn)
- pol->pd_init_fn(pd);
}
+ /* all allocated, init in the same order */
+ if (pol->pd_init_fn)
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+ pol->pd_init_fn(blkg->pd[pol->plid]);
+
__set_bit(pol->plid, q->blkcg_pols);
ret = 0;
spin_unlock_irq(&q->queue_lock);
-out_bypass_end:
+out:
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q);
+ if (pinned_blkg)
+ blkg_put(pinned_blkg);
if (pd_prealloc)
pol->pd_free_fn(pd_prealloc);
return ret;
+
+enomem:
+ /* alloc failed, nothing's initialized yet, free everything */
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ if (blkg->pd[pol->plid]) {
+ pol->pd_free_fn(blkg->pd[pol->plid]);
+ blkg->pd[pol->plid] = NULL;
+ }
+ }
+ spin_unlock_irq(&q->queue_lock);
+ ret = -ENOMEM;
+ goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
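The reworked allocation loop in blkcg_activate_policy() above follows a common pattern: allocate opportunistically with GFP_NOWAIT while the queue lock is held, and on failure drop the lock, preallocate with GFP_KERNEL, and restart the walk. The following is a minimal, simplified sketch of that shape only; the demo_* names are hypothetical, and unlike the real code the preallocated item is simply handed to the next node that needs one rather than being pinned to a particular blkg.

#include <linux/list.h>
#include <linux/slab.h>
#include <linux/spinlock.h>

/* Hypothetical demo types used only to illustrate the allocation pattern. */
struct demo_item { int payload; };
struct demo_node { struct list_head link; struct demo_item *item; };
struct demo_ctx  { spinlock_t lock; struct list_head nodes; };

static int demo_attach_items(struct demo_ctx *ctx)
{
        struct demo_item *prealloc = NULL;
        struct demo_node *node;

retry:
        spin_lock_irq(&ctx->lock);
        list_for_each_entry(node, &ctx->nodes, link) {
                struct demo_item *item;

                if (node->item)
                        continue;

                /* Prefer a preallocated item; otherwise try an atomic allocation. */
                if (prealloc) {
                        item = prealloc;
                        prealloc = NULL;
                } else {
                        item = kzalloc(sizeof(*item), GFP_NOWAIT | __GFP_NOWARN);
                }

                if (!item) {
                        /* Drop the lock, preallocate with GFP_KERNEL, restart. */
                        spin_unlock_irq(&ctx->lock);
                        prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
                        if (!prealloc)
                                return -ENOMEM;
                        goto retry;
                }
                node->item = item;
        }
        spin_unlock_irq(&ctx->lock);

        kfree(prealloc);        /* kfree(NULL) is a no-op */
        return 0;
}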
@@ -1510,7 +1427,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
cpd->plid = pol->plid;
- pol->cpd_init_fn(cpd);
+ if (pol->cpd_init_fn)
+ pol->cpd_init_fn(cpd);
}
}
@@ -1583,6 +1501,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ /* consume the flag first */
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+ /* never bounce for the root cgroup */
+ if (!blkg->parent)
+ return false;
+
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ return true;
+}
+
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1593,6 +1530,10 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
{
u64 old = atomic64_read(&blkg->delay_start);
+ /* negative use_delay means no scaling, see blkcg_set_delay() */
+ if (atomic_read(&blkg->use_delay) < 0)
+ return;
+
/*
* We only want to scale down every second. The idea here is that we
* want to delay people for min(delay_nsec, NSEC_PER_SEC) in a certain
@@ -1644,6 +1585,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
+ unsigned long pflags;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
@@ -1670,11 +1612,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
- /*
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
- * that hasn't landed upstream yet. Once that stuff is in place we need
- * to do a psi_memstall_enter/leave if memdelay is set.
- */
+ if (use_memdelay)
+ psi_memstall_enter(&pflags);
exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare();
@@ -1684,6 +1623,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break;
} while (!fatal_signal_pending(current));
io_schedule_finish(tok);
+
+ if (use_memdelay)
+ psi_memstall_leave(&pflags);
}
/**
@@ -1779,9 +1721,22 @@ void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
*/
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
blkcg_scale_delay(blkg, now);
atomic64_add(delta, &blkg->delay_nsec);
}
+static int __init blkcg_init(void)
+{
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_init);
+
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
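Both blkcg_print_stat() and blkcg_rstat_flush() above read the per-blkg and per-cpu counters with the u64_stats seqcount pattern: the reader retries until it observes a snapshot that no writer interleaved with. Below is a minimal sketch of that writer/reader pairing, assuming a hypothetical demo_stat structure (illustrative only, not part of this patch).

#include <linux/u64_stats_sync.h>

struct demo_stat {
        struct u64_stats_sync sync;     /* guards the 64-bit counters on 32-bit SMP */
        u64 bytes;
        u64 ios;
};

static void demo_stat_init(struct demo_stat *s)
{
        u64_stats_init(&s->sync);
        s->bytes = 0;
        s->ios = 0;
}

/* Writer: bracket updates so readers can detect that they raced with us. */
static void demo_stat_add(struct demo_stat *s, u64 bytes)
{
        u64_stats_update_begin(&s->sync);
        s->bytes += bytes;
        s->ios++;
        u64_stats_update_end(&s->sync);
}

/* Reader: retry until a consistent pair of values has been observed. */
static void demo_stat_read(struct demo_stat *s, u64 *bytes, u64 *ios)
{
        unsigned int seq;

        do {
                seq = u64_stats_fetch_begin(&s->sync);
                *bytes = s->bytes;
                *ios = s->ios;
        } while (u64_stats_fetch_retry(&s->sync, seq));
}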
diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..03252af8c82c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -20,6 +20,7 @@
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
@@ -34,8 +35,12 @@
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
+#include <linux/psi.h>
+#include <linux/sched/sysctl.h>
+#include <linux/blk-crypto.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -117,9 +122,52 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+ refcount_set(&rq->ref, 1);
+ blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
+static const char *const blk_op_name[] = {
+ REQ_OP_NAME(READ),
+ REQ_OP_NAME(WRITE),
+ REQ_OP_NAME(FLUSH),
+ REQ_OP_NAME(DISCARD),
+ REQ_OP_NAME(SECURE_ERASE),
+ REQ_OP_NAME(ZONE_RESET),
+ REQ_OP_NAME(ZONE_RESET_ALL),
+ REQ_OP_NAME(ZONE_OPEN),
+ REQ_OP_NAME(ZONE_CLOSE),
+ REQ_OP_NAME(ZONE_FINISH),
+ REQ_OP_NAME(ZONE_APPEND),
+ REQ_OP_NAME(WRITE_SAME),
+ REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(SCSI_IN),
+ REQ_OP_NAME(SCSI_OUT),
+ REQ_OP_NAME(DRV_IN),
+ REQ_OP_NAME(DRV_OUT),
+};
+#undef REQ_OP_NAME
+
+/**
+ * blk_op_str - Return the string XXX matching a REQ_OP_XXX value.
+ * @op: REQ_OP_XXX.
+ *
+ * Description: Centralized block layer helper to convert a REQ_OP_XXX value
+ * into string form. Useful when debugging or tracing a bio or request. For
+ * an invalid REQ_OP_XXX it returns the string "UNKNOWN".
+ */
+inline const char *blk_op_str(unsigned int op)
+{
+ const char *op_str = "UNKNOWN";
+
+ if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
+ op_str = blk_op_name[op];
+
+ return op_str;
+}
+EXPORT_SYMBOL_GPL(blk_op_str);
+
static const struct {
int errno;
const char *name;
@@ -167,18 +215,23 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-static void print_req_error(struct request *req, blk_status_t status)
+static void print_req_error(struct request *req, blk_status_t status,
+ const char *caller)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return;
- printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
- __func__, blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- (unsigned long long)blk_rq_pos(req),
- req->cmd_flags);
+ printk_ratelimited(KERN_ERR
+ "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ caller, blk_errors[idx].name,
+ req->rq_disk ? req->rq_disk->disk_name : "?",
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+ req->cmd_flags & ~REQ_OP_MASK,
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
}
static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -192,6 +245,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
@@ -291,18 +355,18 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying);
*/
void blk_cleanup_queue(struct request_queue *q)
{
+ WARN_ON_ONCE(blk_queue_registered(q));
+
/* mark @q DYING, no new request or merges will be allowed afterwards */
- mutex_lock(&q->sysfs_lock);
blk_set_queue_dying(q);
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
- mutex_unlock(&q->sysfs_lock);
/*
* Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that q->request_fn() gets invoked after draining finished.
+ * prevent that blk_mq_run_hw_queues() accesses the hardware queues
+ * after draining finished.
*/
blk_freeze_queue(q);
@@ -340,12 +404,6 @@ void blk_cleanup_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_cleanup_queue);
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
-{
- return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(blk_alloc_queue);
-
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
@@ -398,6 +456,23 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
}
}
+static inline int bio_queue_enter(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ int ret;
+
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+ if (unlikely(ret)) {
+ if (nowait && !blk_queue_dying(q))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ }
+
+ return ret;
+}
+
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
@@ -422,25 +497,19 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-/**
- * blk_alloc_queue_node - allocate a request queue
- * @gfp_mask: memory allocation flags
- * @node_id: NUMA node to allocate memory from
- */
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+struct request_queue *__blk_alloc_queue(int node_id)
{
struct request_queue *q;
int ret;
q = kmem_cache_alloc_node(blk_requestq_cachep,
- gfp_mask | __GFP_ZERO, node_id);
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
- INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
goto fail_q;
@@ -448,7 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
+ q->backing_dev_info = bdi_alloc(node_id);
if (!q->backing_dev_info)
goto fail_split;
@@ -458,7 +527,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
- q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
@@ -476,6 +544,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->blk_trace_mutex);
#endif
mutex_init(&q->sysfs_lock);
+ mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
@@ -493,6 +562,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (blkcg_init_queue(q))
goto fail_ref;
+ blk_queue_dma_alignment(q, 511);
+ blk_set_default_limits(&q->limits);
+
return q;
fail_ref:
@@ -509,7 +581,22 @@ fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
-EXPORT_SYMBOL(blk_alloc_queue_node);
+
+struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id)
+{
+ struct request_queue *q;
+
+ if (WARN_ON_ONCE(!make_request))
+ return NULL;
+
+ q = __blk_alloc_queue(node_id);
+ if (!q)
+ return NULL;
+ q->make_request_fn = make_request;
+ q->nr_requests = BLKDEV_MAX_RQ;
+ return q;
+}
+EXPORT_SYMBOL(blk_alloc_queue);
bool blk_get_queue(struct request_queue *q)
{
@@ -550,15 +637,26 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_back_merge_fn(q, req, bio))
+ if (!ll_back_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_backmerge(q, req, bio);
+ trace_block_bio_backmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -567,19 +665,22 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_front_merge_fn(q, req, bio))
+ if (!ll_front_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_frontmerge(q, req, bio);
+ trace_block_bio_frontmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -590,7 +691,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
@@ -605,12 +708,14 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
+ rq_qos_merge(q, req, bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
@@ -621,6 +726,7 @@ no_merge:
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
@@ -639,13 +745,13 @@ no_merge:
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- struct request **same_queue_rq)
+ unsigned int nr_segs, struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (!plug)
return false;
@@ -668,10 +774,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(q, rq, bio);
+ merged = bio_attempt_back_merge(rq, bio, nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(q, rq, bio);
+ merged = bio_attempt_front_merge(rq, bio, nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
@@ -687,18 +793,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
return false;
}
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
-{
- if (bio->bi_opf & REQ_RAHEAD)
- req->cmd_flags |= REQ_FAILFAST_MASK;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->ioprio = bio_prio(bio);
- req->write_hint = bio->bi_write_hint;
- blk_rq_bio_prep(req->q, req, bio);
-}
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
-
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
@@ -809,11 +903,7 @@ static inline int blk_partition_remap(struct bio *bio)
if (unlikely(bio_check_ro(bio, p)))
goto out;
- /*
- * Zone reset does not include bi_size so bio_sectors() is always 0.
- * Include a test for the reset op code and perform the remap if needed.
- */
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
+ if (bio_sectors(bio)) {
if (bio_check_eod(bio, part_nr_sects_read(p)))
goto out;
bio->bi_iter.bi_sector += p->start_sect;
@@ -827,6 +917,41 @@ out:
return ret;
}
+/*
+ * Check write append to a zoned block device.
+ */
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+ struct bio *bio)
+{
+ sector_t pos = bio->bi_iter.bi_sector;
+ int nr_sectors = bio_sectors(bio);
+
+ /* Only applicable to zoned block devices */
+ if (!blk_queue_is_zoned(q))
+ return BLK_STS_NOTSUPP;
+
+ /* The bio sector must point to the start of a sequential zone */
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
+ !blk_queue_zone_is_seq(q, pos))
+ return BLK_STS_IOERR;
+
+ /*
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
+ * split and could result in non-contiguous sectors being written in
+ * different zones.
+ */
+ if (nr_sectors > q->limits.chunk_sectors)
+ return BLK_STS_IOERR;
+
+ /* Make sure the BIO is small enough and will not get split */
+ if (nr_sectors > q->limits.max_zone_append_sectors)
+ return BLK_STS_IOERR;
+
+ bio->bi_opf |= REQ_NOMERGE;
+
+ return BLK_STS_OK;
+}
+
static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
@@ -896,10 +1021,22 @@ generic_make_request_checks(struct bio *bio)
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_OPEN:
+ case REQ_OP_ZONE_CLOSE:
+ case REQ_OP_ZONE_FINISH:
if (!blk_queue_is_zoned(q))
goto not_supported;
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ goto not_supported;
+ break;
case REQ_OP_WRITE_ZEROES:
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
@@ -909,12 +1046,13 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
+ * Various block parts want %current->io_context, so allocate it up
+ * front rather than dealing with lots of pain to allocate it only
+ * where needed. This may fail and the block layer knows how to live
+ * with it.
*/
- create_io_context(GFP_ATOMIC, q->node);
+ if (unlikely(!current->io_context))
+ create_task_io_context(current, GFP_ATOMIC, q->node);
if (!blkcg_bio_issue_check(q, bio))
return false;
@@ -936,29 +1074,28 @@ end_io:
return false;
}
+static blk_qc_t do_make_request(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ if (blk_crypto_bio_prep(&bio)) {
+ if (!q->make_request_fn)
+ return blk_mq_make_request(q, bio);
+ ret = q->make_request_fn(q, bio);
+ }
+ blk_queue_exit(q);
+ return ret;
+}
+
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may resubmit the bio to
- * a lower device by calling into generic_make_request recursively, which
- * means the bio should NOT be touched after the call to ->make_request_fn.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
blk_qc_t generic_make_request(struct bio *bio)
{
@@ -1009,18 +1146,14 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
- blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
- BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, flags) == 0)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = q->make_request_fn(q, bio);
-
- blk_queue_exit(q);
+ ret = do_make_request(bio);
/* sort new bios into those for a lower level
* and those for the same level
@@ -1036,12 +1169,6 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } else {
- if (unlikely(!blk_queue_dying(q) &&
- (bio->bi_opf & REQ_NOWAIT)))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
@@ -1058,31 +1185,25 @@ EXPORT_SYMBOL(generic_make_request);
*
* This function behaves like generic_make_request(), but does not protect
* against recursion. Must only be used if the called driver is known
- * to not call generic_make_request (or direct_make_request) again from
- * its make_request function. (Calling direct_make_request again from
- * a workqueue is perfectly fine as that doesn't recurse).
+ * to be blk-mq based.
*/
blk_qc_t direct_make_request(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
- blk_qc_t ret;
+ if (WARN_ON_ONCE(q->make_request_fn)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;
-
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
- if (nowait && !blk_queue_dying(q))
- bio->bi_status = BLK_STS_AGAIN;
- else
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
+ if (unlikely(bio_queue_enter(bio)))
+ return BLK_QC_T_NONE;
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-
- ret = q->make_request_fn(q, bio);
- blk_queue_exit(q);
- return ret;
+ return blk_mq_make_request(q, bio);
}
EXPORT_SYMBOL_GPL(direct_make_request);
@@ -1090,13 +1211,20 @@ EXPORT_SYMBOL_GPL(direct_make_request);
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
+ * fully set up &struct bio that describes the I/O that needs to be done. The
+ * bio will be sent to the device described by the bi_disk and bi_partno fields.
*
+ * The success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
+ * been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
@@ -1126,13 +1254,31 @@ blk_qc_t submit_bio(struct bio *bio)
}
}
+ /*
+ * If we're reading data that is part of the userspace workingset, count
+ * submission time as memory stall. When the device is congested, or
+ * the submitting cgroup is IO-throttled, submission can be a significant
+ * part of overall IO time.
+ */
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
+ bio_flagged(bio, BIO_WORKINGSET))) {
+ unsigned long pflags;
+ blk_qc_t ret;
+
+ psi_memstall_enter(&pflags);
+ ret = generic_make_request(bio);
+ psi_memstall_leave(&pflags);
+
+ return ret;
+ }
+
return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
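submit_bio() now brackets workingset reads with psi_memstall_enter()/psi_memstall_leave() so that congested or throttled submission shows up as memory pressure. A hedged sketch of the same bracketing from a caller's point of view, assuming a synchronous read via the existing submit_bio_wait() helper (this wrapper is illustrative and not part of the patch):

#include <linux/bio.h>
#include <linux/psi.h>

/*
 * Hypothetical wrapper: the caller knows this read services the userspace
 * workingset, so the whole synchronous submission is accounted as a memory
 * stall.
 */
static int demo_sync_workingset_read(struct bio *bio)
{
        unsigned long pflags;
        int ret;

        psi_memstall_enter(&pflags);
        ret = submit_bio_wait(bio);     /* submit @bio and wait for completion */
        psi_memstall_leave(&pflags);

        return ret;
}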
/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for new the queue limits
+ * for the new queue limits
* @q: the queue
* @rq: the request being checked
*
@@ -1163,7 +1309,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
* Recalculate it to check the request correctly on this queue's
* limitation.
*/
- blk_recalc_rq_segments(rq);
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
@@ -1187,8 +1333,11 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
+ if (blk_crypto_insert_cloned_request(rq))
+ return BLK_STS_IOERR;
+
if (blk_queue_io_stat(q))
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
@@ -1240,9 +1389,24 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-void blk_account_io_completion(struct request *req, unsigned int bytes)
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
{
- if (blk_do_io_stat(req)) {
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+ if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
@@ -1260,59 +1424,68 @@ void blk_account_io_done(struct request *req, u64 now)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ if (req->part && blk_do_io_stat(req) &&
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
part_stat_lock();
part = req->part;
- update_io_ticks(part, jiffies);
+ update_io_ticks(part, jiffies, true);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
- part_dec_in_flight(req->q, part, rq_data_dir(req));
+ part_stat_unlock();
hd_struct_put(part);
- part_stat_unlock();
}
}
-void blk_account_io_start(struct request *rq, bool new_io)
+void blk_account_io_start(struct request *rq)
{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
-
if (!blk_do_io_stat(rq))
return;
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+}
- if (!new_io) {
- part = rq->part;
- part_stat_inc(part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_inc_in_flight(rq->q, part, rw);
- rq->part = part;
- }
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+
+ part_stat_lock();
+ update_io_ticks(part, now, false);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
+ part_stat_unlock();
- update_io_ticks(part, jiffies);
+ return now;
+}
+EXPORT_SYMBOL(disk_start_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long duration = now - start_time;
+ part_stat_lock();
+ update_io_ticks(part, now, true);
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+EXPORT_SYMBOL(disk_end_io_acct);
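disk_start_io_acct() and disk_end_io_acct(), newly exported above, give bio-based drivers a simple way to account I/O against the whole disk without touching the partition statistics directly. A hedged usage sketch for a hypothetical driver follows; my_bio_dev and my_bio_dev_do_io are illustrative names, and a real driver with asynchronous completion would stash the start time and call disk_end_io_acct() from its completion handler instead.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* Hypothetical bio-based driver context. */
struct my_bio_dev {
        struct gendisk *disk;
};

/* Hypothetical: performs the actual I/O synchronously. */
static void my_bio_dev_do_io(struct my_bio_dev *dev, struct bio *bio);

static blk_qc_t my_bio_dev_submit(struct my_bio_dev *dev, struct bio *bio)
{
        unsigned long start;

        /* Records the start time and bumps ios/sectors/in_flight on part0. */
        start = disk_start_io_acct(dev->disk, bio_sectors(bio), bio_op(bio));

        my_bio_dev_do_io(dev, bio);

        /* Adds the elapsed time and drops the in_flight counter. */
        disk_end_io_acct(dev->disk, bio_op(bio), start);

        bio_endio(bio);
        return BLK_QC_T_NONE;
}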
/*
* Steal bios from a request and add them to a bio list.
@@ -1348,7 +1521,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
*
* This special helper function is only for request stacking drivers
* (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_end_request instead.
+ * Actual device drivers should use blk_mq_end_request instead.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
@@ -1371,9 +1544,15 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (!req->bio)
return false;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ error == BLK_STS_OK)
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)))
- print_req_error(req, error);
+ print_req_error(req, error, __func__);
blk_account_io_completion(req, nr_bytes);
@@ -1432,28 +1611,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
}
/* recalculate the number of segments */
- blk_recalc_rq_segments(req);
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
- struct bio *bio)
-{
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- rq->nr_phys_segments = 1;
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
-
- if (bio->bi_disk)
- rq->rq_disk = bio->bi_disk;
-}
-
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
@@ -1520,23 +1684,6 @@ void blk_rq_unprep_clone(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
-{
- dst->__sector = blk_rq_pos(src);
- dst->__data_len = blk_rq_bytes(src);
- if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
- dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
- dst->special_vec = src->special_vec;
- }
- dst->nr_phys_segments = src->nr_phys_segments;
- dst->ioprio = src->ioprio;
- dst->extra_len = src->extra_len;
-}
-
/**
* blk_rq_prep_clone - Helper function to setup clone request
* @rq: the request to be setup
@@ -1549,8 +1696,6 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
*
* Description:
* Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- * are not copied, and copying such parts is the caller's responsibility.
* Also, pages which the original bios are pointing to are not copied
* and the cloned bios just point same pages.
* So cloned bios must be completed before original bios, which means
@@ -1581,7 +1726,18 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->bio = rq->biotail = bio;
}
- __blk_rq_prep_clone(rq, rq_src);
+ /* Copy attributes of the original request to the clone request. */
+ rq->__sector = blk_rq_pos(rq_src);
+ rq->__data_len = blk_rq_bytes(rq_src);
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ rq->special_vec = rq_src->special_vec;
+ }
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->ioprio = rq_src->ioprio;
+
+ if (rq->bio)
+ blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
return 0;
@@ -1600,12 +1756,6 @@ int kblockd_schedule_work(struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work);
-int kblockd_schedule_work_on(int cpu, struct work_struct *work)
-{
- return queue_work_on(cpu, kblockd_workqueue, work);
-}
-EXPORT_SYMBOL(kblockd_schedule_work_on);
-
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
@@ -1729,13 +1879,25 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
+void blk_io_schedule(void)
+{
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+ if (timeout)
+ io_schedule_timeout(timeout);
+ else
+ io_schedule();
+}
+EXPORT_SYMBOL_GPL(blk_io_schedule);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
- FIELD_SIZEOF(struct request, cmd_flags));
+ sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
- FIELD_SIZEOF(struct bio, bi_opf));
+ sizeof_field(struct bio, bi_opf));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
new file mode 100644
index 000000000000..6e49688a2d80
--- /dev/null
+++ b/block/blk-crypto-fallback.c
@@ -0,0 +1,657 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
+ */
+
+#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
+
+#include <crypto/skcipher.h>
+#include <linux/blk-cgroup.h>
+#include <linux/blk-crypto.h>
+#include <linux/blkdev.h>
+#include <linux/crypto.h>
+#include <linux/keyslot-manager.h>
+#include <linux/mempool.h>
+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "blk-crypto-internal.h"
+
+static unsigned int num_prealloc_bounce_pg = 32;
+module_param(num_prealloc_bounce_pg, uint, 0);
+MODULE_PARM_DESC(num_prealloc_bounce_pg,
+ "Number of preallocated bounce pages for the blk-crypto crypto API fallback");
+
+static unsigned int blk_crypto_num_keyslots = 100;
+module_param_named(num_keyslots, blk_crypto_num_keyslots, uint, 0);
+MODULE_PARM_DESC(num_keyslots,
+ "Number of keyslots for the blk-crypto crypto API fallback");
+
+static unsigned int num_prealloc_fallback_crypt_ctxs = 128;
+module_param(num_prealloc_fallback_crypt_ctxs, uint, 0);
+MODULE_PARM_DESC(num_prealloc_fallback_crypt_ctxs,
+ "Number of preallocated bio fallback crypto contexts for blk-crypto to use during crypto API fallback");
+
+struct bio_fallback_crypt_ctx {
+ struct bio_crypt_ctx crypt_ctx;
+ /*
+ * Copy of the bvec_iter when this bio was submitted.
+ * We only want to en/decrypt the part of the bio as described by the
+ * bvec_iter upon submission, because the bio might be split before being
+ * resubmitted.
+ */
+ struct bvec_iter crypt_iter;
+ union {
+ struct {
+ struct work_struct work;
+ struct bio *bio;
+ };
+ struct {
+ void *bi_private_orig;
+ bio_end_io_t *bi_end_io_orig;
+ };
+ };
+};
+
+static struct kmem_cache *bio_fallback_crypt_ctx_cache;
+static mempool_t *bio_fallback_crypt_ctx_pool;
+
+/*
+ * Allocating a crypto tfm during I/O can deadlock, so we have to preallocate
+ * all of a mode's tfms when that mode starts being used. Since each mode may
+ * need all the keyslots at some point, each mode needs its own tfm for each
+ * keyslot; thus, a keyslot may contain tfms for multiple modes. However, to
+ * match the behavior of real inline encryption hardware (which only supports a
+ * single encryption context per keyslot), we only allow one tfm per keyslot to
+ * be used at a time - the rest of the unused tfms have their keys cleared.
+ */
+static DEFINE_MUTEX(tfms_init_lock);
+static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
+
+static struct blk_crypto_keyslot {
+ enum blk_crypto_mode_num crypto_mode;
+ struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
+} *blk_crypto_keyslots;
+
+static struct blk_keyslot_manager blk_crypto_ksm;
+static struct workqueue_struct *blk_crypto_wq;
+static mempool_t *blk_crypto_bounce_page_pool;
+
+/*
+ * This is the key we set when evicting a keyslot. This *should* be the all 0's
+ * key, but AES-XTS rejects that key, so we use some random bytes instead.
+ */
+static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
+
+static void blk_crypto_evict_keyslot(unsigned int slot)
+{
+ struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
+ int err;
+
+ WARN_ON(slotp->crypto_mode == BLK_ENCRYPTION_MODE_INVALID);
+
+ /* Clear the key in the skcipher */
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], blank_key,
+ blk_crypto_modes[crypto_mode].keysize);
+ WARN_ON(err);
+ slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
+}
+
+static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
+{
+ struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ const enum blk_crypto_mode_num crypto_mode =
+ key->crypto_cfg.crypto_mode;
+ int err;
+
+ if (crypto_mode != slotp->crypto_mode &&
+ slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
+ blk_crypto_evict_keyslot(slot);
+
+ slotp->crypto_mode = crypto_mode;
+ err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
+ key->size);
+ if (err) {
+ blk_crypto_evict_keyslot(slot);
+ return err;
+ }
+ return 0;
+}
+
+static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
+{
+ blk_crypto_evict_keyslot(slot);
+ return 0;
+}
+
+/*
+ * The crypto API fallback KSM ops - only used for a bio when it specifies a
+ * blk_crypto_key that was not supported by the device's inline encryption
+ * hardware.
+ */
+static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
+ .keyslot_program = blk_crypto_keyslot_program,
+ .keyslot_evict = blk_crypto_keyslot_evict,
+};
+
+static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
+{
+ struct bio *src_bio = enc_bio->bi_private;
+ int i;
+
+ for (i = 0; i < enc_bio->bi_vcnt; i++)
+ mempool_free(enc_bio->bi_io_vec[i].bv_page,
+ blk_crypto_bounce_page_pool);
+
+ src_bio->bi_status = enc_bio->bi_status;
+
+ bio_put(enc_bio);
+ bio_endio(src_bio);
+}
+
+static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+{
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ struct bio *bio;
+
+ bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
+ if (!bio)
+ return NULL;
+ bio->bi_disk = bio_src->bi_disk;
+ bio->bi_opf = bio_src->bi_opf;
+ bio->bi_ioprio = bio_src->bi_ioprio;
+ bio->bi_write_hint = bio_src->bi_write_hint;
+ bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
+ bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
+
+ bio_for_each_segment(bv, bio_src, iter)
+ bio->bi_io_vec[bio->bi_vcnt++] = bv;
+
+ bio_clone_blkg_association(bio, bio_src);
+ blkcg_bio_issue_init(bio);
+
+ return bio;
+}
+
+static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
+ struct skcipher_request **ciph_req_ret,
+ struct crypto_wait *wait)
+{
+ struct skcipher_request *ciph_req;
+ const struct blk_crypto_keyslot *slotp;
+ int keyslot_idx = blk_ksm_get_slot_idx(slot);
+
+ slotp = &blk_crypto_keyslots[keyslot_idx];
+ ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
+ GFP_NOIO);
+ if (!ciph_req)
+ return false;
+
+ skcipher_request_set_callback(ciph_req,
+ CRYPTO_TFM_REQ_MAY_BACKLOG |
+ CRYPTO_TFM_REQ_MAY_SLEEP,
+ crypto_req_done, wait);
+ *ciph_req_ret = ciph_req;
+
+ return true;
+}
+
+static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ unsigned int i = 0;
+ unsigned int num_sectors = 0;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+
+ bio_for_each_segment(bv, bio, iter) {
+ num_sectors += bv.bv_len >> SECTOR_SHIFT;
+ if (++i == BIO_MAX_PAGES)
+ break;
+ }
+ if (num_sectors < bio_sectors(bio)) {
+ struct bio *split_bio;
+
+ split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL);
+ if (!split_bio) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ return false;
+ }
+ bio_chain(split_bio, bio);
+ generic_make_request(bio);
+ *bio_ptr = split_bio;
+ }
+
+ return true;
+}
+
+union blk_crypto_iv {
+ __le64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ u8 bytes[BLK_CRYPTO_MAX_IV_SIZE];
+};
+
+static void blk_crypto_dun_to_iv(const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ union blk_crypto_iv *iv)
+{
+ int i;
+
+ for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++)
+ iv->dun[i] = cpu_to_le64(dun[i]);
+}
+
+/*
+ * The crypto API fallback's encryption routine.
+ * Allocate a bounce bio for encryption, encrypt the input bio using the
+ * crypto API, and replace *bio_ptr with the bounce bio. May split the input
+ * bio if it is too large. Returns true on success. Returns false and sets
+ * bio->bi_status on error.
+ */
+static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
+{
+ struct bio *src_bio, *enc_bio;
+ struct bio_crypt_ctx *bc;
+ struct blk_ksm_keyslot *slot;
+ int data_unit_size;
+ struct skcipher_request *ciph_req = NULL;
+ DECLARE_CRYPTO_WAIT(wait);
+ u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ struct scatterlist src, dst;
+ union blk_crypto_iv iv;
+ unsigned int i, j;
+ bool ret = false;
+ blk_status_t blk_st;
+
+ /* Split the bio if it's too big for single page bvec */
+ if (!blk_crypto_split_bio_if_needed(bio_ptr))
+ return false;
+
+ src_bio = *bio_ptr;
+ bc = src_bio->bi_crypt_context;
+ data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
+
+ /* Allocate bounce bio for encryption */
+ enc_bio = blk_crypto_clone_bio(src_bio);
+ if (!enc_bio) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ return false;
+ }
+
+ /*
+ * Use the crypto API fallback keyslot manager to get a crypto_skcipher
+ * for the algorithm and key specified for this bio.
+ */
+ blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ if (blk_st != BLK_STS_OK) {
+ src_bio->bi_status = blk_st;
+ goto out_put_enc_bio;
+ }
+
+ /* and then allocate an skcipher_request for it */
+ if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ goto out_release_keyslot;
+ }
+
+ memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
+ sg_init_table(&src, 1);
+ sg_init_table(&dst, 1);
+
+ skcipher_request_set_crypt(ciph_req, &src, &dst, data_unit_size,
+ iv.bytes);
+
+ /* Encrypt each page in the bounce bio */
+ for (i = 0; i < enc_bio->bi_vcnt; i++) {
+ struct bio_vec *enc_bvec = &enc_bio->bi_io_vec[i];
+ struct page *plaintext_page = enc_bvec->bv_page;
+ struct page *ciphertext_page =
+ mempool_alloc(blk_crypto_bounce_page_pool, GFP_NOIO);
+
+ enc_bvec->bv_page = ciphertext_page;
+
+ if (!ciphertext_page) {
+ src_bio->bi_status = BLK_STS_RESOURCE;
+ goto out_free_bounce_pages;
+ }
+
+ sg_set_page(&src, plaintext_page, data_unit_size,
+ enc_bvec->bv_offset);
+ sg_set_page(&dst, ciphertext_page, data_unit_size,
+ enc_bvec->bv_offset);
+
+ /* Encrypt each data unit in this page */
+ for (j = 0; j < enc_bvec->bv_len; j += data_unit_size) {
+ blk_crypto_dun_to_iv(curr_dun, &iv);
+ if (crypto_wait_req(crypto_skcipher_encrypt(ciph_req),
+ &wait)) {
+ i++;
+ src_bio->bi_status = BLK_STS_IOERR;
+ goto out_free_bounce_pages;
+ }
+ bio_crypt_dun_increment(curr_dun, 1);
+ src.offset += data_unit_size;
+ dst.offset += data_unit_size;
+ }
+ }
+
+ enc_bio->bi_private = src_bio;
+ enc_bio->bi_end_io = blk_crypto_fallback_encrypt_endio;
+ *bio_ptr = enc_bio;
+ ret = true;
+
+ enc_bio = NULL;
+ goto out_free_ciph_req;
+
+out_free_bounce_pages:
+ while (i > 0)
+ mempool_free(enc_bio->bi_io_vec[--i].bv_page,
+ blk_crypto_bounce_page_pool);
+out_free_ciph_req:
+ skcipher_request_free(ciph_req);
+out_release_keyslot:
+ blk_ksm_put_slot(slot);
+out_put_enc_bio:
+ if (enc_bio)
+ bio_put(enc_bio);
+
+ return ret;
+}
+
+/*
+ * The crypto API fallback's main decryption routine.
+ * Decrypts input bio in place, and calls bio_endio on the bio.
+ */
+static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
+{
+ struct bio_fallback_crypt_ctx *f_ctx =
+ container_of(work, struct bio_fallback_crypt_ctx, work);
+ struct bio *bio = f_ctx->bio;
+ struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
+ struct blk_ksm_keyslot *slot;
+ struct skcipher_request *ciph_req = NULL;
+ DECLARE_CRYPTO_WAIT(wait);
+ u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
+ union blk_crypto_iv iv;
+ struct scatterlist sg;
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ const int data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
+ unsigned int i;
+ blk_status_t blk_st;
+
+ /*
+ * Use the crypto API fallback keyslot manager to get a crypto_skcipher
+ * for the algorithm and key specified for this bio.
+ */
+ blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ if (blk_st != BLK_STS_OK) {
+ bio->bi_status = blk_st;
+ goto out_no_keyslot;
+ }
+
+ /* and then allocate an skcipher_request for it */
+ if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ bio->bi_status = BLK_STS_RESOURCE;
+ goto out;
+ }
+
+ memcpy(curr_dun, bc->bc_dun, sizeof(curr_dun));
+ sg_init_table(&sg, 1);
+ skcipher_request_set_crypt(ciph_req, &sg, &sg, data_unit_size,
+ iv.bytes);
+
+ /* Decrypt each segment in the bio */
+ __bio_for_each_segment(bv, bio, iter, f_ctx->crypt_iter) {
+ struct page *page = bv.bv_page;
+
+ sg_set_page(&sg, page, data_unit_size, bv.bv_offset);
+
+ /* Decrypt each data unit in the segment */
+ for (i = 0; i < bv.bv_len; i += data_unit_size) {
+ blk_crypto_dun_to_iv(curr_dun, &iv);
+ if (crypto_wait_req(crypto_skcipher_decrypt(ciph_req),
+ &wait)) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto out;
+ }
+ bio_crypt_dun_increment(curr_dun, 1);
+ sg.offset += data_unit_size;
+ }
+ }
+
+out:
+ skcipher_request_free(ciph_req);
+ blk_ksm_put_slot(slot);
+out_no_keyslot:
+ mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+ bio_endio(bio);
+}
+
+/**
+ * blk_crypto_fallback_decrypt_endio - queue bio for fallback decryption
+ *
+ * @bio: the bio to queue
+ *
+ * Restore bi_private and bi_end_io, and queue the bio for decryption into a
+ * workqueue, since this function will be called from an atomic context.
+ */
+static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
+{
+ struct bio_fallback_crypt_ctx *f_ctx = bio->bi_private;
+
+ bio->bi_private = f_ctx->bi_private_orig;
+ bio->bi_end_io = f_ctx->bi_end_io_orig;
+
+ /* If there was an IO error, don't queue for decrypt. */
+ if (bio->bi_status) {
+ mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
+ bio_endio(bio);
+ return;
+ }
+
+ INIT_WORK(&f_ctx->work, blk_crypto_fallback_decrypt_bio);
+ f_ctx->bio = bio;
+ queue_work(blk_crypto_wq, &f_ctx->work);
+}
+
+/**
+ * blk_crypto_fallback_bio_prep - Prepare a bio to use fallback en/decryption
+ *
+ * @bio_ptr: pointer to the bio to prepare
+ *
+ * If the bio is doing a WRITE operation, this splits the bio into two parts
+ * if it's too big (see blk_crypto_split_bio_if_needed). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates *bio_ptr to point
+ * to the bounce bio.
+ *
+ * For a READ operation, we mark the bio for decryption by using bi_private and
+ * bi_end_io.
+ *
+ * In either case, this function will make the bio look like a regular bio (i.e.
+ * as if no encryption context was ever specified) for the purposes of the rest
+ * of the stack except for blk-integrity (blk-integrity and blk-crypto are not
+ * currently supported together).
+ *
+ * Return: true on success. Sets bio->bi_status and returns false on error.
+ */
+bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+ struct bio_fallback_crypt_ctx *f_ctx;
+
+ if (WARN_ON_ONCE(!tfms_inited[bc->bc_key->crypto_cfg.crypto_mode])) {
+ /* User didn't call blk_crypto_start_using_key() first */
+ bio->bi_status = BLK_STS_IOERR;
+ return false;
+ }
+
+ if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
+ &bc->bc_key->crypto_cfg)) {
+ bio->bi_status = BLK_STS_NOTSUPP;
+ return false;
+ }
+
+ if (bio_data_dir(bio) == WRITE)
+ return blk_crypto_fallback_encrypt_bio(bio_ptr);
+
+ /*
+ * bio READ case: Set up a f_ctx in the bio's bi_private and set the
+ * bi_end_io appropriately to trigger decryption when the bio is ended.
+ */
+ f_ctx = mempool_alloc(bio_fallback_crypt_ctx_pool, GFP_NOIO);
+ f_ctx->crypt_ctx = *bc;
+ f_ctx->crypt_iter = bio->bi_iter;
+ f_ctx->bi_private_orig = bio->bi_private;
+ f_ctx->bi_end_io_orig = bio->bi_end_io;
+ bio->bi_private = (void *)f_ctx;
+ bio->bi_end_io = blk_crypto_fallback_decrypt_endio;
+ bio_crypt_free_ctx(bio);
+
+ return true;
+}
+
+int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
+{
+ return blk_ksm_evict_key(&blk_crypto_ksm, key);
+}
+
+static bool blk_crypto_fallback_inited;
+static int blk_crypto_fallback_init(void)
+{
+ int i;
+ int err;
+
+ if (blk_crypto_fallback_inited)
+ return 0;
+
+ prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+
+ err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ if (err)
+ goto out;
+ err = -ENOMEM;
+
+ blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
+ blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+
+ /* All blk-crypto modes have a crypto API fallback. */
+ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
+ blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
+ blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+
+ blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
+ WQ_UNBOUND | WQ_HIGHPRI |
+ WQ_MEM_RECLAIM, num_online_cpus());
+ if (!blk_crypto_wq)
+ goto fail_free_ksm;
+
+ blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
+ sizeof(blk_crypto_keyslots[0]),
+ GFP_KERNEL);
+ if (!blk_crypto_keyslots)
+ goto fail_free_wq;
+
+ blk_crypto_bounce_page_pool =
+ mempool_create_page_pool(num_prealloc_bounce_pg, 0);
+ if (!blk_crypto_bounce_page_pool)
+ goto fail_free_keyslots;
+
+ bio_fallback_crypt_ctx_cache = KMEM_CACHE(bio_fallback_crypt_ctx, 0);
+ if (!bio_fallback_crypt_ctx_cache)
+ goto fail_free_bounce_page_pool;
+
+ bio_fallback_crypt_ctx_pool =
+ mempool_create_slab_pool(num_prealloc_fallback_crypt_ctxs,
+ bio_fallback_crypt_ctx_cache);
+ if (!bio_fallback_crypt_ctx_pool)
+ goto fail_free_crypt_ctx_cache;
+
+ blk_crypto_fallback_inited = true;
+
+ return 0;
+fail_free_crypt_ctx_cache:
+ kmem_cache_destroy(bio_fallback_crypt_ctx_cache);
+fail_free_bounce_page_pool:
+ mempool_destroy(blk_crypto_bounce_page_pool);
+fail_free_keyslots:
+ kfree(blk_crypto_keyslots);
+fail_free_wq:
+ destroy_workqueue(blk_crypto_wq);
+fail_free_ksm:
+ blk_ksm_destroy(&blk_crypto_ksm);
+out:
+ return err;
+}
+
+/*
+ * Prepare blk-crypto-fallback for the specified crypto mode.
+ * Returns -ENOPKG if the needed crypto API support is missing.
+ */
+int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
+{
+ const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
+ struct blk_crypto_keyslot *slotp;
+ unsigned int i;
+ int err = 0;
+
+ /*
+ * Fast path
+ * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num]
+ * for each i are visible before we try to access them.
+ */
+ if (likely(smp_load_acquire(&tfms_inited[mode_num])))
+ return 0;
+
+ mutex_lock(&tfms_init_lock);
+ if (tfms_inited[mode_num])
+ goto out;
+
+ err = blk_crypto_fallback_init();
+ if (err)
+ goto out;
+
+ for (i = 0; i < blk_crypto_num_keyslots; i++) {
+ slotp = &blk_crypto_keyslots[i];
+ slotp->tfms[mode_num] = crypto_alloc_skcipher(cipher_str, 0, 0);
+ if (IS_ERR(slotp->tfms[mode_num])) {
+ err = PTR_ERR(slotp->tfms[mode_num]);
+ if (err == -ENOENT) {
+ pr_warn_once("Missing crypto API support for \"%s\"\n",
+ cipher_str);
+ err = -ENOPKG;
+ }
+ slotp->tfms[mode_num] = NULL;
+ goto out_free_tfms;
+ }
+
+ crypto_skcipher_set_flags(slotp->tfms[mode_num],
+ CRYPTO_TFM_REQ_FORBID_WEAK_KEYS);
+ }
+
+ /*
+ * Ensure that updates to blk_crypto_keyslots[i].tfms[mode_num]
+ * for each i are visible before we set tfms_inited[mode_num].
+ */
+ smp_store_release(&tfms_inited[mode_num], true);
+ goto out;
+
+out_free_tfms:
+ for (i = 0; i < blk_crypto_num_keyslots; i++) {
+ slotp = &blk_crypto_keyslots[i];
+ crypto_free_skcipher(slotp->tfms[mode_num]);
+ slotp->tfms[mode_num] = NULL;
+ }
+out:
+ mutex_unlock(&tfms_init_lock);
+ return err;
+}
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
new file mode 100644
index 000000000000..d2b0f565d83c
--- /dev/null
+++ b/block/blk-crypto-internal.h
@@ -0,0 +1,201 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright 2019 Google LLC
+ */
+
+#ifndef __LINUX_BLK_CRYPTO_INTERNAL_H
+#define __LINUX_BLK_CRYPTO_INTERNAL_H
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+
+/* Represents a crypto mode supported by blk-crypto */
+struct blk_crypto_mode {
+ const char *cipher_str; /* crypto API name (for fallback case) */
+ unsigned int keysize; /* key size in bytes */
+ unsigned int ivsize; /* iv size in bytes */
+};
+
+extern const struct blk_crypto_mode blk_crypto_modes[];
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+
+void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ unsigned int inc);
+
+bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio);
+
+bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
+ struct bio_crypt_ctx *bc2);
+
+static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req),
+ bio->bi_crypt_context);
+}
+
+static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return bio_crypt_ctx_mergeable(bio->bi_crypt_context,
+ bio->bi_iter.bi_size, req->crypt_ctx);
+}
+
+static inline bool bio_crypt_ctx_merge_rq(struct request *req,
+ struct request *next)
+{
+ return bio_crypt_ctx_mergeable(req->crypt_ctx, blk_rq_bytes(req),
+ next->crypt_ctx);
+}
+
+static inline void blk_crypto_rq_set_defaults(struct request *rq)
+{
+ rq->crypt_ctx = NULL;
+ rq->crypt_keyslot = NULL;
+}
+
+static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
+{
+ return rq->crypt_ctx;
+}
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_front_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_back_mergeable(struct request *req,
+ struct bio *bio)
+{
+ return true;
+}
+
+static inline bool bio_crypt_ctx_merge_rq(struct request *req,
+ struct request *next)
+{
+ return true;
+}
+
+static inline void blk_crypto_rq_set_defaults(struct request *rq) { }
+
+static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
+{
+ return false;
+}
+
+#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
+
+void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
+static inline void bio_crypt_advance(struct bio *bio, unsigned int bytes)
+{
+ if (bio_has_crypt_ctx(bio))
+ __bio_crypt_advance(bio, bytes);
+}
+
+void __bio_crypt_free_ctx(struct bio *bio);
+static inline void bio_crypt_free_ctx(struct bio *bio)
+{
+ if (bio_has_crypt_ctx(bio))
+ __bio_crypt_free_ctx(bio);
+}
+
+static inline void bio_crypt_do_front_merge(struct request *rq,
+ struct bio *bio)
+{
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ if (bio_has_crypt_ctx(bio))
+ memcpy(rq->crypt_ctx->bc_dun, bio->bi_crypt_context->bc_dun,
+ sizeof(rq->crypt_ctx->bc_dun));
+#endif
+}
+
+bool __blk_crypto_bio_prep(struct bio **bio_ptr);
+static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
+{
+ if (bio_has_crypt_ctx(*bio_ptr))
+ return __blk_crypto_bio_prep(bio_ptr);
+ return true;
+}
+
+blk_status_t __blk_crypto_init_request(struct request *rq);
+static inline blk_status_t blk_crypto_init_request(struct request *rq)
+{
+ if (blk_crypto_rq_is_encrypted(rq))
+ return __blk_crypto_init_request(rq);
+ return BLK_STS_OK;
+}
+
+void __blk_crypto_free_request(struct request *rq);
+static inline void blk_crypto_free_request(struct request *rq)
+{
+ if (blk_crypto_rq_is_encrypted(rq))
+ __blk_crypto_free_request(rq);
+}
+
+void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask);
+static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
+{
+ if (bio_has_crypt_ctx(bio))
+ __blk_crypto_rq_bio_prep(rq, bio, gfp_mask);
+}
+
+/**
+ * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted
+ * into a request queue.
+ * @rq: the request being queued
+ *
+ * Return: BLK_STS_OK on success, nonzero on error.
+ */
+static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq)
+{
+
+ if (blk_crypto_rq_is_encrypted(rq))
+ return blk_crypto_init_request(rq);
+ return BLK_STS_OK;
+}
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
+
+int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num);
+
+bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr);
+
+int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key);
+
+#else /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */
+
+static inline int
+blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
+{
+ pr_warn_once("crypto API fallback is disabled\n");
+ return -ENOPKG;
+}
+
+static inline bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
+{
+ pr_warn_once("crypto API fallback disabled; failing request.\n");
+ (*bio_ptr)->bi_status = BLK_STS_NOTSUPP;
+ return false;
+}
+
+static inline int
+blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
+{
+ return 0;
+}
+
+#endif /* CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK */
+
+#endif /* __LINUX_BLK_CRYPTO_INTERNAL_H */
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
new file mode 100644
index 000000000000..6533c9b36ab8
--- /dev/null
+++ b/block/blk-crypto.c
@@ -0,0 +1,404 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/*
+ * Refer to Documentation/block/inline-encryption.rst for detailed explanation.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/bio.h>
+#include <linux/blkdev.h>
+#include <linux/keyslot-manager.h>
+#include <linux/module.h>
+#include <linux/slab.h>
+
+#include "blk-crypto-internal.h"
+
+const struct blk_crypto_mode blk_crypto_modes[] = {
+ [BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .cipher_str = "xts(aes)",
+ .keysize = 64,
+ .ivsize = 16,
+ },
+ [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .cipher_str = "essiv(cbc(aes),sha256)",
+ .keysize = 16,
+ .ivsize = 16,
+ },
+ [BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .cipher_str = "adiantum(xchacha12,aes)",
+ .keysize = 32,
+ .ivsize = 32,
+ },
+};
+
+/*
+ * This number needs to be at least (the number of threads doing IO
+ * concurrently) * (maximum recursive depth of a bio), so that we don't
+ * deadlock on crypt_ctx allocations. The default is chosen to be the same
+ * as the default number of post read contexts in both EXT4 and F2FS.
+ */
+static int num_prealloc_crypt_ctxs = 128;
+
+module_param(num_prealloc_crypt_ctxs, int, 0444);
+MODULE_PARM_DESC(num_prealloc_crypt_ctxs,
+ "Number of bio crypto contexts to preallocate");
+
+static struct kmem_cache *bio_crypt_ctx_cache;
+static mempool_t *bio_crypt_ctx_pool;
+
+static int __init bio_crypt_ctx_init(void)
+{
+ size_t i;
+
+ bio_crypt_ctx_cache = KMEM_CACHE(bio_crypt_ctx, 0);
+ if (!bio_crypt_ctx_cache)
+ goto out_no_mem;
+
+ bio_crypt_ctx_pool = mempool_create_slab_pool(num_prealloc_crypt_ctxs,
+ bio_crypt_ctx_cache);
+ if (!bio_crypt_ctx_pool)
+ goto out_no_mem;
+
+ /* This is assumed in various places. */
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+
+ /* Sanity check that no algorithm exceeds the defined limits. */
+ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ BUG_ON(blk_crypto_modes[i].keysize > BLK_CRYPTO_MAX_KEY_SIZE);
+ BUG_ON(blk_crypto_modes[i].ivsize > BLK_CRYPTO_MAX_IV_SIZE);
+ }
+
+ return 0;
+out_no_mem:
+ panic("Failed to allocate mem for bio crypt ctxs\n");
+}
+subsys_initcall(bio_crypt_ctx_init);
+
+void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
+ const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask)
+{
+ struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+
+ bc->bc_key = key;
+ memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun));
+
+ bio->bi_crypt_context = bc;
+}
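+
+/*
+ * Illustrative usage sketch (not part of the original code): a filesystem
+ * that has already set up a blk_crypto_key would typically attach a context
+ * before submitting the bio. "key" and "first_dun" below are hypothetical
+ * caller-side variables.
+ *
+ *	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { first_dun };
+ *
+ *	bio_crypt_set_ctx(bio, key, dun, GFP_NOIO);
+ *	submit_bio(bio);
+ */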
+
+void __bio_crypt_free_ctx(struct bio *bio)
+{
+ mempool_free(bio->bi_crypt_context, bio_crypt_ctx_pool);
+ bio->bi_crypt_context = NULL;
+}
+
+void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
+{
+ dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ *dst->bi_crypt_context = *src->bi_crypt_context;
+}
+EXPORT_SYMBOL_GPL(__bio_crypt_clone);
+
+/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
+void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
+ unsigned int inc)
+{
+ int i;
+
+ for (i = 0; inc && i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) {
+ dun[i] += inc;
+ /*
+ * If the addition in this limb overflowed, then we need to
+ * carry 1 into the next limb. Else the carry is 0.
+ */
+ if (dun[i] < inc)
+ inc = 1;
+ else
+ inc = 0;
+ }
+}
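+
+/*
+ * Illustrative example (not part of the original code): an overflow of the
+ * low limb carries into the next one.
+ *
+ *	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { U64_MAX };
+ *
+ *	bio_crypt_dun_increment(dun, 1);
+ *	// now dun[0] == 0 and dun[1] == 1
+ */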
+
+void __bio_crypt_advance(struct bio *bio, unsigned int bytes)
+{
+ struct bio_crypt_ctx *bc = bio->bi_crypt_context;
+
+ bio_crypt_dun_increment(bc->bc_dun,
+ bytes >> bc->bc_key->data_unit_size_bits);
+}
+
+/*
+ * Returns true if @bc->bc_dun plus @bytes converted to data units is equal to
+ * @next_dun, treating the DUNs as multi-limb integers.
+ */
+bool bio_crypt_dun_is_contiguous(const struct bio_crypt_ctx *bc,
+ unsigned int bytes,
+ const u64 next_dun[BLK_CRYPTO_DUN_ARRAY_SIZE])
+{
+ int i;
+ unsigned int carry = bytes >> bc->bc_key->data_unit_size_bits;
+
+ for (i = 0; i < BLK_CRYPTO_DUN_ARRAY_SIZE; i++) {
+ if (bc->bc_dun[i] + carry != next_dun[i])
+ return false;
+ /*
+ * If the addition in this limb overflowed, then we need to
+ * carry 1 into the next limb. Else the carry is 0.
+ */
+ if ((bc->bc_dun[i] + carry) < carry)
+ carry = 1;
+ else
+ carry = 0;
+ }
+
+ /* If the DUN wrapped through 0, don't treat it as contiguous. */
+ return carry == 0;
+}
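+
+/*
+ * Illustrative example (not part of the original code): with a 4096-byte
+ * data unit size, a context whose DUN is 10 followed by 8192 bytes of data
+ * is contiguous with a context whose DUN is 12, since 10 + (8192 >> 12) == 12.
+ */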
+
+/*
+ * Checks that two bio crypt contexts are compatible - i.e. that
+ * they are mergeable except for data_unit_num continuity.
+ */
+static bool bio_crypt_ctx_compatible(struct bio_crypt_ctx *bc1,
+ struct bio_crypt_ctx *bc2)
+{
+ if (!bc1)
+ return !bc2;
+
+ return bc2 && bc1->bc_key == bc2->bc_key;
+}
+
+bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio)
+{
+ return bio_crypt_ctx_compatible(rq->crypt_ctx, bio->bi_crypt_context);
+}
+
+/*
+ * Checks that two bio crypt contexts are compatible, and also
+ * that their data_unit_nums are continuous (and can hence be merged)
+ * in the order @bc1 followed by @bc2.
+ */
+bool bio_crypt_ctx_mergeable(struct bio_crypt_ctx *bc1, unsigned int bc1_bytes,
+ struct bio_crypt_ctx *bc2)
+{
+ if (!bio_crypt_ctx_compatible(bc1, bc2))
+ return false;
+
+ return !bc1 || bio_crypt_dun_is_contiguous(bc1, bc1_bytes, bc2->bc_dun);
+}
+
+/* Check that all I/O segments are data unit aligned. */
+static bool bio_crypt_check_alignment(struct bio *bio)
+{
+ const unsigned int data_unit_size =
+ bio->bi_crypt_context->bc_key->crypto_cfg.data_unit_size;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (!IS_ALIGNED(bv.bv_len | bv.bv_offset, data_unit_size))
+ return false;
+ }
+
+ return true;
+}
+
+blk_status_t __blk_crypto_init_request(struct request *rq)
+{
+ return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
+ &rq->crypt_keyslot);
+}
+
+/**
+ * __blk_crypto_free_request - Uninitialize the crypto fields of a request.
+ *
+ * @rq: The request whose crypto fields to uninitialize.
+ *
+ * Completely uninitializes the crypto fields of a request. If a keyslot has
+ * been programmed into some inline encryption hardware, that keyslot is
+ * released. The rq->crypt_ctx is also freed.
+ */
+void __blk_crypto_free_request(struct request *rq)
+{
+ blk_ksm_put_slot(rq->crypt_keyslot);
+ mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
+ blk_crypto_rq_set_defaults(rq);
+}
+
+/**
+ * __blk_crypto_bio_prep - Prepare bio for inline encryption
+ *
+ * @bio_ptr: pointer to original bio pointer
+ *
+ * If the bio crypt context provided for the bio is supported by the underlying
+ * device's inline encryption hardware, do nothing.
+ *
+ * Otherwise, try to perform en/decryption for this bio by falling back to the
+ * kernel crypto API. When the crypto API fallback is used for encryption,
+ * blk-crypto may choose to split the bio into 2 - the first one that will
+ * continue to be processed and the second one that will be resubmitted via
+ * generic_make_request. A bounce bio will be allocated to encrypt the contents
+ * of the aforementioned "first one", and *bio_ptr will be updated to this
+ * bounce bio.
+ *
+ * Caller must ensure bio has bio_crypt_ctx.
+ *
+ * Return: true on success; false on error (and bio->bi_status will be set
+ * appropriately, and bio_endio() will have been called so bio
+ * submission should abort).
+ */
+bool __blk_crypto_bio_prep(struct bio **bio_ptr)
+{
+ struct bio *bio = *bio_ptr;
+ const struct blk_crypto_key *bc_key = bio->bi_crypt_context->bc_key;
+
+ /* Error if bio has no data. */
+ if (WARN_ON_ONCE(!bio_has_data(bio))) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto fail;
+ }
+
+ if (!bio_crypt_check_alignment(bio)) {
+ bio->bi_status = BLK_STS_IOERR;
+ goto fail;
+ }
+
+ /*
+ * Success if device supports the encryption context, or if we succeeded
+ * in falling back to the crypto API.
+ */
+ if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm,
+ &bc_key->crypto_cfg))
+ return true;
+
+ if (blk_crypto_fallback_bio_prep(bio_ptr))
+ return true;
+fail:
+ bio_endio(*bio_ptr);
+ return false;
+}
+
+/**
+ * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio
+ * is inserted
+ *
+ * @rq: The request to prepare
+ * @bio: The first bio being inserted into the request
+ * @gfp_mask: gfp mask
+ */
+void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
+{
+ if (!rq->crypt_ctx)
+ rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ *rq->crypt_ctx = *bio->bi_crypt_context;
+}
+
+/**
+ * blk_crypto_init_key() - Prepare a key for use with blk-crypto
+ * @blk_key: Pointer to the blk_crypto_key to initialize.
+ * @raw_key: Pointer to the raw key. Must be the correct length for the chosen
+ * @crypto_mode; see blk_crypto_modes[].
+ * @crypto_mode: identifier for the encryption algorithm to use
+ * @dun_bytes: number of bytes that will be used to specify the DUN when this
+ * key is used
+ * @data_unit_size: the data unit size to use for en/decryption
+ *
+ * Return: 0 on success, -errno on failure. The caller is responsible for
+ * zeroizing both blk_key and raw_key when done with them.
+ */
+int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
+ enum blk_crypto_mode_num crypto_mode,
+ unsigned int dun_bytes,
+ unsigned int data_unit_size)
+{
+ const struct blk_crypto_mode *mode;
+
+ memset(blk_key, 0, sizeof(*blk_key));
+
+ if (crypto_mode >= ARRAY_SIZE(blk_crypto_modes))
+ return -EINVAL;
+
+ mode = &blk_crypto_modes[crypto_mode];
+ if (mode->keysize == 0)
+ return -EINVAL;
+
+ if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE)
+ return -EINVAL;
+
+ if (!is_power_of_2(data_unit_size))
+ return -EINVAL;
+
+ blk_key->crypto_cfg.crypto_mode = crypto_mode;
+ blk_key->crypto_cfg.dun_bytes = dun_bytes;
+ blk_key->crypto_cfg.data_unit_size = data_unit_size;
+ blk_key->data_unit_size_bits = ilog2(data_unit_size);
+ blk_key->size = mode->keysize;
+ memcpy(blk_key->raw, raw_key, mode->keysize);
+
+ return 0;
+}
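+
+/*
+ * Illustrative usage sketch (not part of the original code): preparing an
+ * AES-256-XTS key for 4096-byte data units addressed by 8-byte DUNs.
+ * "raw_key" is a hypothetical 64-byte buffer of key material.
+ *
+ *	struct blk_crypto_key blk_key;
+ *	int err;
+ *
+ *	err = blk_crypto_init_key(&blk_key, raw_key,
+ *				  BLK_ENCRYPTION_MODE_AES_256_XTS,
+ *				  8, 4096);
+ */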
+
+/*
+ * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
+ * request queue it's submitted to supports inline crypto, or the
+ * blk-crypto-fallback is enabled and supports the cfg).
+ */
+bool blk_crypto_config_supported(struct request_queue *q,
+ const struct blk_crypto_config *cfg)
+{
+ return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
+ blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+}
+
+/**
+ * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @key: A key to use on the device
+ * @q: the request queue for the device
+ *
+ * Upper layers must call this function to ensure that either the hardware
+ * supports the key's crypto settings, or the crypto API fallback has transforms
+ * for the needed mode allocated and ready to go. This function may allocate
+ * an skcipher, and *should not* be called from the data path, since that might
+ * cause a deadlock.
+ *
+ * Return: 0 on success; -ENOPKG if the hardware doesn't support the key and
+ * blk-crypto-fallback is either disabled or the needed algorithm
+ * is disabled in the crypto API; or another -errno code.
+ */
+int blk_crypto_start_using_key(const struct blk_crypto_key *key,
+ struct request_queue *q)
+{
+ if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ return 0;
+ return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
+}
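+
+/*
+ * Illustrative ordering sketch (not part of the original code): key setup is
+ * expected to happen outside the I/O path, e.g. at mount or key-add time.
+ * "mode", "dun_bytes", "du_size" and "q" are hypothetical caller-side values.
+ *
+ *	err = blk_crypto_init_key(&blk_key, raw_key, mode, dun_bytes, du_size);
+ *	if (!err)
+ *		err = blk_crypto_start_using_key(&blk_key, q);
+ */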
+
+/**
+ * blk_crypto_evict_key() - Evict a key from any inline encryption hardware
+ * it may have been programmed into
+ * @q: The request queue whose associated inline encryption hardware this key
+ * might have been programmed into
+ * @key: The key to evict
+ *
+ * Upper layers (filesystems) must call this function to ensure that a key is
+ * evicted from any hardware that it might have been programmed into. The key
+ * must not be in use by any in-flight IO when this function is called.
+ *
+ * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ */
+int blk_crypto_evict_key(struct request_queue *q,
+ const struct blk_crypto_key *key)
+{
+ if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ return blk_ksm_evict_key(q->ksm, key);
+
+ /*
+ * If the request queue's associated inline encryption hardware didn't
+ * have support for the key, then the key might have been programmed
+ * into the fallback keyslot manager, so try to evict from there.
+ */
+ return blk_crypto_fallback_evict_key(key);
+}
diff --git a/block/blk-exec.c b/block/blk-exec.c
index 1db44ca0f4a6..85324d53d072 100644
--- a/block/blk-exec.c
+++ b/block/blk-exec.c
@@ -55,6 +55,8 @@ void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
rq->rq_disk = bd_disk;
rq->end_io = done;
+ blk_account_io_start(rq);
+
/*
* don't check dying flag for MQ because the request won't
* be reused after dying flag is set
diff --git a/block/blk-flush.c b/block/blk-flush.c
index aedd9320e605..15ae0155ec07 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,6 +69,7 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
+#include <linux/lockdep.h>
#include "blk.h"
#include "blk-mq.h"
@@ -136,6 +137,17 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
blk_mq_add_to_requeue_list(rq, add_front, true);
}
+static void blk_account_io_flush(struct request *rq)
+{
+ struct hd_struct *part = &rq->rq_disk->part0;
+
+ part_stat_lock();
+ part_stat_inc(part, ios[STAT_FLUSH]);
+ part_stat_add(part, nsecs[STAT_FLUSH],
+ ktime_get_ns() - rq->start_time_ns);
+ part_stat_unlock();
+}
+
/**
* blk_flush_complete_seq - complete flush sequence
* @rq: PREFLUSH/FUA request being sequenced
@@ -148,9 +160,6 @@ static void blk_flush_queue_rq(struct request *rq, bool add_front)
*
* CONTEXT:
* spin_lock_irq(fq->mq_flush_lock)
- *
- * RETURNS:
- * %true if requests were added to the dispatch queue, %false otherwise.
*/
static void blk_flush_complete_seq(struct request *rq,
struct blk_flush_queue *fq,
@@ -185,7 +194,7 @@ static void blk_flush_complete_seq(struct request *rq,
case REQ_FSEQ_DONE:
/*
- * @rq was previously adjusted by blk_flush_issue() for
+ * @rq was previously adjusted by blk_insert_flush() for
* flush sequencing and may already have gone through the
* flush data request completion path. Restore @rq for
* normal completion and end it.
@@ -212,8 +221,20 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
struct blk_mq_hw_ctx *hctx;
+ blk_account_io_flush(flush_rq);
+
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
+
+ if (!refcount_dec_and_test(&flush_rq->ref)) {
+ fq->rq_status = error;
+ spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+ return;
+ }
+
+ if (fq->rq_status != BLK_STS_OK)
+ error = fq->rq_status;
+
hctx = flush_rq->mq_hctx;
if (!q->elevator) {
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
@@ -237,7 +258,6 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
blk_flush_complete_seq(rq, fq, seq, error);
}
- fq->flush_queue_delayed = 0;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
}
@@ -388,7 +408,7 @@ void blk_insert_flush(struct request *rq)
*/
if ((policy & REQ_FSEQ_DATA) &&
!(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- blk_mq_request_bypass_insert(rq, false);
+ blk_mq_request_bypass_insert(rq, false, false);
return;
}
@@ -412,57 +432,27 @@ void blk_insert_flush(struct request *rq)
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @error_sector: error sector
*
* Description:
- * Issue a flush for the block device in question. Caller can supply
- * room for storing the error offset in case of a flush error, if they
- * wish to.
+ * Issue a flush for the block device in question.
*/
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask,
- sector_t *error_sector)
+int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
{
- struct request_queue *q;
struct bio *bio;
int ret = 0;
- if (bdev->bd_disk == NULL)
- return -ENXIO;
-
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- /*
- * some block devices may not have their queue correctly set up here
- * (e.g. loop device without a backing file) and so issuing a flush
- * here will panic. Ensure there is a request function before issuing
- * the flush.
- */
- if (!q->make_request_fn)
- return -ENXIO;
-
bio = bio_alloc(gfp_mask, 0);
bio_set_dev(bio, bdev);
bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
ret = submit_bio_wait(bio);
-
- /*
- * The driver must store the error location in ->bi_sector, if
- * it supports it. For non-stacked drivers, this should be
- * copied from blk_rq_pos(rq).
- */
- if (error_sector)
- *error_sector = bio->bi_iter.bi_sector;
-
bio_put(bio);
return ret;
}
EXPORT_SYMBOL(blkdev_issue_flush);
-struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
- int node, int cmd_size, gfp_t flags)
+struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
+ gfp_t flags)
{
struct blk_flush_queue *fq;
int rq_sz = sizeof(struct request);
@@ -482,6 +472,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
+ lockdep_register_key(&fq->key);
+ lockdep_set_class(&fq->mq_flush_lock, &fq->key);
+
return fq;
fail_rq:
@@ -496,6 +489,7 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
if (!fq)
return;
+ lockdep_unregister_key(&fq->key);
kfree(fq->flush_rq);
kfree(fq);
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 825c9c070458..c03705cbb9c9 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
return BLK_STS_OK;
}
+static void blk_integrity_nop_prepare(struct request *rq)
+{
+}
+
+static void blk_integrity_nop_complete(struct request *rq,
+ unsigned int nr_bytes)
+{
+}
+
static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
+ .prepare_fn = blk_integrity_nop_prepare,
+ .complete_fn = blk_integrity_nop_complete,
};
/**
@@ -383,7 +394,7 @@ static const struct blk_integrity_profile nop_profile = {
* send/receive integrity metadata it must use this function to register
* the capability with the block layer. The template is a blk_integrity
* struct with values appropriate for the underlying hardware. See
- * Documentation/block/data-integrity.txt.
+ * Documentation/block/data-integrity.rst.
*/
void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
{
@@ -398,6 +409,13 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tag_size = template->tag_size;
disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ if (disk->queue->ksm) {
+ pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+ blk_ksm_unregister(disk->queue);
+ }
+#endif
}
EXPORT_SYMBOL(blk_integrity_register);
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 5ed59ac6ae58..9df50fb507ca 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -84,6 +84,7 @@ static void ioc_destroy_icq(struct io_cq *icq)
* making it impossible to determine icq_cache. Record it in @icq.
*/
icq->__rcu_icq_cache = et->icq_cache;
+ icq->flags |= ICQ_DESTROYED;
call_rcu(&icq->__rcu_head, icq_free_icq_rcu);
}
@@ -212,15 +213,21 @@ static void __ioc_clear_queue(struct list_head *icq_list)
{
unsigned long flags;
+ rcu_read_lock();
while (!list_empty(icq_list)) {
struct io_cq *icq = list_entry(icq_list->next,
struct io_cq, q_node);
struct io_context *ioc = icq->ioc;
spin_lock_irqsave(&ioc->lock, flags);
+ if (icq->flags & ICQ_DESTROYED) {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ continue;
+ }
ioc_destroy_icq(icq);
spin_unlock_irqrestore(&ioc->lock, flags);
}
+ rcu_read_unlock();
}
/**
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
new file mode 100644
index 000000000000..8ac4aad66ebc
--- /dev/null
+++ b/block/blk-iocost.c
@@ -0,0 +1,2538 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * IO cost model based controller.
+ *
+ * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
+ * Copyright (C) 2019 Andy Newell <newella@fb.com>
+ * Copyright (C) 2019 Facebook
+ *
+ * One challenge of controlling IO resources is the lack of trivially
+ * observable cost metric. This is distinguished from CPU and memory where
+ * wallclock time and the number of bytes can serve as accurate enough
+ * approximations.
+ *
+ * Bandwidth and iops are the most commonly used metrics for IO devices but
+ * depending on the type and specifics of the device, different IO patterns
+ * easily lead to multiple orders of magnitude variations rendering them
+ * useless for the purpose of IO capacity distribution. While on-device
+ * time, with a lot of crutches, could serve as a useful approximation for
+ * non-queued rotational devices, this is no longer viable with modern
+ * devices, even the rotational ones.
+ *
+ * While there is no cost metric we can trivially observe, it isn't a
+ * complete mystery. For example, on a rotational device, seek cost
+ * dominates while a contiguous transfer contributes a smaller amount
+ * proportional to the size. If we can characterize at least the relative
+ * costs of these different types of IOs, it should be possible to
+ * implement a reasonable work-conserving proportional IO resource
+ * distribution.
+ *
+ * 1. IO Cost Model
+ *
+ * IO cost model estimates the cost of an IO given its basic parameters and
+ * history (e.g. the end sector of the last IO). The cost is measured in
+ * device time. If a given IO is estimated to cost 10ms, the device should
+ * be able to process ~100 of those IOs in a second.
+ *
+ * Currently, there's only one builtin cost model - linear. Each IO is
+ * classified as sequential or random and given a base cost accordingly.
+ * On top of that, a size cost proportional to the length of the IO is
+ * added. While simple, this model captures the operational
+ * characteristics of a wide variety of devices well enough. Default
+ * parameters for several different classes of devices are provided and the
+ * parameters can be configured from userspace via
+ * /sys/fs/cgroup/io.cost.model.
+ *
+ * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
+ * device-specific coefficients.
+ *
+ * 2. Control Strategy
+ *
+ * The device virtual time (vtime) is used as the primary control metric.
+ * The control strategy is composed of the following three parts.
+ *
+ * 2-1. Vtime Distribution
+ *
+ * When a cgroup becomes active in terms of IOs, its hierarchical share is
+ * calculated. Please consider the following hierarchy where the numbers
+ * inside parentheses denote the configured weights.
+ *
+ *                  root
+ *                /       \
+ *             A (w:100)  B (w:300)
+ *            /       \
+ *          A0 (w:100) A1 (w:100)
+ *
+ * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
+ * of equal weight, each gets 50% share. If then B starts issuing IOs, B
+ * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
+ * 12.5% each. The distribution mechanism only cares about these flattened
+ * shares. They're called hweights (hierarchical weights) and always add
+ * up to 1 (HWEIGHT_WHOLE).
+ *
+ * A given cgroup's vtime runs slower in inverse proportion to its hweight.
+ * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
+ * against the device vtime - an IO which takes 10ms on the underlying
+ * device is considered to take 80ms on A0.
+ *
+ * This constitutes the basis of IO capacity distribution. Each cgroup's
+ * vtime is running at a rate determined by its hweight. A cgroup tracks
+ * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * wouldn't outrun the current device vtime. Otherwise, the IO is
+ * suspended until the vtime has progressed enough to cover it.
+ *
+ * 2-2. Vrate Adjustment
+ *
+ * It's unrealistic to expect the cost model to be perfect. There are too
+ * many devices and even on the same device the overall performance
+ * fluctuates depending on numerous factors such as IO mixture and device
+ * internal garbage collection. The controller needs to adapt dynamically.
+ *
+ * This is achieved by adjusting the overall IO rate according to how busy
+ * the device is. If the device becomes overloaded, we're sending down too
+ * many IOs and should generally slow down. If there are waiting issuers
+ * but the device isn't saturated, we're issuing too few and should
+ * generally speed up.
+ *
+ * To slow down, we lower the vrate - the rate at which the device vtime
+ * passes compared to the wall clock. For example, if the vtime is running
+ * at the vrate of 75%, all cgroups added up would only be able to issue
+ * 750ms worth of IOs per second, and vice-versa for speeding up.
+ *
+ * Device busyness is determined using two criteria - rq wait and
+ * completion latencies.
+ *
+ * When a device gets saturated, the on-device and then the request queues
+ * fill up and a bio which is ready to be issued has to wait for a request
+ * to become available. When this delay becomes noticeable, it's a clear
+ * indication that the device is saturated and we lower the vrate. This
+ * saturation signal is fairly conservative as it only triggers when both
+ * hardware and software queues are filled up, and is used as the default
+ * busy signal.
+ *
+ * As devices can have deep queues and be unfair in how the queued commands
+ * are executed, solely depending on rq wait may not result in satisfactory
+ * control quality. For a better control quality, completion latency QoS
+ * parameters can be configured so that the device is considered saturated
+ * if N'th percentile completion latency rises above the set point.
+ *
+ * The completion latency requirements are a function of both the
+ * underlying device characteristics and the desired IO latency quality of
+ * service. There is an inherent trade-off - the tighter the latency QoS,
+ * the higher the bandwidth loss. Latency QoS is disabled by default
+ * and can be set through /sys/fs/cgroup/io.cost.qos.
+ *
+ * 2-3. Work Conservation
+ *
+ * Imagine two cgroups A and B with equal weights. A is issuing a small IO
+ * periodically while B is sending out enough parallel IOs to saturate the
+ * device on its own. Let's say A's usage amounts to 100ms worth of IO
+ * cost per second, i.e., 10% of the device capacity. The naive
+ * distribution of half and half would lead to 60% utilization of the
+ * device, a significant reduction in the total amount of work done
+ * compared to free-for-all competition. This is too high a cost to pay
+ * for IO control.
+ *
+ * To conserve the total amount of work done, we keep track of how much
+ * each active cgroup is actually using and yield part of its weight if
+ * there are other cgroups which can make use of it. In the above case,
+ * A's weight will be lowered so that it hovers above the actual usage and
+ * B would be able to use the rest.
+ *
+ * As we don't want to penalize a cgroup for donating its weight, the
+ * surplus weight adjustment factors in a margin and has an immediate
+ * snapback mechanism in case the cgroup needs more IO vtime for itself.
+ *
+ * Note that adjusting down surplus weights has the same effects as
+ * accelerating vtime for other cgroups and work conservation can also be
+ * implemented by adjusting vrate dynamically. However, working out who can
+ * donate and who should take back how much requires hweight propagation
+ * anyway, making it easier to implement and understand as a separate
+ * mechanism.
+ *
+ * 3. Monitoring
+ *
+ * Instead of debugfs or other clumsy monitoring mechanisms, this
+ * controller uses a drgn based monitoring script -
+ * tools/cgroup/iocost_monitor.py. For details on drgn, please see
+ * https://github.com/osandov/drgn. The output looks like the following.
+ *
+ * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
+ * active weight hweight% inflt% dbt delay usages%
+ * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
+ * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
+ *
+ * - per : Timer period
+ * - cur_per : Internal wall and device vtime clock
+ * - vrate : Device virtual time rate against wall clock
+ * - weight : Surplus-adjusted and configured weights
+ * - hweight : Surplus-adjusted and configured hierarchical weights
+ * - inflt : The percentage of in-flight IO cost at the end of last period
+ * - del_ms : Deferred issuer delay induction level and duration
+ * - usages : Usage history
+ */
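+
+/*
+ * Worked example (illustrative, not part of the original comment): in the
+ * hierarchy from section 2-1, A0's 12.5% hierarchical share corresponds to an
+ * hweight_inuse of HWEIGHT_WHOLE / 8 == 8192, so an IO with absolute cost C
+ * is charged C * HWEIGHT_WHOLE / 8192 == 8 * C against A0's vtime - the same
+ * 8x slowdown described above.
+ */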
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/time64.h>
+#include <linux/parser.h>
+#include <linux/sched/signal.h>
+#include <linux/blk-cgroup.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
+
+#ifdef CONFIG_TRACEPOINTS
+
+/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
+#define TRACE_IOCG_PATH_LEN 1024
+static DEFINE_SPINLOCK(trace_iocg_path_lock);
+static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
+
+#define TRACE_IOCG_PATH(type, iocg, ...) \
+ do { \
+ unsigned long flags; \
+ if (trace_iocost_##type##_enabled()) { \
+ spin_lock_irqsave(&trace_iocg_path_lock, flags); \
+ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
+ trace_iocg_path, TRACE_IOCG_PATH_LEN); \
+ trace_iocost_##type(iocg, trace_iocg_path, \
+ ##__VA_ARGS__); \
+ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
+ } \
+ } while (0)
+
+#else /* CONFIG_TRACEPOINTS */
+#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
+#endif /* CONFIG_TRACEPOINTS */
+
+enum {
+ MILLION = 1000000,
+
+ /* timer period is calculated from latency requirements, bound it */
+ MIN_PERIOD = USEC_PER_MSEC,
+ MAX_PERIOD = USEC_PER_SEC,
+
+ /*
+ * A cgroup's vtime can run 50% behind the device vtime, which
+ * serves as its IO credit buffer. Surplus weight adjustment is
+ * immediately canceled if the vtime margin runs below 10%.
+ */
+ MARGIN_PCT = 50,
+ INUSE_MARGIN_PCT = 10,
+
+ /* Have some play in waitq timer operations */
+ WAITQ_TIMER_MARGIN_PCT = 5,
+
+ /*
+ * vtime can wrap well within a reasonable uptime when vrate is
+ * consistently raised. Don't trust recorded cgroup vtime if the
+ * period counter indicates that it's older than 5mins.
+ */
+ VTIME_VALID_DUR = 300 * USEC_PER_SEC,
+
+ /*
+ * Remember the past three non-zero usages and use the max for
+ * surplus calculation. Three slots guarantee that we remember one
+ * full period usage from the last active stretch even after
+ * partial deactivation and re-activation periods. Don't start
+ * giving away weight before collecting two data points to prevent
+ * hweight adjustments based on one partial activation period.
+ */
+ NR_USAGE_SLOTS = 3,
+ MIN_VALID_USAGES = 2,
+
+ /* 1/64k is granular enough and can easily be handled w/ u32 */
+ HWEIGHT_WHOLE = 1 << 16,
+
+ /*
+ * As vtime is used to calculate the cost of each IO, it needs to
+ * be fairly high precision. For example, it should be able to
+ * represent the cost of a single page worth of discard with
+ * sufficient accuracy. At the same time, it should be able to
+ * represent reasonably long enough durations to be useful and
+ * convenient during operation.
+ *
+ * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
+ * granularity and days of wrap-around time even at extreme vrates.
+ */
+ VTIME_PER_SEC_SHIFT = 37,
+ VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
+ VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
+ VTIME_PER_NSEC = VTIME_PER_SEC / NSEC_PER_SEC,
+
+ /* bound vrate adjustments within two orders of magnitude */
+ VRATE_MIN_PPM = 10000, /* 1% */
+ VRATE_MAX_PPM = 100000000, /* 10000% */
+
+ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
+ VRATE_CLAMP_ADJ_PCT = 4,
+
+ /* if IOs end up waiting for requests, issue less */
+ RQ_WAIT_BUSY_PCT = 5,
+
+ /* unbusy hysteresis */
+ UNBUSY_THR_PCT = 75,
+
+ /* don't let cmds which take a very long time pin lagging for too long */
+ MAX_LAGGING_PERIODS = 10,
+
+ /*
+ * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
+ * donate the surplus.
+ */
+ SURPLUS_SCALE_PCT = 125, /* * 125% */
+ SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
+ SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
+
+ /* switch iff the conditions are met for longer than this */
+ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
+
+ /*
+ * Count IO size in 4k pages. The 12bit shift helps keep
+ * size-proportional components of cost calculation in closer
+ * numbers of digits to per-IO cost components.
+ */
+ IOC_PAGE_SHIFT = 12,
+ IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
+ IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
+
+ /* if apart further than 16M, consider randio for linear model */
+ LCOEF_RANDIO_PAGES = 4096,
+};
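+
+/*
+ * Illustrative example (not part of the original code): at the nominal vrate
+ * of VTIME_PER_USEC, one second of device time corresponds to VTIME_PER_SEC
+ * (2^37) vtime units, so a 10ms IO costs roughly
+ * 10 * USEC_PER_MSEC * VTIME_PER_USEC == VTIME_PER_SEC / 100 vtime units.
+ */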
+
+enum ioc_running {
+ IOC_IDLE,
+ IOC_RUNNING,
+ IOC_STOP,
+};
+
+/* io.cost.qos controls including per-dev enable of the whole controller */
+enum {
+ QOS_ENABLE,
+ QOS_CTRL,
+ NR_QOS_CTRL_PARAMS,
+};
+
+/* io.cost.qos params */
+enum {
+ QOS_RPPM,
+ QOS_RLAT,
+ QOS_WPPM,
+ QOS_WLAT,
+ QOS_MIN,
+ QOS_MAX,
+ NR_QOS_PARAMS,
+};
+
+/* io.cost.model controls */
+enum {
+ COST_CTRL,
+ COST_MODEL,
+ NR_COST_CTRL_PARAMS,
+};
+
+/* builtin linear cost model coefficients */
+enum {
+ I_LCOEF_RBPS,
+ I_LCOEF_RSEQIOPS,
+ I_LCOEF_RRANDIOPS,
+ I_LCOEF_WBPS,
+ I_LCOEF_WSEQIOPS,
+ I_LCOEF_WRANDIOPS,
+ NR_I_LCOEFS,
+};
+
+enum {
+ LCOEF_RPAGE,
+ LCOEF_RSEQIO,
+ LCOEF_RRANDIO,
+ LCOEF_WPAGE,
+ LCOEF_WSEQIO,
+ LCOEF_WRANDIO,
+ NR_LCOEFS,
+};
+
+enum {
+ AUTOP_INVALID,
+ AUTOP_HDD,
+ AUTOP_SSD_QD1,
+ AUTOP_SSD_DFL,
+ AUTOP_SSD_FAST,
+};
+
+struct ioc_gq;
+
+struct ioc_params {
+ u32 qos[NR_QOS_PARAMS];
+ u64 i_lcoefs[NR_I_LCOEFS];
+ u64 lcoefs[NR_LCOEFS];
+ u32 too_fast_vrate_pct;
+ u32 too_slow_vrate_pct;
+};
+
+struct ioc_missed {
+ u32 nr_met;
+ u32 nr_missed;
+ u32 last_met;
+ u32 last_missed;
+};
+
+struct ioc_pcpu_stat {
+ struct ioc_missed missed[2];
+
+ u64 rq_wait_ns;
+ u64 last_rq_wait_ns;
+};
+
+/* per device */
+struct ioc {
+ struct rq_qos rqos;
+
+ bool enabled;
+
+ struct ioc_params params;
+ u32 period_us;
+ u32 margin_us;
+ u64 vrate_min;
+ u64 vrate_max;
+
+ spinlock_t lock;
+ struct timer_list timer;
+ struct list_head active_iocgs; /* active cgroups */
+ struct ioc_pcpu_stat __percpu *pcpu_stat;
+
+ enum ioc_running running;
+ atomic64_t vtime_rate;
+
+ seqcount_t period_seqcount;
+ u32 period_at; /* wallclock starttime */
+ u64 period_at_vtime; /* vtime starttime */
+
+ atomic64_t cur_period; /* inc'd each period */
+ int busy_level; /* saturation history */
+
+ u64 inuse_margin_vtime;
+ bool weights_updated;
+ atomic_t hweight_gen; /* for lazy hweights */
+
+ u64 autop_too_fast_at;
+ u64 autop_too_slow_at;
+ int autop_idx;
+ bool user_qos_params:1;
+ bool user_cost_model:1;
+};
+
+/* per device-cgroup pair */
+struct ioc_gq {
+ struct blkg_policy_data pd;
+ struct ioc *ioc;
+
+ /*
+ * An iocg can get its weight from two sources - an explicit
+ * per-device-cgroup configuration or the default weight of the
+ * cgroup. `cfg_weight` is the explicit per-device-cgroup
+ * configuration. `weight` is the effective weight considering both
+ * sources.
+ *
+ * When an idle cgroup becomes active its `active` goes from 0 to
+ * `weight`. `inuse` is the surplus adjusted active weight.
+ * `active` and `inuse` are used to calculate `hweight_active` and
+ * `hweight_inuse`.
+ *
+ * `last_inuse` remembers `inuse` while an iocg is idle to persist
+ * surplus adjustments.
+ */
+ u32 cfg_weight;
+ u32 weight;
+ u32 active;
+ u32 inuse;
+ u32 last_inuse;
+
+ sector_t cursor; /* to detect randio */
+
+ /*
+ * `vtime` is this iocg's vtime cursor which progresses as IOs are
+ * issued. If lagging behind device vtime, the delta represents
+ * the currently available IO budget. If running ahead, the
+ * overage.
+ *
+ * `done_vtime` is the same but progressed on completion rather
+ * than issue. The delta behind `vtime` represents the cost of
+ * currently in-flight IOs.
+ *
+ * `last_vtime` is used to remember `vtime` at the end of the last
+ * period to calculate utilization.
+ */
+ atomic64_t vtime;
+ atomic64_t done_vtime;
+ u64 abs_vdebt;
+ u64 last_vtime;
+
+ /*
+ * The period this iocg was last active in. Used for deactivation
+ * and invalidating `vtime`.
+ */
+ atomic64_t active_period;
+ struct list_head active_list;
+
+ /* see __propagate_active_weight() and current_hweight() for details */
+ u64 child_active_sum;
+ u64 child_inuse_sum;
+ int hweight_gen;
+ u32 hweight_active;
+ u32 hweight_inuse;
+ bool has_surplus;
+
+ struct wait_queue_head waitq;
+ struct hrtimer waitq_timer;
+ struct hrtimer delay_timer;
+
+ /* usage is recorded as fractions of HWEIGHT_WHOLE */
+ int usage_idx;
+ u32 usages[NR_USAGE_SLOTS];
+
+ /* this iocg's depth in the hierarchy and ancestors including self */
+ int level;
+ struct ioc_gq *ancestors[];
+};
+
+/* per cgroup */
+struct ioc_cgrp {
+ struct blkcg_policy_data cpd;
+ unsigned int dfl_weight;
+};
+
+struct ioc_now {
+ u64 now_ns;
+ u32 now;
+ u64 vnow;
+ u64 vrate;
+};
+
+struct iocg_wait {
+ struct wait_queue_entry wait;
+ struct bio *bio;
+ u64 abs_cost;
+ bool committed;
+};
+
+struct iocg_wake_ctx {
+ struct ioc_gq *iocg;
+ u32 hw_inuse;
+ s64 vbudget;
+};
+
+static const struct ioc_params autop[] = {
+ [AUTOP_HDD] = {
+ .qos = {
+ [QOS_RLAT] = 250000, /* 250ms */
+ [QOS_WLAT] = 250000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 174019176,
+ [I_LCOEF_RSEQIOPS] = 41708,
+ [I_LCOEF_RRANDIOPS] = 370,
+ [I_LCOEF_WBPS] = 178075866,
+ [I_LCOEF_WSEQIOPS] = 42705,
+ [I_LCOEF_WRANDIOPS] = 378,
+ },
+ },
+ [AUTOP_SSD_QD1] = {
+ .qos = {
+ [QOS_RLAT] = 25000, /* 25ms */
+ [QOS_WLAT] = 25000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 245855193,
+ [I_LCOEF_RSEQIOPS] = 61575,
+ [I_LCOEF_RRANDIOPS] = 6946,
+ [I_LCOEF_WBPS] = 141365009,
+ [I_LCOEF_WSEQIOPS] = 33716,
+ [I_LCOEF_WRANDIOPS] = 26796,
+ },
+ },
+ [AUTOP_SSD_DFL] = {
+ .qos = {
+ [QOS_RLAT] = 25000, /* 25ms */
+ [QOS_WLAT] = 25000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 488636629,
+ [I_LCOEF_RSEQIOPS] = 8932,
+ [I_LCOEF_RRANDIOPS] = 8518,
+ [I_LCOEF_WBPS] = 427891549,
+ [I_LCOEF_WSEQIOPS] = 28755,
+ [I_LCOEF_WRANDIOPS] = 21940,
+ },
+ .too_fast_vrate_pct = 500,
+ },
+ [AUTOP_SSD_FAST] = {
+ .qos = {
+ [QOS_RLAT] = 5000, /* 5ms */
+ [QOS_WLAT] = 5000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 3102524156LLU,
+ [I_LCOEF_RSEQIOPS] = 724816,
+ [I_LCOEF_RRANDIOPS] = 778122,
+ [I_LCOEF_WBPS] = 1742780862LLU,
+ [I_LCOEF_WSEQIOPS] = 425702,
+ [I_LCOEF_WRANDIOPS] = 443193,
+ },
+ .too_slow_vrate_pct = 10,
+ },
+};
+
+/*
+ * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
+ * vtime credit shortage and down on device saturation.
+ */
+static u32 vrate_adj_pct[] =
+ { 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
+
+static struct blkcg_policy blkcg_policy_iocost;
+
+/* accessors and helpers */
+static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct ioc, rqos);
+}
+
+static struct ioc *q_to_ioc(struct request_queue *q)
+{
+ return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
+}
+
+static const char *q_name(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return kobject_name(q->kobj.parent);
+ else
+ return "<unknown>";
+}
+
+static const char __maybe_unused *ioc_name(struct ioc *ioc)
+{
+ return q_name(ioc->rqos.q);
+}
+
+static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
+}
+
+static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
+{
+ return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
+}
+
+static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
+{
+ return pd_to_blkg(&iocg->pd);
+}
+
+static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
+{
+ return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
+ struct ioc_cgrp, cpd);
+}
+
+/*
+ * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
+ * weight, the more expensive each IO. Must round up.
+ */
+static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
+{
+ return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
+}
+
+/*
+ * The inverse of abs_cost_to_cost(). Must round up.
+ */
+static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
+{
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+}
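+
+/*
+ * Illustrative example (not part of the original code): with hw_inuse at 25%
+ * of HWEIGHT_WHOLE (1 << 14), abs_cost_to_cost(1000, 1 << 14) == 4000 and
+ * cost_to_abs_cost(4000, 1 << 14) == 1000 - the lower the hierarchical share,
+ * the more vtime each unit of absolute cost consumes.
+ */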
+
+static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
+{
+ bio->bi_iocost_cost = cost;
+ atomic64_add(cost, &iocg->vtime);
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/iocost.h>
+
+/* latency QoS params changed, update period_us and all the dependent params */
+static void ioc_refresh_period_us(struct ioc *ioc)
+{
+ u32 ppm, lat, multi, period_us;
+
+ lockdep_assert_held(&ioc->lock);
+
+ /* pick the higher latency target */
+ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
+ ppm = ioc->params.qos[QOS_RPPM];
+ lat = ioc->params.qos[QOS_RLAT];
+ } else {
+ ppm = ioc->params.qos[QOS_WPPM];
+ lat = ioc->params.qos[QOS_WLAT];
+ }
+
+ /*
+ * We want the period to be long enough to contain a healthy number
+ * of IOs while short enough for granular control. Define it as a
+ * multiple of the latency target. Ideally, the multiplier should
+ * be scaled according to the percentile so that it would nominally
+ * contain a certain number of requests. Let's be simpler and
+ * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
+ */
+ if (ppm)
+ multi = max_t(u32, (MILLION - ppm) / 50000, 2);
+ else
+ multi = 2;
+ period_us = multi * lat;
+ period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
+
+ /* calculate dependent params */
+ ioc->period_us = period_us;
+ ioc->margin_us = period_us * MARGIN_PCT / 100;
+ ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
+ period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
+}
+
+static int ioc_autop_idx(struct ioc *ioc)
+{
+ int idx = ioc->autop_idx;
+ const struct ioc_params *p = &autop[idx];
+ u32 vrate_pct;
+ u64 now_ns;
+
+ /* rotational? */
+ if (!blk_queue_nonrot(ioc->rqos.q))
+ return AUTOP_HDD;
+
+ /* handle SATA SSDs w/ broken NCQ */
+ if (blk_queue_depth(ioc->rqos.q) == 1)
+ return AUTOP_SSD_QD1;
+
+ /* use one of the normal ssd sets */
+ if (idx < AUTOP_SSD_DFL)
+ return AUTOP_SSD_DFL;
+
+ /* if user is overriding anything, maintain what was there */
+ if (ioc->user_qos_params || ioc->user_cost_model)
+ return idx;
+
+ /* step up/down based on the vrate */
+ vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
+ VTIME_PER_USEC);
+ now_ns = ktime_get_ns();
+
+ if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
+ if (!ioc->autop_too_fast_at)
+ ioc->autop_too_fast_at = now_ns;
+ if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
+ return idx + 1;
+ } else {
+ ioc->autop_too_fast_at = 0;
+ }
+
+ if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
+ if (!ioc->autop_too_slow_at)
+ ioc->autop_too_slow_at = now_ns;
+ if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
+ return idx - 1;
+ } else {
+ ioc->autop_too_slow_at = 0;
+ }
+
+ return idx;
+}
+
+/*
+ * Take the following as input
+ *
+ * @bps maximum sequential throughput
+ * @seqiops maximum sequential 4k iops
+ * @randiops maximum random 4k iops
+ *
+ * and calculate the linear model cost coefficients.
+ *
+ * *@page per-page cost 1s / (@bps / 4096)
+ * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
+ * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
+ */
+static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
+ u64 *page, u64 *seqio, u64 *randio)
+{
+ u64 v;
+
+ *page = *seqio = *randio = 0;
+
+ if (bps)
+ *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
+ DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
+
+ if (seqiops) {
+ v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
+ if (v > *page)
+ *seqio = v - *page;
+ }
+
+ if (randiops) {
+ v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
+ if (v > *page)
+ *randio = v - *page;
+ }
+}
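+
+/*
+ * Illustrative example (not part of the original code): for a device that
+ * does 400 MiB/s sequentially, @bps / 4096 == 102400 pages/s, so the per-page
+ * cost is VTIME_PER_SEC / 102400; the seq and rand base costs are whatever is
+ * left of 1s / @seqiops and 1s / @randiops after that per-page component is
+ * subtracted (clamped at zero).
+ */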
+
+static void ioc_refresh_lcoefs(struct ioc *ioc)
+{
+ u64 *u = ioc->params.i_lcoefs;
+ u64 *c = ioc->params.lcoefs;
+
+ calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+ &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
+ calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
+ &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
+}
+
+static bool ioc_refresh_params(struct ioc *ioc, bool force)
+{
+ const struct ioc_params *p;
+ int idx;
+
+ lockdep_assert_held(&ioc->lock);
+
+ idx = ioc_autop_idx(ioc);
+ p = &autop[idx];
+
+ if (idx == ioc->autop_idx && !force)
+ return false;
+
+ if (idx != ioc->autop_idx)
+ atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+
+ ioc->autop_idx = idx;
+ ioc->autop_too_fast_at = 0;
+ ioc->autop_too_slow_at = 0;
+
+ if (!ioc->user_qos_params)
+ memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
+ if (!ioc->user_cost_model)
+ memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
+
+ ioc_refresh_period_us(ioc);
+ ioc_refresh_lcoefs(ioc);
+
+ ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
+ VTIME_PER_USEC, MILLION);
+ ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
+ VTIME_PER_USEC, MILLION);
+
+ return true;
+}
+
+/* take a snapshot of the current [v]time and vrate */
+static void ioc_now(struct ioc *ioc, struct ioc_now *now)
+{
+ unsigned seq;
+
+ now->now_ns = ktime_get();
+ now->now = ktime_to_us(now->now_ns);
+ now->vrate = atomic64_read(&ioc->vtime_rate);
+
+ /*
+ * The current vtime is
+ *
+ * vtime at period start + (wallclock time since the start) * vrate
+ *
+ * As a consistent snapshot of `period_at_vtime` and `period_at` is
+ * needed, they're seqcount protected.
+ */
+ do {
+ seq = read_seqcount_begin(&ioc->period_seqcount);
+ now->vnow = ioc->period_at_vtime +
+ (now->now - ioc->period_at) * now->vrate;
+ } while (read_seqcount_retry(&ioc->period_seqcount, seq));
+}
+
+static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
+{
+ lockdep_assert_held(&ioc->lock);
+ WARN_ON_ONCE(ioc->running != IOC_RUNNING);
+
+ write_seqcount_begin(&ioc->period_seqcount);
+ ioc->period_at = now->now;
+ ioc->period_at_vtime = now->vnow;
+ write_seqcount_end(&ioc->period_seqcount);
+
+ ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
+ add_timer(&ioc->timer);
+}
+
+/*
+ * Update @iocg's `active` and `inuse` to @active and @inuse, update level
+ * weight sums and propagate upwards accordingly.
+ */
+static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+{
+ struct ioc *ioc = iocg->ioc;
+ int lvl;
+
+ lockdep_assert_held(&ioc->lock);
+
+ inuse = min(active, inuse);
+
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+ u32 parent_active = 0, parent_inuse = 0;
+
+ /* update the level sums */
+ parent->child_active_sum += (s32)(active - child->active);
+ parent->child_inuse_sum += (s32)(inuse - child->inuse);
+ /* apply the updates */
+ child->active = active;
+ child->inuse = inuse;
+
+ /*
+ * The delta between the inuse and active sums indicates how
+ * much weight is being given away. Parent's inuse and active
+ * should reflect the ratio.
+ */
+ if (parent->child_active_sum) {
+ parent_active = parent->weight;
+ parent_inuse = DIV64_U64_ROUND_UP(
+ parent_active * parent->child_inuse_sum,
+ parent->child_active_sum);
+ }
+
+ /* do we need to keep walking up? */
+ if (parent_active == parent->active &&
+ parent_inuse == parent->inuse)
+ break;
+
+ active = parent_active;
+ inuse = parent_inuse;
+ }
+
+ ioc->weights_updated = true;
+}
+
+static void commit_active_weights(struct ioc *ioc)
+{
+ lockdep_assert_held(&ioc->lock);
+
+ if (ioc->weights_updated) {
+ /* paired with rmb in current_hweight(), see there */
+ smp_wmb();
+ atomic_inc(&ioc->hweight_gen);
+ ioc->weights_updated = false;
+ }
+}
+
+static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+{
+ __propagate_active_weight(iocg, active, inuse);
+ commit_active_weights(iocg->ioc);
+}
+
+static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
+{
+ struct ioc *ioc = iocg->ioc;
+ int lvl;
+ u32 hwa, hwi;
+ int ioc_gen;
+
+ /* hot path - if uptodate, use cached */
+ ioc_gen = atomic_read(&ioc->hweight_gen);
+ if (ioc_gen == iocg->hweight_gen)
+ goto out;
+
+ /*
+ * Paired with wmb in commit_active_weights(). If we saw the
+ * updated hweight_gen, all the weight updates from
+ * __propagate_active_weight() are visible too.
+ *
+ * We can race with weight updates during calculation and get it
+ * wrong. However, hweight_gen would have changed and a future
+ * reader will recalculate and we're guaranteed to discard the
+ * wrong result soon.
+ */
+ smp_rmb();
+
+ hwa = hwi = HWEIGHT_WHOLE;
+ for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+ u32 active_sum = READ_ONCE(parent->child_active_sum);
+ u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
+ u32 active = READ_ONCE(child->active);
+ u32 inuse = READ_ONCE(child->inuse);
+
+ /* we can race with deactivations and either may read as zero */
+ if (!active_sum || !inuse_sum)
+ continue;
+
+ active_sum = max(active, active_sum);
+ hwa = hwa * active / active_sum; /* max 16bits * 10000 */
+
+ inuse_sum = max(inuse, inuse_sum);
+ hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
+ }
+
+ iocg->hweight_active = max_t(u32, hwa, 1);
+ iocg->hweight_inuse = max_t(u32, hwi, 1);
+ iocg->hweight_gen = ioc_gen;
+out:
+ if (hw_activep)
+ *hw_activep = iocg->hweight_active;
+ if (hw_inusep)
+ *hw_inusep = iocg->hweight_inuse;
+}
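+
+/*
+ * Example: an iocg two levels below the root whose active weight is half
+ * of child_active_sum at each level ends up with
+ * hweight_active = HWEIGHT_WHOLE / 4; hweight_inuse is derived the same
+ * way from the inuse ratios.
+ */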
+
+static void weight_updated(struct ioc_gq *iocg)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
+ u32 weight;
+
+ lockdep_assert_held(&ioc->lock);
+
+ weight = iocg->cfg_weight ?: iocc->dfl_weight;
+ if (weight != iocg->weight && iocg->active)
+ propagate_active_weight(iocg, weight,
+ DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
+ iocg->weight = weight;
+}
+
+static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 last_period, cur_period, max_period_delta;
+ u64 vtime, vmargin, vmin;
+ int i;
+
+ /*
+	 * If we already seem to be active, just update the stamp to tell the
+	 * timer that we're still active. We don't mind occasional races.
+ */
+ if (!list_empty(&iocg->active_list)) {
+ ioc_now(ioc, now);
+ cur_period = atomic64_read(&ioc->cur_period);
+ if (atomic64_read(&iocg->active_period) != cur_period)
+ atomic64_set(&iocg->active_period, cur_period);
+ return true;
+ }
+
+ /* racy check on internal node IOs, treat as root level IOs */
+ if (iocg->child_active_sum)
+ return false;
+
+ spin_lock_irq(&ioc->lock);
+
+ ioc_now(ioc, now);
+
+ /* update period */
+ cur_period = atomic64_read(&ioc->cur_period);
+ last_period = atomic64_read(&iocg->active_period);
+ atomic64_set(&iocg->active_period, cur_period);
+
+ /* already activated or breaking leaf-only constraint? */
+ if (!list_empty(&iocg->active_list))
+ goto succeed_unlock;
+ for (i = iocg->level - 1; i > 0; i--)
+ if (!list_empty(&iocg->ancestors[i]->active_list))
+ goto fail_unlock;
+
+ if (iocg->child_active_sum)
+ goto fail_unlock;
+
+ /*
+ * vtime may wrap when vrate is raised substantially due to
+ * underestimated IO costs. Look at the period and ignore its
+ * vtime if the iocg has been idle for too long. Also, cap the
+ * budget it can start with to the margin.
+ */
+ max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
+ vtime = atomic64_read(&iocg->vtime);
+ vmargin = ioc->margin_us * now->vrate;
+ vmin = now->vnow - vmargin;
+
+ if (last_period + max_period_delta < cur_period ||
+ time_before64(vtime, vmin)) {
+ atomic64_add(vmin - vtime, &iocg->vtime);
+ atomic64_add(vmin - vtime, &iocg->done_vtime);
+ vtime = vmin;
+ }
+
+ /*
+ * Activate, propagate weight and start period timer if not
+ * running. Reset hweight_gen to avoid accidental match from
+ * wrapping.
+ */
+ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
+ list_add(&iocg->active_list, &ioc->active_iocgs);
+ propagate_active_weight(iocg, iocg->weight,
+ iocg->last_inuse ?: iocg->weight);
+
+ TRACE_IOCG_PATH(iocg_activate, iocg, now,
+ last_period, cur_period, vtime);
+
+ iocg->last_vtime = vtime;
+
+ if (ioc->running == IOC_IDLE) {
+ ioc->running = IOC_RUNNING;
+ ioc_start_period(ioc, now);
+ }
+
+succeed_unlock:
+ spin_unlock_irq(&ioc->lock);
+ return true;
+
+fail_unlock:
+ spin_unlock_irq(&ioc->lock);
+ return false;
+}
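+
+/*
+ * Note: because vtime is clamped to vmin = vnow - margin_us * vrate
+ * above, an iocg that has been idle for many periods wakes up with at
+ * most one margin's worth of budget instead of everything it would have
+ * "saved" while idle.
+ */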
+
+static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
+ int flags, void *key)
+{
+ struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
+ struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
+ u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
+
+ ctx->vbudget -= cost;
+
+ if (ctx->vbudget < 0)
+ return -1;
+
+ iocg_commit_bio(ctx->iocg, wait->bio, cost);
+
+ /*
+ * autoremove_wake_function() removes the wait entry only when it
+	 * actually changed the task state. We always want the wait entry
+	 * removed. Remove it explicitly and use default_wake_function().
+ */
+ list_del_init(&wq_entry->entry);
+ wait->committed = true;
+
+ default_wake_function(wq_entry, mode, flags, key);
+ return 0;
+}
+
+static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct iocg_wake_ctx ctx = { .iocg = iocg };
+ u64 margin_ns = (u64)(ioc->period_us *
+ WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
+ u64 vdebt, vshortage, expires, oexpires;
+ s64 vbudget;
+ u32 hw_inuse;
+
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ current_hweight(iocg, NULL, &hw_inuse);
+ vbudget = now->vnow - atomic64_read(&iocg->vtime);
+
+ /* pay off debt */
+ vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
+ if (vdebt && vbudget > 0) {
+ u64 delta = min_t(u64, vbudget, vdebt);
+ u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
+ iocg->abs_vdebt);
+
+ atomic64_add(delta, &iocg->vtime);
+ atomic64_add(delta, &iocg->done_vtime);
+ iocg->abs_vdebt -= abs_delta;
+ }
+
+ /*
+ * Wake up the ones which are due and see how much vtime we'll need
+ * for the next one.
+ */
+ ctx.hw_inuse = hw_inuse;
+ ctx.vbudget = vbudget - vdebt;
+ __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
+ if (!waitqueue_active(&iocg->waitq))
+ return;
+ if (WARN_ON_ONCE(ctx.vbudget >= 0))
+ return;
+
+ /* determine next wakeup, add a quarter margin to guarantee chunking */
+ vshortage = -ctx.vbudget;
+ expires = now->now_ns +
+ DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
+ expires += margin_ns / 4;
+
+ /* if already active and close enough, don't bother */
+ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
+ if (hrtimer_is_queued(&iocg->waitq_timer) &&
+ abs(oexpires - expires) <= margin_ns / 4)
+ return;
+
+ hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
+ margin_ns / 4, HRTIMER_MODE_ABS);
+}
+
+static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
+{
+ struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
+ struct ioc_now now;
+ unsigned long flags;
+
+ ioc_now(iocg->ioc, &now);
+
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ iocg_kick_waitq(iocg, &now);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+
+ return HRTIMER_NORESTART;
+}
+
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ u64 vtime = atomic64_read(&iocg->vtime);
+ u64 vmargin = ioc->margin_us * now->vrate;
+ u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
+ u64 delta_ns, expires, oexpires;
+ u32 hw_inuse;
+
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /* debt-adjust vtime */
+ current_hweight(iocg, NULL, &hw_inuse);
+ vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
+
+ /*
+ * Clear or maintain depending on the overage. Non-zero vdebt is what
+ * guarantees that @iocg is online and future iocg_kick_delay() will
+ * clear use_delay. Don't leave it on when there's no vdebt.
+ */
+ if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
+ blkcg_clear_delay(blkg);
+ return false;
+ }
+ if (!atomic_read(&blkg->use_delay) &&
+ time_before_eq64(vtime, now->vnow + vmargin))
+ return false;
+
+ /* use delay */
+ delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
+ now->vrate) * NSEC_PER_USEC;
+ blkcg_set_delay(blkg, delta_ns);
+ expires = now->now_ns + delta_ns;
+
+ /* if already active and close enough, don't bother */
+ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
+ if (hrtimer_is_queued(&iocg->delay_timer) &&
+ abs(oexpires - expires) <= margin_ns / 4)
+ return true;
+
+ hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
+ margin_ns / 4, HRTIMER_MODE_ABS);
+ return true;
+}
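+
+/*
+ * Example: the delay set above is the wall-clock time needed to pay off
+ * the debt-adjusted vtime overage at the current vrate, so an overage
+ * of 10 * VTIME_PER_USEC at nominal vrate maps to a 10us use_delay.
+ */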
+
+static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
+{
+ struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
+ struct ioc_now now;
+ unsigned long flags;
+
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ ioc_now(iocg->ioc, &now);
+ iocg_kick_delay(iocg, &now);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+
+ return HRTIMER_NORESTART;
+}
+
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+{
+ u32 nr_met[2] = { };
+ u32 nr_missed[2] = { };
+ u64 rq_wait_ns = 0;
+ int cpu, rw;
+
+ for_each_online_cpu(cpu) {
+ struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
+ u64 this_rq_wait_ns;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
+ u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
+
+ nr_met[rw] += this_met - stat->missed[rw].last_met;
+ nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
+ stat->missed[rw].last_met = this_met;
+ stat->missed[rw].last_missed = this_missed;
+ }
+
+ this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
+ rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
+ stat->last_rq_wait_ns = this_rq_wait_ns;
+ }
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ if (nr_met[rw] + nr_missed[rw])
+ missed_ppm_ar[rw] =
+ DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
+ nr_met[rw] + nr_missed[rw]);
+ else
+ missed_ppm_ar[rw] = 0;
+ }
+
+ *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
+ ioc->period_us * NSEC_PER_USEC);
+}
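+
+/*
+ * Note: rq_wait_ns is summed across all CPUs over one period, so
+ * rq_wait_pct can exceed 100 when multiple requests are waiting
+ * concurrently.
+ */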
+
+/* was iocg idle this period? */
+static bool iocg_is_idle(struct ioc_gq *iocg)
+{
+ struct ioc *ioc = iocg->ioc;
+
+ /* did something get issued this period? */
+ if (atomic64_read(&iocg->active_period) ==
+ atomic64_read(&ioc->cur_period))
+ return false;
+
+ /* is something in flight? */
+ if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
+ return false;
+
+ return true;
+}
+
+/* returns usage with margin added if surplus is large enough */
+static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
+{
+ /* add margin */
+ usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
+ usage += SURPLUS_SCALE_ABS;
+
+ /* don't bother if the surplus is too small */
+ if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
+ return 0;
+
+ return usage;
+}
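+
+/*
+ * In other words, an iocg only becomes a donation candidate when its
+ * scaled-up usage still sits at least SURPLUS_MIN_ADJ_DELTA below its
+ * current hweight_inuse; the padded value is then used as the target
+ * hweight_inuse when shrinking the group's inuse.
+ */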
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+ struct ioc *ioc = container_of(timer, struct ioc, timer);
+ struct ioc_gq *iocg, *tiocg;
+ struct ioc_now now;
+ int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
+ u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ u32 missed_ppm[2], rq_wait_pct;
+ u64 period_vtime;
+ int prev_busy_level, i;
+
+ /* how were the latencies during the period? */
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+ /* take care of active iocgs */
+ spin_lock_irq(&ioc->lock);
+
+ ioc_now(ioc, &now);
+
+ period_vtime = now.vnow - ioc->period_at_vtime;
+ if (WARN_ON_ONCE(!period_vtime)) {
+ spin_unlock_irq(&ioc->lock);
+ return;
+ }
+
+ /*
+ * Waiters determine the sleep durations based on the vrate they
+ * saw at the time of sleep. If vrate has increased, some waiters
+ * could be sleeping for too long. Wake up tardy waiters which
+ * should have woken up in the last period and expire idle iocgs.
+ */
+ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
+ if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
+ !iocg_is_idle(iocg))
+ continue;
+
+ spin_lock(&iocg->waitq.lock);
+
+ if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+			/* might be oversleeping due to vtime / hweight changes, kick */
+ iocg_kick_waitq(iocg, &now);
+ iocg_kick_delay(iocg, &now);
+ } else if (iocg_is_idle(iocg)) {
+ /* no waiter and idle, deactivate */
+ iocg->last_inuse = iocg->inuse;
+ __propagate_active_weight(iocg, 0, 0);
+ list_del_init(&iocg->active_list);
+ }
+
+ spin_unlock(&iocg->waitq.lock);
+ }
+ commit_active_weights(ioc);
+
+ /* calc usages and see whether some weights need to be moved around */
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ u64 vdone, vtime, vusage, vmargin, vmin;
+ u32 hw_active, hw_inuse, usage;
+
+ /*
+		 * Collect unused budget and wind vtime closer to vnow to prevent
+ * iocgs from accumulating a large amount of budget.
+ */
+ vdone = atomic64_read(&iocg->done_vtime);
+ vtime = atomic64_read(&iocg->vtime);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+
+ /*
+ * Latency QoS detection doesn't account for IOs which are
+ * in-flight for longer than a period. Detect them by
+		 * comparing vdone against period start. If there are lagging
+		 * IOs from past periods, don't increase vrate.
+ */
+ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
+ !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
+ time_after64(vtime, vdone) &&
+ time_after64(vtime, now.vnow -
+ MAX_LAGGING_PERIODS * period_vtime) &&
+ time_before64(vdone, now.vnow - period_vtime))
+ nr_lagging++;
+
+ if (waitqueue_active(&iocg->waitq))
+ vusage = now.vnow - iocg->last_vtime;
+ else if (time_before64(iocg->last_vtime, vtime))
+ vusage = vtime - iocg->last_vtime;
+ else
+ vusage = 0;
+
+ iocg->last_vtime += vusage;
+ /*
+ * Factor in in-flight vtime into vusage to avoid
+ * high-latency completions appearing as idle. This should
+		 * be done after the above ->last_vtime adjustment.
+ */
+ vusage = max(vusage, vtime - vdone);
+
+ /* calculate hweight based usage ratio and record */
+ if (vusage) {
+ usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
+ period_vtime);
+ iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
+ iocg->usages[iocg->usage_idx] = usage;
+ } else {
+ usage = 0;
+ }
+
+ /* see whether there's surplus vtime */
+ vmargin = ioc->margin_us * now.vrate;
+ vmin = now.vnow - vmargin;
+
+ iocg->has_surplus = false;
+
+ if (!waitqueue_active(&iocg->waitq) &&
+ time_before64(vtime, vmin)) {
+ u64 delta = vmin - vtime;
+
+ /* throw away surplus vtime */
+ atomic64_add(delta, &iocg->vtime);
+ atomic64_add(delta, &iocg->done_vtime);
+ iocg->last_vtime += delta;
+ /* if usage is sufficiently low, maybe it can donate */
+ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
+ iocg->has_surplus = true;
+ nr_surpluses++;
+ }
+ } else if (hw_inuse < hw_active) {
+ u32 new_hwi, new_inuse;
+
+ /* was donating but might need to take back some */
+ if (waitqueue_active(&iocg->waitq)) {
+ new_hwi = hw_active;
+ } else {
+ new_hwi = max(hw_inuse,
+ usage * SURPLUS_SCALE_PCT / 100 +
+ SURPLUS_SCALE_ABS);
+ }
+
+ new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
+ hw_inuse);
+ new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
+
+ if (new_inuse > iocg->inuse) {
+ TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
+ iocg->inuse, new_inuse,
+ hw_inuse, new_hwi);
+ __propagate_active_weight(iocg, iocg->weight,
+ new_inuse);
+ }
+ } else {
+			/* genuinely out of vtime */
+ nr_shortages++;
+ }
+ }
+
+ if (!nr_shortages || !nr_surpluses)
+ goto skip_surplus_transfers;
+
+ /* there are both shortages and surpluses, transfer surpluses */
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
+ int nr_valid = 0;
+
+ if (!iocg->has_surplus)
+ continue;
+
+ /* base the decision on max historical usage */
+ for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
+ if (iocg->usages[i]) {
+ usage = max(usage, iocg->usages[i]);
+ nr_valid++;
+ }
+ }
+ if (nr_valid < MIN_VALID_USAGES)
+ continue;
+
+ current_hweight(iocg, &hw_active, &hw_inuse);
+ new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
+ if (!new_hwi)
+ continue;
+
+ new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
+ hw_inuse);
+ if (new_inuse < iocg->inuse) {
+ TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
+ iocg->inuse, new_inuse,
+ hw_inuse, new_hwi);
+ __propagate_active_weight(iocg, iocg->weight, new_inuse);
+ }
+ }
+skip_surplus_transfers:
+ commit_active_weights(ioc);
+
+ /*
+ * If q is getting clogged or we're missing too much, we're issuing
+ * too much IO and should lower vtime rate. If we're not missing
+ * and experiencing shortages but not surpluses, we're too stingy
+ * and should increase vtime rate.
+ */
+ prev_busy_level = ioc->busy_level;
+ if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+ missed_ppm[READ] > ppm_rthr ||
+ missed_ppm[WRITE] > ppm_wthr) {
+ /* clearly missing QoS targets, slow down vrate */
+ ioc->busy_level = max(ioc->busy_level, 0);
+ ioc->busy_level++;
+ } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
+ missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
+ missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
+ /* QoS targets are being met with >25% margin */
+ if (nr_shortages) {
+ /*
+ * We're throttling while the device has spare
+ * capacity. If vrate was being slowed down, stop.
+ */
+ ioc->busy_level = min(ioc->busy_level, 0);
+
+ /*
+ * If there are IOs spanning multiple periods, wait
+ * them out before pushing the device harder. If
+ * there are surpluses, let redistribution work it
+ * out first.
+ */
+ if (!nr_lagging && !nr_surpluses)
+ ioc->busy_level--;
+ } else {
+ /*
+ * Nobody is being throttled and the users aren't
+ * issuing enough IOs to saturate the device. We
+ * simply don't know how close the device is to
+ * saturation. Coast.
+ */
+ ioc->busy_level = 0;
+ }
+ } else {
+		/* inside the hysteresis margin, we're good */
+ ioc->busy_level = 0;
+ }
+
+ ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
+
+ if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
+ u64 vrate = atomic64_read(&ioc->vtime_rate);
+ u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+ /* rq_wait signal is always reliable, ignore user vrate_min */
+ if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
+ vrate_min = VRATE_MIN;
+
+ /*
+ * If vrate is out of bounds, apply clamp gradually as the
+ * bounds can change abruptly. Otherwise, apply busy_level
+ * based adjustment.
+ */
+ if (vrate < vrate_min) {
+ vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
+ 100);
+ vrate = min(vrate, vrate_min);
+ } else if (vrate > vrate_max) {
+ vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
+ 100);
+ vrate = max(vrate, vrate_max);
+ } else {
+ int idx = min_t(int, abs(ioc->busy_level),
+ ARRAY_SIZE(vrate_adj_pct) - 1);
+ u32 adj_pct = vrate_adj_pct[idx];
+
+ if (ioc->busy_level > 0)
+ adj_pct = 100 - adj_pct;
+ else
+ adj_pct = 100 + adj_pct;
+
+ vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+ vrate_min, vrate_max);
+ }
+
+ trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages,
+ nr_surpluses);
+
+ atomic64_set(&ioc->vtime_rate, vrate);
+ ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
+ ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
+ } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
+ trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+ missed_ppm, rq_wait_pct, nr_lagging,
+ nr_shortages, nr_surpluses);
+ }
+
+ ioc_refresh_params(ioc, false);
+
+ /*
+ * This period is done. Move onto the next one. If nothing's
+ * going on with the device, stop the timer.
+ */
+ atomic64_inc(&ioc->cur_period);
+
+ if (ioc->running != IOC_STOP) {
+ if (!list_empty(&ioc->active_iocgs)) {
+ ioc_start_period(ioc, &now);
+ } else {
+ ioc->busy_level = 0;
+ ioc->running = IOC_IDLE;
+ }
+ }
+
+ spin_unlock_irq(&ioc->lock);
+}
+
+static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
+ bool is_merge, u64 *costp)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 coef_seqio, coef_randio, coef_page;
+ u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
+ u64 seek_pages = 0;
+ u64 cost = 0;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
+ coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
+ coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
+ break;
+ case REQ_OP_WRITE:
+ coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
+ coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
+ coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
+ break;
+ default:
+ goto out;
+ }
+
+ if (iocg->cursor) {
+ seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
+ seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
+ }
+
+ if (!is_merge) {
+ if (seek_pages > LCOEF_RANDIO_PAGES) {
+ cost += coef_randio;
+ } else {
+ cost += coef_seqio;
+ }
+ }
+ cost += pages * coef_page;
+out:
+ *costp = cost;
+}
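+
+/*
+ * Example: a non-merge 8-page read that seeks further than
+ * LCOEF_RANDIO_PAGES from the previous cursor costs
+ * lcoefs[LCOEF_RRANDIO] + 8 * lcoefs[LCOEF_RPAGE]; a sequential one is
+ * charged lcoefs[LCOEF_RSEQIO] instead, and a merge only pays the
+ * per-page component.
+ */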
+
+static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
+{
+ u64 cost;
+
+ calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
+ return cost;
+}
+
+static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
+ u64 *costp)
+{
+ unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
+
+ switch (req_op(rq)) {
+ case REQ_OP_READ:
+ *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
+ break;
+ case REQ_OP_WRITE:
+ *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
+ break;
+ default:
+ *costp = 0;
+ }
+}
+
+static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
+{
+ u64 cost;
+
+ calc_size_vtime_cost_builtin(rq, ioc, &cost);
+ return cost;
+}
+
+static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct ioc *ioc = rqos_to_ioc(rqos);
+ struct ioc_gq *iocg = blkg_to_iocg(blkg);
+ struct ioc_now now;
+ struct iocg_wait wait;
+ u32 hw_active, hw_inuse;
+ u64 abs_cost, cost, vtime;
+
+ /* bypass IOs if disabled or for root cgroup */
+ if (!ioc->enabled || !iocg->level)
+ return;
+
+ /* always activate so that even 0 cost IOs get protected to some level */
+ if (!iocg_activate(iocg, &now))
+ return;
+
+ /* calculate the absolute vtime cost */
+ abs_cost = calc_vtime_cost(bio, iocg, false);
+ if (!abs_cost)
+ return;
+
+ iocg->cursor = bio_end_sector(bio);
+
+ vtime = atomic64_read(&iocg->vtime);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+
+ if (hw_inuse < hw_active &&
+ time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
+ TRACE_IOCG_PATH(inuse_reset, iocg, &now,
+ iocg->inuse, iocg->weight, hw_inuse, hw_active);
+ spin_lock_irq(&ioc->lock);
+ propagate_active_weight(iocg, iocg->weight, iocg->weight);
+ spin_unlock_irq(&ioc->lock);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+ }
+
+ cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ /*
+ * If no one's waiting and within budget, issue right away. The
+ * tests are racy but the races aren't systemic - we only miss once
+ * in a while which is fine.
+ */
+ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
+ time_before_eq64(vtime + cost, now.vnow)) {
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ /*
+ * We activated above but w/o any synchronization. Deactivation is
+ * synchronized with waitq.lock and we won't get deactivated as long
+	 * as we're waiting or have debt, so we're good if we're activated
+ * here. In the unlikely case that we aren't, just issue the IO.
+ */
+ spin_lock_irq(&iocg->waitq.lock);
+
+ if (unlikely(list_empty(&iocg->active_list))) {
+ spin_unlock_irq(&iocg->waitq.lock);
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ /*
+ * We're over budget. If @bio has to be issued regardless, remember
+ * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
+ * off the debt before waking more IOs.
+ *
+ * This way, the debt is continuously paid off each period with the
+ * actual budget available to the cgroup. If we just wound vtime, we
+ * would incorrectly use the current hw_inuse for the entire amount
+ * which, for example, can lead to the cgroup staying blocked for a
+ * long time even with substantially raised hw_inuse.
+ *
+ * An iocg with vdebt should stay online so that the timer can keep
+ * deducting its vdebt and [de]activate use_delay mechanism
+ * accordingly. We don't want to race against the timer trying to
+ * clear them and leave @iocg inactive w/ dangling use_delay heavily
+ * penalizing the cgroup and its descendants.
+ */
+ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
+ iocg->abs_vdebt += abs_cost;
+ if (iocg_kick_delay(iocg, &now))
+ blkcg_schedule_throttle(rqos->q,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+ spin_unlock_irq(&iocg->waitq.lock);
+ return;
+ }
+
+ /*
+ * Append self to the waitq and schedule the wakeup timer if we're
+ * the first waiter. The timer duration is calculated based on the
+ * current vrate. vtime and hweight changes can make it too short
+ * or too long. Each wait entry records the absolute cost it's
+ * waiting for to allow re-evaluation using a custom wait entry.
+ *
+ * If too short, the timer simply reschedules itself. If too long,
+ * the period timer will notice and trigger wakeups.
+ *
+ * All waiters are on iocg->waitq and the wait states are
+ * synchronized using waitq.lock.
+ */
+ init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
+ wait.wait.private = current;
+ wait.bio = bio;
+ wait.abs_cost = abs_cost;
+ wait.committed = false; /* will be set true by waker */
+
+ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
+ iocg_kick_waitq(iocg, &now);
+
+ spin_unlock_irq(&iocg->waitq.lock);
+
+ while (true) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (wait.committed)
+ break;
+ io_schedule();
+ }
+
+ /* waker already committed us, proceed */
+ finish_wait(&iocg->waitq, &wait.wait);
+}
+
+static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
+ struct bio *bio)
+{
+ struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
+ struct ioc *ioc = iocg->ioc;
+ sector_t bio_end = bio_end_sector(bio);
+ struct ioc_now now;
+ u32 hw_inuse;
+ u64 abs_cost, cost;
+ unsigned long flags;
+
+ /* bypass if disabled or for root cgroup */
+ if (!ioc->enabled || !iocg->level)
+ return;
+
+ abs_cost = calc_vtime_cost(bio, iocg, true);
+ if (!abs_cost)
+ return;
+
+ ioc_now(ioc, &now);
+ current_hweight(iocg, NULL, &hw_inuse);
+ cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ /* update cursor if backmerging into the request at the cursor */
+ if (blk_rq_pos(rq) < bio_end &&
+ blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
+ iocg->cursor = bio_end;
+
+ /*
+ * Charge if there's enough vtime budget and the existing request has
+ * cost assigned.
+ */
+ if (rq->bio && rq->bio->bi_iocost_cost &&
+ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ /*
+ * Otherwise, account it as debt if @iocg is online, which it should
+ * be for the vast majority of cases. See debt handling in
+ * ioc_rqos_throttle() for details.
+ */
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ if (likely(!list_empty(&iocg->active_list))) {
+ iocg->abs_vdebt += abs_cost;
+ iocg_kick_delay(iocg, &now);
+ } else {
+ iocg_commit_bio(iocg, bio, cost);
+ }
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+}
+
+static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+ struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
+
+ if (iocg && bio->bi_iocost_cost)
+ atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
+}
+
+static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+ u64 on_q_ns, rq_wait_ns, size_nsec;
+ int pidx, rw;
+
+ if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
+ return;
+
+ switch (req_op(rq) & REQ_OP_MASK) {
+ case REQ_OP_READ:
+ pidx = QOS_RLAT;
+ rw = READ;
+ break;
+ case REQ_OP_WRITE:
+ pidx = QOS_WLAT;
+ rw = WRITE;
+ break;
+ default:
+ return;
+ }
+
+ on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
+ rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
+ size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
+
+ if (on_q_ns <= size_nsec ||
+ on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
+ this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
+ else
+ this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
+
+ this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
+}
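+
+/*
+ * Example: a read that spends 3ms between allocation and completion and
+ * has a size-model cost of 1ms is treated as a 2ms completion, counting
+ * as met when qos[QOS_RLAT] is 2000us or more.
+ */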
+
+static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+
+ spin_lock_irq(&ioc->lock);
+ ioc_refresh_params(ioc, false);
+ spin_unlock_irq(&ioc->lock);
+}
+
+static void ioc_rqos_exit(struct rq_qos *rqos)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+
+ blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
+
+ spin_lock_irq(&ioc->lock);
+ ioc->running = IOC_STOP;
+ spin_unlock_irq(&ioc->lock);
+
+ del_timer_sync(&ioc->timer);
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+}
+
+static struct rq_qos_ops ioc_rqos_ops = {
+ .throttle = ioc_rqos_throttle,
+ .merge = ioc_rqos_merge,
+ .done_bio = ioc_rqos_done_bio,
+ .done = ioc_rqos_done,
+ .queue_depth_changed = ioc_rqos_queue_depth_changed,
+ .exit = ioc_rqos_exit,
+};
+
+static int blk_iocost_init(struct request_queue *q)
+{
+ struct ioc *ioc;
+ struct rq_qos *rqos;
+ int ret;
+
+ ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
+ if (!ioc)
+ return -ENOMEM;
+
+ ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
+ if (!ioc->pcpu_stat) {
+ kfree(ioc);
+ return -ENOMEM;
+ }
+
+ rqos = &ioc->rqos;
+ rqos->id = RQ_QOS_COST;
+ rqos->ops = &ioc_rqos_ops;
+ rqos->q = q;
+
+ spin_lock_init(&ioc->lock);
+ timer_setup(&ioc->timer, ioc_timer_fn, 0);
+ INIT_LIST_HEAD(&ioc->active_iocgs);
+
+ ioc->running = IOC_IDLE;
+ atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+ seqcount_init(&ioc->period_seqcount);
+ ioc->period_at = ktime_to_us(ktime_get());
+ atomic64_set(&ioc->cur_period, 0);
+ atomic_set(&ioc->hweight_gen, 0);
+
+ spin_lock_irq(&ioc->lock);
+ ioc->autop_idx = AUTOP_INVALID;
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ rq_qos_add(q, rqos);
+ ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
+ if (ret) {
+ rq_qos_del(q, rqos);
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+ return ret;
+ }
+ return 0;
+}
+
+static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
+{
+ struct ioc_cgrp *iocc;
+
+ iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
+ if (!iocc)
+ return NULL;
+
+ iocc->dfl_weight = CGROUP_WEIGHT_DFL;
+ return &iocc->cpd;
+}
+
+static void ioc_cpd_free(struct blkcg_policy_data *cpd)
+{
+ kfree(container_of(cpd, struct ioc_cgrp, cpd));
+}
+
+static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
+ struct blkcg *blkcg)
+{
+ int levels = blkcg->css.cgroup->level + 1;
+ struct ioc_gq *iocg;
+
+ iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
+ gfp, q->node);
+ if (!iocg)
+ return NULL;
+
+ return &iocg->pd;
+}
+
+static void ioc_pd_init(struct blkg_policy_data *pd)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
+ struct ioc *ioc = q_to_ioc(blkg->q);
+ struct ioc_now now;
+ struct blkcg_gq *tblkg;
+ unsigned long flags;
+
+ ioc_now(ioc, &now);
+
+ iocg->ioc = ioc;
+ atomic64_set(&iocg->vtime, now.vnow);
+ atomic64_set(&iocg->done_vtime, now.vnow);
+ atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
+ INIT_LIST_HEAD(&iocg->active_list);
+ iocg->hweight_active = HWEIGHT_WHOLE;
+ iocg->hweight_inuse = HWEIGHT_WHOLE;
+
+ init_waitqueue_head(&iocg->waitq);
+ hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ iocg->waitq_timer.function = iocg_waitq_timer_fn;
+ hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ iocg->delay_timer.function = iocg_delay_timer_fn;
+
+ iocg->level = blkg->blkcg->css.cgroup->level;
+
+ for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
+ struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
+ iocg->ancestors[tiocg->level] = tiocg;
+ }
+
+ spin_lock_irqsave(&ioc->lock, flags);
+ weight_updated(iocg);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
+static void ioc_pd_free(struct blkg_policy_data *pd)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct ioc *ioc = iocg->ioc;
+
+ if (ioc) {
+ spin_lock(&ioc->lock);
+ if (!list_empty(&iocg->active_list)) {
+ propagate_active_weight(iocg, 0, 0);
+ list_del_init(&iocg->active_list);
+ }
+ spin_unlock(&ioc->lock);
+
+ hrtimer_cancel(&iocg->waitq_timer);
+ hrtimer_cancel(&iocg->delay_timer);
+ }
+ kfree(iocg);
+}
+
+static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+
+ if (dname && iocg->cfg_weight)
+ seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
+ return 0;
+}
+
+static int ioc_weight_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
+
+ seq_printf(sf, "default %u\n", iocc->dfl_weight);
+ blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
+ struct blkg_conf_ctx ctx;
+ struct ioc_gq *iocg;
+ u32 v;
+ int ret;
+
+ if (!strchr(buf, ':')) {
+ struct blkcg_gq *blkg;
+
+ if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
+ return -EINVAL;
+
+ if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
+ return -EINVAL;
+
+ spin_lock(&blkcg->lock);
+ iocc->dfl_weight = v;
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct ioc_gq *iocg = blkg_to_iocg(blkg);
+
+ if (iocg) {
+ spin_lock_irq(&iocg->ioc->lock);
+ weight_updated(iocg);
+ spin_unlock_irq(&iocg->ioc->lock);
+ }
+ }
+ spin_unlock(&blkcg->lock);
+
+ return nbytes;
+ }
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
+ if (ret)
+ return ret;
+
+ iocg = blkg_to_iocg(ctx.blkg);
+
+ if (!strncmp(ctx.body, "default", 7)) {
+ v = 0;
+ } else {
+ if (!sscanf(ctx.body, "%u", &v))
+ goto einval;
+ if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
+ goto einval;
+ }
+
+ spin_lock(&iocg->ioc->lock);
+ iocg->cfg_weight = v;
+ weight_updated(iocg);
+ spin_unlock(&iocg->ioc->lock);
+
+ blkg_conf_finish(&ctx);
+ return nbytes;
+
+einval:
+ blkg_conf_finish(&ctx);
+ return -EINVAL;
+}
+
+static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc *ioc = pd_to_iocg(pd)->ioc;
+
+ if (!dname)
+ return 0;
+
+ seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
+ dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
+ ioc->params.qos[QOS_RPPM] / 10000,
+ ioc->params.qos[QOS_RPPM] % 10000 / 100,
+ ioc->params.qos[QOS_RLAT],
+ ioc->params.qos[QOS_WPPM] / 10000,
+ ioc->params.qos[QOS_WPPM] % 10000 / 100,
+ ioc->params.qos[QOS_WLAT],
+ ioc->params.qos[QOS_MIN] / 10000,
+ ioc->params.qos[QOS_MIN] % 10000 / 100,
+ ioc->params.qos[QOS_MAX] / 10000,
+ ioc->params.qos[QOS_MAX] % 10000 / 100);
+ return 0;
+}
+
+static int ioc_qos_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+ blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static const match_table_t qos_ctrl_tokens = {
+ { QOS_ENABLE, "enable=%u" },
+ { QOS_CTRL, "ctrl=%s" },
+ { NR_QOS_CTRL_PARAMS, NULL },
+};
+
+static const match_table_t qos_tokens = {
+ { QOS_RPPM, "rpct=%s" },
+ { QOS_RLAT, "rlat=%u" },
+ { QOS_WPPM, "wpct=%s" },
+ { QOS_WLAT, "wlat=%u" },
+ { QOS_MIN, "min=%s" },
+ { QOS_MAX, "max=%s" },
+ { NR_QOS_PARAMS, NULL },
+};
+
+static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
+ size_t nbytes, loff_t off)
+{
+ struct gendisk *disk;
+ struct ioc *ioc;
+ u32 qos[NR_QOS_PARAMS];
+ bool enable, user;
+ char *p;
+ int ret;
+
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
+
+ ioc = q_to_ioc(disk->queue);
+ if (!ioc) {
+ ret = blk_iocost_init(disk->queue);
+ if (ret)
+ goto err;
+ ioc = q_to_ioc(disk->queue);
+ }
+
+ spin_lock_irq(&ioc->lock);
+ memcpy(qos, ioc->params.qos, sizeof(qos));
+ enable = ioc->enabled;
+ user = ioc->user_qos_params;
+ spin_unlock_irq(&ioc->lock);
+
+ while ((p = strsep(&input, " \t\n"))) {
+ substring_t args[MAX_OPT_ARGS];
+ char buf[32];
+ int tok;
+ s64 v;
+
+ if (!*p)
+ continue;
+
+ switch (match_token(p, qos_ctrl_tokens, args)) {
+ case QOS_ENABLE:
+ match_u64(&args[0], &v);
+ enable = v;
+ continue;
+ case QOS_CTRL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (!strcmp(buf, "auto"))
+ user = false;
+ else if (!strcmp(buf, "user"))
+ user = true;
+ else
+ goto einval;
+ continue;
+ }
+
+ tok = match_token(p, qos_tokens, args);
+ switch (tok) {
+ case QOS_RPPM:
+ case QOS_WPPM:
+ if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
+ sizeof(buf))
+ goto einval;
+ if (cgroup_parse_float(buf, 2, &v))
+ goto einval;
+ if (v < 0 || v > 10000)
+ goto einval;
+ qos[tok] = v * 100;
+ break;
+ case QOS_RLAT:
+ case QOS_WLAT:
+ if (match_u64(&args[0], &v))
+ goto einval;
+ qos[tok] = v;
+ break;
+ case QOS_MIN:
+ case QOS_MAX:
+ if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
+ sizeof(buf))
+ goto einval;
+ if (cgroup_parse_float(buf, 2, &v))
+ goto einval;
+ if (v < 0)
+ goto einval;
+ qos[tok] = clamp_t(s64, v * 100,
+ VRATE_MIN_PPM, VRATE_MAX_PPM);
+ break;
+ default:
+ goto einval;
+ }
+ user = true;
+ }
+
+ if (qos[QOS_MIN] > qos[QOS_MAX])
+ goto einval;
+
+ spin_lock_irq(&ioc->lock);
+
+ if (enable) {
+ blk_stat_enable_accounting(ioc->rqos.q);
+ blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ ioc->enabled = true;
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ ioc->enabled = false;
+ }
+
+ if (user) {
+ memcpy(ioc->params.qos, qos, sizeof(qos));
+ ioc->user_qos_params = true;
+ } else {
+ ioc->user_qos_params = false;
+ }
+
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ put_disk_and_module(disk);
+ return nbytes;
+einval:
+ ret = -EINVAL;
+err:
+ put_disk_and_module(disk);
+ return ret;
+}
+
+static u64 ioc_cost_model_prfill(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc *ioc = pd_to_iocg(pd)->ioc;
+ u64 *u = ioc->params.i_lcoefs;
+
+ if (!dname)
+ return 0;
+
+ seq_printf(sf, "%s ctrl=%s model=linear "
+ "rbps=%llu rseqiops=%llu rrandiops=%llu "
+ "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
+ dname, ioc->user_cost_model ? "user" : "auto",
+ u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+ u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+ return 0;
+}
+
+static int ioc_cost_model_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+ blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static const match_table_t cost_ctrl_tokens = {
+ { COST_CTRL, "ctrl=%s" },
+ { COST_MODEL, "model=%s" },
+ { NR_COST_CTRL_PARAMS, NULL },
+};
+
+static const match_table_t i_lcoef_tokens = {
+ { I_LCOEF_RBPS, "rbps=%u" },
+ { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
+ { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
+ { I_LCOEF_WBPS, "wbps=%u" },
+ { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
+ { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
+ { NR_I_LCOEFS, NULL },
+};
+
+static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
+ size_t nbytes, loff_t off)
+{
+ struct gendisk *disk;
+ struct ioc *ioc;
+ u64 u[NR_I_LCOEFS];
+ bool user;
+ char *p;
+ int ret;
+
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
+
+ ioc = q_to_ioc(disk->queue);
+ if (!ioc) {
+ ret = blk_iocost_init(disk->queue);
+ if (ret)
+ goto err;
+ ioc = q_to_ioc(disk->queue);
+ }
+
+ spin_lock_irq(&ioc->lock);
+ memcpy(u, ioc->params.i_lcoefs, sizeof(u));
+ user = ioc->user_cost_model;
+ spin_unlock_irq(&ioc->lock);
+
+ while ((p = strsep(&input, " \t\n"))) {
+ substring_t args[MAX_OPT_ARGS];
+ char buf[32];
+ int tok;
+ u64 v;
+
+ if (!*p)
+ continue;
+
+ switch (match_token(p, cost_ctrl_tokens, args)) {
+ case COST_CTRL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (!strcmp(buf, "auto"))
+ user = false;
+ else if (!strcmp(buf, "user"))
+ user = true;
+ else
+ goto einval;
+ continue;
+ case COST_MODEL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (strcmp(buf, "linear"))
+ goto einval;
+ continue;
+ }
+
+ tok = match_token(p, i_lcoef_tokens, args);
+ if (tok == NR_I_LCOEFS)
+ goto einval;
+ if (match_u64(&args[0], &v))
+ goto einval;
+ u[tok] = v;
+ user = true;
+ }
+
+ spin_lock_irq(&ioc->lock);
+ if (user) {
+ memcpy(ioc->params.i_lcoefs, u, sizeof(u));
+ ioc->user_cost_model = true;
+ } else {
+ ioc->user_cost_model = false;
+ }
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ put_disk_and_module(disk);
+ return nbytes;
+
+einval:
+ ret = -EINVAL;
+err:
+ put_disk_and_module(disk);
+ return ret;
+}
+
+static struct cftype ioc_files[] = {
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = ioc_weight_show,
+ .write = ioc_weight_write,
+ },
+ {
+ .name = "cost.qos",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = ioc_qos_show,
+ .write = ioc_qos_write,
+ },
+ {
+ .name = "cost.model",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = ioc_cost_model_show,
+ .write = ioc_cost_model_write,
+ },
+ {}
+};
+
+static struct blkcg_policy blkcg_policy_iocost = {
+ .dfl_cftypes = ioc_files,
+ .cpd_alloc_fn = ioc_cpd_alloc,
+ .cpd_free_fn = ioc_cpd_free,
+ .pd_alloc_fn = ioc_pd_alloc,
+ .pd_init_fn = ioc_pd_init,
+ .pd_free_fn = ioc_pd_free,
+};
+
+static int __init ioc_init(void)
+{
+ return blkcg_policy_register(&blkcg_policy_iocost);
+}
+
+static void __exit ioc_exit(void)
+{
+ return blkcg_policy_unregister(&blkcg_policy_iocost);
+}
+
+module_init(ioc_init);
+module_exit(ioc_exit);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index d22e61bced86..c128d50cb410 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -618,44 +618,26 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
inflight = atomic_dec_return(&rqw->inflight);
WARN_ON_ONCE(inflight < 0);
- if (iolat->min_lat_nsec == 0)
- goto next;
- iolatency_record_time(iolat, &bio->bi_issue, now,
- issue_as_root);
- window_start = atomic64_read(&iolat->window_start);
- if (now > window_start &&
- (now - window_start) >= iolat->cur_win_nsec) {
- if (atomic64_cmpxchg(&iolat->window_start,
- window_start, now) == window_start)
- iolatency_check_latencies(iolat, now);
+ /*
+ * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
+ * submitted, so do not account for it.
+ */
+ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
+ iolatency_record_time(iolat, &bio->bi_issue, now,
+ issue_as_root);
+ window_start = atomic64_read(&iolat->window_start);
+ if (now > window_start &&
+ (now - window_start) >= iolat->cur_win_nsec) {
+ if (atomic64_cmpxchg(&iolat->window_start,
+ window_start, now) == window_start)
+ iolatency_check_latencies(iolat, now);
+ }
}
-next:
wake_up(&rqw->wait);
blkg = blkg->parent;
}
}
-static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
-{
- struct blkcg_gq *blkg;
-
- blkg = bio->bi_blkg;
- while (blkg && blkg->parent) {
- struct rq_wait *rqw;
- struct iolatency_grp *iolat;
-
- iolat = blkg_to_lat(blkg);
- if (!iolat)
- goto next;
-
- rqw = &iolat->rq_wait;
- atomic_dec(&rqw->inflight);
- wake_up(&rqw->wait);
-next:
- blkg = blkg->parent;
- }
-}
-
static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
@@ -667,7 +649,6 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos)
static struct rq_qos_ops blkcg_iolatency_ops = {
.throttle = blkcg_iolatency_throttle,
- .cleanup = blkcg_iolatency_cleanup,
.done_bio = blkcg_iolatency_done_bio,
.exit = blkcg_iolatency_exit,
};
@@ -744,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q)
return -ENOMEM;
rqos = &blkiolat->rqos;
- rqos->id = RQ_QOS_CGROUP;
+ rqos->id = RQ_QOS_LATENCY;
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
@@ -778,8 +759,10 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
if (!oldval && val)
return 1;
- if (oldval && !val)
+ if (oldval && !val) {
+ blkcg_clear_delay(blkg);
return -1;
+ }
return 0;
}
@@ -934,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
unsigned long long avg_lat;
unsigned long long cur_win;
+ if (!blkcg_debug_stats)
+ return 0;
+
if (iolat->ssd)
return iolatency_ssd_stat(iolat, buf, size);
@@ -948,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
}
-static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
+ struct request_queue *q,
+ struct blkcg *blkcg)
{
struct iolatency_grp *iolat;
- iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+ iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
if (!iolat)
return NULL;
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
diff --git a/block/blk-map.c b/block/blk-map.c
index db9373bd31ac..6e804892d5ec 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -11,6 +11,515 @@
#include "blk.h"
+struct bio_map_data {
+ int is_our_pages;
+ struct iov_iter iter;
+ struct iovec iov[];
+};
+
+static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
+ gfp_t gfp_mask)
+{
+ struct bio_map_data *bmd;
+
+ if (data->nr_segs > UIO_MAXIOV)
+ return NULL;
+
+ bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
+ if (!bmd)
+ return NULL;
+ memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
+ bmd->iter = *data;
+ bmd->iter.iov = bmd->iov;
+ return bmd;
+}
+
+/**
+ * bio_copy_from_iter - copy all pages from iov_iter to bio
+ * @bio: The &struct bio which describes the I/O as destination
+ * @iter: iov_iter as source
+ *
+ * Copy all pages from iov_iter to bio.
+ * Returns 0 on success, or error on failure.
+ */
+static int bio_copy_from_iter(struct bio *bio, struct iov_iter *iter)
+{
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ ssize_t ret;
+
+ ret = copy_page_from_iter(bvec->bv_page,
+ bvec->bv_offset,
+ bvec->bv_len,
+ iter);
+
+ if (!iov_iter_count(iter))
+ break;
+
+ if (ret < bvec->bv_len)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * bio_copy_to_iter - copy all pages from bio to iov_iter
+ * @bio: The &struct bio which describes the I/O as source
+ * @iter: iov_iter as destination
+ *
+ * Copy all pages from bio to iov_iter.
+ * Returns 0 on success, or error on failure.
+ */
+static int bio_copy_to_iter(struct bio *bio, struct iov_iter iter)
+{
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ ssize_t ret;
+
+ ret = copy_page_to_iter(bvec->bv_page,
+ bvec->bv_offset,
+ bvec->bv_len,
+ &iter);
+
+ if (!iov_iter_count(&iter))
+ break;
+
+ if (ret < bvec->bv_len)
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
+/**
+ * bio_uncopy_user - finish previously mapped bio
+ * @bio: bio being terminated
+ *
+ * Free pages allocated from bio_copy_user_iov() and write back data
+ * to user space in case of a read.
+ */
+static int bio_uncopy_user(struct bio *bio)
+{
+ struct bio_map_data *bmd = bio->bi_private;
+ int ret = 0;
+
+ if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
+ /*
+ * if we're in a workqueue, the request is orphaned, so
+ * don't copy into a random user address space, just free
+ * and return -EINTR so user space doesn't expect any data.
+ */
+ if (!current->mm)
+ ret = -EINTR;
+ else if (bio_data_dir(bio) == READ)
+ ret = bio_copy_to_iter(bio, bmd->iter);
+ if (bmd->is_our_pages)
+ bio_free_pages(bio);
+ }
+ kfree(bmd);
+ bio_put(bio);
+ return ret;
+}
+
+/**
+ * bio_copy_user_iov - copy user data to bio
+ * @q: destination block queue
+ * @map_data: pointer to the rq_map_data holding pages (if necessary)
+ * @iter: iovec iterator
+ * @gfp_mask: memory allocation flags
+ *
+ * Prepares and returns a bio for indirect user io, bouncing data
+ *	to/from kernel pages as necessary. Must be paired with a call to
+ *	bio_uncopy_user() on io completion.
+ */
+static struct bio *bio_copy_user_iov(struct request_queue *q,
+ struct rq_map_data *map_data, struct iov_iter *iter,
+ gfp_t gfp_mask)
+{
+ struct bio_map_data *bmd;
+ struct page *page;
+ struct bio *bio;
+ int i = 0, ret;
+ int nr_pages;
+ unsigned int len = iter->count;
+ unsigned int offset = map_data ? offset_in_page(map_data->offset) : 0;
+
+ bmd = bio_alloc_map_data(iter, gfp_mask);
+ if (!bmd)
+ return ERR_PTR(-ENOMEM);
+
+ /*
+ * We need to do a deep copy of the iov_iter including the iovecs.
+	 * The caller-provided iov might point to an on-stack or otherwise
+	 * short-lived one.
+ */
+ bmd->is_our_pages = map_data ? 0 : 1;
+
+ nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
+ if (nr_pages > BIO_MAX_PAGES)
+ nr_pages = BIO_MAX_PAGES;
+
+ ret = -ENOMEM;
+ bio = bio_kmalloc(gfp_mask, nr_pages);
+ if (!bio)
+ goto out_bmd;
+
+ ret = 0;
+
+ if (map_data) {
+ nr_pages = 1 << map_data->page_order;
+ i = map_data->offset / PAGE_SIZE;
+ }
+ while (len) {
+ unsigned int bytes = PAGE_SIZE;
+
+ bytes -= offset;
+
+ if (bytes > len)
+ bytes = len;
+
+ if (map_data) {
+ if (i == map_data->nr_entries * nr_pages) {
+ ret = -ENOMEM;
+ break;
+ }
+
+ page = map_data->pages[i / nr_pages];
+ page += (i % nr_pages);
+
+ i++;
+ } else {
+ page = alloc_page(q->bounce_gfp | gfp_mask);
+ if (!page) {
+ ret = -ENOMEM;
+ break;
+ }
+ }
+
+ if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
+ if (!map_data)
+ __free_page(page);
+ break;
+ }
+
+ len -= bytes;
+ offset = 0;
+ }
+
+ if (ret)
+ goto cleanup;
+
+ if (map_data)
+ map_data->offset += bio->bi_iter.bi_size;
+
+ /*
+ * success
+ */
+ if ((iov_iter_rw(iter) == WRITE &&
+ (!map_data || !map_data->null_mapped)) ||
+ (map_data && map_data->from_user)) {
+ ret = bio_copy_from_iter(bio, iter);
+ if (ret)
+ goto cleanup;
+ } else {
+ if (bmd->is_our_pages)
+ zero_fill_bio(bio);
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ }
+
+ bio->bi_private = bmd;
+ if (map_data && map_data->null_mapped)
+ bio_set_flag(bio, BIO_NULL_MAPPED);
+ return bio;
+cleanup:
+ if (!map_data)
+ bio_free_pages(bio);
+ bio_put(bio);
+out_bmd:
+ kfree(bmd);
+ return ERR_PTR(ret);
+}
+
+/**
+ * bio_map_user_iov - map user iovec into bio
+ * @q: the struct request_queue for the bio
+ * @iter: iovec iterator
+ * @gfp_mask: memory allocation flags
+ *
+ * Map the user space address into a bio suitable for io to a block
+ * device. Returns an error pointer in case of error.
+ */
+static struct bio *bio_map_user_iov(struct request_queue *q,
+ struct iov_iter *iter, gfp_t gfp_mask)
+{
+ unsigned int max_sectors = queue_max_hw_sectors(q);
+ int j;
+ struct bio *bio;
+ int ret;
+
+ if (!iov_iter_count(iter))
+ return ERR_PTR(-EINVAL);
+
+ bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+
+ while (iov_iter_count(iter)) {
+ struct page **pages;
+ ssize_t bytes;
+ size_t offs, added = 0;
+ int npages;
+
+ bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+ if (unlikely(bytes <= 0)) {
+ ret = bytes ? bytes : -EFAULT;
+ goto out_unmap;
+ }
+
+ npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
+
+ if (unlikely(offs & queue_dma_alignment(q))) {
+ ret = -EINVAL;
+ j = 0;
+ } else {
+ for (j = 0; j < npages; j++) {
+ struct page *page = pages[j];
+ unsigned int n = PAGE_SIZE - offs;
+ bool same_page = false;
+
+ if (n > bytes)
+ n = bytes;
+
+ if (!bio_add_hw_page(q, bio, page, n, offs,
+ max_sectors, &same_page)) {
+ if (same_page)
+ put_page(page);
+ break;
+ }
+
+ added += n;
+ bytes -= n;
+ offs = 0;
+ }
+ iov_iter_advance(iter, added);
+ }
+ /*
+ * release the pages we didn't map into the bio, if any
+ */
+ while (j < npages)
+ put_page(pages[j++]);
+ kvfree(pages);
+ /* couldn't stuff something into bio? */
+ if (bytes)
+ break;
+ }
+
+ bio_set_flag(bio, BIO_USER_MAPPED);
+
+ /*
+ * subtle -- if bio_map_user_iov() ended up bouncing a bio,
+ * it would normally disappear when its bi_end_io is run.
+ * however, we need it for the unmap, so grab an extra
+ * reference to it
+ */
+ bio_get(bio);
+ return bio;
+
+ out_unmap:
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ return ERR_PTR(ret);
+}
+
+/**
+ * bio_unmap_user - unmap a bio
+ * @bio: the bio being unmapped
+ *
+ * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
+ * process context.
+ *
+ * bio_unmap_user() may sleep.
+ */
+static void bio_unmap_user(struct bio *bio)
+{
+ bio_release_pages(bio, bio_data_dir(bio) == READ);
+ bio_put(bio);
+ bio_put(bio);
+}
+
+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+ if (bio->bi_private && !op_is_write(bio_op(bio))) {
+ unsigned long i, len = 0;
+
+ for (i = 0; i < bio->bi_vcnt; i++)
+ len += bio->bi_io_vec[i].bv_len;
+ invalidate_kernel_vmap_range(bio->bi_private, len);
+ }
+#endif
+}
+
+static void bio_map_kern_endio(struct bio *bio)
+{
+ bio_invalidate_vmalloc_pages(bio);
+ bio_put(bio);
+}
+
+/**
+ * bio_map_kern - map kernel address into bio
+ * @q: the struct request_queue for the bio
+ * @data: pointer to buffer to map
+ * @len: length in bytes
+ * @gfp_mask: allocation flags for bio allocation
+ *
+ * Map the kernel address into a bio suitable for io to a block
+ * device. Returns an error pointer in case of error.
+ */
+static struct bio *bio_map_kern(struct request_queue *q, void *data,
+ unsigned int len, gfp_t gfp_mask)
+{
+ unsigned long kaddr = (unsigned long)data;
+ unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned long start = kaddr >> PAGE_SHIFT;
+ const int nr_pages = end - start;
+ bool is_vmalloc = is_vmalloc_addr(data);
+ struct page *page;
+ int offset, i;
+ struct bio *bio;
+
+ bio = bio_kmalloc(gfp_mask, nr_pages);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+
+ if (is_vmalloc) {
+ flush_kernel_vmap_range(data, len);
+ bio->bi_private = data;
+ }
+
+ offset = offset_in_page(kaddr);
+ for (i = 0; i < nr_pages; i++) {
+ unsigned int bytes = PAGE_SIZE - offset;
+
+ if (len <= 0)
+ break;
+
+ if (bytes > len)
+ bytes = len;
+
+ if (!is_vmalloc)
+ page = virt_to_page(data);
+ else
+ page = vmalloc_to_page(data);
+ if (bio_add_pc_page(q, bio, page, bytes,
+ offset) < bytes) {
+ /* we don't support partial mappings */
+ bio_put(bio);
+ return ERR_PTR(-EINVAL);
+ }
+
+ data += bytes;
+ len -= bytes;
+ offset = 0;
+ }
+
+ bio->bi_end_io = bio_map_kern_endio;
+ return bio;
+}
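+
+/*
+ * Example: with 4K pages, mapping a 5000-byte buffer that starts 100
+ * bytes into a page spans two pages; the first bio_vec carries
+ * PAGE_SIZE - 100 bytes and the second the remaining 1004 bytes.
+ */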
+
+static void bio_copy_kern_endio(struct bio *bio)
+{
+ bio_free_pages(bio);
+ bio_put(bio);
+}
+
+static void bio_copy_kern_endio_read(struct bio *bio)
+{
+ char *p = bio->bi_private;
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
+ p += bvec->bv_len;
+ }
+
+ bio_copy_kern_endio(bio);
+}
+
+/**
+ * bio_copy_kern - copy kernel address into bio
+ * @q: the struct request_queue for the bio
+ * @data: pointer to buffer to copy
+ * @len: length in bytes
+ * @gfp_mask: allocation flags for bio and page allocation
+ * @reading: data direction is READ
+ *
+ * copy the kernel address into a bio suitable for io to a block
+ * device. Returns an error pointer in case of error.
+ */
+static struct bio *bio_copy_kern(struct request_queue *q, void *data,
+ unsigned int len, gfp_t gfp_mask, int reading)
+{
+ unsigned long kaddr = (unsigned long)data;
+ unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
+ unsigned long start = kaddr >> PAGE_SHIFT;
+ struct bio *bio;
+ void *p = data;
+ int nr_pages = 0;
+
+ /*
+ * Overflow, abort
+ */
+ if (end < start)
+ return ERR_PTR(-EINVAL);
+
+ nr_pages = end - start;
+ bio = bio_kmalloc(gfp_mask, nr_pages);
+ if (!bio)
+ return ERR_PTR(-ENOMEM);
+
+ while (len) {
+ struct page *page;
+ unsigned int bytes = PAGE_SIZE;
+
+ if (bytes > len)
+ bytes = len;
+
+ page = alloc_page(q->bounce_gfp | gfp_mask);
+ if (!page)
+ goto cleanup;
+
+ if (!reading)
+ memcpy(page_address(page), p, bytes);
+
+ if (bio_add_pc_page(q, bio, page, bytes, 0) < bytes)
+ break;
+
+ len -= bytes;
+ p += bytes;
+ }
+
+ if (reading) {
+ bio->bi_end_io = bio_copy_kern_endio_read;
+ bio->bi_private = data;
+ } else {
+ bio->bi_end_io = bio_copy_kern_endio;
+ }
+
+ return bio;
+
+cleanup:
+ bio_free_pages(bio);
+ bio_put(bio);
+ return ERR_PTR(-ENOMEM);
+}
+
/*
* Append a bio to a passthrough request. Only works if the bio can be merged
* into the request based on the driver constraints.
@@ -18,13 +527,19 @@
int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
struct bio *orig_bio = *bio;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned int nr_segs = 0;
blk_queue_bounce(rq->q, bio);
+ bio_for_each_bvec(bv, *bio, iter)
+ nr_segs++;
+
if (!rq->bio) {
- blk_rq_bio_prep(rq->q, rq, *bio);
+ blk_rq_bio_prep(rq, *bio, nr_segs);
} else {
- if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+ if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
if (orig_bio != *bio) {
bio_put(*bio);
*bio = orig_bio;
@@ -35,6 +550,7 @@ int blk_rq_append_bio(struct request *rq, struct bio **bio)
rq->biotail->bi_next = *bio;
rq->biotail = *bio;
rq->__data_len += (*bio)->bi_iter.bi_size;
+ bio_crypt_free_ctx(*bio);
}
return 0;
@@ -140,12 +656,10 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
bio = rq->bio;
} while (iov_iter_count(&i));
- if (!bio_flagged(bio, BIO_USER_MAPPED))
- rq->rq_flags |= RQF_COPY_USER;
return 0;
unmap_rq:
- __blk_rq_unmap_user(bio);
+ blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
return ret;
@@ -217,7 +731,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
- int do_copy = 0;
struct bio *bio, *orig_bio;
int ret;
@@ -226,8 +739,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (!len || !kbuf)
return -EINVAL;
- do_copy = !blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf);
- if (do_copy)
+ if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -238,9 +750,6 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
- if (do_copy)
- rq->rq_flags |= RQF_COPY_USER;
-
orig_bio = bio;
ret = blk_rq_append_bio(rq, &bio);
if (unlikely(ret)) {
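/*
 * Illustrative sketch (not part of this patch): how a driver-side helper
 * might use blk_rq_map_kern() to issue a passthrough read into a kernel
 * buffer. The function and variable names here are made up; the calls
 * themselves (blk_get_request(), blk_rq_map_kern(), blk_execute_rq(),
 * blk_put_request()) are existing block-layer interfaces.
 */
static int example_read_into_kernel_buffer(struct request_queue *q,
					   void *buf, unsigned int len)
{
	struct request *rq;
	int ret;

	rq = blk_get_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return PTR_ERR(rq);

	/* picks bio_copy_kern() or bio_map_kern() based on buffer alignment */
	ret = blk_rq_map_kern(q, rq, buf, len, GFP_KERNEL);
	if (ret)
		goto out;

	blk_execute_rq(q, NULL, rq, 0);	/* waits for completion */
out:
	blk_put_request(rq);
	return ret;
}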
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 17713d7d98d5..f0b0bae075a0 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,7 +105,7 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
struct bio *bio, struct bio_set *bs, unsigned *nsegs)
{
- *nsegs = 1;
+ *nsegs = 0;
if (!q->limits.max_write_zeroes_sectors)
return NULL;
@@ -132,51 +132,83 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}
+/*
+ * Return the maximum number of sectors from the start of a bio that may be
+ * submitted as a single request to a block device. If enough sectors remain,
+ * align the end to the physical block size. Otherwise align the end to the
+ * logical block size. This approach minimizes the number of non-aligned
+ * requests that are submitted to a block device if the start of a bio is not
+ * aligned to a physical block boundary.
+ */
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
- unsigned mask = queue_logical_block_size(q) - 1;
+ unsigned max_sectors = sectors;
+ unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
+ unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
+ unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
- /* aligned to logical block size */
- sectors &= ~(mask >> 9);
+ max_sectors += start_offset;
+ max_sectors &= ~(pbs - 1);
+ if (max_sectors > start_offset)
+ return max_sectors - start_offset;
- return sectors;
+	return sectors & ~(lbs - 1);
}
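/*
 * Worked example with made-up numbers (not part of this patch): assume a
 * 4 KiB physical block size (pbs == 8 sectors), bi_sector == 1003 and
 * blk_max_size_offset() == 128:
 *
 *   start_offset = 1003 & (8 - 1)        =   3
 *   max_sectors  = (128 + 3) & ~(8 - 1)  = 128
 *
 * 128 > 3, so 128 - 3 = 125 sectors are returned and the first part of
 * the bio ends at sector 1128, i.e. on a physical block boundary.
 */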
-static unsigned get_max_segment_size(struct request_queue *q,
- unsigned offset)
+static inline unsigned get_max_segment_size(const struct request_queue *q,
+ struct page *start_page,
+ unsigned long offset)
{
unsigned long mask = queue_segment_boundary(q);
- /* default segment boundary mask means no boundary limit */
- if (mask == BLK_SEG_BOUNDARY_MASK)
- return queue_max_segment_size(q);
+ offset = mask & (page_to_phys(start_page) + offset);
- return min_t(unsigned long, mask - (mask & offset) + 1,
- queue_max_segment_size(q));
+ /*
+	 * The mask - offset + 1 calculation below may overflow to zero when
+	 * the page physical address is zero on a 32-bit arch; fall back to
+	 * the queue's max segment size when that happens.
+ */
+ return min_not_zero(mask - offset + 1,
+ (unsigned long)queue_max_segment_size(q));
}
-/*
- * Split the bvec @bv into segments, and update all kinds of
- * variables.
+/**
+ * bvec_split_segs - verify whether or not a bvec should be split in the middle
+ * @q: [in] request queue associated with the bio associated with @bv
+ * @bv: [in] bvec to examine
+ * @nsegs: [in,out] Number of segments in the bio being built. Incremented
+ * by the number of segments from @bv that may be appended to that
+ * bio without exceeding @max_segs
+ * @sectors: [in,out] Number of sectors in the bio being built. Incremented
+ * by the number of sectors from @bv that may be appended to that
+ * bio without exceeding @max_sectors
+ * @max_segs: [in] upper bound for *@nsegs
+ * @max_sectors: [in] upper bound for *@sectors
+ *
+ * When splitting a bio, it can happen that a bvec is encountered that is too
+ * big to fit in a single segment and hence that it has to be split in the
+ * middle. This function verifies whether or not that should happen. The value
+ * %true is returned if and only if appending the entire @bv to a bio with
+ * *@nsegs segments and *@sectors sectors would make that bio unacceptable for
+ * the block driver.
*/
-static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
- unsigned *nsegs, unsigned *sectors, unsigned max_segs)
+static bool bvec_split_segs(const struct request_queue *q,
+ const struct bio_vec *bv, unsigned *nsegs,
+ unsigned *sectors, unsigned max_segs,
+ unsigned max_sectors)
{
- unsigned len = bv->bv_len;
+ unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
- unsigned new_nsegs = 0, seg_size = 0;
+ unsigned seg_size = 0;
- /*
- * Multi-page bvec may be too big to hold in one segment, so the
- * current bvec has to be splitted as multiple segments.
- */
- while (len && new_nsegs + *nsegs < max_segs) {
- seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
+ while (len && *nsegs < max_segs) {
+ seg_size = get_max_segment_size(q, bv->bv_page,
+ bv->bv_offset + total_len);
seg_size = min(seg_size, len);
- new_nsegs++;
+ (*nsegs)++;
total_len += seg_size;
len -= seg_size;
@@ -184,16 +216,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
break;
}
- if (new_nsegs) {
- *nsegs += new_nsegs;
- if (sectors)
- *sectors += total_len >> 9;
- }
+ *sectors += total_len >> 9;
- /* split in the middle of the bvec if len != 0 */
- return !!len;
+ /* tell the caller to split the bvec if it is too big to fit */
+ return len > 0 || bv->bv_len > max_len;
}
+/**
+ * blk_bio_segment_split - split a bio in two bios
+ * @q: [in] request queue pointer
+ * @bio: [in] bio to be split
+ * @bs: [in] bio set to allocate the clone from
+ * @segs: [out] number of segments in the bio with the first half of the sectors
+ *
+ * Clone @bio, update the bi_iter of the clone to represent the first sectors
+ * of @bio and update @bio->bi_iter to represent the remaining sectors. The
+ * following is guaranteed for the cloned bio:
+ * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most queue_max_segments(@q) segments.
+ *
+ * Except for discard requests the cloned bio will point at the bi_io_vec of
+ * the original bio. It is the responsibility of the caller to ensure that the
+ * original bio is not freed before the cloned bio. The caller is also
+ * responsible for ensuring that @bs is only destroyed after processing of the
+ * split bio has finished.
+ */
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -202,8 +249,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, sectors = 0;
- bool do_split = true;
- struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
const unsigned max_segs = queue_max_segments(q);
@@ -215,108 +260,119 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
- if (sectors + (bv.bv_len >> 9) > max_sectors) {
- /*
- * Consider this a new segment if we're splitting in
- * the middle of this vector.
- */
- if (nsegs < max_segs &&
- sectors < max_sectors) {
- /* split in the middle of bvec */
- bv.bv_len = (max_sectors - sectors) << 9;
- bvec_split_segs(q, &bv, &nsegs,
- &sectors, max_segs);
- }
+ if (nsegs < max_segs &&
+ sectors + (bv.bv_len >> 9) <= max_sectors &&
+ bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ nsegs++;
+ sectors += bv.bv_len >> 9;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
+ max_sectors)) {
goto split;
}
- if (nsegs == max_segs)
- goto split;
-
bvprv = bv;
bvprvp = &bvprv;
-
- if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
- nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
- max_segs)) {
- goto split;
- }
}
- do_split = false;
+ *segs = nsegs;
+ return NULL;
split:
*segs = nsegs;
-
- if (do_split) {
- new = bio_split(bio, sectors, GFP_NOIO, bs);
- if (new)
- bio = new;
- }
-
- return do_split ? new : NULL;
+ return bio_split(bio, sectors, GFP_NOIO, bs);
}
-void blk_queue_split(struct request_queue *q, struct bio **bio)
+/**
+ * __blk_queue_split - split a bio and submit the second half
+ * @q: [in] request queue pointer
+ * @bio: [in, out] bio to be split
+ * @nr_segs: [out] number of segments in the first bio
+ *
+ * Split a bio into two bios, chain the two bios, submit the second half and
+ * store a pointer to the first half in *@bio. If the second bio is still too
+ * big it will be split by a recursive call to this function. Since this
+ * function may allocate a new bio from @q->bio_split, it is the responsibility
+ * of the caller to ensure that @q is only released after processing of the
+ * split bio has finished.
+ */
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs)
{
- struct bio *split, *res;
- unsigned nsegs;
+ struct bio *split = NULL;
switch (bio_op(*bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
+ nr_segs);
break;
case REQ_OP_WRITE_SAME:
- split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_write_same_split(q, *bio, &q->bio_split,
+ nr_segs);
break;
default:
- split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
+ /*
+		 * All drivers must accept single-segment bios that are <=
+ * PAGE_SIZE. This is a quick and dirty check that relies on
+ * the fact that bi_io_vec[0] is always valid if a bio has data.
+ * The check might lead to occasional false negatives when bios
+ * are cloned, but compared to the performance impact of cloned
+ * bios themselves the loop below doesn't matter anyway.
+ */
+ if (!q->limits.chunk_sectors &&
+ (*bio)->bi_vcnt == 1 &&
+ ((*bio)->bi_io_vec[0].bv_len +
+ (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
+ *nr_segs = 1;
+ break;
+ }
+ split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
}
- /* physical segments can be figured out during splitting */
- res = split ? split : *bio;
- res->bi_phys_segments = nsegs;
- bio_set_flag(res, BIO_SEG_VALID);
-
if (split) {
 		/* there is no chance to merge the split bio */
split->bi_opf |= REQ_NOMERGE;
- /*
- * Since we're recursing into make_request here, ensure
- * that we mark this bio as already having entered the queue.
- * If not, and the queue is going away, we can get stuck
- * forever on waiting for the queue reference to drop. But
- * that will never happen, as we're already holding a
- * reference to it.
- */
- bio_set_flag(*bio, BIO_QUEUE_ENTERED);
-
bio_chain(split, *bio);
trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
generic_make_request(*bio);
*bio = split;
}
}
+
+/**
+ * blk_queue_split - split a bio and submit the second half
+ * @q: [in] request queue pointer
+ * @bio: [in, out] bio to be split
+ *
+ * Split a bio into two bios, chain the two bios, submit the second half and
+ * store a pointer to the first half in *@bio. Since this function may allocate
+ * a new bio from @q->bio_split, it is the responsibility of the caller to
+ * ensure that @q is only released after processing of the split bio has
+ * finished.
+ */
+void blk_queue_split(struct request_queue *q, struct bio **bio)
+{
+ unsigned int nr_segs;
+
+ __blk_queue_split(q, bio, &nr_segs);
+}
EXPORT_SYMBOL(blk_queue_split);
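/*
 * Sketch of how a bio-based driver might use blk_queue_split() at the top of
 * its make_request function (not part of this patch). The function name is
 * made up for illustration; blk_queue_split(), bio_endio() and BLK_QC_T_NONE
 * are existing block-layer interfaces.
 */
static blk_qc_t example_make_request(struct request_queue *q, struct bio *bio)
{
	/* honour the queue limits before looking at the bio */
	blk_queue_split(q, &bio);

	/* ... process the (possibly shortened) bio ... */
	bio_endio(bio);
	return BLK_QC_T_NONE;
}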
-static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
- struct bio *bio)
+unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- struct bvec_iter iter;
+ unsigned int nr_sectors = 0;
+ struct req_iterator iter;
struct bio_vec bv;
- if (!bio)
+ if (!rq->bio)
return 0;
- switch (bio_op(bio)) {
+ switch (bio_op(rq->bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
@@ -325,30 +381,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
return 1;
}
- for_each_bio(bio) {
- bio_for_each_bvec(bv, bio, iter)
- bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX);
- }
-
+ rq_for_each_bvec(bv, rq, iter)
+ bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
-void blk_recalc_rq_segments(struct request *rq)
-{
- rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
-}
-
-void blk_recount_segments(struct request_queue *q, struct bio *bio)
-{
- struct bio *nxt = bio->bi_next;
-
- bio->bi_next = NULL;
- bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
- bio->bi_next = nxt;
-
- bio_set_flag(bio, BIO_SEG_VALID);
-}
-
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
@@ -375,7 +413,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
while (nbytes > 0) {
unsigned offset = bvec->bv_offset + total;
- unsigned len = min(get_max_segment_size(q, offset), nbytes);
+ unsigned len = min(get_max_segment_size(q, bvec->bv_page,
+ offset), nbytes);
struct page *page = bvec->bv_page;
/*
@@ -470,44 +509,20 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
* map a request to scatterlist, return number of sg entries setup. Caller
* must make sure sg can hold rq->nr_phys_segments entries
*/
-int blk_rq_map_sg(struct request_queue *q, struct request *rq,
- struct scatterlist *sglist)
+int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
+ struct scatterlist *sglist, struct scatterlist **last_sg)
{
- struct scatterlist *sg = NULL;
int nsegs = 0;
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
- nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, &sg);
+ nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
- nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, &sg);
+ nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
- nsegs = __blk_bios_map_sg(q, rq->bio, sglist, &sg);
+ nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
- if (unlikely(rq->rq_flags & RQF_COPY_USER) &&
- (blk_rq_bytes(rq) & q->dma_pad_mask)) {
- unsigned int pad_len =
- (q->dma_pad_mask & ~blk_rq_bytes(rq)) + 1;
-
- sg->length += pad_len;
- rq->extra_len += pad_len;
- }
-
- if (q->dma_drain_size && q->dma_drain_needed(rq)) {
- if (op_is_write(req_op(rq)))
- memset(q->dma_drain_buffer, 0, q->dma_drain_size);
-
- sg_unmark_end(sg);
- sg = sg_next(sg);
- sg_set_page(sg, virt_to_page(q->dma_drain_buffer),
- q->dma_drain_size,
- ((unsigned long)q->dma_drain_buffer) &
- (PAGE_SIZE - 1));
- nsegs++;
- rq->extra_len += q->dma_drain_size;
- }
-
- if (sg)
- sg_mark_end(sg);
+ if (*last_sg)
+ sg_mark_end(*last_sg);
/*
* Something must have been wrong if the figured number of
@@ -517,18 +532,15 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
return nsegs;
}
-EXPORT_SYMBOL(blk_rq_map_sg);
+EXPORT_SYMBOL(__blk_rq_map_sg);
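/*
 * Sketch of typical request-based driver usage (not part of this patch).
 * blk_rq_map_sg() is the long-standing entry point, presumably kept as a
 * wrapper around __blk_rq_map_sg(); "sgl" is assumed to have been allocated
 * with room for queue_max_segments() entries, e.g. via sg_alloc_table().
 */
static int example_prep_sg(struct request *rq, struct scatterlist *sgl)
{
	int nents = blk_rq_map_sg(rq->q, rq, sgl);

	if (!nents)
		return -EIO;
	/* nents <= rq->nr_phys_segments; hand sgl/nents to dma_map_sg() */
	return nents;
}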
-static inline int ll_new_hw_segment(struct request_queue *q,
- struct request *req,
- struct bio *bio)
+static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
+ unsigned int nr_phys_segs)
{
- int nr_phys_segs = bio_phys_segments(q, bio);
-
- if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+ if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q))
goto no_merge;
- if (blk_integrity_merge_bio(q, req, bio) == false)
+ if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
/*
@@ -539,51 +551,44 @@ static inline int ll_new_hw_segment(struct request_queue *q,
return 1;
no_merge:
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio)
+int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
if (req_gap_back_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
integrity_req_gap_back_merge(req, bio))
return 0;
+ if (!bio_crypt_ctx_back_mergeable(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
- if (!bio_flagged(req->biotail, BIO_SEG_VALID))
- blk_recount_segments(q, req->biotail);
- if (!bio_flagged(bio, BIO_SEG_VALID))
- blk_recount_segments(q, bio);
- return ll_new_hw_segment(q, req, bio);
+ return ll_new_hw_segment(req, bio, nr_segs);
}
-int ll_front_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio)
+int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
-
if (req_gap_front_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
integrity_req_gap_front_merge(req, bio))
return 0;
+ if (!bio_crypt_ctx_front_mergeable(req, bio))
+ return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
- if (!bio_flagged(bio, BIO_SEG_VALID))
- blk_recount_segments(q, bio);
- if (!bio_flagged(req->bio, BIO_SEG_VALID))
- blk_recount_segments(q, req->bio);
- return ll_new_hw_segment(q, req, bio);
+ return ll_new_hw_segment(req, bio, nr_segs);
}
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
@@ -626,6 +631,9 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
if (blk_integrity_merge_rq(q, req, next) == false)
return 0;
+ if (!bio_crypt_ctx_merge_rq(req, next))
+ return 0;
+
/* Merge is OK... */
req->nr_phys_segments = total_phys_segments;
return 1;
@@ -661,20 +669,17 @@ void blk_rq_set_mixed_merge(struct request *rq)
rq->rq_flags |= RQF_MIXED_MERGE;
}
-static void blk_account_io_merge(struct request *req)
+static void blk_account_io_merge_request(struct request *req)
{
if (blk_do_io_stat(req)) {
- struct hd_struct *part;
-
part_stat_lock();
- part = req->part;
-
- part_dec_in_flight(req->q, part, rq_data_dir(req));
-
- hd_struct_put(part);
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
part_stat_unlock();
+
+ hd_struct_put(req->part);
}
}
+
/*
* Two cases of handling DISCARD merge:
* If max_discard_segments > 1, the driver takes every bio
@@ -786,7 +791,7 @@ static struct request *attempt_merge(struct request_queue *q,
/*
* 'next' is going away, so update stats accordingly
*/
- blk_account_io_merge(next);
+ blk_account_io_merge_request(next);
/*
* ownership of bio passed from next to req, return 'next' for
@@ -850,6 +855,10 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (blk_integrity_merge_bio(rq->q, rq, bio) == false)
return false;
+ /* Only merge if the crypt contexts are compatible */
+ if (!bio_crypt_rq_ctx_compatible(rq, bio))
+ return false;
+
/* must be using the same buffer */
if (req_op(rq) == REQ_OP_WRITE_SAME &&
!blk_write_same_mergeable(rq->bio, bio))
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index f945621a0e8f..0157f2b3485a 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -15,10 +15,10 @@
#include "blk.h"
#include "blk-mq.h"
-static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
- unsigned int nr_queues, const int cpu)
+static int queue_index(struct blk_mq_queue_map *qmap,
+ unsigned int nr_queues, const int q)
{
- return qmap->queue_offset + (cpu % nr_queues);
+ return qmap->queue_offset + (q % nr_queues);
}
static int get_first_sibling(unsigned int cpu)
@@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
unsigned int *map = qmap->mq_map;
unsigned int nr_queues = qmap->nr_queues;
- unsigned int cpu, first_sibling;
+ unsigned int cpu, first_sibling, q = 0;
+
+ for_each_possible_cpu(cpu)
+ map[cpu] = -1;
+
+ /*
+	 * Spread queues among the present CPUs first, to minimize the
+	 * number of dead queues that end up mapped only to non-present CPUs.
+ */
+ for_each_present_cpu(cpu) {
+ if (q >= nr_queues)
+ break;
+ map[cpu] = queue_index(qmap, nr_queues, q++);
+ }
for_each_possible_cpu(cpu) {
+ if (map[cpu] != -1)
+ continue;
/*
* First do sequential mapping between CPUs and queues.
* In case we still have CPUs to map, and we have some number of
 		 * threads per core, then map sibling threads to the same queue
* for performance optimizations.
*/
- if (cpu < nr_queues) {
- map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
+ if (q < nr_queues) {
+ map[cpu] = queue_index(qmap, nr_queues, q++);
} else {
first_sibling = get_first_sibling(cpu);
if (first_sibling == cpu)
- map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
+ map[cpu] = queue_index(qmap, nr_queues, q++);
else
map[cpu] = map[first_sibling];
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3afe327f816f..e0b2bc131bf5 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -17,7 +17,7 @@
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
{
if (stat->nr_samples) {
- seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+ seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
stat->nr_samples, stat->mean, stat->min, stat->max);
} else {
seq_puts(m, "samples=0");
@@ -29,13 +29,13 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data;
int bucket;
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
- seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket));
- print_stat(m, &q->poll_stat[2*bucket]);
+ for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
+ seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
+ print_stat(m, &q->poll_stat[2 * bucket]);
seq_puts(m, "\n");
- seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket));
- print_stat(m, &q->poll_stat[2*bucket+1]);
+ seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
+ print_stat(m, &q->poll_stat[2 * bucket + 1]);
seq_puts(m, "\n");
}
return 0;
@@ -125,6 +125,9 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(REGISTERED),
QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
+ QUEUE_FLAG_NAME(PCI_P2PDMA),
+ QUEUE_FLAG_NAME(ZONE_RESETALL),
+ QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
};
#undef QUEUE_FLAG_NAME
@@ -213,6 +216,7 @@ static const char *const hctx_state_name[] = {
HCTX_STATE_NAME(STOPPED),
HCTX_STATE_NAME(TAG_ACTIVE),
HCTX_STATE_NAME(SCHED_RESTART),
+ HCTX_STATE_NAME(INACTIVE),
};
#undef HCTX_STATE_NAME
@@ -239,6 +243,7 @@ static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(TAG_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
+ HCTX_FLAG_NAME(STACKING),
};
#undef HCTX_FLAG_NAME
@@ -261,23 +266,6 @@ static int hctx_flags_show(void *data, struct seq_file *m)
return 0;
}
-#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
-static const char *const op_name[] = {
- REQ_OP_NAME(READ),
- REQ_OP_NAME(WRITE),
- REQ_OP_NAME(FLUSH),
- REQ_OP_NAME(DISCARD),
- REQ_OP_NAME(SECURE_ERASE),
- REQ_OP_NAME(ZONE_RESET),
- REQ_OP_NAME(WRITE_SAME),
- REQ_OP_NAME(WRITE_ZEROES),
- REQ_OP_NAME(SCSI_IN),
- REQ_OP_NAME(SCSI_OUT),
- REQ_OP_NAME(DRV_IN),
- REQ_OP_NAME(DRV_OUT),
-};
-#undef REQ_OP_NAME
-
#define CMD_FLAG_NAME(name) [__REQ_##name] = #name
static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(FAILFAST_DEV),
@@ -309,7 +297,6 @@ static const char *const rqf_name[] = {
RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
RQF_NAME(PREEMPT),
- RQF_NAME(COPY_USER),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
RQF_NAME(ELVPRIV),
@@ -341,13 +328,14 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
- const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
+ const unsigned int op = req_op(rq);
+ const char *op_str = blk_op_str(op);
seq_printf(m, "%p {.op=", rq);
- if (op < ARRAY_SIZE(op_name) && op_name[op])
- seq_printf(m, "%s", op_name[op]);
+ if (strcmp(op_str, "UNKNOWN") == 0)
+ seq_printf(m, "%u", op);
else
- seq_printf(m, "%d", op);
+ seq_printf(m, "%s", op_str);
seq_puts(m, ", .cmd_flags=");
blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
ARRAY_SIZE(cmd_flag_name));
@@ -779,8 +767,8 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file)
if (attr->show)
return single_release(inode, file);
- else
- return seq_release(inode, file);
+
+ return seq_release(inode, file);
}
static const struct file_operations blk_mq_debugfs_fops = {
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..fdcc2c1dd178 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -80,16 +80,22 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
blk_mq_run_hw_queue(hctx, true);
}
+#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
+
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
LIST_HEAD(rq_list);
+ int ret = 0;
do {
struct request *rq;
@@ -97,12 +103,25 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!blk_mq_get_dispatch_budget(hctx))
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -113,6 +132,8 @@ static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
*/
list_add(&rq->queuelist, &rq_list);
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+
+ return ret;
}
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -130,16 +151,25 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
* restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
*/
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+ int ret = 0;
do {
struct request *rq;
+ if (!list_empty_careful(&hctx->dispatch)) {
+ ret = -EAGAIN;
+ break;
+ }
+
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
@@ -149,6 +179,14 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
blk_mq_put_dispatch_budget(hctx);
+ /*
+ * We're releasing without dispatching. Holding the
+ * budget could have blocked any "hctx"s with the
+ * same queue and if we didn't dispatch then there's
+ * no guarantee anyone will kick the queue. Kick it
+ * ourselves.
+ */
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
break;
}
@@ -165,21 +203,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
WRITE_ONCE(hctx->dispatch_from, ctx);
+ return ret;
}
-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+ int ret = 0;
LIST_HEAD(rq_list);
- /* RCU or SRCU read lock is needed before checking quiesced flag */
- if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
- return;
-
- hctx->run++;
-
/*
* If we have previous entries on our dispatch list, grab them first for
* more fair dispatch.
@@ -208,23 +242,45 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
blk_mq_sched_mark_restart_hctx(hctx);
if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
if (has_sched_dispatch)
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
else
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
}
} else if (has_sched_dispatch) {
- blk_mq_do_dispatch_sched(hctx);
+ ret = blk_mq_do_dispatch_sched(hctx);
} else if (hctx->dispatch_busy) {
/* dequeue request one by one from sw queue if queue is busy */
- blk_mq_do_dispatch_ctx(hctx);
+ ret = blk_mq_do_dispatch_ctx(hctx);
} else {
blk_mq_flush_busy_ctxs(hctx, &rq_list);
blk_mq_dispatch_rq_list(q, &rq_list, false);
}
+
+ return ret;
+}
+
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+
+ /* RCU or SRCU read lock is needed before checking quiesced flag */
+ if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+ return;
+
+ hctx->run++;
+
+ /*
+ * A return of -EAGAIN is an indication that hctx->dispatch is not
+ * empty and we must run again in order to avoid starving flushes.
+ */
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+ if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+ blk_mq_run_hw_queue(hctx, true);
+ }
}
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- struct request **merged_request)
+ unsigned int nr_segs, struct request **merged_request)
{
struct request *rq;
@@ -232,7 +288,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_back_merge(q, rq, bio))
+ if (!bio_attempt_back_merge(rq, bio, nr_segs))
return false;
*merged_request = attempt_back_merge(q, rq);
if (!*merged_request)
@@ -241,7 +297,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_FRONT_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_front_merge(q, rq, bio))
+ if (!bio_attempt_front_merge(rq, bio, nr_segs))
return false;
*merged_request = attempt_front_merge(q, rq);
if (!*merged_request)
@@ -260,7 +316,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
* of them.
*/
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio)
+ struct bio *bio, unsigned int nr_segs)
{
struct request *rq;
int checked = 8;
@@ -277,11 +333,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(q, rq, bio);
+ merged = bio_attempt_back_merge(rq, bio,
+ nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(q, rq, bio);
+ merged = bio_attempt_front_merge(rq, bio,
+ nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
@@ -304,13 +362,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
*/
static bool blk_mq_attempt_merge(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct bio *bio)
+ struct blk_mq_ctx *ctx, struct bio *bio,
+ unsigned int nr_segs)
{
enum hctx_type type = hctx->type;
lockdep_assert_held(&ctx->lock);
- if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
+ if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
return true;
}
@@ -318,7 +377,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
return false;
}
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
@@ -326,21 +386,18 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
bool ret = false;
enum hctx_type type;
- if (e && e->type->ops.bio_merge) {
- blk_mq_put_ctx(ctx);
- return e->type->ops.bio_merge(hctx, bio);
- }
+ if (e && e->type->ops.bio_merge)
+ return e->type->ops.bio_merge(hctx, bio, nr_segs);
type = hctx->type;
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
!list_empty_careful(&ctx->rq_lists[type])) {
/* default per sw-queue merge */
spin_lock(&ctx->lock);
- ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
+ ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
spin_unlock(&ctx->lock);
}
- blk_mq_put_ctx(ctx);
return ret;
}
@@ -360,13 +417,19 @@ static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
bool has_sched,
struct request *rq)
{
- /* dispatch flush rq directly */
- if (rq->rq_flags & RQF_FLUSH_SEQ) {
- spin_lock(&hctx->lock);
- list_add(&rq->queuelist, &hctx->dispatch);
- spin_unlock(&hctx->lock);
+ /*
+	 * Dispatch flush and passthrough requests directly.
+	 *
+	 * A passthrough request has to be added to hctx->dispatch directly:
+	 * the device may be in a state in which it cannot handle FS requests,
+	 * so BLK_STS_RESOURCE is returned for them and they are added to
+	 * hctx->dispatch, while a passthrough request may be exactly what is
+	 * needed to get out of that state. If the passthrough request were
+	 * added to the scheduler queue instead, it would never be dispatched,
+	 * because requests in hctx->dispatch are prioritized over it.
+ */
+ if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
return true;
- }
if (has_sched)
rq->rq_flags |= RQF_SORTED;
@@ -390,8 +453,32 @@ void blk_mq_sched_insert_request(struct request *rq, bool at_head,
WARN_ON(e && (rq->tag != -1));
- if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
+ if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
+ /*
+		 * First, normal IO requests are inserted into the scheduler or
+		 * software queues, while flush requests are added directly to
+		 * the dispatch queue (hctx->dispatch). Since there is at most
+		 * one in-flight flush request per hw queue, it makes little
+		 * difference whether a flush is added to the head or the tail
+		 * of the dispatch queue.
+		 *
+		 * Second, with NCQ a flush is a non-NCQ command and cannot be
+		 * queued while any normal IO request (NCQ command) is in
+		 * flight. Adding the flush rq to the front of hctx->dispatch
+		 * tends to add latency to that flush (because of
+		 * BLK_MQ_S_SCHED_RESTART) compared with adding it to the tail,
+		 * which in turn increases the chance of flush merging, so
+		 * fewer flush requests are issued to the controller. Roughly
+		 * 10% of the runtime of blktests block/004 is saved on a disk
+		 * behind an AHCI/NCQ drive when flush rqs are added to the
+		 * front of hctx->dispatch.
+		 *
+		 * So simply queue flush rqs to the front of hctx->dispatch so
+		 * that flush-intensive workloads can benefit on NCQ hardware.
+ */
+ at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
+ blk_mq_request_bypass_insert(rq, at_head, false);
goto run;
+ }
if (e && e->type->ops.insert_requests) {
LIST_HEAD(list);
@@ -554,8 +641,6 @@ void blk_mq_sched_free_requests(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- lockdep_assert_held(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3cf92cbbd8ac..126021fc3a11 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq);
void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+ unsigned int nr_segs, struct request **merged_request);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
@@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q);
static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
{
if (blk_queue_nomerges(q) || !bio_mergeable(bio))
return false;
- return __blk_mq_sched_bio_merge(q, bio);
+ return __blk_mq_sched_bio_merge(q, bio, nr_segs);
}
static inline bool
@@ -59,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
e->type->ops.completed_request(rq, now);
}
-static inline void blk_mq_sched_started_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.started_request)
- e->type->ops.started_request(rq);
-}
-
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d6e1a9bd7131..062229395a50 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -74,10 +74,8 @@ static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
if (!entry->show)
return -EIO;
- res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->show(ctx, page);
+ res = entry->show(ctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
@@ -97,10 +95,8 @@ static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
if (!entry->store)
return -EIO;
- res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->store(ctx, page, length);
+ res = entry->store(ctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
@@ -120,10 +116,8 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
if (!entry->show)
return -EIO;
- res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->show(hctx, page);
+ res = entry->show(hctx, page);
mutex_unlock(&q->sysfs_lock);
return res;
}
@@ -144,10 +138,8 @@ static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
if (!entry->store)
return -EIO;
- res = -ENOENT;
mutex_lock(&q->sysfs_lock);
- if (!blk_queue_dying(q))
- res = entry->store(hctx, page, length);
+ res = entry->store(hctx, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
@@ -166,20 +158,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx,
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
+ const size_t size = PAGE_SIZE - 1;
unsigned int i, first = 1;
- ssize_t ret = 0;
+ int ret = 0, pos = 0;
for_each_cpu(i, hctx->cpumask) {
if (first)
- ret += sprintf(ret + page, "%u", i);
+ ret = snprintf(pos + page, size - pos, "%u", i);
else
- ret += sprintf(ret + page, ", %u", i);
+ ret = snprintf(pos + page, size - pos, ", %u", i);
+
+ if (ret >= size - pos)
+ break;
first = 0;
+ pos += ret;
}
- ret += sprintf(ret + page, "\n");
- return ret;
+ ret = snprintf(pos + page, size + 1 - pos, "\n");
+ return pos + ret;
}
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
@@ -270,7 +267,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_dir_lock);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -320,7 +317,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
int ret, i;
WARN_ON_ONCE(!q->kobj.parent);
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_dir_lock);
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -349,23 +346,12 @@ unreg:
return ret;
}
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
-{
- int ret;
-
- mutex_lock(&q->sysfs_lock);
- ret = __blk_mq_register_dev(dev, q);
- mutex_unlock(&q->sysfs_lock);
-
- return ret;
-}
-
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -373,7 +359,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -381,7 +367,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -392,7 +378,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..ae722f8b13fb 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -10,18 +10,11 @@
#include <linux/module.h>
#include <linux/blk-mq.h>
+#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
-bool blk_mq_has_free_tags(struct blk_mq_tags *tags)
-{
- if (!tags)
- return true;
-
- return sbitmap_any_bit_clear(&tags->bitmap_tags.sb);
-}
-
/*
* If a previously inactive queue goes active, bump the active user count.
* We need to do this before try to allocate driver tag, then even if fail
@@ -99,7 +92,7 @@ static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
{
if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
!hctx_may_queue(data->hctx, bt))
- return -1;
+ return BLK_MQ_NO_TAG;
if (data->shallow_depth)
return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
@@ -113,13 +106,12 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
struct sbq_wait_state *ws;
DEFINE_SBQ_WAIT(wait);
unsigned int tag_offset;
- bool drop_ctx;
int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) {
if (unlikely(!tags->nr_reserved_tags)) {
WARN_ON_ONCE(1);
- return BLK_MQ_TAG_FAIL;
+ return BLK_MQ_NO_TAG;
}
bt = &tags->breserved_tags;
tag_offset = 0;
@@ -129,14 +121,13 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
}
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
goto found_tag;
if (data->flags & BLK_MQ_REQ_NOWAIT)
- return BLK_MQ_TAG_FAIL;
+ return BLK_MQ_NO_TAG;
ws = bt_wait_ptr(bt, data->hctx);
- drop_ctx = data->ctx == NULL;
do {
struct sbitmap_queue *bt_prev;
@@ -152,18 +143,15 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* as running the queue may also have found completions.
*/
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
break;
sbitmap_prepare_to_wait(bt, ws, &wait, TASK_UNINTERRUPTIBLE);
tag = __blk_mq_get_tag(data, bt);
- if (tag != -1)
+ if (tag != BLK_MQ_NO_TAG)
break;
- if (data->ctx)
- blk_mq_put_ctx(data->ctx);
-
bt_prev = bt;
io_schedule();
@@ -189,17 +177,49 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
- if (drop_ctx && data->ctx)
- blk_mq_put_ctx(data->ctx);
-
sbitmap_finish_wait(bt, ws, &wait);
found_tag:
+ /*
+ * Give up this allocation if the hctx is inactive. The caller will
+ * retry on an active hctx.
+ */
+ if (unlikely(test_bit(BLK_MQ_S_INACTIVE, &data->hctx->state))) {
+ blk_mq_put_tag(tags, data->ctx, tag + tag_offset);
+ return BLK_MQ_NO_TAG;
+ }
return tag + tag_offset;
}
-void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
- struct blk_mq_ctx *ctx, unsigned int tag)
+bool __blk_mq_get_driver_tag(struct request *rq)
+{
+ struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+ bool shared = blk_mq_tag_busy(rq->mq_hctx);
+ int tag;
+
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+ bt = &rq->mq_hctx->tags->breserved_tags;
+ tag_offset = 0;
+ }
+
+ if (!hctx_may_queue(rq->mq_hctx, bt))
+ return false;
+ tag = __sbitmap_queue_get(bt);
+ if (tag == BLK_MQ_NO_TAG)
+ return false;
+
+ rq->tag = tag + tag_offset;
+ if (shared) {
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
+ atomic_inc(&rq->mq_hctx->nr_active);
+ }
+ rq->mq_hctx->tags->rqs[rq->tag] = rq;
+ return true;
+}
+
+void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
+ unsigned int tag)
{
if (!blk_mq_tag_is_reserved(tags, tag)) {
const int real_tag = tag - tags->nr_reserved_tags;
@@ -271,14 +291,18 @@ struct bt_tags_iter_data {
struct blk_mq_tags *tags;
busy_tag_iter_fn *fn;
void *data;
- bool reserved;
+ unsigned int flags;
};
+#define BT_TAG_ITER_RESERVED (1 << 0)
+#define BT_TAG_ITER_STARTED (1 << 1)
+#define BT_TAG_ITER_STATIC_RQS (1 << 2)
+
static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
- bool reserved = iter_data->reserved;
+ bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
if (!reserved)
@@ -286,13 +310,18 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
/*
* We can hit rq == NULL here, because the tagging functions
- * test and set the bit before assining ->rqs[].
+ * test and set the bit before assigning ->rqs[].
*/
- rq = tags->rqs[bitnr];
- if (rq && blk_mq_request_started(rq))
- return iter_data->fn(rq, iter_data->data, reserved);
-
- return true;
+ if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
+ rq = tags->static_rqs[bitnr];
+ else
+ rq = tags->rqs[bitnr];
+ if (!rq)
+ return true;
+ if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
+ !blk_mq_request_started(rq))
+ return true;
+ return iter_data->fn(rq, iter_data->data, reserved);
}
/**
@@ -305,39 +334,49 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved) where rq is a pointer to a request. Return true
* to continue iterating tags, false to stop.
* @data: Will be passed as second argument to @fn.
- * @reserved: Indicates whether @bt is the breserved_tags member or the
- * bitmap_tags member of struct blk_mq_tags.
+ * @flags: BT_TAG_ITER_*
*/
static void bt_tags_for_each(struct blk_mq_tags *tags, struct sbitmap_queue *bt,
- busy_tag_iter_fn *fn, void *data, bool reserved)
+ busy_tag_iter_fn *fn, void *data, unsigned int flags)
{
struct bt_tags_iter_data iter_data = {
.tags = tags,
.fn = fn,
.data = data,
- .reserved = reserved,
+ .flags = flags,
};
if (tags->rqs)
sbitmap_for_each_set(&bt->sb, bt_tags_iter, &iter_data);
}
+static void __blk_mq_all_tag_iter(struct blk_mq_tags *tags,
+ busy_tag_iter_fn *fn, void *priv, unsigned int flags)
+{
+ WARN_ON_ONCE(flags & BT_TAG_ITER_RESERVED);
+
+ if (tags->nr_reserved_tags)
+ bt_tags_for_each(tags, &tags->breserved_tags, fn, priv,
+ flags | BT_TAG_ITER_RESERVED);
+ bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, flags);
+}
+
/**
- * blk_mq_all_tag_busy_iter - iterate over all started requests in a tag map
+ * blk_mq_all_tag_iter - iterate over all requests in a tag map
* @tags: Tag map to iterate over.
- * @fn: Pointer to the function that will be called for each started
+ * @fn: Pointer to the function that will be called for each
* request. @fn will be called as follows: @fn(rq, @priv,
* reserved) where rq is a pointer to a request. 'reserved'
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
+ *
+ * Caller has to pass the tag map from which requests are allocated.
*/
-static void blk_mq_all_tag_busy_iter(struct blk_mq_tags *tags,
- busy_tag_iter_fn *fn, void *priv)
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv)
{
- if (tags->nr_reserved_tags)
- bt_tags_for_each(tags, &tags->breserved_tags, fn, priv, true);
- bt_tags_for_each(tags, &tags->bitmap_tags, fn, priv, false);
+ __blk_mq_all_tag_iter(tags, fn, priv, BT_TAG_ITER_STATIC_RQS);
}
/**
@@ -357,11 +396,43 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
for (i = 0; i < tagset->nr_hw_queues; i++) {
if (tagset->tags && tagset->tags[i])
- blk_mq_all_tag_busy_iter(tagset->tags[i], fn, priv);
+ __blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
+ BT_TAG_ITER_STARTED);
}
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
+ void *data, bool reserved)
+{
+ unsigned *count = data;
+
+ if (blk_mq_request_completed(rq))
+ (*count)++;
+ return true;
+}
+
+/**
+ * blk_mq_tagset_wait_completed_request - wait until the completion function
+ *		of every completed request has run
+ * @tagset: Tag set to drain completed request
+ *
+ * Note: This function has to be run after all IO queues have been shut down
+ */
+void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
+{
+ while (true) {
+ unsigned count = 0;
+
+ blk_mq_tagset_busy_iter(tagset,
+ blk_mq_tagset_count_completed_rqs, &count);
+ if (!count)
+ break;
+ msleep(5);
+ }
+}
+EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
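/*
 * Sketch of the intended use (not part of this patch), modelled on how an
 * NVMe-style driver might drain outstanding I/O after shutting down its
 * hardware queues. The example_* names are made up; blk_mq_tagset_busy_iter(),
 * blk_mq_complete_request() and blk_mq_tagset_wait_completed_request() are
 * existing interfaces.
 */
static bool example_cancel_request(struct request *rq, void *data, bool reserved)
{
	blk_mq_complete_request(rq);	/* force the ->complete() path to run */
	return true;			/* keep iterating over started requests */
}

static void example_drain_tagset(struct blk_mq_tag_set *set)
{
	/* hardware is already stopped, so no new completions can arrive */
	blk_mq_tagset_busy_iter(set, example_cancel_request, NULL);
	/* wait until all the forced completions have actually run */
	blk_mq_tagset_wait_completed_request(set);
}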
+
/**
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
index 61deab0b5a5a..2e4ef51cdb32 100644
--- a/block/blk-mq-tag.h
+++ b/block/blk-mq-tag.h
@@ -26,15 +26,16 @@ extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int r
extern void blk_mq_free_tags(struct blk_mq_tags *tags);
extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_hw_ctx *hctx, struct blk_mq_tags *tags,
- struct blk_mq_ctx *ctx, unsigned int tag);
-extern bool blk_mq_has_free_tags(struct blk_mq_tags *tags);
+extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
+ unsigned int tag);
extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
struct blk_mq_tags **tags,
unsigned int depth, bool can_grow);
extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
void *priv);
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv);
static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
struct blk_mq_hw_ctx *hctx)
@@ -45,11 +46,19 @@ static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
}
enum {
- BLK_MQ_TAG_FAIL = -1U,
+ BLK_MQ_NO_TAG = -1U,
BLK_MQ_TAG_MIN = 1,
- BLK_MQ_TAG_MAX = BLK_MQ_TAG_FAIL - 1,
+ BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
};
+bool __blk_mq_get_driver_tag(struct request *rq);
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+ if (rq->tag != BLK_MQ_NO_TAG)
+ return true;
+ return __blk_mq_get_driver_tag(rq);
+}
+
extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index 488341628256..7b8a42c35102 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -16,7 +16,7 @@
* @first_vec: first interrupt vectors to use for queues (usually 0)
*
* This function assumes the virtio device @vdev has at least as many available
- * interrupt vetors as @set has queues. It will then queuery the vector
+ * interrupt vectors as @set has queues. It will then query the vector
  * corresponding to each queue for its affinity mask and build a queue mapping
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..4e0d173beaa3 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -26,10 +26,12 @@
#include <linux/delay.h>
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
+#include <linux/blk-crypto.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -44,12 +46,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
- int ddir, bytes, bucket;
+ int ddir, sectors, bucket;
ddir = rq_data_dir(rq);
- bytes = blk_rq_bytes(rq);
+ sectors = blk_rq_stats_sectors(rq);
- bucket = ddir + 2*(ilog2(bytes) - 9);
+ bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
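/*
 * Worked example with made-up numbers (not part of this patch): a 4 KiB read
 * has blk_rq_stats_sectors() == 8, so ddir == 0 and
 * bucket = 0 + 2 * ilog2(8) = 6, which queue_poll_stat_show() reports as
 * "read (4096 Bytes)".
 */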
@@ -92,7 +94,7 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
struct mq_inflight {
struct hd_struct *part;
- unsigned int *inflight;
+ unsigned int inflight[2];
};
static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
@@ -101,45 +103,29 @@ static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
{
struct mq_inflight *mi = priv;
- /*
- * index[0] counts the specific partition that was asked for.
- */
if (rq->part == mi->part)
- mi->inflight[0]++;
+ mi->inflight[rq_data_dir(rq)]++;
return true;
}
unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
{
- unsigned inflight[2];
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
+ struct mq_inflight mi = { .part = part };
- inflight[0] = inflight[1] = 0;
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
- return inflight[0];
-}
-
-static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv,
- bool reserved)
-{
- struct mq_inflight *mi = priv;
-
- if (rq->part == mi->part)
- mi->inflight[rq_data_dir(rq)]++;
-
- return true;
+ return mi.inflight[0] + mi.inflight[1];
}
void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
unsigned int inflight[2])
{
- struct mq_inflight mi = { .part = part, .inflight = inflight, };
+ struct mq_inflight mi = { .part = part };
- inflight[0] = inflight[1] = 0;
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight_rw, &mi);
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi);
+ inflight[0] = mi.inflight[0];
+ inflight[1] = mi.inflight[1];
}
void blk_freeze_queue_start(struct request_queue *q)
@@ -275,30 +261,24 @@ void blk_mq_wake_waiters(struct request_queue *q)
blk_mq_tag_wakeup_all(hctx->tags, true);
}
-bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
-{
- return blk_mq_has_free_tags(hctx->tags);
-}
-EXPORT_SYMBOL(blk_mq_can_queue);
-
/*
- * Only need start/end time stamping if we have stats enabled, or using
- * an IO scheduler.
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, unsigned int op)
+ unsigned int tag, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
req_flags_t rq_flags = 0;
if (data->flags & BLK_MQ_REQ_INTERNAL) {
- rq->tag = -1;
+ rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
@@ -306,7 +286,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
atomic_inc(&data->hctx->nr_active);
}
rq->tag = tag;
- rq->internal_tag = -1;
+ rq->internal_tag = BLK_MQ_NO_TAG;
data->hctx->tags->rqs[rq->tag] = rq;
}
@@ -315,7 +295,7 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
rq->rq_flags = rq_flags;
- rq->cmd_flags = op;
+ rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
if (blk_queue_io_stat(data->q))
@@ -325,17 +305,21 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+ rq->alloc_time_ns = alloc_time_ns;
+#endif
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->io_start_time_ns = 0;
+ rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
+ blk_crypto_rq_set_defaults(rq);
/* tag was already set */
- rq->extra_len = 0;
WRITE_ONCE(rq->deadline, 0);
rq->timeout = 0;
@@ -343,29 +327,37 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
rq->end_io = NULL;
rq->end_io_data = NULL;
- data->ctx->rq_dispatched[op_is_sync(op)]++;
+ data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
refcount_set(&rq->ref, 1);
+
+ if (!op_is_flush(data->cmd_flags)) {
+ struct elevator_queue *e = data->q->elevator;
+
+ rq->elv.icq = NULL;
+ if (e && e->type->ops.prepare_request) {
+ if (e->type->icq_cache)
+ blk_mq_sched_assign_ioc(rq);
+
+ e->type->ops.prepare_request(rq);
+ rq->rq_flags |= RQF_ELVPRIV;
+ }
+ }
+
+ data->hctx->queued++;
return rq;
}
-static struct request *blk_mq_get_request(struct request_queue *q,
- struct bio *bio,
- struct blk_mq_alloc_data *data)
+static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
{
+ struct request_queue *q = data->q;
struct elevator_queue *e = q->elevator;
- struct request *rq;
+ u64 alloc_time_ns = 0;
unsigned int tag;
- bool put_ctx_on_error = false;
- blk_queue_enter_live(q);
- data->q = q;
- if (likely(!data->ctx)) {
- data->ctx = blk_mq_get_ctx(q);
- put_ctx_on_error = true;
- }
- if (likely(!data->hctx))
- data->hctx = blk_mq_map_queue(q, data->cmd_flags,
- data->ctx);
+ /* alloc_time includes depth and tag waits */
+ if (blk_queue_rq_alloc_time(q))
+ alloc_time_ns = ktime_get_ns();
+
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
@@ -381,39 +373,43 @@ static struct request *blk_mq_get_request(struct request_queue *q,
e->type->ops.limit_depth &&
!(data->flags & BLK_MQ_REQ_RESERVED))
e->type->ops.limit_depth(data->cmd_flags, data);
- } else {
- blk_mq_tag_busy(data->hctx);
}
- tag = blk_mq_get_tag(data);
- if (tag == BLK_MQ_TAG_FAIL) {
- if (put_ctx_on_error) {
- blk_mq_put_ctx(data->ctx);
- data->ctx = NULL;
- }
- blk_queue_exit(q);
- return NULL;
- }
+retry:
+ data->ctx = blk_mq_get_ctx(q);
+ data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
+ if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+ blk_mq_tag_busy(data->hctx);
- rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
- if (!op_is_flush(data->cmd_flags)) {
- rq->elv.icq = NULL;
- if (e && e->type->ops.prepare_request) {
- if (e->type->icq_cache)
- blk_mq_sched_assign_ioc(rq);
+ /*
+ * Waiting allocations only fail because of an inactive hctx. In that
+ * case just retry the hctx assignment and tag allocation as CPU hotplug
+ * should have migrated us to an online CPU by now.
+ */
+ tag = blk_mq_get_tag(data);
+ if (tag == BLK_MQ_NO_TAG) {
+ if (data->flags & BLK_MQ_REQ_NOWAIT)
+ return NULL;
- e->type->ops.prepare_request(rq, bio);
- rq->rq_flags |= RQF_ELVPRIV;
- }
+ /*
+		 * Give up the CPU and sleep for a short time to ensure that
+		 * threads using a realtime scheduling class are migrated off
+		 * the CPU, and thus off the hctx that is going away.
+ */
+ msleep(3);
+ goto retry;
}
- data->hctx->queued++;
- return rq;
+ return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
}
struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
blk_mq_req_flags_t flags)
{
- struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = op,
+ };
struct request *rq;
int ret;
@@ -421,36 +417,43 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (ret)
return ERR_PTR(ret);
- rq = blk_mq_get_request(q, NULL, &alloc_data);
- blk_queue_exit(q);
-
+ rq = __blk_mq_alloc_request(&data);
if (!rq)
- return ERR_PTR(-EWOULDBLOCK);
-
- blk_mq_put_ctx(alloc_data.ctx);
-
+ goto out_queue_exit;
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
return rq;
+out_queue_exit:
+ blk_queue_exit(q);
+ return ERR_PTR(-EWOULDBLOCK);
}
EXPORT_SYMBOL(blk_mq_alloc_request);
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
- struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op };
- struct request *rq;
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = op,
+ };
+ u64 alloc_time_ns = 0;
unsigned int cpu;
+ unsigned int tag;
int ret;
+ /* alloc_time includes depth and tag waits */
+ if (blk_queue_rq_alloc_time(q))
+ alloc_time_ns = ktime_get_ns();
+
/*
* If the tag allocator sleeps we could get an allocation for a
* different hardware context. No need to complicate the low level
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
- if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)))
+ if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
@@ -464,21 +467,27 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* Check if the hardware context is actually mapped to anything.
* If not tell the caller that it should skip this queue.
*/
- alloc_data.hctx = q->queue_hw_ctx[hctx_idx];
- if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) {
- blk_queue_exit(q);
- return ERR_PTR(-EXDEV);
- }
- cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask);
- alloc_data.ctx = __blk_mq_get_ctx(q, cpu);
-
- rq = blk_mq_get_request(q, NULL, &alloc_data);
- blk_queue_exit(q);
+ ret = -EXDEV;
+ data.hctx = q->queue_hw_ctx[hctx_idx];
+ if (!blk_mq_hw_queue_mapped(data.hctx))
+ goto out_queue_exit;
+ cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+ data.ctx = __blk_mq_get_ctx(q, cpu);
+
+ if (q->elevator)
+ data.flags |= BLK_MQ_REQ_INTERNAL;
+ else
+ blk_mq_tag_busy(data.hctx);
- if (!rq)
- return ERR_PTR(-EWOULDBLOCK);
+ ret = -EWOULDBLOCK;
+ tag = blk_mq_get_tag(&data);
+ if (tag == BLK_MQ_NO_TAG)
+ goto out_queue_exit;
+ return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
- return rq;
+out_queue_exit:
+ blk_queue_exit(q);
+ return ERR_PTR(ret);
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
@@ -489,12 +498,13 @@ static void __blk_mq_free_request(struct request *rq)
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
const int sched_tag = rq->internal_tag;
+ blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->tag != -1)
- blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag);
- if (sched_tag != -1)
- blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag);
+ if (rq->tag != BLK_MQ_NO_TAG)
+ blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+ if (sched_tag != BLK_MQ_NO_TAG)
+ blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
blk_queue_exit(q);
}
@@ -542,7 +552,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
- if (rq->internal_tag != -1)
+ if (rq->internal_tag != BLK_MQ_NO_TAG)
blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -572,7 +582,17 @@ static void __blk_mq_complete_request_remote(void *data)
q->mq_ops->complete(rq);
}
-static void __blk_mq_complete_request(struct request *rq)
+/**
+ * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
+ * injection that could drop the completion.
+ * @rq: Request to be force completed
+ *
+ * Drivers should use blk_mq_complete_request() to complete requests in their
+ * normal IO path. For timeout error recovery, drivers may call this forced
+ * completion routine after they've reclaimed timed out requests to bypass
+ * potentially subsequent fake timeouts.
+ */
+void blk_mq_force_complete_rq(struct request *rq)
{
struct blk_mq_ctx *ctx = rq->mq_ctx;
struct request_queue *q = rq->q;
@@ -618,6 +638,7 @@ static void __blk_mq_complete_request(struct request *rq)
}
put_cpu();
}
+EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
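
A hedged sketch of the usage described in the comment above: a hypothetical driver's timeout handler (the demo_* name is made up) that has already aborted the command in hardware and now completes it while bypassing the fake-timeout injection.

	static enum blk_eh_timer_return demo_timeout(struct request *rq, bool reserved)
	{
		/* the command was reclaimed from the hardware; finish it now */
		blk_mq_force_complete_rq(rq);
		return BLK_EH_DONE;
	}
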
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
@@ -651,37 +672,28 @@ bool blk_mq_complete_request(struct request *rq)
{
if (unlikely(blk_should_fake_timeout(rq->q)))
return false;
- __blk_mq_complete_request(rq);
+ blk_mq_force_complete_rq(rq);
return true;
}
EXPORT_SYMBOL(blk_mq_complete_request);
-void blk_mq_complete_request_sync(struct request *rq)
-{
- WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
- rq->q->mq_ops->complete(rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
-
-int blk_mq_request_started(struct request *rq)
-{
- return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
-}
-EXPORT_SYMBOL_GPL(blk_mq_request_started);
-
+/**
+ * blk_mq_start_request - Start processing a request
+ * @rq: Pointer to request to be started
+ *
+ * Function used by device drivers to notify the block layer that a request
+ * is going to be processed now, so the block layer can do proper
+ * initializations such as starting the timeout timer.
+ */
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
- blk_mq_sched_started_request(rq);
-
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- rq->throtl_size = blk_rq_sectors(rq);
-#endif
+ rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
}
@@ -691,14 +703,10 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
- if (q->dma_drain_size && blk_rq_bytes(rq)) {
- /*
- * Make sure space for the drain appears. We know we can do
- * this because max_hw_segments has been adjusted to be one
- * fewer than the device can handle.
- */
- rq->nr_phys_segments++;
- }
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
+ q->integrity.profile->prepare_fn(rq);
+#endif
}
EXPORT_SYMBOL(blk_mq_start_request);
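
For illustration, the call site the comment refers to usually sits at the top of a driver's ->queue_rq() handler; demo_hw_submit() below is hypothetical and stands in for whatever issues the command to hardware.

	static blk_status_t demo_queue_rq(struct blk_mq_hw_ctx *hctx,
					  const struct blk_mq_queue_data *bd)
	{
		struct request *rq = bd->rq;

		blk_mq_start_request(rq);	/* timestamps + timeout timer armed here */

		if (!demo_hw_submit(hctx->driver_data, rq))
			return BLK_STS_RESOURCE;	/* block layer will retry later */
		return BLK_STS_OK;
	}
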
@@ -714,8 +722,6 @@ static void __blk_mq_requeue_request(struct request *rq)
if (blk_mq_request_started(rq)) {
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
rq->rq_flags &= ~RQF_TIMED_OUT;
- if (q->dma_drain_size && blk_rq_bytes(rq))
- rq->nr_phys_segments--;
}
}
@@ -754,7 +760,7 @@ static void blk_mq_requeue_work(struct work_struct *work)
* merge.
*/
if (rq->rq_flags & RQF_DONTPREP)
- blk_mq_request_bypass_insert(rq, false);
+ blk_mq_request_bypass_insert(rq, false, false);
else
blk_mq_sched_insert_request(rq, true, false, false);
}
@@ -822,10 +828,10 @@ static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
void *priv, bool reserved)
{
/*
- * If we find a request that is inflight and the queue matches,
+ * If we find a request that isn't idle and the queue matches,
* we know the queue is busy. Return false to stop the iteration.
*/
- if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) {
+ if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
bool *busy = priv;
*busy = true;
@@ -911,7 +917,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
*/
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
- if (refcount_dec_and_test(&rq->ref))
+
+ if (is_flush_rq(rq, hctx))
+ rq->end_io(rq, 0);
+ else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
return true;
@@ -1043,36 +1052,6 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
-bool blk_mq_get_driver_tag(struct request *rq)
-{
- struct blk_mq_alloc_data data = {
- .q = rq->q,
- .hctx = rq->mq_hctx,
- .flags = BLK_MQ_REQ_NOWAIT,
- .cmd_flags = rq->cmd_flags,
- };
- bool shared;
-
- if (rq->tag != -1)
- goto done;
-
- if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag))
- data.flags |= BLK_MQ_REQ_RESERVED;
-
- shared = blk_mq_tag_busy(data.hctx);
- rq->tag = blk_mq_get_tag(&data);
- if (rq->tag >= 0) {
- if (shared) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- atomic_inc(&data.hctx->nr_active);
- }
- data.hctx->tags->rqs[rq->tag] = rq;
- }
-
-done:
- return rq->tag != -1;
-}
-
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
int flags, void *key)
{
@@ -1195,6 +1174,36 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
#define BLK_MQ_RESOURCE_DELAY 3 /* ms units */
+static void blk_mq_handle_dev_resource(struct request *rq,
+ struct list_head *list)
+{
+ struct request *next =
+ list_first_entry_or_null(list, struct request, queuelist);
+
+ /*
+ * If an I/O scheduler has been configured and we got a driver tag for
+ * the next request already, free it.
+ */
+ if (next)
+ blk_mq_put_driver_tag(next);
+
+ list_add(&rq->queuelist, list);
+ __blk_mq_requeue_request(rq);
+}
+
+static void blk_mq_handle_zone_resource(struct request *rq,
+ struct list_head *zone_list)
+{
+ /*
+ * If we end up here it is because we cannot dispatch a request to a
+	 * specific zone due to LLD-level zone-write locking or another
+	 * zone-related resource not being available. In this case, set the
+	 * request aside in zone_list for retrying it later.
+ */
+ list_add(&rq->queuelist, zone_list);
+ __blk_mq_requeue_request(rq);
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
@@ -1206,6 +1215,8 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
+ bool no_budget_avail = false;
+ LIST_HEAD(zone_list);
if (list_empty(list))
return false;
@@ -1222,8 +1233,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx))
+ if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
+ blk_mq_put_driver_tag(rq);
+ no_budget_avail = true;
break;
+ }
if (!blk_mq_get_driver_tag(rq)) {
/*
@@ -1262,18 +1276,18 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
ret = q->mq_ops->queue_rq(hctx, &bd);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
+ blk_mq_handle_dev_resource(rq, list);
+ break;
+ } else if (ret == BLK_STS_ZONE_RESOURCE) {
/*
- * If an I/O scheduler has been configured and we got a
- * driver tag for the next request already, free it
- * again.
+ * Move the request to zone_list and keep going through
+ * the dispatch list to find more requests the drive can
+ * accept.
*/
- if (!list_empty(list)) {
- nxt = list_first_entry(list, struct request, queuelist);
- blk_mq_put_driver_tag(nxt);
- }
- list_add(&rq->queuelist, list);
- __blk_mq_requeue_request(rq);
- break;
+ blk_mq_handle_zone_resource(rq, &zone_list);
+ if (list_empty(list))
+ break;
+ continue;
}
if (unlikely(ret != BLK_STS_OK)) {
@@ -1285,6 +1299,9 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
queued++;
} while (!list_empty(list));
+ if (!list_empty(&zone_list))
+ list_splice_tail_init(&zone_list, list);
+
hctx->dispatched[queued_to_index(queued)]++;
/*
@@ -1299,11 +1316,11 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* the driver there was more coming, but that turned out to
* be a lie.
*/
- if (q->mq_ops->commit_rqs)
+ if (q->mq_ops->commit_rqs && queued)
q->mq_ops->commit_rqs(hctx);
spin_lock(&hctx->lock);
- list_splice_init(list, &hctx->dispatch);
+ list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
@@ -1328,13 +1345,15 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
- * that could otherwise occur if the queue is idle.
+	 * that could otherwise occur if the queue is idle. We'll do the
+	 * same if we couldn't get budget and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx);
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
- else if (needs_restart && (ret == BLK_STS_RESOURCE))
+ else if (needs_restart && (ret == BLK_STS_RESOURCE ||
+ no_budget_avail))
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
@@ -1352,6 +1371,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
return (queued + errors) != 0;
}
+/**
+ * __blk_mq_run_hw_queue - Run a hardware queue.
+ * @hctx: Pointer to the hardware queue to run.
+ *
+ * Send pending requests to the hardware.
+ */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
int srcu_idx;
@@ -1449,6 +1474,15 @@ select_cpu:
return next_cpu;
}
+/**
+ * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
+ * @hctx: Pointer to the hardware queue to run.
+ * @async: If we want to run the queue asynchronously.
+ * @msecs: Milliseconds of delay to wait before running the queue.
+ *
+ * If !@async, try to run the queue now. Else, run the queue asynchronously
+ * after a delay of @msecs.
+ */
static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
unsigned long msecs)
{
@@ -1470,13 +1504,29 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
msecs_to_jiffies(msecs));
}
+/**
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
+ * @hctx: Pointer to the hardware queue to run.
+ * @msecs: Milliseconds of delay to wait before running the queue.
+ *
+ * Run a hardware queue asynchronously with a delay of @msecs.
+ */
void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
__blk_mq_delay_run_hw_queue(hctx, true, msecs);
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
-bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
+/**
+ * blk_mq_run_hw_queue - Start to run a hardware queue.
+ * @hctx: Pointer to the hardware queue to run.
+ * @async: If we want to run the queue asynchronously.
+ *
+ * Check if the request queue is not in a quiesced state and if there are
+ * pending requests to be sent. If this is true, run the queue to send requests
+ * to hardware.
+ */
+void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
int srcu_idx;
bool need_run;
@@ -1494,15 +1544,16 @@ bool blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
blk_mq_hctx_has_pending(hctx);
hctx_unlock(hctx, srcu_idx);
- if (need_run) {
+ if (need_run)
__blk_mq_delay_run_hw_queue(hctx, async, 0);
- return true;
- }
-
- return false;
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
+/**
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
+ * @q: Pointer to the request queue to run.
+ * @async: If we want to run the queue asynchronously.
+ */
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
@@ -1518,6 +1569,25 @@ void blk_mq_run_hw_queues(struct request_queue *q, bool async)
EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
+ * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
+ * @q: Pointer to the request queue to run.
+ * @msecs: Milliseconds of delay to wait before running the queues.
+ */
+void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
+{
+ struct blk_mq_hw_ctx *hctx;
+ int i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (blk_mq_hctx_stopped(hctx))
+ continue;
+
+ blk_mq_delay_run_hw_queue(hctx, msecs);
+ }
+}
+EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
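
A plausible (hypothetical) consumer of the new helper: a driver that failed ->queue_rq() for lack of an internal resource can nudge all hardware queues again once the resource is released, giving them a small grace period.

	/* in the driver's resource-release path */
	blk_mq_delay_run_hw_queues(demo_dev->queue, 10 /* ms */);
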
+
+/**
* blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
* @q: request queue.
*
@@ -1654,16 +1724,24 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
blk_mq_hctx_mark_pending(hctx, ctx);
}
-/*
+/**
+ * blk_mq_request_bypass_insert - Insert a request at the dispatch list.
+ * @rq: Pointer to request to be inserted.
+ * @at_head: true if the request should be inserted at the head of the list.
+ * @run_queue: If we should run the hardware queue after inserting the request.
+ *
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue)
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
+ bool run_queue)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
spin_lock(&hctx->lock);
- list_add_tail(&rq->queuelist, &hctx->dispatch);
+ if (at_head)
+ list_add(&rq->queuelist, &hctx->dispatch);
+ else
+ list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
if (run_queue)
@@ -1697,28 +1775,20 @@ static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
struct request *rqa = container_of(a, struct request, queuelist);
struct request *rqb = container_of(b, struct request, queuelist);
- if (rqa->mq_ctx < rqb->mq_ctx)
- return -1;
- else if (rqa->mq_ctx > rqb->mq_ctx)
- return 1;
- else if (rqa->mq_hctx < rqb->mq_hctx)
- return -1;
- else if (rqa->mq_hctx > rqb->mq_hctx)
- return 1;
+ if (rqa->mq_ctx != rqb->mq_ctx)
+ return rqa->mq_ctx > rqb->mq_ctx;
+ if (rqa->mq_hctx != rqb->mq_hctx)
+ return rqa->mq_hctx > rqb->mq_hctx;
return blk_rq_pos(rqa) > blk_rq_pos(rqb);
}
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
- struct blk_mq_hw_ctx *this_hctx;
- struct blk_mq_ctx *this_ctx;
- struct request_queue *this_q;
- struct request *rq;
LIST_HEAD(list);
- LIST_HEAD(rq_list);
- unsigned int depth;
+ if (list_empty(&plug->mq_list))
+ return;
list_splice_init(&plug->mq_list, &list);
if (plug->rq_count > 2 && plug->multiple_queues)
@@ -1726,49 +1796,41 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
plug->rq_count = 0;
- this_q = NULL;
- this_hctx = NULL;
- this_ctx = NULL;
- depth = 0;
-
- while (!list_empty(&list)) {
- rq = list_entry_rq(list.next);
- list_del_init(&rq->queuelist);
- BUG_ON(!rq->q);
- if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) {
- if (this_hctx) {
- trace_block_unplug(this_q, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx,
- &rq_list,
- from_schedule);
- }
-
- this_q = rq->q;
- this_ctx = rq->mq_ctx;
- this_hctx = rq->mq_hctx;
- depth = 0;
+ do {
+ struct list_head rq_list;
+ struct request *rq, *head_rq = list_entry_rq(list.next);
+ struct list_head *pos = &head_rq->queuelist; /* skip first */
+ struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
+ struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
+ unsigned int depth = 1;
+
+ list_for_each_continue(pos, &list) {
+ rq = list_entry_rq(pos);
+ BUG_ON(!rq->q);
+ if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
+ break;
+ depth++;
}
- depth++;
- list_add_tail(&rq->queuelist, &rq_list);
- }
-
- /*
- * If 'this_hctx' is set, we know we have entries to complete
- * on 'rq_list'. Do those.
- */
- if (this_hctx) {
- trace_block_unplug(this_q, depth, !from_schedule);
+ list_cut_before(&rq_list, &list, pos);
+ trace_block_unplug(head_rq->q, depth, !from_schedule);
blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
from_schedule);
- }
+	} while (!list_empty(&list));
}
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
+ unsigned int nr_segs)
{
- blk_init_request_from_bio(rq, bio);
+ if (bio->bi_opf & REQ_RAHEAD)
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
+
+ rq->__sector = bio->bi_iter.bi_sector;
+ rq->write_hint = bio->bi_write_hint;
+ blk_rq_bio_prep(rq, bio, nr_segs);
+ blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
@@ -1847,10 +1909,21 @@ insert:
if (bypass_insert)
return BLK_STS_RESOURCE;
- blk_mq_request_bypass_insert(rq, run_queue);
+ blk_mq_request_bypass_insert(rq, false, run_queue);
return BLK_STS_OK;
}
+/**
+ * blk_mq_try_issue_directly - Try to send a request directly to device driver.
+ * @hctx: Pointer of the associated hardware queue.
+ * @rq: Pointer to request to be sent.
+ * @cookie: Request queue cookie.
+ *
+ * If the device has enough resources to accept a new request now, send the
+ * request directly to the device driver. Otherwise, insert it into the
+ * hctx->dispatch queue, so we can try to send it again in the future.
+ * Requests inserted into this queue have higher priority.
+ */
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
struct request *rq, blk_qc_t *cookie)
{
@@ -1863,7 +1936,7 @@ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_request_bypass_insert(rq, true);
+ blk_mq_request_bypass_insert(rq, false, true);
else if (ret != BLK_STS_OK)
blk_mq_end_request(rq, ret);
@@ -1887,6 +1960,8 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
+ int queued = 0;
+
while (!list_empty(list)) {
blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
@@ -1897,12 +1972,13 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
if (ret != BLK_STS_OK) {
if (ret == BLK_STS_RESOURCE ||
ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_request_bypass_insert(rq,
+ blk_mq_request_bypass_insert(rq, false,
list_empty(list));
break;
}
blk_mq_end_request(rq, ret);
- }
+ } else
+ queued++;
}
/*
@@ -1910,7 +1986,7 @@ void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
* the driver there was more coming, but that turned out to
* be a lie.
*/
- if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs)
+ if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued)
hctx->queue->mq_ops->commit_rqs(hctx);
}
@@ -1928,39 +2004,58 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
}
}
-static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+/**
+ * blk_mq_make_request - Create and send a request to block device.
+ * @q: Request queue pointer.
+ * @bio: Bio pointer.
+ *
+ * Builds up a request structure from @q and @bio and sends it to the device.
+ * The request may not be queued directly to hardware if:
+ * * This request can be merged with another one
+ * * We want to place the request in the plug queue for possible future merging
+ * * There is an IO scheduler active at this queue
+ *
+ * It will not queue the request if there is an error with the bio, or at
+ * request creation.
+ *
+ * Returns: Request queue cookie.
+ */
+blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = { .flags = 0};
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ };
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
+ unsigned int nr_segs;
blk_qc_t cookie;
+ blk_status_t ret;
blk_queue_bounce(q, &bio);
-
- blk_queue_split(q, &bio);
+ __blk_queue_split(q, &bio, &nr_segs);
if (!bio_integrity_prep(bio))
- return BLK_QC_T_NONE;
+ goto queue_exit;
if (!is_flush_fua && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &same_queue_rq))
- return BLK_QC_T_NONE;
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
+ goto queue_exit;
- if (blk_mq_sched_bio_merge(q, bio))
- return BLK_QC_T_NONE;
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+ goto queue_exit;
rq_qos_throttle(q, bio);
data.cmd_flags = bio->bi_opf;
- rq = blk_mq_get_request(q, bio, &data);
+ rq = __blk_mq_alloc_request(&data);
if (unlikely(!rq)) {
rq_qos_cleanup(q, bio);
if (bio->bi_opf & REQ_NOWAIT)
bio_wouldblock_error(bio);
- return BLK_QC_T_NONE;
+ goto queue_exit;
}
trace_block_getrq(q, bio, bio->bi_opf);
@@ -1969,25 +2064,33 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
cookie = request_to_qc_t(data.hctx, rq);
- plug = current->plug;
- if (unlikely(is_flush_fua)) {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
+ blk_mq_bio_to_request(rq, bio, nr_segs);
+
+ ret = blk_crypto_init_request(rq);
+ if (ret != BLK_STS_OK) {
+ bio->bi_status = ret;
+ bio_endio(bio);
+ blk_mq_free_request(rq);
+ return BLK_QC_T_NONE;
+ }
- /* bypass scheduler for flush rq */
+ plug = blk_mq_plug(q, bio);
+ if (unlikely(is_flush_fua)) {
+ /* Bypass scheduler for flush requests */
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
- } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+ } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
+ !blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
+ *
+		 * Use normal plugging if this disk is a slow HDD, as sequential
+ * IO may benefit a lot from plug merging.
*/
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
-
if (!request_count)
trace_block_plug(q);
else
@@ -2000,9 +2103,10 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
}
blk_add_rq_to_plug(plug, rq);
+ } else if (q->elevator) {
+ /* Insert the request at the IO scheduler queue */
+ blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
- blk_mq_bio_to_request(rq, bio);
-
/*
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
@@ -2019,27 +2123,30 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
- blk_mq_put_ctx(data.ctx);
-
if (same_queue_rq) {
data.hctx = same_queue_rq->mq_hctx;
trace_block_unplug(q, 1, true);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
- !data.hctx->dispatch_busy)) {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
+ !data.hctx->dispatch_busy) {
+ /*
+ * There is no scheduler and we can try to send directly
+ * to the hardware.
+ */
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
+ /* Default case. */
blk_mq_sched_insert_request(rq, false, true, true);
}
return cookie;
+queue_exit:
+ blk_queue_exit(q);
+ return BLK_QC_T_NONE;
}
+EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2215,6 +2322,86 @@ fail:
return -ENOMEM;
}
+struct rq_iter_data {
+ struct blk_mq_hw_ctx *hctx;
+ bool has_rq;
+};
+
+static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+{
+ struct rq_iter_data *iter_data = data;
+
+ if (rq->mq_hctx != iter_data->hctx)
+ return true;
+ iter_data->has_rq = true;
+ return false;
+}
+
+static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
+{
+ struct blk_mq_tags *tags = hctx->sched_tags ?
+ hctx->sched_tags : hctx->tags;
+ struct rq_iter_data data = {
+ .hctx = hctx,
+ };
+
+ blk_mq_all_tag_iter(tags, blk_mq_has_request, &data);
+ return data.has_rq;
+}
+
+static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+ return false;
+ if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
+ return false;
+ return true;
+}
+
+static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (!cpumask_test_cpu(cpu, hctx->cpumask) ||
+ !blk_mq_last_cpu_in_hctx(cpu, hctx))
+ return 0;
+
+ /*
+	 * Prevent new requests from being allocated on the current hctx.
+	 *
+	 * The smp_mb__after_atomic() pairs with the implied barrier in
+	 * test_and_set_bit_lock() in sbitmap_get(). It ensures the inactive
+	 * flag is seen once we return from the tag allocator.
+ */
+ set_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ smp_mb__after_atomic();
+
+ /*
+ * Try to grab a reference to the queue and wait for any outstanding
+ * requests. If we could not grab a reference the queue has been
+ * frozen and there are no requests.
+ */
+ if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) {
+ while (blk_mq_hctx_has_requests(hctx))
+ msleep(5);
+ percpu_ref_put(&hctx->queue->q_usage_counter);
+ }
+
+ return 0;
+}
+
+static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node)
+{
+ struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node,
+ struct blk_mq_hw_ctx, cpuhp_online);
+
+ if (cpumask_test_cpu(cpu, hctx->cpumask))
+ clear_bit(BLK_MQ_S_INACTIVE, &hctx->state);
+ return 0;
+}
+
/*
* 'cpu' is going away. splice any existing rq_list entries from this
* software queue to the hw queue dispatch list, and ensure that it
@@ -2228,6 +2415,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
enum hctx_type type;
hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead);
+ if (!cpumask_test_cpu(cpu, hctx->cpumask))
+ return 0;
+
ctx = __blk_mq_get_ctx(hctx->queue, cpu);
type = hctx->type;
@@ -2251,6 +2441,9 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node)
static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
{
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD,
&hctx->cpuhp_dead);
}
@@ -2310,6 +2503,9 @@ static int blk_mq_init_hctx(struct request_queue *q,
{
hctx->queue_num = hctx_idx;
+ if (!(hctx->flags & BLK_MQ_F_STACKING))
+ cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE,
+ &hctx->cpuhp_online);
cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead);
hctx->tags = set->tags[hctx_idx];
@@ -2376,8 +2572,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake);
INIT_LIST_HEAD(&hctx->dispatch_wait.entry);
- hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size,
- gfp);
+ hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp);
if (!hctx->fq)
goto free_bitmap;
@@ -2429,7 +2624,8 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
}
}
-static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx)
+static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
+ int hctx_idx)
{
int ret = 0;
@@ -2465,11 +2661,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
- /*
- * Avoid others reading imcomplete hctx->cpumask through sysfs
- */
- mutex_lock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -2482,18 +2673,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* If the cpu isn't present, the cpu is mapped to first hctx.
*/
for_each_possible_cpu(i) {
- hctx_idx = set->map[HCTX_TYPE_DEFAULT].mq_map[i];
- /* unmapped hw queue can be remapped after CPU topo changed */
- if (!set->tags[hctx_idx] &&
- !__blk_mq_alloc_rq_map(set, hctx_idx)) {
- /*
- * If tags initialization fail for some hctx,
- * that hctx won't be brought online. In this
- * case, remap the current ctx to hctx[0] which
- * is guaranteed to always have tags allocated
- */
- set->map[HCTX_TYPE_DEFAULT].mq_map[i] = 0;
- }
ctx = per_cpu_ptr(q->queue_ctx, i);
for (j = 0; j < set->nr_maps; j++) {
@@ -2502,6 +2681,18 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
continue;
}
+ hctx_idx = set->map[j].mq_map[i];
+ /* unmapped hw queue can be remapped after CPU topo changed */
+ if (!set->tags[hctx_idx] &&
+ !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+ /*
+			 * If tags initialization fails for some hctx,
+			 * that hctx won't be brought online. In this
+			 * case, remap the current ctx to hctx[0], which
+			 * is guaranteed to always have tags allocated.
+ */
+ set->map[j].mq_map[i] = 0;
+ }
hctx = blk_mq_map_queue_type(q, j, i);
ctx->hctxs[j] = hctx;
@@ -2530,8 +2721,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
}
- mutex_unlock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
/*
* If no software queues are mapped to this hardware queue,
@@ -2674,8 +2863,6 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx, *next;
int i;
- cancel_delayed_work_sync(&q->requeue_work);
-
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2694,20 +2881,32 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
-struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
+ void *queuedata)
{
struct request_queue *uninit_q, *q;
- uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
+ uninit_q = __blk_alloc_queue(set->numa_node);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
+ uninit_q->queuedata = queuedata;
- q = blk_mq_init_allocated_queue(set, uninit_q);
+ /*
+ * Initialize the queue without an elevator. device_add_disk() will do
+ * the initialization.
+ */
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
if (IS_ERR(q))
blk_cleanup_queue(uninit_q);
return q;
}
+EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
+
+struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
+{
+ return blk_mq_init_queue_data(set, NULL);
+}
EXPORT_SYMBOL(blk_mq_init_queue);
/*
@@ -2784,6 +2983,22 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
int i, j, end;
struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
+ if (q->nr_hw_queues < set->nr_hw_queues) {
+ struct blk_mq_hw_ctx **new_hctxs;
+
+ new_hctxs = kcalloc_node(set->nr_hw_queues,
+ sizeof(*new_hctxs), GFP_KERNEL,
+ set->numa_node);
+ if (!new_hctxs)
+ return;
+ if (hctxs)
+ memcpy(new_hctxs, hctxs, q->nr_hw_queues *
+ sizeof(*hctxs));
+ q->queue_hw_ctx = new_hctxs;
+ kfree(hctxs);
+ hctxs = new_hctxs;
+ }
+
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
@@ -2839,21 +3054,9 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
mutex_unlock(&q->sysfs_lock);
}
-/*
- * Maximum number of hardware queues we support. For single sets, we'll never
- * have more than the CPUs (software queues). For multiple sets, the tag_set
- * user may have set ->nr_hw_queues larger.
- */
-static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
-{
- if (set->nr_maps == 1)
- return nr_cpu_ids;
-
- return max(set->nr_hw_queues, nr_cpu_ids);
-}
-
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q)
+ struct request_queue *q,
+ bool elevator_init)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -2870,12 +3073,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
- q->nr_queues = nr_hw_queues(set);
- q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)),
- GFP_KERNEL, set->numa_node);
- if (!q->queue_hw_ctx)
- goto err_sys_init;
-
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
@@ -2899,11 +3096,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
- blk_queue_make_request(q, blk_mq_make_request);
-
- /*
- * Do this after blk_queue_make_request() overrides it...
- */
q->nr_requests = set->queue_depth;
/*
@@ -2915,19 +3107,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
- int ret;
-
- ret = elevator_init_mq(q);
- if (ret)
- return ERR_PTR(ret);
- }
+ if (elevator_init)
+ elevator_init_mq(q);
return q;
err_hctxs:
kfree(q->queue_hw_ctx);
-err_sys_init:
+ q->nr_hw_queues = 0;
blk_mq_sysfs_deinit(q);
err_poll:
blk_stat_free_callback(q->poll_cb);
@@ -2952,14 +3139,14 @@ static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
int i;
for (i = 0; i < set->nr_hw_queues; i++)
- if (!__blk_mq_alloc_rq_map(set, i))
+ if (!__blk_mq_alloc_map_and_request(set, i))
goto out_unwind;
return 0;
out_unwind:
while (--i >= 0)
- blk_mq_free_rq_map(set->tags[i]);
+ blk_mq_free_map_and_requests(set, i);
return -ENOMEM;
}
@@ -2969,7 +3156,7 @@ out_unwind:
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
@@ -3001,6 +3188,14 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
+ /*
+ * blk_mq_map_queues() and multiple .map_queues() implementations
+ * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the
+ * number of hardware queues.
+ */
+ if (set->nr_maps == 1)
+ set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues;
+
if (set->ops->map_queues && !is_kdump_kernel()) {
int i;
@@ -3028,6 +3223,29 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
}
}
+static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
+ int cur_nr_hw_queues, int new_nr_hw_queues)
+{
+ struct blk_mq_tags **new_tags;
+
+ if (cur_nr_hw_queues >= new_nr_hw_queues)
+ return 0;
+
+ new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
+ GFP_KERNEL, set->numa_node);
+ if (!new_tags)
+ return -ENOMEM;
+
+ if (set->tags)
+ memcpy(new_tags, set->tags, cur_nr_hw_queues *
+ sizeof(*set->tags));
+ kfree(set->tags);
+ set->tags = new_tags;
+ set->nr_hw_queues = new_nr_hw_queues;
+
+ return 0;
+}
+
/*
* Alloc a tag set to be associated with one or more request queues.
* May fail with EINVAL for various error conditions. May adjust the
@@ -3081,9 +3299,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *),
- GFP_KERNEL, set->numa_node);
- if (!set->tags)
+ if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
return -ENOMEM;
ret = -ENOMEM;
@@ -3100,7 +3316,7 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (ret)
goto out_free_mq_map;
- ret = blk_mq_alloc_rq_maps(set);
+ ret = blk_mq_alloc_map_and_requests(set);
if (ret)
goto out_free_mq_map;
@@ -3124,7 +3340,7 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i, j;
- for (i = 0; i < nr_hw_queues(set); i++)
+ for (i = 0; i < set->nr_hw_queues; i++)
blk_mq_free_map_and_requests(set, i);
for (j = 0; j < set->nr_maps; j++) {
@@ -3263,16 +3479,14 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids)
nr_hw_queues = nr_cpu_ids;
- if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
+ if (nr_hw_queues < 1)
+ return;
+ if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues)
return;
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_freeze_queue(q);
/*
- * Sync with blk_mq_queue_tag_busy_iter.
- */
- synchronize_rcu();
- /*
* Switch IO scheduler to 'none', cleaning up the data associated
* with the previous scheduler. We will switch back once we are done
* updating the new sw to hw queue mappings.
@@ -3287,9 +3501,13 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
}
prev_nr_hw_queues = set->nr_hw_queues;
+ if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
+ 0)
+ goto reregister;
+
set->nr_hw_queues = nr_hw_queues;
- blk_mq_update_queue_map(set);
fallback:
+ blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
if (q->nr_hw_queues != set->nr_hw_queues) {
@@ -3302,6 +3520,7 @@ fallback:
blk_mq_map_swqueue(q);
}
+reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_sysfs_register(q);
blk_mq_debugfs_register_hctxs(q);
@@ -3358,7 +3577,6 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
}
static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
unsigned long ret = 0;
@@ -3391,7 +3609,6 @@ static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
}
static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
struct hrtimer_sleeper hs;
@@ -3411,7 +3628,7 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
if (q->poll_nsec > 0)
nsecs = q->poll_nsec;
else
- nsecs = blk_mq_poll_nsecs(q, hctx, rq);
+ nsecs = blk_mq_poll_nsecs(q, rq);
if (!nsecs)
return false;
@@ -3425,15 +3642,14 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
kt = nsecs;
mode = HRTIMER_MODE_REL;
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
hrtimer_set_expires(&hs.timer, kt);
- hrtimer_init_sleeper(&hs, current);
do {
if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_start_expires(&hs.timer, mode);
+ hrtimer_sleeper_start_expires(&hs, mode);
if (hs.task)
io_schedule();
hrtimer_cancel(&hs.timer);
@@ -3467,7 +3683,7 @@ static bool blk_mq_poll_hybrid(struct request_queue *q,
return false;
}
- return blk_mq_poll_hybrid_sleep(q, hctx, rq);
+ return blk_mq_poll_hybrid_sleep(q, rq);
}
/**
@@ -3546,6 +3762,9 @@ static int __init blk_mq_init(void)
{
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
+ cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
+ blk_mq_hctx_notify_online,
+ blk_mq_hctx_notify_offline);
return 0;
}
subsys_initcall(blk_mq_init);
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..b3ce0f3a2ad2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -44,7 +44,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
bool kick_requeue_list);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
-bool blk_mq_get_driver_tag(struct request *rq);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
@@ -66,7 +65,8 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
*/
void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head);
-void blk_mq_request_bypass_insert(struct request *rq, bool run_queue);
+void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
+ bool run_queue);
void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
struct list_head *list);
@@ -128,15 +128,6 @@ extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
void blk_mq_release(struct request_queue *q);
-/**
- * blk_mq_rq_state() - read the current MQ_RQ_* state of a request
- * @rq: target request.
- */
-static inline enum mq_rq_state blk_mq_rq_state(struct request *rq)
-{
- return READ_ONCE(rq->state);
-}
-
static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
unsigned int cpu)
{
@@ -151,12 +142,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
- return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
- put_cpu();
+ return __blk_mq_get_ctx(q, raw_smp_processor_id());
}
struct blk_mq_alloc_data {
@@ -213,8 +199,8 @@ static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag);
- rq->tag = -1;
+ blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
+ rq->tag = BLK_MQ_NO_TAG;
if (rq->rq_flags & RQF_MQ_INFLIGHT) {
rq->rq_flags &= ~RQF_MQ_INFLIGHT;
@@ -224,7 +210,7 @@ static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
static inline void blk_mq_put_driver_tag(struct request *rq)
{
- if (rq->tag == -1 || rq->internal_tag == -1)
+ if (rq->tag == BLK_MQ_NO_TAG || rq->internal_tag == BLK_MQ_NO_TAG)
return;
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
@@ -238,4 +224,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio : the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these contexts are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+ struct bio *bio)
+{
+ /*
+ * For regular block devices or read operations, use the context plug
+ * which may be NULL if blk_start_plug() was not executed.
+ */
+ if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+ return current->plug;
+
+ /* Zoned block device write operation case: do not plug the BIO */
+ return NULL;
+}
+
#endif
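
Condensed from the blk_mq_make_request() hunk earlier in this patch, the intended call-site pattern for blk_mq_plug() looks roughly like the following (the plugging condition is elided, and this is an approximation rather than a verbatim copy):

	struct blk_plug *plug = blk_mq_plug(q, bio);

	if (plug /* && plugging makes sense for this request */) {
		/* zoned writes get plug == NULL above, so they skip this path */
		blk_add_rq_to_plug(plug, rq);
	} else {
		blk_mq_sched_insert_request(rq, false, true, true);
	}
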
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 0a028c189897..1adc1cd748b4 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
*/
void blk_set_runtime_active(struct request_queue *q)
{
- spin_lock_irq(&q->queue_lock);
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- spin_unlock_irq(&q->queue_lock);
+ if (q->dev) {
+ spin_lock_irq(&q->queue_lock);
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
+ spin_unlock_irq(&q->queue_lock);
+ }
}
EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 659ccb8b693f..656460636ad3 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
} while (rqos);
}
+void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+ do {
+ if (rqos->ops->merge)
+ rqos->ops->merge(rqos, rq, bio);
+ rqos = rqos->next;
+ } while (rqos);
+}
+
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
do {
@@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
} while (rqos);
}
+void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
+{
+ do {
+ if (rqos->ops->queue_depth_changed)
+ rqos->ops->queue_depth_changed(rqos);
+ rqos = rqos->next;
+ } while (rqos);
+}
+
/*
* Return true, if we can't increase the depth further by scaling
*/
@@ -142,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd)
return ret;
}
-void rq_depth_scale_up(struct rq_depth *rqd)
+/* Returns true on success and false if scaling up wasn't possible */
+bool rq_depth_scale_up(struct rq_depth *rqd)
{
/*
* Hit max in previous round, stop here
*/
if (rqd->scaled_max)
- return;
+ return false;
rqd->scale_step--;
rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+ return true;
}
/*
* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
+ * had a latency violation. Returns true on success and false if
+ * scaling down wasn't possible.
*/
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
/*
* Stop scaling down when we've hit the limit. This also prevents
@@ -167,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
* keep up.
*/
if (rqd->max_depth == 1)
- return;
+ return false;
if (rqd->scale_step < 0 && hard_throttle)
rqd->scale_step = 0;
@@ -176,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
rqd->scaled_max = false;
rq_depth_calc_max_depth(rqd);
+ return true;
}
struct rq_qos_wait_data {
@@ -202,6 +224,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
+ smp_wmb();
list_del_init(&curr->entry);
wake_up_process(data->task);
return 1;
@@ -244,7 +267,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
return;
prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+ has_sleeper = !wq_has_single_sleeper(&rqw->wait);
do {
+ /* The memory barrier in set_task_state saves us here. */
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
@@ -255,12 +280,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
* which means we now have two. Put our local token
* and wake anyone else potentially waiting for one.
*/
+ smp_rmb();
if (data.got_token)
cleanup_cb(rqw, private_data);
break;
}
io_schedule();
- has_sleeper = false;
+ has_sleeper = true;
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2300e038b9fa..2bc43e94f4c4 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr;
enum rq_qos_id {
RQ_QOS_WBT,
- RQ_QOS_CGROUP,
+ RQ_QOS_LATENCY,
+ RQ_QOS_COST,
};
struct rq_wait {
@@ -35,11 +36,13 @@ struct rq_qos {
struct rq_qos_ops {
void (*throttle)(struct rq_qos *, struct bio *);
void (*track)(struct rq_qos *, struct request *, struct bio *);
+ void (*merge)(struct rq_qos *, struct request *, struct bio *);
void (*issue)(struct rq_qos *, struct request *);
void (*requeue)(struct rq_qos *, struct request *);
void (*done)(struct rq_qos *, struct request *);
void (*done_bio)(struct rq_qos *, struct bio *);
void (*cleanup)(struct rq_qos *, struct bio *);
+ void (*queue_depth_changed)(struct rq_qos *);
void (*exit)(struct rq_qos *);
const struct blk_mq_debugfs_attr *debugfs_attrs;
};
@@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
{
- return rq_qos_id(q, RQ_QOS_CGROUP);
+ return rq_qos_id(q, RQ_QOS_LATENCY);
}
static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
@@ -80,8 +83,10 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
switch (id) {
case RQ_QOS_WBT:
return "wbt";
- case RQ_QOS_CGROUP:
- return "cgroup";
+ case RQ_QOS_LATENCY:
+ return "latency";
+ case RQ_QOS_COST:
+ return "cost";
}
return "unknown";
}
@@ -103,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
{
- struct rq_qos *cur, *prev = NULL;
- for (cur = q->rq_qos; cur; cur = cur->next) {
- if (cur == rqos) {
- if (prev)
- prev->next = rqos->next;
- else
- q->rq_qos = rqos->next;
+ struct rq_qos **cur;
+
+ for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+ if (*cur == rqos) {
+ *cur = rqos->next;
break;
}
- prev = cur;
}
blk_mq_debugfs_unregister_rqos(rqos);
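
The rewrite above replaces explicit prev-pointer bookkeeping with the pointer-to-pointer idiom for unlinking an element from a singly linked list, so the list head and interior ->next fields are handled by the same assignment. The same idiom in isolation (illustrative names, not from this patch):

struct node {
	struct node *next;
	int val;
};

/* Unlink @victim from the list headed at *head, if it is present. */
static void demo_list_remove(struct node **head, struct node *victim)
{
	struct node **cur;

	for (cur = head; *cur; cur = &(*cur)->next) {
		if (*cur == victim) {
			/* Works unchanged for the head and for interior nodes. */
			*cur = victim->next;
			break;
		}
	}
}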
@@ -125,8 +127,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
acquire_inflight_cb_t *acquire_inflight_cb,
cleanup_cb_t *cleanup_cb);
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
-void rq_depth_scale_up(struct rq_depth *rqd);
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_scale_up(struct rq_depth *rqd);
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
@@ -135,7 +137,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
+void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
+void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
@@ -185,6 +189,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
__rq_qos_track(q->rq_qos, rq, bio);
}
+static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ if (q->rq_qos)
+ __rq_qos_merge(q->rq_qos, rq, bio);
+}
+
+static inline void rq_qos_queue_depth_changed(struct request_queue *q)
+{
+ if (q->rq_qos)
+ __rq_qos_queue_depth_changed(q->rq_qos);
+}
+
void rq_qos_exit(struct request_queue *);
#endif
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 2ae348c101a0..9a2c23cd9700 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -12,6 +12,7 @@
#include <linux/lcm.h>
#include <linux/jiffies.h>
#include <linux/gfp.h>
+#include <linux/dma-mapping.h>
#include "blk.h"
#include "blk-wbt.h"
@@ -47,6 +48,7 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->chunk_sectors = 0;
lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
+ lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
lim->discard_granularity = 0;
@@ -82,46 +84,11 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_dev_sectors = UINT_MAX;
lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
+ lim->max_zone_append_sectors = UINT_MAX;
}
EXPORT_SYMBOL(blk_set_stacking_limits);
/**
- * blk_queue_make_request - define an alternate make_request function for a device
- * @q: the request queue for the device to be affected
- * @mfn: the alternate make_request function
- *
- * Description:
- * The normal way for &struct bios to be passed to a device
- * driver is for them to be collected into requests on a request
- * queue, and then to allow the device driver to select requests
- * off that queue when it is ready. This works well for many block
- * devices. However some block devices (typically virtual devices
- * such as md or lvm) do not benefit from the processing on the
- * request queue, and are served best by having the requests passed
- * directly to them. This can be achieved by providing a function
- * to blk_queue_make_request().
- *
- * Caveat:
- * The driver that does this *must* be able to deal appropriately
- * with buffers in "highmemory". This can be accomplished by either calling
- * kmap_atomic() to get a temporary kernel mapping, or by calling
- * blk_queue_bounce() to create a buffer in normal memory.
- **/
-void blk_queue_make_request(struct request_queue *q, make_request_fn *mfn)
-{
- /*
- * set defaults
- */
- q->nr_requests = BLKDEV_MAX_RQ;
-
- q->make_request_fn = mfn;
- blk_queue_dma_alignment(q, 511);
-
- blk_set_default_limits(&q->limits);
-}
-EXPORT_SYMBOL(blk_queue_make_request);
-
-/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
* @max_addr: the maximum address the device can handle
@@ -257,6 +224,33 @@ void blk_queue_max_write_zeroes_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_write_zeroes_sectors);
/**
+ * blk_queue_max_zone_append_sectors - set max sectors for a single zone append
+ * @q: the request queue for the device
+ * @max_zone_append_sectors: maximum number of sectors to write per command
+ **/
+void blk_queue_max_zone_append_sectors(struct request_queue *q,
+ unsigned int max_zone_append_sectors)
+{
+ unsigned int max_sectors;
+
+ if (WARN_ON(!blk_queue_is_zoned(q)))
+ return;
+
+ max_sectors = min(q->limits.max_hw_sectors, max_zone_append_sectors);
+ max_sectors = min(q->limits.chunk_sectors, max_sectors);
+
+ /*
+ * Signal eventual driver bugs resulting in the max_zone_append sectors limit
+ * being 0 due to a 0 argument, the chunk_sectors limit (zone size) not set,
+ * or the max_hw_sectors limit not set.
+ */
+ WARN_ON(!max_sectors);
+
+ q->limits.max_zone_append_sectors = max_sectors;
+}
+EXPORT_SYMBOL_GPL(blk_queue_max_zone_append_sectors);
+
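
A zoned driver would typically call the new helper once its zone size (chunk_sectors) and max_hw_sectors limits are known, since the value is clamped against both. A hedged sketch of queue setup for a hypothetical host-managed zoned driver (the function and its parameters are illustrative):

#include <linux/blkdev.h>

static void demo_zoned_setup_queue(struct request_queue *q,
				   unsigned int zone_sectors,
				   unsigned int max_append_sectors)
{
	q->limits.zoned = BLK_ZONED_HM;		/* host-managed zone model */
	blk_queue_chunk_sectors(q, zone_sectors);	/* zone size, power of two */

	/* Clamped against chunk_sectors and max_hw_sectors by the helper. */
	blk_queue_max_zone_append_sectors(q, max_append_sectors);
}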
+/**
* blk_queue_max_segments - set max hw segments for a request for this queue
* @q: the request queue for the device
* @max_segments: max number of segments
@@ -327,7 +321,7 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
* storage device can address. The default of 512 covers most
* hardware.
**/
-void blk_queue_logical_block_size(struct request_queue *q, unsigned short size)
+void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
{
q->limits.logical_block_size = size;
@@ -505,6 +499,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
+ t->max_zone_append_sectors = min(t->max_zone_append_sectors,
+ b->max_zone_append_sectors);
t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
@@ -663,6 +659,9 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
top, bottom);
}
+
+ t->backing_dev_info->io_pages =
+ t->limits.max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(disk_stack_limits);
@@ -684,43 +683,6 @@ void blk_queue_update_dma_pad(struct request_queue *q, unsigned int mask)
EXPORT_SYMBOL(blk_queue_update_dma_pad);
/**
- * blk_queue_dma_drain - Set up a drain buffer for excess dma.
- * @q: the request queue for the device
- * @dma_drain_needed: fn which returns non-zero if drain is necessary
- * @buf: physically contiguous buffer
- * @size: size of the buffer in bytes
- *
- * Some devices have excess DMA problems and can't simply discard (or
- * zero fill) the unwanted piece of the transfer. They have to have a
- * real area of memory to transfer it into. The use case for this is
- * ATAPI devices in DMA mode. If the packet command causes a transfer
- * bigger than the transfer size some HBAs will lock up if there
- * aren't DMA elements to contain the excess transfer. What this API
- * does is adjust the queue so that the buf is always appended
- * silently to the scatterlist.
- *
- * Note: This routine adjusts max_hw_segments to make room for appending
- * the drain buffer. If you call blk_queue_max_segments() after calling
- * this routine, you must set the limit to one fewer than your device
- * can support otherwise there won't be room for the drain buffer.
- */
-int blk_queue_dma_drain(struct request_queue *q,
- dma_drain_needed_fn *dma_drain_needed,
- void *buf, unsigned int size)
-{
- if (queue_max_segments(q) < 2)
- return -EINVAL;
- /* make room for appending the drain */
- blk_queue_max_segments(q, queue_max_segments(q) - 1);
- q->dma_drain_needed = dma_drain_needed;
- q->dma_drain_buffer = buf;
- q->dma_drain_size = size;
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(blk_queue_dma_drain);
-
-/**
* blk_queue_segment_boundary - set boundary rules for segment merging
* @q: the request queue for the device
* @mask: the memory boundary mask
@@ -752,7 +714,8 @@ void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
* page (which might not be identical to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
- q->limits.max_segment_size = UINT_MAX;
+ if (mask)
+ q->limits.max_segment_size = UINT_MAX;
}
EXPORT_SYMBOL(blk_queue_virt_boundary);
@@ -804,7 +767,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
- wbt_set_queue_depth(q, depth);
+ rq_qos_queue_depth_changed(q);
}
EXPORT_SYMBOL(blk_set_queue_depth);
@@ -831,6 +794,44 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
+/**
+ * blk_queue_required_elevator_features - Set the required elevator features for a queue
+ * @q: the request queue for the target device
+ * @features: Required elevator features OR'ed together
+ *
+ * Tell the block layer that for the device controlled through @q, the only
+ * elevators that can be used are those that implement at least the set of
+ * features specified by @features.
+ */
+void blk_queue_required_elevator_features(struct request_queue *q,
+ unsigned int features)
+{
+ q->required_elevator_features = features;
+}
+EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
+
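
The only feature flag defined alongside this interface is ELEVATOR_F_ZBD_SEQ_WRITE, which zoned drivers use so that only schedulers able to preserve sequential write ordering (such as mq-deadline) are offered for the device. A hedged usage sketch (the wrapper function is illustrative):

#include <linux/blkdev.h>
#include <linux/elevator.h>

static void demo_zoned_restrict_elevators(struct request_queue *q)
{
	/*
	 * Writes to sequential zones must reach the device in order, so only
	 * allow elevators that advertise zoned sequential-write support.
	 */
	blk_queue_required_elevator_features(q, ELEVATOR_F_ZBD_SEQ_WRITE);
}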
+/**
+ * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
+ * @q: the request queue for the device
+ * @dev: the device pointer for dma
+ *
+ * Tell the block layer that the DMA mapping done for @dev may merge the segments of @q.
+ */
+bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
+ struct device *dev)
+{
+ unsigned long boundary = dma_get_merge_boundary(dev);
+
+ if (!boundary)
+ return false;
+
+ /* No need to update max_segment_size. see blk_queue_virt_boundary() */
+ blk_queue_virt_boundary(q, boundary);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
+
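
A driver whose device sits behind an IOMMU can use the return value to learn whether the DMA mapping layer will merge segments on its behalf (the MMC core was the motivating user). A hedged sketch, with illustrative names:

#include <linux/blkdev.h>
#include <linux/device.h>

static void demo_setup_dma_merging(struct request_queue *q,
				   struct device *dma_dev)
{
	/*
	 * If the DMA layer exposes a merge boundary (e.g. an IOMMU page
	 * boundary), align segments to it and rely on dma_map_sg() to
	 * coalesce them; otherwise keep the driver's own segment limits.
	 */
	if (blk_queue_can_use_dma_map_merging(q, dma_dev))
		dev_info(dma_dev, "using DMA map segment merging\n");
}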
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
index 457d9ba3eb20..6e7ec87d49fa 100644
--- a/block/blk-softirq.c
+++ b/block/blk-softirq.c
@@ -42,17 +42,13 @@ static __latent_entropy void blk_done_softirq(struct softirq_action *h)
static void trigger_softirq(void *data)
{
struct request *rq = data;
- unsigned long flags;
struct list_head *list;
- local_irq_save(flags);
list = this_cpu_ptr(&blk_cpu_done);
list_add_tail(&rq->ipi_list, list);
if (list->next == &rq->ipi_list)
raise_softirq_irqoff(BLOCK_SOFTIRQ);
-
- local_irq_restore(flags);
}
/*
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 940f15d600f8..7da302ff88d0 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -53,7 +53,7 @@ void blk_stat_add(struct request *rq, u64 now)
struct request_queue *q = rq->q;
struct blk_stat_callback *cb;
struct blk_rq_stat *stat;
- int bucket;
+ int bucket, cpu;
u64 value;
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
@@ -61,6 +61,7 @@ void blk_stat_add(struct request *rq, u64 now)
blk_throtl_stat_add(rq, value);
rcu_read_lock();
+ cpu = get_cpu();
list_for_each_entry_rcu(cb, &q->stats->callbacks, list) {
if (!blk_stat_is_active(cb))
continue;
@@ -69,10 +70,10 @@ void blk_stat_add(struct request *rq, u64 now)
if (bucket < 0)
continue;
- stat = &get_cpu_ptr(cb->cpu_stat)[bucket];
+ stat = &per_cpu_ptr(cb->cpu_stat, cpu)[bucket];
blk_rq_stat_add(stat, value);
- put_cpu_ptr(cb->cpu_stat);
}
+ put_cpu();
rcu_read_unlock();
}
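
The blk_stat_add() change above swaps per-callback get_cpu_ptr()/put_cpu_ptr() pairs for a single get_cpu()/put_cpu() around the whole loop, so preemption is disabled once rather than per iteration. The same pattern in isolation (a hedged, illustrative sketch):

#include <linux/percpu.h>
#include <linux/smp.h>
#include <linux/types.h>

struct demo_stat {
	u64 sum;
	u64 nr;
};

static void demo_stat_add(struct demo_stat __percpu *stats, u64 value)
{
	struct demo_stat *s;
	int cpu;

	cpu = get_cpu();		/* disable preemption once */
	s = per_cpu_ptr(stats, cpu);	/* cheap lookups while pinned */
	s->sum += value;
	s->nr++;
	put_cpu();
}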
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..02643e149d5e 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -218,6 +218,13 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}
+static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
+{
+ unsigned long long max_sectors = q->limits.max_zone_append_sectors;
+
+ return sprintf(page, "%llu\n", max_sectors << SECTOR_SHIFT);
+}
+
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
@@ -482,7 +489,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
blk_mq_quiesce_queue(q);
wbt_set_min_lat(q, val);
- wbt_update_limits(q);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
@@ -640,6 +646,11 @@ static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
.show = queue_write_zeroes_max_show,
};
+static struct queue_sysfs_entry queue_zone_append_max_entry = {
+ .attr = {.name = "zone_append_max_bytes", .mode = 0444 },
+ .show = queue_zone_append_max_show,
+};
+
static struct queue_sysfs_entry queue_nonrot_entry = {
.attr = {.name = "rotational", .mode = 0644 },
.show = queue_show_nonrot,
@@ -750,6 +761,7 @@ static struct attribute *queue_attrs[] = {
&queue_discard_zeroes_data_entry.attr,
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
+ &queue_zone_append_max_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
@@ -802,10 +814,6 @@ queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
if (!entry->show)
return -EIO;
mutex_lock(&q->sysfs_lock);
- if (blk_queue_dying(q)) {
- mutex_unlock(&q->sysfs_lock);
- return -ENOENT;
- }
res = entry->show(q, page);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -824,10 +832,6 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
- if (blk_queue_dying(q)) {
- mutex_unlock(&q->sysfs_lock);
- return -ENOENT;
- }
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
@@ -892,6 +896,9 @@ static void __blk_release_queue(struct work_struct *work)
blk_free_queue_stats(q->stats);
+ if (queue_is_mq(q))
+ cancel_delayed_work_sync(&q->requeue_work);
+
blk_exit_queue(q);
blk_queue_free_zone_bitmaps(q);
@@ -938,14 +945,14 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
+ bool has_elevator = false;
if (WARN_ON(!q))
return -ENXIO;
- WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+ WARN_ONCE(blk_queue_registered(q),
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
- blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -965,8 +972,7 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
- /* Prevent changes through sysfs until registration is completed. */
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
@@ -987,26 +993,33 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
}
- kobject_uevent(&q->kobj, KOBJ_ADD);
-
- wbt_enable_default(q);
-
- blk_throtl_register_queue(q);
-
+ mutex_lock(&q->sysfs_lock);
if (q->elevator) {
- ret = elv_register_queue(q);
+ ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->sysfs_lock);
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
+ has_elevator = true;
}
+
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(q);
+ blk_throtl_register_queue(q);
+
+ /* Now everything is ready; send out the KOBJ_ADD uevent */
+ kobject_uevent(&q->kobj, KOBJ_ADD);
+ if (has_elevator)
+ kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
+ mutex_unlock(&q->sysfs_lock);
+
ret = 0;
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -1026,7 +1039,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
/* Return early if disk->queue was never registered. */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return;
/*
@@ -1035,16 +1048,16 @@ void blk_unregister_queue(struct gendisk *disk)
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
-
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+ mutex_unlock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
- mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
@@ -1054,6 +1067,7 @@ void blk_unregister_queue(struct gendisk *disk)
if (q->elevator)
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..209fdd8939fb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -12,6 +12,7 @@
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"
+#include "blk-cgroup-rwstat.h"
/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;
@@ -176,6 +177,9 @@ struct throtl_grp {
unsigned int bio_cnt; /* total bios */
unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
unsigned long bio_cnt_reset_time;
+
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
};
/* We measure latency for request size from <= 4k to >= 1M */
@@ -478,15 +482,23 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
-static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
+ struct request_queue *q,
+ struct blkcg *blkcg)
{
struct throtl_grp *tg;
int rw;
- tg = kzalloc_node(sizeof(*tg), gfp, node);
+ tg = kzalloc_node(sizeof(*tg), gfp, q->node);
if (!tg)
return NULL;
+ if (blkg_rwstat_init(&tg->stat_bytes, gfp))
+ goto err_free_tg;
+
+ if (blkg_rwstat_init(&tg->stat_ios, gfp))
+ goto err_exit_stat_bytes;
+
throtl_service_queue_init(&tg->service_queue);
for (rw = READ; rw <= WRITE; rw++) {
@@ -511,6 +523,12 @@ static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD;
return &tg->pd;
+
+err_exit_stat_bytes:
+ blkg_rwstat_exit(&tg->stat_bytes);
+err_free_tg:
+ kfree(tg);
+ return NULL;
}
static void throtl_pd_init(struct blkg_policy_data *pd)
@@ -609,6 +627,8 @@ static void throtl_pd_free(struct blkg_policy_data *pd)
struct throtl_grp *tg = pd_to_tg(pd);
del_timer_sync(&tg->service_queue.pending_timer);
+ blkg_rwstat_exit(&tg->stat_bytes);
+ blkg_rwstat_exit(&tg->stat_ios);
kfree(tg);
}
@@ -881,13 +901,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
- jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
- /* Slice has just started. Consider one slice interval */
- if (!jiffy_elapsed)
- jiffy_elapsed_rnd = tg->td->throtl_slice;
+ jiffy_elapsed = jiffies - tg->slice_start[rw];
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+ /* Round up to the next throttle slice, wait time must be nonzero */
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -1465,6 +1482,32 @@ static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
return tg_set_conf(of, buf, nbytes, off, false);
}
+static int tg_print_rwstat(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ blkg_prfill_rwstat, &blkcg_policy_throtl,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
+static u64 tg_prfill_rwstat_recursive(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off,
+ &sum);
+ return __blkg_prfill_rwstat(sf, pd, &sum);
+}
+
+static int tg_print_rwstat_recursive(struct seq_file *sf, void *v)
+{
+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
+ tg_prfill_rwstat_recursive, &blkcg_policy_throtl,
+ seq_cft(sf)->private, true);
+ return 0;
+}
+
static struct cftype throtl_legacy_files[] = {
{
.name = "throttle.read_bps_device",
@@ -1492,23 +1535,23 @@ static struct cftype throtl_legacy_files[] = {
},
{
.name = "throttle.io_service_bytes",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_bytes,
+ .private = offsetof(struct throtl_grp, stat_bytes),
+ .seq_show = tg_print_rwstat,
},
{
.name = "throttle.io_service_bytes_recursive",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_bytes_recursive,
+ .private = offsetof(struct throtl_grp, stat_bytes),
+ .seq_show = tg_print_rwstat_recursive,
},
{
.name = "throttle.io_serviced",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_ios,
+ .private = offsetof(struct throtl_grp, stat_ios),
+ .seq_show = tg_print_rwstat,
},
{
.name = "throttle.io_serviced_recursive",
- .private = (unsigned long)&blkcg_policy_throtl,
- .seq_show = blkg_print_stat_ios_recursive,
+ .private = offsetof(struct throtl_grp, stat_ios),
+ .seq_show = tg_print_rwstat_recursive,
},
{ } /* terminate */
};
@@ -2128,7 +2171,16 @@ bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
WARN_ON_ONCE(!rcu_read_lock_held());
/* see throtl_charge_bio() */
- if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
+ if (bio_flagged(bio, BIO_THROTTLED))
+ goto out;
+
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+ blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
+ bio->bi_iter.bi_size);
+ blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
+ }
+
+ if (!tg->has_rules[rw])
goto out;
spin_lock_irq(&q->queue_lock);
@@ -2249,7 +2301,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
struct request_queue *q = rq->q;
struct throtl_data *td = q->td;
- throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
+ throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
+ time_ns >> 10);
}
void blk_throtl_bio_endio(struct bio *bio)
@@ -2305,69 +2358,6 @@ void blk_throtl_bio_endio(struct bio *bio)
}
#endif
-/*
- * Dispatch all bios from all children tg's queued on @parent_sq. On
- * return, @parent_sq is guaranteed to not have any active children tg's
- * and all bios from previously active tg's are on @parent_sq->bio_lists[].
- */
-static void tg_drain_bios(struct throtl_service_queue *parent_sq)
-{
- struct throtl_grp *tg;
-
- while ((tg = throtl_rb_first(parent_sq))) {
- struct throtl_service_queue *sq = &tg->service_queue;
- struct bio *bio;
-
- throtl_dequeue_tg(tg);
-
- while ((bio = throtl_peek_queued(&sq->queued[READ])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
- tg_dispatch_one_bio(tg, bio_data_dir(bio));
- }
-}
-
-/**
- * blk_throtl_drain - drain throttled bios
- * @q: request_queue to drain throttled bios for
- *
- * Dispatch all currently throttled bios on @q through ->make_request_fn().
- */
-void blk_throtl_drain(struct request_queue *q)
- __releases(&q->queue_lock) __acquires(&q->queue_lock)
-{
- struct throtl_data *td = q->td;
- struct blkcg_gq *blkg;
- struct cgroup_subsys_state *pos_css;
- struct bio *bio;
- int rw;
-
- rcu_read_lock();
-
- /*
- * Drain each tg while doing post-order walk on the blkg tree, so
- * that all bios are propagated to td->service_queue. It'd be
- * better to walk service_queue tree directly but blkg walk is
- * easier.
- */
- blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
- tg_drain_bios(&blkg_to_tg(blkg)->service_queue);
-
- /* finally, transfer bios from top-level tg's into the td */
- tg_drain_bios(&td->service_queue);
-
- rcu_read_unlock();
- spin_unlock_irq(&q->queue_lock);
-
- /* all bios now should be in td->service_queue, issue them */
- for (rw = READ; rw <= WRITE; rw++)
- while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
- NULL)))
- generic_make_request(bio);
-
- spin_lock_irq(&q->queue_lock);
-}
-
int blk_throtl_init(struct request_queue *q)
{
struct throtl_data *td;
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 313f45a37e9d..0fa615eefd52 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -308,19 +308,21 @@ static void calc_wb_limits(struct rq_wb *rwb)
static void scale_up(struct rq_wb *rwb)
{
- rq_depth_scale_up(&rwb->rq_depth);
+ if (!rq_depth_scale_up(&rwb->rq_depth))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
rwb_wake_all(rwb);
- rwb_trace_step(rwb, "scale up");
+ rwb_trace_step(rwb, tracepoint_string("scale up"));
}
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
- rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
+ if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
- rwb_trace_step(rwb, "scale down");
+ rwb_trace_step(rwb, tracepoint_string("scale down"));
}
static void rwb_arm_timer(struct rq_wb *rwb)
@@ -403,7 +405,7 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
rwb_arm_timer(rwb);
}
-static void __wbt_update_limits(struct rq_wb *rwb)
+static void wbt_update_limits(struct rq_wb *rwb)
{
struct rq_depth *rqd = &rwb->rq_depth;
@@ -416,14 +418,6 @@ static void __wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
-void wbt_update_limits(struct request_queue *q)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (!rqos)
- return;
- __wbt_update_limits(RQWB(rqos));
-}
-
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -439,7 +433,7 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
return;
RQWB(rqos)->min_lat_nsec = val;
RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
- __wbt_update_limits(RQWB(rqos));
+ wbt_update_limits(RQWB(rqos));
}
@@ -629,15 +623,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos) {
- RQWB(rqos)->rq_depth.queue_depth = depth;
- __wbt_update_limits(RQWB(rqos));
- }
-}
-
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -656,7 +641,7 @@ void wbt_enable_default(struct request_queue *q)
return;
/* Queue not registered? Maybe shutting down... */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return;
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
@@ -689,6 +674,12 @@ static int wbt_data_dir(const struct request *rq)
return -1;
}
+static void wbt_queue_depth_changed(struct rq_qos *rqos)
+{
+ RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
+ wbt_update_limits(RQWB(rqos));
+}
+
static void wbt_exit(struct rq_qos *rqos)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -811,6 +802,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
.requeue = wbt_requeue,
.done = wbt_done,
.cleanup = wbt_cleanup,
+ .queue_depth_changed = wbt_queue_depth_changed,
.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
.debugfs_attrs = wbt_debugfs_attrs,
@@ -843,7 +835,7 @@ int wbt_init(struct request_queue *q)
rwb->enable_state = WBT_STATE_ON_DEFAULT;
rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
- __wbt_update_limits(rwb);
+ wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
@@ -853,7 +845,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
- wbt_set_queue_depth(q, blk_queue_depth(q));
+ wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index f47218d5b3b2..16bdc85b8df9 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -88,14 +88,12 @@ static inline unsigned int wbt_inflight(struct rq_wb *rwb)
#ifdef CONFIG_BLK_WBT
int wbt_init(struct request_queue *);
-void wbt_update_limits(struct request_queue *);
void wbt_disable_default(struct request_queue *);
void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
-void wbt_set_queue_depth(struct request_queue *, unsigned int);
void wbt_set_write_cache(struct request_queue *, bool);
u64 wbt_default_latency_nsec(struct request_queue *);
@@ -109,18 +107,12 @@ static inline int wbt_init(struct request_queue *q)
{
return -EINVAL;
}
-static inline void wbt_update_limits(struct request_queue *q)
-{
-}
static inline void wbt_disable_default(struct request_queue *q)
{
}
static inline void wbt_enable_default(struct request_queue *q)
{
}
-static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
-{
-}
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
{
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..23831fa8701d 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,9 +14,44 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include "blk.h"
+#define ZONE_COND_NAME(name) [BLK_ZONE_COND_##name] = #name
+static const char *const zone_cond_name[] = {
+ ZONE_COND_NAME(NOT_WP),
+ ZONE_COND_NAME(EMPTY),
+ ZONE_COND_NAME(IMP_OPEN),
+ ZONE_COND_NAME(EXP_OPEN),
+ ZONE_COND_NAME(CLOSED),
+ ZONE_COND_NAME(READONLY),
+ ZONE_COND_NAME(FULL),
+ ZONE_COND_NAME(OFFLINE),
+};
+#undef ZONE_COND_NAME
+
+/**
+ * blk_zone_cond_str - Return string XXX in BLK_ZONE_COND_XXX.
+ * @zone_cond: BLK_ZONE_COND_XXX.
+ *
+ * Description: Centralize block layer function to convert BLK_ZONE_COND_XXX
+ * into string format. Useful for debugging and tracing zone conditions. For
+ * invalid BLK_ZONE_COND_XXX it returns string "UNKNOWN".
+ */
+const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
+{
+ static const char *zone_cond_str = "UNKNOWN";
+
+ if (zone_cond < ARRAY_SIZE(zone_cond_name) && zone_cond_name[zone_cond])
+ zone_cond_str = zone_cond_name[zone_cond];
+
+ return zone_cond_str;
+}
+EXPORT_SYMBOL_GPL(blk_zone_cond_str);
+
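
A hedged example of how driver or tracing code might use the new helper when dumping a zone report; the printing context is illustrative:

#include <linux/blkdev.h>

static void demo_dump_zone(struct gendisk *disk, struct blk_zone *zone)
{
	pr_debug("%s: zone start %llu len %llu cond %s\n",
		 disk->disk_name,
		 (unsigned long long)zone->start,
		 (unsigned long long)zone->len,
		 blk_zone_cond_str(zone->cond));
}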
static inline sector_t blk_zone_start(struct request_queue *q,
sector_t sector)
{
@@ -47,6 +82,20 @@ bool blk_req_needs_zone_write_lock(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
+bool blk_req_zone_write_trylock(struct request *rq)
+{
+ unsigned int zno = blk_rq_zone_no(rq);
+
+ if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+ return false;
+
+ WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
+ rq->rq_flags |= RQF_ZONE_WRITE_LOCKED;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
+
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
@@ -67,160 +116,98 @@ void __blk_req_zone_write_unlock(struct request *rq)
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
-static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
- sector_t nr_sectors)
-{
- unsigned long zone_sectors = blk_queue_zone_sectors(q);
-
- return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
-}
-
/**
* blkdev_nr_zones - Get number of zones
- * @bdev: Target block device
+ * @disk: Target gendisk
*
- * Description:
- * Return the total number of zones of a zoned block device.
- * For a regular block device, the number of zones is always 0.
+ * Return the total number of zones of a zoned block device. For a block
+ * device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct block_device *bdev)
+unsigned int blkdev_nr_zones(struct gendisk *disk)
{
- struct request_queue *q = bdev_get_queue(bdev);
+ sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
- if (!blk_queue_is_zoned(q))
+ if (!blk_queue_is_zoned(disk->queue))
return 0;
-
- return __blkdev_nr_zones(q, bdev->bd_part->nr_sects);
+ return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
}
EXPORT_SYMBOL_GPL(blkdev_nr_zones);
-/*
- * Check that a zone report belongs to this partition, and if yes, fix its start
- * sector and write pointer and return true. Return false otherwise.
- */
-static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
-{
- sector_t offset = get_start_sect(bdev);
-
- if (rep->start < offset)
- return false;
-
- rep->start -= offset;
- if (rep->start + rep->len > bdev->bd_part->nr_sects)
- return false;
-
- if (rep->type == BLK_ZONE_TYPE_CONVENTIONAL)
- rep->wp = rep->start + rep->len;
- else
- rep->wp -= offset;
- return true;
-}
-
-static int blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
-{
- struct request_queue *q = disk->queue;
- unsigned int z = 0, n, nrz = *nr_zones;
- sector_t capacity = get_capacity(disk);
- int ret;
-
- while (z < nrz && sector < capacity) {
- n = nrz - z;
- ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
- gfp_mask);
- if (ret)
- return ret;
- if (!n)
- break;
- sector += blk_queue_zone_sectors(q) * n;
- z += n;
- }
-
- WARN_ON(z > *nr_zones);
- *nr_zones = z;
-
- return 0;
-}
-
/**
* blkdev_report_zones - Get zones information
* @bdev: Target block device
* @sector: Sector from which to report zones
- * @zones: Array of zone structures where to return the zones information
- * @nr_zones: Number of zone structures in the zone array
- * @gfp_mask: Memory allocation flags (for bio_alloc)
+ * @nr_zones: Maximum number of zones to report
+ * @cb: Callback function called for each reported zone
+ * @data: Private data for the callback
*
* Description:
- * Get zone information starting from the zone containing @sector.
- * The number of zone information reported may be less than the number
- * requested by @nr_zones. The number of zones actually reported is
- * returned in @nr_zones.
+ * Get zone information starting from the zone containing @sector for at most
+ * @nr_zones, and call @cb for each zone reported by the device.
+ * To report all zones in a device starting from @sector, the BLK_ALL_ZONES
+ * constant can be passed to @nr_zones.
+ * Returns the number of zones reported by the device, or a negative errno
+ * value in case of failure.
+ *
+ * Note: The caller must use memalloc_noXX_save/restore() calls to control
+ * memory allocations done within this function.
*/
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ unsigned int nr_zones, report_zones_cb cb, void *data)
{
- struct request_queue *q = bdev_get_queue(bdev);
- unsigned int i, nrz;
- int ret;
-
- if (!blk_queue_is_zoned(q))
- return -EOPNOTSUPP;
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t capacity = get_capacity(disk);
- /*
- * A block device that advertized itself as zoned must have a
- * report_zones method. If it does not have one defined, the device
- * driver has a bug. So warn about that.
- */
- if (WARN_ON_ONCE(!bdev->bd_disk->fops->report_zones))
+ if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
+ WARN_ON_ONCE(!disk->fops->report_zones))
return -EOPNOTSUPP;
- if (!*nr_zones || sector >= bdev->bd_part->nr_sects) {
- *nr_zones = 0;
+ if (!nr_zones || sector >= capacity)
return 0;
- }
- nrz = min(*nr_zones,
- __blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
- ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
- zones, &nrz, gfp_mask);
- if (ret)
- return ret;
-
- for (i = 0; i < nrz; i++) {
- if (!blkdev_report_zone(bdev, zones))
- break;
- zones++;
- }
+ return disk->fops->report_zones(disk, sector, nr_zones, cb, data);
+}
+EXPORT_SYMBOL_GPL(blkdev_report_zones);
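
With the callback-based interface, in-kernel users no longer allocate a zone array: they pass a report_zones_cb that is invoked once per reported zone. A hedged sketch that counts the empty zones of a device (names are illustrative):

#include <linux/blkdev.h>

static int demo_count_empty_cb(struct blk_zone *zone, unsigned int idx,
			       void *data)
{
	unsigned int *nr_empty = data;

	if (zone->cond == BLK_ZONE_COND_EMPTY)
		(*nr_empty)++;
	return 0;	/* a nonzero return would abort the report */
}

static int demo_count_empty_zones(struct block_device *bdev,
				  unsigned int *nr_empty)
{
	int ret;

	*nr_empty = 0;
	ret = blkdev_report_zones(bdev, 0, BLK_ALL_ZONES,
				  demo_count_empty_cb, nr_empty);
	return ret < 0 ? ret : 0;
}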
- *nr_zones = i;
+static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
+ sector_t sector,
+ sector_t nr_sectors)
+{
+ if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
+ return false;
- return 0;
+ /*
+ * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
+ * of the applicable zone range is the entire disk.
+ */
+ return !sector && nr_sectors == get_capacity(bdev->bd_disk);
}
-EXPORT_SYMBOL_GPL(blkdev_report_zones);
/**
- * blkdev_reset_zones - Reset zones write pointer
+ * blkdev_zone_mgmt - Execute a zone management operation on a range of zones
* @bdev: Target block device
- * @sector: Start sector of the first zone to reset
- * @nr_sectors: Number of sectors, at least the length of one zone
+ * @op: Operation to be performed on the zones
+ * @sector: Start sector of the first zone to operate on
+ * @nr_sectors: Number of sectors, should be at least the length of one zone and
+ * must be zone size aligned.
* @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
- * Reset the write pointer of the zones contained in the range
+ * Perform the specified operation on the range of zones specified by
* @sector..@sector+@nr_sectors. Specifying the entire disk sector range
* is valid, but the specified range should not contain conventional zones.
+ * The operation to execute on each zone can be a zone reset, open, close
+ * or finish request.
*/
-int blkdev_reset_zones(struct block_device *bdev,
- sector_t sector, sector_t nr_sectors,
- gfp_t gfp_mask)
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
+ sector_t sector, sector_t nr_sectors,
+ gfp_t gfp_mask)
{
struct request_queue *q = bdev_get_queue(bdev);
- sector_t zone_sectors;
+ sector_t zone_sectors = blk_queue_zone_sectors(q);
+ sector_t capacity = get_capacity(bdev->bd_disk);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
- struct blk_plug plug;
int ret;
if (!blk_queue_is_zoned(q))
@@ -229,42 +216,62 @@ int blkdev_reset_zones(struct block_device *bdev,
if (bdev_read_only(bdev))
return -EPERM;
- if (!nr_sectors || end_sector > bdev->bd_part->nr_sects)
+ if (!op_is_zone_mgmt(op))
+ return -EOPNOTSUPP;
+
+ if (end_sector <= sector || end_sector > capacity)
/* Out of range */
return -EINVAL;
/* Check alignment (handle eventual smaller last zone) */
- zone_sectors = blk_queue_zone_sectors(q);
if (sector & (zone_sectors - 1))
return -EINVAL;
- if ((nr_sectors & (zone_sectors - 1)) &&
- end_sector != bdev->bd_part->nr_sects)
+ if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
return -EINVAL;
- blk_start_plug(&plug);
while (sector < end_sector) {
-
bio = blk_next_bio(bio, 0, gfp_mask);
- bio->bi_iter.bi_sector = sector;
bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_ZONE_RESET, 0);
+ /*
+ * Special case for the zone reset operation that reset all
+ * zones, this is useful for applications like mkfs.
+ */
+ if (op == REQ_OP_ZONE_RESET &&
+ blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
+ bio->bi_opf = REQ_OP_ZONE_RESET_ALL;
+ break;
+ }
+
+ bio->bi_opf = op | REQ_SYNC;
+ bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
/* This may take a while, so be nice to others */
cond_resched();
-
}
ret = submit_bio_wait(bio);
bio_put(bio);
- blk_finish_plug(&plug);
-
return ret;
}
-EXPORT_SYMBOL_GPL(blkdev_reset_zones);
+EXPORT_SYMBOL_GPL(blkdev_zone_mgmt);
+
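
A hedged example of the new management interface: resetting the zone that contains a given sector, and resetting the whole device (which lets the helper use REQ_OP_ZONE_RESET_ALL). Both callers are illustrative:

#include <linux/blkdev.h>

/* Reset the write pointer of the zone containing @sector. */
static int demo_reset_one_zone(struct block_device *bdev, sector_t sector)
{
	sector_t zone_sectors = blk_queue_zone_sectors(bdev_get_queue(bdev));
	sector_t zone_start = sector & ~(zone_sectors - 1);	/* power-of-2 zones */

	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, zone_start,
				zone_sectors, GFP_KERNEL);
}

/* Reset every zone; matches the REQ_OP_ZONE_RESET_ALL special case above. */
static int demo_reset_all_zones(struct block_device *bdev)
{
	return blkdev_zone_mgmt(bdev, REQ_OP_ZONE_RESET, 0,
				get_capacity(bdev->bd_disk), GFP_KERNEL);
}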
+struct zone_report_args {
+ struct blk_zone __user *zones;
+};
+
+static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
+ struct zone_report_args *args = data;
+
+ if (copy_to_user(&args->zones[idx], zone, sizeof(struct blk_zone)))
+ return -EFAULT;
+ return 0;
+}
/*
* BLKREPORTZONE ioctl processing.
@@ -274,9 +281,9 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
unsigned int cmd, unsigned long arg)
{
void __user *argp = (void __user *)arg;
+ struct zone_report_args args;
struct request_queue *q;
struct blk_zone_report rep;
- struct blk_zone *zones;
int ret;
if (!argp)
@@ -298,46 +305,29 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!rep.nr_zones)
return -EINVAL;
- rep.nr_zones = min(blkdev_nr_zones(bdev), rep.nr_zones);
-
- zones = kvmalloc_array(rep.nr_zones, sizeof(struct blk_zone),
- GFP_KERNEL | __GFP_ZERO);
- if (!zones)
- return -ENOMEM;
-
- ret = blkdev_report_zones(bdev, rep.sector,
- zones, &rep.nr_zones,
- GFP_KERNEL);
- if (ret)
- goto out;
-
- if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) {
- ret = -EFAULT;
- goto out;
- }
-
- if (rep.nr_zones) {
- if (copy_to_user(argp + sizeof(struct blk_zone_report), zones,
- sizeof(struct blk_zone) * rep.nr_zones))
- ret = -EFAULT;
- }
-
- out:
- kvfree(zones);
+ args.zones = argp + sizeof(struct blk_zone_report);
+ ret = blkdev_report_zones(bdev, rep.sector, rep.nr_zones,
+ blkdev_copy_zone_to_user, &args);
+ if (ret < 0)
+ return ret;
- return ret;
+ rep.nr_zones = ret;
+ if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
+ return -EFAULT;
+ return 0;
}
/*
- * BLKRESETZONE ioctl processing.
+ * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
* Called from blkdev_ioctl.
*/
-int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
+ unsigned int cmd, unsigned long arg)
{
void __user *argp = (void __user *)arg;
struct request_queue *q;
struct blk_zone_range zrange;
+ enum req_opf op;
if (!argp)
return -EINVAL;
@@ -358,8 +348,25 @@ int blkdev_reset_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
return -EFAULT;
- return blkdev_reset_zones(bdev, zrange.sector, zrange.nr_sectors,
- GFP_KERNEL);
+ switch (cmd) {
+ case BLKRESETZONE:
+ op = REQ_OP_ZONE_RESET;
+ break;
+ case BLKOPENZONE:
+ op = REQ_OP_ZONE_OPEN;
+ break;
+ case BLKCLOSEZONE:
+ op = REQ_OP_ZONE_CLOSE;
+ break;
+ case BLKFINISHZONE:
+ op = REQ_OP_ZONE_FINISH;
+ break;
+ default:
+ return -ENOTTY;
+ }
+
+ return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
+ GFP_KERNEL);
}
static inline unsigned long *blk_alloc_zone_bitmap(int node,
@@ -369,130 +376,158 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
GFP_NOIO, node);
}
+void blk_queue_free_zone_bitmaps(struct request_queue *q)
+{
+ kfree(q->conv_zones_bitmap);
+ q->conv_zones_bitmap = NULL;
+ kfree(q->seq_zones_wlock);
+ q->seq_zones_wlock = NULL;
+}
+
+struct blk_revalidate_zone_args {
+ struct gendisk *disk;
+ unsigned long *conv_zones_bitmap;
+ unsigned long *seq_zones_wlock;
+ unsigned int nr_zones;
+ sector_t zone_sectors;
+ sector_t sector;
+};
+
/*
- * Allocate an array of struct blk_zone to get nr_zones zone information.
- * The allocated array may be smaller than nr_zones.
+ * Helper function to check the validity of zones of a zoned block device.
*/
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
{
- size_t size = *nr_zones * sizeof(struct blk_zone);
- struct page *page;
- int order;
-
- for (order = get_order(size); order >= 0; order--) {
- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
- if (page) {
- *nr_zones = min_t(unsigned int, *nr_zones,
- (PAGE_SIZE << order) / sizeof(struct blk_zone));
- return page_address(page);
+ struct blk_revalidate_zone_args *args = data;
+ struct gendisk *disk = args->disk;
+ struct request_queue *q = disk->queue;
+ sector_t capacity = get_capacity(disk);
+
+ /*
+ * All zones must have the same size, with the possible exception of a
+ * smaller last zone.
+ */
+ if (zone->start == 0) {
+ if (zone->len == 0 || !is_power_of_2(zone->len)) {
+ pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
+ disk->disk_name, zone->len);
+ return -ENODEV;
+ }
+
+ args->zone_sectors = zone->len;
+ args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
+ } else if (zone->start + args->zone_sectors < capacity) {
+ if (zone->len != args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with non constant zone size\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+ } else {
+ if (zone->len > args->zone_sectors) {
+ pr_warn("%s: Invalid zoned device with larger last zone size\n",
+ disk->disk_name);
+ return -ENODEV;
}
}
- return NULL;
-}
+ /* Check for holes in the zone report */
+ if (zone->start != args->sector) {
+ pr_warn("%s: Zone gap at sectors %llu..%llu\n",
+ disk->disk_name, args->sector, zone->start);
+ return -ENODEV;
+ }
-void blk_queue_free_zone_bitmaps(struct request_queue *q)
-{
- kfree(q->seq_zones_bitmap);
- q->seq_zones_bitmap = NULL;
- kfree(q->seq_zones_wlock);
- q->seq_zones_wlock = NULL;
+ /* Check zone type */
+ switch (zone->type) {
+ case BLK_ZONE_TYPE_CONVENTIONAL:
+ if (!args->conv_zones_bitmap) {
+ args->conv_zones_bitmap =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->conv_zones_bitmap)
+ return -ENOMEM;
+ }
+ set_bit(idx, args->conv_zones_bitmap);
+ break;
+ case BLK_ZONE_TYPE_SEQWRITE_REQ:
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
+ if (!args->seq_zones_wlock) {
+ args->seq_zones_wlock =
+ blk_alloc_zone_bitmap(q->node, args->nr_zones);
+ if (!args->seq_zones_wlock)
+ return -ENOMEM;
+ }
+ break;
+ default:
+ pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
+ disk->disk_name, (int)zone->type, zone->start);
+ return -ENODEV;
+ }
+
+ args->sector += zone->len;
+ return 0;
}
/**
* blk_revalidate_disk_zones - (re)allocate and initialize zone bitmaps
* @disk: Target disk
+ * @update_driver_data: Callback to update driver data on the frozen disk
*
* Helper function for low-level device drivers to (re) allocate and initialize
* a disk request queue zone bitmaps. This function should normally be called
- * within the disk ->revalidate method. For BIO based queues, no zone bitmap
- * is allocated.
+ * within the disk ->revalidate method for blk-mq based drivers. For BIO based
+ * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
+ * is correct.
+ * If the @update_driver_data callback function is not NULL, the callback is
+ * executed with the device request queue frozen after all zones have been
+ * checked.
*/
-int blk_revalidate_disk_zones(struct gendisk *disk)
+int blk_revalidate_disk_zones(struct gendisk *disk,
+ void (*update_driver_data)(struct gendisk *disk))
{
struct request_queue *q = disk->queue;
- unsigned int nr_zones = __blkdev_nr_zones(q, get_capacity(disk));
- unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
- unsigned int i, rep_nr_zones = 0, z = 0, nrz;
- struct blk_zone *zones = NULL;
- sector_t sector = 0;
- int ret = 0;
+ struct blk_revalidate_zone_args args = {
+ .disk = disk,
+ };
+ unsigned int noio_flag;
+ int ret;
+
+ if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+ return -EIO;
+ if (WARN_ON_ONCE(!queue_is_mq(q)))
+ return -EIO;
/*
- * BIO based queues do not use a scheduler so only q->nr_zones
- * needs to be updated so that the sysfs exposed value is correct.
+ * Ensure that all memory allocations in this context are done as if
+ * GFP_NOIO was specified.
*/
- if (!queue_is_mq(q)) {
- q->nr_zones = nr_zones;
- return 0;
- }
-
- if (!blk_queue_is_zoned(q) || !nr_zones) {
- nr_zones = 0;
- goto update;
- }
-
- /* Allocate bitmaps */
- ret = -ENOMEM;
- seq_zones_wlock = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!seq_zones_wlock)
- goto out;
- seq_zones_bitmap = blk_alloc_zone_bitmap(q->node, nr_zones);
- if (!seq_zones_bitmap)
- goto out;
-
- /* Get zone information and initialize seq_zones_bitmap */
- rep_nr_zones = nr_zones;
- zones = blk_alloc_zones(q->node, &rep_nr_zones);
- if (!zones)
- goto out;
-
- while (z < nr_zones) {
- nrz = min(nr_zones - z, rep_nr_zones);
- ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
- if (ret)
- goto out;
- if (!nrz)
- break;
- for (i = 0; i < nrz; i++) {
- if (zones[i].type != BLK_ZONE_TYPE_CONVENTIONAL)
- set_bit(z, seq_zones_bitmap);
- z++;
- }
- sector += nrz * blk_queue_zone_sectors(q);
- }
+ noio_flag = memalloc_noio_save();
+ ret = disk->fops->report_zones(disk, 0, UINT_MAX,
+ blk_revalidate_zone_cb, &args);
+ memalloc_noio_restore(noio_flag);
- if (WARN_ON(z != nr_zones)) {
- ret = -EIO;
- goto out;
- }
-
-update:
/*
- * Install the new bitmaps, making sure the queue is stopped and
- * all I/Os are completed (i.e. a scheduler is not referencing the
- * bitmaps).
+ * Install the new bitmaps and update nr_zones only once the queue is
+ * stopped and all I/Os are completed (i.e. a scheduler is not
+ * referencing the bitmaps).
*/
blk_mq_freeze_queue(q);
- q->nr_zones = nr_zones;
- swap(q->seq_zones_wlock, seq_zones_wlock);
- swap(q->seq_zones_bitmap, seq_zones_bitmap);
- blk_mq_unfreeze_queue(q);
-
-out:
- free_pages((unsigned long)zones,
- get_order(rep_nr_zones * sizeof(struct blk_zone)));
- kfree(seq_zones_wlock);
- kfree(seq_zones_bitmap);
-
- if (ret) {
+ if (ret >= 0) {
+ blk_queue_chunk_sectors(q, args.zone_sectors);
+ q->nr_zones = args.nr_zones;
+ swap(q->seq_zones_wlock, args.seq_zones_wlock);
+ swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+ if (update_driver_data)
+ update_driver_data(disk);
+ ret = 0;
+ } else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- blk_mq_freeze_queue(q);
blk_queue_free_zone_bitmaps(q);
- blk_mq_unfreeze_queue(q);
}
+ blk_mq_unfreeze_queue(q);
+ kfree(args.seq_zones_wlock);
+ kfree(args.conv_zones_bitmap);
return ret;
}
EXPORT_SYMBOL_GPL(blk_revalidate_disk_zones);
-
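
A hedged sketch of how a blk-mq zoned driver might use the revised interface, with an update_driver_data callback that refreshes driver state while the queue is frozen; the demo_dev structure is illustrative:

#include <linux/blkdev.h>

struct demo_dev {
	struct gendisk *disk;
	unsigned int nr_zones;
};

/* Runs with the request queue frozen, after all zones have been checked. */
static void demo_update_driver_data(struct gendisk *disk)
{
	struct demo_dev *dev = disk->private_data;

	dev->nr_zones = blkdev_nr_zones(disk);
}

static int demo_revalidate_zones(struct demo_dev *dev)
{
	return blk_revalidate_disk_zones(dev->disk, demo_update_driver_data);
}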
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..b5d1f0fc6547 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -4,7 +4,10 @@
#include <linux/idr.h>
#include <linux/blk-mq.h>
+#include <linux/part_stat.h>
+#include <linux/blk-crypto.h>
#include <xen/xen.h>
+#include "blk-crypto-internal.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
@@ -16,9 +19,9 @@ extern struct dentry *blk_debugfs_root;
#endif
struct blk_flush_queue {
- unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
+ blk_status_t rq_status;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
@@ -29,6 +32,7 @@ struct blk_flush_queue {
* at the same time
*/
struct request *orig_rq;
+ struct lock_class_key key;
spinlock_t mq_flush_lock;
};
@@ -47,25 +51,18 @@ static inline void __blk_get_queue(struct request_queue *q)
kobject_get(&q->kobj);
}
-struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
- int node, int cmd_size, gfp_t flags);
+static inline bool
+is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->fq->flush_rq == req;
+}
+
+struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
+ gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
- struct bio *bio);
void blk_freeze_queue(struct request_queue *q);
-static inline void blk_queue_enter_live(struct request_queue *q)
-{
- /*
- * Given that running in generic_make_request() context
- * guarantees that a live reference against q_usage_counter has
- * been established, further references under that same context
- * need not check that the queue has been frozen (marked dead).
- */
- percpu_ref_get(&q->q_usage_counter);
-}
-
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
{
@@ -101,9 +98,22 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset);
}
+static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
+ unsigned int nr_segs)
+{
+ rq->nr_phys_segments = nr_segs;
+ rq->__data_len = bio->bi_iter.bi_size;
+ rq->bio = rq->biotail = bio;
+ rq->ioprio = bio_prio(bio);
+
+ if (bio->bi_disk)
+ rq->rq_disk = bio->bi_disk;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
+void bio_integrity_free(struct bio *bio);
static inline bool bio_integrity_endio(struct bio *bio)
{
if (bio_integrity(bio))
@@ -130,6 +140,9 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
+
+void blk_integrity_add(struct gendisk *);
+void blk_integrity_del(struct gendisk *);
#else /* CONFIG_BLK_DEV_INTEGRITY */
static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio *next)
@@ -149,22 +162,30 @@ static inline bool bio_integrity_endio(struct bio *bio)
{
return true;
}
+static inline void bio_integrity_free(struct bio *bio)
+{
+}
+static inline void blk_integrity_add(struct gendisk *disk)
+{
+}
+static inline void blk_integrity_del(struct gendisk *disk)
+{
+}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- struct request **same_queue_rq);
+ unsigned int nr_segs, struct request **same_queue_rq);
-void blk_account_io_start(struct request *req, bool new_io);
-void blk_account_io_completion(struct request *req, unsigned int bytes);
+void blk_account_io_start(struct request *req);
void blk_account_io_done(struct request *req, u64 now);
/*
@@ -174,22 +195,35 @@ void blk_account_io_done(struct request *req, u64 now);
void blk_insert_flush(struct request *rq);
-int elevator_init_mq(struct request_queue *q);
+void elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
-int elv_register_queue(struct request_queue *q);
+int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
{
+ lockdep_assert_held(&q->sysfs_lock);
+
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
}
struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
+ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
+ char *buf);
+ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
+ char *buf);
+ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
+ char *buf);
+ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
+ char *buf);
+ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
+ const char *buf, size_t count);
+
#ifdef CONFIG_FAIL_IO_TIMEOUT
int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
@@ -202,15 +236,17 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
}
#endif
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio);
-int ll_front_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio);
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs);
+int ll_back_merge_fn(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
+int ll_front_merge_fn(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
struct request *attempt_back_merge(struct request_queue *q, struct request *rq);
struct request *attempt_front_merge(struct request_queue *q, struct request *rq);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
-void blk_recalc_rq_segments(struct request *rq);
+unsigned int blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
@@ -221,14 +257,11 @@ int blk_dev_init(void);
* Contribute to IO statistics IFF:
*
* a) it's attached to a gendisk, and
- * b) the queue had IO stats enabled when this request was started, and
- * c) it's a file system request
+ * b) the queue had IO stats enabled when this request was started
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return rq->rq_disk &&
- (rq->rq_flags & RQF_IO_STAT) &&
- !blk_rq_is_passthrough(rq);
+ return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
}
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
@@ -259,36 +292,14 @@ void ioc_clear_queue(struct request_queue *q);
int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-/**
- * create_io_context - try to create task->io_context
- * @gfp_mask: allocation mask
- * @node: allocation node
- *
- * If %current->io_context is %NULL, allocate a new io_context and install
- * it. Returns the current %current->io_context which may be %NULL if
- * allocation failed.
- *
- * Note that this function can't be called with IRQ disabled because
- * task_lock which protects %current->io_context is IRQ-unsafe.
- */
-static inline struct io_context *create_io_context(gfp_t gfp_mask, int node)
-{
- WARN_ON_ONCE(irqs_disabled());
- if (unlikely(!current->io_context))
- create_task_io_context(current, gfp_mask, node);
- return current->io_context;
-}
-
/*
* Internal throttling interface
*/
#ifdef CONFIG_BLK_DEV_THROTTLING
-extern void blk_throtl_drain(struct request_queue *q);
extern int blk_throtl_init(struct request_queue *q);
extern void blk_throtl_exit(struct request_queue *q);
extern void blk_throtl_register_queue(struct request_queue *q);
#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline void blk_throtl_drain(struct request_queue *q) { }
static inline int blk_throtl_init(struct request_queue *q) { return 0; }
static inline void blk_throtl_exit(struct request_queue *q) { }
static inline void blk_throtl_register_queue(struct request_queue *q) { }
@@ -331,4 +342,102 @@ void blk_queue_free_zone_bitmaps(struct request_queue *q);
static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
#endif
+struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
+
+int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
+void blk_free_devt(dev_t devt);
+void blk_invalidate_devt(dev_t devt);
+char *disk_name(struct gendisk *hd, int partno, char *buf);
+#define ADDPART_FLAG_NONE 0
+#define ADDPART_FLAG_RAID 1
+#define ADDPART_FLAG_WHOLEDISK 2
+void delete_partition(struct gendisk *disk, struct hd_struct *part);
+int bdev_add_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
+int bdev_del_partition(struct block_device *bdev, int partno);
+int bdev_resize_partition(struct block_device *bdev, int partno,
+ sector_t start, sector_t length);
+int disk_expand_part_tbl(struct gendisk *disk, int target);
+int hd_ref_init(struct hd_struct *part);
+
+/* no need to get/put refcount of part0 */
+static inline int hd_struct_try_get(struct hd_struct *part)
+{
+ if (part->partno)
+ return percpu_ref_tryget_live(&part->ref);
+ return 1;
+}
+
+static inline void hd_struct_put(struct hd_struct *part)
+{
+ if (part->partno)
+ percpu_ref_put(&part->ref);
+}
+
+static inline void hd_free_part(struct hd_struct *part)
+{
+ free_percpu(part->dkstats);
+ kfree(part->info);
+ percpu_ref_exit(&part->ref);
+}
+
+/*
+ * Any access of part->nr_sects that is not protected by the partition's
+ * bd_mutex or the gendisk's bdev bd_mutex should be done using this
+ * accessor function.
+ *
+ * Code written along the lines of i_size_read() and i_size_write().
+ * The CONFIG_PREEMPTION case optimizes for a UP kernel with preemption
+ * enabled.
+ */
+static inline sector_t part_nr_sects_read(struct hd_struct *part)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+ sector_t nr_sects;
+ unsigned seq;
+ do {
+ seq = read_seqcount_begin(&part->nr_sects_seq);
+ nr_sects = part->nr_sects;
+ } while (read_seqcount_retry(&part->nr_sects_seq, seq));
+ return nr_sects;
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
+ sector_t nr_sects;
+
+ preempt_disable();
+ nr_sects = part->nr_sects;
+ preempt_enable();
+ return nr_sects;
+#else
+ return part->nr_sects;
+#endif
+}
+
+/*
+ * Should be called with the partition's mutex (typically bd_mutex) held to
+ * provide mutual exclusion among writers; otherwise the seqcount might be
+ * left in the wrong state, leaving readers spinning forever.
+ */
+static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
+{
+#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
+ preempt_disable();
+ write_seqcount_begin(&part->nr_sects_seq);
+ part->nr_sects = size;
+ write_seqcount_end(&part->nr_sects_seq);
+ preempt_enable();
+#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
+ preempt_disable();
+ part->nr_sects = size;
+ preempt_enable();
+#else
+ part->nr_sects = size;
+#endif
+}
+
+struct request_queue *__blk_alloc_queue(int node_id);
+
+int bio_add_hw_page(struct request_queue *q, struct bio *bio,
+ struct page *page, unsigned int len, unsigned int offset,
+ unsigned int max_sectors, bool *same_page);
+
#endif /* BLK_INTERNAL_H */
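A note on the nr_sects helpers added above: they follow the i_size_read()/i_size_write() pattern, so readers never take a lock while 32-bit kernels are still protected against torn reads. A minimal usage sketch follows; the surrounding function is hypothetical and only the two helpers and bd_mutex are taken from this patch.

/* Illustrative only: read and update a partition size safely. */
static void example_resize_partition(struct block_device *bdev,
				     struct hd_struct *part,
				     sector_t new_sects)
{
	/* Lockless read; the helper hides the 32-bit seqcount/preemption
	 * handling shown above. */
	sector_t old_sects = part_nr_sects_read(part);

	/* Writers must serialize (typically on bd_mutex) so the seqcount
	 * is never left in a write-in-progress state. */
	mutex_lock(&bdev->bd_mutex);
	part_nr_sects_write(part, new_sects);
	mutex_unlock(&bdev->bd_mutex);

	pr_debug("partition resized: %llu -> %llu sectors\n",
		 (unsigned long long)old_sects,
		 (unsigned long long)new_sects);
}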
diff --git a/block/bounce.c b/block/bounce.c
index f8ed677a1bf7..c3aaed070124 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -267,6 +267,8 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
break;
}
+ bio_crypt_clone(bio, bio_src, gfp_mask);
+
if (bio_integrity(bio_src)) {
int ret;
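The single added call in bounce_clone_bio() propagates the source bio's inline-encryption context to the bounce clone. It can be made unconditionally because the wrapper is cheap for bios without a context; a rough sketch of its shape, recalled from the blk-crypto headers of this series rather than quoted from this diff (details may differ):

/* Sketch: cloning the crypt context only costs anything when the
 * source bio actually carries one. */
static inline void bio_crypt_clone(struct bio *dst, struct bio *src,
				   gfp_t gfp_mask)
{
	if (bio_has_crypt_ctx(src))
		__bio_crypt_clone(dst, src, gfp_mask);
}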
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 785dd58947f1..6cbb7926534c 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct bsg_set *bset =
container_of(q->tag_set, struct bsg_set, tag_set);
+ blk_status_t sts = BLK_STS_IOERR;
int ret;
blk_mq_start_request(req);
@@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_IOERR;
if (!bsg_prepare_job(dev, req))
- return BLK_STS_IOERR;
+ goto out;
ret = bset->job_fn(blk_mq_rq_to_pdu(req));
- if (ret)
- return BLK_STS_IOERR;
+ if (!ret)
+ sts = BLK_STS_OK;
+out:
put_device(dev);
- return BLK_STS_OK;
+ return sts;
}
/* called right after the request is allocated for the request_queue */
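The bsg_queue_rq() change above fixes a device reference leak: the early returns used to skip put_device() when bsg_prepare_job() or the job function failed. The new code uses the usual single-exit idiom; a generic sketch of that idiom (example_prepare() and example_dispatch() are made-up helpers, not functions from this file):

/* Illustrative: take a reference, then guarantee it is dropped on
 * every path by funnelling errors through one exit label. */
static blk_status_t example_queue_rq(struct device *dev, struct request *req)
{
	blk_status_t sts = BLK_STS_IOERR;

	if (!get_device(dev))
		return BLK_STS_IOERR;	/* nothing acquired yet */

	if (!example_prepare(req))
		goto out;		/* reference must still be dropped */

	if (example_dispatch(req) == 0)
		sts = BLK_STS_OK;
out:
	put_device(dev);		/* runs on success and failure alike */
	return sts;
}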
diff --git a/block/bsg.c b/block/bsg.c
index 833c44b3d458..d7bae94b64d9 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -382,6 +382,7 @@ static const struct file_operations bsg_fops = {
.open = bsg_open,
.release = bsg_release,
.unlocked_ioctl = bsg_ioctl,
+ .compat_ioctl = compat_ptr_ioctl,
.owner = THIS_MODULE,
.llseek = default_llseek,
};
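The bsg character device now handles 32-bit ioctls through the generic compat_ptr_ioctl() helper, which works because every bsg_ioctl() command takes a pointer argument. Roughly, the helper (recalled from fs/ioctl.c, not part of this diff) just converts the compat pointer and calls the native handler:

/* Sketch of the generic helper wired up as .compat_ioctl above. */
long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
	if (!file->f_op->unlocked_ioctl)
		return -ENOIOCTLCMD;

	return file->f_op->unlocked_ioctl(file, cmd,
					  (unsigned long)compat_ptr(arg));
}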
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
deleted file mode 100644
index 6ca015f92766..000000000000
--- a/block/compat_ioctl.c
+++ /dev/null
@@ -1,411 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-#include <linux/blkdev.h>
-#include <linux/blkpg.h>
-#include <linux/blktrace_api.h>
-#include <linux/cdrom.h>
-#include <linux/compat.h>
-#include <linux/elevator.h>
-#include <linux/hdreg.h>
-#include <linux/slab.h>
-#include <linux/syscalls.h>
-#include <linux/types.h>
-#include <linux/uaccess.h>
-
-static int compat_put_ushort(unsigned long arg, unsigned short val)
-{
- return put_user(val, (unsigned short __user *)compat_ptr(arg));
-}
-
-static int compat_put_int(unsigned long arg, int val)
-{
- return put_user(val, (compat_int_t __user *)compat_ptr(arg));
-}
-
-static int compat_put_uint(unsigned long arg, unsigned int val)
-{
- return put_user(val, (compat_uint_t __user *)compat_ptr(arg));
-}
-
-static int compat_put_long(unsigned long arg, long val)
-{
- return put_user(val, (compat_long_t __user *)compat_ptr(arg));
-}
-
-static int compat_put_ulong(unsigned long arg, compat_ulong_t val)
-{
- return put_user(val, (compat_ulong_t __user *)compat_ptr(arg));
-}
-
-static int compat_put_u64(unsigned long arg, u64 val)
-{
- return put_user(val, (compat_u64 __user *)compat_ptr(arg));
-}
-
-struct compat_hd_geometry {
- unsigned char heads;
- unsigned char sectors;
- unsigned short cylinders;
- u32 start;
-};
-
-static int compat_hdio_getgeo(struct gendisk *disk, struct block_device *bdev,
- struct compat_hd_geometry __user *ugeo)
-{
- struct hd_geometry geo;
- int ret;
-
- if (!ugeo)
- return -EINVAL;
- if (!disk->fops->getgeo)
- return -ENOTTY;
-
- memset(&geo, 0, sizeof(geo));
- /*
- * We need to set the startsect first, the driver may
- * want to override it.
- */
- geo.start = get_start_sect(bdev);
- ret = disk->fops->getgeo(bdev, &geo);
- if (ret)
- return ret;
-
- ret = copy_to_user(ugeo, &geo, 4);
- ret |= put_user(geo.start, &ugeo->start);
- if (ret)
- ret = -EFAULT;
-
- return ret;
-}
-
-static int compat_hdio_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- unsigned long __user *p;
- int error;
-
- p = compat_alloc_user_space(sizeof(unsigned long));
- error = __blkdev_driver_ioctl(bdev, mode,
- cmd, (unsigned long)p);
- if (error == 0) {
- unsigned int __user *uvp = compat_ptr(arg);
- unsigned long v;
- if (get_user(v, p) || put_user(v, uvp))
- error = -EFAULT;
- }
- return error;
-}
-
-struct compat_cdrom_read_audio {
- union cdrom_addr addr;
- u8 addr_format;
- compat_int_t nframes;
- compat_caddr_t buf;
-};
-
-struct compat_cdrom_generic_command {
- unsigned char cmd[CDROM_PACKET_SIZE];
- compat_caddr_t buffer;
- compat_uint_t buflen;
- compat_int_t stat;
- compat_caddr_t sense;
- unsigned char data_direction;
- compat_int_t quiet;
- compat_int_t timeout;
- compat_caddr_t reserved[1];
-};
-
-static int compat_cdrom_read_audio(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct cdrom_read_audio __user *cdread_audio;
- struct compat_cdrom_read_audio __user *cdread_audio32;
- __u32 data;
- void __user *datap;
-
- cdread_audio = compat_alloc_user_space(sizeof(*cdread_audio));
- cdread_audio32 = compat_ptr(arg);
-
- if (copy_in_user(&cdread_audio->addr,
- &cdread_audio32->addr,
- (sizeof(*cdread_audio32) -
- sizeof(compat_caddr_t))))
- return -EFAULT;
-
- if (get_user(data, &cdread_audio32->buf))
- return -EFAULT;
- datap = compat_ptr(data);
- if (put_user(datap, &cdread_audio->buf))
- return -EFAULT;
-
- return __blkdev_driver_ioctl(bdev, mode, cmd,
- (unsigned long)cdread_audio);
-}
-
-static int compat_cdrom_generic_command(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
-{
- struct cdrom_generic_command __user *cgc;
- struct compat_cdrom_generic_command __user *cgc32;
- u32 data;
- unsigned char dir;
- int itmp;
-
- cgc = compat_alloc_user_space(sizeof(*cgc));
- cgc32 = compat_ptr(arg);
-
- if (copy_in_user(&cgc->cmd, &cgc32->cmd, sizeof(cgc->cmd)) ||
- get_user(data, &cgc32->buffer) ||
- put_user(compat_ptr(data), &cgc->buffer) ||
- copy_in_user(&cgc->buflen, &cgc32->buflen,
- (sizeof(unsigned int) + sizeof(int))) ||
- get_user(data, &cgc32->sense) ||
- put_user(compat_ptr(data), &cgc->sense) ||
- get_user(dir, &cgc32->data_direction) ||
- put_user(dir, &cgc->data_direction) ||
- get_user(itmp, &cgc32->quiet) ||
- put_user(itmp, &cgc->quiet) ||
- get_user(itmp, &cgc32->timeout) ||
- put_user(itmp, &cgc->timeout) ||
- get_user(data, &cgc32->reserved[0]) ||
- put_user(compat_ptr(data), &cgc->reserved[0]))
- return -EFAULT;
-
- return __blkdev_driver_ioctl(bdev, mode, cmd, (unsigned long)cgc);
-}
-
-struct compat_blkpg_ioctl_arg {
- compat_int_t op;
- compat_int_t flags;
- compat_int_t datalen;
- compat_caddr_t data;
-};
-
-static int compat_blkpg_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, struct compat_blkpg_ioctl_arg __user *ua32)
-{
- struct blkpg_ioctl_arg __user *a = compat_alloc_user_space(sizeof(*a));
- compat_caddr_t udata;
- compat_int_t n;
- int err;
-
- err = get_user(n, &ua32->op);
- err |= put_user(n, &a->op);
- err |= get_user(n, &ua32->flags);
- err |= put_user(n, &a->flags);
- err |= get_user(n, &ua32->datalen);
- err |= put_user(n, &a->datalen);
- err |= get_user(udata, &ua32->data);
- err |= put_user(compat_ptr(udata), &a->data);
- if (err)
- return err;
-
- return blkdev_ioctl(bdev, mode, cmd, (unsigned long)a);
-}
-
-#define BLKBSZGET_32 _IOR(0x12, 112, int)
-#define BLKBSZSET_32 _IOW(0x12, 113, int)
-#define BLKGETSIZE64_32 _IOR(0x12, 114, int)
-
-static int compat_blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg)
-{
- switch (cmd) {
- case HDIO_GET_UNMASKINTR:
- case HDIO_GET_MULTCOUNT:
- case HDIO_GET_KEEPSETTINGS:
- case HDIO_GET_32BIT:
- case HDIO_GET_NOWERR:
- case HDIO_GET_DMA:
- case HDIO_GET_NICE:
- case HDIO_GET_WCACHE:
- case HDIO_GET_ACOUSTIC:
- case HDIO_GET_ADDRESS:
- case HDIO_GET_BUSSTATE:
- return compat_hdio_ioctl(bdev, mode, cmd, arg);
- case CDROMREADAUDIO:
- return compat_cdrom_read_audio(bdev, mode, cmd, arg);
- case CDROM_SEND_PACKET:
- return compat_cdrom_generic_command(bdev, mode, cmd, arg);
-
- /*
- * No handler required for the ones below, we just need to
- * convert arg to a 64 bit pointer.
- */
- case BLKSECTSET:
- /*
- * 0x03 -- HD/IDE ioctl's used by hdparm and friends.
- * Some need translations, these do not.
- */
- case HDIO_GET_IDENTITY:
- case HDIO_DRIVE_TASK:
- case HDIO_DRIVE_CMD:
- /* 0x330 is reserved -- it used to be HDIO_GETGEO_BIG */
- case 0x330:
- /* CDROM stuff */
- case CDROMPAUSE:
- case CDROMRESUME:
- case CDROMPLAYMSF:
- case CDROMPLAYTRKIND:
- case CDROMREADTOCHDR:
- case CDROMREADTOCENTRY:
- case CDROMSTOP:
- case CDROMSTART:
- case CDROMEJECT:
- case CDROMVOLCTRL:
- case CDROMSUBCHNL:
- case CDROMMULTISESSION:
- case CDROM_GET_MCN:
- case CDROMRESET:
- case CDROMVOLREAD:
- case CDROMSEEK:
- case CDROMPLAYBLK:
- case CDROMCLOSETRAY:
- case CDROM_DISC_STATUS:
- case CDROM_CHANGER_NSLOTS:
- case CDROM_GET_CAPABILITY:
- /* Ignore cdrom.h about these next 5 ioctls, they absolutely do
- * not take a struct cdrom_read, instead they take a struct cdrom_msf
- * which is compatible.
- */
- case CDROMREADMODE2:
- case CDROMREADMODE1:
- case CDROMREADRAW:
- case CDROMREADCOOKED:
- case CDROMREADALL:
- /* DVD ioctls */
- case DVD_READ_STRUCT:
- case DVD_WRITE_STRUCT:
- case DVD_AUTH:
- arg = (unsigned long)compat_ptr(arg);
- /* These intepret arg as an unsigned long, not as a pointer,
- * so we must not do compat_ptr() conversion. */
- case HDIO_SET_MULTCOUNT:
- case HDIO_SET_UNMASKINTR:
- case HDIO_SET_KEEPSETTINGS:
- case HDIO_SET_32BIT:
- case HDIO_SET_NOWERR:
- case HDIO_SET_DMA:
- case HDIO_SET_PIO_MODE:
- case HDIO_SET_NICE:
- case HDIO_SET_WCACHE:
- case HDIO_SET_ACOUSTIC:
- case HDIO_SET_BUSSTATE:
- case HDIO_SET_ADDRESS:
- case CDROMEJECT_SW:
- case CDROM_SET_OPTIONS:
- case CDROM_CLEAR_OPTIONS:
- case CDROM_SELECT_SPEED:
- case CDROM_SELECT_DISC:
- case CDROM_MEDIA_CHANGED:
- case CDROM_DRIVE_STATUS:
- case CDROM_LOCKDOOR:
- case CDROM_DEBUG:
- break;
- default:
- /* unknown ioctl number */
- return -ENOIOCTLCMD;
- }
-
- return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
-}
-
-/* Most of the generic ioctls are handled in the normal fallback path.
- This assumes the blkdev's low level compat_ioctl always returns
- ENOIOCTLCMD for unknown ioctls. */
-long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
-{
- int ret = -ENOIOCTLCMD;
- struct inode *inode = file->f_mapping->host;
- struct block_device *bdev = inode->i_bdev;
- struct gendisk *disk = bdev->bd_disk;
- fmode_t mode = file->f_mode;
- loff_t size;
- unsigned int max_sectors;
-
- /*
- * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
- * to updated it before every ioctl.
- */
- if (file->f_flags & O_NDELAY)
- mode |= FMODE_NDELAY;
- else
- mode &= ~FMODE_NDELAY;
-
- switch (cmd) {
- case HDIO_GETGEO:
- return compat_hdio_getgeo(disk, bdev, compat_ptr(arg));
- case BLKPBSZGET:
- return compat_put_uint(arg, bdev_physical_block_size(bdev));
- case BLKIOMIN:
- return compat_put_uint(arg, bdev_io_min(bdev));
- case BLKIOOPT:
- return compat_put_uint(arg, bdev_io_opt(bdev));
- case BLKALIGNOFF:
- return compat_put_int(arg, bdev_alignment_offset(bdev));
- case BLKDISCARDZEROES:
- return compat_put_uint(arg, 0);
- case BLKFLSBUF:
- case BLKROSET:
- case BLKDISCARD:
- case BLKSECDISCARD:
- case BLKZEROOUT:
- /*
- * the ones below are implemented in blkdev_locked_ioctl,
- * but we call blkdev_ioctl, which gets the lock for us
- */
- case BLKRRPART:
- return blkdev_ioctl(bdev, mode, cmd,
- (unsigned long)compat_ptr(arg));
- case BLKBSZSET_32:
- return blkdev_ioctl(bdev, mode, BLKBSZSET,
- (unsigned long)compat_ptr(arg));
- case BLKPG:
- return compat_blkpg_ioctl(bdev, mode, cmd, compat_ptr(arg));
- case BLKRAGET:
- case BLKFRAGET:
- if (!arg)
- return -EINVAL;
- return compat_put_long(arg,
- (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
- case BLKROGET: /* compatible */
- return compat_put_int(arg, bdev_read_only(bdev) != 0);
- case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
- return compat_put_int(arg, block_size(bdev));
- case BLKSSZGET: /* get block device hardware sector size */
- return compat_put_int(arg, bdev_logical_block_size(bdev));
- case BLKSECTGET:
- max_sectors = min_t(unsigned int, USHRT_MAX,
- queue_max_sectors(bdev_get_queue(bdev)));
- return compat_put_ushort(arg, max_sectors);
- case BLKROTATIONAL:
- return compat_put_ushort(arg,
- !blk_queue_nonrot(bdev_get_queue(bdev)));
- case BLKRASET: /* compatible, but no compat_ptr (!) */
- case BLKFRASET:
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
- bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
- return 0;
- case BLKGETSIZE:
- size = i_size_read(bdev->bd_inode);
- if ((size >> 9) > ~0UL)
- return -EFBIG;
- return compat_put_ulong(arg, size >> 9);
-
- case BLKGETSIZE64_32:
- return compat_put_u64(arg, i_size_read(bdev->bd_inode));
-
- case BLKTRACESETUP32:
- case BLKTRACESTART: /* compatible */
- case BLKTRACESTOP: /* compatible */
- case BLKTRACETEARDOWN: /* compatible */
- ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
- return ret;
- default:
- if (disk->fops->compat_ioctl)
- ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
- if (ret == -ENOIOCTLCMD)
- ret = compat_blkdev_driver_ioctl(bdev, mode, cmd, arg);
- return ret;
- }
-}
diff --git a/block/elevator.c b/block/elevator.c
index 2f17d66d0e61..4eab3d70e880 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static bool elevator_match(const struct elevator_type *e, const char *name)
+static inline bool elv_support_features(unsigned int elv_features,
+ unsigned int required_features)
{
+ return (required_features & elv_features) == required_features;
+}
+
+/**
+ * elevator_match - Test an elevator name and features
+ * @e: Scheduler to test
+ * @name: Elevator name to test
+ * @required_features: Features that the elevator must provide
+ *
+ * Return true if the elevator @e name matches @name and if @e provides all of
+ * the features specified by @required_features.
+ */
+static bool elevator_match(const struct elevator_type *e, const char *name,
+ unsigned int required_features)
+{
+ if (!elv_support_features(e->elevator_features, required_features))
+ return false;
if (!strcmp(e->elevator_name, name))
return true;
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
@@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
return false;
}
-/*
- * Return scheduler with name 'name'
+/**
+ * elevator_find - Find an elevator
+ * @name: Name of the elevator to find
+ * @required_features: Features that the elevator must provide
+ *
+ * Return the first registered scheduler with name @name that supports all of
+ * the features in @required_features, or NULL if no such elevator is found.
*/
-static struct elevator_type *elevator_find(const char *name)
+static struct elevator_type *elevator_find(const char *name,
+ unsigned int required_features)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
- if (elevator_match(e, name))
+ if (elevator_match(e, name, required_features))
return e;
}
@@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->required_elevator_features);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->required_elevator_features);
}
if (e && !try_module_get(e->elevator_owner))
@@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q,
return e;
}
-static char chosen_elevator[ELV_NAME_MAX];
-
-static int __init elevator_setup(char *str)
-{
- /*
- * Be backwards-compatible with previous kernels, so users
- * won't get the wrong elevator.
- */
- strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
- return 1;
-}
-
-__setup("elevator=", elevator_setup);
-
static struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q)
+/*
+ * elv_register_queue is called from either blk_register_queue or
+ * elevator_switch; an elevator switch cannot happen in either of those
+ * paths, so it is safe not to hold q->sysfs_lock.
+ */
+int elv_register_queue(struct request_queue *q, bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
-
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -487,21 +500,27 @@ int elv_register_queue(struct request_queue *q)
attr++;
}
}
- kobject_uevent(&e->kobj, KOBJ_ADD);
+ if (uevent)
+ kobject_uevent(&e->kobj, KOBJ_ADD);
+
e->registered = 1;
}
return error;
}
+/*
+ * elv_unregister_queue is called from either blk_unregister_queue or
+ * elevator_switch; an elevator switch cannot happen in either of those
+ * paths, so it is safe not to hold q->sysfs_lock.
+ */
void elv_unregister_queue(struct request_queue *q)
{
- lockdep_assert_held(&q->sysfs_lock);
-
if (q) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+
e->registered = 0;
/* Re-enable throttling in case elevator disabled it */
wbt_enable_default(q);
@@ -526,7 +545,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name)) {
+ if (elevator_find(e->elevator_name, 0)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -569,6 +588,7 @@ int elevator_switch_mq(struct request_queue *q,
if (q->elevator) {
if (q->elevator->registered)
elv_unregister_queue(q);
+
ioc_clear_queue(q);
elevator_exit(q, q->elevator);
}
@@ -578,7 +598,7 @@ int elevator_switch_mq(struct request_queue *q,
goto out;
if (new_e) {
- ret = elv_register_queue(q);
+ ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q, q->elevator);
goto out;
@@ -594,37 +614,90 @@ out:
return ret;
}
+static inline bool elv_support_iosched(struct request_queue *q)
+{
+ if (!q->mq_ops ||
+ (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
+ return false;
+ return true;
+}
+
/*
- * For blk-mq devices, we default to using mq-deadline, if available, for single
- * queue devices. If deadline isn't available OR we have multiple queues,
- * default to "none".
+ * For single queue devices, default to using mq-deadline. If we have multiple
+ * queues or mq-deadline is not available, default to "none".
*/
-int elevator_init_mq(struct request_queue *q)
+static struct elevator_type *elevator_get_default(struct request_queue *q)
+{
+ if (q->nr_hw_queues != 1)
+ return NULL;
+
+ return elevator_get(q, "mq-deadline", false);
+}
+
+/*
+ * Get the first elevator providing the features required by the request queue.
+ * Default to "none" if no matching elevator is found.
+ */
+static struct elevator_type *elevator_get_by_features(struct request_queue *q)
+{
+ struct elevator_type *e, *found = NULL;
+
+ spin_lock(&elv_list_lock);
+
+ list_for_each_entry(e, &elv_list, list) {
+ if (elv_support_features(e->elevator_features,
+ q->required_elevator_features)) {
+ found = e;
+ break;
+ }
+ }
+
+ if (found && !try_module_get(found->elevator_owner))
+ found = NULL;
+
+ spin_unlock(&elv_list_lock);
+ return found;
+}
+
+/*
+ * For a device queue that has no required features, use the default elevator
+ * settings. Otherwise, use the first elevator available matching the required
+ * features. If no suitable elevator is found or if the chosen elevator
+ * initialization fails, fall back to the "none" elevator (no elevator).
+ */
+void elevator_init_mq(struct request_queue *q)
{
struct elevator_type *e;
- int err = 0;
+ int err;
- if (q->nr_hw_queues != 1)
- return 0;
+ if (!elv_support_iosched(q))
+ return;
+
+ WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
- /*
- * q->sysfs_lock must be held to provide mutual exclusion between
- * elevator_switch() and here.
- */
- mutex_lock(&q->sysfs_lock);
if (unlikely(q->elevator))
- goto out_unlock;
+ return;
- e = elevator_get(q, "mq-deadline", false);
+ if (!q->required_elevator_features)
+ e = elevator_get_default(q);
+ else
+ e = elevator_get_by_features(q);
if (!e)
- goto out_unlock;
+ return;
+
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
err = blk_mq_init_sched(q, e);
- if (err)
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
+ if (err) {
+ pr_warn("\"%s\" elevator initialization failed, "
+ "falling back to \"none\"\n", e->elevator_name);
elevator_put(e);
-out_unlock:
- mutex_unlock(&q->sysfs_lock);
- return err;
+ }
}
@@ -660,7 +733,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
struct elevator_type *e;
/* Make sure queue is not in the middle of being removed */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return -ENOENT;
/*
@@ -677,7 +750,8 @@ static int __elevator_change(struct request_queue *q, const char *name)
if (!e)
return -EINVAL;
- if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
+ if (q->elevator &&
+ elevator_match(q->elevator->type, elevator_name, 0)) {
elevator_put(e);
return 0;
}
@@ -685,13 +759,6 @@ static int __elevator_change(struct request_queue *q, const char *name)
return elevator_switch(q, e);
}
-static inline bool elv_support_iosched(struct request_queue *q)
-{
- if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
- return false;
- return true;
-}
-
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
size_t count)
{
@@ -724,11 +791,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
- if (elv && elevator_match(elv, __e->elevator_name)) {
+ if (elv && elevator_match(elv, __e->elevator_name, 0)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
continue;
}
- if (elv_support_iosched(q))
+ if (elv_support_iosched(q) &&
+ elevator_match(__e, __e->elevator_name,
+ q->required_elevator_features))
len += sprintf(name+len, "%s ", __e->elevator_name);
}
spin_unlock(&elv_list_lock);
@@ -763,3 +832,12 @@ struct request *elv_rb_latter_request(struct request_queue *q,
return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);
+
+static int __init elevator_setup(char *str)
+{
+ pr_warn("Kernel parameter elevator= does not have any effect anymore.\n"
+ "Please use sysfs to set IO scheduler for individual devices.\n");
+ return 1;
+}
+
+__setup("elevator=", elevator_setup);
diff --git a/block/genhd.c b/block/genhd.c
index 24654e1d83e6..1a7659327664 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -4,6 +4,7 @@
*/
#include <linux/module.h>
+#include <linux/ctype.h>
#include <linux/fs.h>
#include <linux/genhd.h>
#include <linux/kdev_t.h>
@@ -26,7 +27,7 @@
#include "blk.h"
static DEFINE_MUTEX(block_class_lock);
-struct kobject *block_depr;
+static struct kobject *block_depr;
/* for extended dynamic devt allocation, currently only one major is used */
#define NR_EXT_DEVT (1 << MINORBITS)
@@ -46,36 +47,77 @@ static void disk_add_events(struct gendisk *disk);
static void disk_del_events(struct gendisk *disk);
static void disk_release_events(struct gendisk *disk);
-void part_inc_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+/*
+ * Set the disk capacity and emit a resize uevent if the size actually
+ * changed and neither the old nor the new size is zero.
+ */
+void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
+ bool revalidate)
{
- if (queue_is_mq(q))
- return;
+ sector_t capacity = get_capacity(disk);
- part_stat_local_inc(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_inc(&part_to_disk(part)->part0, in_flight[rw]);
+ set_capacity(disk, size);
+
+ if (revalidate)
+ revalidate_disk(disk);
+
+ if (capacity != size && capacity != 0 && size != 0) {
+ char *envp[] = { "RESIZE=1", NULL };
+
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+ }
}
-void part_dec_in_flight(struct request_queue *q, struct hd_struct *part, int rw)
+EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);
+
+/*
+ * Format the device name of the indicated disk into the supplied buffer and
+ * return a pointer to that same buffer for convenience.
+ */
+char *disk_name(struct gendisk *hd, int partno, char *buf)
{
- if (queue_is_mq(q))
- return;
+ if (!partno)
+ snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
+ else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
+ snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
+ else
+ snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
- part_stat_local_dec(part, in_flight[rw]);
- if (part->partno)
- part_stat_local_dec(&part_to_disk(part)->part0, in_flight[rw]);
+ return buf;
+}
+
+const char *bdevname(struct block_device *bdev, char *buf)
+{
+ return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
}
+EXPORT_SYMBOL(bdevname);
-unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
+static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
{
int cpu;
- unsigned int inflight;
- if (queue_is_mq(q)) {
- return blk_mq_in_flight(q, part);
+ memset(stat, 0, sizeof(struct disk_stats));
+ for_each_possible_cpu(cpu) {
+ struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu);
+ int group;
+
+ for (group = 0; group < NR_STAT_GROUPS; group++) {
+ stat->nsecs[group] += ptr->nsecs[group];
+ stat->sectors[group] += ptr->sectors[group];
+ stat->ios[group] += ptr->ios[group];
+ stat->merges[group] += ptr->merges[group];
+ }
+
+ stat->io_ticks += ptr->io_ticks;
}
+}
+
+static unsigned int part_in_flight(struct request_queue *q,
+ struct hd_struct *part)
+{
+ unsigned int inflight = 0;
+ int cpu;
- inflight = 0;
for_each_possible_cpu(cpu) {
inflight += part_stat_local_read_cpu(part, in_flight[0], cpu) +
part_stat_local_read_cpu(part, in_flight[1], cpu);
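set_capacity_revalidate_and_notify() above gives drivers one call that updates the capacity, optionally revalidates the disk, and emits a RESIZE=1 uevent when the size really changed (both the old and the new size must be non-zero). A hypothetical caller, shown only as a usage sketch:

/* Illustrative: react to a device-reported capacity change. */
static void example_handle_resize(struct gendisk *disk, sector_t new_sectors)
{
	/* true: also revalidate the disk before user space is notified. */
	set_capacity_revalidate_and_notify(disk, new_sectors, true);
}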
@@ -86,16 +128,11 @@ unsigned int part_in_flight(struct request_queue *q, struct hd_struct *part)
return inflight;
}
-void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
+ unsigned int inflight[2])
{
int cpu;
- if (queue_is_mq(q)) {
- blk_mq_in_flight_rw(q, part, inflight);
- return;
- }
-
inflight[0] = 0;
inflight[1] = 0;
for_each_possible_cpu(cpu) {
@@ -143,7 +180,6 @@ struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
return part;
}
-EXPORT_SYMBOL_GPL(disk_get_part);
/**
* disk_part_iter_init - initialize partition iterator
@@ -271,11 +307,13 @@ static inline int sector_in_part(struct hd_struct *part, sector_t sector)
* primarily used for stats accounting.
*
* CONTEXT:
- * RCU read locked. The returned partition pointer is valid only
- * while preemption is disabled.
+ * RCU read locked. The returned partition pointer is always valid
+ * because its refcount is grabbed, except for part0, whose lifetime
+ * is the same as the disk's.
*
* RETURNS:
* Found partition on success, part0 is returned if no partition matches
+ * or the matched partition is being deleted.
*/
struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
{
@@ -283,23 +321,70 @@ struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
struct hd_struct *part;
int i;
+ rcu_read_lock();
ptbl = rcu_dereference(disk->part_tbl);
part = rcu_dereference(ptbl->last_lookup);
- if (part && sector_in_part(part, sector))
- return part;
+ if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
+ goto out_unlock;
for (i = 1; i < ptbl->len; i++) {
part = rcu_dereference(ptbl->part[i]);
if (part && sector_in_part(part, sector)) {
+ /*
+ * only a live partition can be cached for lookup, so
+ * use-after-free on a cached partition that is being
+ * deleted can be avoided
+ */
+ if (!hd_struct_try_get(part))
+ break;
rcu_assign_pointer(ptbl->last_lookup, part);
- return part;
+ goto out_unlock;
+ }
+ }
+
+ part = &disk->part0;
+out_unlock:
+ rcu_read_unlock();
+ return part;
+}
+
+/**
+ * disk_has_partitions
+ * @disk: gendisk of interest
+ *
+ * Walk through the partition table and check if a valid partition exists.
+ *
+ * CONTEXT:
+ * Don't care.
+ *
+ * RETURNS:
+ * True if the gendisk has at least one valid non-zero size partition.
+ * Otherwise false.
+ */
+bool disk_has_partitions(struct gendisk *disk)
+{
+ struct disk_part_tbl *ptbl;
+ int i;
+ bool ret = false;
+
+ rcu_read_lock();
+ ptbl = rcu_dereference(disk->part_tbl);
+
+ /* Iterate partitions skipping the whole device at index 0 */
+ for (i = 1; i < ptbl->len; i++) {
+ if (rcu_dereference(ptbl->part[i])) {
+ ret = true;
+ break;
}
}
- return &disk->part0;
+
+ rcu_read_unlock();
+
+ return ret;
}
-EXPORT_SYMBOL_GPL(disk_map_sector_rcu);
+EXPORT_SYMBOL_GPL(disk_has_partitions);
/*
* Can be deleted altogether. Later.
@@ -695,6 +780,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
dev_t devt;
int retval;
+ /*
+ * The disk queue should now be all set with enough information about
+ * the device for the elevator code to pick an adequate default
+ * elevator if one is needed, that is, for devices requesting queue
+ * registration.
+ */
+ if (register_queue)
+ elevator_init_mq(disk->queue);
+
/* minors == 0 indicates to use ext devt from part0 and should
* be accompanied with EXT_DEVT flag. Make sure all
* parameters make sense.
@@ -723,13 +817,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
disk->flags |= GENHD_FL_NO_PART_SCAN;
} else {
+ struct backing_dev_info *bdi = disk->queue->backing_dev_info;
+ struct device *dev = disk_to_dev(disk);
int ret;
/* Register BDI before referencing it from bdev */
- disk_to_dev(disk)->devt = devt;
- ret = bdi_register_owner(disk->queue->backing_dev_info,
- disk_to_dev(disk));
+ dev->devt = devt;
+ ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
WARN_ON(ret);
+ bdi_set_owner(bdi, dev);
blk_register_region(disk_devt(disk), disk->minors, NULL,
exact_match, exact_lock, disk);
}
@@ -761,6 +857,25 @@ void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
}
EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+static void invalidate_partition(struct gendisk *disk, int partno)
+{
+ struct block_device *bdev;
+
+ bdev = bdget_disk(disk, partno);
+ if (!bdev)
+ return;
+
+ fsync_bdev(bdev);
+ __invalidate_device(bdev, true);
+
+ /*
+ * Unhash the bdev inode for this device so that it gets evicted as soon
+ * as the last inode reference is dropped.
+ */
+ remove_inode_hash(bdev->bd_inode);
+ bdput(bdev);
+}
+
void del_gendisk(struct gendisk *disk)
{
struct disk_part_iter piter;
@@ -779,13 +894,11 @@ void del_gendisk(struct gendisk *disk)
DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
while ((part = disk_part_iter_next(&piter))) {
invalidate_partition(disk, part->partno);
- bdev_unhash_inode(part_devt(part));
- delete_partition(disk, part->partno);
+ delete_partition(disk, part);
}
disk_part_iter_exit(&piter);
invalidate_partition(disk, 0);
- bdev_unhash_inode(disk_devt(disk));
set_capacity(disk, 0);
disk->flags &= ~GENHD_FL_UP;
up_write(&disk->lookup_sem);
@@ -899,7 +1012,6 @@ struct gendisk *get_gendisk(dev_t devt, int *partno)
}
return disk;
}
-EXPORT_SYMBOL(get_gendisk);