Diffstat (limited to 'block')
-rw-r--r--  block/Kconfig                |   17
-rw-r--r--  block/Kconfig.iosched        |    9
-rw-r--r--  block/Makefile               |    1
-rw-r--r--  block/bfq-cgroup.c           |  368
-rw-r--r--  block/bfq-iosched.c          | 1082
-rw-r--r--  block/bfq-iosched.h          |   51
-rw-r--r--  block/bfq-wf2q.c             |    2
-rw-r--r--  block/bio-integrity.c        |    8
-rw-r--r--  block/bio.c                  |  215
-rw-r--r--  block/blk-cgroup.c           |  361
-rw-r--r--  block/blk-core.c             |  172
-rw-r--r--  block/blk-flush.c            |   15
-rw-r--r--  block/blk-integrity.c        |   13
-rw-r--r--  block/blk-iocost.c           | 2472
-rw-r--r--  block/blk-iolatency.c        |   62
-rw-r--r--  block/blk-map.c              |   12
-rw-r--r--  block/blk-merge.c            |  261
-rw-r--r--  block/blk-mq-cpumap.c        |   29
-rw-r--r--  block/blk-mq-debugfs.c       |   42
-rw-r--r--  block/blk-mq-sched.c         |   33
-rw-r--r--  block/blk-mq-sched.h         |   19
-rw-r--r--  block/blk-mq-sysfs.c         |   38
-rw-r--r--  block/blk-mq-tag.c           |   40
-rw-r--r--  block/blk-mq.c               |  147
-rw-r--r--  block/blk-mq.h               |   39
-rw-r--r--  block/blk-pm.c               |   12
-rw-r--r--  block/blk-rq-qos.c           |   39
-rw-r--r--  block/blk-rq-qos.h           |   45
-rw-r--r--  block/blk-settings.c         |   44
-rw-r--r--  block/blk-sysfs.c            |   43
-rw-r--r--  block/blk-throttle.c         |   18
-rw-r--r--  block/blk-wbt.c              |   26
-rw-r--r--  block/blk-wbt.h              |    4
-rw-r--r--  block/blk-zoned.c            |  108
-rw-r--r--  block/blk.h                  |   50
-rw-r--r--  block/bsg-lib.c              |   10
-rw-r--r--  block/compat_ioctl.c         |   13
-rw-r--r--  block/elevator.c             |  187
-rw-r--r--  block/genhd.c                |   16
-rw-r--r--  block/ioprio.c               |    2
-rw-r--r--  block/kyber-iosched.c        |    6
-rw-r--r--  block/mq-deadline.c          |   27
-rw-r--r--  block/opal_proto.h           |   19
-rw-r--r--  block/partitions/Kconfig     |    2
-rw-r--r--  block/partitions/cmdline.c   |    2
-rw-r--r--  block/sed-opal.c             |  250
-rw-r--r--  block/t10-pi.c               |  169
47 files changed, 5216 insertions, 1384 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 2466dcc3ef1d..41c0917ce622 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -26,6 +26,9 @@ menuconfig BLOCK
if BLOCK
+config BLK_RQ_ALLOC_TIME
+ bool
+
config BLK_SCSI_REQUEST
bool
@@ -89,7 +92,7 @@ config BLK_DEV_THROTTLING
one needs to mount and use blkio cgroup controller for creating
cgroups and specifying per device IO rate policies.
- See Documentation/cgroup-v1/blkio-controller.txt for more information.
+ See Documentation/admin-guide/cgroup-v1/blkio-controller.rst for more information.
config BLK_DEV_THROTTLING_LOW
bool "Block throttling .low limit interface support (EXPERIMENTAL)"
@@ -110,7 +113,7 @@ config BLK_CMDLINE_PARSER
which don't otherwise have any standardized method for listing the
partitions on a block device.
- See Documentation/block/cmdline-partition.txt for more information.
+ See Documentation/block/cmdline-partition.rst for more information.
config BLK_WBT
bool "Enable support for block device writeback throttling"
@@ -132,6 +135,16 @@ config BLK_CGROUP_IOLATENCY
Note, this is an experimental interface and could be changed someday.
+config BLK_CGROUP_IOCOST
+ bool "Enable support for cost model based cgroup IO controller"
+ depends on BLK_CGROUP=y
+ select BLK_RQ_ALLOC_TIME
+ ---help---
+ Enabling this option enables the .weight interface for cost
+ model based proportional IO control. The IO controller
+ distributes IO capacity between different groups based on
+ their share of the overall weight distribution.
+
config BLK_WBT_MQ
bool "Multiqueue writeback throttling"
default y
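
[Editorial note: the BLK_CGROUP_IOCOST help text above refers to the cgroup-v2 weight interface. The following is a hedged usage sketch, not part of this patch; the cgroup path and weight values are illustrative. A group's weight can be set from user space by writing to its io.weight file, either as a default weight for the group or, per the help text's cost-model controller, as a per-device "MAJ:MIN WEIGHT" entry.]

	/* Illustrative only: path and values below are hypothetical. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/sys/fs/cgroup/mygroup/io.weight", "w");

		if (!f)
			return 1;
		/* "default N" sets the group's default weight; a per-device
		 * weight could instead be written as "MAJ:MIN N". */
		fprintf(f, "default 200\n");
		fclose(f);
		return 0;
	}
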
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 4626b88b2d5a..b89310a022ad 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -26,7 +26,7 @@ config IOSCHED_BFQ
regardless of the device parameters and with any workload. It
also guarantees a low latency to interactive and soft
real-time applications. Details in
- Documentation/block/bfq-iosched.txt
+ Documentation/block/bfq-iosched.rst
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
@@ -36,6 +36,13 @@ config BFQ_GROUP_IOSCHED
Enable hierarchical scheduling in BFQ, using the blkio
(cgroups-v1) or io (cgroups-v2) controller.
+config BFQ_CGROUP_DEBUG
+ bool "BFQ IO controller debugging"
+ depends on BFQ_GROUP_IOSCHED
+ ---help---
+ Enable some debugging help. Currently it exports additional stat
+ files in a cgroup which can be useful for debugging.
+
endmenu
endif
diff --git a/block/Makefile b/block/Makefile
index eee1b4ceecf9..9ef57ace90d4 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
+obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index b3796a40a61a..86a607cf19a1 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -15,7 +15,83 @@
#include "bfq-iosched.h"
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+static int bfq_stat_init(struct bfq_stat *stat, gfp_t gfp)
+{
+ int ret;
+
+ ret = percpu_counter_init(&stat->cpu_cnt, 0, gfp);
+ if (ret)
+ return ret;
+
+ atomic64_set(&stat->aux_cnt, 0);
+ return 0;
+}
+
+static void bfq_stat_exit(struct bfq_stat *stat)
+{
+ percpu_counter_destroy(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_add - add a value to a bfq_stat
+ * @stat: target bfq_stat
+ * @val: value to add
+ *
+ * Add @val to @stat. The caller must ensure that IRQ on the same CPU
+ * don't re-enter this function for the same counter.
+ */
+static inline void bfq_stat_add(struct bfq_stat *stat, uint64_t val)
+{
+ percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+}
+
+/**
+ * bfq_stat_read - read the current value of a bfq_stat
+ * @stat: bfq_stat to read
+ */
+static inline uint64_t bfq_stat_read(struct bfq_stat *stat)
+{
+ return percpu_counter_sum_positive(&stat->cpu_cnt);
+}
+
+/**
+ * bfq_stat_reset - reset a bfq_stat
+ * @stat: bfq_stat to reset
+ */
+static inline void bfq_stat_reset(struct bfq_stat *stat)
+{
+ percpu_counter_set(&stat->cpu_cnt, 0);
+ atomic64_set(&stat->aux_cnt, 0);
+}
+
+/**
+ * bfq_stat_add_aux - add a bfq_stat into another's aux count
+ * @to: the destination bfq_stat
+ * @from: the source
+ *
+ * Add @from's count including the aux one to @to's aux count.
+ */
+static inline void bfq_stat_add_aux(struct bfq_stat *to,
+ struct bfq_stat *from)
+{
+ atomic64_add(bfq_stat_read(from) + atomic64_read(&from->aux_cnt),
+ &to->aux_cnt);
+}
+
+/**
+ * blkg_prfill_stat - prfill callback for bfq_stat
+ * @sf: seq_file to print to
+ * @pd: policy private data of interest
+ * @off: offset to the bfq_stat in @pd
+ *
+ * prfill callback for printing a bfq_stat.
+ */
+static u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ return __blkg_prfill_u64(sf, pd, bfq_stat_read((void *)pd + off));
+}
/* bfqg stats flags */
enum bfqg_stats_flags {
@@ -53,7 +129,7 @@ static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_group_wait_time)
- blkg_stat_add(&stats->group_wait_time,
+ bfq_stat_add(&stats->group_wait_time,
now - stats->start_group_wait_time);
bfqg_stats_clear_waiting(stats);
}
@@ -82,14 +158,14 @@ static void bfqg_stats_end_empty_time(struct bfqg_stats *stats)
now = ktime_get_ns();
if (now > stats->start_empty_time)
- blkg_stat_add(&stats->empty_time,
+ bfq_stat_add(&stats->empty_time,
now - stats->start_empty_time);
bfqg_stats_clear_empty(stats);
}
void bfqg_stats_update_dequeue(struct bfq_group *bfqg)
{
- blkg_stat_add(&bfqg->stats.dequeue, 1);
+ bfq_stat_add(&bfqg->stats.dequeue, 1);
}
void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg)
@@ -119,7 +195,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg)
u64 now = ktime_get_ns();
if (now > stats->start_idle_time)
- blkg_stat_add(&stats->idle_time,
+ bfq_stat_add(&stats->idle_time,
now - stats->start_idle_time);
bfqg_stats_clear_idling(stats);
}
@@ -137,9 +213,9 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
{
struct bfqg_stats *stats = &bfqg->stats;
- blkg_stat_add(&stats->avg_queue_size_sum,
+ bfq_stat_add(&stats->avg_queue_size_sum,
blkg_rwstat_total(&stats->queued));
- blkg_stat_add(&stats->avg_queue_size_samples, 1);
+ bfq_stat_add(&stats->avg_queue_size_samples, 1);
bfqg_stats_update_group_wait_time(stats);
}
@@ -176,7 +252,7 @@ void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
io_start_time_ns - start_time_ns);
}
-#else /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#else /* CONFIG_BFQ_CGROUP_DEBUG */
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
unsigned int op) { }
@@ -190,7 +266,7 @@ void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -274,18 +350,18 @@ void bfqg_and_blkg_put(struct bfq_group *bfqg)
/* @stats = 0 */
static void bfqg_stats_reset(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_reset(&stats->merged);
blkg_rwstat_reset(&stats->service_time);
blkg_rwstat_reset(&stats->wait_time);
- blkg_stat_reset(&stats->time);
- blkg_stat_reset(&stats->avg_queue_size_sum);
- blkg_stat_reset(&stats->avg_queue_size_samples);
- blkg_stat_reset(&stats->dequeue);
- blkg_stat_reset(&stats->group_wait_time);
- blkg_stat_reset(&stats->idle_time);
- blkg_stat_reset(&stats->empty_time);
+ bfq_stat_reset(&stats->time);
+ bfq_stat_reset(&stats->avg_queue_size_sum);
+ bfq_stat_reset(&stats->avg_queue_size_samples);
+ bfq_stat_reset(&stats->dequeue);
+ bfq_stat_reset(&stats->group_wait_time);
+ bfq_stat_reset(&stats->idle_time);
+ bfq_stat_reset(&stats->empty_time);
#endif
}
@@ -295,19 +371,19 @@ static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from)
if (!to || !from)
return;
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* queued stats shouldn't be cleared */
blkg_rwstat_add_aux(&to->merged, &from->merged);
blkg_rwstat_add_aux(&to->service_time, &from->service_time);
blkg_rwstat_add_aux(&to->wait_time, &from->wait_time);
- blkg_stat_add_aux(&from->time, &from->time);
- blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
- blkg_stat_add_aux(&to->avg_queue_size_samples,
+ bfq_stat_add_aux(&from->time, &from->time);
+ bfq_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum);
+ bfq_stat_add_aux(&to->avg_queue_size_samples,
&from->avg_queue_size_samples);
- blkg_stat_add_aux(&to->dequeue, &from->dequeue);
- blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
- blkg_stat_add_aux(&to->idle_time, &from->idle_time);
- blkg_stat_add_aux(&to->empty_time, &from->empty_time);
+ bfq_stat_add_aux(&to->dequeue, &from->dequeue);
+ bfq_stat_add_aux(&to->group_wait_time, &from->group_wait_time);
+ bfq_stat_add_aux(&to->idle_time, &from->idle_time);
+ bfq_stat_add_aux(&to->empty_time, &from->empty_time);
#endif
}
@@ -355,35 +431,35 @@ void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg)
static void bfqg_stats_exit(struct bfqg_stats *stats)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
blkg_rwstat_exit(&stats->merged);
blkg_rwstat_exit(&stats->service_time);
blkg_rwstat_exit(&stats->wait_time);
blkg_rwstat_exit(&stats->queued);
- blkg_stat_exit(&stats->time);
- blkg_stat_exit(&stats->avg_queue_size_sum);
- blkg_stat_exit(&stats->avg_queue_size_samples);
- blkg_stat_exit(&stats->dequeue);
- blkg_stat_exit(&stats->group_wait_time);
- blkg_stat_exit(&stats->idle_time);
- blkg_stat_exit(&stats->empty_time);
+ bfq_stat_exit(&stats->time);
+ bfq_stat_exit(&stats->avg_queue_size_sum);
+ bfq_stat_exit(&stats->avg_queue_size_samples);
+ bfq_stat_exit(&stats->dequeue);
+ bfq_stat_exit(&stats->group_wait_time);
+ bfq_stat_exit(&stats->idle_time);
+ bfq_stat_exit(&stats->empty_time);
#endif
}
static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
blkg_rwstat_init(&stats->service_time, gfp) ||
blkg_rwstat_init(&stats->wait_time, gfp) ||
blkg_rwstat_init(&stats->queued, gfp) ||
- blkg_stat_init(&stats->time, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_sum, gfp) ||
- blkg_stat_init(&stats->avg_queue_size_samples, gfp) ||
- blkg_stat_init(&stats->dequeue, gfp) ||
- blkg_stat_init(&stats->group_wait_time, gfp) ||
- blkg_stat_init(&stats->idle_time, gfp) ||
- blkg_stat_init(&stats->empty_time, gfp)) {
+ bfq_stat_init(&stats->time, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_sum, gfp) ||
+ bfq_stat_init(&stats->avg_queue_size_samples, gfp) ||
+ bfq_stat_init(&stats->dequeue, gfp) ||
+ bfq_stat_init(&stats->group_wait_time, gfp) ||
+ bfq_stat_init(&stats->idle_time, gfp) ||
+ bfq_stat_init(&stats->empty_time, gfp)) {
bfqg_stats_exit(stats);
return -ENOMEM;
}
@@ -425,11 +501,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd)
kfree(cpd_to_bfqgd(cpd));
}
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
+ struct blkcg *blkcg)
{
struct bfq_group *bfqg;
- bfqg = kzalloc_node(sizeof(*bfqg), gfp, node);
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node);
if (!bfqg)
return NULL;
@@ -828,7 +905,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
-static int bfq_io_show_weight(struct seq_file *sf, void *v)
+static int bfq_io_show_weight_legacy(struct seq_file *sf, void *v)
{
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
@@ -842,6 +919,60 @@ static int bfq_io_show_weight(struct seq_file *sf, void *v)
return 0;
}
+static u64 bfqg_prfill_weight_device(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ struct bfq_group *bfqg = pd_to_bfqg(pd);
+
+ if (!bfqg->entity.dev_weight)
+ return 0;
+ return __blkg_prfill_u64(sf, pd, bfqg->entity.dev_weight);
+}
+
+static int bfq_io_show_weight(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg);
+
+ seq_printf(sf, "default %u\n", bfqgd->weight);
+ blkcg_print_blkgs(sf, blkcg, bfqg_prfill_weight_device,
+ &blkcg_policy_bfq, 0, false);
+ return 0;
+}
+
+static void bfq_group_set_weight(struct bfq_group *bfqg, u64 weight, u64 dev_weight)
+{
+ weight = dev_weight ?: weight;
+
+ bfqg->entity.dev_weight = dev_weight;
+ /*
+ * Setting the prio_changed flag of the entity
+ * to 1 with new_weight == weight would re-set
+ * the value of the weight to its ioprio mapping.
+ * Set the flag only if necessary.
+ */
+ if ((unsigned short)weight != bfqg->entity.new_weight) {
+ bfqg->entity.new_weight = (unsigned short)weight;
+ /*
+ * Make sure that the above new value has been
+ * stored in bfqg->entity.new_weight before
+ * setting the prio_changed flag. In fact,
+ * this flag may be read asynchronously (in
+ * critical sections protected by a different
+ * lock than that held here), and finding this
+ * flag set may cause the execution of the code
+ * for updating parameters whose value may
+ * depend also on bfqg->entity.new_weight (in
+ * __bfq_entity_update_weight_prio).
+ * This barrier makes sure that the new value
+ * of bfqg->entity.new_weight is correctly
+ * seen in that code.
+ */
+ smp_wmb();
+ bfqg->entity.prio_changed = 1;
+ }
+}
+
static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
struct cftype *cftype,
u64 val)
@@ -860,56 +991,73 @@ static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css,
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct bfq_group *bfqg = blkg_to_bfqg(blkg);
- if (!bfqg)
- continue;
- /*
- * Setting the prio_changed flag of the entity
- * to 1 with new_weight == weight would re-set
- * the value of the weight to its ioprio mapping.
- * Set the flag only if necessary.
- */
- if ((unsigned short)val != bfqg->entity.new_weight) {
- bfqg->entity.new_weight = (unsigned short)val;
- /*
- * Make sure that the above new value has been
- * stored in bfqg->entity.new_weight before
- * setting the prio_changed flag. In fact,
- * this flag may be read asynchronously (in
- * critical sections protected by a different
- * lock than that held here), and finding this
- * flag set may cause the execution of the code
- * for updating parameters whose value may
- * depend also on bfqg->entity.new_weight (in
- * __bfq_entity_update_weight_prio).
- * This barrier makes sure that the new value
- * of bfqg->entity.new_weight is correctly
- * seen in that code.
- */
- smp_wmb();
- bfqg->entity.prio_changed = 1;
- }
+ if (bfqg)
+ bfq_group_set_weight(bfqg, val, 0);
}
spin_unlock_irq(&blkcg->lock);
return ret;
}
-static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
- char *buf, size_t nbytes,
- loff_t off)
+static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
{
- u64 weight;
- /* First unsigned long found in the file is used */
- int ret = kstrtoull(strim(buf), 0, &weight);
+ int ret;
+ struct blkg_conf_ctx ctx;
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct bfq_group *bfqg;
+ u64 v;
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx);
if (ret)
return ret;
- ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight);
+ if (sscanf(ctx.body, "%llu", &v) == 1) {
+ /* require "default" on dfl */
+ ret = -ERANGE;
+ if (!v)
+ goto out;
+ } else if (!strcmp(strim(ctx.body), "default")) {
+ v = 0;
+ } else {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ bfqg = blkg_to_bfqg(ctx.blkg);
+
+ ret = -ERANGE;
+ if (!v || (v >= BFQ_MIN_WEIGHT && v <= BFQ_MAX_WEIGHT)) {
+ bfq_group_set_weight(bfqg, bfqg->entity.weight, v);
+ ret = 0;
+ }
+out:
+ blkg_conf_finish(&ctx);
return ret ?: nbytes;
}
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of,
+ char *buf, size_t nbytes,
+ loff_t off)
+{
+ char *endp;
+ int ret;
+ u64 v;
+
+ buf = strim(buf);
+
+ /* "WEIGHT" or "default WEIGHT" sets the default weight */
+ v = simple_strtoull(buf, &endp, 0);
+ if (*endp == '\0' || sscanf(buf, "default %llu", &v) == 1) {
+ ret = bfq_io_set_weight_legacy(of_css(of), NULL, v);
+ return ret ?: nbytes;
+ }
+
+ return bfq_io_set_device_weight(of, buf, nbytes, off);
+}
+
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static int bfqg_print_stat(struct seq_file *sf, void *v)
{
blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat,
@@ -927,17 +1075,34 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v)
static u64 bfqg_prfill_stat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq, off);
+ struct blkcg_gq *blkg = pd_to_blkg(pd);
+ struct blkcg_gq *pos_blkg;
+ struct cgroup_subsys_state *pos_css;
+ u64 sum = 0;
+
+ lockdep_assert_held(&blkg->q->queue_lock);
+
+ rcu_read_lock();
+ blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
+ struct bfq_stat *stat;
+
+ if (!pos_blkg->online)
+ continue;
+
+ stat = (void *)blkg_to_pd(pos_blkg, &blkcg_policy_bfq) + off;
+ sum += bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
+ }
+ rcu_read_unlock();
+
return __blkg_prfill_u64(sf, pd, sum);
}
static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd),
- &blkcg_policy_bfq,
- off);
+ struct blkg_rwstat_sample sum;
+
+ blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off, &sum);
return __blkg_prfill_rwstat(sf, pd, &sum);
}
@@ -975,12 +1140,13 @@ static int bfqg_print_stat_sectors(struct seq_file *sf, void *v)
static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]);
+ struct blkg_rwstat_sample tmp;
- return __blkg_prfill_u64(sf, pd, sum >> 9);
+ blkg_rwstat_recursive_sum(pd->blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes), &tmp);
+
+ return __blkg_prfill_u64(sf, pd,
+ (tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE]) >> 9);
}
static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v)
@@ -995,11 +1161,11 @@ static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
struct bfq_group *bfqg = pd_to_bfqg(pd);
- u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples);
+ u64 samples = bfq_stat_read(&bfqg->stats.avg_queue_size_samples);
u64 v = 0;
if (samples) {
- v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum);
+ v = bfq_stat_read(&bfqg->stats.avg_queue_size_sum);
v = div64_u64(v, samples);
}
__blkg_prfill_u64(sf, pd, v);
@@ -1014,7 +1180,7 @@ static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v)
0, false);
return 0;
}
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
@@ -1047,9 +1213,15 @@ struct cftype bfq_blkcg_legacy_files[] = {
{
.name = "bfq.weight",
.flags = CFTYPE_NOT_ON_ROOT,
- .seq_show = bfq_io_show_weight,
+ .seq_show = bfq_io_show_weight_legacy,
.write_u64 = bfq_io_set_weight_legacy,
},
+ {
+ .name = "bfq.weight_device",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = bfq_io_show_weight,
+ .write = bfq_io_set_weight,
+ },
/* statistics, covers only the tasks in the bfqg */
{
@@ -1062,7 +1234,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time",
.private = offsetof(struct bfq_group, stats.time),
@@ -1092,7 +1264,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.queued),
.seq_show = bfqg_print_rwstat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
/* the same statistics which cover the bfqg and its descendants */
{
@@ -1105,7 +1277,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = (unsigned long)&blkcg_policy_bfq,
.seq_show = blkg_print_stat_ios_recursive,
},
-#ifdef CONFIG_DEBUG_BLK_CGROUP
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
{
.name = "bfq.time_recursive",
.private = offsetof(struct bfq_group, stats.time),
@@ -1159,7 +1331,7 @@ struct cftype bfq_blkcg_legacy_files[] = {
.private = offsetof(struct bfq_group, stats.dequeue),
.seq_show = bfqg_print_stat,
},
-#endif /* CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
{ } /* terminate */
};
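
[Editorial note: for orientation, the bfq_stat helpers introduced above (bfq_stat_init/add/read/reset/exit and bfq_stat_add_aux) replace the removed blkg_stat calls with the same structure: a percpu counter for the live value plus an atomic64 aux count that accumulates across resets and hierarchy propagation. Below is a minimal sketch of the intended call pattern, assuming a caller inside bfq-cgroup.c where these static helpers and struct bfq_stat (declared in bfq-iosched.h) are visible; the demo_* names are invented for illustration and do not appear in the patch.]

	/* Hypothetical illustration of the bfq_stat usage pattern. */
	static int demo_counter_setup(struct bfq_stat *stat)
	{
		return bfq_stat_init(stat, GFP_KERNEL);	/* allocates the percpu counter */
	}

	static void demo_counter_account(struct bfq_stat *stat)
	{
		bfq_stat_add(stat, 1);			/* cheap percpu add on the hot path */
	}

	static u64 demo_counter_snapshot(struct bfq_stat *stat)
	{
		/* live percpu sum plus whatever has been folded into the aux count,
		 * as done by bfqg_prfill_stat_recursive() above */
		return bfq_stat_read(stat) + atomic64_read(&stat->aux_cnt);
	}

	static void demo_counter_teardown(struct bfq_stat *stat)
	{
		bfq_stat_exit(stat);			/* frees the percpu counter */
	}
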
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index e5db3856b194..0c6214497fcc 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -17,7 +17,7 @@
* low-latency capabilities. BFQ also supports full hierarchical
* scheduling through cgroups. Next paragraphs provide an introduction
* on BFQ inner workings. Details on BFQ benefits, usage and
- * limitations can be found in Documentation/block/bfq-iosched.txt.
+ * limitations can be found in Documentation/block/bfq-iosched.rst.
*
* BFQ is a proportional-share storage-I/O scheduling algorithm based
* on the slice-by-slice service scheme of CFQ. But BFQ assigns
@@ -157,6 +157,7 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS \
/* Expiration time of sync (0) and async (1) requests, in ns. */
@@ -1427,17 +1428,19 @@ static int bfq_min_budget(struct bfq_data *bfqd)
* mechanism may be re-designed in such a way to make it possible to
* know whether preemption is needed without needing to update service
* trees). In addition, queue preemptions almost always cause random
- * I/O, and thus loss of throughput. Because of these facts, the next
- * function adopts the following simple scheme to avoid both costly
- * operations and too frequent preemptions: it requests the expiration
- * of the in-service queue (unconditionally) only for queues that need
- * to recover a hole, or that either are weight-raised or deserve to
- * be weight-raised.
+ * I/O, which may in turn cause loss of throughput. Finally, there may
+ * even be no in-service queue when the next function is invoked (so,
+ * no queue to compare timestamps with). Because of these facts, the
+ * next function adopts the following simple scheme to avoid costly
+ * operations, too frequent preemptions and too many dependencies on
+ * the state of the scheduler: it requests the expiration of the
+ * in-service queue (unconditionally) only for queues that need to
+ * recover a hole. Then it delegates to other parts of the code the
+ * responsibility of handling the above case 2.
*/
static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
- bool arrived_in_time,
- bool wr_or_deserves_wr)
+ bool arrived_in_time)
{
struct bfq_entity *entity = &bfqq->entity;
@@ -1492,7 +1495,7 @@ static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd,
entity->budget = max_t(unsigned long, bfqq->max_budget,
bfq_serv_to_charge(bfqq->next_rq, bfqq));
bfq_clear_bfqq_non_blocking_wait_rq(bfqq);
- return wr_or_deserves_wr;
+ return false;
}
/*
@@ -1610,6 +1613,36 @@ static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd,
bfqd->bfq_wr_min_idle_time);
}
+
+/*
+ * Return true if bfqq is in a higher priority class, or has a higher
+ * weight than the in-service queue.
+ */
+static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
+ struct bfq_queue *in_serv_bfqq)
+{
+ int bfqq_weight, in_serv_weight;
+
+ if (bfqq->ioprio_class < in_serv_bfqq->ioprio_class)
+ return true;
+
+ if (in_serv_bfqq->entity.parent == bfqq->entity.parent) {
+ bfqq_weight = bfqq->entity.weight;
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ } else {
+ if (bfqq->entity.parent)
+ bfqq_weight = bfqq->entity.parent->weight;
+ else
+ bfqq_weight = bfqq->entity.weight;
+ if (in_serv_bfqq->entity.parent)
+ in_serv_weight = in_serv_bfqq->entity.parent->weight;
+ else
+ in_serv_weight = in_serv_bfqq->entity.weight;
+ }
+
+ return bfqq_weight > in_serv_weight;
+}
+
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
int old_wr_coeff,
@@ -1654,8 +1687,7 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
*/
bfqq_wants_to_preempt =
bfq_bfqq_update_budg_for_activation(bfqd, bfqq,
- arrived_in_time,
- wr_or_deserves_wr);
+ arrived_in_time);
/*
* If bfqq happened to be activated in a burst, but has been
@@ -1720,21 +1752,111 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
/*
* Expire in-service queue only if preemption may be needed
- * for guarantees. In this respect, the function
- * next_queue_may_preempt just checks a simple, necessary
- * condition, and not a sufficient condition based on
- * timestamps. In fact, for the latter condition to be
- * evaluated, timestamps would need first to be updated, and
- * this operation is quite costly (see the comments on the
- * function bfq_bfqq_update_budg_for_activation).
+ * for guarantees. In particular, we care only about two
+ * cases. The first is that bfqq has to recover a service
+ * hole, as explained in the comments on
+ * bfq_bfqq_update_budg_for_activation(), i.e., that
+ * bfqq_wants_to_preempt is true. However, if bfqq does not
+ * carry time-critical I/O, then bfqq's bandwidth is less
+ * important than that of queues that carry time-critical I/O.
+ * So, as a further constraint, we consider this case only if
+ * bfqq is at least as weight-raised, i.e., at least as time
+ * critical, as the in-service queue.
+ *
+ * The second case is that bfqq is in a higher priority class,
+ * or has a higher weight than the in-service queue. If this
+ * condition does not hold, we don't care because, even if
+ * bfqq does not start to be served immediately, the resulting
+ * delay for bfqq's I/O is however lower or much lower than
+ * the ideal completion time to be guaranteed to bfqq's I/O.
+ *
+ * In both cases, preemption is needed only if, according to
+ * the timestamps of both bfqq and of the in-service queue,
+ * bfqq actually is the next queue to serve. So, to reduce
+ * useless preemptions, the return value of
+ * next_queue_may_preempt() is considered in the next compound
+ * condition too. Yet next_queue_may_preempt() just checks a
+ * simple, necessary condition for bfqq to be the next queue
+ * to serve. In fact, to evaluate a sufficient condition, the
+ * timestamps of the in-service queue would need to be
+ * updated, and this operation is quite costly (see the
+ * comments on bfq_bfqq_update_budg_for_activation()).
*/
- if (bfqd->in_service_queue && bfqq_wants_to_preempt &&
- bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff &&
+ if (bfqd->in_service_queue &&
+ ((bfqq_wants_to_preempt &&
+ bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
next_queue_may_preempt(bfqd))
bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
false, BFQQE_PREEMPTED);
}
+static void bfq_reset_inject_limit(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ /* invalidate baseline total service time */
+ bfqq->last_serv_time_ns = 0;
+
+ /*
+ * Reset pointer in case we are waiting for
+ * some request completion.
+ */
+ bfqd->waited_rq = NULL;
+
+ /*
+ * If bfqq has a short think time, then start by setting the
+ * inject limit to 0 prudentially, because the service time of
+ * an injected I/O request may be higher than the think time
+ * of bfqq, and therefore, if one request was injected when
+ * bfqq remains empty, this injected request might delay the
+ * service of the next I/O request for bfqq significantly. In
+ * case bfqq can actually tolerate some injection, then the
+ * adaptive update will however raise the limit soon. This
+ * lucky circumstance holds exactly because bfqq has a short
+ * think time, and thus, after remaining empty, is likely to
+ * get new I/O enqueued---and then completed---before being
+ * expired. This is the very pattern that gives the
+ * limit-update algorithm the chance to measure the effect of
+ * injection on request service times, and then to update the
+ * limit accordingly.
+ *
+ * However, in the following special case, the inject limit is
+ * left to 1 even if the think time is short: bfqq's I/O is
+ * synchronized with that of some other queue, i.e., bfqq may
+ * receive new I/O only after the I/O of the other queue is
+ * completed. Keeping the inject limit to 1 allows the
+ * blocking I/O to be served while bfqq is in service. And
+ * this is very convenient both for bfqq and for overall
+ * throughput, as explained in detail in the comments in
+ * bfq_update_has_short_ttime().
+ *
+ * On the opposite end, if bfqq has a long think time, then
+ * start directly by 1, because:
+ * a) on the bright side, keeping at most one request in
+ * service in the drive is unlikely to cause any harm to the
+ * latency of bfqq's requests, as the service time of a single
+ * request is likely to be lower than the think time of bfqq;
+ * b) on the downside, after becoming empty, bfqq is likely to
+ * expire before getting its next request. With this request
+ * arrival pattern, it is very hard to sample total service
+ * times and update the inject limit accordingly (see comments
+ * on bfq_update_inject_limit()). So the limit is likely to be
+ * never, or at least seldom, updated. As a consequence, by
+ * setting the limit to 1, we avoid that no injection ever
+ * occurs with bfqq. On the downside, this proactive step
+ * further reduces chances to actually compute the baseline
+ * total service time. Thus it reduces chances to execute the
+ * limit-update algorithm and possibly raise the limit to more
+ * than 1.
+ */
+ if (bfq_bfqq_has_short_ttime(bfqq))
+ bfqq->inject_limit = 0;
+ else
+ bfqq->inject_limit = 1;
+
+ bfqq->decrease_time_jif = jiffies;
+}
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1749,77 +1871,120 @@ static void bfq_add_request(struct request *rq)
if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
/*
+ * Detect whether bfqq's I/O seems synchronized with
+ * that of some other queue, i.e., whether bfqq, after
+ * remaining empty, happens to receive new I/O only
+ * right after some I/O request of the other queue has
+ * been completed. We call waker queue the other
+ * queue, and we assume, for simplicity, that bfqq may
+ * have at most one waker queue.
+ *
+ * A remarkable throughput boost can be reached by
+ * unconditionally injecting the I/O of the waker
+ * queue, every time a new bfq_dispatch_request
+ * happens to be invoked while I/O is being plugged
+ * for bfqq. In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth
+ * and latency for bfqq. Note that these same results
+ * may be achieved with the general injection
+ * mechanism, but less effectively. For details on
+ * this aspect, see the comments on the choice of the
+ * queue for injection in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a
+ * queue Q is deemed as a waker queue for bfqq if, for
+ * two consecutive times, bfqq happens to become non
+ * empty right after a request of Q has been
+ * completed. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while
+ * on the second time, the flag
+ * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
+ * is a waker queue for bfqq. These detection steps
+ * are performed only if bfqq has a long think time,
+ * so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This
+ * last filter, plus the above two-times requirement,
+ * make false positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner
+ * throughput can be boosted by injecting I/O from the
+ * waker queue. Fortunately, detection is likely to be
+ * actually fast, for the following reasons. While
+ * blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at
+ * least equal to 1 (see the comments in
+ * bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served
+ * during the very first I/O-plugging time interval
+ * for bfqq. This triggers the first step of the
+ * detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be
+ * confirmed no later than during the next
+ * I/O-plugging interval for bfqq.
+ */
+ if (bfqd->last_completed_rq_bfqq &&
+ !bfq_bfqq_has_short_ttime(bfqq) &&
+ ktime_get_ns() - bfqd->last_completion <
+ 200 * NSEC_PER_USEC) {
+ if (bfqd->last_completed_rq_bfqq != bfqq &&
+ bfqd->last_completed_rq_bfqq !=
+ bfqq->waker_bfqq) {
+ /*
+ * First synchronization detected with
+ * a candidate waker queue, or with a
+ * different candidate waker queue
+ * from the current one.
+ */
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+
+ /*
+ * If the waker queue disappears, then
+ * bfqq->waker_bfqq must be reset. To
+ * this goal, we maintain in each
+ * waker queue a list, woken_list, of
+ * all the queues that reference the
+ * waker queue through their
+ * waker_bfqq pointer. When the waker
+ * queue exits, the waker_bfqq pointer
+ * of all the queues in the woken_list
+ * is reset.
+ *
+ * In addition, if bfqq is already in
+ * the woken_list of a waker queue,
+ * then, before being inserted into
+ * the woken_list of a new waker
+ * queue, bfqq must be removed from
+ * the woken_list of the old waker
+ * queue.
+ */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqd->last_completed_rq_bfqq->woken_list);
+
+ bfq_clear_bfqq_has_waker(bfqq);
+ } else if (bfqd->last_completed_rq_bfqq ==
+ bfqq->waker_bfqq &&
+ !bfq_bfqq_has_waker(bfqq)) {
+ /*
+ * synchronization with waker_bfqq
+ * seen for the second time
+ */
+ bfq_mark_bfqq_has_waker(bfqq);
+ }
+ }
+
+ /*
* Periodically reset inject limit, to make sure that
* the latter eventually drops in case workload
* changes, see step (3) in the comments on
* bfq_update_inject_limit().
*/
if (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
- msecs_to_jiffies(1000))) {
- /* invalidate baseline total service time */
- bfqq->last_serv_time_ns = 0;
-
- /*
- * Reset pointer in case we are waiting for
- * some request completion.
- */
- bfqd->waited_rq = NULL;
-
- /*
- * If bfqq has a short think time, then start
- * by setting the inject limit to 0
- * prudentially, because the service time of
- * an injected I/O request may be higher than
- * the think time of bfqq, and therefore, if
- * one request was injected when bfqq remains
- * empty, this injected request might delay
- * the service of the next I/O request for
- * bfqq significantly. In case bfqq can
- * actually tolerate some injection, then the
- * adaptive update will however raise the
- * limit soon. This lucky circumstance holds
- * exactly because bfqq has a short think
- * time, and thus, after remaining empty, is
- * likely to get new I/O enqueued---and then
- * completed---before being expired. This is
- * the very pattern that gives the
- * limit-update algorithm the chance to
- * measure the effect of injection on request
- * service times, and then to update the limit
- * accordingly.
- *
- * On the opposite end, if bfqq has a long
- * think time, then start directly by 1,
- * because:
- * a) on the bright side, keeping at most one
- * request in service in the drive is unlikely
- * to cause any harm to the latency of bfqq's
- * requests, as the service time of a single
- * request is likely to be lower than the
- * think time of bfqq;
- * b) on the downside, after becoming empty,
- * bfqq is likely to expire before getting its
- * next request. With this request arrival
- * pattern, it is very hard to sample total
- * service times and update the inject limit
- * accordingly (see comments on
- * bfq_update_inject_limit()). So the limit is
- * likely to be never, or at least seldom,
- * updated. As a consequence, by setting the
- * limit to 1, we avoid that no injection ever
- * occurs with bfqq. On the downside, this
- * proactive step further reduces chances to
- * actually compute the baseline total service
- * time. Thus it reduces chances to execute the
- * limit-update algorithm and possibly raise the
- * limit to more than 1.
- */
- if (bfq_bfqq_has_short_ttime(bfqq))
- bfqq->inject_limit = 0;
- else
- bfqq->inject_limit = 1;
- bfqq->decrease_time_jif = jiffies;
- }
+ msecs_to_jiffies(1000)))
+ bfq_reset_inject_limit(bfqd, bfqq);
/*
* The following conditions must hold to setup a new
@@ -1851,7 +2016,7 @@ static void bfq_add_request(struct request *rq)
(bfqq->last_serv_time_ns > 0 &&
bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
- msecs_to_jiffies(100))) {
+ msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
/*
* Start the state machine for measuring the
@@ -1860,7 +2025,21 @@ static void bfq_add_request(struct request *rq)
* be set when rq will be dispatched.
*/
bfqd->wait_dispatch = true;
- bfqd->rqs_injected = false;
+ /*
+ * If there is no I/O in service in the drive,
+ * then possible injection occurred before the
+ * arrival of rq will not affect the total
+ * service time of rq. So the injection limit
+ * must not be updated as a function of such
+ * total service time, unless new injection
+ * occurs before rq is completed. To have the
+ * injection limit updated only in the latter
+ * case, reset rqs_injected here (rqs_injected
+ * will be set in case injection is performed
+ * on bfqq before rq is completed).
+ */
+ if (bfqd->rq_in_driver == 0)
+ bfqd->rqs_injected = false;
}
}
@@ -2027,7 +2206,8 @@ static void bfq_remove_request(struct request_queue *q,
}
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ unsigned int nr_segs)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
@@ -2050,7 +2230,7 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
bfqd->bio_bfqq = NULL;
bfqd->bio_bic = bic;
- ret = blk_mq_sched_try_merge(q, bio, &free);
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
if (free)
blk_mq_free_request(free);
@@ -2085,9 +2265,14 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
struct bfq_queue *bfqq = bfq_init_rq(req);
- struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_data *bfqd;
struct request *prev, *next_rq;
+ if (!bfqq)
+ return;
+
+ bfqd = bfqq->bfqd;
+
/* Reposition request in its sort_list */
elv_rb_del(&bfqq->sort_list, req);
elv_rb_add(&bfqq->sort_list, req);
@@ -2134,6 +2319,9 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct bfq_queue *bfqq = bfq_init_rq(rq),
*next_bfqq = bfq_init_rq(next);
+ if (!bfqq)
+ return;
+
/*
* If next and rq belong to the same bfq_queue and next is older
* than rq, then reposition rq in the fifo (by substituting next
@@ -2513,6 +2701,7 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
* to enjoy weight raising if split soon.
*/
bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
bic->saved_last_wr_start_finish = jiffies;
} else {
@@ -2524,6 +2713,28 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
}
}
+
+static
+void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+{
+ /*
+ * To prevent bfqq's service guarantees from being violated,
+ * bfqq may be left busy, i.e., queued for service, even if
+ * empty (see comments in __bfq_bfqq_expire() for
+ * details). But, if no process will send requests to bfqq any
+ * longer, then there is no point in keeping bfqq queued for
+ * service. In addition, keeping bfqq queued for service, but
+ * with no process ref any longer, may have caused bfqq to be
+ * freed when dequeued from service. But this is assumed to
+ * never happen.
+ */
+ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfqq != bfqd->in_service_queue)
+ bfq_del_bfqq_busy(bfqd, bfqq, false);
+
+ bfq_put_queue(bfqq);
+}
+
static void
bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
@@ -2594,8 +2805,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
*/
new_bfqq->pid = -1;
bfqq->bic = NULL;
- /* release process reference to bfqq */
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqd, bfqq);
}
static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
@@ -3045,7 +3255,205 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
bfq_remove_request(q, rq);
}
-static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+/*
+ * There is a case where idling does not have to be performed for
+ * throughput concerns, but to preserve the throughput share of
+ * the process associated with bfqq.
+ *
+ * To introduce this case, we can note that allowing the drive
+ * to enqueue more than one request at a time, and hence
+ * delegating de facto final scheduling decisions to the
+ * drive's internal scheduler, entails loss of control on the
+ * actual request service order. In particular, the critical
+ * situation is when requests from different processes happen
+ * to be present, at the same time, in the internal queue(s)
+ * of the drive. In such a situation, the drive, by deciding
+ * the service order of the internally-queued requests, does
+ * determine also the actual throughput distribution among
+ * these processes. But the drive typically has no notion or
+ * concern about per-process throughput distribution, and
+ * makes its decisions only on a per-request basis. Therefore,
+ * the service distribution enforced by the drive's internal
+ * scheduler is likely to coincide with the desired throughput
+ * distribution only in a completely symmetric, or favorably
+ * skewed scenario where:
+ * (i-a) each of these processes must get the same throughput as
+ * the others,
+ * (i-b) in case (i-a) does not hold, it holds that the process
+ * associated with bfqq must receive a lower or equal
+ * throughput than any of the other processes;
+ * (ii) the I/O of each process has the same properties, in
+ * terms of locality (sequential or random), direction
+ * (reads or writes), request sizes, greediness
+ * (from I/O-bound to sporadic), and so on;
+
+ * In fact, in such a scenario, the drive tends to treat the requests
+ * of each process in about the same way as the requests of the
+ * others, and thus to provide each of these processes with about the
+ * same throughput. This is exactly the desired throughput
+ * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
+ * even more convenient distribution for (the process associated with)
+ * bfqq.
+ *
+ * In contrast, in any asymmetric or unfavorable scenario, device
+ * idling (I/O-dispatch plugging) is certainly needed to guarantee
+ * that bfqq receives its assigned fraction of the device throughput
+ * (see [1] for details).
+ *
+ * The problem is that idling may significantly reduce throughput with
+ * certain combinations of types of I/O and devices. An important
+ * example is sync random I/O on flash storage with command
+ * queueing. So, unless bfqq falls in cases where idling also boosts
+ * throughput, it is important to check conditions (i-a), i(-b) and
+ * (ii) accurately, so as to avoid idling when not strictly needed for
+ * service guarantees.
+ *
+ * Unfortunately, it is extremely difficult to thoroughly check
+ * condition (ii). And, in case there are active groups, it becomes
+ * very difficult to check conditions (i-a) and (i-b) too. In fact,
+ * if there are active groups, then, for conditions (i-a) or (i-b) to
+ * become false 'indirectly', it is enough that an active group
+ * contains more active processes or sub-groups than some other active
+ * group. More precisely, for conditions (i-a) or (i-b) to become
+ * false because of such a group, it is not even necessary that the
+ * group is (still) active: it is sufficient that, even if the group
+ * has become inactive, some of its descendant processes still have
+ * some request already dispatched but still waiting for
+ * completion. In fact, requests have still to be guaranteed their
+ * share of the throughput even after being dispatched. In this
+ * respect, it is easy to show that, if a group frequently becomes
+ * inactive while still having in-flight requests, and if, when this
+ * happens, the group is not considered in the calculation of whether
+ * the scenario is asymmetric, then the group may fail to be
+ * guaranteed its fair share of the throughput (basically because
+ * idling may not be performed for the descendant processes of the
+ * group, but it had to be). We address this issue with the following
+ * bi-modal behavior, implemented in the function
+ * bfq_asymmetric_scenario().
+ *
+ * If there are groups with requests waiting for completion
+ * (as commented above, some of these groups may even be
+ * already inactive), then the scenario is tagged as
+ * asymmetric, conservatively, without checking any of the
+ * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
+ * This behavior matches also the fact that groups are created
+ * exactly if controlling I/O is a primary concern (to
+ * preserve bandwidth and latency guarantees).
+ *
+ * On the opposite end, if there are no groups with requests waiting
+ * for completion, then only conditions (i-a) and (i-b) are actually
+ * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
+ * idling is not performed, regardless of whether condition (ii)
+ * holds. In other words, only if conditions (i-a) and (i-b) do not
+ * hold, then idling is allowed, and the device tends to be prevented
+ * from queueing many requests, possibly of several processes. Since
+ * there are no groups with requests waiting for completion, then, to
+ * control conditions (i-a) and (i-b) it is enough to check just
+ * whether all the queues with requests waiting for completion also
+ * have the same weight.
+ *
+ * Not checking condition (ii) evidently exposes bfqq to the
+ * risk of getting less throughput than its fair share.
+ * However, for queues with the same weight, a further
+ * mechanism, preemption, mitigates or even eliminates this
+ * problem. And it does so without consequences on overall
+ * throughput. This mechanism and its benefits are explained
+ * in the next three paragraphs.
+ *
+ * Even if a queue, say Q, is expired when it remains idle, Q
+ * can still preempt the new in-service queue if the next
+ * request of Q arrives soon (see the comments on
+ * bfq_bfqq_update_budg_for_activation). If all queues and
+ * groups have the same weight, this form of preemption,
+ * combined with the hole-recovery heuristic described in the
+ * comments on function bfq_bfqq_update_budg_for_activation,
+ * are enough to preserve a correct bandwidth distribution in
+ * the mid term, even without idling. In fact, even if not
+ * idling allows the internal queues of the device to contain
+ * many requests, and thus to reorder requests, we can rather
+ * safely assume that the internal scheduler still preserves a
+ * minimum of mid-term fairness.
+ *
+ * More precisely, this preemption-based, idleless approach
+ * provides fairness in terms of IOPS, and not sectors per
+ * second. This can be seen with a simple example. Suppose
+ * that there are two queues with the same weight, but that
+ * the first queue receives requests of 8 sectors, while the
+ * second queue receives requests of 1024 sectors. In
+ * addition, suppose that each of the two queues contains at
+ * most one request at a time, which implies that each queue
+ * always remains idle after it is served. Finally, after
+ * remaining idle, each queue receives very quickly a new
+ * request. It follows that the two queues are served
+ * alternatively, preempting each other if needed. This
+ * implies that, although both queues have the same weight,
+ * the queue with large requests receives a service that is
+ * 1024/8 times as high as the service received by the other
+ * queue.
+ *
+ * The motivation for using preemption instead of idling (for
+ * queues with the same weight) is that, by not idling,
+ * service guarantees are preserved (completely or at least in
+ * part) without minimally sacrificing throughput. And, if
+ * there is no active group, then the primary expectation for
+ * this device is probably a high throughput.
+ *
+ * We are now left only with explaining the two sub-conditions in the
+ * additional compound condition that is checked below for deciding
+ * whether the scenario is asymmetric. To explain the first
+ * sub-condition, we need to add that the function
+ * bfq_asymmetric_scenario checks the weights of only
+ * non-weight-raised queues, for efficiency reasons (see comments on
+ * bfq_weights_tree_add()). Then the fact that bfqq is weight-raised
+ * is checked explicitly here. More precisely, the compound condition
+ * below takes into account also the fact that, even if bfqq is being
+ * weight-raised, the scenario is still symmetric if all queues with
+ * requests waiting for completion happen to be
+ * weight-raised. Actually, we should be even more precise here, and
+ * differentiate between interactive weight raising and soft real-time
+ * weight raising.
+ *
+ * The second sub-condition checked in the compound condition is
+ * whether there is a fair amount of already in-flight I/O not
+ * belonging to bfqq. If so, I/O dispatching is to be plugged, for the
+ * following reason. The drive may decide to serve in-flight
+ * non-bfqq's I/O requests before bfqq's ones, thereby delaying the
+ * arrival of new I/O requests for bfqq (recall that bfqq is sync). If
+ * I/O-dispatching is not plugged, then, while bfqq remains empty, a
+ * basically uncontrolled amount of I/O from other queues may be
+ * dispatched too, possibly causing the service of bfqq's I/O to be
+ * delayed even longer in the drive. This problem gets more and more
+ * serious as the speed and the queue depth of the drive grow,
+ * because, as these two quantities grow, the probability to find no
+ * queue busy but many requests in flight grows too. By contrast,
+ * plugging I/O dispatching minimizes the delay induced by already
+ * in-flight I/O, and enables bfqq to recover the bandwidth it may
+ * lose because of this delay.
+ *
+ * As a side note, it is worth considering that the above
+ * device-idling countermeasures may however fail in the following
+ * unlucky scenario: if I/O-dispatch plugging is (correctly) disabled
+ * in a time period during which all symmetry sub-conditions hold, and
+ * therefore the device is allowed to enqueue many requests, but at
+ * some later point in time some sub-condition stops to hold, then it
+ * may become impossible to make requests be served in the desired
+ * order until all the requests already queued in the device have been
+ * served. The last sub-condition commented above somewhat mitigates
+ * this problem for weight-raised queues.
+ */
+static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq)
+{
+ return (bfqq->wr_coeff > 1 &&
+ (bfqd->wr_busy_queues <
+ bfq_tot_busy_queues(bfqd) ||
+ bfqd->rq_in_driver >=
+ bfqq->dispatched + 4)) ||
+ bfq_asymmetric_scenario(bfqd, bfqq);
+}
+
+static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ enum bfqq_expiration reason)
{
/*
* If this bfqq is shared between multiple processes, check
@@ -3056,7 +3464,22 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq))
bfq_mark_bfqq_split_coop(bfqq);
- if (RB_EMPTY_ROOT(&bfqq->sort_list)) {
+ /*
+ * Consider queues with a higher finish virtual time than
+ * bfqq. If idling_needed_for_service_guarantees(bfqq) returns
+ * true, then bfqq's bandwidth would be violated if an
+ * uncontrolled amount of I/O from these queues were
+ * dispatched while bfqq is waiting for its new I/O to
+ * arrive. This is exactly what may happen if this is a forced
+ * expiration caused by a preemption attempt, and if bfqq is
+ * not re-scheduled. To prevent this from happening, re-queue
+ * bfqq if it needs I/O-dispatch plugging, even if it is
+ * empty. By doing so, bfqq is granted to be served before the
+ * above queues (provided that bfqq is of course eligible).
+ */
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ !(reason == BFQQE_PREEMPTED &&
+ idling_needed_for_service_guarantees(bfqd, bfqq))) {
if (bfqq->dispatched == 0)
/*
* Overloading budget_timeout field to store
@@ -3073,7 +3496,8 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* Resort priority tree of potential close cooperators.
* See comments on bfq_pos_tree_add_move() for the unlikely().
*/
- if (unlikely(!bfqd->nonrot_with_queueing))
+ if (unlikely(!bfqd->nonrot_with_queueing &&
+ !RB_EMPTY_ROOT(&bfqq->sort_list)))
bfq_pos_tree_add_move(bfqd, bfqq);
}
@@ -3574,7 +3998,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* reason.
*/
__bfq_bfqq_recalc_budget(bfqd, bfqq, reason);
- if (__bfq_bfqq_expire(bfqd, bfqq))
+ if (__bfq_bfqq_expire(bfqd, bfqq, reason))
/* bfqq is gone, no more actions on it */
return;
@@ -3721,184 +4145,6 @@ static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
}
/*
- * There is a case where idling does not have to be performed for
- * throughput concerns, but to preserve the throughput share of
- * the process associated with bfqq.
- *
- * To introduce this case, we can note that allowing the drive
- * to enqueue more than one request at a time, and hence
- * delegating de facto final scheduling decisions to the
- * drive's internal scheduler, entails loss of control on the
- * actual request service order. In particular, the critical
- * situation is when requests from different processes happen
- * to be present, at the same time, in the internal queue(s)
- * of the drive. In such a situation, the drive, by deciding
- * the service order of the internally-queued requests, does
- * determine also the actual throughput distribution among
- * these processes. But the drive typically has no notion or
- * concern about per-process throughput distribution, and
- * makes its decisions only on a per-request basis. Therefore,
- * the service distribution enforced by the drive's internal
- * scheduler is likely to coincide with the desired throughput
- * distribution only in a completely symmetric, or favorably
- * skewed scenario where:
- * (i-a) each of these processes must get the same throughput as
- * the others,
- * (i-b) in case (i-a) does not hold, it holds that the process
- * associated with bfqq must receive a lower or equal
- * throughput than any of the other processes;
- * (ii) the I/O of each process has the same properties, in
- * terms of locality (sequential or random), direction
- * (reads or writes), request sizes, greediness
- * (from I/O-bound to sporadic), and so on;
-
- * In fact, in such a scenario, the drive tends to treat the requests
- * of each process in about the same way as the requests of the
- * others, and thus to provide each of these processes with about the
- * same throughput. This is exactly the desired throughput
- * distribution if (i-a) holds, or, if (i-b) holds instead, this is an
- * even more convenient distribution for (the process associated with)
- * bfqq.
- *
- * In contrast, in any asymmetric or unfavorable scenario, device
- * idling (I/O-dispatch plugging) is certainly needed to guarantee
- * that bfqq receives its assigned fraction of the device throughput
- * (see [1] for details).
- *
- * The problem is that idling may significantly reduce throughput with
- * certain combinations of types of I/O and devices. An important
- * example is sync random I/O on flash storage with command
- * queueing. So, unless bfqq falls in cases where idling also boosts
- * throughput, it is important to check conditions (i-a), i(-b) and
- * (ii) accurately, so as to avoid idling when not strictly needed for
- * service guarantees.
- *
- * Unfortunately, it is extremely difficult to thoroughly check
- * condition (ii). And, in case there are active groups, it becomes
- * very difficult to check conditions (i-a) and (i-b) too. In fact,
- * if there are active groups, then, for conditions (i-a) or (i-b) to
- * become false 'indirectly', it is enough that an active group
- * contains more active processes or sub-groups than some other active
- * group. More precisely, for conditions (i-a) or (i-b) to become
- * false because of such a group, it is not even necessary that the
- * group is (still) active: it is sufficient that, even if the group
- * has become inactive, some of its descendant processes still have
- * some request already dispatched but still waiting for
- * completion. In fact, requests have still to be guaranteed their
- * share of the throughput even after being dispatched. In this
- * respect, it is easy to show that, if a group frequently becomes
- * inactive while still having in-flight requests, and if, when this
- * happens, the group is not considered in the calculation of whether
- * the scenario is asymmetric, then the group may fail to be
- * guaranteed its fair share of the throughput (basically because
- * idling may not be performed for the descendant processes of the
- * group, but it had to be). We address this issue with the following
- * bi-modal behavior, implemented in the function
- * bfq_asymmetric_scenario().
- *
- * If there are groups with requests waiting for completion
- * (as commented above, some of these groups may even be
- * already inactive), then the scenario is tagged as
- * asymmetric, conservatively, without checking any of the
- * conditions (i-a), (i-b) or (ii). So the device is idled for bfqq.
- * This behavior matches also the fact that groups are created
- * exactly if controlling I/O is a primary concern (to
- * preserve bandwidth and latency guarantees).
- *
- * On the opposite end, if there are no groups with requests waiting
- * for completion, then only conditions (i-a) and (i-b) are actually
- * controlled, i.e., provided that conditions (i-a) or (i-b) holds,
- * idling is not performed, regardless of whether condition (ii)
- * holds. In other words, only if conditions (i-a) and (i-b) do not
- * hold, then idling is allowed, and the device tends to be prevented
- * from queueing many requests, possibly of several processes. Since
- * there are no groups with requests waiting for completion, then, to
- * control conditions (i-a) and (i-b) it is enough to check just
- * whether all the queues with requests waiting for completion also
- * have the same weight.
- *
- * Not checking condition (ii) evidently exposes bfqq to the
- * risk of getting less throughput than its fair share.
- * However, for queues with the same weight, a further
- * mechanism, preemption, mitigates or even eliminates this
- * problem. And it does so without consequences on overall
- * throughput. This mechanism and its benefits are explained
- * in the next three paragraphs.
- *
- * Even if a queue, say Q, is expired when it remains idle, Q
- * can still preempt the new in-service queue if the next
- * request of Q arrives soon (see the comments on
- * bfq_bfqq_update_budg_for_activation). If all queues and
- * groups have the same weight, this form of preemption,
- * combined with the hole-recovery heuristic described in the
- * comments on function bfq_bfqq_update_budg_for_activation,
- * are enough to preserve a correct bandwidth distribution in
- * the mid term, even without idling. In fact, even if not
- * idling allows the internal queues of the device to contain
- * many requests, and thus to reorder requests, we can rather
- * safely assume that the internal scheduler still preserves a
- * minimum of mid-term fairness.
- *
- * More precisely, this preemption-based, idleless approach
- * provides fairness in terms of IOPS, and not sectors per
- * second. This can be seen with a simple example. Suppose
- * that there are two queues with the same weight, but that
- * the first queue receives requests of 8 sectors, while the
- * second queue receives requests of 1024 sectors. In
- * addition, suppose that each of the two queues contains at
- * most one request at a time, which implies that each queue
- * always remains idle after it is served. Finally, after
- * remaining idle, each queue receives very quickly a new
- * request. It follows that the two queues are served
- * alternatively, preempting each other if needed. This
- * implies that, although both queues have the same weight,
- * the queue with large requests receives a service that is
- * 1024/8 times as high as the service received by the other
- * queue.
- *
- * The motivation for using preemption instead of idling (for
- * queues with the same weight) is that, by not idling,
- * service guarantees are preserved (completely or at least in
- * part) without minimally sacrificing throughput. And, if
- * there is no active group, then the primary expectation for
- * this device is probably a high throughput.
- *
- * We are now left only with explaining the additional
- * compound condition that is checked below for deciding
- * whether the scenario is asymmetric. To explain this
- * compound condition, we need to add that the function
- * bfq_asymmetric_scenario checks the weights of only
- * non-weight-raised queues, for efficiency reasons (see
- * comments on bfq_weights_tree_add()). Then the fact that
- * bfqq is weight-raised is checked explicitly here. More
- * precisely, the compound condition below takes into account
- * also the fact that, even if bfqq is being weight-raised,
- * the scenario is still symmetric if all queues with requests
- * waiting for completion happen to be
- * weight-raised. Actually, we should be even more precise
- * here, and differentiate between interactive weight raising
- * and soft real-time weight raising.
- *
- * As a side note, it is worth considering that the above
- * device-idling countermeasures may however fail in the
- * following unlucky scenario: if idling is (correctly)
- * disabled in a time period during which all symmetry
- * sub-conditions hold, and hence the device is allowed to
- * enqueue many requests, but at some later point in time some
- * sub-condition stops to hold, then it may become impossible
- * to let requests be served in the desired order until all
- * the requests already queued in the device have been served.
- */
-static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- return (bfqq->wr_coeff > 1 &&
- bfqd->wr_busy_queues <
- bfq_tot_busy_queues(bfqd)) ||
- bfq_asymmetric_scenario(bfqd, bfqq);
-}
-
-/*
* For a queue that becomes empty, device idling is allowed only if
* this function returns true for that queue. As a consequence, since
* device idling plays a critical role for both throughput boosting
@@ -4156,22 +4402,95 @@ check_queue:
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
struct bfq_queue *async_bfqq =
bfqq->bic && bfqq->bic->bfqq[0] &&
- bfq_bfqq_busy(bfqq->bic->bfqq[0]) ?
+ bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
+ bfqq->bic->bfqq[0]->next_rq ?
bfqq->bic->bfqq[0] : NULL;
/*
- * If the process associated with bfqq has also async
- * I/O pending, then inject it
- * unconditionally. Injecting I/O from the same
- * process can cause no harm to the process. On the
- * contrary, it can only increase bandwidth and reduce
- * latency for the process.
+ * The next three mutually-exclusive ifs decide
+ * whether to try injection, and choose the queue to
+ * pick an I/O request from.
+ *
+ * The first if checks whether the process associated
+ * with bfqq has also async I/O pending. If so, it
+ * injects such I/O unconditionally. Injecting async
+ * I/O from the same process can cause no harm to the
+ * process. On the contrary, it can only increase
+ * bandwidth and reduce latency for the process.
+ *
+ * The second if checks whether there happens to be a
+ * non-empty waker queue for bfqq, i.e., a queue whose
+ * I/O needs to be completed for bfqq to receive new
+ * I/O. This happens, e.g., if bfqq is associated with
+ * a process that does some sync. A sync generates
+ * extra blocking I/O, which must be completed before
+ * the process associated with bfqq can go on with its
+ * I/O. If the I/O of the waker queue is not served,
+ * then bfqq remains empty, and no I/O is dispatched,
+ * until the idle timeout fires for bfqq. This is
+ * likely to result in lower bandwidth and higher
+ * latencies for bfqq, and in a severe loss of total
+ * throughput. The best action to take is therefore to
+ * serve the waker queue as soon as possible. So do it
+ * (without relying on the third alternative below for
+ * eventually serving waker_bfqq's I/O; see the last
+ * paragraph for further details). This systematic
+ * injection of I/O from the waker queue does not
+ * cause any delay to bfqq's I/O. On the contrary,
+	 * bfqq's next I/O is brought forward dramatically, because it
+ * for it is not blocked for milliseconds.
+ *
+ * The third if checks whether bfqq is a queue for
+ * which it is better to avoid injection. It is so if
+ * bfqq delivers more throughput when served without
+ * any further I/O from other queues in the middle, or
+ * if the service times of bfqq's I/O requests both
+	 * matter more than overall throughput and may be
+	 * easily inflated by injection (this happens if bfqq
+	 * has a short think time). If none of these
+ * conditions holds, then a candidate queue for
+ * injection is looked for through
+ * bfq_choose_bfqq_for_injection(). Note that the
+ * latter may return NULL (for example if the inject
+ * limit for bfqq is currently 0).
+ *
+ * NOTE: motivation for the second alternative
+ *
+ * Thanks to the way the inject limit is updated in
+ * bfq_update_has_short_ttime(), it is rather likely
+ * that, if I/O is being plugged for bfqq and the
+ * waker queue has pending I/O requests that are
+ * blocking bfqq's I/O, then the third alternative
+ * above lets the waker queue get served before the
+ * I/O-plugging timeout fires. So one may deem the
+ * second alternative superfluous. It is not, because
+ * the third alternative may be way less effective in
+	 * case of a synchronization, for two main
+ * reasons. First, throughput may be low because the
+ * inject limit may be too low to guarantee the same
+ * amount of injected I/O, from the waker queue or
+ * other queues, that the second alternative
+ * guarantees (the second alternative unconditionally
+ * injects a pending I/O request of the waker queue
+ * for each bfq_dispatch_request()). Second, with the
+ * third alternative, the duration of the plugging,
+ * i.e., the time before bfqq finally receives new I/O,
+ * may not be minimized, because the waker queue may
+ * happen to be served only after other queues.
*/
if (async_bfqq &&
icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
bfq_bfqq_budget_left(async_bfqq))
bfqq = bfqq->bic->bfqq[0];
+ else if (bfq_bfqq_has_waker(bfqq) &&
+ bfq_bfqq_busy(bfqq->waker_bfqq) &&
+ bfqq->next_rq &&
+ bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
+ bfqq->waker_bfqq) <=
+ bfq_bfqq_budget_left(bfqq->waker_bfqq)
+ )
+ bfqq = bfqq->waker_bfqq;
else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
(bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
!bfq_bfqq_has_short_ttime(bfqq)))
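Condensed, the selection order described in the comment above looks roughly like the following sketch (helper names are the ones visible in this hunk; budget and throughput checks are omitted, and the signature of bfq_choose_bfqq_for_injection() is assumed):

static struct bfq_queue *pick_injection_source(struct bfq_data *bfqd,
					       struct bfq_queue *bfqq,
					       struct bfq_queue *async_bfqq)
{
	/* 1) async I/O of the same process: always safe to inject */
	if (async_bfqq)
		return async_bfqq;

	/* 2) a busy waker queue: serving it unblocks bfqq's own I/O soon */
	if (bfq_bfqq_has_waker(bfqq) && bfq_bfqq_busy(bfqq->waker_bfqq))
		return bfqq->waker_bfqq;

	/* 3) otherwise the generic candidate, which may well be NULL */
	return bfq_choose_bfqq_for_injection(bfqd);
}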
@@ -4403,7 +4722,7 @@ exit:
return rq;
}
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static void bfq_update_dispatch_stats(struct request_queue *q,
struct request *rq,
struct bfq_queue *in_serv_queue,
@@ -4453,7 +4772,7 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q,
struct request *rq,
struct bfq_queue *in_serv_queue,
bool idle_timer_disabled) {}
-#endif
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -4489,6 +4808,8 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
*/
void bfq_put_queue(struct bfq_queue *bfqq)
{
+ struct bfq_queue *item;
+ struct hlist_node *n;
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqg = bfqq_group(bfqq);
#endif
@@ -4533,6 +4854,36 @@ void bfq_put_queue(struct bfq_queue *bfqq)
bfqq->bfqd->burst_size--;
}
+ /*
+ * bfqq does not exist any longer, so it cannot be woken by
+ * any other queue, and cannot wake any other queue. Then bfqq
+ * must be removed from the woken list of its possible waker
+ * queue, and all queues in the woken list of bfqq must stop
+ * having a waker queue. Strictly speaking, these updates
+ * should be performed when bfqq remains with no I/O source
+ * attached to it, which happens before bfqq gets freed. In
+ * particular, this happens when the last process associated
+ * with bfqq exits or gets associated with a different
+ * queue. However, both events lead to bfqq being freed soon,
+ * and dangling references would come out only after bfqq gets
+ * freed. So these updates are done here, as a simple and safe
+ * way to handle all cases.
+ */
+ /* remove bfqq from woken list */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+
+ /* reset waker for all queues in woken list */
+ hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
+ woken_list_node) {
+ item->waker_bfqq = NULL;
+ bfq_clear_bfqq_has_waker(item);
+ hlist_del_init(&item->woken_list_node);
+ }
+
+ if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
+ bfqq->bfqd->last_completed_rq_bfqq = NULL;
+
kmem_cache_free(bfq_pool, bfqq);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqg_and_blkg_put(bfqg);
@@ -4561,7 +4912,7 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
if (bfqq == bfqd->in_service_queue) {
- __bfq_bfqq_expire(bfqd, bfqq);
+ __bfq_bfqq_expire(bfqd, bfqq, BFQQE_BUDGET_TIMEOUT);
bfq_schedule_dispatch(bfqd);
}
@@ -4569,7 +4920,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_put_cooperator(bfqq);
- bfq_put_queue(bfqq); /* release process reference */
+ bfq_release_process_ref(bfqd, bfqq);
}
static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
@@ -4671,8 +5022,7 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bfqq = bic_to_bfqq(bic, false);
if (bfqq) {
- /* release process reference on this queue */
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqd, bfqq);
bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
bic_set_bfqq(bic, bfqq, false);
}
@@ -4688,6 +5038,8 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
INIT_HLIST_NODE(&bfqq->burst_list_node);
+ INIT_HLIST_NODE(&bfqq->woken_list_node);
+ INIT_HLIST_HEAD(&bfqq->woken_list);
bfqq->ref = 0;
bfqq->bfqd = bfqd;
@@ -4855,7 +5207,7 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
struct bfq_io_cq *bic)
{
- bool has_short_ttime = true;
+ bool has_short_ttime = true, state_changed;
/*
* No need to update has_short_ttime if bfqq is async or in
@@ -4880,13 +5232,102 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
has_short_ttime = false;
- bfq_log_bfqq(bfqd, bfqq, "update_has_short_ttime: has_short_ttime %d",
- has_short_ttime);
+ state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
if (has_short_ttime)
bfq_mark_bfqq_has_short_ttime(bfqq);
else
bfq_clear_bfqq_has_short_ttime(bfqq);
+
+ /*
+ * Until the base value for the total service time gets
+ * finally computed for bfqq, the inject limit does depend on
+ * the think-time state (short|long). In particular, the limit
+ * is 0 or 1 if the think time is deemed, respectively, as
+ * short or long (details in the comments in
+ * bfq_update_inject_limit()). Accordingly, the next
+ * instructions reset the inject limit if the think-time state
+ * has changed and the above base value is still to be
+ * computed.
+ *
+ * However, the reset is performed only if more than 100 ms
+ * have elapsed since the last update of the inject limit, or
+ * (inclusive) if the change is from short to long think
+ * time. The reason for this waiting is as follows.
+ *
+ * bfqq may have a long think time because of a
+ * synchronization with some other queue, i.e., because the
+ * I/O of some other queue may need to be completed for bfqq
+ * to receive new I/O. Details in the comments on the choice
+ * of the queue for injection in bfq_select_queue().
+ *
+ * As stressed in those comments, if such a synchronization is
+ * actually in place, then, without injection on bfqq, the
+	 * blocking I/O cannot happen to be served while bfqq is in
+ * service. As a consequence, if bfqq is granted
+ * I/O-dispatch-plugging, then bfqq remains empty, and no I/O
+ * is dispatched, until the idle timeout fires. This is likely
+ * to result in lower bandwidth and higher latencies for bfqq,
+ * and in a severe loss of total throughput.
+ *
+ * On the opposite end, a non-zero inject limit may allow the
+ * I/O that blocks bfqq to be executed soon, and therefore
+ * bfqq to receive new I/O soon.
+ *
+ * But, if the blocking gets actually eliminated, then the
+ * next think-time sample for bfqq may be very low. This in
+ * turn may cause bfqq's think time to be deemed
+ * short. Without the 100 ms barrier, this new state change
+ * would cause the body of the next if to be executed
+ * immediately. But this would set to 0 the inject
+ * limit. Without injection, the blocking I/O would cause the
+ * think time of bfqq to become long again, and therefore the
+ * inject limit to be raised again, and so on. The only effect
+ * of such a steady oscillation between the two think-time
+ * states would be to prevent effective injection on bfqq.
+ *
+ * In contrast, if the inject limit is not reset during such a
+ * long time interval as 100 ms, then the number of short
+ * think time samples can grow significantly before the reset
+ * is performed. As a consequence, the think time state can
+ * become stable before the reset. Therefore there will be no
+ * state change when the 100 ms elapse, and no reset of the
+ * inject limit. The inject limit remains steadily equal to 1
+ * both during and after the 100 ms. So injection can be
+ * performed at all times, and throughput gets boosted.
+ *
+ * An inject limit equal to 1 is however in conflict, in
+ * general, with the fact that the think time of bfqq is
+ * short, because injection may be likely to delay bfqq's I/O
+ * (as explained in the comments in
+ * bfq_update_inject_limit()). But this does not happen in
+ * this special case, because bfqq's low think time is due to
+ * an effective handling of a synchronization, through
+ * injection. In this special case, bfqq's I/O does not get
+ * delayed by injection; on the contrary, bfqq's I/O is
+ * brought forward, because it is not blocked for
+ * milliseconds.
+ *
+ * In addition, serving the blocking I/O much sooner, and much
+ * more frequently than once per I/O-plugging timeout, makes
+ * it much quicker to detect a waker queue (the concept of
+ * waker queue is defined in the comments in
+ * bfq_add_request()). This makes it possible to start sooner
+ * to boost throughput more effectively, by injecting the I/O
+ * of the waker queue unconditionally on every
+ * bfq_dispatch_request().
+ *
+ * One last, important benefit of not resetting the inject
+ * limit before 100 ms is that, during this time interval, the
+ * base value for the total service time is likely to get
+ * finally computed for bfqq, freeing the inject limit from
+ * its relation with the think time.
+ */
+ if (state_changed && bfqq->last_serv_time_ns == 0 &&
+ (time_is_before_eq_jiffies(bfqq->decrease_time_jif +
+ msecs_to_jiffies(100)) ||
+ !has_short_ttime))
+ bfq_reset_inject_limit(bfqd, bfqq);
}
/*
@@ -4896,19 +5337,9 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct request *rq)
{
- struct bfq_io_cq *bic = RQ_BIC(rq);
-
if (rq->cmd_flags & REQ_META)
bfqq->meta_pending++;
- bfq_update_io_thinktime(bfqd, bfqq);
- bfq_update_has_short_ttime(bfqd, bfqq, bic);
- bfq_update_io_seektime(bfqd, bfqq, rq);
-
- bfq_log_bfqq(bfqd, bfqq,
- "rq_enqueued: has_short_ttime=%d (seeky %d)",
- bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq));
-
bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq);
if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) {
@@ -4996,6 +5427,10 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
bfqq = new_bfqq;
}
+ bfq_update_io_thinktime(bfqd, bfqq);
+ bfq_update_has_short_ttime(bfqd, bfqq, RQ_BIC(rq));
+ bfq_update_io_seektime(bfqd, bfqq, rq);
+
waiting = bfqq && bfq_bfqq_wait_request(bfqq);
bfq_add_request(rq);
idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq);
@@ -5008,7 +5443,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
return idle_timer_disabled;
}
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
static void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
@@ -5038,7 +5473,7 @@ static inline void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
unsigned int cmd_flags) {}
-#endif
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
bool at_head)
@@ -5061,12 +5496,12 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
spin_lock_irq(&bfqd->lock);
bfqq = bfq_init_rq(rq);
- if (at_head || blk_rq_is_passthrough(rq)) {
+ if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
if (at_head)
list_add(&rq->queuelist, &bfqd->dispatch);
else
list_add_tail(&rq->queuelist, &bfqd->dispatch);
- } else { /* bfqq is assumed to be non null here */
+ } else {
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
* Update bfqq, because, if a queue merge has occurred
@@ -5201,6 +5636,7 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
1UL<<(BFQ_RATE_SHIFT - 10))
bfq_update_rate_reset(bfqd, NULL);
bfqd->last_completion = now_ns;
+ bfqd->last_completed_rq_bfqq = bfqq;
/*
* If we are waiting to discover whether the request pattern
@@ -5382,14 +5818,14 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
u64 tot_time_ns = ktime_get_ns() - bfqd->last_empty_occupied_ns;
unsigned int old_limit = bfqq->inject_limit;
- if (bfqq->last_serv_time_ns > 0) {
+ if (bfqq->last_serv_time_ns > 0 && bfqd->rqs_injected) {
u64 threshold = (bfqq->last_serv_time_ns * 3)>>1;
if (tot_time_ns >= threshold && old_limit > 0) {
bfqq->inject_limit--;
bfqq->decrease_time_jif = jiffies;
} else if (tot_time_ns < threshold &&
- old_limit < bfqd->max_rq_in_driver<<1)
+ old_limit <= bfqd->max_rq_in_driver)
bfqq->inject_limit++;
}
@@ -5398,19 +5834,39 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
* total service time, and there seem to be the right
* conditions to do it, or we can lower the last base value
* computed.
+ *
+	 * NOTE: (bfqd->rq_in_driver == 1) means that no I/O request
+	 * other than the one just completed is in flight, because this
+	 * function is in the code path that handles the completion of a
+	 * request of bfqq and, in particular, is executed before
+	 * bfqd->rq_in_driver is decremented in that code path.
*/
- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 0) ||
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
tot_time_ns < bfqq->last_serv_time_ns) {
+ if (bfqq->last_serv_time_ns == 0) {
+ /*
+ * Now we certainly have a base value: make sure we
+ * start trying injection.
+ */
+ bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
+ }
bfqq->last_serv_time_ns = tot_time_ns;
+ } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
/*
- * Now we certainly have a base value: make sure we
- * start trying injection.
+ * No I/O injected and no request still in service in
+ * the drive: these are the exact conditions for
+ * computing the base value of the total service time
+ * for bfqq. So let's update this value, because it is
+ * rather variable. For example, it varies if the size
+ * or the spatial locality of the I/O requests in bfqq
+ * change.
*/
- bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
- }
+ bfqq->last_serv_time_ns = tot_time_ns;
+
/* update complete, not waiting for any request completion any longer */
bfqd->waited_rq = NULL;
+ bfqd->rqs_injected = false;
}
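A worked example of the 3/2 threshold used above, with made-up numbers: a base service time of 2 ms yields a threshold of 3 ms, so a 3.5 ms sample (with a non-zero limit) lowers the inject limit, while a 2.5 ms sample (with the limit not above max_rq_in_driver) raises it. As a sketch:

static bool sample_exceeds_inject_threshold(u64 last_serv_time_ns,
					    u64 tot_time_ns)
{
	/* e.g. last_serv_time_ns = 2,000,000 ns -> threshold = 3,000,000 ns */
	u64 threshold = (last_serv_time_ns * 3) >> 1;

	return tot_time_ns >= threshold;	/* true -> limit is lowered */
}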
/*
@@ -5527,7 +5983,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
bfq_put_cooperator(bfqq);
- bfq_put_queue(bfqq);
+ bfq_release_process_ref(bfqq->bfqd, bfqq);
return NULL;
}
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index c2faa77824f8..5d1a519640f6 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -168,6 +168,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
+	/* device weight; if non-zero, it overrides the default weight of
+ * bfq_group_data */
+ int dev_weight;
/* weight of the queue */
int weight;
/* next weight if a change is in progress */
@@ -357,6 +360,24 @@ struct bfq_queue {
/* max service rate measured so far */
u32 max_service_rate;
+
+ /*
+ * Pointer to the waker queue for this queue, i.e., to the
+ * queue Q such that this queue happens to get new I/O right
+ * after some I/O request of Q is completed. For details, see
+ * the comments on the choice of the queue for injection in
+ * bfq_select_queue().
+ */
+ struct bfq_queue *waker_bfqq;
+ /* node for woken_list, see below */
+ struct hlist_node woken_list_node;
+ /*
+ * Head of the list of the woken queues for this queue, i.e.,
+ * of the list of the queues for which this queue is a waker
+ * queue. This list is used to reset the waker_bfqq pointer in
+ * the woken queues when this queue exits.
+ */
+ struct hlist_head woken_list;
};
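How the two new fields pair up can be sketched as follows (illustrative only; the actual waker detection lives in bfq_add_request(), which is not part of this hunk): when queue A's request completion is immediately followed by new I/O for queue B, B records A as its waker and links itself into A's woken_list.

static void example_record_waker(struct bfq_queue *woken,
				 struct bfq_queue *waker)
{
	woken->waker_bfqq = waker;
	hlist_add_head(&woken->woken_list_node, &waker->woken_list);
	/* bfq_put_queue() later undoes both links, as shown earlier */
}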
/**
@@ -533,6 +554,9 @@ struct bfq_data {
/* time of last request completion (ns) */
u64 last_completion;
+ /* bfqq owning the last completed rq */
+ struct bfq_queue *last_completed_rq_bfqq;
+
/* time of last transition from empty to non-empty (ns) */
u64 last_empty_occupied_ns;
@@ -743,7 +767,8 @@ enum bfqq_state_flags {
* update
*/
BFQQF_coop, /* bfqq is shared */
- BFQQF_split_coop /* shared bfqq will be split */
+ BFQQF_split_coop, /* shared bfqq will be split */
+ BFQQF_has_waker /* bfqq has a waker queue */
};
#define BFQ_BFQQ_FNS(name) \
@@ -763,6 +788,7 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
+BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS
/* Expiration reasons. */
@@ -777,8 +803,13 @@ enum bfqq_expiration {
BFQQE_PREEMPTED /* preemption in progress */
};
+struct bfq_stat {
+ struct percpu_counter cpu_cnt;
+ atomic64_t aux_cnt;
+};
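bfq_stat pairs a per-CPU counter with an auxiliary atomic64 accumulator; reading it presumably sums both halves, along the lines of this sketch (the helper name is illustrative, not necessarily the one this series adds):

static inline u64 bfq_stat_read_example(struct bfq_stat *stat)
{
	return percpu_counter_sum_positive(&stat->cpu_cnt) +
		atomic64_read(&stat->aux_cnt);
}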
+
struct bfqg_stats {
-#if defined(CONFIG_BFQ_GROUP_IOSCHED) && defined(CONFIG_DEBUG_BLK_CGROUP)
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
/* number of ios merged */
struct blkg_rwstat merged;
/* total time spent on device in ns, may not be accurate w/ queueing */
@@ -788,25 +819,25 @@ struct bfqg_stats {
/* number of IOs queued up */
struct blkg_rwstat queued;
/* total disk time and nr sectors dispatched by this group */
- struct blkg_stat time;
+ struct bfq_stat time;
/* sum of number of ios queued across all samples */
- struct blkg_stat avg_queue_size_sum;
+ struct bfq_stat avg_queue_size_sum;
/* count of samples taken for average */
- struct blkg_stat avg_queue_size_samples;
+ struct bfq_stat avg_queue_size_samples;
/* how many times this group has been removed from service tree */
- struct blkg_stat dequeue;
+ struct bfq_stat dequeue;
/* total time spent waiting for it to be assigned a timeslice. */
- struct blkg_stat group_wait_time;
+ struct bfq_stat group_wait_time;
/* time spent idling for this blkcg_gq */
- struct blkg_stat idle_time;
+ struct bfq_stat idle_time;
/* total time with empty current active q with other requests queued */
- struct blkg_stat empty_time;
+ struct bfq_stat empty_time;
/* fields after this shouldn't be cleared on stat reset */
u64 start_group_wait_time;
u64 start_idle_time;
u64 start_empty_time;
uint16_t flags;
-#endif /* CONFIG_BFQ_GROUP_IOSCHED && CONFIG_DEBUG_BLK_CGROUP */
+#endif /* CONFIG_BFQ_CGROUP_DEBUG */
};
#ifdef CONFIG_BFQ_GROUP_IOSCHED
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index c9ba225081ce..05f0bf4a1144 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -744,6 +744,8 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
}
#endif
+ /* Matches the smp_wmb() in bfq_group_set_weight. */
+ smp_rmb();
old_st->wsum -= entity->weight;
if (entity->new_weight != entity->orig_weight) {
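The smp_rmb() added here pairs with an smp_wmb() on the weight-update side. Schematically, the writer publishes the new weight before raising prio_changed, so a reader that observes the flag and then executes the smp_rmb() also observes the new weight (a sketch of the pattern, not the literal bfq_group_set_weight() body):

static void example_publish_weight(struct bfq_entity *entity,
				   unsigned short weight)
{
	entity->new_weight = weight;
	smp_wmb();		/* pairs with the smp_rmb() above */
	entity->prio_changed = 1;
}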
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 4db620849515..fb95dbb21dd8 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -276,8 +276,12 @@ bool bio_integrity_prep(struct bio *bio)
ret = bio_integrity_add_page(bio, virt_to_page(buf),
bytes, offset);
- if (ret == 0)
- return false;
+ if (ret == 0) {
+ printk(KERN_ERR "could not attach integrity payload\n");
+ kfree(buf);
+ status = BLK_STS_RESOURCE;
+ goto err_end_io;
+ }
if (ret < bytes)
break;
diff --git a/block/bio.c b/block/bio.c
index 67bba12d273b..c822ceb7c4de 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -16,6 +16,7 @@
#include <linux/workqueue.h>
#include <linux/cgroup.h>
#include <linux/blk-cgroup.h>
+#include <linux/highmem.h>
#include <trace/events/block.h>
#include "blk.h"
@@ -534,6 +535,45 @@ void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
}
EXPORT_SYMBOL(zero_fill_bio_iter);
+void bio_truncate(struct bio *bio, unsigned new_size)
+{
+ struct bio_vec bv;
+ struct bvec_iter iter;
+ unsigned int done = 0;
+ bool truncated = false;
+
+ if (new_size >= bio->bi_iter.bi_size)
+ return;
+
+ if (bio_data_dir(bio) != READ)
+ goto exit;
+
+ bio_for_each_segment(bv, bio, iter) {
+ if (done + bv.bv_len > new_size) {
+ unsigned offset;
+
+ if (!truncated)
+ offset = new_size - done;
+ else
+ offset = 0;
+ zero_user(bv.bv_page, offset, bv.bv_len - offset);
+ truncated = true;
+ }
+ done += bv.bv_len;
+ }
+
+ exit:
+ /*
+	 * Don't touch the bvec table here; keep it really immutable, since
+	 * the fs bio user has to retrieve all pages via bio_for_each_segment_all
+	 * in its ->bi_end_io() callback.
+	 *
+	 * It is enough to truncate the bio by updating .bi_size, since correct
+	 * bvecs can be generated for drivers from the updated .bi_size.
+ */
+ bio->bi_iter.bi_size = new_size;
+}
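A hypothetical caller (not part of this patch) might use bio_truncate() to clamp a READ that extends past the last sector of a device; "maxsector" below is assumed to be the capacity in 512-byte sectors:

static void clamp_bio_to_capacity(struct bio *bio, sector_t maxsector)
{
	if (bio->bi_iter.bi_sector >= maxsector)
		return;			/* fully past EOD: caller should fail it */

	maxsector -= bio->bi_iter.bi_sector;
	if ((bio->bi_iter.bi_size >> 9) <= maxsector)
		return;			/* fits entirely before EOD */

	/* zero the overrun (for READs) and shrink ->bi_size */
	bio_truncate(bio, maxsector << 9);
}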
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -558,14 +598,6 @@ void bio_put(struct bio *bio)
}
EXPORT_SYMBOL(bio_put);
-int bio_phys_segments(struct request_queue *q, struct bio *bio)
-{
- if (unlikely(!bio_flagged(bio, BIO_SEG_VALID)))
- blk_recount_segments(q, bio);
-
- return bio->bi_phys_segments;
-}
-
/**
* __bio_clone_fast - clone a bio that shares the original bio's biovec
* @bio: destination bio
@@ -653,25 +685,20 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
return true;
}
-/*
- * Check if the @page can be added to the current segment(@bv), and make
- * sure to call it only if page_is_mergeable(@bv, @page) is true
- */
-static bool can_add_page_to_seg(struct request_queue *q,
- struct bio_vec *bv, struct page *page, unsigned len,
- unsigned offset)
+static bool bio_try_merge_pc_page(struct request_queue *q, struct bio *bio,
+ struct page *page, unsigned len, unsigned offset,
+ bool *same_page)
{
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
-
if (bv->bv_len + len > queue_max_segment_size(q))
return false;
-
- return true;
+ return __bio_try_merge_page(bio, page, len, offset, same_page);
}
/**
@@ -681,7 +708,7 @@ static bool can_add_page_to_seg(struct request_queue *q,
* @page: page to add
* @len: vec entry length
* @offset: vec entry offset
- * @put_same_page: put the page if it is same with last added page
+ * @same_page: return if the merge happen inside the same page
*
* Attempt to add a page to the bio_vec maplist. This can fail for a
* number of reasons, such as the bio being full or target block device
@@ -692,10 +719,9 @@ static bool can_add_page_to_seg(struct request_queue *q,
*/
static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
- bool put_same_page)
+ bool *same_page)
{
struct bio_vec *bvec;
- bool same_page = false;
/*
* cloned bio must not modify vec list
@@ -707,34 +733,22 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
return 0;
if (bio->bi_vcnt > 0) {
- bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
-
- if (page == bvec->bv_page &&
- offset == bvec->bv_offset + bvec->bv_len) {
- if (put_same_page)
- put_page(page);
- bvec->bv_len += len;
- goto done;
- }
+ if (bio_try_merge_pc_page(q, bio, page, len, offset, same_page))
+ return len;
/*
- * If the queue doesn't support SG gaps and adding this
- * offset would create a gap, disallow it.
+ * If the queue doesn't support SG gaps and adding this segment
+ * would create a gap, disallow it.
*/
+ bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (bvec_gap_to_prev(q, bvec, offset))
return 0;
-
- if (page_is_mergeable(bvec, page, len, offset, &same_page) &&
- can_add_page_to_seg(q, bvec, page, len, offset)) {
- bvec->bv_len += len;
- goto done;
- }
}
if (bio_full(bio, len))
return 0;
- if (bio->bi_phys_segments >= queue_max_segments(q))
+ if (bio->bi_vcnt >= queue_max_segments(q))
return 0;
bvec = &bio->bi_io_vec[bio->bi_vcnt];
@@ -742,17 +756,15 @@ static int __bio_add_pc_page(struct request_queue *q, struct bio *bio,
bvec->bv_len = len;
bvec->bv_offset = offset;
bio->bi_vcnt++;
- done:
bio->bi_iter.bi_size += len;
- bio->bi_phys_segments = bio->bi_vcnt;
- bio_set_flag(bio, BIO_SEG_VALID);
return len;
}
int bio_add_pc_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset)
{
- return __bio_add_pc_page(q, bio, page, len, offset, false);
+ bool same_page = false;
+ return __bio_add_pc_page(q, bio, page, len, offset, &same_page);
}
EXPORT_SYMBOL(bio_add_pc_page);
@@ -782,6 +794,8 @@ bool __bio_try_merge_page(struct bio *bio, struct page *page,
struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
if (page_is_mergeable(bv, page, len, off, same_page)) {
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return false;
bv->bv_len += len;
bio->bi_iter.bi_size += len;
return true;
@@ -815,6 +829,9 @@ void __bio_add_page(struct bio *bio, struct page *page,
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
+
+ if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
+ bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
@@ -842,22 +859,19 @@ int bio_add_page(struct bio *bio, struct page *page,
}
EXPORT_SYMBOL(bio_add_page);
-static void bio_get_pages(struct bio *bio)
+void bio_release_pages(struct bio *bio, bool mark_dirty)
{
struct bvec_iter_all iter_all;
struct bio_vec *bvec;
- bio_for_each_segment_all(bvec, bio, iter_all)
- get_page(bvec->bv_page);
-}
-
-static void bio_release_pages(struct bio *bio)
-{
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ if (bio_flagged(bio, BIO_NO_PAGE_REF))
+ return;
- bio_for_each_segment_all(bvec, bio, iter_all)
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ if (mark_dirty && !PageCompound(bvec->bv_page))
+ set_page_dirty_lock(bvec->bv_page);
put_page(bvec->bv_page);
+ }
}
static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
@@ -968,11 +982,8 @@ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- if (iov_iter_bvec_no_ref(iter))
+ if (is_bvec)
bio_set_flag(bio, BIO_NO_PAGE_REF);
- else if (is_bvec)
- bio_get_pages(bio);
-
return bio->bi_vcnt ? 0 : ret;
}
@@ -1124,8 +1135,7 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
if (data->nr_segs > UIO_MAXIOV)
return NULL;
- bmd = kmalloc(sizeof(struct bio_map_data) +
- sizeof(struct iovec) * data->nr_segs, gfp_mask);
+ bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
if (!bmd)
return NULL;
memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
@@ -1371,8 +1381,6 @@ struct bio *bio_map_user_iov(struct request_queue *q,
int j;
struct bio *bio;
int ret;
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
if (!iov_iter_count(iter))
return ERR_PTR(-EINVAL);
@@ -1402,13 +1410,17 @@ struct bio *bio_map_user_iov(struct request_queue *q,
for (j = 0; j < npages; j++) {
struct page *page = pages[j];
unsigned int n = PAGE_SIZE - offs;
+ bool same_page = false;
if (n > bytes)
n = bytes;
if (!__bio_add_pc_page(q, bio, page, n, offs,
- true))
+ &same_page)) {
+ if (same_page)
+ put_page(page);
break;
+ }
added += n;
bytes -= n;
@@ -1439,31 +1451,11 @@ struct bio *bio_map_user_iov(struct request_queue *q,
return bio;
out_unmap:
- bio_for_each_segment_all(bvec, bio, iter_all) {
- put_page(bvec->bv_page);
- }
+ bio_release_pages(bio, false);
bio_put(bio);
return ERR_PTR(ret);
}
-static void __bio_unmap_user(struct bio *bio)
-{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
-
- /*
- * make sure we dirty pages we wrote to
- */
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (bio_data_dir(bio) == READ)
- set_page_dirty_lock(bvec->bv_page);
-
- put_page(bvec->bv_page);
- }
-
- bio_put(bio);
-}
-
/**
* bio_unmap_user - unmap a bio
* @bio: the bio being unmapped
@@ -1475,12 +1467,27 @@ static void __bio_unmap_user(struct bio *bio)
*/
void bio_unmap_user(struct bio *bio)
{
- __bio_unmap_user(bio);
+ bio_release_pages(bio, bio_data_dir(bio) == READ);
+ bio_put(bio);
bio_put(bio);
}
+static void bio_invalidate_vmalloc_pages(struct bio *bio)
+{
+#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+ if (bio->bi_private && !op_is_write(bio_op(bio))) {
+ unsigned long i, len = 0;
+
+ for (i = 0; i < bio->bi_vcnt; i++)
+ len += bio->bi_io_vec[i].bv_len;
+ invalidate_kernel_vmap_range(bio->bi_private, len);
+ }
+#endif
+}
+
static void bio_map_kern_endio(struct bio *bio)
{
+ bio_invalidate_vmalloc_pages(bio);
bio_put(bio);
}
@@ -1501,6 +1508,8 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
unsigned long end = (kaddr + len + PAGE_SIZE - 1) >> PAGE_SHIFT;
unsigned long start = kaddr >> PAGE_SHIFT;
const int nr_pages = end - start;
+ bool is_vmalloc = is_vmalloc_addr(data);
+ struct page *page;
int offset, i;
struct bio *bio;
@@ -1508,6 +1517,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (!bio)
return ERR_PTR(-ENOMEM);
+ if (is_vmalloc) {
+ flush_kernel_vmap_range(data, len);
+ bio->bi_private = data;
+ }
+
offset = offset_in_page(kaddr);
for (i = 0; i < nr_pages; i++) {
unsigned int bytes = PAGE_SIZE - offset;
@@ -1518,7 +1532,11 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
if (bytes > len)
bytes = len;
- if (bio_add_pc_page(q, bio, virt_to_page(data), bytes,
+ if (!is_vmalloc)
+ page = virt_to_page(data);
+ else
+ page = vmalloc_to_page(data);
+ if (bio_add_pc_page(q, bio, page, bytes,
offset) < bytes) {
/* we don't support partial mappings */
bio_put(bio);
@@ -1533,7 +1551,6 @@ struct bio *bio_map_kern(struct request_queue *q, void *data, unsigned int len,
bio->bi_end_io = bio_map_kern_endio;
return bio;
}
-EXPORT_SYMBOL(bio_map_kern);
static void bio_copy_kern_endio(struct bio *bio)
{
@@ -1695,9 +1712,7 @@ static void bio_dirty_fn(struct work_struct *work)
while ((bio = next) != NULL) {
next = bio->bi_private;
- bio_set_pages_dirty(bio);
- if (!bio_flagged(bio, BIO_NO_PAGE_REF))
- bio_release_pages(bio);
+ bio_release_pages(bio, true);
bio_put(bio);
}
}
@@ -1713,8 +1728,7 @@ void bio_check_pages_dirty(struct bio *bio)
goto defer;
}
- if (!bio_flagged(bio, BIO_NO_PAGE_REF))
- bio_release_pages(bio);
+ bio_release_pages(bio, false);
bio_put(bio);
return;
defer:
@@ -1775,18 +1789,6 @@ void generic_end_io_acct(struct request_queue *q, int req_op,
}
EXPORT_SYMBOL(generic_end_io_acct);
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-void bio_flush_dcache_pages(struct bio *bi)
-{
- struct bio_vec bvec;
- struct bvec_iter iter;
-
- bio_for_each_segment(bvec, bi, iter)
- flush_dcache_page(bvec.bv_page);
-}
-EXPORT_SYMBOL(bio_flush_dcache_pages);
-#endif
-
static inline bool bio_remaining_done(struct bio *bio)
{
/*
@@ -1869,8 +1871,8 @@ EXPORT_SYMBOL(bio_endio);
* @bio, and updates @bio to represent the remaining sectors.
*
* Unless this is a discard request the newly allocated bio will point
- * to @bio's bi_io_vec; it is the caller's responsibility to ensure that
- * @bio is not freed before the split.
+ * to @bio's bi_io_vec. It is the caller's responsibility to ensure that
+ * neither @bio nor @bs are freed before the split bio.
*/
struct bio *bio_split(struct bio *bio, int sectors,
gfp_t gfp, struct bio_set *bs)
@@ -1914,10 +1916,7 @@ void bio_trim(struct bio *bio, int offset, int size)
if (offset == 0 && size == bio->bi_iter.bi_size)
return;
- bio_clear_flag(bio, BIO_SEG_VALID);
-
bio_advance(bio, offset << 9);
-
bio->bi_iter.bi_size = size;
if (bio_integrity(bio))
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1f7127b03490..1eb8895be4c6 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -29,6 +29,7 @@
#include <linux/ctype.h>
#include <linux/blk-cgroup.h>
#include <linux/tracehook.h>
+#include <linux/psi.h>
#include "blk.h"
#define MAX_KEY_LEN 100
@@ -47,12 +48,14 @@ struct blkcg blkcg_root;
EXPORT_SYMBOL_GPL(blkcg_root);
struct cgroup_subsys_state * const blkcg_root_css = &blkcg_root.css;
+EXPORT_SYMBOL_GPL(blkcg_root_css);
static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
-static bool blkcg_debug_stats = false;
+bool blkcg_debug_stats = false;
+static struct workqueue_struct *blkcg_punt_bio_wq;
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -79,6 +82,7 @@ static void blkg_free(struct blkcg_gq *blkg)
blkg_rwstat_exit(&blkg->stat_ios);
blkg_rwstat_exit(&blkg->stat_bytes);
+ percpu_ref_exit(&blkg->refcnt);
kfree(blkg);
}
@@ -86,7 +90,7 @@ static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
- percpu_ref_exit(&blkg->refcnt);
+ WARN_ON(!bio_list_empty(&blkg->async_bios));
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
@@ -113,6 +117,23 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+static void blkg_async_bio_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ async_bio_work);
+ struct bio_list bios = BIO_EMPTY_LIST;
+ struct bio *bio;
+
+ /* as long as there are pending bios, @blkg can't go away */
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_merge(&bios, &blkg->async_bios);
+ bio_list_init(&blkg->async_bios);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ while ((bio = bio_list_pop(&bios)))
+ submit_bio(bio);
+}
+
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
@@ -132,12 +153,18 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
if (!blkg)
return NULL;
+ if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
+ goto err_free;
+
if (blkg_rwstat_init(&blkg->stat_bytes, gfp_mask) ||
blkg_rwstat_init(&blkg->stat_ios, gfp_mask))
goto err_free;
blkg->q = q;
INIT_LIST_HEAD(&blkg->q_node);
+ spin_lock_init(&blkg->async_bio_lock);
+ bio_list_init(&blkg->async_bios);
+ INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
blkg->blkcg = blkcg;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
@@ -148,7 +175,7 @@ static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
continue;
/* alloc per-policy data and attach it to blkg */
- pd = pol->pd_alloc_fn(gfp_mask, q->node);
+ pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
if (!pd)
goto err_free;
@@ -244,11 +271,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_get(blkg->parent);
}
- ret = percpu_ref_init(&blkg->refcnt, blkg_release, 0,
- GFP_NOWAIT | __GFP_NOWARN);
- if (ret)
- goto err_cancel_ref;
-
/* invoke per-policy init */
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -281,8 +303,6 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_cancel_ref:
- percpu_ref_exit(&blkg->refcnt);
err_put_congested:
wb_congested_put(wb_congested);
err_put_css:
@@ -549,7 +569,7 @@ EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
 * Print @rwstat to @sf for the device associated with @pd.
*/
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
- const struct blkg_rwstat *rwstat)
+ const struct blkg_rwstat_sample *rwstat)
{
static const char *rwstr[] = {
[BLKG_RWSTAT_READ] = "Read",
@@ -567,31 +587,17 @@ u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
for (i = 0; i < BLKG_RWSTAT_NR; i++)
seq_printf(sf, "%s %s %llu\n", dname, rwstr[i],
- (unsigned long long)atomic64_read(&rwstat->aux_cnt[i]));
+ rwstat->cnt[i]);
- v = atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_READ]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_WRITE]) +
- atomic64_read(&rwstat->aux_cnt[BLKG_RWSTAT_DISCARD]);
- seq_printf(sf, "%s Total %llu\n", dname, (unsigned long long)v);
+ v = rwstat->cnt[BLKG_RWSTAT_READ] +
+ rwstat->cnt[BLKG_RWSTAT_WRITE] +
+ rwstat->cnt[BLKG_RWSTAT_DISCARD];
+ seq_printf(sf, "%s Total %llu\n", dname, v);
return v;
}
EXPORT_SYMBOL_GPL(__blkg_prfill_rwstat);
/**
- * blkg_prfill_stat - prfill callback for blkg_stat
- * @sf: seq_file to print to
- * @pd: policy private data of interest
- * @off: offset to the blkg_stat in @pd
- *
- * prfill callback for printing a blkg_stat.
- */
-u64 blkg_prfill_stat(struct seq_file *sf, struct blkg_policy_data *pd, int off)
-{
- return __blkg_prfill_u64(sf, pd, blkg_stat_read((void *)pd + off));
-}
-EXPORT_SYMBOL_GPL(blkg_prfill_stat);
-
-/**
* blkg_prfill_rwstat - prfill callback for blkg_rwstat
* @sf: seq_file to print to
* @pd: policy private data of interest
@@ -602,8 +608,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_stat);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd + off);
+ struct blkg_rwstat_sample rwstat = { };
+ blkg_rwstat_read((void *)pd + off, &rwstat);
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
@@ -611,8 +618,9 @@ EXPORT_SYMBOL_GPL(blkg_prfill_rwstat);
static u64 blkg_prfill_rwstat_field(struct seq_file *sf,
struct blkg_policy_data *pd, int off)
{
- struct blkg_rwstat rwstat = blkg_rwstat_read((void *)pd->blkg + off);
+ struct blkg_rwstat_sample rwstat = { };
+ blkg_rwstat_read((void *)pd->blkg + off, &rwstat);
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
@@ -654,8 +662,9 @@ static u64 blkg_prfill_rwstat_field_recursive(struct seq_file *sf,
struct blkg_policy_data *pd,
int off)
{
- struct blkg_rwstat rwstat = blkg_rwstat_recursive_sum(pd->blkg,
- NULL, off);
+ struct blkg_rwstat_sample rwstat;
+
+ blkg_rwstat_recursive_sum(pd->blkg, NULL, off, &rwstat);
return __blkg_prfill_rwstat(sf, pd, &rwstat);
}
@@ -690,52 +699,11 @@ int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v)
EXPORT_SYMBOL_GPL(blkg_print_stat_ios_recursive);
/**
- * blkg_stat_recursive_sum - collect hierarchical blkg_stat
- * @blkg: blkg of interest
- * @pol: blkcg_policy which contains the blkg_stat
- * @off: offset to the blkg_stat in blkg_policy_data or @blkg
- *
- * Collect the blkg_stat specified by @blkg, @pol and @off and all its
- * online descendants and their aux counts. The caller must be holding the
- * queue lock for online tests.
- *
- * If @pol is NULL, blkg_stat is at @off bytes into @blkg; otherwise, it is
- * at @off bytes into @blkg's blkg_policy_data of the policy.
- */
-u64 blkg_stat_recursive_sum(struct blkcg_gq *blkg,
- struct blkcg_policy *pol, int off)
-{
- struct blkcg_gq *pos_blkg;
- struct cgroup_subsys_state *pos_css;
- u64 sum = 0;
-
- lockdep_assert_held(&blkg->q->queue_lock);
-
- rcu_read_lock();
- blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
- struct blkg_stat *stat;
-
- if (!pos_blkg->online)
- continue;
-
- if (pol)
- stat = (void *)blkg_to_pd(pos_blkg, pol) + off;
- else
- stat = (void *)blkg + off;
-
- sum += blkg_stat_read(stat) + atomic64_read(&stat->aux_cnt);
- }
- rcu_read_unlock();
-
- return sum;
-}
-EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
-
-/**
* blkg_rwstat_recursive_sum - collect hierarchical blkg_rwstat
* @blkg: blkg of interest
* @pol: blkcg_policy which contains the blkg_rwstat
* @off: offset to the blkg_rwstat in blkg_policy_data or @blkg
+ * @sum: blkg_rwstat_sample structure containing the results
*
* Collect the blkg_rwstat specified by @blkg, @pol and @off and all its
* online descendants and their aux counts. The caller must be holding the
@@ -744,13 +712,12 @@ EXPORT_SYMBOL_GPL(blkg_stat_recursive_sum);
* If @pol is NULL, blkg_rwstat is at @off bytes into @blkg; otherwise, it
* is at @off bytes into @blkg's blkg_policy_data of the policy.
*/
-struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
- struct blkcg_policy *pol, int off)
+void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
+ int off, struct blkg_rwstat_sample *sum)
{
struct blkcg_gq *pos_blkg;
struct cgroup_subsys_state *pos_css;
- struct blkg_rwstat sum = { };
- int i;
+ unsigned int i;
lockdep_assert_held(&blkg->q->queue_lock);
@@ -767,13 +734,9 @@ struct blkg_rwstat blkg_rwstat_recursive_sum(struct blkcg_gq *blkg,
rwstat = (void *)pos_blkg + off;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- atomic64_add(atomic64_read(&rwstat->aux_cnt[i]) +
- percpu_counter_sum_positive(&rwstat->cpu_cnt[i]),
- &sum.aux_cnt[i]);
+ sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
}
rcu_read_unlock();
-
- return sum;
}
EXPORT_SYMBOL_GPL(blkg_rwstat_recursive_sum);
@@ -792,6 +755,44 @@ static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
+ * @inputp: input string pointer
+ *
+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
+ * from @input and get and return the matching gendisk. *@inputp is
+ * updated to point past the device node prefix. Returns an ERR_PTR()
+ * value on error.
+ *
+ * Use this function iff blkg_conf_prep() can't be used for some reason.
+ */
+struct gendisk *blkcg_conf_get_disk(char **inputp)
+{
+ char *input = *inputp;
+ unsigned int major, minor;
+ struct gendisk *disk;
+ int key_len, part;
+
+ if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
+ return ERR_PTR(-EINVAL);
+
+ input += key_len;
+ if (!isspace(*input))
+ return ERR_PTR(-EINVAL);
+ input = skip_spaces(input);
+
+ disk = get_gendisk(MKDEV(major, minor), &part);
+ if (!disk)
+ return ERR_PTR(-ENODEV);
+ if (part) {
+ put_disk_and_module(disk);
+ return ERR_PTR(-ENODEV);
+ }
+
+ *inputp = input;
+ return disk;
+}
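A plausible caller of the new helper (hypothetical, shown only to illustrate the calling convention): parse the "MAJ:MIN" prefix, then the remaining key=value body, and drop the gendisk reference when done.

static int example_parse_per_device_conf(char *input)
{
	struct gendisk *disk;
	u64 weight;
	int ret = -EINVAL;

	disk = blkcg_conf_get_disk(&input);	/* advances input past MAJ:MIN */
	if (IS_ERR(disk))
		return PTR_ERR(disk);

	if (sscanf(input, "weight=%llu", &weight) == 1)
		ret = 0;			/* apply the weight here */

	put_disk_and_module(disk);
	return ret;
}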
+
+/**
+ * blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
* @pol: target policy
* @input: input string
@@ -809,25 +810,11 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
- unsigned int major, minor;
- int key_len, part, ret;
- char *body;
-
- if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
- return -EINVAL;
+ int ret;
- body = input + key_len;
- if (!isspace(*body))
- return -EINVAL;
- body = skip_spaces(body);
-
- disk = get_gendisk(MKDEV(major, minor), &part);
- if (!disk)
- return -ENODEV;
- if (part) {
- ret = -ENODEV;
- goto fail;
- }
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
q = disk->queue;
@@ -893,7 +880,7 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
success:
ctx->disk = disk;
ctx->blkg = blkg;
- ctx->body = body;
+ ctx->body = input;
return 0;
fail_unlock:
@@ -913,6 +900,7 @@ fail:
}
return ret;
}
+EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
* blkg_conf_finish - finish up per-blkg config update
@@ -928,6 +916,7 @@ void blkg_conf_finish(struct blkg_conf_ctx *ctx)
rcu_read_unlock();
put_disk_and_module(ctx->disk);
}
+EXPORT_SYMBOL_GPL(blkg_conf_finish);
static int blkcg_print_stat(struct seq_file *sf, void *v)
{
@@ -939,15 +928,20 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
const char *dname;
char *buf;
- struct blkg_rwstat rwstat;
+ struct blkg_rwstat_sample rwstat;
u64 rbytes, wbytes, rios, wios, dbytes, dios;
size_t size = seq_get_buf(sf, &buf), off = 0;
int i;
bool has_stats = false;
+ spin_lock_irq(&blkg->q->queue_lock);
+
+ if (!blkg->online)
+ goto skip;
+
dname = blkg_dev_name(blkg);
if (!dname)
- continue;
+ goto skip;
/*
* Hooray string manipulation, count is the size written NOT
@@ -957,21 +951,17 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
*/
off += scnprintf(buf+off, size-off, "%s ", dname);
- spin_lock_irq(&blkg->q->queue_lock);
-
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
- offsetof(struct blkcg_gq, stat_bytes));
- rbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
- wbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
- dbytes = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
+ blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_bytes), &rwstat);
+ rbytes = rwstat.cnt[BLKG_RWSTAT_READ];
+ wbytes = rwstat.cnt[BLKG_RWSTAT_WRITE];
+ dbytes = rwstat.cnt[BLKG_RWSTAT_DISCARD];
- rwstat = blkg_rwstat_recursive_sum(blkg, NULL,
- offsetof(struct blkcg_gq, stat_ios));
- rios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_READ]);
- wios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_WRITE]);
- dios = atomic64_read(&rwstat.aux_cnt[BLKG_RWSTAT_DISCARD]);
-
- spin_unlock_irq(&blkg->q->queue_lock);
+ blkg_rwstat_recursive_sum(blkg, NULL,
+ offsetof(struct blkcg_gq, stat_ios), &rwstat);
+ rios = rwstat.cnt[BLKG_RWSTAT_READ];
+ wios = rwstat.cnt[BLKG_RWSTAT_WRITE];
+ dios = rwstat.cnt[BLKG_RWSTAT_DISCARD];
if (rbytes || wbytes || rios || wios) {
has_stats = true;
@@ -981,10 +971,7 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
dbytes, dios);
}
- if (!blkcg_debug_stats)
- goto next;
-
- if (atomic_read(&blkg->use_delay)) {
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
has_stats = true;
off += scnprintf(buf+off, size-off,
" use_delay=%d delay_nsec=%llu",
@@ -1004,11 +991,17 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
has_stats = true;
off += written;
}
-next:
+
if (has_stats) {
- off += scnprintf(buf+off, size-off, "\n");
- seq_commit(sf, off);
+ if (off < size - 1) {
+ off += scnprintf(buf+off, size-off, "\n");
+ seq_commit(sf, off);
+ } else {
+ seq_commit(sf, -1);
+ }
}
+ skip:
+ spin_unlock_irq(&blkg->q->queue_lock);
}
rcu_read_unlock();
@@ -1372,7 +1365,7 @@ int blkcg_activate_policy(struct request_queue *q,
const struct blkcg_policy *pol)
{
struct blkg_policy_data *pd_prealloc = NULL;
- struct blkcg_gq *blkg;
+ struct blkcg_gq *blkg, *pinned_blkg = NULL;
int ret;
if (blkcg_policy_enabled(q, pol))
@@ -1380,48 +1373,82 @@ int blkcg_activate_policy(struct request_queue *q,
if (queue_is_mq(q))
blk_mq_freeze_queue(q);
-pd_prealloc:
- if (!pd_prealloc) {
- pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q->node);
- if (!pd_prealloc) {
- ret = -ENOMEM;
- goto out_bypass_end;
- }
- }
-
+retry:
spin_lock_irq(&q->queue_lock);
- list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ /* blkg_list is pushed at the head, reverse walk to allocate parents first */
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
if (blkg->pd[pol->plid])
continue;
- pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q->node);
- if (!pd)
- swap(pd, pd_prealloc);
+ /* If prealloc matches, use it; otherwise try GFP_NOWAIT */
+ if (blkg == pinned_blkg) {
+ pd = pd_prealloc;
+ pd_prealloc = NULL;
+ } else {
+ pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
+ blkg->blkcg);
+ }
+
if (!pd) {
+ /*
+ * GFP_NOWAIT failed. Free the existing one and
+ * prealloc for @blkg w/ GFP_KERNEL.
+ */
+ if (pinned_blkg)
+ blkg_put(pinned_blkg);
+ blkg_get(blkg);
+ pinned_blkg = blkg;
+
spin_unlock_irq(&q->queue_lock);
- goto pd_prealloc;
+
+ if (pd_prealloc)
+ pol->pd_free_fn(pd_prealloc);
+ pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
+ blkg->blkcg);
+ if (pd_prealloc)
+ goto retry;
+ else
+ goto enomem;
}
blkg->pd[pol->plid] = pd;
pd->blkg = blkg;
pd->plid = pol->plid;
- if (pol->pd_init_fn)
- pol->pd_init_fn(pd);
}
+ /* all allocated, init in the same order */
+ if (pol->pd_init_fn)
+ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
+ pol->pd_init_fn(blkg->pd[pol->plid]);
+
__set_bit(pol->plid, q->blkcg_pols);
ret = 0;
spin_unlock_irq(&q->queue_lock);
-out_bypass_end:
+out:
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q);
+ if (pinned_blkg)
+ blkg_put(pinned_blkg);
if (pd_prealloc)
pol->pd_free_fn(pd_prealloc);
return ret;
+
+enomem:
+ /* alloc failed, nothing's initialized yet, free everything */
+ spin_lock_irq(&q->queue_lock);
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ if (blkg->pd[pol->plid]) {
+ pol->pd_free_fn(blkg->pd[pol->plid]);
+ blkg->pd[pol->plid] = NULL;
+ }
+ }
+ spin_unlock_irq(&q->queue_lock);
+ ret = -ENOMEM;
+ goto out;
}
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
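A rough userspace sketch (not kernel code; names and the failure injection are purely illustrative) of the allocation pattern used above: take the non-blocking fast path while the lock is held, and when it fails, drop the lock, do a blocking preallocation pinned to the entry that failed, and retry the whole walk.

#include <stdio.h>
#include <stdlib.h>
#include <stdbool.h>

#define NR_BLKGS 4

static void *pd[NR_BLKGS];

/* stand-in for pd_alloc_fn(GFP_NOWAIT): pretend the fast path fails once */
static void *alloc_nowait(int i)
{
	static bool failed_once;

	if (i == 2 && !failed_once) {
		failed_once = true;
		return NULL;
	}
	return malloc(16);
}

int main(void)
{
	void *prealloc = NULL;
	int pinned = -1;

retry:
	/* "queue_lock held" section */
	for (int i = 0; i < NR_BLKGS; i++) {
		void *p;

		if (pd[i])
			continue;
		if (i == pinned) {
			p = prealloc;	/* use the blocking prealloc */
			prealloc = NULL;
		} else {
			p = alloc_nowait(i);
		}
		if (!p) {
			/* drop the "lock", prealloc with blocking alloc, retry */
			pinned = i;
			free(prealloc);
			prealloc = malloc(16);	/* GFP_KERNEL stand-in */
			goto retry;
		}
		pd[i] = p;
	}
	printf("all %d policy data allocated\n", NR_BLKGS);
	free(prealloc);
	return 0;
}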
@@ -1510,7 +1537,8 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
cpd->plid = pol->plid;
- pol->cpd_init_fn(cpd);
+ if (pol->cpd_init_fn)
+ pol->cpd_init_fn(cpd);
}
}
@@ -1583,6 +1611,25 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
+bool __blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ /* consume the flag first */
+ bio->bi_opf &= ~REQ_CGROUP_PUNT;
+
+ /* never bounce for the root cgroup */
+ if (!blkg->parent)
+ return false;
+
+ spin_lock_bh(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock_bh(&blkg->async_bio_lock);
+
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ return true;
+}
+
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1644,6 +1691,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
*/
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
+ unsigned long pflags;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
@@ -1670,11 +1718,8 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*/
delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
- /*
- * TODO: the use_memdelay flag is going to be for the upcoming psi stuff
- * that hasn't landed upstream yet. Once that stuff is in place we need
- * to do a psi_memstall_enter/leave if memdelay is set.
- */
+ if (use_memdelay)
+ psi_memstall_enter(&pflags);
exp = ktime_add_ns(now, delay_nsec);
tok = io_schedule_prepare();
@@ -1684,6 +1729,9 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
break;
} while (!fatal_signal_pending(current));
io_schedule_finish(tok);
+
+ if (use_memdelay)
+ psi_memstall_leave(&pflags);
}
/**
@@ -1783,5 +1831,16 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
+static int __init blkcg_init(void)
+{
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_init);
+
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..1075aaff606d 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -34,8 +34,10 @@
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
+#include <linux/psi.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -117,9 +119,47 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+ refcount_set(&rq->ref, 1);
}
EXPORT_SYMBOL(blk_rq_init);
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
+static const char *const blk_op_name[] = {
+ REQ_OP_NAME(READ),
+ REQ_OP_NAME(WRITE),
+ REQ_OP_NAME(FLUSH),
+ REQ_OP_NAME(DISCARD),
+ REQ_OP_NAME(SECURE_ERASE),
+ REQ_OP_NAME(ZONE_RESET),
+ REQ_OP_NAME(ZONE_RESET_ALL),
+ REQ_OP_NAME(WRITE_SAME),
+ REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(SCSI_IN),
+ REQ_OP_NAME(SCSI_OUT),
+ REQ_OP_NAME(DRV_IN),
+ REQ_OP_NAME(DRV_OUT),
+};
+#undef REQ_OP_NAME
+
+/**
+ * blk_op_str - Return the string XXX in REQ_OP_XXX.
+ * @op: REQ_OP_XXX.
+ *
+ * Description: Centralized block layer helper to convert a REQ_OP_XXX value
+ * into its string representation. Useful when debugging or tracing a bio or
+ * request. For an invalid REQ_OP_XXX it returns the string "UNKNOWN".
+ */
+inline const char *blk_op_str(unsigned int op)
+{
+ const char *op_str = "UNKNOWN";
+
+ if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
+ op_str = blk_op_name[op];
+
+ return op_str;
+}
+EXPORT_SYMBOL_GPL(blk_op_str);
+
static const struct {
int errno;
const char *name;
@@ -167,18 +207,23 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-static void print_req_error(struct request *req, blk_status_t status)
+static void print_req_error(struct request *req, blk_status_t status,
+ const char *caller)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return;
- printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
- __func__, blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- (unsigned long long)blk_rq_pos(req),
- req->cmd_flags);
+ printk_ratelimited(KERN_ERR
+ "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ caller, blk_errors[idx].name,
+ req->rq_disk ? req->rq_disk->disk_name : "?",
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+ req->cmd_flags & ~REQ_OP_MASK,
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
}
static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -302,7 +347,8 @@ void blk_cleanup_queue(struct request_queue *q)
/*
* Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that q->request_fn() gets invoked after draining finished.
+ * prevent that blk_mq_run_hw_queues() accesses the hardware queues
+ * after draining finished.
*/
blk_freeze_queue(q);
@@ -437,7 +483,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (!q)
return NULL;
- INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
@@ -476,6 +521,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->blk_trace_mutex);
#endif
mutex_init(&q->sysfs_lock);
+ mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
@@ -550,15 +596,16 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_back_merge_fn(q, req, bio))
+ if (!ll_back_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_backmerge(q, req, bio);
+ trace_block_bio_backmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -571,15 +618,16 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
return true;
}
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_front_merge_fn(q, req, bio))
+ if (!ll_front_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_frontmerge(q, req, bio);
+ trace_block_bio_frontmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -605,6 +653,8 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
+ rq_qos_merge(q, req, bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
@@ -621,6 +671,7 @@ no_merge:
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
@@ -639,13 +690,13 @@ no_merge:
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- struct request **same_queue_rq)
+ unsigned int nr_segs, struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (!plug)
return false;
@@ -668,10 +719,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(q, rq, bio);
+ merged = bio_attempt_back_merge(rq, bio, nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(q, rq, bio);
+ merged = bio_attempt_front_merge(rq, bio, nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
@@ -687,18 +738,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
return false;
}
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
-{
- if (bio->bi_opf & REQ_RAHEAD)
- req->cmd_flags |= REQ_FAILFAST_MASK;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->ioprio = bio_prio(bio);
- req->write_hint = bio->bi_write_hint;
- blk_rq_bio_prep(req->q, req, bio);
-}
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
-
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
@@ -847,11 +886,14 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * For a REQ_NOWAIT based request, return -EOPNOTSUPP
- * if queue is not a request based queue.
+ * Non-mq queues do not honor REQ_NOWAIT, so complete a bio
+ * with BLK_STS_AGAIN status in order to catch -EAGAIN and
+ * to give a chance to the caller to repeat request gracefully.
*/
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
- goto not_supported;
+ if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) {
+ status = BLK_STS_AGAIN;
+ goto end_io;
+ }
if (should_fail_bio(bio))
goto end_io;
@@ -900,6 +942,10 @@ generic_make_request_checks(struct bio *bio)
if (!blk_queue_is_zoned(q))
goto not_supported;
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ goto not_supported;
+ break;
case REQ_OP_WRITE_ZEROES:
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
@@ -1097,6 +1143,13 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*/
blk_qc_t submit_bio(struct bio *bio)
{
+ bool workingset_read = false;
+ unsigned long pflags;
+ blk_qc_t ret;
+
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
@@ -1112,6 +1165,8 @@ blk_qc_t submit_bio(struct bio *bio)
if (op_is_write(bio_op(bio))) {
count_vm_events(PGPGOUT, count);
} else {
+ if (bio_flagged(bio, BIO_WORKINGSET))
+ workingset_read = true;
task_io_account_read(bio->bi_iter.bi_size);
count_vm_events(PGPGIN, count);
}
@@ -1126,7 +1181,21 @@ blk_qc_t submit_bio(struct bio *bio)
}
}
- return generic_make_request(bio);
+ /*
+ * If we're reading data that is part of the userspace
+ * workingset, count submission time as memory stall. When the
+ * device is congested, or the submitting cgroup IO-throttled,
+ * submission can be a significant part of overall IO time.
+ */
+ if (workingset_read)
+ psi_memstall_enter(&pflags);
+
+ ret = generic_make_request(bio);
+
+ if (workingset_read)
+ psi_memstall_leave(&pflags);
+
+ return ret;
}
EXPORT_SYMBOL(submit_bio);
@@ -1163,7 +1232,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
* Recalculate it to check the request correctly on this queue's
* limitation.
*/
- blk_recalc_rq_segments(rq);
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
@@ -1348,7 +1417,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
*
* This special helper function is only for request stacking drivers
* (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_end_request instead.
+ * Actual device drivers should use blk_mq_end_request instead.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
@@ -1371,9 +1440,15 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (!req->bio)
return false;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ error == BLK_STS_OK)
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)))
- print_req_error(req, error);
+ print_req_error(req, error, __func__);
blk_account_io_completion(req, nr_bytes);
@@ -1432,28 +1507,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
}
/* recalculate the number of segments */
- blk_recalc_rq_segments(req);
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
- struct bio *bio)
-{
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- rq->nr_phys_segments = 1;
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
-
- if (bio->bi_disk)
- rq->rq_disk = bio->bi_disk;
-}
-
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
diff --git a/block/blk-flush.c b/block/blk-flush.c
index aedd9320e605..b1f0a1ac505c 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -69,6 +69,7 @@
#include <linux/blkdev.h>
#include <linux/gfp.h>
#include <linux/blk-mq.h>
+#include <linux/lockdep.h>
#include "blk.h"
#include "blk-mq.h"
@@ -214,6 +215,16 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
+
+ if (!refcount_dec_and_test(&flush_rq->ref)) {
+ fq->rq_status = error;
+ spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+ return;
+ }
+
+ if (fq->rq_status != BLK_STS_OK)
+ error = fq->rq_status;
+
hctx = flush_rq->mq_hctx;
if (!q->elevator) {
blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
@@ -482,6 +493,9 @@ struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
INIT_LIST_HEAD(&fq->flush_queue[1]);
INIT_LIST_HEAD(&fq->flush_data_in_flight);
+ lockdep_register_key(&fq->key);
+ lockdep_set_class(&fq->mq_flush_lock, &fq->key);
+
return fq;
fail_rq:
@@ -496,6 +510,7 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
if (!fq)
return;
+ lockdep_unregister_key(&fq->key);
kfree(fq->flush_rq);
kfree(fq);
}
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index 825c9c070458..ff1070edbb40 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -368,10 +368,21 @@ static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
return BLK_STS_OK;
}
+static void blk_integrity_nop_prepare(struct request *rq)
+{
+}
+
+static void blk_integrity_nop_complete(struct request *rq,
+ unsigned int nr_bytes)
+{
+}
+
static const struct blk_integrity_profile nop_profile = {
.name = "nop",
.generate_fn = blk_integrity_nop_fn,
.verify_fn = blk_integrity_nop_fn,
+ .prepare_fn = blk_integrity_nop_prepare,
+ .complete_fn = blk_integrity_nop_complete,
};
/**
@@ -383,7 +394,7 @@ static const struct blk_integrity_profile nop_profile = {
* send/receive integrity metadata it must use this function to register
* the capability with the block layer. The template is a blk_integrity
* struct with values appropriate for the underlying hardware. See
- * Documentation/block/data-integrity.txt.
+ * Documentation/block/data-integrity.rst.
*/
void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template)
{
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
new file mode 100644
index 000000000000..27ca68621137
--- /dev/null
+++ b/block/blk-iocost.c
@@ -0,0 +1,2472 @@
+/* SPDX-License-Identifier: GPL-2.0
+ *
+ * IO cost model based controller.
+ *
+ * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
+ * Copyright (C) 2019 Andy Newell <newella@fb.com>
+ * Copyright (C) 2019 Facebook
+ *
+ * One challenge of controlling IO resources is the lack of trivially
+ * observable cost metric. This is distinguished from CPU and memory where
+ * wallclock time and the number of bytes can serve as accurate enough
+ * approximations.
+ *
+ * Bandwidth and iops are the most commonly used metrics for IO devices but
+ * depending on the type and specifics of the device, different IO patterns
+ * easily lead to multiple orders of magnitude variations rendering them
+ * useless for the purpose of IO capacity distribution. While on-device
+ * time, with a lot of crutches, could serve as a useful approximation for
+ * non-queued rotational devices, this is no longer viable with modern
+ * devices, even the rotational ones.
+ *
+ * While there is no cost metric we can trivially observe, it isn't a
+ * complete mystery. For example, on a rotational device, seek cost
+ * dominates while a contiguous transfer contributes a smaller amount
+ * proportional to the size. If we can characterize at least the relative
+ * costs of these different types of IOs, it should be possible to
+ * implement a reasonable work-conserving proportional IO resource
+ * distribution.
+ *
+ * 1. IO Cost Model
+ *
+ * IO cost model estimates the cost of an IO given its basic parameters and
+ * history (e.g. the end sector of the last IO). The cost is measured in
+ * device time. If a given IO is estimated to cost 10ms, the device should
+ * be able to process ~100 of those IOs in a second.
+ *
+ * Currently, there's only one builtin cost model - linear. Each IO is
+ * classified as sequential or random and given a base cost accordingly.
+ * On top of that, a size cost proportional to the length of the IO is
+ * added. While simple, this model captures the operational
+ * characteristics of a wide variety of devices well enough. Default
+ * parameters for several different classes of devices are provided and the
+ * parameters can be configured from userspace via
+ * /sys/fs/cgroup/io.cost.model.
+ *
+ * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
+ * device-specific coefficients.
+ *
+ * 2. Control Strategy
+ *
+ * The device virtual time (vtime) is used as the primary control metric.
+ * The control strategy is composed of the following three parts.
+ *
+ * 2-1. Vtime Distribution
+ *
+ * When a cgroup becomes active in terms of IOs, its hierarchical share is
+ * calculated. Please consider the following hierarchy where the numbers
+ * inside parentheses denote the configured weights.
+ *
+ * root
+ * / \
+ * A (w:100) B (w:300)
+ * / \
+ * A0 (w:100) A1 (w:100)
+ *
+ * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
+ * of equal weight, each gets 50% share. If then B starts issuing IOs, B
+ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
+ * 12.5% each. The distribution mechanism only cares about these flattened
+ * shares. They're called hweights (hierarchical weights) and always add
+ * up to 1 (HWEIGHT_WHOLE).
+ *
+ * A given cgroup's vtime runs slower in inverse proportion to its hweight.
+ * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
+ * against the device vtime - an IO which takes 10ms on the underlying
+ * device is considered to take 80ms on A0.
+ *
+ * This constitutes the basis of IO capacity distribution. Each cgroup's
+ * vtime is running at a rate determined by its hweight. A cgroup tracks
+ * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * wouldn't outrun the current device vtime. Otherwise, the IO is
+ * suspended until the vtime has progressed enough to cover it.
+ *
+ * 2-2. Vrate Adjustment
+ *
+ * It's unrealistic to expect the cost model to be perfect. There are too
+ * many devices and even on the same device the overall performance
+ * fluctuates depending on numerous factors such as IO mixture and device
+ * internal garbage collection. The controller needs to adapt dynamically.
+ *
+ * This is achieved by adjusting the overall IO rate according to how busy
+ * the device is. If the device becomes overloaded, we're sending down too
+ * many IOs and should generally slow down. If there are waiting issuers
+ * but the device isn't saturated, we're issuing too few and should
+ * generally speed up.
+ *
+ * To slow down, we lower the vrate - the rate at which the device vtime
+ * passes compared to the wall clock. For example, if the vtime is running
+ * at the vrate of 75%, all cgroups added up would only be able to issue
+ * 750ms worth of IOs per second, and vice-versa for speeding up.
+ *
+ * Device busyness is determined using two criteria - rq wait and
+ * completion latencies.
+ *
+ * When a device gets saturated, the on-device and then the request queues
+ * fill up and a bio which is ready to be issued has to wait for a request
+ * to become available. When this delay becomes noticeable, it's a clear
+ * indication that the device is saturated and we lower the vrate. This
+ * saturation signal is fairly conservative as it only triggers when both
+ * hardware and software queues are filled up, and is used as the default
+ * busy signal.
+ *
+ * As devices can have deep queues and be unfair in how the queued commands
+ * are executed, solely depending on rq wait may not result in satisfactory
+ * control quality. For a better control quality, completion latency QoS
+ * parameters can be configured so that the device is considered saturated
+ * if N'th percentile completion latency rises above the set point.
+ *
+ * The completion latency requirements are a function of both the
+ * underlying device characteristics and the desired IO latency quality of
+ * service. There is an inherent trade-off - the tighter the latency QoS,
+ * the higher the bandwidth lossage. Latency QoS is disabled by default
+ * and can be set through /sys/fs/cgroup/io.cost.qos.
+ *
+ * 2-3. Work Conservation
+ *
+ * Imagine two cgroups A and B with equal weights. A is issuing a small IO
+ * periodically while B is sending out enough parallel IOs to saturate the
+ * device on its own. Let's say A's usage amounts to 100ms worth of IO
+ * cost per second, i.e., 10% of the device capacity. The naive
+ * distribution of half and half would lead to 60% utilization of the
+ * device, a significant reduction in the total amount of work done
+ * compared to free-for-all competition. This is too high a cost to pay
+ * for IO control.
+ *
+ * To conserve the total amount of work done, we keep track of how much
+ * each active cgroup is actually using and yield part of its weight if
+ * there are other cgroups which can make use of it. In the above case,
+ * A's weight will be lowered so that it hovers above the actual usage and
+ * B would be able to use the rest.
+ *
+ * As we don't want to penalize a cgroup for donating its weight, the
+ * surplus weight adjustment factors in a margin and has an immediate
+ * snapback mechanism in case the cgroup needs more IO vtime for itself.
+ *
+ * Note that adjusting down surplus weights has the same effects as
+ * accelerating vtime for other cgroups and work conservation can also be
+ * implemented by adjusting vrate dynamically. However, squaring who can
+ * donate and should take back how much requires hweight propagations
+ * anyway making it easier to implement and understand as a separate
+ * mechanism.
+ *
+ * 3. Monitoring
+ *
+ * Instead of debugfs or other clumsy monitoring mechanisms, this
+ * controller uses a drgn based monitoring script -
+ * tools/cgroup/iocost_monitor.py. For details on drgn, please see
+ * https://github.com/osandov/drgn. The output looks like the following.
+ *
+ * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
+ * active weight hweight% inflt% dbt delay usages%
+ * test/a * 50/ 50 33.33/ 33.33 27.65 2 0*041 033:033:033
+ * test/b * 100/ 100 66.67/ 66.67 17.56 0 0*000 066:079:077
+ *
+ * - per : Timer period
+ * - cur_per : Internal wall and device vtime clock
+ * - vrate : Device virtual time rate against wall clock
+ * - weight : Surplus-adjusted and configured weights
+ * - hweight : Surplus-adjusted and configured hierarchical weights
+ * - inflt : The percentage of in-flight IO cost at the end of last period
+ * - del_ms : Deferred issuer delay induction level and duration
+ * - usages : Usage history
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/timer.h>
+#include <linux/time64.h>
+#include <linux/parser.h>
+#include <linux/sched/signal.h>
+#include <linux/blk-cgroup.h>
+#include "blk-rq-qos.h"
+#include "blk-stat.h"
+#include "blk-wbt.h"
+
+#ifdef CONFIG_TRACEPOINTS
+
+/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
+#define TRACE_IOCG_PATH_LEN 1024
+static DEFINE_SPINLOCK(trace_iocg_path_lock);
+static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
+
+#define TRACE_IOCG_PATH(type, iocg, ...) \
+ do { \
+ unsigned long flags; \
+ if (trace_iocost_##type##_enabled()) { \
+ spin_lock_irqsave(&trace_iocg_path_lock, flags); \
+ cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
+ trace_iocg_path, TRACE_IOCG_PATH_LEN); \
+ trace_iocost_##type(iocg, trace_iocg_path, \
+ ##__VA_ARGS__); \
+ spin_unlock_irqrestore(&trace_iocg_path_lock, flags); \
+ } \
+ } while (0)
+
+#else /* CONFIG_TRACEPOINTS */
+#define TRACE_IOCG_PATH(type, iocg, ...) do { } while (0)
+#endif /* CONFIG_TRACEPOINTS */
+
+enum {
+ MILLION = 1000000,
+
+ /* timer period is calculated from latency requirements, bound it */
+ MIN_PERIOD = USEC_PER_MSEC,
+ MAX_PERIOD = USEC_PER_SEC,
+
+ /*
+ * A cgroup's vtime can run 50% behind the device vtime, which
+ * serves as its IO credit buffer. Surplus weight adjustment is
+ * immediately canceled if the vtime margin runs below 10%.
+ */
+ MARGIN_PCT = 50,
+ INUSE_MARGIN_PCT = 10,
+
+ /* Have some play in waitq timer operations */
+ WAITQ_TIMER_MARGIN_PCT = 5,
+
+ /*
+ * vtime can wrap well within a reasonable uptime when vrate is
+ * consistently raised. Don't trust recorded cgroup vtime if the
+ * period counter indicates that it's older than 5mins.
+ */
+ VTIME_VALID_DUR = 300 * USEC_PER_SEC,
+
+ /*
+ * Remember the past three non-zero usages and use the max for
+ * surplus calculation. Three slots guarantee that we remember one
+ * full period usage from the last active stretch even after
+ * partial deactivation and re-activation periods. Don't start
+ * giving away weight before collecting two data points to prevent
+ * hweight adjustments based on one partial activation period.
+ */
+ NR_USAGE_SLOTS = 3,
+ MIN_VALID_USAGES = 2,
+
+ /* 1/64k is granular enough and can easily be handled w/ u32 */
+ HWEIGHT_WHOLE = 1 << 16,
+
+ /*
+ * As vtime is used to calculate the cost of each IO, it needs to
+ * be fairly high precision. For example, it should be able to
+ * represent the cost of a single page worth of discard with
+ * sufficient accuracy. At the same time, it should be able to
+ * represent reasonably long enough durations to be useful and
+ * convenient during operation.
+ *
+ * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
+ * granularity and days of wrap-around time even at extreme vrates.
+ */
+ VTIME_PER_SEC_SHIFT = 37,
+ VTIME_PER_SEC = 1LLU << VTIME_PER_SEC_SHIFT,
+ VTIME_PER_USEC = VTIME_PER_SEC / USEC_PER_SEC,
+
+ /* bound vrate adjustments within two orders of magnitude */
+ VRATE_MIN_PPM = 10000, /* 1% */
+ VRATE_MAX_PPM = 100000000, /* 10000% */
+
+ VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
+ VRATE_CLAMP_ADJ_PCT = 4,
+
+ /* if IOs end up waiting for requests, issue less */
+ RQ_WAIT_BUSY_PCT = 5,
+
+ /* unbusy hysteresis */
+ UNBUSY_THR_PCT = 75,
+
+ /* don't let cmds which take a very long time pin lagging for too long */
+ MAX_LAGGING_PERIODS = 10,
+
+ /*
+ * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
+ * donate the surplus.
+ */
+ SURPLUS_SCALE_PCT = 125, /* * 125% */
+ SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
+ SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
+
+ /* switch iff the conditions are met for longer than this */
+ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
+
+ /*
+ * Count IO size in 4k pages. The 12bit shift helps keeping
+ * size-proportional components of cost calculation in closer
+ * numbers of digits to per-IO cost components.
+ */
+ IOC_PAGE_SHIFT = 12,
+ IOC_PAGE_SIZE = 1 << IOC_PAGE_SHIFT,
+ IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
+
+ /* if apart further than 16M, consider randio for linear model */
+ LCOEF_RANDIO_PAGES = 4096,
+};
+
+enum ioc_running {
+ IOC_IDLE,
+ IOC_RUNNING,
+ IOC_STOP,
+};
+
+/* io.cost.qos controls including per-dev enable of the whole controller */
+enum {
+ QOS_ENABLE,
+ QOS_CTRL,
+ NR_QOS_CTRL_PARAMS,
+};
+
+/* io.cost.qos params */
+enum {
+ QOS_RPPM,
+ QOS_RLAT,
+ QOS_WPPM,
+ QOS_WLAT,
+ QOS_MIN,
+ QOS_MAX,
+ NR_QOS_PARAMS,
+};
+
+/* io.cost.model controls */
+enum {
+ COST_CTRL,
+ COST_MODEL,
+ NR_COST_CTRL_PARAMS,
+};
+
+/* builtin linear cost model coefficients */
+enum {
+ I_LCOEF_RBPS,
+ I_LCOEF_RSEQIOPS,
+ I_LCOEF_RRANDIOPS,
+ I_LCOEF_WBPS,
+ I_LCOEF_WSEQIOPS,
+ I_LCOEF_WRANDIOPS,
+ NR_I_LCOEFS,
+};
+
+enum {
+ LCOEF_RPAGE,
+ LCOEF_RSEQIO,
+ LCOEF_RRANDIO,
+ LCOEF_WPAGE,
+ LCOEF_WSEQIO,
+ LCOEF_WRANDIO,
+ NR_LCOEFS,
+};
+
+enum {
+ AUTOP_INVALID,
+ AUTOP_HDD,
+ AUTOP_SSD_QD1,
+ AUTOP_SSD_DFL,
+ AUTOP_SSD_FAST,
+};
+
+struct ioc_gq;
+
+struct ioc_params {
+ u32 qos[NR_QOS_PARAMS];
+ u64 i_lcoefs[NR_I_LCOEFS];
+ u64 lcoefs[NR_LCOEFS];
+ u32 too_fast_vrate_pct;
+ u32 too_slow_vrate_pct;
+};
+
+struct ioc_missed {
+ u32 nr_met;
+ u32 nr_missed;
+ u32 last_met;
+ u32 last_missed;
+};
+
+struct ioc_pcpu_stat {
+ struct ioc_missed missed[2];
+
+ u64 rq_wait_ns;
+ u64 last_rq_wait_ns;
+};
+
+/* per device */
+struct ioc {
+ struct rq_qos rqos;
+
+ bool enabled;
+
+ struct ioc_params params;
+ u32 period_us;
+ u32 margin_us;
+ u64 vrate_min;
+ u64 vrate_max;
+
+ spinlock_t lock;
+ struct timer_list timer;
+ struct list_head active_iocgs; /* active cgroups */
+ struct ioc_pcpu_stat __percpu *pcpu_stat;
+
+ enum ioc_running running;
+ atomic64_t vtime_rate;
+
+ seqcount_t period_seqcount;
+ u32 period_at; /* wallclock starttime */
+ u64 period_at_vtime; /* vtime starttime */
+
+ atomic64_t cur_period; /* inc'd each period */
+ int busy_level; /* saturation history */
+
+ u64 inuse_margin_vtime;
+ bool weights_updated;
+ atomic_t hweight_gen; /* for lazy hweights */
+
+ u64 autop_too_fast_at;
+ u64 autop_too_slow_at;
+ int autop_idx;
+ bool user_qos_params:1;
+ bool user_cost_model:1;
+};
+
+/* per device-cgroup pair */
+struct ioc_gq {
+ struct blkg_policy_data pd;
+ struct ioc *ioc;
+
+ /*
+ * An iocg can get its weight from two sources - an explicit
+ * per-device-cgroup configuration or the default weight of the
+ * cgroup. `cfg_weight` is the explicit per-device-cgroup
+ * configuration. `weight` is the effective considering both
+ * sources.
+ *
+ * When an idle cgroup becomes active its `active` goes from 0 to
+ * `weight`. `inuse` is the surplus adjusted active weight.
+ * `active` and `inuse` are used to calculate `hweight_active` and
+ * `hweight_inuse`.
+ *
+ * `last_inuse` remembers `inuse` while an iocg is idle to persist
+ * surplus adjustments.
+ */
+ u32 cfg_weight;
+ u32 weight;
+ u32 active;
+ u32 inuse;
+ u32 last_inuse;
+
+ sector_t cursor; /* to detect randio */
+
+ /*
+ * `vtime` is this iocg's vtime cursor which progresses as IOs are
+ * issued. If lagging behind device vtime, the delta represents
+ * the currently available IO budget. If running ahead, the
+ * overage.
+ *
+ * `vtime_done` is the same but progressed on completion rather
+ * than issue. The delta behind `vtime` represents the cost of
+ * currently in-flight IOs.
+ *
+ * `last_vtime` is used to remember `vtime` at the end of the last
+ * period to calculate utilization.
+ */
+ atomic64_t vtime;
+ atomic64_t done_vtime;
+ atomic64_t abs_vdebt;
+ u64 last_vtime;
+
+ /*
+ * The period this iocg was last active in. Used for deactivation
+ * and invalidating `vtime`.
+ */
+ atomic64_t active_period;
+ struct list_head active_list;
+
+ /* see __propagate_active_weight() and current_hweight() for details */
+ u64 child_active_sum;
+ u64 child_inuse_sum;
+ int hweight_gen;
+ u32 hweight_active;
+ u32 hweight_inuse;
+ bool has_surplus;
+
+ struct wait_queue_head waitq;
+ struct hrtimer waitq_timer;
+ struct hrtimer delay_timer;
+
+ /* usage is recorded as fractions of HWEIGHT_WHOLE */
+ int usage_idx;
+ u32 usages[NR_USAGE_SLOTS];
+
+ /* this iocg's depth in the hierarchy and ancestors including self */
+ int level;
+ struct ioc_gq *ancestors[];
+};
+
+/* per cgroup */
+struct ioc_cgrp {
+ struct blkcg_policy_data cpd;
+ unsigned int dfl_weight;
+};
+
+struct ioc_now {
+ u64 now_ns;
+ u32 now;
+ u64 vnow;
+ u64 vrate;
+};
+
+struct iocg_wait {
+ struct wait_queue_entry wait;
+ struct bio *bio;
+ u64 abs_cost;
+ bool committed;
+};
+
+struct iocg_wake_ctx {
+ struct ioc_gq *iocg;
+ u32 hw_inuse;
+ s64 vbudget;
+};
+
+static const struct ioc_params autop[] = {
+ [AUTOP_HDD] = {
+ .qos = {
+ [QOS_RLAT] = 250000, /* 250ms */
+ [QOS_WLAT] = 250000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 174019176,
+ [I_LCOEF_RSEQIOPS] = 41708,
+ [I_LCOEF_RRANDIOPS] = 370,
+ [I_LCOEF_WBPS] = 178075866,
+ [I_LCOEF_WSEQIOPS] = 42705,
+ [I_LCOEF_WRANDIOPS] = 378,
+ },
+ },
+ [AUTOP_SSD_QD1] = {
+ .qos = {
+ [QOS_RLAT] = 25000, /* 25ms */
+ [QOS_WLAT] = 25000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 245855193,
+ [I_LCOEF_RSEQIOPS] = 61575,
+ [I_LCOEF_RRANDIOPS] = 6946,
+ [I_LCOEF_WBPS] = 141365009,
+ [I_LCOEF_WSEQIOPS] = 33716,
+ [I_LCOEF_WRANDIOPS] = 26796,
+ },
+ },
+ [AUTOP_SSD_DFL] = {
+ .qos = {
+ [QOS_RLAT] = 25000, /* 25ms */
+ [QOS_WLAT] = 25000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 488636629,
+ [I_LCOEF_RSEQIOPS] = 8932,
+ [I_LCOEF_RRANDIOPS] = 8518,
+ [I_LCOEF_WBPS] = 427891549,
+ [I_LCOEF_WSEQIOPS] = 28755,
+ [I_LCOEF_WRANDIOPS] = 21940,
+ },
+ .too_fast_vrate_pct = 500,
+ },
+ [AUTOP_SSD_FAST] = {
+ .qos = {
+ [QOS_RLAT] = 5000, /* 5ms */
+ [QOS_WLAT] = 5000,
+ [QOS_MIN] = VRATE_MIN_PPM,
+ [QOS_MAX] = VRATE_MAX_PPM,
+ },
+ .i_lcoefs = {
+ [I_LCOEF_RBPS] = 3102524156LLU,
+ [I_LCOEF_RSEQIOPS] = 724816,
+ [I_LCOEF_RRANDIOPS] = 778122,
+ [I_LCOEF_WBPS] = 1742780862LLU,
+ [I_LCOEF_WSEQIOPS] = 425702,
+ [I_LCOEF_WRANDIOPS] = 443193,
+ },
+ .too_slow_vrate_pct = 10,
+ },
+};
+
+/*
+ * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
+ * vtime credit shortage and down on device saturation.
+ */
+static u32 vrate_adj_pct[] =
+ { 0, 0, 0, 0,
+ 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
+ 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
+ 4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
+
+static struct blkcg_policy blkcg_policy_iocost;
+
+/* accessors and helpers */
+static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct ioc, rqos);
+}
+
+static struct ioc *q_to_ioc(struct request_queue *q)
+{
+ return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
+}
+
+static const char *q_name(struct request_queue *q)
+{
+ if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ return kobject_name(q->kobj.parent);
+ else
+ return "<unknown>";
+}
+
+static const char __maybe_unused *ioc_name(struct ioc *ioc)
+{
+ return q_name(ioc->rqos.q);
+}
+
+static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
+}
+
+static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
+{
+ return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
+}
+
+static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
+{
+ return pd_to_blkg(&iocg->pd);
+}
+
+static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
+{
+ return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
+ struct ioc_cgrp, cpd);
+}
+
+/*
+ * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
+ * weight, the more expensive each IO. Must round up.
+ */
+static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
+{
+ return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
+}
+
+/*
+ * The inverse of abs_cost_to_cost(). Must round up.
+ */
+static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
+{
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+}
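As a quick sanity check of this scaling, a minimal userspace sketch (not kernel code; HWEIGHT_WHOLE and the round-up division mirror the definitions above): a cgroup whose hw_inuse is 12.5% of HWEIGHT_WHOLE is charged 80 units of vtime for an IO with an absolute cost of 10, matching the 100/12.5 example in the header comment.

#include <stdio.h>
#include <stdint.h>

#define HWEIGHT_WHOLE (1u << 16)

/* same rounding behaviour as DIV64_U64_ROUND_UP */
static uint64_t div_round_up(uint64_t a, uint64_t b)
{
	return (a + b - 1) / b;
}

static uint64_t abs_cost_to_cost(uint64_t abs_cost, uint32_t hw_inuse)
{
	return div_round_up(abs_cost * HWEIGHT_WHOLE, hw_inuse);
}

int main(void)
{
	/* 12.5% hierarchical share -> each IO is 8x as expensive */
	printf("%llu\n", (unsigned long long)
	       abs_cost_to_cost(10, HWEIGHT_WHOLE / 8));	/* prints 80 */
	return 0;
}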
+
+static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
+{
+ bio->bi_iocost_cost = cost;
+ atomic64_add(cost, &iocg->vtime);
+}
+
+#define CREATE_TRACE_POINTS
+#include <trace/events/iocost.h>
+
+/* latency Qos params changed, update period_us and all the dependent params */
+static void ioc_refresh_period_us(struct ioc *ioc)
+{
+ u32 ppm, lat, multi, period_us;
+
+ lockdep_assert_held(&ioc->lock);
+
+ /* pick the higher latency target */
+ if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
+ ppm = ioc->params.qos[QOS_RPPM];
+ lat = ioc->params.qos[QOS_RLAT];
+ } else {
+ ppm = ioc->params.qos[QOS_WPPM];
+ lat = ioc->params.qos[QOS_WLAT];
+ }
+
+ /*
+ * We want the period to be long enough to contain a healthy number
+ * of IOs while short enough for granular control. Define it as a
+ * multiple of the latency target. Ideally, the multiplier should
+ * be scaled according to the percentile so that it would nominally
+ * contain a certain number of requests. Let's be simpler and
+ * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
+ */
+ if (ppm)
+ multi = max_t(u32, (MILLION - ppm) / 50000, 2);
+ else
+ multi = 2;
+ period_us = multi * lat;
+ period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
+
+ /* calculate dependent params */
+ ioc->period_us = period_us;
+ ioc->margin_us = period_us * MARGIN_PCT / 100;
+ ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
+ period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
+}
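As a worked example of the formula above (values invented for illustration), a small userspace sketch: a 25ms latency target at the 95th percentile gives multi = max((1000000 - 950000) / 50000, 2) = 2, hence a 50ms period and a 25ms (50%) margin.

#include <stdio.h>
#include <stdint.h>

#define MILLION		1000000u
#define MIN_PERIOD	1000u		/* 1ms, in usecs */
#define MAX_PERIOD	1000000u	/* 1s, in usecs */
#define MARGIN_PCT	50u

static uint32_t clamp_u32(uint32_t v, uint32_t lo, uint32_t hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

int main(void)
{
	uint32_t ppm = 950000;	/* 95th percentile target */
	uint32_t lat = 25000;	/* 25ms latency target, in usecs */
	uint32_t multi = ppm ? (MILLION - ppm) / 50000 : 2;

	if (multi < 2)
		multi = 2;

	uint32_t period_us = clamp_u32(multi * lat, MIN_PERIOD, MAX_PERIOD);

	/* prints period=50000us margin=25000us */
	printf("period=%uus margin=%uus\n",
	       period_us, period_us * MARGIN_PCT / 100);
	return 0;
}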
+
+static int ioc_autop_idx(struct ioc *ioc)
+{
+ int idx = ioc->autop_idx;
+ const struct ioc_params *p = &autop[idx];
+ u32 vrate_pct;
+ u64 now_ns;
+
+ /* rotational? */
+ if (!blk_queue_nonrot(ioc->rqos.q))
+ return AUTOP_HDD;
+
+ /* handle SATA SSDs w/ broken NCQ */
+ if (blk_queue_depth(ioc->rqos.q) == 1)
+ return AUTOP_SSD_QD1;
+
+ /* use one of the normal ssd sets */
+ if (idx < AUTOP_SSD_DFL)
+ return AUTOP_SSD_DFL;
+
+ /* if user is overriding anything, maintain what was there */
+ if (ioc->user_qos_params || ioc->user_cost_model)
+ return idx;
+
+ /* step up/down based on the vrate */
+ vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
+ VTIME_PER_USEC);
+ now_ns = ktime_get_ns();
+
+ if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
+ if (!ioc->autop_too_fast_at)
+ ioc->autop_too_fast_at = now_ns;
+ if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
+ return idx + 1;
+ } else {
+ ioc->autop_too_fast_at = 0;
+ }
+
+ if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
+ if (!ioc->autop_too_slow_at)
+ ioc->autop_too_slow_at = now_ns;
+ if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
+ return idx - 1;
+ } else {
+ ioc->autop_too_slow_at = 0;
+ }
+
+ return idx;
+}
+
+/*
+ * Take the following as input
+ *
+ * @bps maximum sequential throughput
+ * @seqiops maximum sequential 4k iops
+ * @randiops maximum random 4k iops
+ *
+ * and calculate the linear model cost coefficients.
+ *
+ * *@page per-page cost 1s / (@bps / 4096)
+ * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
+ * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
+ */
+static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
+ u64 *page, u64 *seqio, u64 *randio)
+{
+ u64 v;
+
+ *page = *seqio = *randio = 0;
+
+ if (bps)
+ *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
+ DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
+
+ if (seqiops) {
+ v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
+ if (v > *page)
+ *seqio = v - *page;
+ }
+
+ if (randiops) {
+ v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
+ if (v > *page)
+ *randio = v - *page;
+ }
+}
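The same arithmetic in a standalone userspace sketch, fed the AUTOP_SSD_DFL read-side parameters from the table above purely for illustration: the per-page cost is one second of vtime divided by the page rate, and each per-IO base cost is whatever remains of 1s/IOPS after subtracting the size component.

#include <stdio.h>
#include <stdint.h>

#define VTIME_PER_SEC	(1ULL << 37)
#define IOC_PAGE_SIZE	4096ULL

static uint64_t div_round_up(uint64_t a, uint64_t b)
{
	return (a + b - 1) / b;
}

static void calc_lcoefs(uint64_t bps, uint64_t seqiops, uint64_t randiops,
			uint64_t *page, uint64_t *seqio, uint64_t *randio)
{
	uint64_t v;

	*page = *seqio = *randio = 0;

	if (bps)	/* per-page cost: 1s of vtime / (bytes per sec / 4k) */
		*page = div_round_up(VTIME_PER_SEC,
				     div_round_up(bps, IOC_PAGE_SIZE));
	if (seqiops) {
		v = div_round_up(VTIME_PER_SEC, seqiops);
		if (v > *page)
			*seqio = v - *page;
	}
	if (randiops) {
		v = div_round_up(VTIME_PER_SEC, randiops);
		if (v > *page)
			*randio = v - *page;
	}
}

int main(void)
{
	uint64_t page, seqio, randio;

	/* AUTOP_SSD_DFL read-side parameters from the table above */
	calc_lcoefs(488636629, 8932, 8518, &page, &seqio, &randio);
	printf("page=%llu seqio=%llu randio=%llu\n",
	       (unsigned long long)page, (unsigned long long)seqio,
	       (unsigned long long)randio);
	return 0;
}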
+
+static void ioc_refresh_lcoefs(struct ioc *ioc)
+{
+ u64 *u = ioc->params.i_lcoefs;
+ u64 *c = ioc->params.lcoefs;
+
+ calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+ &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
+ calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
+ &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
+}
+
+static bool ioc_refresh_params(struct ioc *ioc, bool force)
+{
+ const struct ioc_params *p;
+ int idx;
+
+ lockdep_assert_held(&ioc->lock);
+
+ idx = ioc_autop_idx(ioc);
+ p = &autop[idx];
+
+ if (idx == ioc->autop_idx && !force)
+ return false;
+
+ if (idx != ioc->autop_idx)
+ atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+
+ ioc->autop_idx = idx;
+ ioc->autop_too_fast_at = 0;
+ ioc->autop_too_slow_at = 0;
+
+ if (!ioc->user_qos_params)
+ memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
+ if (!ioc->user_cost_model)
+ memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
+
+ ioc_refresh_period_us(ioc);
+ ioc_refresh_lcoefs(ioc);
+
+ ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
+ VTIME_PER_USEC, MILLION);
+ ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
+ VTIME_PER_USEC, MILLION);
+
+ return true;
+}
+
+/* take a snapshot of the current [v]time and vrate */
+static void ioc_now(struct ioc *ioc, struct ioc_now *now)
+{
+ unsigned seq;
+
+ now->now_ns = ktime_get();
+ now->now = ktime_to_us(now->now_ns);
+ now->vrate = atomic64_read(&ioc->vtime_rate);
+
+ /*
+ * The current vtime is
+ *
+ * vtime at period start + (wallclock time since the start) * vrate
+ *
+ * As a consistent snapshot of `period_at_vtime` and `period_at` is
+ * needed, they're seqcount protected.
+ */
+ do {
+ seq = read_seqcount_begin(&ioc->period_seqcount);
+ now->vnow = ioc->period_at_vtime +
+ (now->now - ioc->period_at) * now->vrate;
+ } while (read_seqcount_retry(&ioc->period_seqcount, seq));
+}
+
+static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
+{
+ lockdep_assert_held(&ioc->lock);
+ WARN_ON_ONCE(ioc->running != IOC_RUNNING);
+
+ write_seqcount_begin(&ioc->period_seqcount);
+ ioc->period_at = now->now;
+ ioc->period_at_vtime = now->vnow;
+ write_seqcount_end(&ioc->period_seqcount);
+
+ ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
+ add_timer(&ioc->timer);
+}
+
+/*
+ * Update @iocg's `active` and `inuse` to @active and @inuse, update level
+ * weight sums and propagate upwards accordingly.
+ */
+static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+{
+ struct ioc *ioc = iocg->ioc;
+ int lvl;
+
+ lockdep_assert_held(&ioc->lock);
+
+ inuse = min(active, inuse);
+
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+ u32 parent_active = 0, parent_inuse = 0;
+
+ /* update the level sums */
+ parent->child_active_sum += (s32)(active - child->active);
+ parent->child_inuse_sum += (s32)(inuse - child->inuse);
+ /* apply the updates */
+ child->active = active;
+ child->inuse = inuse;
+
+ /*
+ * The delta between inuse and active sums indicates how much
+ * weight is being given away. Parent's inuse
+ * and active should reflect the ratio.
+ */
+ if (parent->child_active_sum) {
+ parent_active = parent->weight;
+ parent_inuse = DIV64_U64_ROUND_UP(
+ parent_active * parent->child_inuse_sum,
+ parent->child_active_sum);
+ }
+
+ /* do we need to keep walking up? */
+ if (parent_active == parent->active &&
+ parent_inuse == parent->inuse)
+ break;
+
+ active = parent_active;
+ inuse = parent_inuse;
+ }
+
+ ioc->weights_updated = true;
+}
+
+static void commit_active_weights(struct ioc *ioc)
+{
+ lockdep_assert_held(&ioc->lock);
+
+ if (ioc->weights_updated) {
+ /* paired with rmb in current_hweight(), see there */
+ smp_wmb();
+ atomic_inc(&ioc->hweight_gen);
+ ioc->weights_updated = false;
+ }
+}
+
+static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+{
+ __propagate_active_weight(iocg, active, inuse);
+ commit_active_weights(iocg->ioc);
+}
+
+static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
+{
+ struct ioc *ioc = iocg->ioc;
+ int lvl;
+ u32 hwa, hwi;
+ int ioc_gen;
+
+ /* hot path - if uptodate, use cached */
+ ioc_gen = atomic_read(&ioc->hweight_gen);
+ if (ioc_gen == iocg->hweight_gen)
+ goto out;
+
+ /*
+ * Paired with wmb in commit_active_weights(). If we saw the
+ * updated hweight_gen, all the weight updates from
+ * __propagate_active_weight() are visible too.
+ *
+ * We can race with weight updates during calculation and get it
+ * wrong. However, hweight_gen would have changed and a future
+ * reader will recalculate and we're guaranteed to discard the
+ * wrong result soon.
+ */
+ smp_rmb();
+
+ hwa = hwi = HWEIGHT_WHOLE;
+ for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+ u32 active_sum = READ_ONCE(parent->child_active_sum);
+ u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
+ u32 active = READ_ONCE(child->active);
+ u32 inuse = READ_ONCE(child->inuse);
+
+ /* we can race with deactivations and either may read as zero */
+ if (!active_sum || !inuse_sum)
+ continue;
+
+ active_sum = max(active, active_sum);
+ hwa = hwa * active / active_sum; /* max 16bits * 10000 */
+
+ inuse_sum = max(inuse, inuse_sum);
+ hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
+ }
+
+ iocg->hweight_active = max_t(u32, hwa, 1);
+ iocg->hweight_inuse = max_t(u32, hwi, 1);
+ iocg->hweight_gen = ioc_gen;
+out:
+ if (hw_activep)
+ *hw_activep = iocg->hweight_active;
+ if (hw_inusep)
+ *hw_inusep = iocg->hweight_inuse;
+}
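This walk is the mechanical form of the example in the header comment. A small userspace sketch (illustrative only) of the A0 path in that hierarchy, with B also active, reproduces the 12.5% flattened share:

#include <stdio.h>

#define HWEIGHT_WHOLE (1 << 16)

int main(void)
{
	/* path root -> A -> A0 with B also active:
	 * level sums are {A:100, B:300} and {A0:100, A1:100} */
	unsigned int active[]     = { 100, 100 };	/* A, A0 */
	unsigned int active_sum[] = { 400, 200 };	/* A+B, A0+A1 */
	unsigned long long hwa = HWEIGHT_WHOLE;

	for (int lvl = 0; lvl < 2; lvl++)
		hwa = hwa * active[lvl] / active_sum[lvl];

	/* prints 12.5% */
	printf("A0 hweight = %.1f%%\n", 100.0 * hwa / HWEIGHT_WHOLE);
	return 0;
}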
+
+static void weight_updated(struct ioc_gq *iocg)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
+ u32 weight;
+
+ lockdep_assert_held(&ioc->lock);
+
+ weight = iocg->cfg_weight ?: iocc->dfl_weight;
+ if (weight != iocg->weight && iocg->active)
+ propagate_active_weight(iocg, weight,
+ DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
+ iocg->weight = weight;
+}
+
+static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 last_period, cur_period, max_period_delta;
+ u64 vtime, vmargin, vmin;
+ int i;
+
+ /*
+ * If we already seem to be active, just update the stamp to tell the
+ * timer that we're still active. We don't mind occasional races.
+ */
+ if (!list_empty(&iocg->active_list)) {
+ ioc_now(ioc, now);
+ cur_period = atomic64_read(&ioc->cur_period);
+ if (atomic64_read(&iocg->active_period) != cur_period)
+ atomic64_set(&iocg->active_period, cur_period);
+ return true;
+ }
+
+ /* racy check on internal node IOs, treat as root level IOs */
+ if (iocg->child_active_sum)
+ return false;
+
+ spin_lock_irq(&ioc->lock);
+
+ ioc_now(ioc, now);
+
+ /* update period */
+ cur_period = atomic64_read(&ioc->cur_period);
+ last_period = atomic64_read(&iocg->active_period);
+ atomic64_set(&iocg->active_period, cur_period);
+
+ /* already activated or breaking leaf-only constraint? */
+ if (!list_empty(&iocg->active_list))
+ goto succeed_unlock;
+ for (i = iocg->level - 1; i > 0; i--)
+ if (!list_empty(&iocg->ancestors[i]->active_list))
+ goto fail_unlock;
+
+ if (iocg->child_active_sum)
+ goto fail_unlock;
+
+ /*
+ * vtime may wrap when vrate is raised substantially due to
+ * underestimated IO costs. Look at the period and ignore its
+ * vtime if the iocg has been idle for too long. Also, cap the
+ * budget it can start with to the margin.
+ */
+ max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
+ vtime = atomic64_read(&iocg->vtime);
+ vmargin = ioc->margin_us * now->vrate;
+ vmin = now->vnow - vmargin;
+
+ if (last_period + max_period_delta < cur_period ||
+ time_before64(vtime, vmin)) {
+ atomic64_add(vmin - vtime, &iocg->vtime);
+ atomic64_add(vmin - vtime, &iocg->done_vtime);
+ vtime = vmin;
+ }
+
+ /*
+ * Activate, propagate weight and start period timer if not
+ * running. Reset hweight_gen to avoid accidental match from
+ * wrapping.
+ */
+ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
+ list_add(&iocg->active_list, &ioc->active_iocgs);
+ propagate_active_weight(iocg, iocg->weight,
+ iocg->last_inuse ?: iocg->weight);
+
+ TRACE_IOCG_PATH(iocg_activate, iocg, now,
+ last_period, cur_period, vtime);
+
+ iocg->last_vtime = vtime;
+
+ if (ioc->running == IOC_IDLE) {
+ ioc->running = IOC_RUNNING;
+ ioc_start_period(ioc, now);
+ }
+
+succeed_unlock:
+ spin_unlock_irq(&ioc->lock);
+ return true;
+
+fail_unlock:
+ spin_unlock_irq(&ioc->lock);
+ return false;
+}
+
+static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
+ int flags, void *key)
+{
+ struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
+ struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
+ u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
+
+ ctx->vbudget -= cost;
+
+ if (ctx->vbudget < 0)
+ return -1;
+
+ iocg_commit_bio(ctx->iocg, wait->bio, cost);
+
+ /*
+ * autoremove_wake_function() removes the wait entry only when it
+ * actually changed the task state. We want the wait always
+ * removed. Remove explicitly and use default_wake_function().
+ */
+ list_del_init(&wq_entry->entry);
+ wait->committed = true;
+
+ default_wake_function(wq_entry, mode, flags, key);
+ return 0;
+}
+
+static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct iocg_wake_ctx ctx = { .iocg = iocg };
+ u64 margin_ns = (u64)(ioc->period_us *
+ WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
+ u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
+ s64 vbudget;
+ u32 hw_inuse;
+
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ current_hweight(iocg, NULL, &hw_inuse);
+ vbudget = now->vnow - atomic64_read(&iocg->vtime);
+
+ /* pay off debt */
+ abs_vdebt = atomic64_read(&iocg->abs_vdebt);
+ vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
+ if (vdebt && vbudget > 0) {
+ u64 delta = min_t(u64, vbudget, vdebt);
+ u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
+ abs_vdebt);
+
+ atomic64_add(delta, &iocg->vtime);
+ atomic64_add(delta, &iocg->done_vtime);
+ atomic64_sub(abs_delta, &iocg->abs_vdebt);
+ if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
+ atomic64_set(&iocg->abs_vdebt, 0);
+ }
+
+ /*
+ * Wake up the ones which are due and see how much vtime we'll need
+ * for the next one.
+ */
+ ctx.hw_inuse = hw_inuse;
+ ctx.vbudget = vbudget - vdebt;
+ __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
+ if (!waitqueue_active(&iocg->waitq))
+ return;
+ if (WARN_ON_ONCE(ctx.vbudget >= 0))
+ return;
+
+ /* determine next wakeup, add a quarter margin to guarantee chunking */
+ vshortage = -ctx.vbudget;
+ expires = now->now_ns +
+ DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
+ expires += margin_ns / 4;
+
+ /* if already active and close enough, don't bother */
+ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
+ if (hrtimer_is_queued(&iocg->waitq_timer) &&
+ abs(oexpires - expires) <= margin_ns / 4)
+ return;
+
+ hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
+ margin_ns / 4, HRTIMER_MODE_ABS);
+}
+
+static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
+{
+ struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
+ struct ioc_now now;
+ unsigned long flags;
+
+ ioc_now(iocg->ioc, &now);
+
+ spin_lock_irqsave(&iocg->waitq.lock, flags);
+ iocg_kick_waitq(iocg, &now);
+ spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+
+ return HRTIMER_NORESTART;
+}
+
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ u64 vtime = atomic64_read(&iocg->vtime);
+ u64 vmargin = ioc->margin_us * now->vrate;
+ u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
+ u64 expires, oexpires;
+ u32 hw_inuse;
+
+ /* debt-adjust vtime */
+ current_hweight(iocg, NULL, &hw_inuse);
+ vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
+
+ /* clear or maintain depending on the overage */
+ if (time_before_eq64(vtime, now->vnow)) {
+ blkcg_clear_delay(blkg);
+ return false;
+ }
+ if (!atomic_read(&blkg->use_delay) &&
+ time_before_eq64(vtime, now->vnow + vmargin))
+ return false;
+
+ /* use delay */
+ if (cost) {
+ u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
+ now->vrate);
+ blkcg_add_delay(blkg, now->now_ns, cost_ns);
+ }
+ blkcg_use_delay(blkg);
+
+ expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
+ now->vrate) * NSEC_PER_USEC;
+
+ /* if already active and close enough, don't bother */
+ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
+ if (hrtimer_is_queued(&iocg->delay_timer) &&
+ abs(oexpires - expires) <= margin_ns / 4)
+ return true;
+
+ hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
+ margin_ns / 4, HRTIMER_MODE_ABS);
+ return true;
+}
+
+static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
+{
+ struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
+ struct ioc_now now;
+
+ ioc_now(iocg->ioc, &now);
+ iocg_kick_delay(iocg, &now, 0);
+
+ return HRTIMER_NORESTART;
+}
+
+static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
+{
+ u32 nr_met[2] = { };
+ u32 nr_missed[2] = { };
+ u64 rq_wait_ns = 0;
+ int cpu, rw;
+
+ for_each_online_cpu(cpu) {
+ struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
+ u64 this_rq_wait_ns;
+
+ for (rw = READ; rw <= WRITE; rw++) {
+ u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
+ u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
+
+ nr_met[rw] += this_met - stat->missed[rw].last_met;
+ nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
+ stat->missed[rw].last_met = this_met;
+ stat->missed[rw].last_missed = this_missed;
+ }
+
+ this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
+ rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
+ stat->last_rq_wait_ns = this_rq_wait_ns;
+ }
+
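+ /*
+ * Convert to a parts-per-million miss ratio, e.g. 10 misses out of
+ * 1000 completions in the period is 10 * MILLION / 1000 == 10000 ppm,
+ * a 1% miss rate.
+ */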
+ for (rw = READ; rw <= WRITE; rw++) {
+ if (nr_met[rw] + nr_missed[rw])
+ missed_ppm_ar[rw] =
+ DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
+ nr_met[rw] + nr_missed[rw]);
+ else
+ missed_ppm_ar[rw] = 0;
+ }
+
+ *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
+ ioc->period_us * NSEC_PER_USEC);
+}
+
+/* was iocg idle this period? */
+static bool iocg_is_idle(struct ioc_gq *iocg)
+{
+ struct ioc *ioc = iocg->ioc;
+
+ /* did something get issued this period? */
+ if (atomic64_read(&iocg->active_period) ==
+ atomic64_read(&ioc->cur_period))
+ return false;
+
+ /* is something in flight? */
+ if (atomic64_read(&iocg->done_vtime) < atomic64_read(&iocg->vtime))
+ return false;
+
+ return true;
+}
+
+/* returns usage with margin added if surplus is large enough */
+static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
+{
+ /* add margin */
+ usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
+ usage += SURPLUS_SCALE_ABS;
+
+ /* don't bother if the surplus is too small */
+ if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
+ return 0;
+
+ return usage;
+}
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+ struct ioc *ioc = container_of(timer, struct ioc, timer);
+ struct ioc_gq *iocg, *tiocg;
+ struct ioc_now now;
+ int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
+ u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ u32 missed_ppm[2], rq_wait_pct;
+ u64 period_vtime;
+ int prev_busy_level, i;
+
+ /* how were the latencies during the period? */
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+ /* take care of active iocgs */
+ spin_lock_irq(&ioc->lock);
+
+ ioc_now(ioc, &now);
+
+ period_vtime = now.vnow - ioc->period_at_vtime;
+ if (WARN_ON_ONCE(!period_vtime)) {
+ spin_unlock_irq(&ioc->lock);
+ return;
+ }
+
+ /*
+ * Waiters determine the sleep durations based on the vrate they
+ * saw at the time of sleep. If vrate has increased, some waiters
+ * could be sleeping for too long. Wake up tardy waiters which
+ * should have woken up in the last period and expire idle iocgs.
+ */
+ list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
+ if (!waitqueue_active(&iocg->waitq) &&
+ !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
+ continue;
+
+ spin_lock(&iocg->waitq.lock);
+
+ if (waitqueue_active(&iocg->waitq) ||
+ atomic64_read(&iocg->abs_vdebt)) {
+ /* might be oversleeping due to vtime / hweight changes, kick */
+ iocg_kick_waitq(iocg, &now);
+ iocg_kick_delay(iocg, &now, 0);
+ } else if (iocg_is_idle(iocg)) {
+ /* no waiter and idle, deactivate */
+ iocg->last_inuse = iocg->inuse;
+ __propagate_active_weight(iocg, 0, 0);
+ list_del_init(&iocg->active_list);
+ }
+
+ spin_unlock(&iocg->waitq.lock);
+ }
+ commit_active_weights(ioc);
+
+ /* calc usages and see whether some weights need to be moved around */
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ u64 vdone, vtime, vusage, vmargin, vmin;
+ u32 hw_active, hw_inuse, usage;
+
+ /*
+ * Collect unused budget and wind vtime closer to vnow to prevent
+ * iocgs from accumulating a large amount of budget.
+ */
+ vdone = atomic64_read(&iocg->done_vtime);
+ vtime = atomic64_read(&iocg->vtime);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+
+ /*
+ * Latency QoS detection doesn't account for IOs which are
+ * in-flight for longer than a period. Detect them by
+ * comparing vdone against period start. If lagging behind
+ * IOs from past periods, don't increase vrate.
+ */
+ if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
+ !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
+ time_after64(vtime, vdone) &&
+ time_after64(vtime, now.vnow -
+ MAX_LAGGING_PERIODS * period_vtime) &&
+ time_before64(vdone, now.vnow - period_vtime))
+ nr_lagging++;
+
+ if (waitqueue_active(&iocg->waitq))
+ vusage = now.vnow - iocg->last_vtime;
+ else if (time_before64(iocg->last_vtime, vtime))
+ vusage = vtime - iocg->last_vtime;
+ else
+ vusage = 0;
+
+ iocg->last_vtime += vusage;
+ /*
+ * Factor in in-flight vtime into vusage to avoid
+ * high-latency completions appearing as idle. This should
+ * be done after the above ->last_vtime adjustment.
+ */
+ vusage = max(vusage, vtime - vdone);
+
+ /* calculate hweight based usage ratio and record */
+ if (vusage) {
+ usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
+ period_vtime);
+ iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
+ iocg->usages[iocg->usage_idx] = usage;
+ } else {
+ usage = 0;
+ }
+
+ /* see whether there's surplus vtime */
+ vmargin = ioc->margin_us * now.vrate;
+ vmin = now.vnow - vmargin;
+
+ iocg->has_surplus = false;
+
+ if (!waitqueue_active(&iocg->waitq) &&
+ time_before64(vtime, vmin)) {
+ u64 delta = vmin - vtime;
+
+ /* throw away surplus vtime */
+ atomic64_add(delta, &iocg->vtime);
+ atomic64_add(delta, &iocg->done_vtime);
+ iocg->last_vtime += delta;
+ /* if usage is sufficiently low, maybe it can donate */
+ if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
+ iocg->has_surplus = true;
+ nr_surpluses++;
+ }
+ } else if (hw_inuse < hw_active) {
+ u32 new_hwi, new_inuse;
+
+ /* was donating but might need to take back some */
+ if (waitqueue_active(&iocg->waitq)) {
+ new_hwi = hw_active;
+ } else {
+ new_hwi = max(hw_inuse,
+ usage * SURPLUS_SCALE_PCT / 100 +
+ SURPLUS_SCALE_ABS);
+ }
+
+ new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
+ hw_inuse);
+ new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
+
+ if (new_inuse > iocg->inuse) {
+ TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
+ iocg->inuse, new_inuse,
+ hw_inuse, new_hwi);
+ __propagate_active_weight(iocg, iocg->weight,
+ new_inuse);
+ }
+ } else {
+ /* genuinely out of vtime */
+ nr_shortages++;
+ }
+ }
+
+ if (!nr_shortages || !nr_surpluses)
+ goto skip_surplus_transfers;
+
+ /* there are both shortages and surpluses, transfer surpluses */
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
+ int nr_valid = 0;
+
+ if (!iocg->has_surplus)
+ continue;
+
+ /* base the decision on max historical usage */
+ for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
+ if (iocg->usages[i]) {
+ usage = max(usage, iocg->usages[i]);
+ nr_valid++;
+ }
+ }
+ if (nr_valid < MIN_VALID_USAGES)
+ continue;
+
+ current_hweight(iocg, &hw_active, &hw_inuse);
+ new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
+ if (!new_hwi)
+ continue;
+
+ new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
+ hw_inuse);
+ if (new_inuse < iocg->inuse) {
+ TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
+ iocg->inuse, new_inuse,
+ hw_inuse, new_hwi);
+ __propagate_active_weight(iocg, iocg->weight, new_inuse);
+ }
+ }
+skip_surplus_transfers:
+ commit_active_weights(ioc);
+
+ /*
+ * If q is getting clogged or we're missing too much, we're issuing
+ * too much IO and should lower vtime rate. If we're not missing
+ * and experiencing shortages but not surpluses, we're too stingy
+ * and should increase vtime rate.
+ */
+ prev_busy_level = ioc->busy_level;
+ if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
+ missed_ppm[READ] > ppm_rthr ||
+ missed_ppm[WRITE] > ppm_wthr) {
+ ioc->busy_level = max(ioc->busy_level, 0);
+ ioc->busy_level++;
+ } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
+ missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
+ missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
+ /* take action iff there is contention */
+ if (nr_shortages && !nr_lagging) {
+ ioc->busy_level = min(ioc->busy_level, 0);
+ /* redistribute surpluses first */
+ if (!nr_surpluses)
+ ioc->busy_level--;
+ }
+ } else {
+ ioc->busy_level = 0;
+ }
+
+ ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
+
+ if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
+ u64 vrate = atomic64_read(&ioc->vtime_rate);
+ u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+ /* rq_wait signal is always reliable, ignore user vrate_min */
+ if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
+ vrate_min = VRATE_MIN;
+
+ /*
+ * If vrate is out of bounds, apply clamp gradually as the
+ * bounds can change abruptly. Otherwise, apply busy_level
+ * based adjustment.
+ */
+ if (vrate < vrate_min) {
+ vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
+ 100);
+ vrate = min(vrate, vrate_min);
+ } else if (vrate > vrate_max) {
+ vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
+ 100);
+ vrate = max(vrate, vrate_max);
+ } else {
+ int idx = min_t(int, abs(ioc->busy_level),
+ ARRAY_SIZE(vrate_adj_pct) - 1);
+ u32 adj_pct = vrate_adj_pct[idx];
+
+ if (ioc->busy_level > 0)
+ adj_pct = 100 - adj_pct;
+ else
+ adj_pct = 100 + adj_pct;
+
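+ /*
+ * e.g. if vrate_adj_pct[idx] is, say, 5, a busy device is slowed
+ * to 95% of the current vrate and an underutilized one is sped
+ * up to 105%, subject to the min/max bounds below.
+ */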
+ vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+ vrate_min, vrate_max);
+ }
+
+ trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages,
+ nr_surpluses);
+
+ atomic64_set(&ioc->vtime_rate, vrate);
+ ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
+ ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
+ } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
+ trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
+ &missed_ppm, rq_wait_pct, nr_lagging,
+ nr_shortages, nr_surpluses);
+ }
+
+ ioc_refresh_params(ioc, false);
+
+ /*
+ * This period is done. Move on to the next one. If nothing's
+ * going on with the device, stop the timer.
+ */
+ atomic64_inc(&ioc->cur_period);
+
+ if (ioc->running != IOC_STOP) {
+ if (!list_empty(&ioc->active_iocgs)) {
+ ioc_start_period(ioc, &now);
+ } else {
+ ioc->busy_level = 0;
+ ioc->running = IOC_IDLE;
+ }
+ }
+
+ spin_unlock_irq(&ioc->lock);
+}
+
+static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
+ bool is_merge, u64 *costp)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 coef_seqio, coef_randio, coef_page;
+ u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
+ u64 seek_pages = 0;
+ u64 cost = 0;
+
+ switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
+ coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
+ coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
+ break;
+ case REQ_OP_WRITE:
+ coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
+ coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
+ coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
+ break;
+ default:
+ goto out;
+ }
+
+ if (iocg->cursor) {
+ seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
+ seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
+ }
+
+ if (!is_merge) {
+ if (seek_pages > LCOEF_RANDIO_PAGES)
+ cost += coef_randio;
+ else
+ cost += coef_seqio;
+ }
+ cost += pages * coef_page;
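+ /*
+ * e.g. a non-merge sequential 64KB read costs coef_seqio plus
+ * 16 * coef_page, assuming 4KB cost pages.
+ */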
+out:
+ *costp = cost;
+}
+
+static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
+{
+ u64 cost;
+
+ calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
+ return cost;
+}
+
+static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct ioc *ioc = rqos_to_ioc(rqos);
+ struct ioc_gq *iocg = blkg_to_iocg(blkg);
+ struct ioc_now now;
+ struct iocg_wait wait;
+ u32 hw_active, hw_inuse;
+ u64 abs_cost, cost, vtime;
+
+ /* bypass IOs if disabled or for root cgroup */
+ if (!ioc->enabled || !iocg->level)
+ return;
+
+ /* always activate so that even 0 cost IOs get protected to some level */
+ if (!iocg_activate(iocg, &now))
+ return;
+
+ /* calculate the absolute vtime cost */
+ abs_cost = calc_vtime_cost(bio, iocg, false);
+ if (!abs_cost)
+ return;
+
+ iocg->cursor = bio_end_sector(bio);
+
+ vtime = atomic64_read(&iocg->vtime);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+
+ if (hw_inuse < hw_active &&
+ time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
+ TRACE_IOCG_PATH(inuse_reset, iocg, &now,
+ iocg->inuse, iocg->weight, hw_inuse, hw_active);
+ spin_lock_irq(&ioc->lock);
+ propagate_active_weight(iocg, iocg->weight, iocg->weight);
+ spin_unlock_irq(&ioc->lock);
+ current_hweight(iocg, &hw_active, &hw_inuse);
+ }
+
+ cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ /*
+ * If no one's waiting and within budget, issue right away. The
+ * tests are racy but the races aren't systemic - we only miss once
+ * in a while which is fine.
+ */
+ if (!waitqueue_active(&iocg->waitq) &&
+ !atomic64_read(&iocg->abs_vdebt) &&
+ time_before_eq64(vtime + cost, now.vnow)) {
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ /*
+ * We're over budget. If @bio has to be issued regardless,
+ * remember the abs_cost instead of advancing vtime.
+ * iocg_kick_waitq() will pay off the debt before waking more IOs.
+ * This way, the debt is continuously paid off each period with the
+ * actual budget available to the cgroup. If we just wound vtime,
+ * we would incorrectly use the current hw_inuse for the entire
+ * amount which, for example, can lead to the cgroup staying
+ * blocked for a long time even with substantially raised hw_inuse.
+ */
+ if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
+ atomic64_add(abs_cost, &iocg->abs_vdebt);
+ if (iocg_kick_delay(iocg, &now, cost))
+ blkcg_schedule_throttle(rqos->q,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
+ return;
+ }
+
+ /*
+ * Append self to the waitq and schedule the wakeup timer if we're
+ * the first waiter. The timer duration is calculated based on the
+ * current vrate. vtime and hweight changes can make it too short
+ * or too long. Each wait entry records the absolute cost it's
+ * waiting for to allow re-evaluation using a custom wait entry.
+ *
+ * If too short, the timer simply reschedules itself. If too long,
+ * the period timer will notice and trigger wakeups.
+ *
+ * All waiters are on iocg->waitq and the wait states are
+ * synchronized using waitq.lock.
+ */
+ spin_lock_irq(&iocg->waitq.lock);
+
+ /*
+ * We activated above but w/o any synchronization. Deactivation is
+ * synchronized with waitq.lock and we won't get deactivated as
+ * long as we're waiting, so we're good if we're activated here.
+ * In the unlikely case that we are deactivated, just issue the IO.
+ */
+ if (unlikely(list_empty(&iocg->active_list))) {
+ spin_unlock_irq(&iocg->waitq.lock);
+ iocg_commit_bio(iocg, bio, cost);
+ return;
+ }
+
+ init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
+ wait.wait.private = current;
+ wait.bio = bio;
+ wait.abs_cost = abs_cost;
+ wait.committed = false; /* will be set true by waker */
+
+ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
+ iocg_kick_waitq(iocg, &now);
+
+ spin_unlock_irq(&iocg->waitq.lock);
+
+ while (true) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (wait.committed)
+ break;
+ io_schedule();
+ }
+
+ /* waker already committed us, proceed */
+ finish_wait(&iocg->waitq, &wait.wait);
+}
+
+static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
+ struct bio *bio)
+{
+ struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
+ struct ioc *ioc = iocg->ioc;
+ sector_t bio_end = bio_end_sector(bio);
+ struct ioc_now now;
+ u32 hw_inuse;
+ u64 abs_cost, cost;
+
+ /* bypass if disabled or for root cgroup */
+ if (!ioc->enabled || !iocg->level)
+ return;
+
+ abs_cost = calc_vtime_cost(bio, iocg, true);
+ if (!abs_cost)
+ return;
+
+ ioc_now(ioc, &now);
+ current_hweight(iocg, NULL, &hw_inuse);
+ cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ /* update cursor if backmerging into the request at the cursor */
+ if (blk_rq_pos(rq) < bio_end &&
+ blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
+ iocg->cursor = bio_end;
+
+ /*
+ * Charge if there's enough vtime budget and the existing request
+ * has cost assigned. Otherwise, account it as debt. See debt
+ * handling in ioc_rqos_throttle() for details.
+ */
+ if (rq->bio && rq->bio->bi_iocost_cost &&
+ time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
+ iocg_commit_bio(iocg, bio, cost);
+ else
+ atomic64_add(abs_cost, &iocg->abs_vdebt);
+}
+
+static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
+{
+ struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
+
+ if (iocg && bio->bi_iocost_cost)
+ atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
+}
+
+static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+ u64 on_q_ns, rq_wait_ns;
+ int pidx, rw;
+
+ if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
+ return;
+
+ switch (req_op(rq) & REQ_OP_MASK) {
+ case REQ_OP_READ:
+ pidx = QOS_RLAT;
+ rw = READ;
+ break;
+ case REQ_OP_WRITE:
+ pidx = QOS_WLAT;
+ rw = WRITE;
+ break;
+ default:
+ return;
+ }
+
+ on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
+ rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
+
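+ /*
+ * Compare total on-queue time against the per-direction latency
+ * target, e.g. rlat=5000 counts reads completing within 5ms as met.
+ */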
+ if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
+ this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
+ else
+ this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
+
+ this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
+}
+
+static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+
+ spin_lock_irq(&ioc->lock);
+ ioc_refresh_params(ioc, false);
+ spin_unlock_irq(&ioc->lock);
+}
+
+static void ioc_rqos_exit(struct rq_qos *rqos)
+{
+ struct ioc *ioc = rqos_to_ioc(rqos);
+
+ blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
+
+ spin_lock_irq(&ioc->lock);
+ ioc->running = IOC_STOP;
+ spin_unlock_irq(&ioc->lock);
+
+ del_timer_sync(&ioc->timer);
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+}
+
+static struct rq_qos_ops ioc_rqos_ops = {
+ .throttle = ioc_rqos_throttle,
+ .merge = ioc_rqos_merge,
+ .done_bio = ioc_rqos_done_bio,
+ .done = ioc_rqos_done,
+ .queue_depth_changed = ioc_rqos_queue_depth_changed,
+ .exit = ioc_rqos_exit,
+};
+
+static int blk_iocost_init(struct request_queue *q)
+{
+ struct ioc *ioc;
+ struct rq_qos *rqos;
+ int ret;
+
+ ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
+ if (!ioc)
+ return -ENOMEM;
+
+ ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
+ if (!ioc->pcpu_stat) {
+ kfree(ioc);
+ return -ENOMEM;
+ }
+
+ rqos = &ioc->rqos;
+ rqos->id = RQ_QOS_COST;
+ rqos->ops = &ioc_rqos_ops;
+ rqos->q = q;
+
+ spin_lock_init(&ioc->lock);
+ timer_setup(&ioc->timer, ioc_timer_fn, 0);
+ INIT_LIST_HEAD(&ioc->active_iocgs);
+
+ ioc->running = IOC_IDLE;
+ atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+ seqcount_init(&ioc->period_seqcount);
+ ioc->period_at = ktime_to_us(ktime_get());
+ atomic64_set(&ioc->cur_period, 0);
+ atomic_set(&ioc->hweight_gen, 0);
+
+ spin_lock_irq(&ioc->lock);
+ ioc->autop_idx = AUTOP_INVALID;
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ rq_qos_add(q, rqos);
+ ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
+ if (ret) {
+ rq_qos_del(q, rqos);
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+ return ret;
+ }
+ return 0;
+}
+
+static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
+{
+ struct ioc_cgrp *iocc;
+
+ iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
+ if (!iocc)
+ return NULL;
+
+ iocc->dfl_weight = CGROUP_WEIGHT_DFL;
+ return &iocc->cpd;
+}
+
+static void ioc_cpd_free(struct blkcg_policy_data *cpd)
+{
+ kfree(container_of(cpd, struct ioc_cgrp, cpd));
+}
+
+static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
+ struct blkcg *blkcg)
+{
+ int levels = blkcg->css.cgroup->level + 1;
+ struct ioc_gq *iocg;
+
+ iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
+ gfp, q->node);
+ if (!iocg)
+ return NULL;
+
+ return &iocg->pd;
+}
+
+static void ioc_pd_init(struct blkg_policy_data *pd)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
+ struct ioc *ioc = q_to_ioc(blkg->q);
+ struct ioc_now now;
+ struct blkcg_gq *tblkg;
+ unsigned long flags;
+
+ ioc_now(ioc, &now);
+
+ iocg->ioc = ioc;
+ atomic64_set(&iocg->vtime, now.vnow);
+ atomic64_set(&iocg->done_vtime, now.vnow);
+ atomic64_set(&iocg->abs_vdebt, 0);
+ atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
+ INIT_LIST_HEAD(&iocg->active_list);
+ iocg->hweight_active = HWEIGHT_WHOLE;
+ iocg->hweight_inuse = HWEIGHT_WHOLE;
+
+ init_waitqueue_head(&iocg->waitq);
+ hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ iocg->waitq_timer.function = iocg_waitq_timer_fn;
+ hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
+ iocg->delay_timer.function = iocg_delay_timer_fn;
+
+ iocg->level = blkg->blkcg->css.cgroup->level;
+
+ for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
+ struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
+ iocg->ancestors[tiocg->level] = tiocg;
+ }
+
+ spin_lock_irqsave(&ioc->lock, flags);
+ weight_updated(iocg);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+}
+
+static void ioc_pd_free(struct blkg_policy_data *pd)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct ioc *ioc = iocg->ioc;
+
+ if (ioc) {
+ spin_lock(&ioc->lock);
+ if (!list_empty(&iocg->active_list)) {
+ propagate_active_weight(iocg, 0, 0);
+ list_del_init(&iocg->active_list);
+ }
+ spin_unlock(&ioc->lock);
+
+ hrtimer_cancel(&iocg->waitq_timer);
+ hrtimer_cancel(&iocg->delay_timer);
+ }
+ kfree(iocg);
+}
+
+static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+
+ if (dname && iocg->cfg_weight)
+ seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
+ return 0;
+}
+
+static int ioc_weight_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
+
+ seq_printf(sf, "default %u\n", iocc->dfl_weight);
+ blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
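+/*
+ * "default <weight>" or a bare "<weight>" sets the cgroup-wide default;
+ * "MAJ:MIN <weight>" (e.g. "8:16 200") overrides the weight for one
+ * device and "MAJ:MIN default" clears that override.
+ */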
+static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct blkcg *blkcg = css_to_blkcg(of_css(of));
+ struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
+ struct blkg_conf_ctx ctx;
+ struct ioc_gq *iocg;
+ u32 v;
+ int ret;
+
+ if (!strchr(buf, ':')) {
+ struct blkcg_gq *blkg;
+
+ if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
+ return -EINVAL;
+
+ if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
+ return -EINVAL;
+
+ spin_lock(&blkcg->lock);
+ iocc->dfl_weight = v;
+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
+ struct ioc_gq *iocg = blkg_to_iocg(blkg);
+
+ if (iocg) {
+ spin_lock_irq(&iocg->ioc->lock);
+ weight_updated(iocg);
+ spin_unlock_irq(&iocg->ioc->lock);
+ }
+ }
+ spin_unlock(&blkcg->lock);
+
+ return nbytes;
+ }
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
+ if (ret)
+ return ret;
+
+ iocg = blkg_to_iocg(ctx.blkg);
+
+ if (!strncmp(ctx.body, "default", 7)) {
+ v = 0;
+ } else {
+ if (!sscanf(ctx.body, "%u", &v))
+ goto einval;
+ if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
+ goto einval;
+ }
+
+ spin_lock(&iocg->ioc->lock);
+ iocg->cfg_weight = v;
+ weight_updated(iocg);
+ spin_unlock(&iocg->ioc->lock);
+
+ blkg_conf_finish(&ctx);
+ return nbytes;
+
+einval:
+ blkg_conf_finish(&ctx);
+ return -EINVAL;
+}
+
+static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
+ int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc *ioc = pd_to_iocg(pd)->ioc;
+
+ if (!dname)
+ return 0;
+
+ seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
+ dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
+ ioc->params.qos[QOS_RPPM] / 10000,
+ ioc->params.qos[QOS_RPPM] % 10000 / 100,
+ ioc->params.qos[QOS_RLAT],
+ ioc->params.qos[QOS_WPPM] / 10000,
+ ioc->params.qos[QOS_WPPM] % 10000 / 100,
+ ioc->params.qos[QOS_WLAT],
+ ioc->params.qos[QOS_MIN] / 10000,
+ ioc->params.qos[QOS_MIN] % 10000 / 100,
+ ioc->params.qos[QOS_MAX] / 10000,
+ ioc->params.qos[QOS_MAX] % 10000 / 100);
+ return 0;
+}
+
+static int ioc_qos_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+ blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static const match_table_t qos_ctrl_tokens = {
+ { QOS_ENABLE, "enable=%u" },
+ { QOS_CTRL, "ctrl=%s" },
+ { NR_QOS_CTRL_PARAMS, NULL },
+};
+
+static const match_table_t qos_tokens = {
+ { QOS_RPPM, "rpct=%s" },
+ { QOS_RLAT, "rlat=%u" },
+ { QOS_WPPM, "wpct=%s" },
+ { QOS_WLAT, "wlat=%u" },
+ { QOS_MIN, "min=%s" },
+ { QOS_MAX, "max=%s" },
+ { NR_QOS_PARAMS, NULL },
+};
+
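+/*
+ * e.g. "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=10000"
+ * enables the controller on device 8:16 with user-set latency targets:
+ * percentiles in percent (two decimals), latencies in microseconds.
+ */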
+static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
+ size_t nbytes, loff_t off)
+{
+ struct gendisk *disk;
+ struct ioc *ioc;
+ u32 qos[NR_QOS_PARAMS];
+ bool enable, user;
+ char *p;
+ int ret;
+
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
+
+ ioc = q_to_ioc(disk->queue);
+ if (!ioc) {
+ ret = blk_iocost_init(disk->queue);
+ if (ret)
+ goto err;
+ ioc = q_to_ioc(disk->queue);
+ }
+
+ spin_lock_irq(&ioc->lock);
+ memcpy(qos, ioc->params.qos, sizeof(qos));
+ enable = ioc->enabled;
+ user = ioc->user_qos_params;
+ spin_unlock_irq(&ioc->lock);
+
+ while ((p = strsep(&input, " \t\n"))) {
+ substring_t args[MAX_OPT_ARGS];
+ char buf[32];
+ int tok;
+ s64 v;
+
+ if (!*p)
+ continue;
+
+ switch (match_token(p, qos_ctrl_tokens, args)) {
+ case QOS_ENABLE:
+ match_u64(&args[0], &v);
+ enable = v;
+ continue;
+ case QOS_CTRL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (!strcmp(buf, "auto"))
+ user = false;
+ else if (!strcmp(buf, "user"))
+ user = true;
+ else
+ goto einval;
+ continue;
+ }
+
+ tok = match_token(p, qos_tokens, args);
+ switch (tok) {
+ case QOS_RPPM:
+ case QOS_WPPM:
+ if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
+ sizeof(buf))
+ goto einval;
+ if (cgroup_parse_float(buf, 2, &v))
+ goto einval;
+ if (v < 0 || v > 10000)
+ goto einval;
+ qos[tok] = v * 100;
+ break;
+ case QOS_RLAT:
+ case QOS_WLAT:
+ if (match_u64(&args[0], &v))
+ goto einval;
+ qos[tok] = v;
+ break;
+ case QOS_MIN:
+ case QOS_MAX:
+ if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
+ sizeof(buf))
+ goto einval;
+ if (cgroup_parse_float(buf, 2, &v))
+ goto einval;
+ if (v < 0)
+ goto einval;
+ qos[tok] = clamp_t(s64, v * 100,
+ VRATE_MIN_PPM, VRATE_MAX_PPM);
+ break;
+ default:
+ goto einval;
+ }
+ user = true;
+ }
+
+ if (qos[QOS_MIN] > qos[QOS_MAX])
+ goto einval;
+
+ spin_lock_irq(&ioc->lock);
+
+ if (enable) {
+ blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ ioc->enabled = true;
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ ioc->enabled = false;
+ }
+
+ if (user) {
+ memcpy(ioc->params.qos, qos, sizeof(qos));
+ ioc->user_qos_params = true;
+ } else {
+ ioc->user_qos_params = false;
+ }
+
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ put_disk_and_module(disk);
+ return nbytes;
+einval:
+ ret = -EINVAL;
+err:
+ put_disk_and_module(disk);
+ return ret;
+}
+
+static u64 ioc_cost_model_prfill(struct seq_file *sf,
+ struct blkg_policy_data *pd, int off)
+{
+ const char *dname = blkg_dev_name(pd->blkg);
+ struct ioc *ioc = pd_to_iocg(pd)->ioc;
+ u64 *u = ioc->params.i_lcoefs;
+
+ if (!dname)
+ return 0;
+
+ seq_printf(sf, "%s ctrl=%s model=linear "
+ "rbps=%llu rseqiops=%llu rrandiops=%llu "
+ "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
+ dname, ioc->user_cost_model ? "user" : "auto",
+ u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
+ u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+ return 0;
+}
+
+static int ioc_cost_model_show(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+
+ blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
+ &blkcg_policy_iocost, seq_cft(sf)->private, false);
+ return 0;
+}
+
+static const match_table_t cost_ctrl_tokens = {
+ { COST_CTRL, "ctrl=%s" },
+ { COST_MODEL, "model=%s" },
+ { NR_COST_CTRL_PARAMS, NULL },
+};
+
+static const match_table_t i_lcoef_tokens = {
+ { I_LCOEF_RBPS, "rbps=%u" },
+ { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
+ { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
+ { I_LCOEF_WBPS, "wbps=%u" },
+ { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
+ { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
+ { NR_I_LCOEFS, NULL },
+};
+
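+/*
+ * e.g. "8:16 ctrl=user model=linear rbps=1200000000 rseqiops=50000
+ * rrandiops=8000 wbps=500000000 wseqiops=30000 wrandiops=5000" overrides
+ * the linear model coefficients for device 8:16 (illustrative values).
+ */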
+static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
+ size_t nbytes, loff_t off)
+{
+ struct gendisk *disk;
+ struct ioc *ioc;
+ u64 u[NR_I_LCOEFS];
+ bool user;
+ char *p;
+ int ret;
+
+ disk = blkcg_conf_get_disk(&input);
+ if (IS_ERR(disk))
+ return PTR_ERR(disk);
+
+ ioc = q_to_ioc(disk->queue);
+ if (!ioc) {
+ ret = blk_iocost_init(disk->queue);
+ if (ret)
+ goto err;
+ ioc = q_to_ioc(disk->queue);
+ }
+
+ spin_lock_irq(&ioc->lock);
+ memcpy(u, ioc->params.i_lcoefs, sizeof(u));
+ user = ioc->user_cost_model;
+ spin_unlock_irq(&ioc->lock);
+
+ while ((p = strsep(&input, " \t\n"))) {
+ substring_t args[MAX_OPT_ARGS];
+ char buf[32];
+ int tok;
+ u64 v;
+
+ if (!*p)
+ continue;
+
+ switch (match_token(p, cost_ctrl_tokens, args)) {
+ case COST_CTRL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (!strcmp(buf, "auto"))
+ user = false;
+ else if (!strcmp(buf, "user"))
+ user = true;
+ else
+ goto einval;
+ continue;
+ case COST_MODEL:
+ match_strlcpy(buf, &args[0], sizeof(buf));
+ if (strcmp(buf, "linear"))
+ goto einval;
+ continue;
+ }
+
+ tok = match_token(p, i_lcoef_tokens, args);
+ if (tok == NR_I_LCOEFS)
+ goto einval;
+ if (match_u64(&args[0], &v))
+ goto einval;
+ u[tok] = v;
+ user = true;
+ }
+
+ spin_lock_irq(&ioc->lock);
+ if (user) {
+ memcpy(ioc->params.i_lcoefs, u, sizeof(u));
+ ioc->user_cost_model = true;
+ } else {
+ ioc->user_cost_model = false;
+ }
+ ioc_refresh_params(ioc, true);
+ spin_unlock_irq(&ioc->lock);
+
+ put_disk_and_module(disk);
+ return nbytes;
+
+einval:
+ ret = -EINVAL;
+err:
+ put_disk_and_module(disk);
+ return ret;
+}
+
+static struct cftype ioc_files[] = {
+ {
+ .name = "weight",
+ .flags = CFTYPE_NOT_ON_ROOT,
+ .seq_show = ioc_weight_show,
+ .write = ioc_weight_write,
+ },
+ {
+ .name = "cost.qos",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = ioc_qos_show,
+ .write = ioc_qos_write,
+ },
+ {
+ .name = "cost.model",
+ .flags = CFTYPE_ONLY_ON_ROOT,
+ .seq_show = ioc_cost_model_show,
+ .write = ioc_cost_model_write,
+ },
+ {}
+};
+
+static struct blkcg_policy blkcg_policy_iocost = {
+ .dfl_cftypes = ioc_files,
+ .cpd_alloc_fn = ioc_cpd_alloc,
+ .cpd_free_fn = ioc_cpd_free,
+ .pd_alloc_fn = ioc_pd_alloc,
+ .pd_init_fn = ioc_pd_init,
+ .pd_free_fn = ioc_pd_free,
+};
+
+static int __init ioc_init(void)
+{
+ return blkcg_policy_register(&blkcg_policy_iocost);
+}
+
+static void __exit ioc_exit(void)
+{
+ return blkcg_policy_unregister(&blkcg_policy_iocost);
+}
+
+module_init(ioc_init);
+module_exit(ioc_exit);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index d22e61bced86..c128d50cb410 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -618,44 +618,26 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
inflight = atomic_dec_return(&rqw->inflight);
WARN_ON_ONCE(inflight < 0);
- if (iolat->min_lat_nsec == 0)
- goto next;
- iolatency_record_time(iolat, &bio->bi_issue, now,
- issue_as_root);
- window_start = atomic64_read(&iolat->window_start);
- if (now > window_start &&
- (now - window_start) >= iolat->cur_win_nsec) {
- if (atomic64_cmpxchg(&iolat->window_start,
- window_start, now) == window_start)
- iolatency_check_latencies(iolat, now);
+ /*
+ * If bi_status is BLK_STS_AGAIN, the bio wasn't actually
+ * submitted, so do not account for it.
+ */
+ if (iolat->min_lat_nsec && bio->bi_status != BLK_STS_AGAIN) {
+ iolatency_record_time(iolat, &bio->bi_issue, now,
+ issue_as_root);
+ window_start = atomic64_read(&iolat->window_start);
+ if (now > window_start &&
+ (now - window_start) >= iolat->cur_win_nsec) {
+ if (atomic64_cmpxchg(&iolat->window_start,
+ window_start, now) == window_start)
+ iolatency_check_latencies(iolat, now);
+ }
}
-next:
wake_up(&rqw->wait);
blkg = blkg->parent;
}
}
-static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
-{
- struct blkcg_gq *blkg;
-
- blkg = bio->bi_blkg;
- while (blkg && blkg->parent) {
- struct rq_wait *rqw;
- struct iolatency_grp *iolat;
-
- iolat = blkg_to_lat(blkg);
- if (!iolat)
- goto next;
-
- rqw = &iolat->rq_wait;
- atomic_dec(&rqw->inflight);
- wake_up(&rqw->wait);
-next:
- blkg = blkg->parent;
- }
-}
-
static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
@@ -667,7 +649,6 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos)
static struct rq_qos_ops blkcg_iolatency_ops = {
.throttle = blkcg_iolatency_throttle,
- .cleanup = blkcg_iolatency_cleanup,
.done_bio = blkcg_iolatency_done_bio,
.exit = blkcg_iolatency_exit,
};
@@ -744,7 +725,7 @@ int blk_iolatency_init(struct request_queue *q)
return -ENOMEM;
rqos = &blkiolat->rqos;
- rqos->id = RQ_QOS_CGROUP;
+ rqos->id = RQ_QOS_LATENCY;
rqos->ops = &blkcg_iolatency_ops;
rqos->q = q;
@@ -778,8 +759,10 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
if (!oldval && val)
return 1;
- if (oldval && !val)
+ if (oldval && !val) {
+ blkcg_clear_delay(blkg);
return -1;
+ }
return 0;
}
@@ -934,6 +917,9 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
unsigned long long avg_lat;
unsigned long long cur_win;
+ if (!blkcg_debug_stats)
+ return 0;
+
if (iolat->ssd)
return iolatency_ssd_stat(iolat, buf, size);
@@ -948,11 +934,13 @@ static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
}
-static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
+ struct request_queue *q,
+ struct blkcg *blkcg)
{
struct iolatency_grp *iolat;
- iolat = kzalloc_node(sizeof(*iolat), gfp, node);
+ iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
if (!iolat)
return NULL;
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
diff --git a/block/blk-map.c b/block/blk-map.c
index db9373bd31ac..b0790268ed9d 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -18,13 +18,19 @@
int blk_rq_append_bio(struct request *rq, struct bio **bio)
{
struct bio *orig_bio = *bio;
+ struct bvec_iter iter;
+ struct bio_vec bv;
+ unsigned int nr_segs = 0;
blk_queue_bounce(rq->q, bio);
+ bio_for_each_bvec(bv, *bio, iter)
+ nr_segs++;
+
if (!rq->bio) {
- blk_rq_bio_prep(rq->q, rq, *bio);
+ blk_rq_bio_prep(rq, *bio, nr_segs);
} else {
- if (!ll_back_merge_fn(rq->q, rq, *bio)) {
+ if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
if (orig_bio != *bio) {
bio_put(*bio);
*bio = orig_bio;
@@ -145,7 +151,7 @@ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
return 0;
unmap_rq:
- __blk_rq_unmap_user(bio);
+ blk_rq_unmap_user(bio);
fail:
rq->bio = NULL;
return ret;
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 17713d7d98d5..48e6725b32ee 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -105,7 +105,7 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
struct bio *bio, struct bio_set *bs, unsigned *nsegs)
{
- *nsegs = 1;
+ *nsegs = 0;
if (!q->limits.max_write_zeroes_sectors)
return NULL;
@@ -132,19 +132,32 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
}
+/*
+ * Return the maximum number of sectors from the start of a bio that may be
+ * submitted as a single request to a block device. If enough sectors remain,
+ * align the end to the physical block size. Otherwise align the end to the
+ * logical block size. This approach minimizes the number of non-aligned
+ * requests that are submitted to a block device if the start of a bio is not
+ * aligned to a physical block boundary.
+ */
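+/*
+ * e.g. with a 4KB physical block size (8 sectors), a bio starting at
+ * sector 5 with 16 sectors available is capped at 11 sectors so that
+ * it ends on the physical block boundary at sector 16.
+ */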
static inline unsigned get_max_io_size(struct request_queue *q,
struct bio *bio)
{
unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
- unsigned mask = queue_logical_block_size(q) - 1;
+ unsigned max_sectors = sectors;
+ unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
+ unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
+ unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
- /* aligned to logical block size */
- sectors &= ~(mask >> 9);
+ max_sectors += start_offset;
+ max_sectors &= ~(pbs - 1);
+ if (max_sectors > start_offset)
+ return max_sectors - start_offset;
- return sectors;
+ return sectors & ~(lbs - 1);
}
-static unsigned get_max_segment_size(struct request_queue *q,
+static unsigned get_max_segment_size(const struct request_queue *q,
unsigned offset)
{
unsigned long mask = queue_segment_boundary(q);
@@ -157,26 +170,41 @@ static unsigned get_max_segment_size(struct request_queue *q,
queue_max_segment_size(q));
}
-/*
- * Split the bvec @bv into segments, and update all kinds of
- * variables.
+/**
+ * bvec_split_segs - verify whether or not a bvec should be split in the middle
+ * @q: [in] request queue associated with the bio associated with @bv
+ * @bv: [in] bvec to examine
+ * @nsegs: [in,out] Number of segments in the bio being built. Incremented
+ * by the number of segments from @bv that may be appended to that
+ * bio without exceeding @max_segs
+ * @sectors: [in,out] Number of sectors in the bio being built. Incremented
+ * by the number of sectors from @bv that may be appended to that
+ * bio without exceeding @max_sectors
+ * @max_segs: [in] upper bound for *@nsegs
+ * @max_sectors: [in] upper bound for *@sectors
+ *
+ * When splitting a bio, it can happen that a bvec is encountered that is too
+ * big to fit in a single segment and hence has to be split in the
+ * middle. This function verifies whether or not that should happen. The value
+ * %true is returned if and only if appending the entire @bv to a bio with
+ * *@nsegs segments and *@sectors sectors would make that bio unacceptable for
+ * the block driver.
*/
-static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
- unsigned *nsegs, unsigned *sectors, unsigned max_segs)
+static bool bvec_split_segs(const struct request_queue *q,
+ const struct bio_vec *bv, unsigned *nsegs,
+ unsigned *sectors, unsigned max_segs,
+ unsigned max_sectors)
{
- unsigned len = bv->bv_len;
+ unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
- unsigned new_nsegs = 0, seg_size = 0;
+ unsigned seg_size = 0;
- /*
- * Multi-page bvec may be too big to hold in one segment, so the
- * current bvec has to be splitted as multiple segments.
- */
- while (len && new_nsegs + *nsegs < max_segs) {
+ while (len && *nsegs < max_segs) {
seg_size = get_max_segment_size(q, bv->bv_offset + total_len);
seg_size = min(seg_size, len);
- new_nsegs++;
+ (*nsegs)++;
total_len += seg_size;
len -= seg_size;
@@ -184,16 +212,31 @@ static bool bvec_split_segs(struct request_queue *q, struct bio_vec *bv,
break;
}
- if (new_nsegs) {
- *nsegs += new_nsegs;
- if (sectors)
- *sectors += total_len >> 9;
- }
+ *sectors += total_len >> 9;
- /* split in the middle of the bvec if len != 0 */
- return !!len;
+ /* tell the caller to split the bvec if it is too big to fit */
+ return len > 0 || bv->bv_len > max_len;
}
+/**
+ * blk_bio_segment_split - split a bio in two bios
+ * @q: [in] request queue pointer
+ * @bio: [in] bio to be split
+ * @bs: [in] bio set to allocate the clone from
+ * @segs: [out] number of segments in the bio with the first half of the sectors
+ *
+ * Clone @bio, update the bi_iter of the clone to represent the first sectors
+ * of @bio and update @bio->bi_iter to represent the remaining sectors. The
+ * following is guaranteed for the cloned bio:
+ * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most queue_max_segments(@q) segments.
+ *
+ * Except for discard requests the cloned bio will point at the bi_io_vec of
+ * the original bio. It is the responsibility of the caller to ensure that the
+ * original bio is not freed before the cloned bio. The caller is also
+ * responsible for ensuring that @bs is only destroyed after processing of the
+ * split bio has finished.
+ */
static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio *bio,
struct bio_set *bs,
@@ -202,8 +245,6 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
unsigned nsegs = 0, sectors = 0;
- bool do_split = true;
- struct bio *new = NULL;
const unsigned max_sectors = get_max_io_size(q, bio);
const unsigned max_segs = queue_max_segments(q);
@@ -215,75 +256,63 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
goto split;
- if (sectors + (bv.bv_len >> 9) > max_sectors) {
- /*
- * Consider this a new segment if we're splitting in
- * the middle of this vector.
- */
- if (nsegs < max_segs &&
- sectors < max_sectors) {
- /* split in the middle of bvec */
- bv.bv_len = (max_sectors - sectors) << 9;
- bvec_split_segs(q, &bv, &nsegs,
- &sectors, max_segs);
- }
+ if (nsegs < max_segs &&
+ sectors + (bv.bv_len >> 9) <= max_sectors &&
+ bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
+ nsegs++;
+ sectors += bv.bv_len >> 9;
+ } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
+ max_sectors)) {
goto split;
}
- if (nsegs == max_segs)
- goto split;
-
bvprv = bv;
bvprvp = &bvprv;
-
- if (bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
- nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors,
- max_segs)) {
- goto split;
- }
}
- do_split = false;
+ *segs = nsegs;
+ return NULL;
split:
*segs = nsegs;
-
- if (do_split) {
- new = bio_split(bio, sectors, GFP_NOIO, bs);
- if (new)
- bio = new;
- }
-
- return do_split ? new : NULL;
+ return bio_split(bio, sectors, GFP_NOIO, bs);
}
-void blk_queue_split(struct request_queue *q, struct bio **bio)
+/**
+ * __blk_queue_split - split a bio and submit the second half
+ * @q: [in] request queue pointer
+ * @bio: [in, out] bio to be split
+ * @nr_segs: [out] number of segments in the first bio
+ *
+ * Split a bio into two bios, chain the two bios, submit the second half and
+ * store a pointer to the first half in *@bio. If the second bio is still too
+ * big it will be split by a recursive call to this function. Since this
+ * function may allocate a new bio from @q->bio_split, it is the responsibility
+ * of the caller to ensure that @q is only released after processing of the
+ * split bio has finished.
+ */
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs)
{
- struct bio *split, *res;
- unsigned nsegs;
+ struct bio *split;
switch (bio_op(*bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
+ nr_segs);
break;
case REQ_OP_WRITE_SAME:
- split = blk_bio_write_same_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_write_same_split(q, *bio, &q->bio_split,
+ nr_segs);
break;
default:
- split = blk_bio_segment_split(q, *bio, &q->bio_split, &nsegs);
+ split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
break;
}
- /* physical segments can be figured out during splitting */
- res = split ? split : *bio;
- res->bi_phys_segments = nsegs;
- bio_set_flag(res, BIO_SEG_VALID);
-
if (split) {
/* there isn't chance to merge the splitted bio */
split->bi_opf |= REQ_NOMERGE;
@@ -304,19 +333,37 @@ void blk_queue_split(struct request_queue *q, struct bio **bio)
*bio = split;
}
}
+
+/**
+ * blk_queue_split - split a bio and submit the second half
+ * @q: [in] request queue pointer
+ * @bio: [in, out] bio to be split
+ *
+ * Split a bio into two bios, chain the two bios, submit the second half and
+ * store a pointer to the first half in *@bio. Since this function may allocate
+ * a new bio from @q->bio_split, it is the responsibility of the caller to
+ * ensure that @q is only released after processing of the split bio has
+ * finished.
+ */
+void blk_queue_split(struct request_queue *q, struct bio **bio)
+{
+ unsigned int nr_segs;
+
+ __blk_queue_split(q, bio, &nr_segs);
+}
EXPORT_SYMBOL(blk_queue_split);
-static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
- struct bio *bio)
+unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- struct bvec_iter iter;
+ unsigned int nr_sectors = 0;
+ struct req_iterator iter;
struct bio_vec bv;
- if (!bio)
+ if (!rq->bio)
return 0;
- switch (bio_op(bio)) {
+ switch (bio_op(rq->bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
@@ -325,30 +372,12 @@ static unsigned int __blk_recalc_rq_segments(struct request_queue *q,
return 1;
}
- for_each_bio(bio) {
- bio_for_each_bvec(bv, bio, iter)
- bvec_split_segs(q, &bv, &nr_phys_segs, NULL, UINT_MAX);
- }
-
+ rq_for_each_bvec(bv, rq, iter)
+ bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
-void blk_recalc_rq_segments(struct request *rq)
-{
- rq->nr_phys_segments = __blk_recalc_rq_segments(rq->q, rq->bio);
-}
-
-void blk_recount_segments(struct request_queue *q, struct bio *bio)
-{
- struct bio *nxt = bio->bi_next;
-
- bio->bi_next = NULL;
- bio->bi_phys_segments = __blk_recalc_rq_segments(q, bio);
- bio->bi_next = nxt;
-
- bio_set_flag(bio, BIO_SEG_VALID);
-}
-
static inline struct scatterlist *blk_next_sg(struct scatterlist **sg,
struct scatterlist *sglist)
{
@@ -519,16 +548,13 @@ int blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_sg);
-static inline int ll_new_hw_segment(struct request_queue *q,
- struct request *req,
- struct bio *bio)
+static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
+ unsigned int nr_phys_segs)
{
- int nr_phys_segs = bio_phys_segments(q, bio);
-
- if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(q))
+ if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q))
goto no_merge;
- if (blk_integrity_merge_bio(q, req, bio) == false)
+ if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
/*
@@ -539,12 +565,11 @@ static inline int ll_new_hw_segment(struct request_queue *q,
return 1;
no_merge:
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio)
+int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
if (req_gap_back_merge(req, bio))
return 0;
@@ -553,21 +578,15 @@ int ll_back_merge_fn(struct request_queue *q, struct request *req,
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, blk_rq_pos(req))) {
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
- if (!bio_flagged(req->biotail, BIO_SEG_VALID))
- blk_recount_segments(q, req->biotail);
- if (!bio_flagged(bio, BIO_SEG_VALID))
- blk_recount_segments(q, bio);
- return ll_new_hw_segment(q, req, bio);
+ return ll_new_hw_segment(req, bio, nr_segs);
}
-int ll_front_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio)
+int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
{
-
if (req_gap_front_merge(req, bio))
return 0;
if (blk_integrity_rq(req) &&
@@ -575,15 +594,11 @@ int ll_front_merge_fn(struct request_queue *q, struct request *req,
return 0;
if (blk_rq_sectors(req) + bio_sectors(bio) >
blk_rq_get_max_sectors(req, bio->bi_iter.bi_sector)) {
- req_set_nomerge(q, req);
+ req_set_nomerge(req->q, req);
return 0;
}
- if (!bio_flagged(bio, BIO_SEG_VALID))
- blk_recount_segments(q, bio);
- if (!bio_flagged(req->bio, BIO_SEG_VALID))
- blk_recount_segments(q, req->bio);
- return ll_new_hw_segment(q, req, bio);
+ return ll_new_hw_segment(req, bio, nr_segs);
}
static bool req_attempt_discard_merge(struct request_queue *q, struct request *req,
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index f945621a0e8f..0157f2b3485a 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -15,10 +15,10 @@
#include "blk.h"
#include "blk-mq.h"
-static int cpu_to_queue_index(struct blk_mq_queue_map *qmap,
- unsigned int nr_queues, const int cpu)
+static int queue_index(struct blk_mq_queue_map *qmap,
+ unsigned int nr_queues, const int q)
{
- return qmap->queue_offset + (cpu % nr_queues);
+ return qmap->queue_offset + (q % nr_queues);
}
static int get_first_sibling(unsigned int cpu)
@@ -36,21 +36,36 @@ int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
unsigned int *map = qmap->mq_map;
unsigned int nr_queues = qmap->nr_queues;
- unsigned int cpu, first_sibling;
+ unsigned int cpu, first_sibling, q = 0;
+
+ for_each_possible_cpu(cpu)
+ map[cpu] = -1;
+
+ /*
+ * Spread queues among present CPUs first, to minimize the number of
+ * dead queues that end up mapped only to CPUs which are not present.
+ */
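+ /*
+ * e.g. with 2 queues and only CPUs 0-1 present out of 0-3, CPUs 0
+ * and 1 take queues 0 and 1 here; the not-yet-present CPUs 2 and 3
+ * are mapped by the fallback loop below.
+ */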
+ for_each_present_cpu(cpu) {
+ if (q >= nr_queues)
+ break;
+ map[cpu] = queue_index(qmap, nr_queues, q++);
+ }
for_each_possible_cpu(cpu) {
+ if (map[cpu] != -1)
+ continue;
/*
* First do sequential mapping between CPUs and queues.
* In case we still have CPUs to map, and we have some number of
* threads per cores then map sibling threads to the same queue
* for performance optimizations.
*/
- if (cpu < nr_queues) {
- map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
+ if (q < nr_queues) {
+ map[cpu] = queue_index(qmap, nr_queues, q++);
} else {
first_sibling = get_first_sibling(cpu);
if (first_sibling == cpu)
- map[cpu] = cpu_to_queue_index(qmap, nr_queues, cpu);
+ map[cpu] = queue_index(qmap, nr_queues, q++);
else
map[cpu] = map[first_sibling];
}
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index 3afe327f816f..b3f2ba483992 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -17,7 +17,7 @@
static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
{
if (stat->nr_samples) {
- seq_printf(m, "samples=%d, mean=%lld, min=%llu, max=%llu",
+ seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
stat->nr_samples, stat->mean, stat->min, stat->max);
} else {
seq_puts(m, "samples=0");
@@ -29,13 +29,13 @@ static int queue_poll_stat_show(void *data, struct seq_file *m)
struct request_queue *q = data;
int bucket;
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS/2; bucket++) {
- seq_printf(m, "read (%d Bytes): ", 1 << (9+bucket));
- print_stat(m, &q->poll_stat[2*bucket]);
+ for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
+ seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
+ print_stat(m, &q->poll_stat[2 * bucket]);
seq_puts(m, "\n");
- seq_printf(m, "write (%d Bytes): ", 1 << (9+bucket));
- print_stat(m, &q->poll_stat[2*bucket+1]);
+ seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
+ print_stat(m, &q->poll_stat[2 * bucket + 1]);
seq_puts(m, "\n");
}
return 0;
@@ -261,23 +261,6 @@ static int hctx_flags_show(void *data, struct seq_file *m)
return 0;
}
-#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
-static const char *const op_name[] = {
- REQ_OP_NAME(READ),
- REQ_OP_NAME(WRITE),
- REQ_OP_NAME(FLUSH),
- REQ_OP_NAME(DISCARD),
- REQ_OP_NAME(SECURE_ERASE),
- REQ_OP_NAME(ZONE_RESET),
- REQ_OP_NAME(WRITE_SAME),
- REQ_OP_NAME(WRITE_ZEROES),
- REQ_OP_NAME(SCSI_IN),
- REQ_OP_NAME(SCSI_OUT),
- REQ_OP_NAME(DRV_IN),
- REQ_OP_NAME(DRV_OUT),
-};
-#undef REQ_OP_NAME
-
#define CMD_FLAG_NAME(name) [__REQ_##name] = #name
static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(FAILFAST_DEV),
@@ -341,13 +324,14 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
- const unsigned int op = rq->cmd_flags & REQ_OP_MASK;
+ const unsigned int op = req_op(rq);
+ const char *op_str = blk_op_str(op);
seq_printf(m, "%p {.op=", rq);
- if (op < ARRAY_SIZE(op_name) && op_name[op])
- seq_printf(m, "%s", op_name[op]);
+ if (strcmp(op_str, "UNKNOWN") == 0)
+ seq_printf(m, "%u", op);
else
- seq_printf(m, "%d", op);
+ seq_printf(m, "%s", op_str);
seq_puts(m, ", .cmd_flags=");
blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
ARRAY_SIZE(cmd_flag_name));
@@ -779,8 +763,8 @@ static int blk_mq_debugfs_release(struct inode *inode, struct file *file)
if (attr->show)
return single_release(inode, file);
- else
- return seq_release(inode, file);
+
+ return seq_release(inode, file);
}
static const struct file_operations blk_mq_debugfs_fops = {
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..ca22afd47b3d 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -224,7 +224,7 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- struct request **merged_request)
+ unsigned int nr_segs, struct request **merged_request)
{
struct request *rq;
@@ -232,7 +232,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_BACK_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_back_merge(q, rq, bio))
+ if (!bio_attempt_back_merge(rq, bio, nr_segs))
return false;
*merged_request = attempt_back_merge(q, rq);
if (!*merged_request)
@@ -241,7 +241,7 @@ bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
case ELEVATOR_FRONT_MERGE:
if (!blk_mq_sched_allow_merge(q, rq, bio))
return false;
- if (!bio_attempt_front_merge(q, rq, bio))
+ if (!bio_attempt_front_merge(rq, bio, nr_segs))
return false;
*merged_request = attempt_front_merge(q, rq);
if (!*merged_request)
@@ -260,7 +260,7 @@ EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
* of them.
*/
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio)
+ struct bio *bio, unsigned int nr_segs)
{
struct request *rq;
int checked = 8;
@@ -277,11 +277,13 @@ bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(q, rq, bio);
+ merged = bio_attempt_back_merge(rq, bio,
+ nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(q, rq, bio);
+ merged = bio_attempt_front_merge(rq, bio,
+ nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
@@ -304,13 +306,14 @@ EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
*/
static bool blk_mq_attempt_merge(struct request_queue *q,
struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct bio *bio)
+ struct blk_mq_ctx *ctx, struct bio *bio,
+ unsigned int nr_segs)
{
enum hctx_type type = hctx->type;
lockdep_assert_held(&ctx->lock);
- if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
+ if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
ctx->rq_merged++;
return true;
}
@@ -318,7 +321,8 @@ static bool blk_mq_attempt_merge(struct request_queue *q,
return false;
}
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
@@ -326,21 +330,18 @@ bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
bool ret = false;
enum hctx_type type;
- if (e && e->type->ops.bio_merge) {
- blk_mq_put_ctx(ctx);
- return e->type->ops.bio_merge(hctx, bio);
- }
+ if (e && e->type->ops.bio_merge)
+ return e->type->ops.bio_merge(hctx, bio, nr_segs);
type = hctx->type;
if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
!list_empty_careful(&ctx->rq_lists[type])) {
/* default per sw-queue merge */
spin_lock(&ctx->lock);
- ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
+ ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
spin_unlock(&ctx->lock);
}
- blk_mq_put_ctx(ctx);
return ret;
}
@@ -554,8 +555,6 @@ void blk_mq_sched_free_requests(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- lockdep_assert_held(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
if (hctx->sched_tags)
blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 3cf92cbbd8ac..126021fc3a11 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -12,8 +12,9 @@ void blk_mq_sched_assign_ioc(struct request *rq);
void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio);
+ unsigned int nr_segs, struct request **merged_request);
+bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs);
bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
@@ -31,12 +32,13 @@ void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
void blk_mq_sched_free_requests(struct request_queue *q);
static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
{
if (blk_queue_nomerges(q) || !bio_mergeable(bio))
return false;
- return __blk_mq_sched_bio_merge(q, bio);
+ return __blk_mq_sched_bio_merge(q, bio, nr_segs);
}
static inline bool
@@ -59,15 +61,6 @@ static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
e->type->ops.completed_request(rq, now);
}
-static inline void blk_mq_sched_started_request(struct request *rq)
-{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.started_request)
- e->type->ops.started_request(rq);
-}
-
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index d6e1a9bd7131..a09ab0c3d074 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -166,20 +166,25 @@ static ssize_t blk_mq_hw_sysfs_nr_reserved_tags_show(struct blk_mq_hw_ctx *hctx,
static ssize_t blk_mq_hw_sysfs_cpus_show(struct blk_mq_hw_ctx *hctx, char *page)
{
+ const size_t size = PAGE_SIZE - 1;
unsigned int i, first = 1;
- ssize_t ret = 0;
+ int ret = 0, pos = 0;
for_each_cpu(i, hctx->cpumask) {
if (first)
- ret += sprintf(ret + page, "%u", i);
+ ret = snprintf(pos + page, size - pos, "%u", i);
else
- ret += sprintf(ret + page, ", %u", i);
+ ret = snprintf(pos + page, size - pos, ", %u", i);
+
+ if (ret >= size - pos)
+ break;
first = 0;
+ pos += ret;
}
- ret += sprintf(ret + page, "\n");
- return ret;
+ ret = snprintf(pos + page, size + 1 - pos, "\n");
+ return pos + ret;
}
static struct blk_mq_hw_ctx_sysfs_entry blk_mq_hw_sysfs_nr_tags = {
@@ -270,7 +275,7 @@ void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i;
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_dir_lock);
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_unregister_hctx(hctx);
@@ -320,7 +325,7 @@ int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
int ret, i;
WARN_ON_ONCE(!q->kobj.parent);
- lockdep_assert_held(&q->sysfs_lock);
+ lockdep_assert_held(&q->sysfs_dir_lock);
ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
if (ret < 0)
@@ -349,23 +354,12 @@ unreg:
return ret;
}
-int blk_mq_register_dev(struct device *dev, struct request_queue *q)
-{
- int ret;
-
- mutex_lock(&q->sysfs_lock);
- ret = __blk_mq_register_dev(dev, q);
- mutex_unlock(&q->sysfs_lock);
-
- return ret;
-}
-
void blk_mq_sysfs_unregister(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
int i;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -373,7 +367,7 @@ void blk_mq_sysfs_unregister(struct request_queue *q)
blk_mq_unregister_hctx(hctx);
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
}
int blk_mq_sysfs_register(struct request_queue *q)
@@ -381,7 +375,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
struct blk_mq_hw_ctx *hctx;
int i, ret = 0;
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
goto unlock;
@@ -392,7 +386,7 @@ int blk_mq_sysfs_register(struct request_queue *q)
}
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
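
The blk_mq_hw_sysfs_cpus_show() change above replaces open-ended sprintf() accumulation with snprintf() bounded by the remaining space in the sysfs page, stopping as soon as the output would be truncated. The bounded-append loop is generic; the following standalone C sketch (hypothetical buffer size and CPU list, not kernel code) shows the same pattern, including the truncation check on the snprintf() return value.

#include <stdio.h>

/* Append a comma-separated list of ids into a fixed buffer without overflowing. */
static int format_cpu_list(char *page, size_t size, const unsigned int *cpus, int ncpus)
{
	int ret = 0, pos = 0, first = 1;

	for (int i = 0; i < ncpus; i++) {
		ret = snprintf(page + pos, size - pos,
			       first ? "%u" : ", %u", cpus[i]);
		if (ret >= (int)(size - pos))
			break;			/* would truncate: stop appending */
		first = 0;
		pos += ret;
	}
	ret = snprintf(page + pos, size + 1 - pos, "\n");
	return pos + ret;
}

int main(void)
{
	char page[32];				/* hypothetical; PAGE_SIZE - 1 in the kernel */
	const unsigned int cpus[] = { 0, 1, 2, 3, 8, 9, 10, 11 };
	int len = format_cpu_list(page, sizeof(page) - 1, cpus, 8);

	printf("%d bytes: %s", len, page);
	return 0;
}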
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index 7513c8eaabee..008388e82b5c 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -10,6 +10,7 @@
#include <linux/module.h>
#include <linux/blk-mq.h>
+#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"
@@ -113,7 +114,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
struct sbq_wait_state *ws;
DEFINE_SBQ_WAIT(wait);
unsigned int tag_offset;
- bool drop_ctx;
int tag;
if (data->flags & BLK_MQ_REQ_RESERVED) {
@@ -136,7 +136,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
return BLK_MQ_TAG_FAIL;
ws = bt_wait_ptr(bt, data->hctx);
- drop_ctx = data->ctx == NULL;
do {
struct sbitmap_queue *bt_prev;
@@ -161,9 +160,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
if (tag != -1)
break;
- if (data->ctx)
- blk_mq_put_ctx(data->ctx);
-
bt_prev = bt;
io_schedule();
@@ -189,9 +185,6 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
- if (drop_ctx && data->ctx)
- blk_mq_put_ctx(data->ctx);
-
sbitmap_finish_wait(bt, ws, &wait);
found_tag:
@@ -362,6 +355,37 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
+ void *data, bool reserved)
+{
+ unsigned *count = data;
+
+ if (blk_mq_request_completed(rq))
+ (*count)++;
+ return true;
+}
+
+/**
+ * blk_mq_tagset_wait_completed_request - wait until all completed requests'
+ * complete function is run
+ * @tagset: Tag set to drain completed requests
+ *
+ * Note: This function has to be run after all IO queues are shut down
+ */
+void blk_mq_tagset_wait_completed_request(struct blk_mq_tag_set *tagset)
+{
+ while (true) {
+ unsigned count = 0;
+
+ blk_mq_tagset_busy_iter(tagset,
+ blk_mq_tagset_count_completed_rqs, &count);
+ if (!count)
+ break;
+ msleep(5);
+ }
+}
+EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
+
/**
* blk_mq_queue_tag_busy_iter - iterate over all requests with a driver tag
* @q: Request queue to examine.
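
blk_mq_tagset_wait_completed_request(), added above, drains a tag set by repeatedly counting requests still in the MQ_RQ_COMPLETE state and sleeping 5 ms between passes until the count reaches zero. The count-then-sleep drain loop is shown below as a standalone user-space sketch; the still_in_flight() predicate and the simulated progress are stand-ins for blk_mq_request_completed() and the tag-set busy iterator, not kernel code.

#include <stdbool.h>
#include <stdio.h>
#include <unistd.h>

#define NR_REQS 4

static int pending = NR_REQS;	/* stand-in for completed requests whose ->complete hasn't run */

static bool still_in_flight(int tag)
{
	return tag < pending;	/* hypothetical per-tag check */
}

static void wait_for_completed_drain(void)
{
	while (true) {
		unsigned int count = 0;

		for (int tag = 0; tag < NR_REQS; tag++)	/* models the tag-set busy iterator */
			if (still_in_flight(tag))
				count++;
		if (!count)
			break;
		pending--;			/* simulate one request retiring per pass */
		usleep(5000);			/* msleep(5) in the kernel loop */
	}
	printf("tag set drained\n");
}

int main(void)
{
	wait_for_completed_drain();
	return 0;
}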
diff --git a/block/blk-mq.c b/block/blk-mq.c
index ce0f5f4ede70..ec791156e9cc 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -30,6 +30,7 @@
#include <trace/events/block.h>
#include <linux/blk-mq.h>
+#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
@@ -44,12 +45,12 @@ static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
static int blk_mq_poll_stats_bkt(const struct request *rq)
{
- int ddir, bytes, bucket;
+ int ddir, sectors, bucket;
ddir = rq_data_dir(rq);
- bytes = blk_rq_bytes(rq);
+ sectors = blk_rq_stats_sectors(rq);
- bucket = ddir + 2*(ilog2(bytes) - 9);
+ bucket = ddir + 2 * ilog2(sectors);
if (bucket < 0)
return -1;
@@ -282,16 +283,16 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
EXPORT_SYMBOL(blk_mq_can_queue);
/*
- * Only need start/end time stamping if we have stats enabled, or using
- * an IO scheduler.
+ * Only need start/end time stamping if we have iostat or
+ * blk stats enabled, or using an IO scheduler.
*/
static inline bool blk_mq_need_time_stamp(struct request *rq)
{
- return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator;
+ return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, unsigned int op)
+ unsigned int tag, unsigned int op, u64 alloc_time_ns)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
@@ -325,11 +326,15 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
RB_CLEAR_NODE(&rq->rb_node);
rq->rq_disk = NULL;
rq->part = NULL;
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+ rq->alloc_time_ns = alloc_time_ns;
+#endif
if (blk_mq_need_time_stamp(rq))
rq->start_time_ns = ktime_get_ns();
else
rq->start_time_ns = 0;
rq->io_start_time_ns = 0;
+ rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
@@ -355,13 +360,19 @@ static struct request *blk_mq_get_request(struct request_queue *q,
struct elevator_queue *e = q->elevator;
struct request *rq;
unsigned int tag;
- bool put_ctx_on_error = false;
+ bool clear_ctx_on_error = false;
+ u64 alloc_time_ns = 0;
blk_queue_enter_live(q);
+
+ /* alloc_time includes depth and tag waits */
+ if (blk_queue_rq_alloc_time(q))
+ alloc_time_ns = ktime_get_ns();
+
data->q = q;
if (likely(!data->ctx)) {
data->ctx = blk_mq_get_ctx(q);
- put_ctx_on_error = true;
+ clear_ctx_on_error = true;
}
if (likely(!data->hctx))
data->hctx = blk_mq_map_queue(q, data->cmd_flags,
@@ -387,15 +398,13 @@ static struct request *blk_mq_get_request(struct request_queue *q,
tag = blk_mq_get_tag(data);
if (tag == BLK_MQ_TAG_FAIL) {
- if (put_ctx_on_error) {
- blk_mq_put_ctx(data->ctx);
+ if (clear_ctx_on_error)
data->ctx = NULL;
- }
blk_queue_exit(q);
return NULL;
}
- rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags);
+ rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags, alloc_time_ns);
if (!op_is_flush(data->cmd_flags)) {
rq->elv.icq = NULL;
if (e && e->type->ops.prepare_request) {
@@ -427,8 +436,6 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
if (!rq)
return ERR_PTR(-EWOULDBLOCK);
- blk_mq_put_ctx(alloc_data.ctx);
-
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
@@ -656,32 +663,27 @@ bool blk_mq_complete_request(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_complete_request);
-void blk_mq_complete_request_sync(struct request *rq)
-{
- WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
- rq->q->mq_ops->complete(rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_complete_request_sync);
-
int blk_mq_request_started(struct request *rq)
{
return blk_mq_rq_state(rq) != MQ_RQ_IDLE;
}
EXPORT_SYMBOL_GPL(blk_mq_request_started);
+int blk_mq_request_completed(struct request *rq)
+{
+ return blk_mq_rq_state(rq) == MQ_RQ_COMPLETE;
+}
+EXPORT_SYMBOL_GPL(blk_mq_request_completed);
+
void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
- blk_mq_sched_started_request(rq);
-
trace_block_rq_issue(q, rq);
if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
rq->io_start_time_ns = ktime_get_ns();
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- rq->throtl_size = blk_rq_sectors(rq);
-#endif
+ rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
rq_qos_issue(q, rq);
}
@@ -699,6 +701,11 @@ void blk_mq_start_request(struct request *rq)
*/
rq->nr_phys_segments++;
}
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
+ q->integrity.profile->prepare_fn(rq);
+#endif
}
EXPORT_SYMBOL(blk_mq_start_request);
@@ -911,7 +918,10 @@ static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
*/
if (blk_mq_req_expired(rq, next))
blk_mq_rq_timed_out(rq, reserved);
- if (refcount_dec_and_test(&rq->ref))
+
+ if (is_flush_rq(rq, hctx))
+ rq->end_io(rq, 0);
+ else if (refcount_dec_and_test(&rq->ref))
__blk_mq_free_request(rq);
return true;
@@ -1764,9 +1774,15 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
}
}
-static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
+static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
+ unsigned int nr_segs)
{
- blk_init_request_from_bio(rq, bio);
+ if (bio->bi_opf & REQ_RAHEAD)
+ rq->cmd_flags |= REQ_FAILFAST_MASK;
+
+ rq->__sector = bio->bi_iter.bi_sector;
+ rq->write_hint = bio->bi_write_hint;
+ blk_rq_bio_prep(rq, bio, nr_segs);
blk_account_io_start(rq, true);
}
@@ -1936,20 +1952,20 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
struct request *rq;
struct blk_plug *plug;
struct request *same_queue_rq = NULL;
+ unsigned int nr_segs;
blk_qc_t cookie;
blk_queue_bounce(q, &bio);
-
- blk_queue_split(q, &bio);
+ __blk_queue_split(q, &bio, &nr_segs);
if (!bio_integrity_prep(bio))
return BLK_QC_T_NONE;
if (!is_flush_fua && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, &same_queue_rq))
+ blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
return BLK_QC_T_NONE;
- if (blk_mq_sched_bio_merge(q, bio))
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
return BLK_QC_T_NONE;
rq_qos_throttle(q, bio);
@@ -1969,25 +1985,25 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
cookie = request_to_qc_t(data.hctx, rq);
- plug = current->plug;
- if (unlikely(is_flush_fua)) {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
+ blk_mq_bio_to_request(rq, bio, nr_segs);
+ plug = blk_mq_plug(q, bio);
+ if (unlikely(is_flush_fua)) {
/* bypass scheduler for flush rq */
blk_insert_flush(rq);
blk_mq_run_hw_queue(data.hctx, true);
- } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) {
+ } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
+ !blk_queue_nonrot(q))) {
/*
* Use plugging if we have a ->commit_rqs() hook as well, as
* we know the driver uses bd->last in a smart fashion.
+ *
+ * Use normal plugging if this disk is a slow HDD, as sequential
+ * IO may benefit a lot from plug merging.
*/
unsigned int request_count = plug->rq_count;
struct request *last = NULL;
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
-
if (!request_count)
trace_block_plug(q);
else
@@ -2000,9 +2016,9 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
}
blk_add_rq_to_plug(plug, rq);
+ } else if (q->elevator) {
+ blk_mq_sched_insert_request(rq, false, true, true);
} else if (plug && !blk_queue_nomerges(q)) {
- blk_mq_bio_to_request(rq, bio);
-
/*
* We do limited plugging. If the bio can be merged, do that.
* Otherwise the existing request in the plug list will be
@@ -2019,22 +2035,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_add_rq_to_plug(plug, rq);
trace_block_plug(q);
- blk_mq_put_ctx(data.ctx);
-
if (same_queue_rq) {
data.hctx = same_queue_rq->mq_hctx;
trace_block_unplug(q, 1, true);
blk_mq_try_issue_directly(data.hctx, same_queue_rq,
&cookie);
}
- } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator &&
- !data.hctx->dispatch_busy)) {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
+ } else if ((q->nr_hw_queues > 1 && is_sync) ||
+ !data.hctx->dispatch_busy) {
blk_mq_try_issue_directly(data.hctx, rq, &cookie);
} else {
- blk_mq_put_ctx(data.ctx);
- blk_mq_bio_to_request(rq, bio);
blk_mq_sched_insert_request(rq, false, true, true);
}
@@ -2465,11 +2475,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
- /*
- * Avoid others reading imcomplete hctx->cpumask through sysfs
- */
- mutex_lock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
cpumask_clear(hctx->cpumask);
hctx->nr_ctx = 0;
@@ -2530,8 +2535,6 @@ static void blk_mq_map_swqueue(struct request_queue *q)
HCTX_TYPE_DEFAULT, i);
}
- mutex_unlock(&q->sysfs_lock);
-
queue_for_each_hw_ctx(q, hctx, i) {
/*
* If no software queues are mapped to this hardware queue,
@@ -2674,8 +2677,6 @@ void blk_mq_release(struct request_queue *q)
struct blk_mq_hw_ctx *hctx, *next;
int i;
- cancel_delayed_work_sync(&q->requeue_work);
-
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2702,7 +2703,11 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
if (!uninit_q)
return ERR_PTR(-ENOMEM);
- q = blk_mq_init_allocated_queue(set, uninit_q);
+ /*
+ * Initialize the queue without an elevator. device_add_disk() will do
+ * the initialization.
+ */
+ q = blk_mq_init_allocated_queue(set, uninit_q, false);
if (IS_ERR(q))
blk_cleanup_queue(uninit_q);
@@ -2853,7 +2858,8 @@ static unsigned int nr_hw_queues(struct blk_mq_tag_set *set)
}
struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q)
+ struct request_queue *q,
+ bool elevator_init)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
@@ -2915,18 +2921,14 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
- if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
- int ret;
-
- ret = elevator_init_mq(q);
- if (ret)
- return ERR_PTR(ret);
- }
+ if (elevator_init)
+ elevator_init_mq(q);
return q;
err_hctxs:
kfree(q->queue_hw_ctx);
+ q->nr_hw_queues = 0;
err_sys_init:
blk_mq_sysfs_deinit(q);
err_poll:
@@ -3425,15 +3427,14 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
kt = nsecs;
mode = HRTIMER_MODE_REL;
- hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
+ hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
hrtimer_set_expires(&hs.timer, kt);
- hrtimer_init_sleeper(&hs, current);
do {
if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
break;
set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_start_expires(&hs.timer, mode);
+ hrtimer_sleeper_start_expires(&hs, mode);
if (hs.task)
io_schedule();
hrtimer_cancel(&hs.timer);
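
Among the blk-mq.c changes above, the poll-stats bucket is now derived from blk_rq_stats_sectors() instead of blk_rq_bytes(), so the "- 9" (the 512-byte shift) drops out of the exponent: bucket = ddir + 2 * ilog2(sectors). The standalone check below uses hypothetical request sizes and reimplements ilog2() as floor(log2); it confirms the two forms agree for sector-aligned sizes.

#include <stdio.h>

/* floor(log2(x)) for x > 0, the same result the kernel's ilog2() gives */
static int ilog2(unsigned long x)
{
	int r = -1;

	while (x) {
		x >>= 1;
		r++;
	}
	return r;
}

int main(void)
{
	/* hypothetical request sizes in bytes; ddir 0 = read, 1 = write */
	const unsigned long bytes[] = { 4096, 65536, 1048576 };

	for (int ddir = 0; ddir <= 1; ddir++) {
		for (int i = 0; i < 3; i++) {
			unsigned long sectors = bytes[i] >> 9;
			int old_bucket = ddir + 2 * (ilog2(bytes[i]) - 9);
			int new_bucket = ddir + 2 * ilog2(sectors);

			printf("ddir=%d bytes=%7lu old=%2d new=%2d\n",
			       ddir, bytes[i], old_bucket, new_bucket);
		}
	}
	return 0;
}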
diff --git a/block/blk-mq.h b/block/blk-mq.h
index 633a5a77ee8b..32c62c64e6c2 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -151,12 +151,7 @@ static inline struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
*/
static inline struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
- return __blk_mq_get_ctx(q, get_cpu());
-}
-
-static inline void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
-{
- put_cpu();
+ return __blk_mq_get_ctx(q, raw_smp_processor_id());
}
struct blk_mq_alloc_data {
@@ -238,4 +233,36 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
qmap->mq_map[cpu] = 0;
}
+/*
+ * blk_mq_plug() - Get caller context plug
+ * @q: request queue
+ * @bio : the bio being submitted by the caller context
+ *
+ * Plugging, by design, may delay the insertion of BIOs into the elevator in
+ * order to increase BIO merging opportunities. This however can cause BIO
+ * insertion order to change from the order in which submit_bio() is being
+ * executed in the case of multiple contexts concurrently issuing BIOs to a
+ * device, even if these context are synchronized to tightly control BIO issuing
+ * order. While this is not a problem with regular block devices, this ordering
+ * change can cause write BIO failures with zoned block devices as these
+ * require sequential write patterns to zones. Prevent this from happening by
+ * ignoring the plug state of a BIO issuing context if the target request queue
+ * is for a zoned block device and the BIO to plug is a write operation.
+ *
+ * Return current->plug if the bio can be plugged and NULL otherwise
+ */
+static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
+ struct bio *bio)
+{
+ /*
+ * For regular block devices or read operations, use the context plug
+ * which may be NULL if blk_start_plug() was not executed.
+ */
+ if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
+ return current->plug;
+
+ /* Zoned block device write operation case: do not plug the BIO */
+ return NULL;
+}
+
#endif
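
blk_mq_plug() above bypasses the caller's plug only for write operations on zoned queues, so plugging can never reorder writes that must stay sequential within a zone. A minimal standalone model of that predicate (hypothetical types, not the kernel structures) is below.

#include <stdbool.h>
#include <stdio.h>

struct plug { int unused; };		/* stand-in for struct blk_plug */

/* models the blk_mq_plug() decision: zoned queue + write -> never plug */
static struct plug *pick_plug(struct plug *context_plug, bool queue_is_zoned,
			      bool op_is_write)
{
	if (!queue_is_zoned || !op_is_write)
		return context_plug;	/* regular device or read: keep the caller's plug */
	return NULL;			/* zoned write: bypass plugging to preserve order */
}

int main(void)
{
	struct plug p;

	printf("zoned write   -> %s\n", pick_plug(&p, true, true)  ? "plug" : "no plug");
	printf("zoned read    -> %s\n", pick_plug(&p, true, false) ? "plug" : "no plug");
	printf("regular write -> %s\n", pick_plug(&p, false, true) ? "plug" : "no plug");
	return 0;
}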
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 0a028c189897..1adc1cd748b4 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -207,10 +207,12 @@ EXPORT_SYMBOL(blk_post_runtime_resume);
*/
void blk_set_runtime_active(struct request_queue *q)
{
- spin_lock_irq(&q->queue_lock);
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- spin_unlock_irq(&q->queue_lock);
+ if (q->dev) {
+ spin_lock_irq(&q->queue_lock);
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
+ spin_unlock_irq(&q->queue_lock);
+ }
}
EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 659ccb8b693f..656460636ad3 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -83,6 +83,15 @@ void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
} while (rqos);
}
+void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio)
+{
+ do {
+ if (rqos->ops->merge)
+ rqos->ops->merge(rqos, rq, bio);
+ rqos = rqos->next;
+ } while (rqos);
+}
+
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
{
do {
@@ -92,6 +101,15 @@ void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio)
} while (rqos);
}
+void __rq_qos_queue_depth_changed(struct rq_qos *rqos)
+{
+ do {
+ if (rqos->ops->queue_depth_changed)
+ rqos->ops->queue_depth_changed(rqos);
+ rqos = rqos->next;
+ } while (rqos);
+}
+
/*
* Return true, if we can't increase the depth further by scaling
*/
@@ -142,24 +160,27 @@ bool rq_depth_calc_max_depth(struct rq_depth *rqd)
return ret;
}
-void rq_depth_scale_up(struct rq_depth *rqd)
+/* Returns true on success and false if scaling up wasn't possible */
+bool rq_depth_scale_up(struct rq_depth *rqd)
{
/*
* Hit max in previous round, stop here
*/
if (rqd->scaled_max)
- return;
+ return false;
rqd->scale_step--;
rqd->scaled_max = rq_depth_calc_max_depth(rqd);
+ return true;
}
/*
* Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
- * had a latency violation.
+ * had a latency violation. Returns true on success and returns false if
+ * scaling down wasn't possible.
*/
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
{
/*
* Stop scaling down when we've hit the limit. This also prevents
@@ -167,7 +188,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
* keep up.
*/
if (rqd->max_depth == 1)
- return;
+ return false;
if (rqd->scale_step < 0 && hard_throttle)
rqd->scale_step = 0;
@@ -176,6 +197,7 @@ void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle)
rqd->scaled_max = false;
rq_depth_calc_max_depth(rqd);
+ return true;
}
struct rq_qos_wait_data {
@@ -202,6 +224,7 @@ static int rq_qos_wake_function(struct wait_queue_entry *curr,
return -1;
data->got_token = true;
+ smp_wmb();
list_del_init(&curr->entry);
wake_up_process(data->task);
return 1;
@@ -244,7 +267,9 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
return;
prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
+ has_sleeper = !wq_has_single_sleeper(&rqw->wait);
do {
+ /* The memory barrier in set_task_state saves us here. */
if (data.got_token)
break;
if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) {
@@ -255,12 +280,14 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
* which means we now have two. Put our local token
* and wake anyone else potentially waiting for one.
*/
+ smp_rmb();
if (data.got_token)
cleanup_cb(rqw, private_data);
break;
}
io_schedule();
- has_sleeper = false;
+ has_sleeper = true;
+ set_current_state(TASK_UNINTERRUPTIBLE);
} while (1);
finish_wait(&rqw->wait, &data.wq);
}
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2300e038b9fa..2bc43e94f4c4 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -14,7 +14,8 @@ struct blk_mq_debugfs_attr;
enum rq_qos_id {
RQ_QOS_WBT,
- RQ_QOS_CGROUP,
+ RQ_QOS_LATENCY,
+ RQ_QOS_COST,
};
struct rq_wait {
@@ -35,11 +36,13 @@ struct rq_qos {
struct rq_qos_ops {
void (*throttle)(struct rq_qos *, struct bio *);
void (*track)(struct rq_qos *, struct request *, struct bio *);
+ void (*merge)(struct rq_qos *, struct request *, struct bio *);
void (*issue)(struct rq_qos *, struct request *);
void (*requeue)(struct rq_qos *, struct request *);
void (*done)(struct rq_qos *, struct request *);
void (*done_bio)(struct rq_qos *, struct bio *);
void (*cleanup)(struct rq_qos *, struct bio *);
+ void (*queue_depth_changed)(struct rq_qos *);
void (*exit)(struct rq_qos *);
const struct blk_mq_debugfs_attr *debugfs_attrs;
};
@@ -72,7 +75,7 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
{
- return rq_qos_id(q, RQ_QOS_CGROUP);
+ return rq_qos_id(q, RQ_QOS_LATENCY);
}
static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
@@ -80,8 +83,10 @@ static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
switch (id) {
case RQ_QOS_WBT:
return "wbt";
- case RQ_QOS_CGROUP:
- return "cgroup";
+ case RQ_QOS_LATENCY:
+ return "latency";
+ case RQ_QOS_COST:
+ return "cost";
}
return "unknown";
}
@@ -103,16 +108,13 @@ static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
{
- struct rq_qos *cur, *prev = NULL;
- for (cur = q->rq_qos; cur; cur = cur->next) {
- if (cur == rqos) {
- if (prev)
- prev->next = rqos->next;
- else
- q->rq_qos = cur;
+ struct rq_qos **cur;
+
+ for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+ if (*cur == rqos) {
+ *cur = rqos->next;
break;
}
- prev = cur;
}
blk_mq_debugfs_unregister_rqos(rqos);
@@ -125,8 +127,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
acquire_inflight_cb_t *acquire_inflight_cb,
cleanup_cb_t *cleanup_cb);
bool rq_wait_inc_below(struct rq_wait *rq_wait, unsigned int limit);
-void rq_depth_scale_up(struct rq_depth *rqd);
-void rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
+bool rq_depth_scale_up(struct rq_depth *rqd);
+bool rq_depth_scale_down(struct rq_depth *rqd, bool hard_throttle);
bool rq_depth_calc_max_depth(struct rq_depth *rqd);
void __rq_qos_cleanup(struct rq_qos *rqos, struct bio *bio);
@@ -135,7 +137,9 @@ void __rq_qos_issue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_requeue(struct rq_qos *rqos, struct request *rq);
void __rq_qos_throttle(struct rq_qos *rqos, struct bio *bio);
void __rq_qos_track(struct rq_qos *rqos, struct request *rq, struct bio *bio);
+void __rq_qos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio);
void __rq_qos_done_bio(struct rq_qos *rqos, struct bio *bio);
+void __rq_qos_queue_depth_changed(struct rq_qos *rqos);
static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
{
@@ -185,6 +189,19 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
__rq_qos_track(q->rq_qos, rq, bio);
}
+static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
+ struct bio *bio)
+{
+ if (q->rq_qos)
+ __rq_qos_merge(q->rq_qos, rq, bio);
+}
+
+static inline void rq_qos_queue_depth_changed(struct request_queue *q)
+{
+ if (q->rq_qos)
+ __rq_qos_queue_depth_changed(q->rq_qos);
+}
+
void rq_qos_exit(struct request_queue *);
#endif
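
The rq_qos_del() rewrite above walks the list with a pointer to the link itself; besides being shorter, it fixes the old head-removal path, which assigned q->rq_qos = cur (a no-op) instead of cur->next. The same pointer-to-pointer unlink technique on a plain singly linked list, as a standalone sketch:

#include <stdio.h>

struct node {
	int id;
	struct node *next;
};

/* Unlink victim from the list headed at *head; head removal needs no special case. */
static void list_del(struct node **head, struct node *victim)
{
	struct node **cur;

	for (cur = head; *cur; cur = &(*cur)->next) {
		if (*cur == victim) {
			*cur = victim->next;	/* works for head and interior nodes alike */
			break;
		}
	}
}

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	list_del(&head, &a);			/* remove the head element */
	for (struct node *n = head; n; n = n->next)
		printf("%d\n", n->id);		/* prints 2 then 3 */
	return 0;
}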
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 2ae348c101a0..5f6dcc7a47bd 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -12,6 +12,7 @@
#include <linux/lcm.h>
#include <linux/jiffies.h>
#include <linux/gfp.h>
+#include <linux/dma-mapping.h>
#include "blk.h"
#include "blk-wbt.h"
@@ -752,7 +753,8 @@ void blk_queue_virt_boundary(struct request_queue *q, unsigned long mask)
* page (which might not be idential to the Linux PAGE_SIZE). Because
* of that they are not limited by our notion of "segment size".
*/
- q->limits.max_segment_size = UINT_MAX;
+ if (mask)
+ q->limits.max_segment_size = UINT_MAX;
}
EXPORT_SYMBOL(blk_queue_virt_boundary);
@@ -804,7 +806,7 @@ EXPORT_SYMBOL(blk_queue_update_dma_alignment);
void blk_set_queue_depth(struct request_queue *q, unsigned int depth)
{
q->queue_depth = depth;
- wbt_set_queue_depth(q, depth);
+ rq_qos_queue_depth_changed(q);
}
EXPORT_SYMBOL(blk_set_queue_depth);
@@ -831,6 +833,44 @@ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
+/**
+ * blk_queue_required_elevator_features - Set a queue's required elevator features
+ * @q: the request queue for the target device
+ * @features: Required elevator features OR'ed together
+ *
+ * Tell the block layer that for the device controlled through @q, the only
+ * elevators that can be used are those that implement at least the set of
+ * features specified by @features.
+ */
+void blk_queue_required_elevator_features(struct request_queue *q,
+ unsigned int features)
+{
+ q->required_elevator_features = features;
+}
+EXPORT_SYMBOL_GPL(blk_queue_required_elevator_features);
+
+/**
+ * blk_queue_can_use_dma_map_merging - configure queue for merging segments.
+ * @q: the request queue for the device
+ * @dev: the device pointer for dma
+ *
+ * Tell the block layer about merging the segments by dma map of @q.
+ */
+bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
+ struct device *dev)
+{
+ unsigned long boundary = dma_get_merge_boundary(dev);
+
+ if (!boundary)
+ return false;
+
+ /* No need to update max_segment_size. see blk_queue_virt_boundary() */
+ blk_queue_virt_boundary(q, boundary);
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
+
static int __init blk_settings_init(void)
{
blk_max_low_pfn = max_low_pfn - 1;
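
Two of the blk-settings.c changes above work together: blk_queue_virt_boundary() now only lifts max_segment_size when the mask is nonzero, and blk_queue_can_use_dma_map_merging() applies the device's DMA merge boundary as that mask when one exists. The standalone model below mirrors both decisions with a toy queue_limits structure; the 0xfff boundary is an assumed example value, not taken from this diff.

#include <stdbool.h>
#include <stdio.h>

/* toy model of the queue limits touched by blk_queue_virt_boundary() */
struct queue_limits {
	unsigned long virt_boundary_mask;
	unsigned int max_segment_size;
};

/* mirrors the patched blk_queue_virt_boundary(): only a nonzero mask lifts the limit */
static void set_virt_boundary(struct queue_limits *lim, unsigned long mask)
{
	lim->virt_boundary_mask = mask;
	if (mask)
		lim->max_segment_size = 0xffffffffu;	/* UINT_MAX in the kernel */
}

/* mirrors blk_queue_can_use_dma_map_merging(): no DMA merge boundary, nothing to do */
static bool can_use_dma_map_merging(struct queue_limits *lim,
				    unsigned long dma_merge_boundary)
{
	if (!dma_merge_boundary)
		return false;
	set_virt_boundary(lim, dma_merge_boundary);
	return true;
}

int main(void)
{
	struct queue_limits lim = { 0, 65536 };

	/* assumed 4 KiB - 1 boundary; real values come from dma_get_merge_boundary() */
	if (can_use_dma_map_merging(&lim, 0xfff))
		printf("virt_boundary_mask=0x%lx max_segment_size=%u\n",
		       lim.virt_boundary_mask, lim.max_segment_size);
	return 0;
}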
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 977c659dcd18..46f5198be017 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -482,7 +482,6 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
blk_mq_quiesce_queue(q);
wbt_set_min_lat(q, val);
- wbt_update_limits(q);
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
@@ -892,6 +891,9 @@ static void __blk_release_queue(struct work_struct *work)
blk_free_queue_stats(q->stats);
+ if (queue_is_mq(q))
+ cancel_delayed_work_sync(&q->requeue_work);
+
blk_exit_queue(q);
blk_queue_free_zone_bitmaps(q);
@@ -938,14 +940,14 @@ int blk_register_queue(struct gendisk *disk)
int ret;
struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
+ bool has_elevator = false;
if (WARN_ON(!q))
return -ENXIO;
- WARN_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags),
+ WARN_ONCE(blk_queue_registered(q),
"%s is registering an already registered queue\n",
kobject_name(&dev->kobj));
- blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -965,8 +967,7 @@ int blk_register_queue(struct gendisk *disk)
if (ret)
return ret;
- /* Prevent changes through sysfs until registration is completed. */
- mutex_lock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
if (ret < 0) {
@@ -987,26 +988,33 @@ int blk_register_queue(struct gendisk *disk)
blk_mq_debugfs_register(q);
}
- kobject_uevent(&q->kobj, KOBJ_ADD);
-
- wbt_enable_default(q);
-
- blk_throtl_register_queue(q);
-
+ mutex_lock(&q->sysfs_lock);
if (q->elevator) {
- ret = elv_register_queue(q);
+ ret = elv_register_queue(q, false);
if (ret) {
mutex_unlock(&q->sysfs_lock);
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
+ mutex_unlock(&q->sysfs_dir_lock);
kobject_del(&q->kobj);
blk_trace_remove_sysfs(dev);
kobject_put(&dev->kobj);
return ret;
}
+ has_elevator = true;
}
+
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(q);
+ blk_throtl_register_queue(q);
+
+ /* Now everything is ready; send out the KOBJ_ADD uevent */
+ kobject_uevent(&q->kobj, KOBJ_ADD);
+ if (has_elevator)
+ kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
+ mutex_unlock(&q->sysfs_lock);
+
ret = 0;
unlock:
- mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
EXPORT_SYMBOL_GPL(blk_register_queue);
@@ -1026,7 +1034,7 @@ void blk_unregister_queue(struct gendisk *disk)
return;
/* Return early if disk->queue was never registered. */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return;
/*
@@ -1035,16 +1043,16 @@ void blk_unregister_queue(struct gendisk *disk)
* concurrent elv_iosched_store() calls.
*/
mutex_lock(&q->sysfs_lock);
-
blk_queue_flag_clear(QUEUE_FLAG_REGISTERED, q);
+ mutex_unlock(&q->sysfs_lock);
+ mutex_lock(&q->sysfs_dir_lock);
/*
* Remove the sysfs attributes before unregistering the queue data
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
blk_mq_unregister_dev(disk_to_dev(disk), q);
- mutex_unlock(&q->sysfs_lock);
kobject_uevent(&q->kobj, KOBJ_REMOVE);
kobject_del(&q->kobj);
@@ -1054,6 +1062,7 @@ void blk_unregister_queue(struct gendisk *disk)
if (q->elevator)
elv_unregister_queue(q);
mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
kobject_put(&disk_to_dev(disk)->kobj);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 9ea7c0ecad10..18f773e52dfb 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -478,12 +478,14 @@ static void throtl_service_queue_init(struct throtl_service_queue *sq)
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
-static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
+static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
+ struct request_queue *q,
+ struct blkcg *blkcg)
{
struct throtl_grp *tg;
int rw;
- tg = kzalloc_node(sizeof(*tg), gfp, node);
+ tg = kzalloc_node(sizeof(*tg), gfp, q->node);
if (!tg)
return NULL;
@@ -881,13 +883,10 @@ static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
u64 tmp;
- jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
-
- /* Slice has just started. Consider one slice interval */
- if (!jiffy_elapsed)
- jiffy_elapsed_rnd = tg->td->throtl_slice;
+ jiffy_elapsed = jiffies - tg->slice_start[rw];
- jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
+ /* Round up to the next throttle slice, wait time must be nonzero */
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
/*
* jiffy_elapsed_rnd should not be a big value as minimum iops can be
@@ -2249,7 +2248,8 @@ void blk_throtl_stat_add(struct request *rq, u64 time_ns)
struct request_queue *q = rq->q;
struct throtl_data *td = q->td;
- throtl_track_latency(td, rq->throtl_size, req_op(rq), time_ns >> 10);
+ throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
+ time_ns >> 10);
}
void blk_throtl_bio_endio(struct bio *bio)
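
The tg_with_in_iops_limit() change above drops the special case for a just-started slice and instead rounds up jiffy_elapsed + 1, so an elapsed time that is zero or an exact multiple of throtl_slice is pushed to the next slice boundary and the computed wait can never be zero. A quick standalone check of that property (the slice length here is arbitrary):

#include <stdio.h>

/* same semantics as the kernel's roundup() for positive integers */
#define roundup(x, y)	((((x) + (y) - 1) / (y)) * (y))

int main(void)
{
	const unsigned long throtl_slice = 25;	/* arbitrary slice length in jiffies */
	const unsigned long samples[] = { 0, 1, 24, 25, 26, 50 };

	for (int i = 0; i < 6; i++) {
		unsigned long elapsed = samples[i];
		unsigned long old_way = roundup(elapsed ? elapsed : throtl_slice,
						throtl_slice);
		unsigned long new_way = roundup(elapsed + 1, throtl_slice);

		printf("elapsed=%2lu old=%2lu new=%2lu\n", elapsed, old_way, new_way);
	}
	return 0;
}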
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 313f45a37e9d..8641ba9793c5 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -308,7 +308,8 @@ static void calc_wb_limits(struct rq_wb *rwb)
static void scale_up(struct rq_wb *rwb)
{
- rq_depth_scale_up(&rwb->rq_depth);
+ if (!rq_depth_scale_up(&rwb->rq_depth))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
rwb_wake_all(rwb);
@@ -317,7 +318,8 @@ static void scale_up(struct rq_wb *rwb)
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
- rq_depth_scale_down(&rwb->rq_depth, hard_throttle);
+ if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
+ return;
calc_wb_limits(rwb);
rwb->unknown_cnt = 0;
rwb_trace_step(rwb, "scale down");
@@ -629,15 +631,6 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos) {
- RQWB(rqos)->rq_depth.queue_depth = depth;
- __wbt_update_limits(RQWB(rqos));
- }
-}
-
void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -656,7 +649,7 @@ void wbt_enable_default(struct request_queue *q)
return;
/* Queue not registered? Maybe shutting down... */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return;
if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
@@ -689,6 +682,12 @@ static int wbt_data_dir(const struct request *rq)
return -1;
}
+static void wbt_queue_depth_changed(struct rq_qos *rqos)
+{
+ RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
+ __wbt_update_limits(RQWB(rqos));
+}
+
static void wbt_exit(struct rq_qos *rqos)
{
struct rq_wb *rwb = RQWB(rqos);
@@ -811,6 +810,7 @@ static struct rq_qos_ops wbt_rqos_ops = {
.requeue = wbt_requeue,
.done = wbt_done,
.cleanup = wbt_cleanup,
+ .queue_depth_changed = wbt_queue_depth_changed,
.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
.debugfs_attrs = wbt_debugfs_attrs,
@@ -853,7 +853,7 @@ int wbt_init(struct request_queue *q)
rwb->min_lat_nsec = wbt_default_latency_nsec(q);
- wbt_set_queue_depth(q, blk_queue_depth(q));
+ wbt_queue_depth_changed(&rwb->rqos);
wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
return 0;
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index f47218d5b3b2..8e4e37660971 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -95,7 +95,6 @@ void wbt_enable_default(struct request_queue *);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
-void wbt_set_queue_depth(struct request_queue *, unsigned int);
void wbt_set_write_cache(struct request_queue *, bool);
u64 wbt_default_latency_nsec(struct request_queue *);
@@ -118,9 +117,6 @@ static inline void wbt_disable_default(struct request_queue *q)
static inline void wbt_enable_default(struct request_queue *q)
{
}
-static inline void wbt_set_queue_depth(struct request_queue *q, unsigned int depth)
-{
-}
static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
{
}
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index ae7e91bd0618..4bc5f260248a 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -14,6 +14,9 @@
#include <linux/rbtree.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/sched/mm.h>
#include "blk.h"
@@ -70,7 +73,7 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
static inline unsigned int __blkdev_nr_zones(struct request_queue *q,
sector_t nr_sectors)
{
- unsigned long zone_sectors = blk_queue_zone_sectors(q);
+ sector_t zone_sectors = blk_queue_zone_sectors(q);
return (nr_sectors + zone_sectors - 1) >> ilog2(zone_sectors);
}
@@ -117,8 +120,7 @@ static bool blkdev_report_zone(struct block_device *bdev, struct blk_zone *rep)
}
static int blk_report_zones(struct gendisk *disk, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct request_queue *q = disk->queue;
unsigned int z = 0, n, nrz = *nr_zones;
@@ -127,8 +129,7 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
while (z < nrz && sector < capacity) {
n = nrz - z;
- ret = disk->fops->report_zones(disk, sector, &zones[z], &n,
- gfp_mask);
+ ret = disk->fops->report_zones(disk, sector, &zones[z], &n);
if (ret)
return ret;
if (!n)
@@ -149,17 +150,18 @@ static int blk_report_zones(struct gendisk *disk, sector_t sector,
* @sector: Sector from which to report zones
* @zones: Array of zone structures where to return the zones information
* @nr_zones: Number of zone structures in the zone array
- * @gfp_mask: Memory allocation flags (for bio_alloc)
*
* Description:
* Get zone information starting from the zone containing @sector.
* The number of zone information reported may be less than the number
* requested by @nr_zones. The number of zones actually reported is
* returned in @nr_zones.
+ * The caller must use memalloc_noXX_save/restore() calls to control
+ * memory allocations done within this function (zone array and command
+ * buffer allocation by the device driver).
*/
int blkdev_report_zones(struct block_device *bdev, sector_t sector,
- struct blk_zone *zones, unsigned int *nr_zones,
- gfp_t gfp_mask)
+ struct blk_zone *zones, unsigned int *nr_zones)
{
struct request_queue *q = bdev_get_queue(bdev);
unsigned int i, nrz;
@@ -184,7 +186,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
nrz = min(*nr_zones,
__blkdev_nr_zones(q, bdev->bd_part->nr_sects - sector));
ret = blk_report_zones(bdev->bd_disk, get_start_sect(bdev) + sector,
- zones, &nrz, gfp_mask);
+ zones, &nrz);
if (ret)
return ret;
@@ -200,6 +202,42 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
+/*
+ * Special case of zone reset operation to reset all zones in one command,
+ * useful for applications like mkfs.
+ */
+static int __blkdev_reset_all_zones(struct block_device *bdev, gfp_t gfp_mask)
+{
+ struct bio *bio = bio_alloc(gfp_mask, 0);
+ int ret;
+
+ /* the reset-all operation spans every zone, so no sectors are needed */
+ bio_set_dev(bio, bdev);
+ bio_set_op_attrs(bio, REQ_OP_ZONE_RESET_ALL, 0);
+
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+
+ return ret;
+}
+
+static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
+ sector_t nr_sectors)
+{
+ if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
+ return false;
+
+ if (nr_sectors != part_nr_sects_read(bdev->bd_part))
+ return false;
+ /*
+ * REQ_OP_ZONE_RESET_ALL can be executed only if the block device is
+ * the entire disk, that is, if the block device's start offset is 0 and
+ * its capacity is the same as the entire disk.
+ */
+ return get_start_sect(bdev) == 0 &&
+ part_nr_sects_read(bdev->bd_part) == get_capacity(bdev->bd_disk);
+}
+
/**
* blkdev_reset_zones - Reset zones write pointer
* @bdev: Target block device
@@ -233,6 +271,9 @@ int blkdev_reset_zones(struct block_device *bdev,
/* Out of range */
return -EINVAL;
+ if (blkdev_allow_reset_all_zones(bdev, nr_sectors))
+ return __blkdev_reset_all_zones(bdev, gfp_mask);
+
/* Check alignment (handle eventual smaller last zone) */
zone_sectors = blk_queue_zone_sectors(q);
if (sector & (zone_sectors - 1))
@@ -305,9 +346,7 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
if (!zones)
return -ENOMEM;
- ret = blkdev_report_zones(bdev, rep.sector,
- zones, &rep.nr_zones,
- GFP_KERNEL);
+ ret = blkdev_report_zones(bdev, rep.sector, zones, &rep.nr_zones);
if (ret)
goto out;
@@ -373,22 +412,25 @@ static inline unsigned long *blk_alloc_zone_bitmap(int node,
* Allocate an array of struct blk_zone to get nr_zones zone information.
* The allocated array may be smaller than nr_zones.
*/
-static struct blk_zone *blk_alloc_zones(int node, unsigned int *nr_zones)
+static struct blk_zone *blk_alloc_zones(unsigned int *nr_zones)
{
- size_t size = *nr_zones * sizeof(struct blk_zone);
- struct page *page;
- int order;
-
- for (order = get_order(size); order >= 0; order--) {
- page = alloc_pages_node(node, GFP_NOIO | __GFP_ZERO, order);
- if (page) {
- *nr_zones = min_t(unsigned int, *nr_zones,
- (PAGE_SIZE << order) / sizeof(struct blk_zone));
- return page_address(page);
- }
+ struct blk_zone *zones;
+ size_t nrz = min(*nr_zones, BLK_ZONED_REPORT_MAX_ZONES);
+
+ /*
+ * GFP_KERNEL here is meaningless as the caller task context has
+ * the PF_MEMALLOC_NOIO flag set in blk_revalidate_disk_zones()
+ * with memalloc_noio_save().
+ */
+ zones = kvcalloc(nrz, sizeof(struct blk_zone), GFP_KERNEL);
+ if (!zones) {
+ *nr_zones = 0;
+ return NULL;
}
- return NULL;
+ *nr_zones = nrz;
+
+ return zones;
}
void blk_queue_free_zone_bitmaps(struct request_queue *q)
@@ -415,6 +457,7 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
unsigned long *seq_zones_wlock = NULL, *seq_zones_bitmap = NULL;
unsigned int i, rep_nr_zones = 0, z = 0, nrz;
struct blk_zone *zones = NULL;
+ unsigned int noio_flag;
sector_t sector = 0;
int ret = 0;
@@ -427,6 +470,12 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
return 0;
}
+ /*
+ * Ensure that all memory allocations in this context are done as
+ * if GFP_NOIO was specified.
+ */
+ noio_flag = memalloc_noio_save();
+
if (!blk_queue_is_zoned(q) || !nr_zones) {
nr_zones = 0;
goto update;
@@ -443,13 +492,13 @@ int blk_revalidate_disk_zones(struct gendisk *disk)
/* Get zone information and initialize seq_zones_bitmap */
rep_nr_zones = nr_zones;
- zones = blk_alloc_zones(q->node, &rep_nr_zones);
+ zones = blk_alloc_zones(&rep_nr_zones);
if (!zones)
goto out;
while (z < nr_zones) {
nrz = min(nr_zones - z, rep_nr_zones);
- ret = blk_report_zones(disk, sector, zones, &nrz, GFP_NOIO);
+ ret = blk_report_zones(disk, sector, zones, &nrz);
if (ret)
goto out;
if (!nrz)
@@ -480,8 +529,9 @@ update:
blk_mq_unfreeze_queue(q);
out:
- free_pages((unsigned long)zones,
- get_order(rep_nr_zones * sizeof(struct blk_zone)));
+ memalloc_noio_restore(noio_flag);
+
+ kvfree(zones);
kfree(seq_zones_wlock);
kfree(seq_zones_bitmap);
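
__blkdev_nr_zones() above computes ceil(nr_sectors / zone_sectors) with an add-and-shift, which relies on zone_sectors being a power of two; the local variable is also widened to sector_t to match the sector arithmetic. A standalone arithmetic check of the rounding (zone size chosen arbitrarily):

#include <stdio.h>
#include <stdint.h>

/* ceil(nr_sectors / zone_sectors), assuming zone_sectors is a power of two */
static uint64_t nr_zones(uint64_t nr_sectors, uint64_t zone_sectors)
{
	unsigned int shift = 0;

	while ((1ULL << shift) < zone_sectors)
		shift++;
	return (nr_sectors + zone_sectors - 1) >> shift;
}

int main(void)
{
	uint64_t zone_sectors = 524288;		/* 256 MiB zones, in 512-byte sectors */

	printf("%llu\n", (unsigned long long)nr_zones(1, zone_sectors));	/* 1 */
	printf("%llu\n", (unsigned long long)nr_zones(524288, zone_sectors));	/* 1 */
	printf("%llu\n", (unsigned long long)nr_zones(524289, zone_sectors));	/* 2 */
	return 0;
}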
diff --git a/block/blk.h b/block/blk.h
index 7814aa207153..ffea1691470e 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -19,6 +19,7 @@ struct blk_flush_queue {
unsigned int flush_queue_delayed:1;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
+ blk_status_t rq_status;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
struct list_head flush_data_in_flight;
@@ -29,6 +30,7 @@ struct blk_flush_queue {
* at the same time
*/
struct request *orig_rq;
+ struct lock_class_key key;
spinlock_t mq_flush_lock;
};
@@ -47,12 +49,16 @@ static inline void __blk_get_queue(struct request_queue *q)
kobject_get(&q->kobj);
}
+static inline bool
+is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
+{
+ return hctx->fq->flush_rq == req;
+}
+
struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q,
int node, int cmd_size, gfp_t flags);
void blk_free_flush_queue(struct blk_flush_queue *q);
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
- struct bio *bio);
void blk_freeze_queue(struct request_queue *q);
static inline void blk_queue_enter_live(struct request_queue *q)
@@ -101,6 +107,18 @@ static inline bool bvec_gap_to_prev(struct request_queue *q,
return __bvec_gap_to_prev(q, bprv, offset);
}
+static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
+ unsigned int nr_segs)
+{
+ rq->nr_phys_segments = nr_segs;
+ rq->__data_len = bio->bi_iter.bi_size;
+ rq->bio = rq->biotail = bio;
+ rq->ioprio = bio_prio(bio);
+
+ if (bio->bi_disk)
+ rq->rq_disk = bio->bi_disk;
+}
+
#ifdef CONFIG_BLK_DEV_INTEGRITY
void blk_flush_integrity(void);
bool __bio_integrity_endio(struct bio *);
@@ -154,14 +172,14 @@ static inline bool bio_integrity_endio(struct bio *bio)
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- struct request **same_queue_rq);
+ unsigned int nr_segs, struct request **same_queue_rq);
void blk_account_io_start(struct request *req, bool new_io);
void blk_account_io_completion(struct request *req, unsigned int bytes);
@@ -174,16 +192,18 @@ void blk_account_io_done(struct request *req, u64 now);
void blk_insert_flush(struct request *rq);
-int elevator_init_mq(struct request_queue *q);
+void elevator_init_mq(struct request_queue *q);
int elevator_switch_mq(struct request_queue *q,
struct elevator_type *new_e);
void __elevator_exit(struct request_queue *, struct elevator_queue *);
-int elv_register_queue(struct request_queue *q);
+int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
static inline void elevator_exit(struct request_queue *q,
struct elevator_queue *e)
{
+ lockdep_assert_held(&q->sysfs_lock);
+
blk_mq_sched_free_requests(q);
__elevator_exit(q, e);
}
@@ -202,15 +222,17 @@ static inline int blk_should_fake_timeout(struct request_queue *q)
}
#endif
-int ll_back_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio);
-int ll_front_merge_fn(struct request_queue *q, struct request *req,
- struct bio *bio);
+void __blk_queue_split(struct request_queue *q, struct bio **bio,
+ unsigned int *nr_segs);
+int ll_back_merge_fn(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
+int ll_front_merge_fn(struct request *req, struct bio *bio,
+ unsigned int nr_segs);
struct request *attempt_back_merge(struct request_queue *q, struct request *rq);
struct request *attempt_front_merge(struct request_queue *q, struct request *rq);
int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
-void blk_recalc_rq_segments(struct request *rq);
+unsigned int blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 785dd58947f1..347dda16c2f4 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -266,6 +266,7 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
struct request *req = bd->rq;
struct bsg_set *bset =
container_of(q->tag_set, struct bsg_set, tag_set);
+ int sts = BLK_STS_IOERR;
int ret;
blk_mq_start_request(req);
@@ -274,14 +275,15 @@ static blk_status_t bsg_queue_rq(struct blk_mq_hw_ctx *hctx,
return BLK_STS_IOERR;
if (!bsg_prepare_job(dev, req))
- return BLK_STS_IOERR;
+ goto out;
ret = bset->job_fn(blk_mq_rq_to_pdu(req));
- if (ret)
- return BLK_STS_IOERR;
+ if (!ret)
+ sts = BLK_STS_OK;
+out:
put_device(dev);
- return BLK_STS_OK;
+ return sts;
}
/* called right after the request is allocated for the request_queue */
diff --git a/block/compat_ioctl.c b/block/compat_ioctl.c
index 6ca015f92766..7f053468b50d 100644
--- a/block/compat_ioctl.c
+++ b/block/compat_ioctl.c
@@ -6,6 +6,7 @@
#include <linux/compat.h>
#include <linux/elevator.h>
#include <linux/hdreg.h>
+#include <linux/pr.h>
#include <linux/slab.h>
#include <linux/syscalls.h>
#include <linux/types.h>
@@ -354,6 +355,10 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
* but we call blkdev_ioctl, which gets the lock for us
*/
case BLKRRPART:
+ case BLKREPORTZONE:
+ case BLKRESETZONE:
+ case BLKGETZONESZ:
+ case BLKGETNRZONES:
return blkdev_ioctl(bdev, mode, cmd,
(unsigned long)compat_ptr(arg));
case BLKBSZSET_32:
@@ -401,6 +406,14 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKTRACETEARDOWN: /* compatible */
ret = blk_trace_ioctl(bdev, cmd, compat_ptr(arg));
return ret;
+ case IOC_PR_REGISTER:
+ case IOC_PR_RESERVE:
+ case IOC_PR_RELEASE:
+ case IOC_PR_PREEMPT:
+ case IOC_PR_PREEMPT_ABORT:
+ case IOC_PR_CLEAR:
+ return blkdev_ioctl(bdev, mode, cmd,
+ (unsigned long)compat_ptr(arg));
default:
if (disk->fops->compat_ioctl)
ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg);
diff --git a/block/elevator.c b/block/elevator.c
index 2f17d66d0e61..076ba7308e65 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -83,8 +83,26 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static bool elevator_match(const struct elevator_type *e, const char *name)
+static inline bool elv_support_features(unsigned int elv_features,
+ unsigned int required_features)
{
+ return (required_features & elv_features) == required_features;
+}
+
+/**
+ * elevator_match - Test an elevator name and features
+ * @e: Scheduler to test
+ * @name: Elevator name to test
+ * @required_features: Features that the elevator must provide
+ *
+ * Return true if the elevator @e name matches @name and if @e provides all
+ * the features specified by @required_features.
+ */
+static bool elevator_match(const struct elevator_type *e, const char *name,
+ unsigned int required_features)
+{
+ if (!elv_support_features(e->elevator_features, required_features))
+ return false;
if (!strcmp(e->elevator_name, name))
return true;
if (e->elevator_alias && !strcmp(e->elevator_alias, name))
@@ -93,15 +111,21 @@ static bool elevator_match(const struct elevator_type *e, const char *name)
return false;
}
-/*
- * Return scheduler with name 'name'
+/**
+ * elevator_find - Find an elevator
+ * @name: Name of the elevator to find
+ * @required_features: Features that the elevator must provide
+ *
+ * Return the first registered scheduler with name @name that supports the
+ * features @required_features, or NULL if none is found.
*/
-static struct elevator_type *elevator_find(const char *name)
+static struct elevator_type *elevator_find(const char *name,
+ unsigned int required_features)
{
struct elevator_type *e;
list_for_each_entry(e, &elv_list, list) {
- if (elevator_match(e, name))
+ if (elevator_match(e, name, required_features))
return e;
}
@@ -120,12 +144,12 @@ static struct elevator_type *elevator_get(struct request_queue *q,
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->required_elevator_features);
if (!e && try_loading) {
spin_unlock(&elv_list_lock);
request_module("%s-iosched", name);
spin_lock(&elv_list_lock);
- e = elevator_find(name);
+ e = elevator_find(name, q->required_elevator_features);
}
if (e && !try_module_get(e->elevator_owner))
@@ -135,20 +159,6 @@ static struct elevator_type *elevator_get(struct request_queue *q,
return e;
}
-static char chosen_elevator[ELV_NAME_MAX];
-
-static int __init elevator_setup(char *str)
-{
- /*
- * Be backwards-compatible with previous kernels, so users
- * won't get the wrong elevator.
- */
- strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
- return 1;
-}
-
-__setup("elevator=", elevator_setup);
-
static struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
@@ -470,13 +480,16 @@ static struct kobj_type elv_ktype = {
.release = elevator_release,
};
-int elv_register_queue(struct request_queue *q)
+/*
+ * elv_register_queue is called from either blk_register_queue or
+ * elevator_switch, and an elevator switch is prevented from happening
+ * in both paths, so it is safe to not hold q->sysfs_lock.
+ */
+int elv_register_queue(struct request_queue *q, bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
- lockdep_assert_held(&q->sysfs_lock);
-
error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
@@ -487,21 +500,27 @@ int elv_register_queue(struct request_queue *q)
attr++;
}
}
- kobject_uevent(&e->kobj, KOBJ_ADD);
+ if (uevent)
+ kobject_uevent(&e->kobj, KOBJ_ADD);
+
e->registered = 1;
}
return error;
}
+/*
+ * elv_unregister_queue is called from either blk_unregister_queue or
+ * elevator_switch, and an elevator switch is prevented from happening
+ * in both paths, so it is safe to not hold q->sysfs_lock.
+ */
void elv_unregister_queue(struct request_queue *q)
{
- lockdep_assert_held(&q->sysfs_lock);
-
if (q) {
struct elevator_queue *e = q->elevator;
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
+
e->registered = 0;
/* Re-enable throttling in case elevator disabled it */
wbt_enable_default(q);
@@ -526,7 +545,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name)) {
+ if (elevator_find(e->elevator_name, 0)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -569,6 +588,7 @@ int elevator_switch_mq(struct request_queue *q,
if (q->elevator) {
if (q->elevator->registered)
elv_unregister_queue(q);
+
ioc_clear_queue(q);
elevator_exit(q, q->elevator);
}
@@ -578,7 +598,7 @@ int elevator_switch_mq(struct request_queue *q,
goto out;
if (new_e) {
- ret = elv_register_queue(q);
+ ret = elv_register_queue(q, true);
if (ret) {
elevator_exit(q, q->elevator);
goto out;
@@ -594,37 +614,90 @@ out:
return ret;
}
+static inline bool elv_support_iosched(struct request_queue *q)
+{
+ if (!q->mq_ops ||
+ (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
+ return false;
+ return true;
+}
+
+/*
+ * For single queue devices, default to using mq-deadline. If we have multiple
+ * queues or mq-deadline is not available, default to "none".
+ */
+static struct elevator_type *elevator_get_default(struct request_queue *q)
+{
+ if (q->nr_hw_queues != 1)
+ return NULL;
+
+ return elevator_get(q, "mq-deadline", false);
+}
+
+/*
+ * Get the first elevator providing the features required by the request queue.
+ * Default to "none" if no matching elevator is found.
+ */
+static struct elevator_type *elevator_get_by_features(struct request_queue *q)
+{
+ struct elevator_type *e, *found = NULL;
+
+ spin_lock(&elv_list_lock);
+
+ list_for_each_entry(e, &elv_list, list) {
+ if (elv_support_features(e->elevator_features,
+ q->required_elevator_features)) {
+ found = e;
+ break;
+ }
+ }
+
+ if (found && !try_module_get(found->elevator_owner))
+ found = NULL;
+
+ spin_unlock(&elv_list_lock);
+ return found;
+}
+
/*
- * For blk-mq devices, we default to using mq-deadline, if available, for single
- * queue devices. If deadline isn't available OR we have multiple queues,
- * default to "none".
+ * For a device queue that has no required features, use the default elevator
+ * settings. Otherwise, use the first elevator available matching the required
+ * features. If no suitable elevator is found or if the chosen elevator
+ * initialization fails, fall back to the "none" elevator (no elevator).
*/
-int elevator_init_mq(struct request_queue *q)
+void elevator_init_mq(struct request_queue *q)
{
struct elevator_type *e;
- int err = 0;
+ int err;
- if (q->nr_hw_queues != 1)
- return 0;
+ if (!elv_support_iosched(q))
+ return;
+
+ WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
- /*
- * q->sysfs_lock must be held to provide mutual exclusion between
- * elevator_switch() and here.
- */
- mutex_lock(&q->sysfs_lock);
if (unlikely(q->elevator))
- goto out_unlock;
+ return;
- e = elevator_get(q, "mq-deadline", false);
+ if (!q->required_elevator_features)
+ e = elevator_get_default(q);
+ else
+ e = elevator_get_by_features(q);
if (!e)
- goto out_unlock;
+ return;
+
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
err = blk_mq_init_sched(q, e);
- if (err)
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
+ if (err) {
+ pr_warn("\"%s\" elevator initialization failed, "
+ "falling back to \"none\"\n", e->elevator_name);
elevator_put(e);
-out_unlock:
- mutex_unlock(&q->sysfs_lock);
- return err;
+ }
}
@@ -660,7 +733,7 @@ static int __elevator_change(struct request_queue *q, const char *name)
struct elevator_type *e;
/* Make sure queue is not in the middle of being removed */
- if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
+ if (!blk_queue_registered(q))
return -ENOENT;
/*
@@ -677,7 +750,8 @@ static int __elevator_change(struct request_queue *q, const char *name)
if (!e)
return -EINVAL;
- if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
+ if (q->elevator &&
+ elevator_match(q->elevator->type, elevator_name, 0)) {
elevator_put(e);
return 0;
}
@@ -685,13 +759,6 @@ static int __elevator_change(struct request_queue *q, const char *name)
return elevator_switch(q, e);
}
-static inline bool elv_support_iosched(struct request_queue *q)
-{
- if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
- return false;
- return true;
-}
-
ssize_t elv_iosched_store(struct request_queue *q, const char *name,
size_t count)
{
@@ -724,11 +791,13 @@ ssize_t elv_iosched_show(struct request_queue *q, char *name)
spin_lock(&elv_list_lock);
list_for_each_entry(__e, &elv_list, list) {
- if (elv && elevator_match(elv, __e->elevator_name)) {
+ if (elv && elevator_match(elv, __e->elevator_name, 0)) {
len += sprintf(name+len, "[%s] ", elv->elevator_name);
continue;
}
- if (elv_support_iosched(q))
+ if (elv_support_iosched(q) &&
+ elevator_match(__e, __e->elevator_name,
+ q->required_elevator_features))
len += sprintf(name+len, "%s ", __e->elevator_name);
}
spin_unlock(&elv_list_lock);
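
The feature match performed by elevator_get_by_features() above reduces to a
bitmask containment test: an elevator is eligible only if its elevator_features
mask covers every bit set in q->required_elevator_features. The body of
elv_support_features() is not part of these hunks, so the following is a
minimal illustrative sketch rather than the verbatim helper:

/*
 * Sketch of the feature test assumed by elevator_get_by_features(): every
 * required feature bit must also be advertised by the elevator.
 */
static inline bool elv_support_features(unsigned int elv_features,
					unsigned int required_features)
{
	return (required_features & elv_features) == required_features;
}

With such a test in place, elevator_init_mq() ends up with "none" either when
no registered elevator advertises the required features or when the chosen
elevator fails to initialize.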
diff --git a/block/genhd.c b/block/genhd.c
index 24654e1d83e6..26b31fcae217 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -695,6 +695,15 @@ static void __device_add_disk(struct device *parent, struct gendisk *disk,
dev_t devt;
int retval;
+ /*
+ * The disk queue should now be all set with enough information about
+ * the device for the elevator code to pick an adequate default
+ * elevator if one is needed, that is, for devices requesting queue
+ * registration.
+ */
+ if (register_queue)
+ elevator_init_mq(disk->queue);
+
/* minors == 0 indicates to use ext devt from part0 and should
* be accompanied with EXT_DEVT flag. Make sure all
* parameters make sense.
@@ -1281,7 +1290,6 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
struct disk_part_tbl *new_ptbl;
int len = old_ptbl ? old_ptbl->len : 0;
int i, target;
- size_t size;
/*
* check for int overflow, since we can get here from blkpg_ioctl()
@@ -1298,8 +1306,8 @@ int disk_expand_part_tbl(struct gendisk *disk, int partno)
if (target <= len)
return 0;
- size = sizeof(*new_ptbl) + target * sizeof(new_ptbl->part[0]);
- new_ptbl = kzalloc_node(size, GFP_KERNEL, disk->node_id);
+ new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
+ disk->node_id);
if (!new_ptbl)
return -ENOMEM;
@@ -1970,7 +1978,7 @@ static const struct attribute *disk_events_attrs[] = {
* The default polling interval can be specified by the kernel
* parameter block.events_dfl_poll_msecs which defaults to 0
* (disable). This can also be modified runtime by writing to
- * /sys/module/block/events_dfl_poll_msecs.
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
*/
static int disk_events_set_dfl_poll_msecs(const char *val,
const struct kernel_param *kp)
diff --git a/block/ioprio.c b/block/ioprio.c
index 2e0559f157c8..77bcab11dce5 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -17,7 +17,7 @@
*
* ioprio_set(PRIO_PROCESS, pid, prio);
*
- * See also Documentation/block/ioprio.txt
+ * See also Documentation/block/ioprio.rst
*
*/
#include <linux/gfp.h>
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index c3b05119cebd..34dcea0ef637 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -562,7 +562,8 @@ static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
}
}
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ unsigned int nr_segs)
{
struct kyber_hctx_data *khd = hctx->sched_data;
struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
@@ -572,9 +573,8 @@ static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
bool merged;
spin_lock(&kcq->lock);
- merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
+ merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
spin_unlock(&kcq->lock);
- blk_mq_put_ctx(ctx);
return merged;
}
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index 1876f5712bfd..b490f47fd553 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -25,7 +25,7 @@
#include "blk-mq-sched.h"
/*
- * See Documentation/block/deadline-iosched.txt
+ * See Documentation/block/deadline-iosched.rst
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
@@ -377,13 +377,6 @@ done:
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
* state for all hardware queues, in terms of sorting, FIFOs, etc.
- *
- * For a zoned block device, __dd_dispatch_request() may return NULL
- * if all the queued write requests are directed at zones that are already
- * locked due to on-going write requests. In this case, make sure to mark
- * the queue as needing a restart to ensure that the queue is run again
- * and the pending writes dispatched once the target zones for the ongoing
- * write requests are unlocked in dd_finish_request().
*/
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
@@ -392,9 +385,6 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
spin_lock(&dd->lock);
rq = __dd_dispatch_request(dd);
- if (!rq && blk_queue_is_zoned(hctx->queue) &&
- !list_empty(&dd->fifo_list[WRITE]))
- blk_mq_sched_mark_restart_hctx(hctx);
spin_unlock(&dd->lock);
return rq;
@@ -469,7 +459,8 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
return ELEVATOR_NO_MERGE;
}
-static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
+static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+ unsigned int nr_segs)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
@@ -477,7 +468,7 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
bool ret;
spin_lock(&dd->lock);
- ret = blk_mq_sched_try_merge(q, bio, &free);
+ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
spin_unlock(&dd->lock);
if (free)
@@ -560,6 +551,13 @@ static void dd_prepare_request(struct request *rq, struct bio *bio)
* spinlock so that the zone is never unlocked while deadline_fifo_request()
* or deadline_next_request() are executing. This function is called for
* all requests, whether or not these requests complete successfully.
+ *
+ * For a zoned block device, __dd_dispatch_request() may have stopped
+ * dispatching requests if all the queued requests are write requests directed
+ * at zones that are already locked due to on-going write requests. To ensure
+ * write request dispatch progress in this case, mark the queue as needing a
+ * restart to ensure that the queue is run again after completion of the
+ * request and zones being unlocked.
*/
static void dd_finish_request(struct request *rq)
{
@@ -571,6 +569,8 @@ static void dd_finish_request(struct request *rq)
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
+ if (!list_empty(&dd->fifo_list[WRITE]))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
}
}
@@ -794,6 +794,7 @@ static struct elevator_type mq_deadline = {
.elevator_attrs = deadline_attrs,
.elevator_name = "mq-deadline",
.elevator_alias = "deadline",
+ .elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");
diff --git a/block/opal_proto.h b/block/opal_proto.h
index d9a05ad02eb5..5532412d567c 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -98,6 +98,7 @@ enum opal_uid {
OPAL_ENTERPRISE_BANDMASTER0_UID,
OPAL_ENTERPRISE_ERASEMASTER_UID,
/* tables */
+ OPAL_TABLE_TABLE,
OPAL_LOCKINGRANGE_GLOBAL,
OPAL_LOCKINGRANGE_ACE_RDLOCKED,
OPAL_LOCKINGRANGE_ACE_WRLOCKED,
@@ -118,8 +119,6 @@ enum opal_uid {
OPAL_UID_HEXFF,
};
-#define OPAL_METHOD_LENGTH 8
-
/* Enum for indexing the OPALMETHOD array */
enum opal_method {
OPAL_PROPERTIES,
@@ -152,6 +151,20 @@ enum opal_token {
OPAL_STARTCOLUMN = 0x03,
OPAL_ENDCOLUMN = 0x04,
OPAL_VALUES = 0x01,
+ /* table table */
+ OPAL_TABLE_UID = 0x00,
+ OPAL_TABLE_NAME = 0x01,
+ OPAL_TABLE_COMMON = 0x02,
+ OPAL_TABLE_TEMPLATE = 0x03,
+ OPAL_TABLE_KIND = 0x04,
+ OPAL_TABLE_COLUMN = 0x05,
+ OPAL_TABLE_COLUMNS = 0x06,
+ OPAL_TABLE_ROWS = 0x07,
+ OPAL_TABLE_ROWS_FREE = 0x08,
+ OPAL_TABLE_ROW_BYTES = 0x09,
+ OPAL_TABLE_LASTID = 0x0A,
+ OPAL_TABLE_MIN = 0x0B,
+ OPAL_TABLE_MAX = 0x0C,
/* authority table */
OPAL_PIN = 0x03,
/* locking tokens */
@@ -166,7 +179,7 @@ enum opal_token {
OPAL_LIFECYCLE = 0x06,
/* locking info table */
OPAL_MAXRANGES = 0x04,
- /* mbr control */
+ /* mbr control */
OPAL_MBRENABLE = 0x01,
OPAL_MBRDONE = 0x02,
/* properties */
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 37b9710cc80a..702689a628f0 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -194,7 +194,7 @@ config LDM_PARTITION
Normal partitions are now called Basic Disks under Windows 2000, XP,
and Vista.
- For a fuller description read <file:Documentation/ldm.txt>.
+ For a fuller description read <file:Documentation/admin-guide/ldm.rst>.
If unsure, say N.
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index 60fb3df9897c..f1edd5452249 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -11,7 +11,7 @@
*
* The format for the command line is just like mtdparts.
*
- * For further information, see "Documentation/block/cmdline-partition.txt"
+ * For further information, see "Documentation/block/cmdline-partition.rst"
*
*/
diff --git a/block/sed-opal.c b/block/sed-opal.c
index a46e8d13e16d..b4c761973ac1 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -26,6 +26,9 @@
#define IO_BUFFER_LENGTH 2048
#define MAX_TOKS 64
+/* Number of bytes needed by cmd_finalize. */
+#define CMD_FINALIZE_BYTES_NEEDED 7
+
struct opal_step {
int (*fn)(struct opal_dev *dev, void *data);
void *data;
@@ -126,7 +129,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x09, 0x00, 0x00, 0x84, 0x01 },
/* tables */
-
+ [OPAL_TABLE_TABLE] =
+ { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_GLOBAL] =
{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
@@ -147,7 +151,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x08, 0x01, 0x00, 0x00, 0x00, 0x00 },
/* C_PIN_TABLE object ID's */
-
[OPAL_C_PIN_MSID] =
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x00, 0x84, 0x02},
[OPAL_C_PIN_SID] =
@@ -156,7 +159,6 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x0B, 0x00, 0x01, 0x00, 0x01},
/* half UID's (only first 4 bytes used) */
-
[OPAL_HALF_UID_AUTHORITY_OBJ_REF] =
{ 0x00, 0x00, 0x0C, 0x05, 0xff, 0xff, 0xff, 0xff },
[OPAL_HALF_UID_BOOLEAN_ACE] =
@@ -370,8 +372,8 @@ static void check_geometry(struct opal_dev *dev, const void *data)
{
const struct d0_geometry_features *geo = data;
- dev->align = geo->alignment_granularity;
- dev->lowest_lba = geo->lowest_aligned_lba;
+ dev->align = be64_to_cpu(geo->alignment_granularity);
+ dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba);
}
static int execute_step(struct opal_dev *dev,
@@ -512,6 +514,7 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
ret = opal_recv_cmd(dev);
if (ret)
return ret;
+
return opal_discovery0_end(dev);
}
@@ -520,15 +523,21 @@ static int opal_discovery0_step(struct opal_dev *dev)
const struct opal_step discovery0_step = {
opal_discovery0,
};
+
return execute_step(dev, &discovery0_step, 0);
}
+static size_t remaining_size(struct opal_dev *cmd)
+{
+ return IO_BUFFER_LENGTH - cmd->pos;
+}
+
static bool can_add(int *err, struct opal_dev *cmd, size_t len)
{
if (*err)
return false;
- if (len > IO_BUFFER_LENGTH || cmd->pos > IO_BUFFER_LENGTH - len) {
+ if (remaining_size(cmd) < len) {
pr_debug("Error adding %zu bytes: end of buffer.\n", len);
*err = -ERANGE;
return false;
@@ -541,6 +550,7 @@ static void add_token_u8(int *err, struct opal_dev *cmd, u8 tok)
{
if (!can_add(err, cmd, 1))
return;
+
cmd->cmd[cmd->pos++] = tok;
}
@@ -567,6 +577,7 @@ static void add_medium_atom_header(struct opal_dev *cmd, bool bytestring,
header0 |= bytestring ? MEDIUM_ATOM_BYTESTRING : 0;
header0 |= has_sign ? MEDIUM_ATOM_SIGNED : 0;
header0 |= (len >> 8) & MEDIUM_ATOM_LEN_MASK;
+
cmd->cmd[cmd->pos++] = header0;
cmd->cmd[cmd->pos++] = len;
}
@@ -639,6 +650,7 @@ static int build_locking_range(u8 *buffer, size_t length, u8 lr)
if (lr == 0)
return 0;
+
buffer[5] = LOCKING_RANGE_NON_GLOBAL;
buffer[7] = lr;
@@ -674,7 +686,11 @@ static int cmd_finalize(struct opal_dev *cmd, u32 hsn, u32 tsn)
struct opal_header *hdr;
int err = 0;
- /* close the parameter list opened from cmd_start */
+ /*
+ * Close the parameter list opened from cmd_start.
+ * The number of bytes added must be equal to
+ * CMD_FINALIZE_BYTES_NEEDED.
+ */
add_token_u8(&err, cmd, OPAL_ENDLIST);
add_token_u8(&err, cmd, OPAL_ENDOFDATA);
@@ -889,10 +905,6 @@ static int response_parse(const u8 *buf, size_t length,
num_entries++;
}
- if (num_entries == 0) {
- pr_debug("Couldn't parse response.\n");
- return -EINVAL;
- }
resp->num = num_entries;
return 0;
@@ -931,6 +943,7 @@ static size_t response_get_string(const struct parsed_resp *resp, int n,
}
*store = tok->pos + skip;
+
return tok->len - skip;
}
@@ -1048,6 +1061,7 @@ static int start_opal_session_cont(struct opal_dev *dev)
dev->hsn = hsn;
dev->tsn = tsn;
+
return 0;
}
@@ -1070,6 +1084,7 @@ static int end_session_cont(struct opal_dev *dev)
{
dev->hsn = 0;
dev->tsn = 0;
+
return parse_and_check_status(dev);
}
@@ -1119,6 +1134,29 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table,
return finalize_and_send(dev, parse_and_check_status);
}
+/*
+ * see TCG SAS 5.3.2.3 for a description of the available columns
+ *
+ * the result is provided in dev->resp->tok[4]
+ */
+static int generic_get_table_info(struct opal_dev *dev, enum opal_uid table,
+ u64 column)
+{
+ u8 uid[OPAL_UID_LENGTH];
+ const unsigned int half = OPAL_UID_LENGTH/2;
+
+ /* sed-opal UIDs can be split in two halves:
+ * first: actual table index
+ * second: relative index in the table
+ * so we have to get the first half of the OPAL_TABLE_TABLE and use the
+ * first part of the target table as relative index into that table
+ */
+ memcpy(uid, opaluid[OPAL_TABLE_TABLE], half);
+ memcpy(uid+half, opaluid[table], half);
+
+ return generic_get_column(dev, uid, column);
+}
+
static int gen_key(struct opal_dev *dev, void *data)
{
u8 uid[OPAL_UID_LENGTH];
@@ -1135,6 +1173,7 @@ static int gen_key(struct opal_dev *dev, void *data)
return err;
}
+
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1147,12 +1186,14 @@ static int get_active_key_cont(struct opal_dev *dev)
error = parse_and_check_status(dev);
if (error)
return error;
+
keylen = response_get_string(&dev->parsed, 4, &activekey);
if (!activekey) {
pr_debug("%s: Couldn't extract the Activekey from the response\n",
__func__);
return OPAL_INVAL_PARAM;
}
+
dev->prev_data = kmemdup(activekey, keylen, GFP_KERNEL);
if (!dev->prev_data)
@@ -1214,6 +1255,7 @@ static int generic_lr_enable_disable(struct opal_dev *dev,
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
+
return err;
}
@@ -1226,6 +1268,7 @@ static inline int enable_global_lr(struct opal_dev *dev, u8 *uid,
0, 0);
if (err)
pr_debug("Failed to create enable global lr command\n");
+
return err;
}
@@ -1276,7 +1319,6 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
if (err) {
pr_debug("Error building Setup Locking range command.\n");
return err;
-
}
return finalize_and_send(dev, parse_and_check_status);
@@ -1307,6 +1349,7 @@ static int start_generic_opal_session(struct opal_dev *dev,
break;
case OPAL_ADMIN1_UID:
case OPAL_SID_UID:
+ case OPAL_PSID_UID:
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, 0); /* HostChallenge */
add_token_bytestring(&err, dev, key, key_len);
@@ -1355,6 +1398,7 @@ static int start_SIDASP_opal_session(struct opal_dev *dev, void *data)
kfree(key);
dev->prev_data = NULL;
}
+
return ret;
}
@@ -1367,6 +1411,16 @@ static int start_admin1LSP_opal_session(struct opal_dev *dev, void *data)
key->key, key->key_len);
}
+static int start_PSID_opal_session(struct opal_dev *dev, void *data)
+{
+ const struct opal_key *okey = data;
+
+ return start_generic_opal_session(dev, OPAL_PSID_UID,
+ OPAL_ADMINSP_UID,
+ okey->key,
+ okey->key_len);
+}
+
static int start_auth_opal_session(struct opal_dev *dev, void *data)
{
struct opal_session_info *session = data;
@@ -1470,6 +1524,7 @@ static int erase_locking_range(struct opal_dev *dev, void *data)
pr_debug("Error building Erase Locking Range Command.\n");
return err;
}
+
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1525,6 +1580,73 @@ static int set_mbr_enable_disable(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
+static int write_shadow_mbr(struct opal_dev *dev, void *data)
+{
+ struct opal_shadow_mbr *shadow = data;
+ const u8 __user *src;
+ u8 *dst;
+ size_t off = 0;
+ u64 len;
+ int err = 0;
+
+ /* do we fit in the available shadow mbr space? */
+ err = generic_get_table_info(dev, OPAL_MBR, OPAL_TABLE_ROWS);
+ if (err) {
+ pr_debug("MBR: could not get shadow size\n");
+ return err;
+ }
+
+ len = response_get_u64(&dev->parsed, 4);
+ if (shadow->size > len || shadow->offset > len - shadow->size) {
+ pr_debug("MBR: does not fit in shadow (%llu vs. %llu)\n",
+ shadow->offset + shadow->size, len);
+ return -ENOSPC;
+ }
+
+ /* do the actual transmission(s) */
+ src = (u8 __user *)(uintptr_t)shadow->data;
+ while (off < shadow->size) {
+ err = cmd_start(dev, opaluid[OPAL_MBR], opalmethod[OPAL_SET]);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_WHERE);
+ add_token_u64(&err, dev, shadow->offset + off);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u8(&err, dev, OPAL_VALUES);
+
+ /*
+ * The bytestring header is either 1 or 2 bytes, so assume 2.
+ * There also needs to be enough space to accommodate the
+ * trailing OPAL_ENDNAME (1 byte) and tokens added by
+ * cmd_finalize.
+ */
+ len = min(remaining_size(dev) - (2+1+CMD_FINALIZE_BYTES_NEEDED),
+ (size_t)(shadow->size - off));
+ pr_debug("MBR: write bytes %zu+%llu/%llu\n",
+ off, len, shadow->size);
+
+ dst = add_bytestring_header(&err, dev, len);
+ if (!dst)
+ break;
+ if (copy_from_user(dst, src + off, len))
+ err = -EFAULT;
+ dev->pos += len;
+
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ if (err)
+ break;
+
+ err = finalize_and_send(dev, parse_and_check_status);
+ if (err)
+ break;
+
+ off += len;
+ }
+
+ return err;
+}
+
static int generic_pw_cmd(u8 *key, size_t key_len, u8 *cpin_uid,
struct opal_dev *dev)
{
@@ -1702,6 +1824,7 @@ static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
pr_debug("Error building SET command.\n");
return err;
}
+
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1743,6 +1866,7 @@ static int lock_unlock_locking_range_sum(struct opal_dev *dev, void *data)
pr_debug("Error building SET command.\n");
return ret;
}
+
return finalize_and_send(dev, parse_and_check_status);
}
@@ -1843,6 +1967,7 @@ static int end_opal_session(struct opal_dev *dev, void *data)
if (err < 0)
return err;
+
return finalize_and_send(dev, end_session_cont);
}
@@ -1851,6 +1976,7 @@ static int end_opal_session_error(struct opal_dev *dev)
const struct opal_step error_end_session = {
end_opal_session,
};
+
return execute_step(dev, &error_end_session, 0);
}
@@ -1870,6 +1996,7 @@ static int check_opal_support(struct opal_dev *dev)
ret = opal_discovery0_step(dev);
dev->supported = !ret;
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -1890,6 +2017,7 @@ void free_opal_dev(struct opal_dev *dev)
{
if (!dev)
return;
+
clean_opal_dev(dev);
kfree(dev);
}
@@ -1912,6 +2040,7 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
kfree(dev);
return NULL;
}
+
return dev;
}
EXPORT_SYMBOL(init_opal_dev);
@@ -1931,6 +2060,7 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -1948,6 +2078,7 @@ static int opal_erase_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -1975,6 +2106,53 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
+static int opal_set_mbr_done(struct opal_dev *dev,
+ struct opal_mbr_done *mbr_done)
+{
+ u8 mbr_done_tf = mbr_done->done_flag == OPAL_MBR_DONE ?
+ OPAL_TRUE : OPAL_FALSE;
+
+ const struct opal_step mbr_steps[] = {
+ { start_admin1LSP_opal_session, &mbr_done->key },
+ { set_mbr_done, &mbr_done_tf },
+ { end_opal_session, }
+ };
+ int ret;
+
+ if (mbr_done->done_flag != OPAL_MBR_DONE &&
+ mbr_done->done_flag != OPAL_MBR_NOT_DONE)
+ return -EINVAL;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+ mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
+static int opal_write_shadow_mbr(struct opal_dev *dev,
+ struct opal_shadow_mbr *info)
+{
+ const struct opal_step mbr_steps[] = {
+ { start_admin1LSP_opal_session, &info->key },
+ { write_shadow_mbr, info },
+ { end_opal_session, }
+ };
+ int ret;
+
+ if (info->size == 0)
+ return 0;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
+ mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -1993,6 +2171,7 @@ static int opal_save(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk)
setup_opal_dev(dev);
add_suspend_info(dev, suspend);
mutex_unlock(&dev->dev_lock);
+
return 0;
}
@@ -2011,12 +2190,14 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
pr_debug("Locking state was not RO or RW\n");
return -EINVAL;
}
+
if (lk_unlk->session.who < OPAL_USER1 ||
lk_unlk->session.who > OPAL_USER9) {
pr_debug("Authority was not within the range of users: %d\n",
lk_unlk->session.who);
return -EINVAL;
}
+
if (lk_unlk->session.sum) {
pr_debug("%s not supported in sum. Use setup locking range\n",
__func__);
@@ -2027,20 +2208,32 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
-static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal)
+static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psid)
{
+ /* controller will terminate session */
const struct opal_step revert_steps[] = {
{ start_SIDASP_opal_session, opal },
- { revert_tper, } /* controller will terminate session */
+ { revert_tper, }
+ };
+ const struct opal_step psid_revert_steps[] = {
+ { start_PSID_opal_session, opal },
+ { revert_tper, }
};
+
int ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
- ret = execute_steps(dev, revert_steps, ARRAY_SIZE(revert_steps));
+ if (psid)
+ ret = execute_steps(dev, psid_revert_steps,
+ ARRAY_SIZE(psid_revert_steps));
+ else
+ ret = execute_steps(dev, revert_steps,
+ ARRAY_SIZE(revert_steps));
mutex_unlock(&dev->dev_lock);
/*
@@ -2092,13 +2285,13 @@ static int opal_lock_unlock(struct opal_dev *dev,
{
int ret;
- if (lk_unlk->session.who < OPAL_ADMIN1 ||
- lk_unlk->session.who > OPAL_USER9)
+ if (lk_unlk->session.who > OPAL_USER9)
return -EINVAL;
mutex_lock(&dev->dev_lock);
ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2121,6 +2314,7 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
setup_opal_dev(dev);
ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2142,6 +2336,7 @@ static int opal_activate_lsp(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2159,6 +2354,7 @@ static int opal_setup_locking_range(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2171,9 +2367,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
};
int ret;
- if (opal_pw->session.who < OPAL_ADMIN1 ||
- opal_pw->session.who > OPAL_USER9 ||
- opal_pw->new_user_pw.who < OPAL_ADMIN1 ||
+ if (opal_pw->session.who > OPAL_USER9 ||
opal_pw->new_user_pw.who > OPAL_USER9)
return -EINVAL;
@@ -2181,6 +2375,7 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
setup_opal_dev(dev);
ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2205,6 +2400,7 @@ static int opal_activate_user(struct opal_dev *dev,
setup_opal_dev(dev);
ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps));
mutex_unlock(&dev->dev_lock);
+
return ret;
}
@@ -2216,6 +2412,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
if (!dev)
return false;
+
if (!dev->supported)
return false;
@@ -2233,6 +2430,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
suspend->unlk.session.sum);
was_failure = true;
}
+
if (dev->mbr_enabled) {
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
if (ret)
@@ -2240,6 +2438,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
}
}
mutex_unlock(&dev->dev_lock);
+
return was_failure;
}
EXPORT_SYMBOL(opal_unlock_from_suspend);
@@ -2280,7 +2479,7 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
ret = opal_activate_user(dev, p);
break;
case IOC_OPAL_REVERT_TPR:
- ret = opal_reverttper(dev, p);
+ ret = opal_reverttper(dev, p, false);
break;
case IOC_OPAL_LR_SETUP:
ret = opal_setup_locking_range(dev, p);
@@ -2291,12 +2490,21 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_ENABLE_DISABLE_MBR:
ret = opal_enable_disable_shadow_mbr(dev, p);
break;
+ case IOC_OPAL_MBR_DONE:
+ ret = opal_set_mbr_done(dev, p);
+ break;
+ case IOC_OPAL_WRITE_SHADOW_MBR:
+ ret = opal_write_shadow_mbr(dev, p);
+ break;
case IOC_OPAL_ERASE_LR:
ret = opal_erase_locking_range(dev, p);
break;
case IOC_OPAL_SECURE_ERASE_LR:
ret = opal_secure_erase_locking_range(dev, p);
break;
+ case IOC_OPAL_PSID_REVERT_TPR:
+ ret = opal_reverttper(dev, p, true);
+ break;
default:
break;
}
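
The new IOC_OPAL_WRITE_SHADOW_MBR and IOC_OPAL_MBR_DONE cases above let
userspace populate the shadow MBR table in chunks and then flag it as done
without a full enable/disable cycle. A hedged userspace sketch follows; the
uapi structures live in include/uapi/linux/sed-opal.h, which is outside this
block/-only diffstat, so the field layout should be checked against the
installed header, and the password handling here is a placeholder:

#include <stdint.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

/* write an MBR image at offset 0 and mark the shadow MBR done */
static int load_shadow_mbr(int dev_fd, const void *image, __u64 image_len,
			   const char *pw)
{
	struct opal_shadow_mbr mbr = {
		.data   = (__u64)(uintptr_t)image,
		.offset = 0,
		.size   = image_len,
	};
	struct opal_mbr_done done = { .done_flag = OPAL_MBR_DONE };

	mbr.key.key_len = strlen(pw);
	memcpy(mbr.key.key, pw, mbr.key.key_len);
	done.key = mbr.key;

	if (ioctl(dev_fd, IOC_OPAL_WRITE_SHADOW_MBR, &mbr))
		return -1;

	return ioctl(dev_fd, IOC_OPAL_MBR_DONE, &done);
}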
diff --git a/block/t10-pi.c b/block/t10-pi.c
index 0c0094609dd6..9803c7e0376e 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -27,7 +27,7 @@ static __be16 t10_pi_ip_fn(void *data, unsigned int len)
* tag.
*/
static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
- csum_fn *fn, unsigned int type)
+ csum_fn *fn, enum t10_dif_type type)
{
unsigned int i;
@@ -37,7 +37,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
pi->guard_tag = fn(iter->data_buf, iter->interval);
pi->app_tag = 0;
- if (type == 1)
+ if (type == T10_PI_TYPE1_PROTECTION)
pi->ref_tag = cpu_to_be32(lower_32_bits(iter->seed));
else
pi->ref_tag = 0;
@@ -51,17 +51,18 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
}
static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
- csum_fn *fn, unsigned int type)
+ csum_fn *fn, enum t10_dif_type type)
{
unsigned int i;
+ BUG_ON(type == T10_PI_TYPE0_PROTECTION);
+
for (i = 0 ; i < iter->data_size ; i += iter->interval) {
struct t10_pi_tuple *pi = iter->prot_buf;
__be16 csum;
- switch (type) {
- case 1:
- case 2:
+ if (type == T10_PI_TYPE1_PROTECTION ||
+ type == T10_PI_TYPE2_PROTECTION) {
if (pi->app_tag == T10_PI_APP_ESCAPE)
goto next;
@@ -73,12 +74,10 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
iter->seed, be32_to_cpu(pi->ref_tag));
return BLK_STS_PROTECTION;
}
- break;
- case 3:
+ } else if (type == T10_PI_TYPE3_PROTECTION) {
if (pi->app_tag == T10_PI_APP_ESCAPE &&
pi->ref_tag == T10_PI_REF_ESCAPE)
goto next;
- break;
}
csum = fn(iter->data_buf, iter->interval);
@@ -102,94 +101,40 @@ next:
static blk_status_t t10_pi_type1_generate_crc(struct blk_integrity_iter *iter)
{
- return t10_pi_generate(iter, t10_pi_crc_fn, 1);
+ return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
}
static blk_status_t t10_pi_type1_generate_ip(struct blk_integrity_iter *iter)
{
- return t10_pi_generate(iter, t10_pi_ip_fn, 1);
+ return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
}
static blk_status_t t10_pi_type1_verify_crc(struct blk_integrity_iter *iter)
{
- return t10_pi_verify(iter, t10_pi_crc_fn, 1);
+ return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE1_PROTECTION);
}
static blk_status_t t10_pi_type1_verify_ip(struct blk_integrity_iter *iter)
{
- return t10_pi_verify(iter, t10_pi_ip_fn, 1);
-}
-
-static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_crc_fn, 3);
-}
-
-static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_generate(iter, t10_pi_ip_fn, 3);
-}
-
-static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_crc_fn, 3);
+ return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE1_PROTECTION);
}
-static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
-{
- return t10_pi_verify(iter, t10_pi_ip_fn, 3);
-}
-
-const struct blk_integrity_profile t10_pi_type1_crc = {
- .name = "T10-DIF-TYPE1-CRC",
- .generate_fn = t10_pi_type1_generate_crc,
- .verify_fn = t10_pi_type1_verify_crc,
-};
-EXPORT_SYMBOL(t10_pi_type1_crc);
-
-const struct blk_integrity_profile t10_pi_type1_ip = {
- .name = "T10-DIF-TYPE1-IP",
- .generate_fn = t10_pi_type1_generate_ip,
- .verify_fn = t10_pi_type1_verify_ip,
-};
-EXPORT_SYMBOL(t10_pi_type1_ip);
-
-const struct blk_integrity_profile t10_pi_type3_crc = {
- .name = "T10-DIF-TYPE3-CRC",
- .generate_fn = t10_pi_type3_generate_crc,
- .verify_fn = t10_pi_type3_verify_crc,
-};
-EXPORT_SYMBOL(t10_pi_type3_crc);
-
-const struct blk_integrity_profile t10_pi_type3_ip = {
- .name = "T10-DIF-TYPE3-IP",
- .generate_fn = t10_pi_type3_generate_ip,
- .verify_fn = t10_pi_type3_verify_ip,
-};
-EXPORT_SYMBOL(t10_pi_type3_ip);
-
/**
- * t10_pi_prepare - prepare PI prior submitting request to device
+ * t10_pi_type1_prepare - prepare PI prior submitting request to device
* @rq: request with PI that should be prepared
- * @protection_type: PI type (Type 1/Type 2/Type 3)
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
* partitioning, MD/DM cloning, etc. the actual physical start sector is
* likely to be different. Remap protection information to match the
* physical LBA.
- *
- * Type 3 does not have a reference tag so no remapping is required.
*/
-void t10_pi_prepare(struct request *rq, u8 protection_type)
+static void t10_pi_type1_prepare(struct request *rq)
{
const int tuple_sz = rq->q->integrity.tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
struct bio *bio;
- if (protection_type == T10_PI_TYPE3_PROTECTION)
- return;
-
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u32 virt = bip_get_seed(bip) & 0xffffffff;
@@ -222,13 +167,11 @@ void t10_pi_prepare(struct request *rq, u8 protection_type)
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
}
}
-EXPORT_SYMBOL(t10_pi_prepare);
/**
- * t10_pi_complete - prepare PI prior returning request to the block layer
+ * t10_pi_type1_complete - prepare PI prior returning request to the blk layer
* @rq: request with PI that should be prepared
- * @protection_type: PI type (Type 1/Type 2/Type 3)
- * @intervals: total elements to prepare
+ * @nr_bytes: total bytes to prepare
*
* For Type 1/Type 2, the virtual start sector is the one that was
* originally submitted by the block layer for the ref_tag usage. Due to
@@ -236,19 +179,14 @@ EXPORT_SYMBOL(t10_pi_prepare);
* likely to be different. Since the physical start sector was submitted
* to the device, we should remap it back to virtual values expected by the
* block layer.
- *
- * Type 3 does not have a reference tag so no remapping is required.
*/
-void t10_pi_complete(struct request *rq, u8 protection_type,
- unsigned int intervals)
+static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
{
+ unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
const int tuple_sz = rq->q->integrity.tuple_size;
u32 ref_tag = t10_pi_ref_tag(rq);
struct bio *bio;
- if (protection_type == T10_PI_TYPE3_PROTECTION)
- return;
-
__rq_for_each_bio(bio, rq) {
struct bio_integrity_payload *bip = bio_integrity(bio);
u32 virt = bip_get_seed(bip) & 0xffffffff;
@@ -276,4 +214,73 @@ void t10_pi_complete(struct request *rq, u8 protection_type,
}
}
}
-EXPORT_SYMBOL(t10_pi_complete);
+
+static blk_status_t t10_pi_type3_generate_crc(struct blk_integrity_iter *iter)
+{
+ return t10_pi_generate(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_generate_ip(struct blk_integrity_iter *iter)
+{
+ return t10_pi_generate(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_verify_crc(struct blk_integrity_iter *iter)
+{
+ return t10_pi_verify(iter, t10_pi_crc_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t t10_pi_type3_verify_ip(struct blk_integrity_iter *iter)
+{
+ return t10_pi_verify(iter, t10_pi_ip_fn, T10_PI_TYPE3_PROTECTION);
+}
+
+/*
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+static void t10_pi_type3_prepare(struct request *rq)
+{
+}
+
+/*
+ * Type 3 does not have a reference tag so no remapping is required.
+ */
+static void t10_pi_type3_complete(struct request *rq, unsigned int nr_bytes)
+{
+}
+
+const struct blk_integrity_profile t10_pi_type1_crc = {
+ .name = "T10-DIF-TYPE1-CRC",
+ .generate_fn = t10_pi_type1_generate_crc,
+ .verify_fn = t10_pi_type1_verify_crc,
+ .prepare_fn = t10_pi_type1_prepare,
+ .complete_fn = t10_pi_type1_complete,
+};
+EXPORT_SYMBOL(t10_pi_type1_crc);
+
+const struct blk_integrity_profile t10_pi_type1_ip = {
+ .name = "T10-DIF-TYPE1-IP",
+ .generate_fn = t10_pi_type1_generate_ip,
+ .verify_fn = t10_pi_type1_verify_ip,
+ .prepare_fn = t10_pi_type1_prepare,
+ .complete_fn = t10_pi_type1_complete,
+};
+EXPORT_SYMBOL(t10_pi_type1_ip);
+
+const struct blk_integrity_profile t10_pi_type3_crc = {
+ .name = "T10-DIF-TYPE3-CRC",
+ .generate_fn = t10_pi_type3_generate_crc,
+ .verify_fn = t10_pi_type3_verify_crc,
+ .prepare_fn = t10_pi_type3_prepare,
+ .complete_fn = t10_pi_type3_complete,
+};
+EXPORT_SYMBOL(t10_pi_type3_crc);
+
+const struct blk_integrity_profile t10_pi_type3_ip = {
+ .name = "T10-DIF-TYPE3-IP",
+ .generate_fn = t10_pi_type3_generate_ip,
+ .verify_fn = t10_pi_type3_verify_ip,
+ .prepare_fn = t10_pi_type3_prepare,
+ .complete_fn = t10_pi_type3_complete,
+};
+EXPORT_SYMBOL(t10_pi_type3_ip);
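
Folding the prepare and complete handlers into the blk_integrity_profile means
callers no longer pass the protection type or an interval count: the block
layer dispatches through the queue's registered profile and hands over the
residual byte count, which t10_pi_type1_complete() converts to intervals via
the queue's interval_exp. The Type 3 no-op variants then need no special
casing at the call site. A minimal sketch of the kind of dispatch this enables
(the wrapper names below are illustrative, not the helpers actually added to
the block-layer internals elsewhere in this series):

#include <linux/blkdev.h>

/* illustrative wrappers; real callers live inside the block layer */
static inline void example_integrity_prepare(struct request *rq)
{
	if (blk_integrity_rq(rq))
		rq->q->integrity.profile->prepare_fn(rq);
}

static inline void example_integrity_complete(struct request *rq,
					      unsigned int nr_bytes)
{
	if (blk_integrity_rq(rq))
		rq->q->integrity.profile->complete_fn(rq, nr_bytes);
}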