Diffstat (limited to 'block/blk-mq.c')
-rw-r--r--  block/blk-mq.c  396
 1 file changed, 256 insertions(+), 140 deletions(-)
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..0015a1892153 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -41,6 +41,8 @@
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
+static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
+
static void blk_mq_poll_stats_start(struct request_queue *q);
static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
@@ -275,26 +277,20 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
struct request *rq = tags->static_rqs[tag];
- req_flags_t rq_flags = 0;
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ if (data->q->elevator) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
- rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
- }
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
- data->hctx->tags->rqs[rq->tag] = rq;
}
/* csd/requeue_work/fifo_time is initialized before use */
rq->q = data->q;
rq->mq_ctx = data->ctx;
rq->mq_hctx = data->hctx;
- rq->rq_flags = rq_flags;
+ rq->rq_flags = 0;
rq->cmd_flags = data->cmd_flags;
if (data->flags & BLK_MQ_REQ_PREEMPT)
rq->rq_flags |= RQF_PREEMPT;
@@ -362,8 +358,6 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
data->flags |= BLK_MQ_REQ_NOWAIT;
if (e) {
- data->flags |= BLK_MQ_REQ_INTERNAL;
-
/*
* Flush requests are special and go directly to the
* dispatch list. Don't include reserved tags in the
@@ -378,7 +372,7 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+ if (!e)
blk_mq_tag_busy(data->hctx);
/*
@@ -394,7 +388,7 @@ retry:
/*
* Give up the CPU and sleep for a random short time to ensure
* that thread using a realtime scheduling class are migrated
- * off the the CPU, and thus off the hctx that is going away.
+ * off the CPU, and thus off the hctx that is going away.
*/
msleep(3);
goto retry;
@@ -474,9 +468,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
data.ctx = __blk_mq_get_ctx(q, cpu);
- if (q->elevator)
- data.flags |= BLK_MQ_REQ_INTERNAL;
- else
+ if (!q->elevator)
blk_mq_tag_busy(data.hctx);
ret = -EWOULDBLOCK;
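As context for the allocation hunks above: with an elevator attached, a freshly allocated request now carries only a scheduler (internal) tag, and the driver tag is acquired later at dispatch time. A typical passthrough user of blk_mq_alloc_request() might look like the minimal sketch below; the mydrv_* name is hypothetical and the sketch is not part of this patch.

/* Hypothetical passthrough-request user, illustration only. */
static int mydrv_send_internal_cmd(struct request_queue *q)
{
        struct request *rq;

        /* with an elevator attached this gets an internal (sched) tag */
        rq = blk_mq_alloc_request(q, REQ_OP_DRV_OUT, BLK_MQ_REQ_NOWAIT);
        if (IS_ERR(rq))
                return PTR_ERR(rq);

        /* ... fill in driver-specific payload here ... */

        blk_execute_rq(q, NULL, rq, 0);
        blk_mq_free_request(rq);
        return 0;
}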
@@ -552,8 +544,7 @@ inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
blk_stat_add(rq, now);
}
- if (rq->internal_tag != BLK_MQ_NO_TAG)
- blk_mq_sched_completed_request(rq, now);
+ blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
@@ -574,71 +565,139 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
}
EXPORT_SYMBOL(blk_mq_end_request);
-static void __blk_mq_complete_request_remote(void *data)
+/*
+ * Softirq action handler - move entries to local list and loop over them
+ * while passing them to the queue registered handler.
+ */
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
{
- struct request *rq = data;
- struct request_queue *q = rq->q;
+ struct list_head *cpu_list, local_list;
- q->mq_ops->complete(rq);
+ local_irq_disable();
+ cpu_list = this_cpu_ptr(&blk_cpu_done);
+ list_replace_init(cpu_list, &local_list);
+ local_irq_enable();
+
+ while (!list_empty(&local_list)) {
+ struct request *rq;
+
+ rq = list_entry(local_list.next, struct request, ipi_list);
+ list_del_init(&rq->ipi_list);
+ rq->q->mq_ops->complete(rq);
+ }
}
-/**
- * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
- * injection that could drop the completion.
- * @rq: Request to be force completed
- *
- * Drivers should use blk_mq_complete_request() to complete requests in their
- * normal IO path. For timeout error recovery, drivers may call this forced
- * completion routine after they've reclaimed timed out requests to bypass
- * potentially subsequent fake timeouts.
- */
-void blk_mq_force_complete_rq(struct request *rq)
+static void blk_mq_trigger_softirq(struct request *rq)
{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct request_queue *q = rq->q;
- bool shared = false;
- int cpu;
+ struct list_head *list;
+ unsigned long flags;
+
+ local_irq_save(flags);
+ list = this_cpu_ptr(&blk_cpu_done);
+ list_add_tail(&rq->ipi_list, list);
+
+ /*
+ * If the list only contains our just added request, signal a raise of
+ * the softirq. If there are already entries there, someone already
+ * raised the irq but it hasn't run yet.
+ */
+ if (list->next == &rq->ipi_list)
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_restore(flags);
+}
+
+static int blk_softirq_cpu_dead(unsigned int cpu)
+{
+ /*
+ * If a CPU goes away, splice its entries to the current CPU
+ * and trigger a run of the softirq
+ */
+ local_irq_disable();
+ list_splice_init(&per_cpu(blk_cpu_done, cpu),
+ this_cpu_ptr(&blk_cpu_done));
+ raise_softirq_irqoff(BLOCK_SOFTIRQ);
+ local_irq_enable();
+
+ return 0;
+}
+
+
+static void __blk_mq_complete_request_remote(void *data)
+{
+ struct request *rq = data;
- WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
- * Most of single queue controllers, there is only one irq vector
- * for handling IO completion, and the only irq's affinity is set
- * as all possible CPUs. On most of ARCHs, this affinity means the
- * irq is handled on one specific CPU.
+ * For most of single queue controllers, there is only one irq vector
+ * for handling I/O completion, and the only irq's affinity is set
+ * to all possible CPUs. On most of ARCHs, this affinity means the irq
+ * is handled on one specific CPU.
*
- * So complete IO reqeust in softirq context in case of single queue
- * for not degrading IO performance by irqsoff latency.
+ * So complete I/O requests in softirq context in case of single queue
+ * devices to avoid degrading I/O performance due to irqsoff latency.
*/
- if (q->nr_hw_queues == 1) {
- __blk_complete_request(rq);
- return;
- }
+ if (rq->q->nr_hw_queues == 1)
+ blk_mq_trigger_softirq(rq);
+ else
+ rq->q->mq_ops->complete(rq);
+}
+
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!IS_ENABLED(CONFIG_SMP) ||
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
+ return false;
+
+ /* same CPU or cache domain? Complete locally */
+ if (cpu == rq->mq_ctx->cpu ||
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+ return false;
+
+ /* don't try to IPI to an offline CPU */
+ return cpu_online(rq->mq_ctx->cpu);
+}
+
+bool blk_mq_complete_request_remote(struct request *rq)
+{
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
/*
* For a polled request, always complete locally; it's pointless
* to redirect the completion.
*/
- if ((rq->cmd_flags & REQ_HIPRI) ||
- !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
- q->mq_ops->complete(rq);
- return;
- }
-
- cpu = get_cpu();
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ctx->cpu);
+ if (rq->cmd_flags & REQ_HIPRI)
+ return false;
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
+ if (blk_mq_complete_need_ipi(rq)) {
rq->csd.func = __blk_mq_complete_request_remote;
rq->csd.info = rq;
rq->csd.flags = 0;
- smp_call_function_single_async(ctx->cpu, &rq->csd);
+ smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd);
} else {
- q->mq_ops->complete(rq);
+ if (rq->q->nr_hw_queues > 1)
+ return false;
+ blk_mq_trigger_softirq(rq);
}
- put_cpu();
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
+
+/**
+ * blk_mq_complete_request - end I/O on a request
+ * @rq: the request being processed
+ *
+ * Description:
+ * Complete a request by scheduling the ->complete_rq operation.
+ **/
+void blk_mq_complete_request(struct request *rq)
+{
+ if (!blk_mq_complete_request_remote(rq))
+ rq->q->mq_ops->complete(rq);
}
-EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
+EXPORT_SYMBOL(blk_mq_complete_request);
static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
__releases(hctx->srcu)
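For readers coming from the driver side, the reworked completion interface is consumed roughly as sketched below: the interrupt handler hands a finished request to blk_mq_complete_request(), which completes it locally, via IPI, or via the BLOCK_SOFTIRQ path, and the driver's ->complete callback then ends the request. This is a hypothetical sketch, not part of the patch; all mydrv_* helpers are made up.

#include <linux/blk-mq.h>
#include <linux/interrupt.h>

/* Hypothetical driver completion path, illustration only. */
static irqreturn_t mydrv_irq(int irq, void *data)
{
        struct mydrv_queue *mq = data;          /* hypothetical per-queue data */
        struct request *rq;

        while ((rq = mydrv_pop_completed(mq)))  /* hypothetical helper */
                blk_mq_complete_request(rq);    /* local, IPI or softirq */

        return IRQ_HANDLED;
}

/* Runs on the CPU picked by blk_mq_complete_request_remote(). */
static void mydrv_complete_rq(struct request *rq)
{
        blk_mq_end_request(rq, BLK_STS_OK);
}

static const struct blk_mq_ops mydrv_mq_ops = {
        .queue_rq       = mydrv_queue_rq,       /* hypothetical */
        .complete       = mydrv_complete_rq,
};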
@@ -661,23 +720,6 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
}
/**
- * blk_mq_complete_request - end I/O on a request
- * @rq: the request being processed
- *
- * Description:
- * Ends all I/O on a request. It does not handle partial completions.
- * The actual completion happens out-of-order, through a IPI handler.
- **/
-bool blk_mq_complete_request(struct request *rq)
-{
- if (unlikely(blk_should_fake_timeout(rq->q)))
- return false;
- blk_mq_force_complete_rq(rq);
- return true;
-}
-EXPORT_SYMBOL(blk_mq_complete_request);
-
-/**
* blk_mq_start_request - Start processing a request
* @rq: Pointer to request to be started
*
@@ -1052,6 +1094,45 @@ static inline unsigned int queued_to_index(unsigned int queued)
return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
}
+static bool __blk_mq_get_driver_tag(struct request *rq)
+{
+ struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+ int tag;
+
+ blk_mq_tag_busy(rq->mq_hctx);
+
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+ bt = &rq->mq_hctx->tags->breserved_tags;
+ tag_offset = 0;
+ }
+
+ if (!hctx_may_queue(rq->mq_hctx, bt))
+ return false;
+ tag = __sbitmap_queue_get(bt);
+ if (tag == BLK_MQ_NO_TAG)
+ return false;
+
+ rq->tag = tag + tag_offset;
+ return true;
+}
+
+static bool blk_mq_get_driver_tag(struct request *rq)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq))
+ return false;
+
+ if ((hctx->flags & BLK_MQ_F_TAG_SHARED) &&
+ !(rq->rq_flags & RQF_MQ_INFLIGHT)) {
+ rq->rq_flags |= RQF_MQ_INFLIGHT;
+ atomic_inc(&hctx->nr_active);
+ }
+ hctx->tags->rqs[rq->tag] = rq;
+ return true;
+}
+
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
int flags, void *key)
{
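The tag arithmetic in __blk_mq_get_driver_tag() above reduces to a simple mapping: bits drawn from breserved_tags are used as-is, while bits drawn from bitmap_tags are shifted past the reserved range. A standalone illustration follows; the function name and the depths are hypothetical.

/* Illustration only: how an sbitmap bit becomes the final rq->tag. */
static unsigned int example_driver_tag(unsigned int bit, bool reserved,
                                       unsigned int nr_reserved_tags)
{
        /* reserved tags occupy [0, nr_reserved_tags), normal tags sit above */
        unsigned int tag_offset = reserved ? 0 : nr_reserved_tags;

        return bit + tag_offset;
}

/*
 * With a hypothetical tag set of 2 reserved + 30 normal tags:
 *   example_driver_tag(1, true, 2)   == 1   (reserved range)
 *   example_driver_tag(0, false, 2)  == 2   (first normal tag)
 *   example_driver_tag(29, false, 2) == 31  (last normal tag)
 */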
@@ -1204,25 +1285,70 @@ static void blk_mq_handle_zone_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
+enum prep_dispatch {
+ PREP_DISPATCH_OK,
+ PREP_DISPATCH_NO_TAG,
+ PREP_DISPATCH_NO_BUDGET,
+};
+
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
+ bool need_budget)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ if (need_budget && !blk_mq_get_dispatch_budget(rq->q)) {
+ blk_mq_put_driver_tag(rq);
+ return PREP_DISPATCH_NO_BUDGET;
+ }
+
+ if (!blk_mq_get_driver_tag(rq)) {
+ /*
+ * The initial allocation attempt failed, so we need to
+ * rerun the hardware queue when a tag is freed. The
+ * waitqueue takes care of that. If the queue is run
+ * before we add this entry back on the dispatch list,
+ * we'll re-run it below.
+ */
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
+ /*
+ * Budgets that were not acquired here are released
+ * together when the partial dispatch is handled.
+ */
+ if (need_budget)
+ blk_mq_put_dispatch_budget(rq->q);
+ return PREP_DISPATCH_NO_TAG;
+ }
+ }
+
+ return PREP_DISPATCH_OK;
+}
+
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
+static void blk_mq_release_budgets(struct request_queue *q,
+ unsigned int nr_budgets)
+{
+ int i;
+
+ for (i = 0; i < nr_budgets; i++)
+ blk_mq_put_dispatch_budget(q);
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- bool got_budget)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+ unsigned int nr_budgets)
{
- struct blk_mq_hw_ctx *hctx;
+ enum prep_dispatch prep;
+ struct request_queue *q = hctx->queue;
struct request *rq, *nxt;
- bool no_tag = false;
int errors, queued;
blk_status_t ret = BLK_STS_OK;
- bool no_budget_avail = false;
LIST_HEAD(zone_list);
if (list_empty(list))
return false;
- WARN_ON(!list_is_singular(list) && got_budget);
-
/*
* Now process all the entries, sending them to the driver.
*/
@@ -1232,32 +1358,10 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
rq = list_first_entry(list, struct request, queuelist);
- hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
- blk_mq_put_driver_tag(rq);
- no_budget_avail = true;
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ if (prep != PREP_DISPATCH_OK)
break;
- }
-
- if (!blk_mq_get_driver_tag(rq)) {
- /*
- * The initial allocation attempt failed, so we need to
- * rerun the hardware queue when a tag is freed. The
- * waitqueue takes care of that. If the queue is run
- * before we add this entry back on the dispatch list,
- * we'll re-run it below.
- */
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
- blk_mq_put_dispatch_budget(hctx);
- /*
- * For non-shared tags, the RESTART check
- * will suffice.
- */
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
- no_tag = true;
- break;
- }
- }
list_del_init(&rq->queuelist);
@@ -1274,31 +1378,35 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
bd.last = !blk_mq_get_driver_tag(nxt);
}
+ /*
+ * Once the request is queued to the LLD, the budget no longer
+ * needs to be covered.
+ */
+ if (nr_budgets)
+ nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_handle_dev_resource(rq, list);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
break;
- } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_handle_dev_resource(rq, list);
+ goto out;
+ case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
- if (list_empty(list))
- break;
- continue;
- }
-
- if (unlikely(ret != BLK_STS_OK)) {
+ break;
+ default:
errors++;
blk_mq_end_request(rq, BLK_STS_IOERR);
- continue;
}
-
- queued++;
} while (!list_empty(list));
-
+out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
@@ -1310,6 +1418,12 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*/
if (!list_empty(list)) {
bool needs_restart;
+ /* For non-shared tags, the RESTART check will suffice */
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
+ (hctx->flags & BLK_MQ_F_TAG_SHARED);
+ bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET;
+
+ blk_mq_release_budgets(q, nr_budgets);
/*
* If we didn't flush the entire list, we could have told
@@ -1361,13 +1475,6 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
} else
blk_mq_update_dispatch_busy(hctx, false);
- /*
- * If the host/device is unable to accept more work, inform the
- * caller of that.
- */
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- return false;
-
return (queued + errors) != 0;
}
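To show how the new nr_budgets argument is meant to be used, here is a hypothetical caller that pre-acquires one budget per request before flushing a list; the real callers live in blk-mq-sched.c and are not part of this diff. blk_mq_dispatch_rq_list() consumes one pre-acquired budget per request it queues, falls back to per-request acquisition once they run out, and releases any remainder through blk_mq_release_budgets() on a partial dispatch.

/* Hypothetical caller sketch, not part of this patch. */
static bool mydispatch_flush_list(struct blk_mq_hw_ctx *hctx,
                                  struct list_head *rq_list)
{
        struct request_queue *q = hctx->queue;
        unsigned int nr_budgets = 0;
        struct request *rq;

        /* grab one budget up front for every request we intend to send */
        list_for_each_entry(rq, rq_list, queuelist) {
                if (!blk_mq_get_dispatch_budget(q))
                        break;
                nr_budgets++;
        }

        /* the dispatch loop decrements nr_budgets as requests are queued */
        return blk_mq_dispatch_rq_list(hctx, rq_list, nr_budgets);
}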
@@ -1896,11 +2003,11 @@ static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
if (q->elevator && !bypass_insert)
goto insert;
- if (!blk_mq_get_dispatch_budget(hctx))
+ if (!blk_mq_get_dispatch_budget(q))
goto insert;
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q);
goto insert;
}
@@ -2005,8 +2112,7 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
}
/**
- * blk_mq_make_request - Create and send a request to block device.
- * @q: Request queue pointer.
+ * blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
*
Builds up a request structure from @q and @bio and sends it to the device. The
@@ -2020,8 +2126,9 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* Returns: Request queue cookie.
*/
-blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+blk_qc_t blk_mq_submit_bio(struct bio *bio)
{
+ struct request_queue *q = bio->bi_disk->queue;
const int is_sync = op_is_sync(bio->bi_opf);
const int is_flush_fua = op_is_flush(bio->bi_opf);
struct blk_mq_alloc_data data = {
@@ -2035,7 +2142,7 @@ blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
blk_status_t ret;
blk_queue_bounce(q, &bio);
- __blk_queue_split(q, &bio, &nr_segs);
+ __blk_queue_split(&bio, &nr_segs);
if (!bio_integrity_prep(bio))
goto queue_exit;
@@ -2146,7 +2253,7 @@ queue_exit:
blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
+EXPORT_SYMBOL_GPL(blk_mq_submit_bio); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
@@ -2792,7 +2899,7 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
- list_del_rcu(&q->tag_set_list);
+ list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
set->flags &= ~BLK_MQ_F_TAG_SHARED;
@@ -2819,7 +2926,7 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
}
if (set->flags & BLK_MQ_F_TAG_SHARED)
queue_set_hctx_shared(q, true);
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
+ list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
@@ -2886,7 +2993,7 @@ struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
{
struct request_queue *uninit_q, *q;
- uninit_q = __blk_alloc_queue(set->numa_node);
+ uninit_q = blk_alloc_queue(set->numa_node);
if (!uninit_q)
return ERR_PTR(-ENOMEM);
uninit_q->queuedata = queuedata;
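blk_mq_init_queue_data() backs blk_mq_init_queue(), which drivers call once a tag set has been set up. A minimal, hypothetical init sequence around this path is sketched below; the mydrv_* names and the numbers are illustrative only.

/* Hypothetical driver init sketch, not part of this patch. */
static struct blk_mq_tag_set mydrv_tag_set;

static struct request_queue *mydrv_create_queue(void)
{
        struct request_queue *q;
        int ret;

        mydrv_tag_set.ops          = &mydrv_mq_ops;     /* hypothetical ops */
        mydrv_tag_set.nr_hw_queues = 1;
        mydrv_tag_set.queue_depth  = 64;
        mydrv_tag_set.numa_node    = NUMA_NO_NODE;
        mydrv_tag_set.cmd_size     = 0;
        mydrv_tag_set.flags        = BLK_MQ_F_SHOULD_MERGE;

        ret = blk_mq_alloc_tag_set(&mydrv_tag_set);
        if (ret)
                return ERR_PTR(ret);

        /* ends up in blk_mq_init_queue_data() -> blk_alloc_queue() above */
        q = blk_mq_init_queue(&mydrv_tag_set);
        if (IS_ERR(q))
                blk_mq_free_tag_set(&mydrv_tag_set);

        return q;
}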
@@ -3760,6 +3867,15 @@ EXPORT_SYMBOL(blk_mq_rq_cpu);
static int __init blk_mq_init(void)
{
+ int i;
+
+ for_each_possible_cpu(i)
+ INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+ "block/softirq:dead", NULL,
+ blk_softirq_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",