diff options
Diffstat (limited to 'block/blk-mq.c')
-rw-r--r-- | block/blk-mq.c | 976 |
1 files changed, 626 insertions, 350 deletions
diff --git a/block/blk-mq.c b/block/blk-mq.c index e3c39ea8e17b..9437a5eb07cf 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -33,11 +33,11 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-tag.h" +#include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -74,14 +74,18 @@ static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) - sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + if (!sbitmap_test_bit(&hctx->ctx_map, bit)) + sbitmap_set_bit(&hctx->ctx_map, bit); } static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { - sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); + const int bit = ctx->index_hw[hctx->type]; + + sbitmap_clear_bit(&hctx->ctx_map, bit); } struct mq_inflight { @@ -89,33 +93,33 @@ struct mq_inflight { unsigned int *inflight; }; -static void blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { struct mq_inflight *mi = priv; /* - * index[0] counts the specific partition that was asked for. index[1] - * counts the ones that are active on the whole device, so increment - * that if mi->part is indeed a partition, and not a whole device. + * index[0] counts the specific partition that was asked for. */ if (rq->part == mi->part) mi->inflight[0]++; - if (mi->part->partno) - mi->inflight[1]++; + + return true; } -void blk_mq_in_flight(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) { + unsigned inflight[2]; struct mq_inflight mi = { .part = part, .inflight = inflight, }; inflight[0] = inflight[1] = 0; blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); + + return inflight[0]; } -static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { @@ -123,6 +127,8 @@ static void blk_mq_check_inflight_rw(struct blk_mq_hw_ctx *hctx, if (rq->part == mi->part) mi->inflight[rq_data_dir(rq)]++; + + return true; } void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, @@ -141,7 +147,7 @@ void blk_freeze_queue_start(struct request_queue *q) freeze_depth = atomic_inc_return(&q->mq_freeze_depth); if (freeze_depth == 1) { percpu_ref_kill(&q->q_usage_counter); - if (q->mq_ops) + if (queue_is_mq(q)) blk_mq_run_hw_queues(q, false); } } @@ -176,8 +182,6 @@ void blk_freeze_queue(struct request_queue *q) * exported to drivers as the only user for unfreeze is blk_mq. */ blk_freeze_queue_start(q); - if (!q->mq_ops) - blk_drain_queue(q); blk_mq_freeze_queue_wait(q); } @@ -198,7 +202,7 @@ void blk_mq_unfreeze_queue(struct request_queue *q) freeze_depth = atomic_dec_return(&q->mq_freeze_depth); WARN_ON_ONCE(freeze_depth < 0); if (!freeze_depth) { - percpu_ref_reinit(&q->q_usage_counter); + percpu_ref_resurrect(&q->q_usage_counter); wake_up_all(&q->mq_freeze_wq); } } @@ -274,6 +278,15 @@ bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) } EXPORT_SYMBOL(blk_mq_can_queue); +/* + * Only need start/end time stamping if we have stats enabled, or using + * an IO scheduler. + */ +static inline bool blk_mq_need_time_stamp(struct request *rq) +{ + return (rq->rq_flags & RQF_IO_STAT) || rq->q->elevator; +} + static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, unsigned int tag, unsigned int op) { @@ -297,8 +310,8 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, /* csd/requeue_work/fifo_time is initialized before use */ rq->q = data->q; rq->mq_ctx = data->ctx; + rq->mq_hctx = data->hctx; rq->rq_flags = rq_flags; - rq->cpu = -1; rq->cmd_flags = op; if (data->flags & BLK_MQ_REQ_PREEMPT) rq->rq_flags |= RQF_PREEMPT; @@ -309,7 +322,10 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, RB_CLEAR_NODE(&rq->rb_node); rq->rq_disk = NULL; rq->part = NULL; - rq->start_time_ns = ktime_get_ns(); + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) @@ -318,27 +334,22 @@ static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, rq->special = NULL; /* tag was already set */ rq->extra_len = 0; - rq->__deadline = 0; + WRITE_ONCE(rq->deadline, 0); - INIT_LIST_HEAD(&rq->timeout_list); rq->timeout = 0; rq->end_io = NULL; rq->end_io_data = NULL; rq->next_rq = NULL; -#ifdef CONFIG_BLK_CGROUP - rq->rl = NULL; -#endif - data->ctx->rq_dispatched[op_is_sync(op)]++; refcount_set(&rq->ref, 1); return rq; } static struct request *blk_mq_get_request(struct request_queue *q, - struct bio *bio, unsigned int op, - struct blk_mq_alloc_data *data) + struct bio *bio, + struct blk_mq_alloc_data *data) { struct elevator_queue *e = q->elevator; struct request *rq; @@ -352,8 +363,9 @@ static struct request *blk_mq_get_request(struct request_queue *q, put_ctx_on_error = true; } if (likely(!data->hctx)) - data->hctx = blk_mq_map_queue(q, data->ctx->cpu); - if (op & REQ_NOWAIT) + data->hctx = blk_mq_map_queue(q, data->cmd_flags, + data->ctx->cpu); + if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; if (e) { @@ -364,9 +376,10 @@ static struct request *blk_mq_get_request(struct request_queue *q, * dispatch list. Don't include reserved tags in the * limiting, as it isn't useful. */ - if (!op_is_flush(op) && e->type->ops.mq.limit_depth && + if (!op_is_flush(data->cmd_flags) && + e->type->ops.limit_depth && !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.mq.limit_depth(op, data); + e->type->ops.limit_depth(data->cmd_flags, data); } else { blk_mq_tag_busy(data->hctx); } @@ -381,14 +394,14 @@ static struct request *blk_mq_get_request(struct request_queue *q, return NULL; } - rq = blk_mq_rq_ctx_init(data, tag, op); - if (!op_is_flush(op)) { + rq = blk_mq_rq_ctx_init(data, tag, data->cmd_flags); + if (!op_is_flush(data->cmd_flags)) { rq->elv.icq = NULL; - if (e && e->type->ops.mq.prepare_request) { - if (e->type->icq_cache && rq_ioc(bio)) - blk_mq_sched_assign_ioc(rq, bio); + if (e && e->type->ops.prepare_request) { + if (e->type->icq_cache) + blk_mq_sched_assign_ioc(rq); - e->type->ops.mq.prepare_request(rq, bio); + e->type->ops.prepare_request(rq, bio); rq->rq_flags |= RQF_ELVPRIV; } } @@ -399,7 +412,7 @@ static struct request *blk_mq_get_request(struct request_queue *q, struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; int ret; @@ -407,7 +420,7 @@ struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, if (ret) return ERR_PTR(ret); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -425,7 +438,7 @@ EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) { - struct blk_mq_alloc_data alloc_data = { .flags = flags }; + struct blk_mq_alloc_data alloc_data = { .flags = flags, .cmd_flags = op }; struct request *rq; unsigned int cpu; int ret; @@ -458,7 +471,7 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, cpu = cpumask_first_and(alloc_data.hctx->cpumask, cpu_online_mask); alloc_data.ctx = __blk_mq_get_ctx(q, cpu); - rq = blk_mq_get_request(q, NULL, op, &alloc_data); + rq = blk_mq_get_request(q, NULL, &alloc_data); blk_queue_exit(q); if (!rq) @@ -472,9 +485,11 @@ static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; const int sched_tag = rq->internal_tag; + blk_pm_mark_last_busy(rq); + rq->mq_hctx = NULL; if (rq->tag != -1) blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); if (sched_tag != -1) @@ -488,11 +503,11 @@ void blk_mq_free_request(struct request *rq) struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.mq.finish_request) - e->type->ops.mq.finish_request(rq); + if (e && e->type->ops.finish_request) + e->type->ops.finish_request(rq); if (rq->elv.icq) { put_io_context(rq->elv.icq->ioc); rq->elv.icq = NULL; @@ -508,9 +523,6 @@ void blk_mq_free_request(struct request *rq) rq_qos_done(q, rq); - if (blk_rq_rl(rq)) - blk_put_rl(blk_rq_rl(rq)); - WRITE_ONCE(rq->state, MQ_RQ_IDLE); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); @@ -519,13 +531,19 @@ EXPORT_SYMBOL_GPL(blk_mq_free_request); inline void __blk_mq_end_request(struct request *rq, blk_status_t error) { - u64 now = ktime_get_ns(); + u64 now = 0; + + if (blk_mq_need_time_stamp(rq)) + now = ktime_get_ns(); if (rq->rq_flags & RQF_STATS) { blk_mq_poll_stats_start(rq->q); blk_stat_add(rq, now); } + if (rq->internal_tag != -1) + blk_mq_sched_completed_request(rq, now); + blk_account_io_done(rq, now); if (rq->end_io) { @@ -550,28 +568,45 @@ EXPORT_SYMBOL(blk_mq_end_request); static void __blk_mq_complete_request_remote(void *data) { struct request *rq = data; + struct request_queue *q = rq->q; - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } static void __blk_mq_complete_request(struct request *rq) { struct blk_mq_ctx *ctx = rq->mq_ctx; + struct request_queue *q = rq->q; bool shared = false; int cpu; - if (!blk_mq_mark_complete(rq)) + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + /* + * Most of single queue controllers, there is only one irq vector + * for handling IO completion, and the only irq's affinity is set + * as all possible CPUs. On most of ARCHs, this affinity means the + * irq is handled on one specific CPU. + * + * So complete IO reqeust in softirq context in case of single queue + * for not degrading IO performance by irqsoff latency. + */ + if (q->nr_hw_queues == 1) { + __blk_complete_request(rq); return; - if (rq->internal_tag != -1) - blk_mq_sched_completed_request(rq); + } - if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { - rq->q->softirq_done_fn(rq); + /* + * For a polled request, always complete locallly, it's pointless + * to redirect the completion. + */ + if ((rq->cmd_flags & REQ_HIPRI) || + !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { + q->mq_ops->complete(rq); return; } cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) + if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) shared = cpus_share_cache(cpu, ctx->cpu); if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { @@ -580,7 +615,7 @@ static void __blk_mq_complete_request(struct request *rq) rq->csd.flags = 0; smp_call_function_single_async(ctx->cpu, &rq->csd); } else { - rq->q->softirq_done_fn(rq); + q->mq_ops->complete(rq); } put_cpu(); } @@ -613,11 +648,12 @@ static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) * Ends all I/O on a request. It does not handle partial completions. * The actual completion happens out-of-order, through a IPI handler. **/ -void blk_mq_complete_request(struct request *rq) +bool blk_mq_complete_request(struct request *rq) { if (unlikely(blk_should_fake_timeout(rq->q))) - return; + return false; __blk_mq_complete_request(rq); + return true; } EXPORT_SYMBOL(blk_mq_complete_request); @@ -684,7 +720,7 @@ void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(blk_queued_rq(rq)); + BUG_ON(!list_empty(&rq->queuelist)); blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -701,12 +737,20 @@ static void blk_mq_requeue_work(struct work_struct *work) spin_unlock_irq(&q->requeue_lock); list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->rq_flags & RQF_SOFTBARRIER)) + if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) continue; rq->rq_flags &= ~RQF_SOFTBARRIER; list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, true, false, false); + /* + * If RQF_DONTPREP, rq has contained some driver specific + * data, so insert it to hctx dispatch list to avoid any + * merge. + */ + if (rq->rq_flags & RQF_DONTPREP) + blk_mq_request_bypass_insert(rq, false); + else + blk_mq_sched_insert_request(rq, true, false, false); } while (!list_empty(&rq_list)) { @@ -769,6 +813,32 @@ struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) } EXPORT_SYMBOL(blk_mq_tag_to_rq); +static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, + void *priv, bool reserved) +{ + /* + * If we find a request that is inflight and the queue matches, + * we know the queue is busy. Return false to stop the iteration. + */ + if (rq->state == MQ_RQ_IN_FLIGHT && rq->q == hctx->queue) { + bool *busy = priv; + + *busy = true; + return false; + } + + return true; +} + +bool blk_mq_queue_inflight(struct request_queue *q) +{ + bool busy = false; + + blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); + return busy; +} +EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); + static void blk_mq_rq_timed_out(struct request *req, bool reserved) { req->rq_flags |= RQF_TIMED_OUT; @@ -793,7 +863,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) if (rq->rq_flags & RQF_TIMED_OUT) return false; - deadline = blk_rq_deadline(rq); + deadline = READ_ONCE(rq->deadline); if (time_after_eq(jiffies, deadline)) return true; @@ -804,7 +874,7 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; } -static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, +static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, struct request *rq, void *priv, bool reserved) { unsigned long *next = priv; @@ -814,7 +884,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * so we're not unnecessarilly synchronizing across CPUs. */ if (!blk_mq_req_expired(rq, next)) - return; + return true; /* * We have reason to believe the request may be expired. Take a @@ -826,7 +896,7 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, * timeout handler to posting a natural completion. */ if (!refcount_inc_not_zero(&rq->ref)) - return; + return true; /* * The request is now locked and cannot be reallocated underneath the @@ -838,6 +908,8 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_rq_timed_out(rq, reserved); if (refcount_dec_and_test(&rq->ref)) __blk_mq_free_request(rq); + + return true; } static void blk_mq_timeout_work(struct work_struct *work) @@ -894,9 +966,10 @@ static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) struct flush_busy_ctx_data *flush_data = data; struct blk_mq_hw_ctx *hctx = flush_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - list_splice_tail_init(&ctx->rq_list, flush_data->list); + list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); sbitmap_clear_bit(sb, bitnr); spin_unlock(&ctx->lock); return true; @@ -928,12 +1001,13 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct dispatch_rq_data *dispatch_data = data; struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; + enum hctx_type type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - dispatch_data->rq = list_entry_rq(ctx->rq_list.next); + if (!list_empty(&ctx->rq_lists[type])) { + dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); list_del_init(&dispatch_data->rq->queuelist); - if (list_empty(&ctx->rq_list)) + if (list_empty(&ctx->rq_lists[type])) sbitmap_clear_bit(sb, bitnr); } spin_unlock(&ctx->lock); @@ -944,7 +1018,7 @@ static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start) { - unsigned off = start ? start->index_hw : 0; + unsigned off = start ? start->index_hw[hctx->type] : 0; struct dispatch_rq_data data = { .hctx = hctx, .rq = NULL, @@ -968,8 +1042,9 @@ bool blk_mq_get_driver_tag(struct request *rq) { struct blk_mq_alloc_data data = { .q = rq->q, - .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), + .hctx = rq->mq_hctx, .flags = BLK_MQ_REQ_NOWAIT, + .cmd_flags = rq->cmd_flags, }; bool shared; @@ -1133,7 +1208,7 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, rq = list_first_entry(list, struct request, queuelist); - hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); + hctx = rq->mq_hctx; if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) break; @@ -1206,6 +1281,14 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, if (!list_empty(list)) { bool needs_restart; + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (q->mq_ops->commit_rqs) + q->mq_ops->commit_rqs(hctx); + spin_lock(&hctx->lock); list_splice_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); @@ -1535,15 +1618,16 @@ static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, bool at_head) { struct blk_mq_ctx *ctx = rq->mq_ctx; + enum hctx_type type = hctx->type; lockdep_assert_held(&ctx->lock); trace_block_rq_insert(hctx->queue, rq); if (at_head) - list_add(&rq->queuelist, &ctx->rq_list); + list_add(&rq->queuelist, &ctx->rq_lists[type]); else - list_add_tail(&rq->queuelist, &ctx->rq_list); + list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); } void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, @@ -1563,8 +1647,7 @@ void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ void blk_mq_request_bypass_insert(struct request *rq, bool run_queue) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); list_add_tail(&rq->queuelist, &hctx->dispatch); @@ -1579,6 +1662,7 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, { struct request *rq; + enum hctx_type type = hctx->type; /* * preemption doesn't flush plug list, so it's possible ctx->cpu is @@ -1590,35 +1674,46 @@ void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, } spin_lock(&ctx->lock); - list_splice_tail_init(list, &ctx->rq_list); + list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); } -static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) { struct request *rqa = container_of(a, struct request, queuelist); struct request *rqb = container_of(b, struct request, queuelist); - return !(rqa->mq_ctx < rqb->mq_ctx || - (rqa->mq_ctx == rqb->mq_ctx && - blk_rq_pos(rqa) < blk_rq_pos(rqb))); + if (rqa->mq_ctx < rqb->mq_ctx) + return -1; + else if (rqa->mq_ctx > rqb->mq_ctx) + return 1; + else if (rqa->mq_hctx < rqb->mq_hctx) + return -1; + else if (rqa->mq_hctx > rqb->mq_hctx) + return 1; + + return blk_rq_pos(rqa) > blk_rq_pos(rqb); } void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) { + struct blk_mq_hw_ctx *this_hctx; struct blk_mq_ctx *this_ctx; struct request_queue *this_q; struct request *rq; LIST_HEAD(list); - LIST_HEAD(ctx_list); + LIST_HEAD(rq_list); unsigned int depth; list_splice_init(&plug->mq_list, &list); + plug->rq_count = 0; - list_sort(NULL, &list, plug_ctx_cmp); + if (plug->rq_count > 2 && plug->multiple_queues) + list_sort(NULL, &list, plug_rq_cmp); this_q = NULL; + this_hctx = NULL; this_ctx = NULL; depth = 0; @@ -1626,30 +1721,31 @@ void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) rq = list_entry_rq(list.next); list_del_init(&rq->queuelist); BUG_ON(!rq->q); - if (rq->mq_ctx != this_ctx) { - if (this_ctx) { + if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, - &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, + &rq_list, from_schedule); } - this_ctx = rq->mq_ctx; this_q = rq->q; + this_ctx = rq->mq_ctx; + this_hctx = rq->mq_hctx; depth = 0; } depth++; - list_add_tail(&rq->queuelist, &ctx_list); + list_add_tail(&rq->queuelist, &rq_list); } /* - * If 'this_ctx' is set, we know we have entries to complete - * on 'ctx_list'. Do those. + * If 'this_hctx' is set, we know we have entries to complete + * on 'rq_list'. Do those. */ - if (this_ctx) { + if (this_hctx) { trace_block_unplug(this_q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, + blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, from_schedule); } } @@ -1658,27 +1754,17 @@ static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) { blk_init_request_from_bio(rq, bio); - blk_rq_set_rl(rq, blk_get_rl(rq->q, bio)); - blk_account_io_start(rq, true); } -static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) -{ - if (rq->tag != -1) - return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); - - return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); -} - static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, - blk_qc_t *cookie) + blk_qc_t *cookie, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, - .last = true, + .last = last, }; blk_qc_t new_cookie; blk_status_t ret; @@ -1710,77 +1796,74 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, +blk_status_t blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, struct request *rq, blk_qc_t *cookie, - bool bypass_insert) + bool bypass, bool last) { struct request_queue *q = rq->q; bool run_queue = true; + blk_status_t ret = BLK_STS_RESOURCE; + int srcu_idx; + bool force = false; + hctx_lock(hctx, &srcu_idx); /* - * RCU or SRCU read lock is needed before checking quiesced flag. + * hctx_lock is needed before checking quiesced flag. * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. + * When queue is stopped or quiesced, ignore 'bypass', insert + * and return BLK_STS_OK to caller, and avoid driver to try to + * dispatch again. */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { + if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) { run_queue = false; - bypass_insert = false; - goto insert; + bypass = false; + goto out_unlock; } - if (q->elevator && !bypass_insert) - goto insert; + if (unlikely(q->elevator && !bypass)) + goto out_unlock; if (!blk_mq_get_dispatch_budget(hctx)) - goto insert; + goto out_unlock; if (!blk_mq_get_driver_tag(rq)) { blk_mq_put_dispatch_budget(hctx); - goto insert; + goto out_unlock; } - return __blk_mq_issue_directly(hctx, rq, cookie); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_sched_insert_request(rq, false, run_queue, false); - return BLK_STS_OK; -} - -static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) -{ - blk_status_t ret; - int srcu_idx; - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_sched_insert_request(rq, false, true, false); - else if (ret != BLK_STS_OK) - blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); -} - -blk_status_t blk_mq_request_issue_directly(struct request *rq) -{ - blk_status_t ret; - int srcu_idx; - blk_qc_t unused_cookie; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(rq->q, ctx->cpu); - - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true); + /* + * Always add a request that has been through + *.queue_rq() to the hardware dispatch list. + */ + force = true; + ret = __blk_mq_issue_directly(hctx, rq, cookie, last); +out_unlock: hctx_unlock(hctx, srcu_idx); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_DEV_RESOURCE: + case BLK_STS_RESOURCE: + if (force) { + blk_mq_request_bypass_insert(rq, run_queue); + /* + * We have to return BLK_STS_OK for the DM + * to avoid livelock. Otherwise, we return + * the real result to indicate whether the + * request is direct-issued successfully. + */ + ret = bypass ? BLK_STS_OK : ret; + } else if (!bypass) { + blk_mq_sched_insert_request(rq, false, + run_queue, false); + } + break; + default: + if (!bypass) + blk_mq_end_request(rq, ret); + break; + } return ret; } @@ -1788,21 +1871,42 @@ blk_status_t blk_mq_request_issue_directly(struct request *rq) void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { + blk_qc_t unused; + blk_status_t ret = BLK_STS_OK; + while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - ret = blk_mq_request_issue_directly(rq); - if (ret != BLK_STS_OK) { - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - list_add(&rq->queuelist, list); - break; - } - blk_mq_end_request(rq, ret); - } + if (ret == BLK_STS_OK) + ret = blk_mq_try_issue_directly(hctx, rq, &unused, + false, + list_empty(list)); + else + blk_mq_sched_insert_request(rq, false, true, false); + } + + /* + * If we didn't flush the entire list, we could have told + * the driver there was more coming, but that turned out to + * be a lie. + */ + if (ret != BLK_STS_OK && hctx->queue->mq_ops->commit_rqs) + hctx->queue->mq_ops->commit_rqs(hctx); +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count++; + if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { + struct request *tmp; + + tmp = list_first_entry(&plug->mq_list, struct request, + queuelist); + if (tmp->q != rq->q) + plug->multiple_queues = true; } } @@ -1810,9 +1914,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) { const int is_sync = op_is_sync(bio->bi_opf); const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { .flags = 0 }; + struct blk_mq_alloc_data data = { .flags = 0}; struct request *rq; - unsigned int request_count = 0; struct blk_plug *plug; struct request *same_queue_rq = NULL; blk_qc_t cookie; @@ -1825,17 +1928,16 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) + blk_attempt_plug_merge(q, bio, &same_queue_rq)) return BLK_QC_T_NONE; if (blk_mq_sched_bio_merge(q, bio)) return BLK_QC_T_NONE; - rq_qos_throttle(q, bio, NULL); - - trace_block_getrq(q, bio, bio->bi_opf); + rq_qos_throttle(q, bio); - rq = blk_mq_get_request(q, bio, bio->bi_opf, &data); + data.cmd_flags = bio->bi_opf; + rq = blk_mq_get_request(q, bio, &data); if (unlikely(!rq)) { rq_qos_cleanup(q, bio); if (bio->bi_opf & REQ_NOWAIT) @@ -1843,6 +1945,8 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } + trace_block_getrq(q, bio, bio->bi_opf); + rq_qos_track(q, rq, bio); cookie = request_to_qc_t(data.hctx, rq); @@ -1855,21 +1959,17 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) /* bypass scheduler for flush rq */ blk_insert_flush(rq); blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && q->nr_hw_queues == 1) { + } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs)) { + /* + * Use plugging if we have a ->commit_rqs() hook as well, as + * we know the driver uses bd->last in a smart fashion. + */ + unsigned int request_count = plug->rq_count; struct request *last = NULL; blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - /* - * @request_count may become stale because of schedule - * out, so check the list again. - */ - if (list_empty(&plug->mq_list)) - request_count = 0; - else if (blk_queue_nomerges(q)) - request_count = blk_plug_queued_count(q); - if (!request_count) trace_block_plug(q); else @@ -1881,7 +1981,7 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) trace_block_plug(q); } - list_add_tail(&rq->queuelist, &plug->mq_list); + blk_add_rq_to_plug(plug, rq); } else if (plug && !blk_queue_nomerges(q)) { blk_mq_bio_to_request(rq, bio); @@ -1894,23 +1994,24 @@ static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) */ if (list_empty(&plug->mq_list)) same_queue_rq = NULL; - if (same_queue_rq) + if (same_queue_rq) { list_del_init(&same_queue_rq->queuelist); - list_add_tail(&rq->queuelist, &plug->mq_list); + plug->rq_count--; + } + blk_add_rq_to_plug(plug, rq); blk_mq_put_ctx(data.ctx); if (same_queue_rq) { - data.hctx = blk_mq_map_queue(q, - same_queue_rq->mq_ctx->cpu); + data.hctx = same_queue_rq->mq_hctx; blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + &cookie, false, true); } } else if ((q->nr_hw_queues > 1 && is_sync) || (!q->elevator && !data.hctx->dispatch_busy)) { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); - blk_mq_try_issue_directly(data.hctx, rq, &cookie); + blk_mq_try_issue_directly(data.hctx, rq, &cookie, false, true); } else { blk_mq_put_ctx(data.ctx); blk_mq_bio_to_request(rq, bio); @@ -1968,7 +2069,7 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, struct blk_mq_tags *tags; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2024,7 +2125,7 @@ int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, size_t rq_size, left; int node; - node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); + node = blk_mq_hw_queue_to_node(&set->map[0], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2104,13 +2205,15 @@ static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; LIST_HEAD(tmp); + enum hctx_type type; hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); ctx = __blk_mq_get_ctx(hctx->queue, cpu); + type = hctx->type; spin_lock(&ctx->lock); - if (!list_empty(&ctx->rq_list)) { - list_splice_init(&ctx->rq_list, &tmp); + if (!list_empty(&ctx->rq_lists[type])) { + list_splice_init(&ctx->rq_lists[type], &tmp); blk_mq_hctx_clear_pending(hctx, ctx); } spin_unlock(&ctx->lock); @@ -2137,8 +2240,6 @@ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { - blk_mq_debugfs_unregister_hctx(hctx); - if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); @@ -2165,6 +2266,7 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; + blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } @@ -2194,12 +2296,12 @@ static int blk_mq_init_hctx(struct request_queue *q, * runtime */ hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), - GFP_KERNEL, node); + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); if (!hctx->ctxs) goto unregister_cpu_notifier; - if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, - node)) + if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node)) goto free_ctxs; hctx->nr_ctx = 0; @@ -2212,7 +2314,8 @@ static int blk_mq_init_hctx(struct request_queue *q, set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) goto free_bitmap; - hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); + hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!hctx->fq) goto exit_hctx; @@ -2222,8 +2325,6 @@ static int blk_mq_init_hctx(struct request_queue *q, if (hctx->flags & BLK_MQ_F_BLOCKING) init_srcu_struct(hctx->srcu); - blk_mq_debugfs_register_hctx(q, hctx); - return 0; free_fq: @@ -2243,24 +2344,30 @@ static int blk_mq_init_hctx(struct request_queue *q, static void blk_mq_init_cpu_queues(struct request_queue *q, unsigned int nr_hw_queues) { - unsigned int i; + struct blk_mq_tag_set *set = q->tag_set; + unsigned int i, j; for_each_possible_cpu(i) { struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); struct blk_mq_hw_ctx *hctx; + int k; __ctx->cpu = i; spin_lock_init(&__ctx->lock); - INIT_LIST_HEAD(&__ctx->rq_list); + for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) + INIT_LIST_HEAD(&__ctx->rq_lists[k]); + __ctx->queue = q; /* * Set local node, IFF we have more than one hw queue. If * not, we remain on the home node of the device */ - hctx = blk_mq_map_queue(q, i); - if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + for (j = 0; j < set->nr_maps; j++) { + hctx = blk_mq_map_queue_type(q, j, i); + if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) + hctx->numa_node = local_memory_node(cpu_to_node(i)); + } } } @@ -2286,7 +2393,7 @@ static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, unsigned int hctx_idx) { - if (set->tags[hctx_idx]) { + if (set->tags && set->tags[hctx_idx]) { blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); blk_mq_free_rq_map(set->tags[hctx_idx]); set->tags[hctx_idx] = NULL; @@ -2295,7 +2402,7 @@ static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, hctx_idx; + unsigned int i, j, hctx_idx; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2317,7 +2424,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) * If the cpu isn't present, the cpu is mapped to first hctx. */ for_each_possible_cpu(i) { - hctx_idx = q->mq_map[i]; + hctx_idx = set->map[0].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && !__blk_mq_alloc_rq_map(set, hctx_idx)) { @@ -2327,15 +2434,35 @@ static void blk_mq_map_swqueue(struct request_queue *q) * case, remap the current ctx to hctx[0] which * is guaranteed to always have tags allocated */ - q->mq_map[i] = 0; + set->map[0].mq_map[i] = 0; } ctx = per_cpu_ptr(q->queue_ctx, i); - hctx = blk_mq_map_queue(q, i); + for (j = 0; j < set->nr_maps; j++) { + if (!set->map[j].nr_queues) + continue; + + hctx = blk_mq_map_queue_type(q, j, i); + + /* + * If the CPU is already set in the mask, then we've + * mapped this one already. This can happen if + * devices share queues across queue maps. + */ + if (cpumask_test_cpu(i, hctx->cpumask)) + continue; + + cpumask_set_cpu(i, hctx->cpumask); + hctx->type = j; + ctx->index_hw[hctx->type] = hctx->nr_ctx; + hctx->ctxs[hctx->nr_ctx++] = ctx; - cpumask_set_cpu(i, hctx->cpumask); - ctx->index_hw = hctx->nr_ctx; - hctx->ctxs[hctx->nr_ctx++] = ctx; + /* + * If the nr_ctx type overflows, we have exceeded the + * amount of sw queues we can support. + */ + BUG_ON(!hctx->nr_ctx); + } } mutex_unlock(&q->sysfs_lock); @@ -2425,8 +2552,6 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, struct request_queue *q) { - q->tag_set = set; - mutex_lock(&set->tag_list_lock); /* @@ -2445,6 +2570,34 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, mutex_unlock(&set->tag_list_lock); } +/* All allocations will be freed in release handler of q->mq_kobj */ +static int blk_mq_alloc_ctxs(struct request_queue *q) +{ + struct blk_mq_ctxs *ctxs; + int cpu; + + ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); + if (!ctxs) + return -ENOMEM; + + ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); + if (!ctxs->queue_ctx) + goto fail; + + for_each_possible_cpu(cpu) { + struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); + ctx->ctxs = ctxs; + } + + q->mq_kobj = &ctxs->kobj; + q->queue_ctx = ctxs->queue_ctx; + + return 0; + fail: + kfree(ctxs); + return -ENOMEM; +} + /* * It is the actual release handler for mq, but we do it from * request queue's release handler for avoiding use-after-free @@ -2463,8 +2616,6 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - q->mq_map = NULL; - kfree(q->queue_hw_ctx); /* @@ -2472,15 +2623,13 @@ void blk_mq_release(struct request_queue *q) * both share lifetime with request queue. */ blk_mq_sysfs_deinit(q); - - free_percpu(q->queue_ctx); } struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { struct request_queue *uninit_q, *q; - uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node, NULL); + uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); if (!uninit_q) return ERR_PTR(-ENOMEM); @@ -2492,6 +2641,40 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); +/* + * Helper for setting up a queue with mq ops, given queue depth, and + * the passed in mq ops flags. + */ +struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, + unsigned int queue_depth, + unsigned int set_flags) +{ + struct request_queue *q; + int ret; + + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + + ret = blk_mq_alloc_tag_set(set); + if (ret) + return ERR_PTR(ret); + + q = blk_mq_init_queue(set); + if (IS_ERR(q)) { + blk_mq_free_tag_set(set); + return q; + } + + return q; +} +EXPORT_SYMBOL(blk_mq_init_sq_queue); + static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) { int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); @@ -2506,48 +2689,90 @@ static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) return hw_ctx_size; } +static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( + struct blk_mq_tag_set *set, struct request_queue *q, + int hctx_idx, int node) +{ + struct blk_mq_hw_ctx *hctx; + + hctx = kzalloc_node(blk_mq_hw_ctx_size(set), + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node); + if (!hctx) + return NULL; + + if (!zalloc_cpumask_var_node(&hctx->cpumask, + GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, + node)) { + kfree(hctx); + return NULL; + } + + atomic_set(&hctx->nr_active, 0); + hctx->numa_node = node; + hctx->queue_num = hctx_idx; + + if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) { + free_cpumask_var(hctx->cpumask); + kfree(hctx); + return NULL; + } + blk_mq_hctx_kobj_init(hctx); + + return hctx; +} + static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j; + int i, j, end; struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - blk_mq_sysfs_unregister(q); - /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { int node; + struct blk_mq_hw_ctx *hctx; - if (hctxs[i]) + node = blk_mq_hw_queue_to_node(&set->map[0], i); + /* + * If the hw queue has been mapped to another numa node, + * we need to realloc the hctx. If allocation fails, fallback + * to use the previous one. + */ + if (hctxs[i] && (hctxs[i]->numa_node == node)) continue; - node = blk_mq_hw_queue_to_node(q->mq_map, i); - hctxs[i] = kzalloc_node(blk_mq_hw_ctx_size(set), - GFP_KERNEL, node); - if (!hctxs[i]) - break; - - if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, - node)) { - kfree(hctxs[i]); - hctxs[i] = NULL; - break; - } - - atomic_set(&hctxs[i]->nr_active, 0); - hctxs[i]->numa_node = node; - hctxs[i]->queue_num = i; - - if (blk_mq_init_hctx(q, set, hctxs[i], i)) { - free_cpumask_var(hctxs[i]->cpumask); - kfree(hctxs[i]); - hctxs[i] = NULL; - break; + hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); + if (hctx) { + if (hctxs[i]) { + blk_mq_exit_hctx(q, set, hctxs[i], i); + kobject_put(&hctxs[i]->kobj); + } + hctxs[i] = hctx; + } else { + if (hctxs[i]) + pr_warn("Allocate new hctx on node %d fails,\ + fallback to previous one on node %d\n", + node, hctxs[i]->numa_node); + else + break; } - blk_mq_hctx_kobj_init(hctxs[i]); } - for (j = i; j < q->nr_hw_queues; j++) { + /* + * Increasing nr_hw_queues fails. Free the newly allocated + * hctxs and keep the previous q->nr_hw_queues. + */ + if (i != set->nr_hw_queues) { + j = q->nr_hw_queues; + end = i; + } else { + j = i; + end = q->nr_hw_queues; + q->nr_hw_queues = set->nr_hw_queues; + } + + for (; j < end; j++) { struct blk_mq_hw_ctx *hctx = hctxs[j]; if (hctx) { @@ -2559,9 +2784,20 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, } } - q->nr_hw_queues = i; mutex_unlock(&q->sysfs_lock); - blk_mq_sysfs_register(q); +} + +/* + * Maximum number of hardware queues we support. For single sets, we'll never + * have more than the CPUs (software queues). For multiple sets, the tag_set + * user may have set ->nr_hw_queues larger. + */ +static unsigned int nr_hw_queues(struct blk_mq_tag_set *set) +{ + if (set->nr_maps == 1) + return nr_cpu_ids; + + return max(set->nr_hw_queues, nr_cpu_ids); } struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, @@ -2576,19 +2812,17 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, if (!q->poll_cb) goto err_exit; - q->queue_ctx = alloc_percpu(struct blk_mq_ctx); - if (!q->queue_ctx) + if (blk_mq_alloc_ctxs(q)) goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); - q->queue_hw_ctx = kcalloc_node(nr_cpu_ids, sizeof(*(q->queue_hw_ctx)), + q->nr_queues = nr_hw_queues(set); + q->queue_hw_ctx = kcalloc_node(q->nr_queues, sizeof(*(q->queue_hw_ctx)), GFP_KERNEL, set->numa_node); if (!q->queue_hw_ctx) - goto err_percpu; - - q->mq_map = set->mq_map; + goto err_sys_init; blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) @@ -2597,12 +2831,15 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); - q->nr_queues = nr_cpu_ids; + q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); if (!(set->flags & BLK_MQ_F_SG_MERGE)) - queue_flag_set_unlocked(QUEUE_FLAG_NO_SG_MERGE, q); + blk_queue_flag_set(QUEUE_FLAG_NO_SG_MERGE, q); q->sg_reserved_size = INT_MAX; @@ -2611,8 +2848,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, spin_lock_init(&q->requeue_lock); blk_queue_make_request(q, blk_mq_make_request); - if (q->mq_ops->poll) - q->poll_fn = blk_mq_poll; /* * Do this after blk_queue_make_request() overrides it... @@ -2624,9 +2859,6 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, */ q->poll_nsec = -1; - if (set->ops->complete) - blk_queue_softirq_done(q, set->ops->complete); - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); @@ -2643,8 +2875,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, err_hctxs: kfree(q->queue_hw_ctx); -err_percpu: - free_percpu(q->queue_ctx); +err_sys_init: + blk_mq_sysfs_deinit(q); err_exit: q->mq_ops = NULL; return ERR_PTR(-ENOMEM); @@ -2659,25 +2891,6 @@ void blk_mq_free_queue(struct request_queue *q) blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); } -/* Basically redo blk_mq_init_queue with queue frozen */ -static void blk_mq_queue_reinit(struct request_queue *q) -{ - WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth)); - - blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister(q); - - /* - * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe - * we should change hctx numa_node according to the new topology (this - * involves freeing and re-allocating memory, worth doing?) - */ - blk_mq_map_swqueue(q); - - blk_mq_sysfs_register(q); - blk_mq_debugfs_register_hctxs(q); -} - static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; @@ -2732,7 +2945,9 @@ static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) { - if (set->ops->map_queues) { + if (set->ops->map_queues && !is_kdump_kernel()) { + int i; + /* * transport .map_queues is usually done in the following * way: @@ -2740,18 +2955,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) * for (queue = 0; queue < set->nr_hw_queues; queue++) { * mask = get_cpu_mask(queue) * for_each_cpu(cpu, mask) - * set->mq_map[cpu] = queue; + * set->map[x].mq_map[cpu] = queue; * } * * When we need to remap, the table has to be cleared for * killing stale mapping since one CPU may not be mapped * to any hw queue. */ - blk_mq_clear_mq_map(set); + for (i = 0; i < set->nr_maps; i++) + blk_mq_clear_mq_map(&set->map[i]); return set->ops->map_queues(set); - } else - return blk_mq_map_queues(set); + } else { + BUG_ON(set->nr_maps > 1); + return blk_mq_map_queues(&set->map[0]); + } } /* @@ -2762,7 +2980,7 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) */ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) { - int ret; + int i, ret; BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); @@ -2785,6 +3003,11 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->queue_depth = BLK_MQ_MAX_DEPTH; } + if (!set->nr_maps) + set->nr_maps = 1; + else if (set->nr_maps > HCTX_MAX_TYPES) + return -EINVAL; + /* * If a crashdump is active, then we are potentially in a very * memory constrained environment. Limit us to 1 queue and @@ -2792,24 +3015,30 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) */ if (is_kdump_kernel()) { set->nr_hw_queues = 1; + set->nr_maps = 1; set->queue_depth = min(64U, set->queue_depth); } /* - * There is no use for more h/w queues than cpus. + * There is no use for more h/w queues than cpus if we just have + * a single map */ - if (set->nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - set->tags = kcalloc_node(nr_cpu_ids, sizeof(struct blk_mq_tags *), + set->tags = kcalloc_node(nr_hw_queues(set), sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); if (!set->tags) return -ENOMEM; ret = -ENOMEM; - set->mq_map = kcalloc_node(nr_cpu_ids, sizeof(*set->mq_map), - GFP_KERNEL, set->numa_node); - if (!set->mq_map) - goto out_free_tags; + for (i = 0; i < set->nr_maps; i++) { + set->map[i].mq_map = kcalloc_node(nr_cpu_ids, + sizeof(set->map[i].mq_map[0]), + GFP_KERNEL, set->numa_node); + if (!set->map[i].mq_map) + goto out_free_mq_map; + set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; + } ret = blk_mq_update_queue_map(set); if (ret) @@ -2825,9 +3054,10 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) return 0; out_free_mq_map: - kfree(set->mq_map); - set->mq_map = NULL; -out_free_tags: + for (i = 0; i < set->nr_maps; i++) { + kfree(set->map[i].mq_map); + set->map[i].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; return ret; @@ -2836,13 +3066,15 @@ EXPORT_SYMBOL(blk_mq_alloc_tag_set); void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { - int i; + int i, j; - for (i = 0; i < nr_cpu_ids; i++) + for (i = 0; i < nr_hw_queues(set); i++) blk_mq_free_map_and_requests(set, i); - kfree(set->mq_map); - set->mq_map = NULL; + for (j = 0; j < set->nr_maps; j++) { + kfree(set->map[j].mq_map); + set->map[j].mq_map = NULL; + } kfree(set->tags); set->tags = NULL; @@ -2964,10 +3196,11 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, { struct request_queue *q; LIST_HEAD(head); + int prev_nr_hw_queues; lockdep_assert_held(&set->tag_list_lock); - if (nr_hw_queues > nr_cpu_ids) + if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) nr_hw_queues = nr_cpu_ids; if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues) return; @@ -2987,11 +3220,30 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, if (!blk_mq_elv_switch_none(&head, q)) goto switch_back; + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_debugfs_unregister_hctxs(q); + blk_mq_sysfs_unregister(q); + } + + prev_nr_hw_queues = set->nr_hw_queues; set->nr_hw_queues = nr_hw_queues; blk_mq_update_queue_map(set); +fallback: list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); - blk_mq_queue_reinit(q); + if (q->nr_hw_queues != set->nr_hw_queues) { + pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", + nr_hw_queues, prev_nr_hw_queues); + set->nr_hw_queues = prev_nr_hw_queues; + blk_mq_map_queues(&set->map[0]); + goto fallback; + } + blk_mq_map_swqueue(q); + } + + list_for_each_entry(q, &set->tag_list, tag_set_list) { + blk_mq_sysfs_register(q); + blk_mq_debugfs_register_hctxs(q); } switch_back: @@ -3090,15 +3342,12 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return false; /* - * poll_nsec can be: + * If we get here, hybrid polling is enabled. Hence poll_nsec can be: * - * -1: don't ever hybrid sleep * 0: use half of prev avg * >0: use this specific value */ - if (q->poll_nsec == -1) - return false; - else if (q->poll_nsec > 0) + if (q->poll_nsec > 0) nsecs = q->poll_nsec; else nsecs = blk_mq_poll_nsecs(q, hctx, rq); @@ -3135,11 +3384,57 @@ static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, return true; } -static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) +static bool blk_mq_poll_hybrid(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) +{ + struct request *rq; + + if (q->poll_nsec == -1) + return false; + + if (!blk_qc_t_is_internal(cookie)) + rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); + else { + rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); + /* + * With scheduling, if the request has completed, we'll + * get a NULL return here, as we clear the sched tag when + * that happens. The request still remains valid, like always, + * so we should be safe with just the NULL check. + */ + if (!rq) + return false; + } + + return blk_mq_poll_hybrid_sleep(q, hctx, rq); +} + +/** + * blk_poll - poll for IO completions + * @q: the queue + * @cookie: cookie passed back at IO submission time + * @spin: whether to spin for completions + * + * Description: + * Poll for completions on the passed in queue. Returns number of + * completed entries found. If @spin is true, then blk_poll will continue + * looping until at least one completion is found, unless the task is + * otherwise marked running (or we need to reschedule). + */ +int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) { - struct request_queue *q = hctx->queue; + struct blk_mq_hw_ctx *hctx; long state; + if (!blk_qc_t_valid(cookie) || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; + + if (current->plug) + blk_flush_plug_list(current->plug, false); + + hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; + /* * If we sleep, have the caller restart the poll loop to reset * the state. Like for the other success return cases, the @@ -3147,63 +3442,44 @@ static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq) * the IO isn't complete, we'll get called again and will go * straight to the busy poll loop. */ - if (blk_mq_poll_hybrid_sleep(q, hctx, rq)) - return true; + if (blk_mq_poll_hybrid(q, hctx, cookie)) + return 1; hctx->poll_considered++; state = current->state; - while (!need_resched()) { + do { int ret; hctx->poll_invoked++; - ret = q->mq_ops->poll(hctx, rq->tag); + ret = q->mq_ops->poll(hctx); if (ret > 0) { hctx->poll_success++; - set_current_state(TASK_RUNNING); - return true; + __set_current_state(TASK_RUNNING); + return ret; } if (signal_pending_state(state, current)) - set_current_state(TASK_RUNNING); + __set_current_state(TASK_RUNNING); if (current->state == TASK_RUNNING) - return true; - if (ret < 0) + return 1; + if (ret < 0 || !spin) break; cpu_relax(); - } + } while (!need_resched()); __set_current_state(TASK_RUNNING); - return false; + return 0; } +EXPORT_SYMBOL_GPL(blk_poll); -static bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie) +unsigned int blk_mq_rq_cpu(struct request *rq) { - struct blk_mq_hw_ctx *hctx; - struct request *rq; - - if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - return false; - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return __blk_mq_poll(hctx, rq); + return rq->mq_ctx->cpu; } +EXPORT_SYMBOL(blk_mq_rq_cpu); static int __init blk_mq_init(void) { |