Diffstat (limited to 'block/blk-mq-sched.c')
-rw-r--r-- | block/blk-mq-sched.c | 614
1 file changed, 292 insertions, 322 deletions
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index 2766066a15db..67c95f31b15b 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -6,7 +6,7 @@
  */
 #include <linux/kernel.h>
 #include <linux/module.h>
-#include <linux/blk-mq.h>
+#include <linux/list_sort.h>

 #include <trace/events/block.h>

@@ -14,105 +14,183 @@
 #include "blk-mq.h"
 #include "blk-mq-debugfs.h"
 #include "blk-mq-sched.h"
-#include "blk-mq-tag.h"
 #include "blk-wbt.h"

-void blk_mq_sched_free_hctx_data(struct request_queue *q,
-				 void (*exit)(struct blk_mq_hw_ctx *))
+/*
+ * Mark a hardware queue as needing a restart.
+ */
+void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
 {
-	struct blk_mq_hw_ctx *hctx;
-	int i;
+	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+		return;

-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (exit && hctx->sched_data)
-			exit(hctx);
-		kfree(hctx->sched_data);
-		hctx->sched_data = NULL;
-	}
+	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
 }
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

-void blk_mq_sched_assign_ioc(struct request *rq)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
 {
-	struct request_queue *q = rq->q;
-	struct io_context *ioc;
-	struct io_cq *icq;
+	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

 	/*
-	 * May not have an IO context if it's a passthrough request
+	 * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch)
+	 * in blk_mq_run_hw_queue(). Its pair is the barrier in
+	 * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART,
+	 * meantime new request added to hctx->dispatch is missed to check in
+	 * blk_mq_run_hw_queue().
 	 */
-	ioc = current->io_context;
-	if (!ioc)
-		return;
+	smp_mb();

-	spin_lock_irq(&q->queue_lock);
-	icq = ioc_lookup_icq(ioc, q);
-	spin_unlock_irq(&q->queue_lock);
-
-	if (!icq) {
-		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
-		if (!icq)
-			return;
-	}
-	get_io_context(icq->ioc);
-	rq->elv.icq = icq;
+	blk_mq_run_hw_queue(hctx, true);
 }

-/*
- * Mark a hardware queue as needing a restart. For shared queues, maintain
- * a count of how many hardware queues are marked for restart.
- */
-void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
+static int sched_rq_cmp(void *priv, const struct list_head *a,
+			const struct list_head *b)
 {
-	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-		return;
+	struct request *rqa = container_of(a, struct request, queuelist);
+	struct request *rqb = container_of(b, struct request, queuelist);

-	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	return rqa->mq_hctx > rqb->mq_hctx;
 }
-EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);

-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
 {
-	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
-		return;
-	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+	struct blk_mq_hw_ctx *hctx =
+		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
+	struct request *rq;
+	LIST_HEAD(hctx_list);
+	unsigned int count = 0;

-	blk_mq_run_hw_queue(hctx, true);
+	list_for_each_entry(rq, rq_list, queuelist) {
+		if (rq->mq_hctx != hctx) {
+			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
+			goto dispatch;
+		}
+		count++;
+	}
+	list_splice_tail_init(rq_list, &hctx_list);
+
+dispatch:
+	return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
 }

+#define BLK_MQ_BUDGET_DELAY	3		/* ms units */
+
 /*
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	struct elevator_queue *e = q->elevator;
+	bool multi_hctxs = false, run_queue = false;
+	bool dispatched = false, busy = false;
+	unsigned int max_dispatch;
 	LIST_HEAD(rq_list);
+	int count = 0;
+
+	if (hctx->dispatch_busy)
+		max_dispatch = 1;
+	else
+		max_dispatch = hctx->queue->nr_requests;

 	do {
 		struct request *rq;
+		int budget_token;

 		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
 			break;

-		if (!blk_mq_get_dispatch_budget(hctx))
+		if (!list_empty_careful(&hctx->dispatch)) {
+			busy = true;
+			break;
+		}
+
+		budget_token = blk_mq_get_dispatch_budget(q);
+		if (budget_token < 0)
 			break;

 		rq = e->type->ops.dispatch_request(hctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(hctx);
+			blk_mq_put_dispatch_budget(q, budget_token);
+			/*
+			 * We're releasing without dispatching. Holding the
+			 * budget could have blocked any "hctx"s with the
+			 * same queue and if we didn't dispatch then there's
+			 * no guarantee anyone will kick the queue.  Kick it
+			 * ourselves.
+			 */
+			run_queue = true;
 			break;
 		}

+		blk_mq_set_rq_budget_token(rq, budget_token);
+
 		/*
 		 * Now this rq owns the budget which has to be released
 		 * if this rq won't be queued to driver via .queue_rq()
 		 * in blk_mq_dispatch_rq_list().
 		 */
-		list_add(&rq->queuelist, &rq_list);
-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+		list_add_tail(&rq->queuelist, &rq_list);
+		count++;
+		if (rq->mq_hctx != hctx)
+			multi_hctxs = true;
+
+		/*
+		 * If we cannot get tag for the request, stop dequeueing
+		 * requests from the IO scheduler. We are unlikely to be able
+		 * to submit them anyway and it creates false impression for
+		 * scheduling heuristics that the device can take more IO.
+		 */
+		if (!blk_mq_get_driver_tag(rq))
+			break;
+	} while (count < max_dispatch);
+
+	if (!count) {
+		if (run_queue)
+			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+	} else if (multi_hctxs) {
+		/*
+		 * Requests from different hctx may be dequeued from some
+		 * schedulers, such as bfq and deadline.
+		 *
+		 * Sort the requests in the list according to their hctx,
+		 * dispatch batching requests from same hctx at a time.
+		 */
+		list_sort(NULL, &rq_list, sched_rq_cmp);
+		do {
+			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
+		} while (!list_empty(&rq_list));
+	} else {
+		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+	}
+
+	if (busy)
+		return -EAGAIN;
+	return !!dispatched;
+}
+
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+	unsigned long end = jiffies + HZ;
+	int ret;
+
+	do {
+		ret = __blk_mq_do_dispatch_sched(hctx);
+		if (ret != 1)
+			break;
+		if (need_resched() || time_is_before_jiffies(end)) {
+			blk_mq_delay_run_hw_queue(hctx, 0);
+			break;
+		}
+	} while (1);
+
+	return ret;
 }

 static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
@@ -129,29 +207,50 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
 /*
  * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
  * its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
+ *
+ * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
+ * be run again. This is necessary to avoid starving flushes.
  */
-static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
+static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 {
 	struct request_queue *q = hctx->queue;
 	LIST_HEAD(rq_list);
 	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
+	int ret = 0;
+	struct request *rq;

 	do {
-		struct request *rq;
+		int budget_token;
+
+		if (!list_empty_careful(&hctx->dispatch)) {
+			ret = -EAGAIN;
+			break;
+		}

 		if (!sbitmap_any_bit_set(&hctx->ctx_map))
 			break;

-		if (!blk_mq_get_dispatch_budget(hctx))
+		budget_token = blk_mq_get_dispatch_budget(q);
+		if (budget_token < 0)
 			break;

 		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
 		if (!rq) {
-			blk_mq_put_dispatch_budget(hctx);
+			blk_mq_put_dispatch_budget(q, budget_token);
+			/*
+			 * We're releasing without dispatching. Holding the
+			 * budget could have blocked any "hctx"s with the
+			 * same queue and if we didn't dispatch then there's
+			 * no guarantee anyone will kick the queue.  Kick it
+			 * ourselves.
+			 */
+			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
 			break;
 		}

+		blk_mq_set_rq_budget_token(rq, budget_token);
+
 		/*
 		 * Now this rq owns the budget which has to be released
 		 * if this rq won't be queued to driver via .queue_rq()
@@ -162,24 +261,17 @@ static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
 		/* round robin for fair dispatch */
 		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

-	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

 	WRITE_ONCE(hctx->dispatch_from, ctx);
+	return ret;
 }

-void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
-	struct request_queue *q = hctx->queue;
-	struct elevator_queue *e = q->elevator;
-	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
+	bool need_dispatch = false;
 	LIST_HEAD(rq_list);

-	/* RCU or SRCU read lock is needed before checking quiesced flag */
-	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
-		return;
-
-	hctx->run++;
-
 	/*
 	 * If we have previous entries on our dispatch list, grab them first for
 	 * more fair dispatch.
@@ -206,341 +298,206 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 	 */
 	if (!list_empty(&rq_list)) {
 		blk_mq_sched_mark_restart_hctx(hctx);
-		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
-			if (has_sched_dispatch)
-				blk_mq_do_dispatch_sched(hctx);
-			else
-				blk_mq_do_dispatch_ctx(hctx);
-		}
-	} else if (has_sched_dispatch) {
-		blk_mq_do_dispatch_sched(hctx);
-	} else if (hctx->dispatch_busy) {
-		/* dequeue request one by one from sw queue if queue is busy */
-		blk_mq_do_dispatch_ctx(hctx);
+		if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+			return 0;
+		need_dispatch = true;
 	} else {
-		blk_mq_flush_busy_ctxs(hctx, &rq_list);
-		blk_mq_dispatch_rq_list(q, &rq_list, false);
+		need_dispatch = hctx->dispatch_busy;
 	}
-}

-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
-		struct request **merged_request)
-{
-	struct request *rq;
+	if (hctx->queue->elevator)
+		return blk_mq_do_dispatch_sched(hctx);

-	switch (elv_merge(q, &rq, bio)) {
-	case ELEVATOR_BACK_MERGE:
-		if (!blk_mq_sched_allow_merge(q, rq, bio))
-			return false;
-		if (!bio_attempt_back_merge(q, rq, bio))
-			return false;
-		*merged_request = attempt_back_merge(q, rq);
-		if (!*merged_request)
-			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
-		return true;
-	case ELEVATOR_FRONT_MERGE:
-		if (!blk_mq_sched_allow_merge(q, rq, bio))
-			return false;
-		if (!bio_attempt_front_merge(q, rq, bio))
-			return false;
-		*merged_request = attempt_front_merge(q, rq);
-		if (!*merged_request)
-			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
-		return true;
-	case ELEVATOR_DISCARD_MERGE:
-		return bio_attempt_discard_merge(q, rq, bio);
-	default:
-		return false;
-	}
+	/* dequeue request one by one from sw queue if queue is busy */
+	if (need_dispatch)
+		return blk_mq_do_dispatch_ctx(hctx);
+	blk_mq_flush_busy_ctxs(hctx, &rq_list);
+	blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+	return 0;
 }
-EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

-/*
- * Iterate list of requests and see if we can merge this bio with any
- * of them.
- */
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
-			   struct bio *bio)
+void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
 {
-	struct request *rq;
-	int checked = 8;
-
-	list_for_each_entry_reverse(rq, list, queuelist) {
-		bool merged = false;
-
-		if (!checked--)
-			break;
-
-		if (!blk_rq_merge_ok(rq, bio))
-			continue;
-
-		switch (blk_try_merge(rq, bio)) {
-		case ELEVATOR_BACK_MERGE:
-			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_back_merge(q, rq, bio);
-			break;
-		case ELEVATOR_FRONT_MERGE:
-			if (blk_mq_sched_allow_merge(q, rq, bio))
-				merged = bio_attempt_front_merge(q, rq, bio);
-			break;
-		case ELEVATOR_DISCARD_MERGE:
-			merged = bio_attempt_discard_merge(q, rq, bio);
-			break;
-		default:
-			continue;
-		}
-
-		return merged;
-	}
-
-	return false;
-}
-EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
+	struct request_queue *q = hctx->queue;

-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
-				 struct blk_mq_hw_ctx *hctx,
-				 struct blk_mq_ctx *ctx, struct bio *bio)
-{
-	enum hctx_type type = hctx->type;
+	/* RCU or SRCU read lock is needed before checking quiesced flag */
+	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
+		return;

-	lockdep_assert_held(&ctx->lock);
+	hctx->run++;

-	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio)) {
-		ctx->rq_merged++;
-		return true;
+	/*
+	 * A return of -EAGAIN is an indication that hctx->dispatch is not
+	 * empty and we must run again in order to avoid starving flushes.
+	 */
+	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
+		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
+			blk_mq_run_hw_queue(hctx, true);
 	}
-
-	return false;
 }

-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+		unsigned int nr_segs)
 {
 	struct elevator_queue *e = q->elevator;
-	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
-	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+	struct blk_mq_ctx *ctx;
+	struct blk_mq_hw_ctx *hctx;
 	bool ret = false;
 	enum hctx_type type;

 	if (e && e->type->ops.bio_merge) {
-		blk_mq_put_ctx(ctx);
-		return e->type->ops.bio_merge(hctx, bio);
+		ret = e->type->ops.bio_merge(q, bio, nr_segs);
+		goto out_put;
 	}

+	ctx = blk_mq_get_ctx(q);
+	hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
 	type = hctx->type;
-	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
-	    !list_empty_careful(&ctx->rq_lists[type])) {
-		/* default per sw-queue merge */
-		spin_lock(&ctx->lock);
-		ret = blk_mq_attempt_merge(q, hctx, ctx, bio);
-		spin_unlock(&ctx->lock);
-	}
+	if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
+	    list_empty_careful(&ctx->rq_lists[type]))
+		goto out_put;

-	blk_mq_put_ctx(ctx);
+	/* default per sw-queue merge */
+	spin_lock(&ctx->lock);
+	/*
+	 * Reverse check our software queue for entries that we could
+	 * potentially merge with. Currently includes a hand-wavy stop
+	 * count of 8, to not spend too much time checking for merges.
+	 */
+	if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
+		ret = true;
+
+	spin_unlock(&ctx->lock);
+out_put:
 	return ret;
 }

-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+				   struct list_head *free)
 {
-	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
 }
 EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

-void blk_mq_sched_request_inserted(struct request *rq)
-{
-	trace_block_rq_insert(rq->q, rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
-
-static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
-				       bool has_sched,
-				       struct request *rq)
+static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
+					  struct blk_mq_hw_ctx *hctx,
+					  unsigned int hctx_idx)
 {
-	/* dispatch flush rq directly */
-	if (rq->rq_flags & RQF_FLUSH_SEQ) {
-		spin_lock(&hctx->lock);
-		list_add(&rq->queuelist, &hctx->dispatch);
-		spin_unlock(&hctx->lock);
-		return true;
+	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+		hctx->sched_tags = q->sched_shared_tags;
+		return 0;
 	}

-	if (has_sched)
-		rq->rq_flags |= RQF_SORTED;
+	hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
+						    q->nr_requests);

-	return false;
+	if (!hctx->sched_tags)
+		return -ENOMEM;
+	return 0;
 }

-void blk_mq_sched_insert_request(struct request *rq, bool at_head,
-				 bool run_queue, bool async)
+static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
 {
-	struct request_queue *q = rq->q;
-	struct elevator_queue *e = q->elevator;
-	struct blk_mq_ctx *ctx = rq->mq_ctx;
-	struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
-	/* flush rq in flush machinery need to be dispatched directly */
-	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
-		blk_insert_flush(rq);
-		goto run;
-	}
-
-	WARN_ON(e && (rq->tag != -1));
-
-	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
-		goto run;
-
-	if (e && e->type->ops.insert_requests) {
-		LIST_HEAD(list);
-
-		list_add(&rq->queuelist, &list);
-		e->type->ops.insert_requests(hctx, &list, at_head);
-	} else {
-		spin_lock(&ctx->lock);
-		__blk_mq_insert_request(hctx, rq, at_head);
-		spin_unlock(&ctx->lock);
-	}
-
-run:
-	if (run_queue)
-		blk_mq_run_hw_queue(hctx, async);
+	blk_mq_free_rq_map(queue->sched_shared_tags);
+	queue->sched_shared_tags = NULL;
 }

-void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
-				  struct blk_mq_ctx *ctx,
-				  struct list_head *list, bool run_queue_async)
+/* called in queue's release handler, tagset has gone away */
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
 {
-	struct elevator_queue *e;
-	struct request_queue *q = hctx->queue;
-
-	/*
-	 * blk_mq_sched_insert_requests() is called from flush plug
-	 * context only, and hold one usage counter to prevent queue
-	 * from being released.
-	 */
-	percpu_ref_get(&q->q_usage_counter);
+	struct blk_mq_hw_ctx *hctx;
+	unsigned long i;

-	e = hctx->queue->elevator;
-	if (e && e->type->ops.insert_requests)
-		e->type->ops.insert_requests(hctx, list, false);
-	else {
-		/*
-		 * try to issue requests directly if the hw queue isn't
-		 * busy in case of 'none' scheduler, and this way may save
-		 * us one extra enqueue & dequeue to sw queue.
-		 */
-		if (!hctx->dispatch_busy && !e && !run_queue_async) {
-			blk_mq_try_issue_list_directly(hctx, list);
-			if (list_empty(list))
-				goto out;
+	queue_for_each_hw_ctx(q, hctx, i) {
+		if (hctx->sched_tags) {
+			if (!blk_mq_is_shared_tags(flags))
+				blk_mq_free_rq_map(hctx->sched_tags);
+			hctx->sched_tags = NULL;
 		}
-		blk_mq_insert_requests(hctx, ctx, list);
 	}

-	blk_mq_run_hw_queue(hctx, run_queue_async);
- out:
-	percpu_ref_put(&q->q_usage_counter);
+	if (blk_mq_is_shared_tags(flags))
+		blk_mq_exit_sched_shared_tags(q);
 }

-static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
-				   struct blk_mq_hw_ctx *hctx,
-				   unsigned int hctx_idx)
+static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
 {
-	if (hctx->sched_tags) {
-		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
-		blk_mq_free_rq_map(hctx->sched_tags);
-		hctx->sched_tags = NULL;
-	}
-}
+	struct blk_mq_tag_set *set = queue->tag_set;

-static int blk_mq_sched_alloc_tags(struct request_queue *q,
-				   struct blk_mq_hw_ctx *hctx,
-				   unsigned int hctx_idx)
-{
-	struct blk_mq_tag_set *set = q->tag_set;
-	int ret;
-
-	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
-					       set->reserved_tags);
-	if (!hctx->sched_tags)
+	/*
+	 * Set initial depth at max so that we don't need to reallocate for
+	 * updating nr_requests.
+	 */
+	queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
+						BLK_MQ_NO_HCTX_IDX,
+						MAX_SCHED_RQ);
+	if (!queue->sched_shared_tags)
 		return -ENOMEM;

-	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
-	if (ret)
-		blk_mq_sched_free_tags(set, hctx, hctx_idx);
+	blk_mq_tag_update_sched_shared_tags(queue);

-	return ret;
-}
-
-/* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
-{
-	struct blk_mq_hw_ctx *hctx;
-	int i;
-
-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags) {
-			blk_mq_free_rq_map(hctx->sched_tags);
-			hctx->sched_tags = NULL;
-		}
-	}
+	return 0;
 }

+/* caller must have a reference to @e, will grab another one if successful */
 int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
 {
+	unsigned int flags = q->tag_set->flags;
 	struct blk_mq_hw_ctx *hctx;
 	struct elevator_queue *eq;
-	unsigned int i;
+	unsigned long i;
 	int ret;

-	if (!e) {
-		q->elevator = NULL;
-		q->nr_requests = q->tag_set->queue_depth;
-		return 0;
-	}
-
 	/*
 	 * Default to double of smaller one between hw queue_depth and 128,
 	 * since we don't split into sync/async like the old code did.
 	 * Additionally, this is a per-hw queue depth.
 	 */
 	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
-				   BLKDEV_MAX_RQ);
+				   BLKDEV_DEFAULT_RQ);
+
+	if (blk_mq_is_shared_tags(flags)) {
+		ret = blk_mq_init_sched_shared_tags(q);
+		if (ret)
+			return ret;
+	}

 	queue_for_each_hw_ctx(q, hctx, i) {
-		ret = blk_mq_sched_alloc_tags(q, hctx, i);
+		ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
 		if (ret)
-			goto err;
+			goto err_free_map_and_rqs;
 	}

 	ret = e->ops.init_sched(q, e);
 	if (ret)
-		goto err;
+		goto err_free_map_and_rqs;

+	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_register_sched(q);
+	mutex_unlock(&q->debugfs_mutex);

 	queue_for_each_hw_ctx(q, hctx, i) {
 		if (e->ops.init_hctx) {
 			ret = e->ops.init_hctx(hctx, i);
 			if (ret) {
 				eq = q->elevator;
-				blk_mq_sched_free_requests(q);
+				blk_mq_sched_free_rqs(q);
 				blk_mq_exit_sched(q, eq);
 				kobject_put(&eq->kobj);
 				return ret;
 			}
 		}
+		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_register_sched_hctx(q, hctx);
+		mutex_unlock(&q->debugfs_mutex);
 	}

 	return 0;

-err:
-	blk_mq_sched_free_requests(q);
-	blk_mq_sched_tags_teardown(q);
+err_free_map_and_rqs:
+	blk_mq_sched_free_rqs(q);
+	blk_mq_sched_tags_teardown(q, flags);
+	q->elevator = NULL;
 	return ret;
 }

@@ -549,34 +506,47 @@ err:
  * called in either blk_queue_cleanup or elevator_switch, tagset
  * is required for freeing requests
  */
-void blk_mq_sched_free_requests(struct request_queue *q)
+void blk_mq_sched_free_rqs(struct request_queue *q)
 {
 	struct blk_mq_hw_ctx *hctx;
-	int i;
-
-	lockdep_assert_held(&q->sysfs_lock);
+	unsigned long i;

-	queue_for_each_hw_ctx(q, hctx, i) {
-		if (hctx->sched_tags)
-			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+	if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+		blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+				BLK_MQ_NO_HCTX_IDX);
+	} else {
+		queue_for_each_hw_ctx(q, hctx, i) {
+			if (hctx->sched_tags)
+				blk_mq_free_rqs(q->tag_set,
+						hctx->sched_tags, i);
+		}
 	}
 }

 void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
 {
 	struct blk_mq_hw_ctx *hctx;
-	unsigned int i;
+	unsigned long i;
+	unsigned int flags = 0;

 	queue_for_each_hw_ctx(q, hctx, i) {
+		mutex_lock(&q->debugfs_mutex);
 		blk_mq_debugfs_unregister_sched_hctx(hctx);
+		mutex_unlock(&q->debugfs_mutex);
+
 		if (e->type->ops.exit_hctx && hctx->sched_data) {
 			e->type->ops.exit_hctx(hctx, i);
 			hctx->sched_data = NULL;
 		}
+		flags = hctx->flags;
 	}
+
+	mutex_lock(&q->debugfs_mutex);
 	blk_mq_debugfs_unregister_sched(q);
+	mutex_unlock(&q->debugfs_mutex);
+
 	if (e->type->ops.exit_sched)
 		e->type->ops.exit_sched(e);
-	blk_mq_sched_tags_teardown(q);
+	blk_mq_sched_tags_teardown(q, flags);
 	q->elevator = NULL;
 }
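The most substantial new mechanism in this diff is the per-hctx batching in __blk_mq_do_dispatch_sched(): schedulers such as bfq and mq-deadline can hand back requests that belong to different hardware queues, so the patched code sorts the collected list with list_sort(NULL, &rq_list, sched_rq_cmp) and then dispatches one same-hctx run at a time through blk_mq_dispatch_hctx_list(). The standalone C sketch below mimics that grouping in userspace; it is only an illustration (not kernel code), qsort() and plain arrays stand in for the kernel's list primitives, and the mock_* names are hypothetical stand-ins for struct request and struct blk_mq_hw_ctx.

/*
 * Illustrative userspace sketch (not part of the patch): group a mixed
 * stream of requests by hardware queue and "dispatch" one batch per queue,
 * the way the patched __blk_mq_do_dispatch_sched() does after list_sort().
 */
#include <stdio.h>
#include <stdlib.h>

struct mock_hctx { int id; };

struct mock_request {
	struct mock_hctx *hctx;		/* hardware queue the request maps to */
	int tag;
};

/* Mirror of sched_rq_cmp(): order requests by their hctx pointer. */
static int cmp_by_hctx(const void *a, const void *b)
{
	const struct mock_request *ra = *(const struct mock_request * const *)a;
	const struct mock_request *rb = *(const struct mock_request * const *)b;

	if (ra->hctx == rb->hctx)
		return 0;
	return ra->hctx > rb->hctx ? 1 : -1;
}

/* Stand-in for blk_mq_dispatch_rq_list(): hand one hctx's batch to a driver. */
static void dispatch_batch(struct mock_request * const *batch, int count)
{
	printf("hctx %d: dispatching %d request(s):", batch[0]->hctx->id, count);
	for (int i = 0; i < count; i++)
		printf(" tag=%d", batch[i]->tag);
	printf("\n");
}

int main(void)
{
	struct mock_hctx h0 = { 0 }, h1 = { 1 };
	/* Requests as an elevator might hand them out: hctxs interleaved. */
	struct mock_request pool[] = {
		{ &h0, 10 }, { &h1, 20 }, { &h0, 11 }, { &h1, 21 }, { &h0, 12 },
	};
	enum { NR = sizeof(pool) / sizeof(pool[0]) };
	struct mock_request *rqs[NR];

	for (int i = 0; i < NR; i++)
		rqs[i] = &pool[i];

	/* Equivalent of list_sort(NULL, &rq_list, sched_rq_cmp). */
	qsort(rqs, NR, sizeof(rqs[0]), cmp_by_hctx);

	/* Equivalent of looping over blk_mq_dispatch_hctx_list(). */
	for (int start = 0; start < NR; ) {
		int end = start;

		while (end < NR && rqs[end]->hctx == rqs[start]->hctx)
			end++;
		dispatch_batch(&rqs[start], end - start);
		start = end;
	}
	return 0;
}

Built with any C99 (or later) compiler, this prints two batches, three requests for one queue and two for the other, which is the per-call grouping blk_mq_dispatch_rq_list() sees after the sort; batch order follows pointer order, just as the kernel's sched_rq_cmp() orders by hctx pointer.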