about summary refs log tree commit diff stats
path: root/block/blk-core.c
diff options
context:
space:
mode:
Diffstat (limited to 'block/blk-core.c')
-rw-r--r-- block/blk-core.c 556
1 files changed, 359 insertions, 197 deletions
diff --git a/block/blk-core.c b/block/blk-core.c
index 8340f69670d8..03252af8c82c 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -20,6 +20,7 @@
#include <linux/blk-mq.h>
#include <linux/highmem.h>
#include <linux/mm.h>
+#include <linux/pagemap.h>
#include <linux/kernel_stat.h>
#include <linux/string.h>
#include <linux/init.h>
@@ -34,8 +35,12 @@
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>
+#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
+#include <linux/psi.h>
+#include <linux/sched/sysctl.h>
+#include <linux/blk-crypto.h>
#define CREATE_TRACE_POINTS
#include <trace/events/block.h>
@@ -117,9 +122,52 @@ void blk_rq_init(struct request_queue *q, struct request *rq)
rq->internal_tag = -1;
rq->start_time_ns = ktime_get_ns();
rq->part = NULL;
+ refcount_set(&rq->ref, 1);
+ blk_crypto_rq_set_defaults(rq);
}
EXPORT_SYMBOL(blk_rq_init);
+#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
+static const char *const blk_op_name[] = {
+ REQ_OP_NAME(READ),
+ REQ_OP_NAME(WRITE),
+ REQ_OP_NAME(FLUSH),
+ REQ_OP_NAME(DISCARD),
+ REQ_OP_NAME(SECURE_ERASE),
+ REQ_OP_NAME(ZONE_RESET),
+ REQ_OP_NAME(ZONE_RESET_ALL),
+ REQ_OP_NAME(ZONE_OPEN),
+ REQ_OP_NAME(ZONE_CLOSE),
+ REQ_OP_NAME(ZONE_FINISH),
+ REQ_OP_NAME(ZONE_APPEND),
+ REQ_OP_NAME(WRITE_SAME),
+ REQ_OP_NAME(WRITE_ZEROES),
+ REQ_OP_NAME(SCSI_IN),
+ REQ_OP_NAME(SCSI_OUT),
+ REQ_OP_NAME(DRV_IN),
+ REQ_OP_NAME(DRV_OUT),
+};
+#undef REQ_OP_NAME
+
+/**
+ * blk_op_str - Return the string XXX for a given REQ_OP_XXX.
+ * @op: REQ_OP_XXX.
+ *
+ * Description: Centralized block layer function to convert REQ_OP_XXX into
+ * string format. Useful for debugging and tracing a bio or request. For
+ * an invalid REQ_OP_XXX it returns the string "UNKNOWN".
+ */
+inline const char *blk_op_str(unsigned int op)
+{
+ const char *op_str = "UNKNOWN";
+
+ if (op < ARRAY_SIZE(blk_op_name) && blk_op_name[op])
+ op_str = blk_op_name[op];
+
+ return op_str;
+}
+EXPORT_SYMBOL_GPL(blk_op_str);
+
static const struct {
int errno;
const char *name;
@@ -167,18 +215,23 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-static void print_req_error(struct request *req, blk_status_t status)
+static void print_req_error(struct request *req, blk_status_t status,
+ const char *caller)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
return;
- printk_ratelimited(KERN_ERR "%s: %s error, dev %s, sector %llu flags %x\n",
- __func__, blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- (unsigned long long)blk_rq_pos(req),
- req->cmd_flags);
+ printk_ratelimited(KERN_ERR
+ "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ caller, blk_errors[idx].name,
+ req->rq_disk ? req->rq_disk->disk_name : "?",
+ blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
+ req->cmd_flags & ~REQ_OP_MASK,
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
}
static void req_bio_endio(struct request *rq, struct bio *bio,
@@ -192,6 +245,17 @@ static void req_bio_endio(struct request *rq, struct bio *bio,
bio_advance(bio, nbytes);
+ if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ */
+ if (bio->bi_iter.bi_size)
+ bio->bi_status = BLK_STS_IOERR;
+ else
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+
/* don't actually finish bio if it's part of flush sequence */
if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
bio_endio(bio);
@@ -291,18 +355,18 @@ EXPORT_SYMBOL_GPL(blk_set_queue_dying);
*/
void blk_cleanup_queue(struct request_queue *q)
{
+ WARN_ON_ONCE(blk_queue_registered(q));
+
/* mark @q DYING, no new request or merges will be allowed afterwards */
- mutex_lock(&q->sysfs_lock);
blk_set_queue_dying(q);
blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
- mutex_unlock(&q->sysfs_lock);
/*
* Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that q->request_fn() gets invoked after draining finished.
+ * prevent that blk_mq_run_hw_queues() accesses the hardware queues
+ * after draining finished.
*/
blk_freeze_queue(q);
@@ -340,12 +404,6 @@ void blk_cleanup_queue(struct request_queue *q)
}
EXPORT_SYMBOL(blk_cleanup_queue);
-struct request_queue *blk_alloc_queue(gfp_t gfp_mask)
-{
- return blk_alloc_queue_node(gfp_mask, NUMA_NO_NODE);
-}
-EXPORT_SYMBOL(blk_alloc_queue);
-
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
@@ -398,6 +456,23 @@ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
}
}
+static inline int bio_queue_enter(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ bool nowait = bio->bi_opf & REQ_NOWAIT;
+ int ret;
+
+ ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
+ if (unlikely(ret)) {
+ if (nowait && !blk_queue_dying(q))
+ bio_wouldblock_error(bio);
+ else
+ bio_io_error(bio);
+ }
+
+ return ret;
+}
+
void blk_queue_exit(struct request_queue *q)
{
percpu_ref_put(&q->q_usage_counter);
@@ -422,25 +497,19 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-/**
- * blk_alloc_queue_node - allocate a request queue
- * @gfp_mask: memory allocation flags
- * @node_id: NUMA node to allocate memory from
- */
-struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
+struct request_queue *__blk_alloc_queue(int node_id)
{
struct request_queue *q;
int ret;
q = kmem_cache_alloc_node(blk_requestq_cachep,
- gfp_mask | __GFP_ZERO, node_id);
+ GFP_KERNEL | __GFP_ZERO, node_id);
if (!q)
return NULL;
- INIT_LIST_HEAD(&q->queue_head);
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, gfp_mask);
+ q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
if (q->id < 0)
goto fail_q;
@@ -448,7 +517,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (ret)
goto fail_id;
- q->backing_dev_info = bdi_alloc_node(gfp_mask, node_id);
+ q->backing_dev_info = bdi_alloc(node_id);
if (!q->backing_dev_info)
goto fail_split;
@@ -458,7 +527,6 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
- q->backing_dev_info->name = "block";
q->node = node_id;
timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
@@ -476,6 +544,7 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
mutex_init(&q->blk_trace_mutex);
#endif
mutex_init(&q->sysfs_lock);
+ mutex_init(&q->sysfs_dir_lock);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
@@ -493,6 +562,9 @@ struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id)
if (blkcg_init_queue(q))
goto fail_ref;
+ blk_queue_dma_alignment(q, 511);
+ blk_set_default_limits(&q->limits);
+
return q;
fail_ref:
@@ -509,7 +581,22 @@ fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
-EXPORT_SYMBOL(blk_alloc_queue_node);
+
+struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id)
+{
+ struct request_queue *q;
+
+ if (WARN_ON_ONCE(!make_request))
+ return NULL;
+
+ q = __blk_alloc_queue(node_id);
+ if (!q)
+ return NULL;
+ q->make_request_fn = make_request;
+ q->nr_requests = BLKDEV_MAX_RQ;
+ return q;
+}
+EXPORT_SYMBOL(blk_alloc_queue);
bool blk_get_queue(struct request_queue *q)
{
@@ -550,15 +637,26 @@ void blk_put_request(struct request *req)
}
EXPORT_SYMBOL(blk_put_request);
-bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
+bool bio_attempt_back_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_back_merge_fn(q, req, bio))
+ if (!ll_back_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_backmerge(q, req, bio);
+ trace_block_bio_backmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -567,19 +665,22 @@ bool bio_attempt_back_merge(struct request_queue *q, struct request *req,
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
-bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
+bool bio_attempt_front_merge(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
- if (!ll_front_merge_fn(q, req, bio))
+ if (!ll_front_merge_fn(req, bio, nr_segs))
return false;
- trace_block_bio_frontmerge(q, req, bio);
+ trace_block_bio_frontmerge(req->q, req, bio);
+ rq_qos_merge(req->q, req, bio);
if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
blk_rq_set_mixed_merge(req);
@@ -590,7 +691,9 @@ bool bio_attempt_front_merge(struct request_queue *q, struct request *req,
req->__sector = bio->bi_iter.bi_sector;
req->__data_len += bio->bi_iter.bi_size;
- blk_account_io_start(req, false);
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
return true;
}
@@ -605,12 +708,14 @@ bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
blk_rq_get_max_sectors(req, blk_rq_pos(req)))
goto no_merge;
+ rq_qos_merge(q, req, bio);
+
req->biotail->bi_next = bio;
req->biotail = bio;
req->__data_len += bio->bi_iter.bi_size;
req->nr_phys_segments = segments + 1;
- blk_account_io_start(req, false);
+ blk_account_io_merge_bio(req);
return true;
no_merge:
req_set_nomerge(q, req);
@@ -621,6 +726,7 @@ no_merge:
* blk_attempt_plug_merge - try to merge with %current's plugged list
* @q: request_queue new bio is being queued at
* @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
* @same_queue_rq: pointer to &struct request that gets filled in when
* another request associated with @q is found on the plug list
* (optional, may be %NULL)
@@ -639,13 +745,13 @@ no_merge:
* Caller must ensure !blk_queue_nomerges(q) beforehand.
*/
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- struct request **same_queue_rq)
+ unsigned int nr_segs, struct request **same_queue_rq)
{
struct blk_plug *plug;
struct request *rq;
struct list_head *plug_list;
- plug = current->plug;
+ plug = blk_mq_plug(q, bio);
if (!plug)
return false;
@@ -668,10 +774,10 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
switch (blk_try_merge(rq, bio)) {
case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(q, rq, bio);
+ merged = bio_attempt_back_merge(rq, bio, nr_segs);
break;
case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(q, rq, bio);
+ merged = bio_attempt_front_merge(rq, bio, nr_segs);
break;
case ELEVATOR_DISCARD_MERGE:
merged = bio_attempt_discard_merge(q, rq, bio);
@@ -687,18 +793,6 @@ bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
return false;
}
-void blk_init_request_from_bio(struct request *req, struct bio *bio)
-{
- if (bio->bi_opf & REQ_RAHEAD)
- req->cmd_flags |= REQ_FAILFAST_MASK;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->ioprio = bio_prio(bio);
- req->write_hint = bio->bi_write_hint;
- blk_rq_bio_prep(req->q, req, bio);
-}
-EXPORT_SYMBOL_GPL(blk_init_request_from_bio);
-
static void handle_bad_sector(struct bio *bio, sector_t maxsector)
{
char b[BDEVNAME_SIZE];
@@ -809,11 +903,7 @@ static inline int blk_partition_remap(struct bio *bio)
if (unlikely(bio_check_ro(bio, p)))
goto out;
- /*
- * Zone reset does not include bi_size so bio_sectors() is always 0.
- * Include a test for the reset op code and perform the remap if needed.
- */
- if (bio_sectors(bio) || bio_op(bio) == REQ_OP_ZONE_RESET) {
+ if (bio_sectors(bio)) {
if (bio_check_eod(bio, part_nr_sects_read(p)))
goto out;
bio->bi_iter.bi_sector += p->start_sect;
@@ -827,6 +917,41 @@ out:
return ret;
}
+/*
+ * Check write append to a zoned block device.
+ */
+static inline blk_status_t blk_check_zone_append(struct request_queue *q,
+ struct bio *bio)
+{
+ sector_t pos = bio->bi_iter.bi_sector;
+ int nr_sectors = bio_sectors(bio);
+
+ /* Only applicable to zoned block devices */
+ if (!blk_queue_is_zoned(q))
+ return BLK_STS_NOTSUPP;
+
+ /* The bio sector must point to the start of a sequential zone */
+ if (pos & (blk_queue_zone_sectors(q) - 1) ||
+ !blk_queue_zone_is_seq(q, pos))
+ return BLK_STS_IOERR;
+
+ /*
+ * Not allowed to cross zone boundaries. Otherwise, the BIO will be
+ * split and could result in non-contiguous sectors being written in
+ * different zones.
+ */
+ if (nr_sectors > q->limits.chunk_sectors)
+ return BLK_STS_IOERR;
+
+ /* Make sure the BIO is small enough and will not get split */
+ if (nr_sectors > q->limits.max_zone_append_sectors)
+ return BLK_STS_IOERR;
+
+ bio->bi_opf |= REQ_NOMERGE;
+
+ return BLK_STS_OK;
+}
+
static noinline_for_stack bool
generic_make_request_checks(struct bio *bio)
{
@@ -896,10 +1021,22 @@ generic_make_request_checks(struct bio *bio)
if (!q->limits.max_write_same_sectors)
goto not_supported;
break;
+ case REQ_OP_ZONE_APPEND:
+ status = blk_check_zone_append(q, bio);
+ if (status != BLK_STS_OK)
+ goto end_io;
+ break;
case REQ_OP_ZONE_RESET:
+ case REQ_OP_ZONE_OPEN:
+ case REQ_OP_ZONE_CLOSE:
+ case REQ_OP_ZONE_FINISH:
if (!blk_queue_is_zoned(q))
goto not_supported;
break;
+ case REQ_OP_ZONE_RESET_ALL:
+ if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
+ goto not_supported;
+ break;
case REQ_OP_WRITE_ZEROES:
if (!q->limits.max_write_zeroes_sectors)
goto not_supported;
@@ -909,12 +1046,13 @@ generic_make_request_checks(struct bio *bio)
}
/*
- * Various block parts want %current->io_context and lazy ioc
- * allocation ends up trading a lot of pain for a small amount of
- * memory. Just allocate it upfront. This may fail and block
- * layer knows how to live with it.
+ * Various block parts want %current->io_context, so allocate it up
+ * front rather than dealing with lots of pain to allocate it only
+ * where needed. This may fail and the block layer knows how to live
+ * with it.
*/
- create_io_context(GFP_ATOMIC, q->node);
+ if (unlikely(!current->io_context))
+ create_task_io_context(current, GFP_ATOMIC, q->node);
if (!blkcg_bio_issue_check(q, bio))
return false;
@@ -936,29 +1074,28 @@ end_io:
return false;
}
+static blk_qc_t do_make_request(struct bio *bio)
+{
+ struct request_queue *q = bio->bi_disk->queue;
+ blk_qc_t ret = BLK_QC_T_NONE;
+
+ if (blk_crypto_bio_prep(&bio)) {
+ if (!q->make_request_fn)
+ return blk_mq_make_request(q, bio);
+ ret = q->make_request_fn(q, bio);
+ }
+ blk_queue_exit(q);
+ return ret;
+}
+
/**
- * generic_make_request - hand a buffer to its device driver for I/O
+ * generic_make_request - re-submit a bio to the block device layer for I/O
* @bio: The bio describing the location in memory and on the device.
*
- * generic_make_request() is used to make I/O requests of block
- * devices. It is passed a &struct bio, which describes the I/O that needs
- * to be done.
- *
- * generic_make_request() does not return any status. The
- * success/failure status of the request, along with notification of
- * completion, is delivered asynchronously through the bio->bi_end_io
- * function described (one day) else where.
- *
- * The caller of generic_make_request must make sure that bi_io_vec
- * are set to describe the memory buffer, and that bi_dev and bi_sector are
- * set to describe the device address, and the
- * bi_end_io and optionally bi_private are set to describe how
- * completion notification should be signaled.
- *
- * generic_make_request and the drivers it calls may use bi_next if this
- * bio happens to be merged with someone else, and may resubmit the bio to
- * a lower device by calling into generic_make_request recursively, which
- * means the bio should NOT be touched after the call to ->make_request_fn.
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
*/
blk_qc_t generic_make_request(struct bio *bio)
{
@@ -1009,18 +1146,14 @@ blk_qc_t generic_make_request(struct bio *bio)
current->bio_list = bio_list_on_stack;
do {
struct request_queue *q = bio->bi_disk->queue;
- blk_mq_req_flags_t flags = bio->bi_opf & REQ_NOWAIT ?
- BLK_MQ_REQ_NOWAIT : 0;
- if (likely(blk_queue_enter(q, flags) == 0)) {
+ if (likely(bio_queue_enter(bio) == 0)) {
struct bio_list lower, same;
/* Create a fresh bio_list for all subordinate requests */
bio_list_on_stack[1] = bio_list_on_stack[0];
bio_list_init(&bio_list_on_stack[0]);
- ret = q->make_request_fn(q, bio);
-
- blk_queue_exit(q);
+ ret = do_make_request(bio);
/* sort new bios into those for a lower level
* and those for the same level
@@ -1036,12 +1169,6 @@ blk_qc_t generic_make_request(struct bio *bio)
bio_list_merge(&bio_list_on_stack[0], &lower);
bio_list_merge(&bio_list_on_stack[0], &same);
bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- } else {
- if (unlikely(!blk_queue_dying(q) &&
- (bio->bi_opf & REQ_NOWAIT)))
- bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
}
bio = bio_list_pop(&bio_list_on_stack[0]);
} while (bio);
@@ -1058,31 +1185,25 @@ EXPORT_SYMBOL(generic_make_request);
*
* This function behaves like generic_make_request(), but does not protect
* against recursion. Must only be used if the called driver is known
- * to not call generic_make_request (or direct_make_request) again from
- * its make_request function. (Calling direct_make_request again from
- * a workqueue is perfectly fine as that doesn't recurse).
+ * to be blk-mq based.
*/
blk_qc_t direct_make_request(struct bio *bio)
{
struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
- blk_qc_t ret;
+ if (WARN_ON_ONCE(q->make_request_fn)) {
+ bio_io_error(bio);
+ return BLK_QC_T_NONE;
+ }
if (!generic_make_request_checks(bio))
return BLK_QC_T_NONE;
-
- if (unlikely(blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0))) {
- if (nowait && !blk_queue_dying(q))
- bio->bi_status = BLK_STS_AGAIN;
- else
- bio->bi_status = BLK_STS_IOERR;
- bio_endio(bio);
+ if (unlikely(bio_queue_enter(bio)))
+ return BLK_QC_T_NONE;
+ if (!blk_crypto_bio_prep(&bio)) {
+ blk_queue_exit(q);
return BLK_QC_T_NONE;
}
-
- ret = q->make_request_fn(q, bio);
- blk_queue_exit(q);
- return ret;
+ return blk_mq_make_request(q, bio);
}
EXPORT_SYMBOL_GPL(direct_make_request);
@@ -1090,13 +1211,20 @@ EXPORT_SYMBOL_GPL(direct_make_request);
* submit_bio - submit a bio to the block device layer for I/O
* @bio: The &struct bio which describes the I/O
*
- * submit_bio() is very similar in purpose to generic_make_request(), and
- * uses that function to do most of the work. Both are fairly rough
- * interfaces; @bio must be presetup and ready for I/O.
+ * submit_bio() is used to submit I/O requests to block devices. It is passed a
+ * fully set up &struct bio that describes the I/O that needs to be done. The
+ * bio will be sent to the device described by the bi_disk and bi_partno fields.
*
+ * The success/failure status of the request, along with notification of
+ * completion, is delivered asynchronously through the ->bi_end_io() callback
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
+ * been called.
*/
blk_qc_t submit_bio(struct bio *bio)
{
+ if (blkcg_punt_bio_submit(bio))
+ return BLK_QC_T_NONE;
+
/*
* If it's a regular read/write or a barrier with data attached,
* go through the normal accounting stuff before submission.
@@ -1126,13 +1254,31 @@ blk_qc_t submit_bio(struct bio *bio)
}
}
+ /*
+ * If we're reading data that is part of the userspace workingset, count
+ * submission time as memory stall. When the device is congested, or
+ * the submitting cgroup IO-throttled, submission can be a significant
+ * part of overall IO time.
+ */
+ if (unlikely(bio_op(bio) == REQ_OP_READ &&
+ bio_flagged(bio, BIO_WORKINGSET))) {
+ unsigned long pflags;
+ blk_qc_t ret;
+
+ psi_memstall_enter(&pflags);
+ ret = generic_make_request(bio);
+ psi_memstall_leave(&pflags);
+
+ return ret;
+ }
+
return generic_make_request(bio);
}
EXPORT_SYMBOL(submit_bio);
/**
* blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for new the queue limits
+ * for the new queue limits
* @q: the queue
* @rq: the request being checked
*
@@ -1163,7 +1309,7 @@ static int blk_cloned_rq_check_limits(struct request_queue *q,
* Recalculate it to check the request correctly on this queue's
* limitation.
*/
- blk_recalc_rq_segments(rq);
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
if (rq->nr_phys_segments > queue_max_segments(q)) {
printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
__func__, rq->nr_phys_segments, queue_max_segments(q));
@@ -1187,8 +1333,11 @@ blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *
should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
return BLK_STS_IOERR;
+ if (blk_crypto_insert_cloned_request(rq))
+ return BLK_STS_IOERR;
+
if (blk_queue_io_stat(q))
- blk_account_io_start(rq, true);
+ blk_account_io_start(rq);
/*
* Since we have a scheduler attached on the top device,
@@ -1240,9 +1389,24 @@ unsigned int blk_rq_err_bytes(const struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
-void blk_account_io_completion(struct request *req, unsigned int bytes)
+static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
{
- if (blk_do_io_stat(req)) {
+ unsigned long stamp;
+again:
+ stamp = READ_ONCE(part->stamp);
+ if (unlikely(stamp != now)) {
+ if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ __part_stat_add(part, io_ticks, end ? now - stamp : 1);
+ }
+ if (part->partno) {
+ part = &part_to_disk(part)->part0;
+ goto again;
+ }
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+ if (req->part && blk_do_io_stat(req)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
@@ -1260,59 +1424,68 @@ void blk_account_io_done(struct request *req, u64 now)
* normal IO on queueing nor completion. Accounting the
* containing request is enough.
*/
- if (blk_do_io_stat(req) && !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ if (req->part && blk_do_io_stat(req) &&
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
const int sgrp = op_stat_group(req_op(req));
struct hd_struct *part;
part_stat_lock();
part = req->part;
- update_io_ticks(part, jiffies);
+ update_io_ticks(part, jiffies, true);
part_stat_inc(part, ios[sgrp]);
part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_add(part, time_in_queue, nsecs_to_jiffies64(now - req->start_time_ns));
- part_dec_in_flight(req->q, part, rq_data_dir(req));
+ part_stat_unlock();
hd_struct_put(part);
- part_stat_unlock();
}
}
-void blk_account_io_start(struct request *rq, bool new_io)
+void blk_account_io_start(struct request *rq)
{
- struct hd_struct *part;
- int rw = rq_data_dir(rq);
-
if (!blk_do_io_stat(rq))
return;
+ rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
+
part_stat_lock();
+ update_io_ticks(rq->part, jiffies, false);
+ part_stat_unlock();
+}
- if (!new_io) {
- part = rq->part;
- part_stat_inc(part, merges[rw]);
- } else {
- part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
- if (!hd_struct_try_get(part)) {
- /*
- * The partition is already being removed,
- * the request will be accounted on the disk only
- *
- * We take a reference on disk->part0 although that
- * partition will never be deleted, so we can treat
- * it as any other partition.
- */
- part = &rq->rq_disk->part0;
- hd_struct_get(part);
- }
- part_inc_in_flight(rq->q, part, rw);
- rq->part = part;
- }
+unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
+ unsigned int op)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+
+ part_stat_lock();
+ update_io_ticks(part, now, false);
+ part_stat_inc(part, ios[sgrp]);
+ part_stat_add(part, sectors[sgrp], sectors);
+ part_stat_local_inc(part, in_flight[op_is_write(op)]);
+ part_stat_unlock();
- update_io_ticks(part, jiffies);
+ return now;
+}
+EXPORT_SYMBOL(disk_start_io_acct);
+
+void disk_end_io_acct(struct gendisk *disk, unsigned int op,
+ unsigned long start_time)
+{
+ struct hd_struct *part = &disk->part0;
+ const int sgrp = op_stat_group(op);
+ unsigned long now = READ_ONCE(jiffies);
+ unsigned long duration = now - start_time;
+ part_stat_lock();
+ update_io_ticks(part, now, true);
+ part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_local_dec(part, in_flight[op_is_write(op)]);
part_stat_unlock();
}
+EXPORT_SYMBOL(disk_end_io_acct);
/*
* Steal bios from a request and add them to a bio list.
@@ -1348,7 +1521,7 @@ EXPORT_SYMBOL_GPL(blk_steal_bios);
*
* This special helper function is only for request stacking drivers
* (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_end_request instead.
+ * Actual device drivers should use blk_mq_end_request instead.
*
* Passing the result of blk_rq_bytes() as @nr_bytes guarantees
* %false return from this function.
@@ -1371,9 +1544,15 @@ bool blk_update_request(struct request *req, blk_status_t error,
if (!req->bio)
return false;
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ error == BLK_STS_OK)
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
if (unlikely(error && !blk_rq_is_passthrough(req) &&
!(req->rq_flags & RQF_QUIET)))
- print_req_error(req, error);
+ print_req_error(req, error, __func__);
blk_account_io_completion(req, nr_bytes);
@@ -1432,28 +1611,13 @@ bool blk_update_request(struct request *req, blk_status_t error,
}
/* recalculate the number of segments */
- blk_recalc_rq_segments(req);
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
}
return true;
}
EXPORT_SYMBOL_GPL(blk_update_request);
-void blk_rq_bio_prep(struct request_queue *q, struct request *rq,
- struct bio *bio)
-{
- if (bio_has_data(bio))
- rq->nr_phys_segments = bio_phys_segments(q, bio);
- else if (bio_op(bio) == REQ_OP_DISCARD)
- rq->nr_phys_segments = 1;
-
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
-
- if (bio->bi_disk)
- rq->rq_disk = bio->bi_disk;
-}
-
#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
/**
* rq_flush_dcache_pages - Helper function to flush all pages in a request
@@ -1520,23 +1684,6 @@ void blk_rq_unprep_clone(struct request *rq)
}
EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-/*
- * Copy attributes of the original request to the clone request.
- * The actual data parts (e.g. ->cmd, ->sense) are not copied.
- */
-static void __blk_rq_prep_clone(struct request *dst, struct request *src)
-{
- dst->__sector = blk_rq_pos(src);
- dst->__data_len = blk_rq_bytes(src);
- if (src->rq_flags & RQF_SPECIAL_PAYLOAD) {
- dst->rq_flags |= RQF_SPECIAL_PAYLOAD;
- dst->special_vec = src->special_vec;
- }
- dst->nr_phys_segments = src->nr_phys_segments;
- dst->ioprio = src->ioprio;
- dst->extra_len = src->extra_len;
-}
-
/**
* blk_rq_prep_clone - Helper function to setup clone request
* @rq: the request to be setup
@@ -1549,8 +1696,6 @@ static void __blk_rq_prep_clone(struct request *dst, struct request *src)
*
* Description:
* Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- * The actual data parts of @rq_src (e.g. ->cmd, ->sense)
- * are not copied, and copying such parts is the caller's responsibility.
* Also, pages which the original bios are pointing to are not copied
* and the cloned bios just point same pages.
* So cloned bios must be completed before original bios, which means
@@ -1581,7 +1726,18 @@ int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
rq->bio = rq->biotail = bio;
}
- __blk_rq_prep_clone(rq, rq_src);
+ /* Copy attributes of the original request to the clone request. */
+ rq->__sector = blk_rq_pos(rq_src);
+ rq->__data_len = blk_rq_bytes(rq_src);
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ rq->special_vec = rq_src->special_vec;
+ }
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->ioprio = rq_src->ioprio;
+
+ if (rq->bio)
+ blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
return 0;
@@ -1600,12 +1756,6 @@ int kblockd_schedule_work(struct work_struct *work)
}
EXPORT_SYMBOL(kblockd_schedule_work);
-int kblockd_schedule_work_on(int cpu, struct work_struct *work)
-{
- return queue_work_on(cpu, kblockd_workqueue, work);
-}
-EXPORT_SYMBOL(kblockd_schedule_work_on);
-
int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
unsigned long delay)
{
@@ -1729,13 +1879,25 @@ void blk_finish_plug(struct blk_plug *plug)
}
EXPORT_SYMBOL(blk_finish_plug);
+void blk_io_schedule(void)
+{
+ /* Prevent hang_check timer from firing at us during very long I/O */
+ unsigned long timeout = sysctl_hung_task_timeout_secs * HZ / 2;
+
+ if (timeout)
+ io_schedule_timeout(timeout);
+ else
+ io_schedule();
+}
+EXPORT_SYMBOL_GPL(blk_io_schedule);
+
int __init blk_dev_init(void)
{
BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
- FIELD_SIZEOF(struct request, cmd_flags));
+ sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
- FIELD_SIZEOF(struct bio, bi_opf));
+ sizeof_field(struct bio, bi_opf));
/* used for unplugging and affects IO latency/throughput - HIGHPRI */
kblockd_workqueue = alloc_workqueue("kblockd",