Diffstat (limited to 'block')
-rw-r--r-- block/Kconfig | 147
-rw-r--r-- block/Kconfig.iosched | 6
-rw-r--r-- block/Makefile | 18
-rw-r--r-- block/badblocks.c | 1622
-rw-r--r-- block/bdev.c | 1148
-rw-r--r-- block/bfq-cgroup.c | 267
-rw-r--r-- block/bfq-iosched.c | 1950
-rw-r--r-- block/bfq-iosched.h | 302
-rw-r--r-- block/bfq-wf2q.c | 213
-rw-r--r-- block/bio-integrity.c | 342
-rw-r--r-- block/bio.c | 1658
-rw-r--r-- block/blk-cgroup-fc-appid.c | 57
-rw-r--r-- block/blk-cgroup-rwstat.c | 3
-rw-r--r-- block/blk-cgroup-rwstat.h | 10
-rw-r--r-- block/blk-cgroup.c | 1575
-rw-r--r-- block/blk-cgroup.h | 508
-rw-r--r-- block/blk-core.c | 1676
-rw-r--r-- block/blk-crypto-fallback.c | 165
-rw-r--r-- block/blk-crypto-internal.h | 83
-rw-r--r-- block/blk-crypto-profile.c | 559
-rw-r--r-- block/blk-crypto-sysfs.c | 173
-rw-r--r-- block/blk-crypto.c | 150
-rw-r--r-- block/blk-exec.c | 95
-rw-r--r-- block/blk-flush.c | 256
-rw-r--r-- block/blk-ia-ranges.c | 318
-rw-r--r-- block/blk-integrity.c | 190
-rw-r--r-- block/blk-ioc.c | 359
-rw-r--r-- block/blk-iocost.c | 2101
-rw-r--r-- block/blk-iolatency.c | 281
-rw-r--r-- block/blk-ioprio.c | 243
-rw-r--r-- block/blk-ioprio.h | 28
-rw-r--r-- block/blk-lib.c | 228
-rw-r--r-- block/blk-map.c | 451
-rw-r--r-- block/blk-merge.c | 733
-rw-r--r-- block/blk-mq-cpumap.c | 70
-rw-r--r-- block/blk-mq-debugfs-zoned.c | 6
-rw-r--r-- block/blk-mq-debugfs.c | 307
-rw-r--r-- block/blk-mq-debugfs.h | 12
-rw-r--r-- block/blk-mq-pci.c | 8
-rw-r--r-- block/blk-mq-rdma.c | 44
-rw-r--r-- block/blk-mq-sched.c | 607
-rw-r--r-- block/blk-mq-sched.h | 63
-rw-r--r-- block/blk-mq-sysfs.c | 161
-rw-r--r-- block/blk-mq-tag.c | 357
-rw-r--r-- block/blk-mq-tag.h | 99
-rw-r--r-- block/blk-mq-virtio.c | 8
-rw-r--r-- block/blk-mq.c | 3731
-rw-r--r-- block/blk-mq.h | 369
-rw-r--r-- block/blk-pm.c | 71
-rw-r--r-- block/blk-pm.h | 52
-rw-r--r-- block/blk-rq-qos.c | 77
-rw-r--r-- block/blk-rq-qos.h | 70
-rw-r--r-- block/blk-settings.c | 347
-rw-r--r-- block/blk-softirq.c | 156
-rw-r--r-- block/blk-stat.c | 42
-rw-r--r-- block/blk-stat.h | 2
-rw-r--r-- block/blk-sysfs.c | 836
-rw-r--r-- block/blk-throttle.c | 733
-rw-r--r-- block/blk-throttle.h | 217
-rw-r--r-- block/blk-timeout.c | 30
-rw-r--r-- block/blk-wbt.c | 218
-rw-r--r-- block/blk-wbt.h | 117
-rw-r--r-- block/blk-zoned.c | 362
-rw-r--r-- block/blk.h | 518
-rw-r--r-- block/bounce.c | 248
-rw-r--r-- block/bsg-lib.c | 133
-rw-r--r-- block/bsg.c | 491
-rw-r--r-- block/cmdline-parser.c | 255
-rw-r--r-- block/disk-events.c | 489
-rw-r--r-- block/early-lookup.c | 316
-rw-r--r-- block/elevator.c | 337
-rw-r--r-- block/elevator.h | 186
-rw-r--r-- block/fops.c | 877
-rw-r--r-- block/genhd.c | 2158
-rw-r--r-- block/holder.c | 154
-rw-r--r-- block/ioctl.c | 341
-rw-r--r-- block/ioprio.c | 97
-rw-r--r-- block/keyslot-manager.c | 396
-rw-r--r-- block/kyber-iosched.c | 75
-rw-r--r-- block/mq-deadline.c | 885
-rw-r--r-- block/opal_proto.h | 20
-rw-r--r-- block/partitions/Kconfig | 5
-rw-r--r-- block/partitions/acorn.c | 8
-rw-r--r-- block/partitions/aix.c | 20
-rw-r--r-- block/partitions/amiga.c | 111
-rw-r--r-- block/partitions/atari.c | 5
-rw-r--r-- block/partitions/check.h | 7
-rw-r--r-- block/partitions/cmdline.c | 273
-rw-r--r-- block/partitions/core.c | 550
-rw-r--r-- block/partitions/efi.c | 50
-rw-r--r-- block/partitions/efi.h | 1
-rw-r--r-- block/partitions/ibm.c | 128
-rw-r--r-- block/partitions/ldm.c | 37
-rw-r--r-- block/partitions/ldm.h | 4
-rw-r--r-- block/partitions/mac.c | 2
-rw-r--r-- block/partitions/msdos.c | 32
-rw-r--r-- block/partitions/sgi.c | 5
-rw-r--r-- block/partitions/sun.c | 5
-rw-r--r-- block/scsi_ioctl.c | 900
-rw-r--r-- block/sed-opal.c | 767
-rw-r--r-- block/t10-pi.c | 216
101 files changed, 23170 insertions, 15019 deletions
diff --git a/block/Kconfig b/block/Kconfig
index 9357d7302398..1de4682d48cc 100644
--- a/block/Kconfig
+++ b/block/Kconfig
@@ -5,8 +5,8 @@
menuconfig BLOCK
bool "Enable the block layer" if EXPERT
default y
+ select FS_IOMAP
select SBITMAP
- select SRCU
help
Provide block layer support for the kernel.
@@ -26,38 +26,34 @@ menuconfig BLOCK
if BLOCK
-config BLK_RQ_ALLOC_TIME
- bool
+config BLOCK_LEGACY_AUTOLOAD
+ bool "Legacy autoloading support"
+ default y
+ help
+ Enable loading modules and creating block device instances based on
+ accesses through their device special file. This is a historic Linux
+ feature and makes no sense in a udev world where device files are
+ created on demand, but scripts that manually create device nodes and
+ then call losetup might rely on this behavior.
-config BLK_SCSI_REQUEST
+config BLK_RQ_ALLOC_TIME
bool
config BLK_CGROUP_RWSTAT
bool
-config BLK_DEV_BSG
- bool "Block layer SG support v4"
- default y
- select BLK_SCSI_REQUEST
- help
- Saying Y here will enable generic SG (SCSI generic) v4 support
- for any block device.
-
- Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4
- can handle complicated SCSI commands: tagged variable length cdbs
- with bidirectional data transfers and generic request/response
- protocols (e.g. Task Management Functions and SMP in Serial
- Attached SCSI).
+config BLK_CGROUP_PUNT_BIO
+ bool
- This option is required by recent UDEV versions to properly
- access device serial numbers, etc.
+config BLK_DEV_BSG_COMMON
+ tristate
- If unsure, say Y.
+config BLK_ICQ
+ bool
config BLK_DEV_BSGLIB
bool "Block layer SG support v4 helper lib"
- select BLK_DEV_BSG
- select BLK_SCSI_REQUEST
+ select BLK_DEV_BSG_COMMON
help
Subsystems will normally enable this if needed. Users will not
normally need to manually enable this.
@@ -80,19 +76,41 @@ config BLK_DEV_INTEGRITY_T10
tristate
depends on BLK_DEV_INTEGRITY
select CRC_T10DIF
+ select CRC64_ROCKSOFT
+
+config BLK_DEV_WRITE_MOUNTED
+ bool "Allow writing to mounted block devices"
+ default y
+ help
+ When a block device is mounted, writing to its buffer cache is very
+ likely going to cause filesystem corruption. It is also rather easy to
+ crash the kernel in this way since the filesystem has no practical way
+ of detecting these writes to buffer cache and verifying its metadata
+ integrity. However there are some setups that need this capability
+ like running fsck on read-only mounted root device, modifying some
+ features on mounted ext4 filesystem, and similar. If you say N, the
+ kernel will prevent processes from writing to block devices that are
+ mounted by filesystems which provides some more protection from runaway
+ privileged processes and generally makes it much harder to crash
+ filesystem drivers. Note however that this does not prevent
+ underlying device(s) from being modified by other means, e.g. by
+ directly submitting SCSI commands or through access to lower layers of
+ storage stack. If in doubt, say Y. The configuration can be overridden
+ with the bdev_allow_write_mounted boot option.
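As a rough illustration of the behavior this option and the
bdev_allow_write_mounted boot option control, here is a minimal, hypothetical
userspace sketch (not part of this patch): it simply tries to open a block
device node for writing, which is the access path the help text above says is
blocked for mounted devices when the option is disabled. The device path is a
placeholder and the exact error returned is an assumption, not stated by the
Kconfig text.

	/* Hypothetical userspace sketch; /dev/sdX is a placeholder. */
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <string.h>
	#include <unistd.h>

	int main(void)
	{
		int fd = open("/dev/sdX", O_WRONLY);

		if (fd < 0) {
			/*
			 * May fail here (or on a later write) when writes to
			 * mounted block devices are disallowed.
			 */
			fprintf(stderr, "open for write failed: %s\n",
				strerror(errno));
			return 1;
		}
		/* Otherwise writes would land in the device's buffer cache. */
		close(fd);
		return 0;
	}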
config BLK_DEV_ZONED
bool "Zoned block device support"
select MQ_IOSCHED_DEADLINE
help
Block layer zoned block device support. This option enables
- support for ZAC/ZBC host-managed and host-aware zoned block devices.
+ support for ZAC/ZBC/ZNS host-managed and host-aware zoned block
+ devices.
- Say yes here if you have a ZAC or ZBC storage device.
+ Say yes here if you have a ZAC, ZBC, or ZNS storage device.
config BLK_DEV_THROTTLING
bool "Block layer bio throttling support"
- depends on BLK_CGROUP=y
+ depends on BLK_CGROUP
select BLK_CGROUP_RWSTAT
help
Block layer bio throttling support. It can be used to limit
@@ -113,16 +131,6 @@ config BLK_DEV_THROTTLING_LOW
Note, this is an experimental interface and could be changed someday.
-config BLK_CMDLINE_PARSER
- bool "Block device command line partition parser"
- help
- Enabling this option allows you to specify the partition layout from
- the kernel boot args. This is typically of use for embedded devices
- which don't otherwise have any standardized method for listing the
- partitions on a block device.
-
- See Documentation/block/cmdline-partition.rst for more information.
-
config BLK_WBT
bool "Enable support for block device writeback throttling"
help
@@ -132,9 +140,16 @@ config BLK_WBT
dynamically on an algorithm loosely based on CoDel, factoring in
the realtime performance of the disk.
+config BLK_WBT_MQ
+ bool "Enable writeback throttling by default"
+ default y
+ depends on BLK_WBT
+ help
+ Enable writeback throttling by default for request-based block devices.
+
config BLK_CGROUP_IOLATENCY
bool "Enable support for latency based cgroup IO protection"
- depends on BLK_CGROUP=y
+ depends on BLK_CGROUP
help
Enabling this option enables the .latency interface for IO throttling.
The IO controller will attempt to maintain average IO latencies below
@@ -143,10 +158,18 @@ config BLK_CGROUP_IOLATENCY
Note, this is an experimental interface and could be changed someday.
+config BLK_CGROUP_FC_APPID
+ bool "Enable support to track FC I/O Traffic across cgroup applications"
+ depends on BLK_CGROUP && NVME_FC
+ help
+ Enabling this option enables the support to track FC I/O traffic across
+ cgroup applications. It enables the Fabric and the storage targets to
+ identify, monitor, and handle FC traffic based on VM tags by inserting
+ application specific identification into the FC frame.
+
config BLK_CGROUP_IOCOST
bool "Enable support for cost model based cgroup IO controller"
- depends on BLK_CGROUP=y
- select BLK_RQ_IO_DATA_LEN
+ depends on BLK_CGROUP
select BLK_RQ_ALLOC_TIME
help
Enabling this option enables the .weight interface for cost
@@ -154,14 +177,14 @@ config BLK_CGROUP_IOCOST
distributes IO capacity between different groups based on
their share of the overall weight distribution.
-config BLK_WBT_MQ
- bool "Multiqueue writeback throttling"
- default y
- depends on BLK_WBT
+config BLK_CGROUP_IOPRIO
+ bool "Cgroup I/O controller for assigning an I/O priority class"
+ depends on BLK_CGROUP
help
- Enable writeback throttling by default on multiqueue devices.
- Multiqueue currently doesn't have support for IO scheduling,
- enabling this option is recommended.
+ Enable the .prio interface for assigning an I/O priority class to
+ requests. The I/O priority class affects the order in which an I/O
+ scheduler and block devices process requests. Only some I/O schedulers
+ and some block devices support I/O priorities.
config BLK_DEBUG_FS
bool "Block layer debugging information in debugfs"
@@ -181,6 +204,9 @@ config BLK_DEBUG_FS_ZONED
config BLK_SED_OPAL
bool "Logic for interfacing with Opal enabled SEDs"
+ depends on KEYS
+ select PSERIES_PLPKS if PPC_PSERIES
+ select PSERIES_PLPKS_SED if PPC_PSERIES
help
Builds Logic for interfacing with Opal enabled controllers.
Enabling this option enables users to setup/unlock/lock
@@ -203,35 +229,26 @@ config BLK_INLINE_ENCRYPTION_FALLBACK
by falling back to the kernel crypto API when inline
encryption hardware is not present.
-menu "Partition Types"
-
source "block/partitions/Kconfig"
-endmenu
-
-endif # BLOCK
-
-config BLOCK_COMPAT
- bool
- depends on BLOCK && COMPAT
- default y
-
config BLK_MQ_PCI
- bool
- depends on BLOCK && PCI
- default y
+ def_bool PCI
config BLK_MQ_VIRTIO
bool
- depends on BLOCK && VIRTIO
+ depends on VIRTIO
default y
-config BLK_MQ_RDMA
+config BLK_PM
+ def_bool PM
+
+# do not use in new code
+config BLOCK_HOLDER_DEPRECATED
bool
- depends on BLOCK && INFINIBAND
- default y
-config BLK_PM
- def_bool BLOCK && PM
+config BLK_MQ_STACKING
+ bool
source "block/Kconfig.iosched"
+
+endif # BLOCK
diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched
index 2f2158e05a91..27f11320b8d1 100644
--- a/block/Kconfig.iosched
+++ b/block/Kconfig.iosched
@@ -1,6 +1,4 @@
# SPDX-License-Identifier: GPL-2.0
-if BLOCK
-
menu "IO Schedulers"
config MQ_IOSCHED_DEADLINE
@@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER
config IOSCHED_BFQ
tristate "BFQ I/O scheduler"
+ select BLK_ICQ
help
BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of
of the device among all processes according to their weights,
@@ -31,6 +30,7 @@ config IOSCHED_BFQ
config BFQ_GROUP_IOSCHED
bool "BFQ hierarchical scheduling support"
depends on IOSCHED_BFQ && BLK_CGROUP
+ default y
select BLK_CGROUP_RWSTAT
help
@@ -45,5 +45,3 @@ config BFQ_CGROUP_DEBUG
files in a cgroup which can be useful for debugging.
endmenu
-
-endif
diff --git a/block/Makefile b/block/Makefile
index 78719169fb2a..46ada9dc8bbf 100644
--- a/block/Makefile
+++ b/block/Makefile
@@ -3,20 +3,22 @@
# Makefile for the kernel block layer
#
-obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \
+obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \
blk-flush.o blk-settings.o blk-ioc.o blk-map.o \
- blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \
+ blk-merge.o blk-timeout.o \
blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \
blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \
- genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o
+ genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \
+ disk-events.o blk-ia-ranges.o early-lookup.o
obj-$(CONFIG_BOUNCE) += bounce.o
-obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o
-obj-$(CONFIG_BLK_DEV_BSG) += bsg.o
+obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o
obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o
obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o
obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o
+obj-$(CONFIG_BLK_CGROUP_FC_APPID) += blk-cgroup-fc-appid.o
obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o
+obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o
obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o
obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o
obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o
@@ -24,17 +26,17 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o
bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o
obj-$(CONFIG_IOSCHED_BFQ) += bfq.o
-obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o
obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o
obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o
obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o
obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o
-obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o
obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o
obj-$(CONFIG_BLK_WBT) += blk-wbt.o
obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o
obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o
obj-$(CONFIG_BLK_PM) += blk-pm.o
-obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o
+obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \
+ blk-crypto-sysfs.o
obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o
+obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o
diff --git a/block/badblocks.c b/block/badblocks.c
index 2e5f5697db35..db4ec8b9b2a8 100644
--- a/block/badblocks.c
+++ b/block/badblocks.c
@@ -16,120 +16,830 @@
#include <linux/types.h>
#include <linux/slab.h>
-/**
- * badblocks_check() - check a given range for bad sectors
- * @bb: the badblocks structure that holds all badblock information
- * @s: sector (start) at which to check for badblocks
- * @sectors: number of sectors to check for badblocks
- * @first_bad: pointer to store location of the first badblock
- * @bad_sectors: pointer to store number of badblocks after @first_bad
+/*
+ * The purpose of badblocks set/clear is to manage bad blocks ranges which are
+ * identified by LBA addresses.
*
- * We can record which blocks on each device are 'bad' and so just
- * fail those blocks, or that stripe, rather than the whole device.
- * Entries in the bad-block table are 64bits wide. This comprises:
- * Length of bad-range, in sectors: 0-511 for lengths 1-512
- * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
- * A 'shift' can be set so that larger blocks are tracked and
- * consequently larger devices can be covered.
- * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ * When the caller of badblocks_set() wants to set a range of bad blocks, the
+ * setting range can be acked or unacked. The setting range may merge with,
+ * overwrite, or skip an overlapping already set range, depending on how the
+ * ranges overlap or are adjacent and on the acknowledgment type of the ranges.
+ * It can be more complicated when the setting range covers multiple already set
+ * bad block ranges, with the restrictions of the maximum length of each bad
+ * range and the limited bad table space.
*
- * Locking of the bad-block table uses a seqlock so badblocks_check
- * might need to retry if it is very unlucky.
- * We will sometimes want to check for bad blocks in a bi_end_io function,
- * so we use the write_seqlock_irq variant.
+ * It is difficult and unnecessary to take care of all the possible situations.
+ * When setting a large range of bad blocks, we can handle it by dividing the
+ * large range into smaller ones whenever we encounter an overlap, the maximum
+ * range length, or a full bad table. Each time only a smaller piece of the bad
+ * range is handled, with a limited number of conditions describing how it
+ * interacts with possibly overlapping or adjacent already set bad block ranges.
+ * The hard, complicated problem then becomes much simpler to handle properly.
*
- * When looking for a bad block we specify a range and want to
- * know if any block in the range is bad. So we binary-search
- * to the last range that starts at-or-before the given endpoint,
- * (or "before the sector after the target range")
- * then see if it ends after the given start.
+ * When setting a range of bad blocks in the bad table, the simplified situations
+ * to be considered are as follows. (The already set bad blocks ranges are named
+ * with prefix E, and the setting bad blocks range is named with prefix S.)
*
- * Return:
- * 0: there are no known bad blocks in the range
- * 1: there are known bad block which are all acknowledged
- * -1: there are bad blocks which have not yet been acknowledged in metadata.
- * plus the start/length of the first bad section we overlap.
+ * 1) A setting range is not overlapped or adjacent to any other already set bad
+ * block range.
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+ +-------------+
+ * | E1 | | E2 |
+ * +-------------+ +-------------+
+ * For this situation if the bad blocks table is not full, just allocate a
+ * free slot from the bad blocks table to mark the setting range S. The
+ * result is,
+ * +-------------+ +--------+ +-------------+
+ * | E1 | | S | | E2 |
+ * +-------------+ +--------+ +-------------+
+ * 2) A setting range starts exactly at a start LBA of an already set bad blocks
+ * range.
+ * 2.1) The setting range size < already set range size
+ * +--------+
+ * | S |
+ * +--------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E.
+ * An extra slot from the bad blocks table will be allocated for S, and head
+ * of E will move to end of the inserted range S. The result is,
+ * +--------+----+
+ * | S | E |
+ * +--------+----+
+ * 2.2) The setting range size == already set range size
+ * 2.2.1) If S and E are both acked or unacked range, the setting range S can
+ * be merged into existing bad range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and
+ * the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of
+ * bad blocks range E. The result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 2.3) The setting range size > already set range size
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For such a situation, the setting range S can be treated as two parts: the
+ * first part (S1) is the same size as the already set range E, and the second
+ * part (S2) is the rest of the setting range.
+ * +-------------+-----+ +-------------+ +-----+
+ * | S1 | S2 | | S1 | | S2 |
+ * +-------------+-----+ ===> +-------------+ +-----+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now we only focus on how to handle the setting range S1 and the already set
+ * range E, which was already explained in 2.2); the rest part S2 will be
+ * handled later in the next loop.
+ * 3) A setting range starts before the start LBA of an already set bad blocks
+ * range.
+ * +-------------+
+ * | S |
+ * +-------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation, the setting range S can be divided into two parts, the
+ * first (S1) ends at the start LBA of already set range E, the second part
+ * (S2) starts exactly at a start LBA of the already set range E.
+ * +----+---------+ +----+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +----+---------+ ===> +----+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now only the first part S1 should be handled in this loop, which is in a
+ * similar condition to 1). The rest part S2 has exactly the same start LBA as
+ * the already set range E, and will be handled in the next loop as one of the
+ * situations in 2).
+ * 4) A setting range starts after the start LBA of an already set bad blocks
+ * range.
+ * 4.1) If the setting range S exactly matches the tail part of already set bad
+ * blocks range E, like the following chart shows,
+ * +---------+
+ * | S |
+ * +---------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +-------------+
+ * | S |
+ * +-------------+
+ * 4.1.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is,
+ * +-------------+
+ * | E |
+ * +-------------+
+ * 4.1.3) If range E is unacked, and the setting range S is acked, then S may
+ * overwrite the overlapped range of E, the result is,
+ * +---+---------+
+ * | E | S |
+ * +---+---------+
+ * 4.2) If the setting range S stays in middle of an already set range E, like
+ * the following chart shows,
+ * +----+
+ * | S |
+ * +----+
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.1) If range S and E have same acknowledge value (both acked or unacked),
+ * they will be merged into one, the result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 4.2.2) If range E is acked and the setting range S is unacked, the setting
+ * request of S will be rejected, the result is also,
+ * +--------------+
+ * | E |
+ * +--------------+
+ * 4.2.3) If range E is unacked, and the setting range S is acked, then S will
+ * be inserted into the middle of E and split the previous range E into two
+ * parts (E1 and E2). The result is,
+ * +----+----+----+
+ * | E1 | S | E2 |
+ * +----+----+----+
+ * 4.3) If the setting bad blocks range S is overlapped with an already set bad
+ * blocks range E. The range S starts after the start LBA of range E, and
+ * ends after the end LBA of range E, as the following chart shows,
+ * +-------------------+
+ * | S |
+ * +-------------------+
+ * +-------------+
+ * | E |
+ * +-------------+
+ * For this situation the range S can be divided into two parts: the first
+ * part (S1) ends at the end of range E, and the second part (S2) is the rest
+ * of the original S.
+ * +---------+---------+ +---------+ +---------+
+ * | S1 | S2 | | S1 | | S2 |
+ * +---------+---------+ ===> +---------+ +---------+
+ * +-------------+ +-------------+
+ * | E | | E |
+ * +-------------+ +-------------+
+ * Now in this loop the setting range S1 and already set range E can be
+ * handled as the situations 4.1), the rest range S2 will be handled in next
+ * loop and ignored in this loop.
+ * 5) A setting bad blocks range S is adjacent to one or more already set bad
+ * blocks range(s), and they are all acked or unacked range.
+ * 5.1) Front merge: If the already set bad blocks range E is before setting
+ * range S and they are adjacent,
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * 5.1.1) When the total size of ranges S and E is <= BB_MAX_LEN, and their
+ * acknowledge values are the same, the setting range S can be front merged
+ * into range E. The result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting
+ * range S right after already set range E into the bad blocks table. The
+ * result is,
+ * +--------+------+
+ * | E | S |
+ * +--------+------+
+ * 6) Special cases which above conditions cannot handle
+ * 6.1) Multiple already set ranges may merge into less ones in a full bad table
+ * +-------------------------------------------------------+
+ * | S |
+ * +-------------------------------------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+ +-----+ +-----+
+ * | E1 | | E2 | | E3 |
+ * +-----+ +-----+ +-----+
+ * In the above example, when the bad blocks table is full, inserting the
+ * first part of setting range S will fail because no more available slots
+ * can be allocated from the bad blocks table. In this situation a proper
+ * setting method should go through all the setting bad blocks ranges and
+ * look for chances to merge already set ranges into fewer ones. When there
+ * is an available slot in the bad blocks table, retry to handle as many of
+ * the remaining setting bad blocks ranges as possible.
+ * +------------------------+
+ * | S3 |
+ * +------------------------+
+ * |<----- BB_MAX_LEN ----->|
+ * +-----+-----+-----+---+-----+--+
+ * | S1 | S2 |
+ * +-----+-----+-----+---+-----+--+
+ * The above chart shows that although the first part (S3) cannot be inserted
+ * due to no space in the bad blocks table, the following E1, E2 and E3 ranges
+ * can be merged with the rest of S into fewer ranges S1 and S2. Now there is
+ * 1 free slot in the bad blocks table.
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * | S3 | S1 | S2 |
+ * +------------------------+-----+-----+-----+---+-----+--+
+ * Since the bad blocks table is not full anymore, retry the original setting
+ * range S. Now the setting range S3 can be inserted into the bad blocks table
+ * using the slot previously freed by merging multiple ranges.
+ * 6.2) Front merge after overwrite
+ * In the following example, in bad blocks table, E1 is an acked bad blocks
+ * range and E2 is an unacked bad blocks range, therefore they are not able
+ * to merge into a larger range. The setting bad blocks range S is acked,
+ * therefore part of E2 can be overwritten by S.
+ * +--------+
+ * | S | acknowledged
+ * +--------+ S: 1
+ * +-------+-------------+ E1: 1
+ * | E1 | E2 | E2: 0
+ * +-------+-------------+
+ * With previous simplified routines, after overwriting part of E2 with S,
+ * the bad blocks table should be (E3 is remaining part of E2 which is not
+ * overwritten by S),
+ * acknowledged
+ * +-------+--------+----+ S: 1
+ * | E1 | S | E3 | E1: 1
+ * +-------+--------+----+ E3: 0
+ * The above result is correct but not perfect. Ranges E1 and S in the bad
+ * blocks table are both acked; merging them into one larger range may
+ * occupy less bad blocks table space and make badblocks_check() faster.
+ * Therefore in such a situation, after overwriting range S, the previous range
+ * E1 should be checked for a possible front combination. Then the ideal
+ * result can be,
+ * +----------------+----+ acknowledged
+ * | E1 | E3 | E1: 1
+ * +----------------+----+ E3: 0
+ * 6.3) Behind merge: the already set bad blocks range E is behind the setting
+ * range S and they are adjacent. Normally we don't need to care about this
+ * because the front merge handles it while going through range S from head to
+ * tail, except for the tail part of range S. When the setting range S is
+ * fully handled, none of the above simplified routines check whether the
+ * tail LBA of range S is adjacent to the next already set range, and they
+ * don't merge them even when it is possible.
+ * +------+
+ * | S |
+ * +------+
+ * +-------+
+ * | E |
+ * +-------+
+ * For the above special situation, when the setting range S is fully handled
+ * and the loop ends, an extra check is necessary to see whether the next
+ * already set range E is right after S and mergeable.
+ * 6.3.1) When the total size of ranges E and S is <= BB_MAX_LEN, and their
+ * acknowledge values are the same, the setting range S can be merged behind
+ * into range E. The result is,
+ * +--------------+
+ * | S |
+ * +--------------+
+ * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range
+ * S in front of the already set range E in the bad blocks table. The result
+ * is,
+ * +------+-------+
+ * | S | E |
+ * +------+-------+
+ *
+ * All the above 5 simplified situations and 3 special cases may cover 99%+ of
+ * the bad block range setting conditions. There may be some rare corner cases
+ * that are not considered or optimized, but it won't hurt if badblocks_set()
+ * fails due to no space, or if some ranges are not merged to save bad blocks
+ * table space.
+ *
+ * Inside badblocks_set() each loop starts by jumping to the re_insert label;
+ * at every new loop iteration prev_badblocks() is called to find an already set
+ * range which starts before or at the current setting range. Since the setting
+ * bad blocks range is handled from head to tail, in most cases it is unnecessary
+ * to do the binary search inside prev_badblocks(); it is possible to provide a
+ * hint to prev_badblocks() for a fast path, so that the expensive binary search
+ * can be avoided. In my test with the hint to prev_badblocks(), except for the
+ * first loop, all the remaining calls to prev_badblocks() can take the fast
+ * path and return the correct bad blocks table index immediately.
+ *
+ *
+ * Clearing a bad blocks range from the bad block table follows a similar idea
+ * to setting, but is much simpler. The only thing to notice is that when the
+ * clearing range hits the middle of an already set bad block range, the
+ * existing bad block range will split into two, and one more item should be
+ * added into the bad block table. The simplified situations to be considered
+ * are as follows. (The already set bad blocks ranges in the bad block table
+ * are named with prefix E, and the clearing bad blocks range is named with
+ * prefix C.)
+ *
+ * 1) A clearing range is not overlapped to any already set ranges in bad block
+ * table.
+ * +-----+ | +-----+ | +-----+
+ * | C | | | C | | | C |
+ * +-----+ or +-----+ or +-----+
+ * +---+ | +----+ +----+ | +---+
+ * | E | | | E1 | | E2 | | | E |
+ * +---+ | +----+ +----+ | +---+
+ * For the above situations, no bad block is cleared and no failure
+ * happens; simply return 0.
+ * 2) The clearing range hits the middle of an already set bad blocks range in
+ * the bad block table.
+ * +---+
+ * | C |
+ * +---+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * In this situation if the bad block table is not full, the range E will be
+ * split into two ranges E1 and E2. The result is,
+ * +------+ +------+
+ * | E1 | | E2 |
+ * +------+ +------+
+ * 3) The clearing range starts exactly at the same LBA as an already set bad
+ * block range from the bad block table.
+ * 3.1) Partially covered at head part
+ * +------------+
+ * | C |
+ * +------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation, the overlapped already set range will update the
+ * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No
+ * item deleted from bad block table. The result is,
+ * +----+
+ * | E1 |
+ * +----+
+ * 3.2) Exact fully covered
+ * +-----------------+
+ * | C |
+ * +-----------------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For this situation the whole bad blocks range E will be cleared and its
+ * corresponded item is deleted from the bad block table.
+ * 4) The clearing range exactly ends at same LBA as an already set bad block
+ * range.
+ * +-------+
+ * | C |
+ * +-------+
+ * +-----------------+
+ * | E |
+ * +-----------------+
+ * For the above situation, the already set range E is updated to shrink its
+ * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C).
+ * The result is,
+ * +---------+
+ * | E |
+ * +---------+
+ * 5) The clearing range is partially overlapped with an already set bad block
+ * range from the bad block table.
+ * 5.1) The already set bad block range is front overlapped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such a situation, the clearing range C can be treated as two parts. The
+ * first part C1 ends at the start LBA of range E, and the second part C2
+ * starts at the same start LBA as range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part C1 can be handled as condition 1), and the second part
+ * C2 can be handled as condition 3.1) in the next loop.
+ * 5.2) The already set bad block range is behind overlapped with the clearing
+ * range.
+ * +----------+
+ * | C |
+ * +----------+
+ * +------------+
+ * | E |
+ * +------------+
+ * For such a situation, the clearing range C can be treated as two parts. The
+ * first part C1 ends at the same end LBA as range E, and the second part C2
+ * starts at the end LBA of range E.
+ * +----+-----+ +----+ +-----+
+ * | C1 | C2 | | C1 | | C2 |
+ * +----+-----+ ===> +----+ +-----+
+ * +------------+ +------------+
+ * | E | | E |
+ * +------------+ +------------+
+ * Now the first part clearing range C1 can be handled as condition 4), and
+ * the second part clearing range C2 can be handled as condition 1) in next
+ * loop.
+ *
+ * All bad blocks range clearing can be simplified into the above 5 situations
+ * by only handling the head part of the clearing range in each run of the
+ * while-loop. The idea is similar to bad blocks range setting but much
+ * simpler.
*/
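To make the entry manipulation in the diagrams above concrete, the following is
a minimal, self-contained sketch, assuming the 64-bit table entry layout
described in the removed kernel-doc: a 54-bit start offset, a 9-bit length
field storing length - 1 (so lengths 1-512), and the acknowledged flag in the
most significant bit. The SK_BB_* names are hypothetical stand-ins for the
kernel's BB_MAKE()/BB_OFFSET()/BB_LEN()/BB_ACK()/BB_END() helpers, whose real
definitions live in the badblocks headers and may differ in detail.

	/* Illustrative only -- assumes the bit layout described above. */
	#include <stdint.h>
	#include <stdio.h>

	#define SK_BB_MAX_LEN	512u
	#define SK_BB_MAKE(off, len, ack) \
		(((uint64_t)(off) << 9) | ((uint64_t)(len) - 1) | \
		 ((uint64_t)!!(ack) << 63))
	#define SK_BB_OFFSET(e)	(((e) >> 9) & ((1ULL << 54) - 1))
	#define SK_BB_LEN(e)	(((e) & 0x1ffULL) + 1)
	#define SK_BB_ACK(e)	(!!((e) >> 63))
	#define SK_BB_END(e)	(SK_BB_OFFSET(e) + SK_BB_LEN(e))

	int main(void)
	{
		/* 8 bad sectors starting at LBA 4096, acknowledged. */
		uint64_t e = SK_BB_MAKE(4096, 8, 1);

		printf("start=%llu len=%llu end=%llu ack=%d\n",
		       (unsigned long long)SK_BB_OFFSET(e),
		       (unsigned long long)SK_BB_LEN(e),
		       (unsigned long long)SK_BB_END(e),
		       SK_BB_ACK(e));
		return 0;
	}

With this encoding in mind, every case in the diagrams reduces to rewriting
whole 64-bit entries with BB_MAKE() and shifting neighbouring entries with
memmove(), which is what the helpers added below do.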
-int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
- sector_t *first_bad, int *bad_sectors)
+
+/*
+ * Find the range that starts at or before 's' in the bad table. The search
+ * starts from index 'hint' and stops at index 'hint_end' of the bad
+ * table.
+ */
+static int prev_by_hint(struct badblocks *bb, sector_t s, int hint)
{
- int hi;
- int lo;
+ int hint_end = hint + 2;
u64 *p = bb->page;
- int rv;
- sector_t target = s + sectors;
- unsigned seq;
+ int ret = -1;
- if (bb->shift > 0) {
- /* round the start down, and the end up */
- s >>= bb->shift;
- target += (1<<bb->shift) - 1;
- target >>= bb->shift;
- sectors = target - s;
+ while ((hint < hint_end) && ((hint + 1) <= bb->count) &&
+ (BB_OFFSET(p[hint]) <= s)) {
+ if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) {
+ ret = hint;
+ break;
+ }
+ hint++;
+ }
+
+ return ret;
+}
+
+/*
+ * Find the range that starts at or before bad->start. If 'hint' is provided
+ * (hint >= 0) then search the bad table starting from hint first. It is
+ * very probable that the wanted bad range can be found from the hint index,
+ * so the unnecessary while-loop iteration can be avoided.
+ */
+static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad,
+ int hint)
+{
+ sector_t s = bad->start;
+ int ret = -1;
+ int lo, hi;
+ u64 *p;
+
+ if (!bb->count)
+ goto out;
+
+ if (hint >= 0) {
+ ret = prev_by_hint(bb, s, hint);
+ if (ret >= 0)
+ goto out;
}
- /* 'target' is now the first block after the bad range */
-retry:
- seq = read_seqbegin(&bb->lock);
lo = 0;
- rv = 0;
hi = bb->count;
+ p = bb->page;
- /* Binary search between lo and hi for 'target'
- * i.e. for the last range that starts before 'target'
- */
- /* INVARIANT: ranges before 'lo' and at-or-after 'hi'
- * are known not to be the last range before target.
- * VARIANT: hi-lo is the number of possible
- * ranges, and decreases until it reaches 1
- */
+ /* The following bisect search might be unnecessary */
+ if (BB_OFFSET(p[lo]) > s)
+ return -1;
+ if (BB_OFFSET(p[hi - 1]) <= s)
+ return hi - 1;
+
+ /* Do bisect search in bad table */
while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
+ int mid = (lo + hi)/2;
sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- /* This could still be the one, earlier ranges
- * could not.
- */
+ if (a == s) {
+ ret = mid;
+ goto out;
+ }
+
+ if (a < s)
lo = mid;
else
- /* This and later ranges are definitely out. */
hi = mid;
}
- /* 'lo' might be the last that started before target, but 'hi' isn't */
- if (hi > lo) {
- /* need to check all range that end after 's' to see if
- * any are unacknowledged.
+
+ if (BB_OFFSET(p[lo]) <= s)
+ ret = lo;
+out:
+ return ret;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be backward merged
+ * with the bad range (from the bad table) index by 'behind'.
+ */
+static bool can_merge_behind(struct badblocks *bb,
+ struct badblocks_context *bad, int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if ((s < BB_OFFSET(p[behind])) &&
+ ((s + sectors) >= BB_OFFSET(p[behind])) &&
+ ((BB_END(p[behind]) - s) <= BB_MAX_LEN) &&
+ BB_ACK(p[behind]) == bad->ack)
+ return true;
+ return false;
+}
+
+/*
+ * Do backward merge for range indicated by 'bad' and the bad range
+ * (from the bad table) indexed by 'behind'. The return value is merged
+ * sectors from bad->len.
+ */
+static int behind_merge(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s >= BB_OFFSET(p[behind]));
+ WARN_ON((s + sectors) < BB_OFFSET(p[behind]));
+
+ if (s < BB_OFFSET(p[behind])) {
+ merged = BB_OFFSET(p[behind]) - s;
+ p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack);
+
+ WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN);
+ }
+
+ return merged;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can be forward
+ * merged with the bad range (from the bad table) indexed by 'prev'.
+ */
+static bool can_merge_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+
+ if (BB_ACK(p[prev]) == bad->ack &&
+ (s < BB_END(p[prev]) ||
+ (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN))))
+ return true;
+ return false;
+}
+
+/*
+ * Do forward merge for range indicated by 'bad' and the bad range
+ * (from bad table) indexed by 'prev'. The return value is sectors
+ * merged from bad->len.
+ */
+static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad)
+{
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int merged = 0;
+
+ WARN_ON(s > BB_END(p[prev]));
+
+ if (s < BB_END(p[prev])) {
+ merged = min_t(sector_t, sectors, BB_END(p[prev]) - s);
+ } else {
+ merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev]));
+ if ((prev + 1) < bb->count &&
+ merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) {
+ merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]);
+ }
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + merged, bad->ack);
+ }
+
+ return merged;
+}
+
+/*
+ * 'Combine' is a special case which can_merge_front() is not able to
+ * handle: If a bad range (indexed by 'prev' from bad table) exactly
+ * starts as bad->start, and the bad range ahead of 'prev' (indexed by
+ * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and
+ * the sum of their lengths does not exceed BB_MAX_LEN limitation, then
+ * these two bad range (from bad table) can be combined.
+ *
+ * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad
+ * table can be combined.
+ */
+static bool can_combine_front(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if ((prev > 0) &&
+ (BB_OFFSET(p[prev]) == bad->start) &&
+ (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) &&
+ (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) &&
+ (BB_ACK(p[prev - 1]) == BB_ACK(p[prev])))
+ return true;
+ return false;
+}
+
+/*
+ * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad
+ * table) into one larger bad range, and the new range is indexed by
+ * 'prev - 1'.
+ * The caller of front_combine() will decrease bb->count, therefore
+ * it is unnecessary to clear p[prev] after front merge.
+ */
+static void front_combine(struct badblocks *bb, int prev)
+{
+ u64 *p = bb->page;
+
+ p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]),
+ BB_LEN(p[prev - 1]) + BB_LEN(p[prev]),
+ BB_ACK(p[prev]));
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8);
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly forward
+ * overlapped with the bad range (from bad table) indexed by 'front'.
+ * Exactly forward overlap means the bad range (from bad table) indexed
+ * by 'front' does not cover the whole range indicated by 'bad'.
+ */
+static bool overlap_front(struct badblocks *bb, int front,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+
+ if (bad->start >= BB_OFFSET(p[front]) &&
+ bad->start < BB_END(p[front]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' is exactly backward
+ * overlapped with the bad range (from bad table) indexed by 'behind'.
+ */
+static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad,
+ int behind)
+{
+ u64 *p = bb->page;
+
+ if (bad->start < BB_OFFSET(p[behind]) &&
+ (bad->start + bad->len) > BB_OFFSET(p[behind]))
+ return true;
+ return false;
+}
+
+/*
+ * Return 'true' if the range indicated by 'bad' can overwrite the bad
+ * range (from bad table) indexed by 'prev'.
+ *
+ * The range indicated by 'bad' can overwrite the bad range indexed by
+ * 'prev' when,
+ * 1) The whole range indicated by 'bad' can cover partial or whole bad
+ * range (from bad table) indexed by 'prev'.
+ * 2) The ack value of 'bad' is larger or equal to the ack value of bad
+ * range 'prev'.
+ *
+ * If the overwriting doesn't cover the whole bad range (from bad table)
+ * indexed by 'prev', new range might be split from existing bad range,
+ * 1) The overwrite covers head or tail part of existing bad range, 1
+ * extra bad range will be split and added into the bad table.
+ * 2) The overwrite covers middle of existing bad range, 2 extra bad
+ * ranges will be split (ahead and after the overwritten range) and
+ * added into the bad table.
+ * The number of extra split ranges of the overwriting is stored in
+ * 'extra' and returned for the caller.
+ */
+static bool can_front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *extra)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(!overlap_front(bb, prev, bad));
+
+ if (BB_ACK(p[prev]) >= bad->ack)
+ return false;
+
+ if (BB_END(p[prev]) <= (bad->start + bad->len)) {
+ len = BB_END(p[prev]) - bad->start;
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 0;
+ else
+ *extra = 1;
+
+ bad->len = len;
+ } else {
+ if (BB_OFFSET(p[prev]) == bad->start)
+ *extra = 1;
+ else
+ /*
+ * prev range will be split into two; besides the overwritten
+ * one, an extra slot is needed from the bad table.
*/
- while (lo >= 0 &&
- BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) {
- if (BB_OFFSET(p[lo]) < target) {
- /* starts before the end, and finishes after
- * the start, so they must overlap
- */
- if (rv != -1 && BB_ACK(p[lo]))
- rv = 1;
- else
- rv = -1;
- *first_bad = BB_OFFSET(p[lo]);
- *bad_sectors = BB_LEN(p[lo]);
- }
- lo--;
+ *extra = 2;
+ }
+
+ if ((bb->count + (*extra)) >= MAX_BADBLOCKS)
+ return false;
+
+ return true;
+}
+
+/*
+ * Do the overwrite from the range indicated by 'bad' to the bad range
+ * (from bad table) indexed by 'prev'.
+ * The previously called can_front_overwrite() will provide how many
+ * extra bad range(s) might be split and added into the bad table. All
+ * the splitting cases in the bad table will be handled here.
+ */
+static int front_overwrite(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int extra)
+{
+ u64 *p = bb->page;
+ sector_t orig_end = BB_END(p[prev]);
+ int orig_ack = BB_ACK(p[prev]);
+
+ switch (extra) {
+ case 0:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]),
+ bad->ack);
+ break;
+ case 1:
+ if (BB_OFFSET(p[prev]) == bad->start) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->len, bad->ack);
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start + bad->len,
+ orig_end - BB_END(p[prev]),
+ orig_ack);
+ } else {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev +2 -> prev + 1 + 1, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 1: one more slot for extra being 1.
+ */
+ memmove(p + prev + 2, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
}
+ break;
+ case 2:
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ bad->start - BB_OFFSET(p[prev]),
+ orig_ack);
+ /*
+ * prev + 3 -> prev + 1 + 2, which is for,
+ * 1) prev + 1: the slot index of the previous one
+ * 2) + 2: two more slots for extra being 2.
+ */
+ memmove(p + prev + 3, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack);
+ p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]),
+ orig_end - BB_END(p[prev + 1]),
+ orig_ack);
+ break;
+ default:
+ break;
}
- if (read_seqretry(&bb->lock, seq))
- goto retry;
+ return bad->len;
+}
- return rv;
+/*
+ * Explicitly insert a range indicated by 'bad' to the bad table, where
+ * the location is indexed by 'at'.
+ */
+static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ int len;
+
+ WARN_ON(badblocks_full(bb));
+
+ len = min_t(sector_t, bad->len, BB_MAX_LEN);
+ if (at < bb->count)
+ memmove(p + at + 1, p + at, (bb->count - at) * 8);
+ p[at] = BB_MAKE(bad->start, len, bad->ack);
+
+ return len;
}
-EXPORT_SYMBOL_GPL(badblocks_check);
static void badblocks_update_acked(struct badblocks *bb)
{
+ bool unacked = false;
u64 *p = bb->page;
int i;
- bool unacked = false;
if (!bb->unacked_exist)
return;
@@ -145,282 +855,602 @@ static void badblocks_update_acked(struct badblocks *bb)
bb->unacked_exist = 0;
}
-/**
- * badblocks_set() - Add a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- * @acknowledged: weather to mark the bad sectors as acknowledged
- *
- * This might extend the table, or might contract it if two adjacent ranges
- * can be merged. We binary-search to find the 'insertion' point, then
- * decide how best to handle it.
- *
- * Return:
- * 0: success
- * 1: failed to set badblocks (out of space)
- */
-int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
- int acknowledged)
+/* Do exact work to set bad block range into the bad block table */
+static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
{
- u64 *p;
- int lo, hi;
- int rv = 0;
+ int retried = 0, space_desired = 0;
+ int orig_len, len = 0, added = 0;
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ sector_t orig_start;
unsigned long flags;
+ int rv = 0;
+ u64 *p;
if (bb->shift < 0)
/* badblocks are disabled */
return 1;
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
if (bb->shift) {
/* round the start down, and the end up */
sector_t next = s + sectors;
- s >>= bb->shift;
- next += (1<<bb->shift) - 1;
- next >>= bb->shift;
+ rounddown(s, bb->shift);
+ roundup(next, bb->shift);
sectors = next - s;
}
write_seqlock_irqsave(&bb->lock, flags);
+ orig_start = s;
+ orig_len = sectors;
+ bad.ack = acknowledged;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts at-or-before 's' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a <= s)
- lo = mid;
- else
- hi = mid;
+re_insert:
+ bad.start = s;
+ bad.len = sectors;
+ len = 0;
+
+ if (badblocks_empty(bb)) {
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ goto update_sectors;
}
- if (hi > lo && BB_OFFSET(p[lo]) > s)
- hi = lo;
- if (hi > lo) {
- /* we found a range that might merge with the start
- * of our new range
- */
- sector_t a = BB_OFFSET(p[lo]);
- sector_t e = a + BB_LEN(p[lo]);
- int ack = BB_ACK(p[lo]);
-
- if (e >= s) {
- /* Yes, we can merge with a previous range */
- if (s == a && s + sectors >= e)
- /* new range covers old */
- ack = acknowledged;
- else
- ack = ack && acknowledged;
-
- if (e < s + sectors)
- e = s + sectors;
- if (e - a <= BB_MAX_LEN) {
- p[lo] = BB_MAKE(a, e-a, ack);
- s = e;
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start before all badblocks */
+ if (prev < 0) {
+ if (!badblocks_full(bb)) {
+ /* insert on the first */
+ if (bad.len > (BB_OFFSET(p[0]) - bad.start))
+ bad.len = BB_OFFSET(p[0]) - bad.start;
+ len = insert_at(bb, 0, &bad);
+ bb->count++;
+ added++;
+ hint = 0;
+ goto update_sectors;
+ }
+
+ /* No space, try to merge */
+ if (overlap_behind(bb, &bad, 0)) {
+ if (can_merge_behind(bb, &bad, 0)) {
+ len = behind_merge(bb, &bad, 0);
+ added++;
} else {
- /* does not all fit in one range,
- * make p[lo] maximal
- */
- if (BB_LEN(p[lo]) != BB_MAX_LEN)
- p[lo] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+ len = BB_OFFSET(p[0]) - s;
+ space_desired = 1;
}
- sectors = e - s;
+ hint = 0;
+ goto update_sectors;
}
+
+ /* no table space and give up */
+ goto out;
}
- if (sectors && hi < bb->count) {
- /* 'hi' points to the first range that starts after 's'.
- * Maybe we can merge with the start of that range
- */
- sector_t a = BB_OFFSET(p[hi]);
- sector_t e = a + BB_LEN(p[hi]);
- int ack = BB_ACK(p[hi]);
-
- if (a <= s + sectors) {
- /* merging is possible */
- if (e <= s + sectors) {
- /* full overlap */
- e = s + sectors;
- ack = acknowledged;
- } else
- ack = ack && acknowledged;
-
- a = s;
- if (e - a <= BB_MAX_LEN) {
- p[hi] = BB_MAKE(a, e-a, ack);
- s = e;
- } else {
- p[hi] = BB_MAKE(a, BB_MAX_LEN, ack);
- s = a + BB_MAX_LEN;
+
+ /* in case p[prev-1] can be merged with p[prev] */
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
+ added++;
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ } else {
+ int extra = 0;
+
+ if (!can_front_overwrite(bb, prev, &bad, &extra)) {
+ len = min_t(sector_t,
+ BB_END(p[prev]) - s, sectors);
+ hint = prev;
+ goto update_sectors;
+ }
+
+ len = front_overwrite(bb, prev, &bad, extra);
+ added++;
+ bb->count += extra;
+
+ if (can_combine_front(bb, prev, &bad)) {
+ front_combine(bb, prev);
+ bb->count--;
}
- sectors = e - s;
- lo = hi;
- hi++;
}
+ hint = prev;
+ goto update_sectors;
+ }
+
+ if (can_merge_front(bb, prev, &bad)) {
+ len = front_merge(bb, prev, &bad);
+ added++;
+ hint = prev;
+ goto update_sectors;
}
- if (sectors == 0 && hi < bb->count) {
- /* we might be able to combine lo and hi */
- /* Note: 's' is at the end of 'lo' */
- sector_t a = BB_OFFSET(p[hi]);
- int lolen = BB_LEN(p[lo]);
- int hilen = BB_LEN(p[hi]);
- int newlen = lolen + hilen - (s - a);
-
- if (s >= a && newlen < BB_MAX_LEN) {
- /* yes, we can combine them */
- int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]);
-
- p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack);
- memmove(p + hi, p + hi + 1,
- (bb->count - hi - 1) * 8);
- bb->count--;
+
+ /* if no space in table, still try to merge in the covered range */
+ if (badblocks_full(bb)) {
+ /* skip the cannot-merge range */
+ if (((prev + 1) < bb->count) &&
+ overlap_behind(bb, &bad, prev + 1) &&
+ ((s + sectors) >= BB_END(p[prev + 1]))) {
+ len = BB_END(p[prev + 1]) - s;
+ hint = prev + 1;
+ goto update_sectors;
}
+
+ /* no retry any more */
+ len = sectors;
+ space_desired = 1;
+ hint = -1;
+ goto update_sectors;
}
- while (sectors) {
- /* didn't merge (it all).
- * Need to add a range just before 'hi'
- */
- if (bb->count >= MAX_BADBLOCKS) {
- /* No room for more */
- rv = 1;
- break;
- } else {
- int this_sectors = sectors;
- memmove(p + hi + 1, p + hi,
- (bb->count - hi) * 8);
- bb->count++;
+ /* cannot merge and there is space in bad table */
+ if ((prev + 1) < bb->count &&
+ overlap_behind(bb, &bad, prev + 1))
+ bad.len = min_t(sector_t,
+ bad.len, BB_OFFSET(p[prev + 1]) - bad.start);
- if (this_sectors > BB_MAX_LEN)
- this_sectors = BB_MAX_LEN;
- p[hi] = BB_MAKE(s, this_sectors, acknowledged);
- sectors -= this_sectors;
- s += this_sectors;
- }
+ len = insert_at(bb, prev + 1, &bad);
+ bb->count++;
+ added++;
+ hint = prev + 1;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_insert;
+
+ WARN_ON(sectors < 0);
+
+ /*
+ * Check whether the following already set range can be
+ * merged. (prev < 0) condition is not handled here,
+ * because it's already complicated enough.
+ */
+ if (prev >= 0 &&
+ (prev + 1) < bb->count &&
+ BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) &&
+ (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN &&
+ BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ BB_LEN(p[prev]) + BB_LEN(p[prev + 1]),
+ BB_ACK(p[prev]));
+
+ if ((prev + 2) < bb->count)
+ memmove(p + prev + 1, p + prev + 2,
+ (bb->count - (prev + 2)) * 8);
+ bb->count--;
+ }
+
+ if (space_desired && !badblocks_full(bb)) {
+ s = orig_start;
+ sectors = orig_len;
+ space_desired = 0;
+ if (retried++ < 3)
+ goto re_insert;
+ }
+
+out:
+ if (added) {
+ set_changed(bb);
+
+ if (!acknowledged)
+ bb->unacked_exist = 1;
+ else
+ badblocks_update_acked(bb);
}
- bb->changed = 1;
- if (!acknowledged)
- bb->unacked_exist = 1;
- else
- badblocks_update_acked(bb);
write_sequnlock_irqrestore(&bb->lock, flags);
+ if (!added)
+ rv = 1;
+
return rv;
}
-EXPORT_SYMBOL_GPL(badblocks_set);
-/**
- * badblocks_clear() - Remove a range of bad blocks to the table.
- * @bb: the badblocks structure that holds all badblock information
- * @s: first sector to mark as bad
- * @sectors: number of sectors to mark as bad
- *
- * This may involve extending the table if we spilt a region,
- * but it must not fail. So if the table becomes full, we just
- * drop the remove request.
- *
- * Return:
- * 0: success
- * 1: failed to clear badblocks
+/*
+ * Clear the bad block range from bad block table which is front overlapped
+ * with the clearing range. The return value is how many sectors from an
+ * already set bad block range are cleared. If the whole bad block range is
+ * covered by the clearing range and fully cleared, 'deleted' is set to 1 for
+ * the caller to reduce bb->count.
*/
-int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+static int front_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad, int *deleted)
{
- u64 *p;
- int lo, hi;
- sector_t target = s + sectors;
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+ u64 *p = bb->page;
+ int cleared = 0;
+
+ *deleted = 0;
+ if (s == BB_OFFSET(p[prev])) {
+ if (BB_LEN(p[prev]) > sectors) {
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors,
+ BB_LEN(p[prev]) - sectors,
+ BB_ACK(p[prev]));
+ cleared = sectors;
+ } else {
+ /* BB_LEN(p[prev]) <= sectors */
+ cleared = BB_LEN(p[prev]);
+ if ((prev + 1) < bb->count)
+ memmove(p + prev, p + prev + 1,
+ (bb->count - prev - 1) * 8);
+ *deleted = 1;
+ }
+ } else if (s > BB_OFFSET(p[prev])) {
+ if (BB_END(p[prev]) <= (s + sectors)) {
+ cleared = BB_END(p[prev]) - s;
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ BB_ACK(p[prev]));
+ } else {
+ /* Splitting is handled in front_splitting_clear() */
+ BUG();
+ }
+ }
+
+ return cleared;
+}
+
+/*
+ * Handle the condition that the clearing range hits middle of an already set
+ * bad block range from bad block table. In this condition the existing bad
+ * block range is split into two after the middle part is cleared.
+ */
+static int front_splitting_clear(struct badblocks *bb, int prev,
+ struct badblocks_context *bad)
+{
+ u64 *p = bb->page;
+ u64 end = BB_END(p[prev]);
+ int ack = BB_ACK(p[prev]);
+ sector_t sectors = bad->len;
+ sector_t s = bad->start;
+
+ p[prev] = BB_MAKE(BB_OFFSET(p[prev]),
+ s - BB_OFFSET(p[prev]),
+ ack);
+ memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8);
+ p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack);
+ return sectors;
+}
+
+/* Do the exact work to clear bad block range from the bad block table */
+static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ struct badblocks_context bad;
+ int prev = -1, hint = -1;
+ int len = 0, cleared = 0;
int rv = 0;
+ u64 *p;
+
+ if (bb->shift < 0)
+ /* badblocks are disabled */
+ return 1;
+
+ if (sectors == 0)
+ /* Invalid sectors number */
+ return 1;
+
+ if (bb->shift) {
+ sector_t target;
- if (bb->shift > 0) {
/* When clearing we round the start up and the end down.
* This should not matter as the shift should align with
* the block size and no rounding should ever be needed.
		 * However it is better to think a block is bad when it
* isn't than to think a block is not bad when it is.
*/
- s += (1<<bb->shift) - 1;
- s >>= bb->shift;
- target >>= bb->shift;
+ target = s + sectors;
+		s = roundup(s, 1 << bb->shift);
+		target = rounddown(target, 1 << bb->shift);
sectors = target - s;
}
write_seqlock_irq(&bb->lock);
+ bad.ack = true;
p = bb->page;
- lo = 0;
- hi = bb->count;
- /* Find the last range that starts before 'target' */
- while (hi - lo > 1) {
- int mid = (lo + hi) / 2;
- sector_t a = BB_OFFSET(p[mid]);
- if (a < target)
- lo = mid;
- else
- hi = mid;
+re_clear:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
}
- if (hi > lo) {
- /* p[lo] is the last range that could overlap the
- * current range. Earlier ranges could also overlap,
- * but only this one can overlap the end of the range.
- */
- if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* Partial overlap, leave the tail of this range */
- int ack = BB_ACK(p[lo]);
- sector_t a = BB_OFFSET(p[lo]);
- sector_t end = a + BB_LEN(p[lo]);
-
- if (a < s) {
- /* we need to split this range */
- if (bb->count >= MAX_BADBLOCKS) {
- rv = -ENOSPC;
- goto out;
- }
- memmove(p+lo+1, p+lo, (bb->count - lo) * 8);
- bb->count++;
- p[lo] = BB_MAKE(a, s-a, ack);
- lo++;
- }
- p[lo] = BB_MAKE(target, end - target, ack);
- /* there is no longer an overlap */
- hi = lo;
- lo--;
- }
- while (lo >= 0 &&
- (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) &&
- (BB_OFFSET(p[lo]) < target)) {
- /* This range does overlap */
- if (BB_OFFSET(p[lo]) < s) {
- /* Keep the early parts of this range. */
- int ack = BB_ACK(p[lo]);
- sector_t start = BB_OFFSET(p[lo]);
-
- p[lo] = BB_MAKE(start, s - start, ack);
- /* now low doesn't overlap, so.. */
- break;
- }
- lo--;
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* Start before all badblocks */
+ if (prev < 0) {
+ if (overlap_behind(bb, &bad, 0)) {
+ len = BB_OFFSET(p[0]) - s;
+ hint = 0;
+ } else {
+ len = sectors;
}
- /* 'lo' is strictly before, 'hi' is strictly after,
- * anything between needs to be discarded
+ /*
+		 * Both situations only clear a non-bad range and
+		 * should be treated as successful
*/
- if (hi - lo > 1) {
- memmove(p+lo+1, p+hi, (bb->count - hi) * 8);
- bb->count -= (hi - lo - 1);
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Start after all badblocks */
+ if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ cleared++;
+ goto update_sectors;
+ }
+
+ /* Clear will split a bad record but the table is full */
+ if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + sectors))) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ if (overlap_front(bb, prev, &bad)) {
+ if ((BB_OFFSET(p[prev]) < bad.start) &&
+ (BB_END(p[prev]) > (bad.start + bad.len))) {
+ /* Splitting */
+ if ((bb->count + 1) < MAX_BADBLOCKS) {
+ len = front_splitting_clear(bb, prev, &bad);
+ bb->count += 1;
+ cleared++;
+ } else {
+ /* No space to split, give up */
+ len = sectors;
+ }
+ } else {
+ int deleted = 0;
+
+ len = front_clear(bb, prev, &bad, &deleted);
+ bb->count -= deleted;
+ cleared++;
+ hint = prev;
}
+
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+		/* Clearing a non-bad range should be treated as successful */
+ cleared++;
+ goto update_sectors;
+ }
+
+	/* Does not cover any bad block range in the table */
+	len = sectors;
+	/* Clearing a non-bad range should be treated as successful */
+ cleared++;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_clear;
+
+ WARN_ON(sectors < 0);
+
+ if (cleared) {
+ badblocks_update_acked(bb);
+ set_changed(bb);
}
- badblocks_update_acked(bb);
- bb->changed = 1;
-out:
write_sequnlock_irq(&bb->lock);
+
+ if (!cleared)
+ rv = 1;
+
+ return rv;
+}
+
+/* Do the exact work to check bad blocks range from the bad block table */
+static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ int unacked_badblocks, acked_badblocks;
+ int prev = -1, hint = -1, set = 0;
+ struct badblocks_context bad;
+ unsigned int seq;
+ int len, rv;
+ u64 *p;
+
+ WARN_ON(bb->shift < 0 || sectors == 0);
+
+ if (bb->shift > 0) {
+ sector_t target;
+
+ /* round the start down, and the end up */
+ target = s + sectors;
+		s = rounddown(s, 1 << bb->shift);
+		target = roundup(target, 1 << bb->shift);
+ sectors = target - s;
+ }
+
+retry:
+ seq = read_seqbegin(&bb->lock);
+
+ p = bb->page;
+ unacked_badblocks = 0;
+ acked_badblocks = 0;
+
+re_check:
+ bad.start = s;
+ bad.len = sectors;
+
+ if (badblocks_empty(bb)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ prev = prev_badblocks(bb, &bad, hint);
+
+ /* start after all badblocks */
+ if ((prev >= 0) &&
+ ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) {
+ len = sectors;
+ goto update_sectors;
+ }
+
+ /* Overlapped with front badblocks record */
+ if ((prev >= 0) && overlap_front(bb, prev, &bad)) {
+ if (BB_ACK(p[prev]))
+ acked_badblocks++;
+ else
+ unacked_badblocks++;
+
+ if (BB_END(p[prev]) >= (s + sectors))
+ len = sectors;
+ else
+ len = BB_END(p[prev]) - s;
+
+ if (set == 0) {
+ *first_bad = BB_OFFSET(p[prev]);
+ *bad_sectors = BB_LEN(p[prev]);
+ set = 1;
+ }
+ goto update_sectors;
+ }
+
+ /* Not front overlap, but behind overlap */
+ if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) {
+ len = BB_OFFSET(p[prev + 1]) - bad.start;
+ hint = prev + 1;
+ goto update_sectors;
+ }
+
+	/* does not cover any bad block range in the table */
+ len = sectors;
+
+update_sectors:
+ s += len;
+ sectors -= len;
+
+ if (sectors > 0)
+ goto re_check;
+
+ WARN_ON(sectors < 0);
+
+ if (unacked_badblocks > 0)
+ rv = -1;
+ else if (acked_badblocks > 0)
+ rv = 1;
+ else
+ rv = 0;
+
+ if (read_seqretry(&bb->lock, seq))
+ goto retry;
+
return rv;
}
+
+/**
+ * badblocks_check() - check a given range for bad sectors
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: sector (start) at which to check for badblocks
+ * @sectors: number of sectors to check for badblocks
+ * @first_bad: pointer to store location of the first badblock
+ * @bad_sectors: pointer to store number of badblocks after @first_bad
+ *
+ * We can record which blocks on each device are 'bad' and so just
+ * fail those blocks, or that stripe, rather than the whole device.
+ * Entries in the bad-block table are 64 bits wide. Each comprises:
+ * Length of bad-range, in sectors: 0-511 for lengths 1-512
+ * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes)
+ * A 'shift' can be set so that larger blocks are tracked and
+ * consequently larger devices can be covered.
+ * 'Acknowledged' flag - 1 bit. - the most significant bit.
+ *
+ * Locking of the bad-block table uses a seqlock so badblocks_check
+ * might need to retry if it is very unlucky.
+ * We will sometimes want to check for bad blocks in a bi_end_io function,
+ * so we use the write_seqlock_irq variant.
+ *
+ * When looking for a bad block we specify a range and want to
+ * know if any block in the range is bad. So we binary-search
+ * to the last range that starts at-or-before the given endpoint,
+ * (or "before the sector after the target range")
+ * then see if it ends after the given start.
+ *
+ * Return:
+ * 0: there are no known bad blocks in the range
+ * 1: there are known bad block which are all acknowledged
+ * -1: there are bad blocks which have not yet been acknowledged in metadata.
+ * plus the start/length of the first bad section we overlap.
+ */
+int badblocks_check(struct badblocks *bb, sector_t s, int sectors,
+ sector_t *first_bad, int *bad_sectors)
+{
+ return _badblocks_check(bb, s, sectors, first_bad, bad_sectors);
+}
+EXPORT_SYMBOL_GPL(badblocks_check);
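A minimal caller-side sketch of the interface exported above, assuming a hypothetical driver that keeps a struct badblocks in its per-device state and wants to refuse I/O that touches unacknowledged bad sectors (the ex_ names are illustrative, not existing kernel symbols):

#include <linux/badblocks.h>
#include <linux/printk.h>

static int ex_check_range(struct badblocks *bb, sector_t start, int nr_sectors)
{
	sector_t first_bad = 0;
	int bad_sectors = 0;

	switch (badblocks_check(bb, start, nr_sectors, &first_bad, &bad_sectors)) {
	case 0:
		return 0;	/* range is clean */
	case 1:
		/* known and acknowledged bad blocks; policy is up to the caller */
		pr_debug("acked badblocks from %llu (%d sectors)\n",
			 (unsigned long long)first_bad, bad_sectors);
		return 0;
	default:
		/* unacknowledged bad blocks in the range */
		return -EIO;
	}
}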
+
+/**
+ * badblocks_set() - Add a range of bad blocks to the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to mark as bad
+ * @sectors: number of sectors to mark as bad
+ * @acknowledged: whether to mark the bad sectors as acknowledged
+ *
+ * This might extend the table, or might contract it if two adjacent ranges
+ * can be merged. We binary-search to find the 'insertion' point, then
+ * decide how best to handle it.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to set badblocks (out of space)
+ */
+int badblocks_set(struct badblocks *bb, sector_t s, int sectors,
+ int acknowledged)
+{
+ return _badblocks_set(bb, s, sectors, acknowledged);
+}
+EXPORT_SYMBOL_GPL(badblocks_set);
+
+/**
+ * badblocks_clear() - Remove a range of bad blocks from the table.
+ * @bb: the badblocks structure that holds all badblock information
+ * @s: first sector to clear
+ * @sectors: number of sectors to clear
+ *
+ * This may involve extending the table if we split a region,
+ * but it must not fail. So if the table becomes full, we just
+ * drop the remove request.
+ *
+ * Return:
+ * 0: success
+ * 1: failed to clear badblocks
+ */
+int badblocks_clear(struct badblocks *bb, sector_t s, int sectors)
+{
+ return _badblocks_clear(bb, s, sectors);
+}
EXPORT_SYMBOL_GPL(badblocks_clear);
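A hedged sketch of the usual lifecycle around the two setters exported above: record a freshly detected media error as unacknowledged, and clear the range again once it has been successfully rewritten or remapped. The ex_ wrappers are illustrative only:

#include <linux/badblocks.h>
#include <linux/printk.h>

static void ex_record_error(struct badblocks *bb, sector_t sector, int len)
{
	/* 0: not yet acknowledged in the caller's persistent metadata */
	if (badblocks_set(bb, sector, len, 0))
		pr_warn("badblocks table full, error at %llu not recorded\n",
			(unsigned long long)sector);
}

static void ex_forget_error(struct badblocks *bb, sector_t sector, int len)
{
	if (badblocks_clear(bb, sector, len))
		pr_warn("could not clear badblocks at %llu\n",
			(unsigned long long)sector);
}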
/**
@@ -525,7 +1555,7 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len,
case 3:
if (newline != '\n')
return -EINVAL;
- /* fall through */
+ fallthrough;
case 2:
if (length <= 0)
return -EINVAL;
diff --git a/block/bdev.c b/block/bdev.c
new file mode 100644
index 000000000000..e9f1b12bd75c
--- /dev/null
+++ b/block/bdev.c
@@ -0,0 +1,1148 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2016 - 2020 Christoph Hellwig
+ */
+
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/slab.h>
+#include <linux/kmod.h>
+#include <linux/major.h>
+#include <linux/device_cgroup.h>
+#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
+#include <linux/backing-dev.h>
+#include <linux/module.h>
+#include <linux/blkpg.h>
+#include <linux/magic.h>
+#include <linux/buffer_head.h>
+#include <linux/swap.h>
+#include <linux/writeback.h>
+#include <linux/mount.h>
+#include <linux/pseudo_fs.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/part_stat.h>
+#include <linux/uaccess.h>
+#include <linux/stat.h>
+#include "../fs/internal.h"
+#include "blk.h"
+
+/* Should we allow writing to mounted block devices? */
+static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED);
+
+struct bdev_inode {
+ struct block_device bdev;
+ struct inode vfs_inode;
+};
+
+static inline struct bdev_inode *BDEV_I(struct inode *inode)
+{
+ return container_of(inode, struct bdev_inode, vfs_inode);
+}
+
+struct block_device *I_BDEV(struct inode *inode)
+{
+ return &BDEV_I(inode)->bdev;
+}
+EXPORT_SYMBOL(I_BDEV);
+
+static void bdev_write_inode(struct block_device *bdev)
+{
+ struct inode *inode = bdev->bd_inode;
+ int ret;
+
+ spin_lock(&inode->i_lock);
+ while (inode->i_state & I_DIRTY) {
+ spin_unlock(&inode->i_lock);
+ ret = write_inode_now(inode, true);
+ if (ret)
+ pr_warn_ratelimited(
+ "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n",
+ bdev, ret);
+ spin_lock(&inode->i_lock);
+ }
+ spin_unlock(&inode->i_lock);
+}
+
+/* Kill _all_ buffers and pagecache, dirty or not. */
+static void kill_bdev(struct block_device *bdev)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping_empty(mapping))
+ return;
+
+ invalidate_bh_lrus();
+ truncate_inode_pages(mapping, 0);
+}
+
+/* Invalidate clean unused buffers and pagecache. */
+void invalidate_bdev(struct block_device *bdev)
+{
+ struct address_space *mapping = bdev->bd_inode->i_mapping;
+
+ if (mapping->nrpages) {
+ invalidate_bh_lrus();
+ lru_add_drain_all(); /* make sure all lru add caches are flushed */
+ invalidate_mapping_pages(mapping, 0, -1);
+ }
+}
+EXPORT_SYMBOL(invalidate_bdev);
+
+/*
+ * Drop all buffers & page cache for given bdev range. This function bails
+ * with error if bdev has other exclusive owner (such as filesystem).
+ */
+int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
+ loff_t lstart, loff_t lend)
+{
+ /*
+ * If we don't hold exclusive handle for the device, upgrade to it
+ * while we discard the buffer cache to avoid discarding buffers
+ * under live filesystem.
+ */
+ if (!(mode & BLK_OPEN_EXCL)) {
+ int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL);
+ if (err)
+ goto invalidate;
+ }
+
+ truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend);
+ if (!(mode & BLK_OPEN_EXCL))
+ bd_abort_claiming(bdev, truncate_bdev_range);
+ return 0;
+
+invalidate:
+ /*
+	 * Someone else has a handle exclusively open. Try invalidating instead.
+ * The 'end' argument is inclusive so the rounding is safe.
+ */
+ return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping,
+ lstart >> PAGE_SHIFT,
+ lend >> PAGE_SHIFT);
+}
+
+static void set_init_blocksize(struct block_device *bdev)
+{
+ unsigned int bsize = bdev_logical_block_size(bdev);
+ loff_t size = i_size_read(bdev->bd_inode);
+
+ while (bsize < PAGE_SIZE) {
+ if (size & bsize)
+ break;
+ bsize <<= 1;
+ }
+ bdev->bd_inode->i_blkbits = blksize_bits(bsize);
+}
+
+int set_blocksize(struct block_device *bdev, int size)
+{
+ /* Size must be a power of two, and between 512 and PAGE_SIZE */
+ if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size))
+ return -EINVAL;
+
+ /* Size cannot be smaller than the size supported by the device */
+ if (size < bdev_logical_block_size(bdev))
+ return -EINVAL;
+
+ /* Don't change the size if it is same as current */
+ if (bdev->bd_inode->i_blkbits != blksize_bits(size)) {
+ sync_blockdev(bdev);
+ bdev->bd_inode->i_blkbits = blksize_bits(size);
+ kill_bdev(bdev);
+ }
+ return 0;
+}
+
+EXPORT_SYMBOL(set_blocksize);
+
+int sb_set_blocksize(struct super_block *sb, int size)
+{
+ if (set_blocksize(sb->s_bdev, size))
+ return 0;
+	/* If we get here, we know size is a power of two
+	 * and its value is between 512 and PAGE_SIZE */
+ sb->s_blocksize = size;
+ sb->s_blocksize_bits = blksize_bits(size);
+ return sb->s_blocksize;
+}
+
+EXPORT_SYMBOL(sb_set_blocksize);
+
+int sb_min_blocksize(struct super_block *sb, int size)
+{
+ int minsize = bdev_logical_block_size(sb->s_bdev);
+ if (size < minsize)
+ size = minsize;
+ return sb_set_blocksize(sb, size);
+}
+
+EXPORT_SYMBOL(sb_min_blocksize);
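The three helpers above are how a filesystem normally settles on its block size at mount time. A minimal sketch of a hypothetical fill_super using them (the 1024-byte preference and the ex_ name are assumptions for illustration):

#include <linux/fs.h>
#include <linux/blkdev.h>

static int ex_fill_super(struct super_block *sb, void *data, int silent)
{
	/*
	 * Prefer 1024-byte blocks but never go below the device's logical
	 * block size; sb_min_blocksize() returns 0 if that is impossible.
	 */
	if (!sb_min_blocksize(sb, 1024))
		return -EINVAL;

	/* ... read the on-disk superblock at sb->s_blocksize granularity ... */
	return 0;
}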
+
+int sync_blockdev_nowait(struct block_device *bdev)
+{
+ if (!bdev)
+ return 0;
+ return filemap_flush(bdev->bd_inode->i_mapping);
+}
+EXPORT_SYMBOL_GPL(sync_blockdev_nowait);
+
+/*
+ * Write out and wait upon all the dirty data associated with a block
+ * device via its mapping. Does not take the superblock lock.
+ */
+int sync_blockdev(struct block_device *bdev)
+{
+ if (!bdev)
+ return 0;
+ return filemap_write_and_wait(bdev->bd_inode->i_mapping);
+}
+EXPORT_SYMBOL(sync_blockdev);
+
+int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend)
+{
+ return filemap_write_and_wait_range(bdev->bd_inode->i_mapping,
+ lstart, lend);
+}
+EXPORT_SYMBOL(sync_blockdev_range);
+
+/**
+ * bdev_freeze - lock a filesystem and force it into a consistent state
+ * @bdev: blockdevice to lock
+ *
+ * If a superblock is found on this device, we take the s_umount semaphore
+ * on it to make sure nobody unmounts until the snapshot creation is done.
+ * The reference counter (bd_fsfreeze_count) guarantees that only the last
+ * unfreeze process actually unfreezes the frozen filesystem when multiple
+ * freeze requests arrive simultaneously. It counts up in bdev_freeze() and
+ * down in bdev_thaw(). When it reaches 0, bdev_thaw() actually unfreezes
+ * the filesystem.
+ *
+ * Return: On success zero is returned, negative error code on failure.
+ */
+int bdev_freeze(struct block_device *bdev)
+{
+ int error = 0;
+
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+
+ if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) {
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return 0;
+ }
+
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) {
+ error = bdev->bd_holder_ops->freeze(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ error = sync_blockdev(bdev);
+ }
+
+ if (error)
+ atomic_dec(&bdev->bd_fsfreeze_count);
+
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return error;
+}
+EXPORT_SYMBOL(bdev_freeze);
+
+/**
+ * bdev_thaw - unlock filesystem
+ * @bdev: blockdevice to unlock
+ *
+ * Unlocks the filesystem and marks it writeable again after bdev_freeze().
+ *
+ * Return: On success zero is returned, negative error code on failure.
+ */
+int bdev_thaw(struct block_device *bdev)
+{
+ int error = -EINVAL, nr_freeze;
+
+ mutex_lock(&bdev->bd_fsfreeze_mutex);
+
+ /*
+ * If this returns < 0 it means that @bd_fsfreeze_count was
+ * already 0 and no decrement was performed.
+ */
+ nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count);
+ if (nr_freeze < 0)
+ goto out;
+
+ error = 0;
+ if (nr_freeze > 0)
+ goto out;
+
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) {
+ error = bdev->bd_holder_ops->thaw(bdev);
+ lockdep_assert_not_held(&bdev->bd_holder_lock);
+ } else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ }
+
+ if (error)
+ atomic_inc(&bdev->bd_fsfreeze_count);
+out:
+ mutex_unlock(&bdev->bd_fsfreeze_mutex);
+ return error;
+}
+EXPORT_SYMBOL(bdev_thaw);
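A sketch of how a snapshot-style user might bracket its work with the pair of exports above; ex_take_snapshot() is a placeholder for whatever the caller actually does while writes are blocked:

#include <linux/blkdev.h>

static int ex_take_snapshot(struct block_device *bdev)
{
	return 0;	/* placeholder for the real snapshot step */
}

static int ex_snapshot_bdev(struct block_device *bdev)
{
	int ret = bdev_freeze(bdev);	/* flush and block writes */

	if (ret)
		return ret;

	ret = ex_take_snapshot(bdev);

	/* always thaw; freeze/thaw nest via bd_fsfreeze_count */
	bdev_thaw(bdev);
	return ret;
}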
+
+/*
+ * pseudo-fs
+ */
+
+static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock);
+static struct kmem_cache *bdev_cachep __ro_after_init;
+
+static struct inode *bdev_alloc_inode(struct super_block *sb)
+{
+ struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL);
+
+ if (!ei)
+ return NULL;
+ memset(&ei->bdev, 0, sizeof(ei->bdev));
+ return &ei->vfs_inode;
+}
+
+static void bdev_free_inode(struct inode *inode)
+{
+ struct block_device *bdev = I_BDEV(inode);
+
+ free_percpu(bdev->bd_stats);
+ kfree(bdev->bd_meta_info);
+
+ if (!bdev_is_partition(bdev)) {
+ if (bdev->bd_disk && bdev->bd_disk->bdi)
+ bdi_put(bdev->bd_disk->bdi);
+ kfree(bdev->bd_disk);
+ }
+
+ if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(MINOR(bdev->bd_dev));
+
+ kmem_cache_free(bdev_cachep, BDEV_I(inode));
+}
+
+static void init_once(void *data)
+{
+ struct bdev_inode *ei = data;
+
+ inode_init_once(&ei->vfs_inode);
+}
+
+static void bdev_evict_inode(struct inode *inode)
+{
+ truncate_inode_pages_final(&inode->i_data);
+ invalidate_inode_buffers(inode); /* is it needed here? */
+ clear_inode(inode);
+}
+
+static const struct super_operations bdev_sops = {
+ .statfs = simple_statfs,
+ .alloc_inode = bdev_alloc_inode,
+ .free_inode = bdev_free_inode,
+ .drop_inode = generic_delete_inode,
+ .evict_inode = bdev_evict_inode,
+};
+
+static int bd_init_fs_context(struct fs_context *fc)
+{
+ struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ fc->s_iflags |= SB_I_CGROUPWB;
+ ctx->ops = &bdev_sops;
+ return 0;
+}
+
+static struct file_system_type bd_type = {
+ .name = "bdev",
+ .init_fs_context = bd_init_fs_context,
+ .kill_sb = kill_anon_super,
+};
+
+struct super_block *blockdev_superblock __ro_after_init;
+EXPORT_SYMBOL_GPL(blockdev_superblock);
+
+void __init bdev_cache_init(void)
+{
+ int err;
+ static struct vfsmount *bd_mnt __ro_after_init;
+
+ bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode),
+ 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC),
+ init_once);
+ err = register_filesystem(&bd_type);
+ if (err)
+ panic("Cannot register bdev pseudo-fs");
+ bd_mnt = kern_mount(&bd_type);
+ if (IS_ERR(bd_mnt))
+ panic("Cannot create bdev pseudo-fs");
+ blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */
+}
+
+struct block_device *bdev_alloc(struct gendisk *disk, u8 partno)
+{
+ struct block_device *bdev;
+ struct inode *inode;
+
+ inode = new_inode(blockdev_superblock);
+ if (!inode)
+ return NULL;
+ inode->i_mode = S_IFBLK;
+ inode->i_rdev = 0;
+ inode->i_data.a_ops = &def_blk_aops;
+ mapping_set_gfp_mask(&inode->i_data, GFP_USER);
+
+ bdev = I_BDEV(inode);
+ mutex_init(&bdev->bd_fsfreeze_mutex);
+ spin_lock_init(&bdev->bd_size_lock);
+ mutex_init(&bdev->bd_holder_lock);
+ bdev->bd_partno = partno;
+ bdev->bd_inode = inode;
+ bdev->bd_queue = disk->queue;
+ if (partno)
+ bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio;
+ else
+ bdev->bd_has_submit_bio = false;
+ bdev->bd_stats = alloc_percpu(struct disk_stats);
+ if (!bdev->bd_stats) {
+ iput(inode);
+ return NULL;
+ }
+ bdev->bd_disk = disk;
+ return bdev;
+}
+
+void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors)
+{
+ spin_lock(&bdev->bd_size_lock);
+ i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT);
+ bdev->bd_nr_sectors = sectors;
+ spin_unlock(&bdev->bd_size_lock);
+}
+
+void bdev_add(struct block_device *bdev, dev_t dev)
+{
+ if (bdev_stable_writes(bdev))
+ mapping_set_stable_writes(bdev->bd_inode->i_mapping);
+ bdev->bd_dev = dev;
+ bdev->bd_inode->i_rdev = dev;
+ bdev->bd_inode->i_ino = dev;
+ insert_inode_hash(bdev->bd_inode);
+}
+
+long nr_blockdev_pages(void)
+{
+ struct inode *inode;
+ long ret = 0;
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list)
+ ret += inode->i_mapping->nrpages;
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+
+ return ret;
+}
+
+/**
+ * bd_may_claim - test whether a block device can be claimed
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ * @hops: holder ops
+ *
+ * Test whether @bdev can be claimed by @holder.
+ *
+ * RETURNS:
+ * %true if @bdev can be claimed, %false otherwise.
+ */
+static bool bd_may_claim(struct block_device *bdev, void *holder,
+ const struct blk_holder_ops *hops)
+{
+ struct block_device *whole = bdev_whole(bdev);
+
+ lockdep_assert_held(&bdev_lock);
+
+ if (bdev->bd_holder) {
+ /*
+ * The same holder can always re-claim.
+ */
+ if (bdev->bd_holder == holder) {
+ if (WARN_ON_ONCE(bdev->bd_holder_ops != hops))
+ return false;
+ return true;
+ }
+ return false;
+ }
+
+ /*
+	 * If the whole device's holder is set to bd_may_claim, a partition on
+ * the device is claimed, but not the whole device.
+ */
+ if (whole != bdev &&
+ whole->bd_holder && whole->bd_holder != bd_may_claim)
+ return false;
+ return true;
+}
+
+/**
+ * bd_prepare_to_claim - claim a block device
+ * @bdev: block device of interest
+ * @holder: holder trying to claim @bdev
+ * @hops: holder ops.
+ *
+ * Claim @bdev. This function fails if @bdev is already claimed by another
+ * holder and waits if another claiming is in progress. On successful return,
+ * the caller has ownership of bd_claiming and bd_holder[s].
+ *
+ * RETURNS:
+ * 0 if @bdev can be claimed, -EBUSY otherwise.
+ */
+int bd_prepare_to_claim(struct block_device *bdev, void *holder,
+ const struct blk_holder_ops *hops)
+{
+ struct block_device *whole = bdev_whole(bdev);
+
+ if (WARN_ON_ONCE(!holder))
+ return -EINVAL;
+retry:
+ mutex_lock(&bdev_lock);
+ /* if someone else claimed, fail */
+ if (!bd_may_claim(bdev, holder, hops)) {
+ mutex_unlock(&bdev_lock);
+ return -EBUSY;
+ }
+
+ /* if claiming is already in progress, wait for it to finish */
+ if (whole->bd_claiming) {
+ wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0);
+ DEFINE_WAIT(wait);
+
+ prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE);
+ mutex_unlock(&bdev_lock);
+ schedule();
+ finish_wait(wq, &wait);
+ goto retry;
+ }
+
+ /* yay, all mine */
+ whole->bd_claiming = holder;
+ mutex_unlock(&bdev_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */
+
+static void bd_clear_claiming(struct block_device *whole, void *holder)
+{
+ lockdep_assert_held(&bdev_lock);
+ /* tell others that we're done */
+ BUG_ON(whole->bd_claiming != holder);
+ whole->bd_claiming = NULL;
+ wake_up_bit(&whole->bd_claiming, 0);
+}
+
+/**
+ * bd_finish_claiming - finish claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ * @hops: block device holder operations
+ *
+ * Finish exclusive open of a block device. Mark the device as exclusively
+ * open by the holder and wake up all waiters for exclusive open to finish.
+ */
+static void bd_finish_claiming(struct block_device *bdev, void *holder,
+ const struct blk_holder_ops *hops)
+{
+ struct block_device *whole = bdev_whole(bdev);
+
+ mutex_lock(&bdev_lock);
+ BUG_ON(!bd_may_claim(bdev, holder, hops));
+ /*
+ * Note that for a whole device bd_holders will be incremented twice,
+ * and bd_holder will be set to bd_may_claim before being set to holder
+ */
+ whole->bd_holders++;
+ whole->bd_holder = bd_may_claim;
+ bdev->bd_holders++;
+ mutex_lock(&bdev->bd_holder_lock);
+ bdev->bd_holder = holder;
+ bdev->bd_holder_ops = hops;
+ mutex_unlock(&bdev->bd_holder_lock);
+ bd_clear_claiming(whole, holder);
+ mutex_unlock(&bdev_lock);
+}
+
+/**
+ * bd_abort_claiming - abort claiming of a block device
+ * @bdev: block device of interest
+ * @holder: holder that has claimed @bdev
+ *
+ * Abort claiming of a block device when the exclusive open failed. This can
+ * also be used when exclusive open is not actually desired and we just needed
+ * to block other exclusive openers for a while.
+ */
+void bd_abort_claiming(struct block_device *bdev, void *holder)
+{
+ mutex_lock(&bdev_lock);
+ bd_clear_claiming(bdev_whole(bdev), holder);
+ mutex_unlock(&bdev_lock);
+}
+EXPORT_SYMBOL(bd_abort_claiming);
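The claim API above is typically used as a bracket around work that must exclude exclusive openers without performing a full open, exactly as truncate_bdev_range() does earlier in this file. A minimal sketch, reusing the common idiom of passing the function itself as the holder token:

#include <linux/blkdev.h>

static int ex_exclusive_op(struct block_device *bdev)
{
	int ret = bd_prepare_to_claim(bdev, ex_exclusive_op, NULL);

	if (ret)
		return ret;	/* -EBUSY: someone else holds it exclusively */

	/* ... work that must not race with exclusive openers ... */

	/* the device was never really opened, so just drop the claim */
	bd_abort_claiming(bdev, ex_exclusive_op);
	return 0;
}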
+
+static void bd_end_claim(struct block_device *bdev, void *holder)
+{
+ struct block_device *whole = bdev_whole(bdev);
+ bool unblock = false;
+
+ /*
+ * Release a claim on the device. The holder fields are protected with
+ * bdev_lock. open_mutex is used to synchronize disk_holder unlinking.
+ */
+ mutex_lock(&bdev_lock);
+ WARN_ON_ONCE(bdev->bd_holder != holder);
+ WARN_ON_ONCE(--bdev->bd_holders < 0);
+ WARN_ON_ONCE(--whole->bd_holders < 0);
+ if (!bdev->bd_holders) {
+ mutex_lock(&bdev->bd_holder_lock);
+ bdev->bd_holder = NULL;
+ bdev->bd_holder_ops = NULL;
+ mutex_unlock(&bdev->bd_holder_lock);
+ if (bdev->bd_write_holder)
+ unblock = true;
+ }
+ if (!whole->bd_holders)
+ whole->bd_holder = NULL;
+ mutex_unlock(&bdev_lock);
+
+ /*
+	 * If this was the last claim, remove the holder link and unblock event
+	 * polling if it was a write holder.
+ */
+ if (unblock) {
+ disk_unblock_events(bdev->bd_disk);
+ bdev->bd_write_holder = false;
+ }
+}
+
+static void blkdev_flush_mapping(struct block_device *bdev)
+{
+ WARN_ON_ONCE(bdev->bd_holders);
+ sync_blockdev(bdev);
+ kill_bdev(bdev);
+ bdev_write_inode(bdev);
+}
+
+static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ int ret;
+
+ if (disk->fops->open) {
+ ret = disk->fops->open(disk, mode);
+ if (ret) {
+ /* avoid ghost partitions on a removed medium */
+ if (ret == -ENOMEDIUM &&
+ test_bit(GD_NEED_PART_SCAN, &disk->state))
+ bdev_disk_changed(disk, true);
+ return ret;
+ }
+ }
+
+ if (!atomic_read(&bdev->bd_openers))
+ set_init_blocksize(bdev);
+ if (test_bit(GD_NEED_PART_SCAN, &disk->state))
+ bdev_disk_changed(disk, false);
+ atomic_inc(&bdev->bd_openers);
+ return 0;
+}
+
+static void blkdev_put_whole(struct block_device *bdev)
+{
+ if (atomic_dec_and_test(&bdev->bd_openers))
+ blkdev_flush_mapping(bdev);
+ if (bdev->bd_disk->fops->release)
+ bdev->bd_disk->fops->release(bdev->bd_disk);
+}
+
+static int blkdev_get_part(struct block_device *part, blk_mode_t mode)
+{
+ struct gendisk *disk = part->bd_disk;
+ int ret;
+
+ ret = blkdev_get_whole(bdev_whole(part), mode);
+ if (ret)
+ return ret;
+
+ ret = -ENXIO;
+ if (!bdev_nr_sectors(part))
+ goto out_blkdev_put;
+
+ if (!atomic_read(&part->bd_openers)) {
+ disk->open_partitions++;
+ set_init_blocksize(part);
+ }
+ atomic_inc(&part->bd_openers);
+ return 0;
+
+out_blkdev_put:
+ blkdev_put_whole(bdev_whole(part));
+ return ret;
+}
+
+static void blkdev_put_part(struct block_device *part)
+{
+ struct block_device *whole = bdev_whole(part);
+
+ if (atomic_dec_and_test(&part->bd_openers)) {
+ blkdev_flush_mapping(part);
+ whole->bd_disk->open_partitions--;
+ }
+ blkdev_put_whole(whole);
+}
+
+struct block_device *blkdev_get_no_open(dev_t dev)
+{
+ struct block_device *bdev;
+ struct inode *inode;
+
+ inode = ilookup(blockdev_superblock, dev);
+ if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) {
+ blk_request_module(dev);
+ inode = ilookup(blockdev_superblock, dev);
+ if (inode)
+ pr_warn_ratelimited(
+"block device autoloading is deprecated and will be removed.\n");
+ }
+ if (!inode)
+ return NULL;
+
+	/* switch from the inode reference to a device model one: */
+ bdev = &BDEV_I(inode)->bdev;
+ if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+ bdev = NULL;
+ iput(inode);
+ return bdev;
+}
+
+void blkdev_put_no_open(struct block_device *bdev)
+{
+ put_device(&bdev->bd_device);
+}
+
+static bool bdev_writes_blocked(struct block_device *bdev)
+{
+ return bdev->bd_writers == -1;
+}
+
+static void bdev_block_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = -1;
+}
+
+static void bdev_unblock_writes(struct block_device *bdev)
+{
+ bdev->bd_writers = 0;
+}
+
+static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return true;
+ /* Writes blocked? */
+ if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev))
+ return false;
+ if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0)
+ return false;
+ return true;
+}
+
+static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Claim exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_block_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers++;
+}
+
+static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode)
+{
+ if (bdev_allow_write_mounted)
+ return;
+
+ /* Yield exclusive or shared write access. */
+ if (mode & BLK_OPEN_RESTRICT_WRITES)
+ bdev_unblock_writes(bdev);
+ else if (mode & BLK_OPEN_WRITE)
+ bdev->bd_writers--;
+}
+
+/**
+ * bdev_open_by_dev - open a block device by device number
+ * @dev: device number of block device to open
+ * @mode: open mode (BLK_OPEN_*)
+ * @holder: exclusive holder identifier
+ * @hops: holder operations
+ *
+ * Open the block device described by device number @dev. If @holder is not
+ * %NULL, the block device is opened with exclusive access. Exclusive opens may
+ * nest for the same @holder.
+ *
+ * Use this interface ONLY if you really do not have anything better - i.e. when
+ * you are behind a truly sucky interface and all you are given is a device
+ * number. Everything else should use bdev_open_by_path().
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
+ */
+struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder,
+ const struct blk_holder_ops *hops)
+{
+ struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle),
+ GFP_KERNEL);
+ struct block_device *bdev;
+ bool unblock_events = true;
+ struct gendisk *disk;
+ int ret;
+
+ if (!handle)
+ return ERR_PTR(-ENOMEM);
+
+ ret = devcgroup_check_permission(DEVCG_DEV_BLOCK,
+ MAJOR(dev), MINOR(dev),
+ ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) |
+ ((mode & BLK_OPEN_WRITE) ? DEVCG_ACC_WRITE : 0));
+ if (ret)
+ goto free_handle;
+
+ /* Blocking writes requires exclusive opener */
+ if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) {
+ ret = -EINVAL;
+ goto free_handle;
+ }
+
+ bdev = blkdev_get_no_open(dev);
+ if (!bdev) {
+ ret = -ENXIO;
+ goto free_handle;
+ }
+ disk = bdev->bd_disk;
+
+ if (holder) {
+ mode |= BLK_OPEN_EXCL;
+ ret = bd_prepare_to_claim(bdev, holder, hops);
+ if (ret)
+ goto put_blkdev;
+ } else {
+ if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) {
+ ret = -EIO;
+ goto put_blkdev;
+ }
+ }
+
+ disk_block_events(disk);
+
+ mutex_lock(&disk->open_mutex);
+ ret = -ENXIO;
+ if (!disk_live(disk))
+ goto abort_claiming;
+ if (!try_module_get(disk->fops->owner))
+ goto abort_claiming;
+ ret = -EBUSY;
+ if (!bdev_may_open(bdev, mode))
+ goto abort_claiming;
+ if (bdev_is_partition(bdev))
+ ret = blkdev_get_part(bdev, mode);
+ else
+ ret = blkdev_get_whole(bdev, mode);
+ if (ret)
+ goto put_module;
+ bdev_claim_write_access(bdev, mode);
+ if (holder) {
+ bd_finish_claiming(bdev, holder, hops);
+
+ /*
+ * Block event polling for write claims if requested. Any write
+ * holder makes the write_holder state stick until all are
+ * released. This is good enough and tracking individual
+ * writeable reference is too fragile given the way @mode is
+ * used in blkdev_get/put().
+ */
+ if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder &&
+ (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) {
+ bdev->bd_write_holder = true;
+ unblock_events = false;
+ }
+ }
+ mutex_unlock(&disk->open_mutex);
+
+ if (unblock_events)
+ disk_unblock_events(disk);
+ handle->bdev = bdev;
+ handle->holder = holder;
+ handle->mode = mode;
+ return handle;
+put_module:
+ module_put(disk->fops->owner);
+abort_claiming:
+ if (holder)
+ bd_abort_claiming(bdev, holder);
+ mutex_unlock(&disk->open_mutex);
+ disk_unblock_events(disk);
+put_blkdev:
+ blkdev_put_no_open(bdev);
+free_handle:
+ kfree(handle);
+ return ERR_PTR(ret);
+}
+EXPORT_SYMBOL(bdev_open_by_dev);
+
+/**
+ * bdev_open_by_path - open a block device by name
+ * @path: path to the block device to open
+ * @mode: open mode (BLK_OPEN_*)
+ * @holder: exclusive holder identifier
+ * @hops: holder operations
+ *
+ * Open the block device described by the device file at @path. If @holder is
+ * not %NULL, the block device is opened with exclusive access. Exclusive opens
+ * may nest for the same @holder.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * Handle with a reference to the block_device on success, ERR_PTR(-errno) on
+ * failure.
+ */
+struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode,
+ void *holder, const struct blk_holder_ops *hops)
+{
+ struct bdev_handle *handle;
+ dev_t dev;
+ int error;
+
+ error = lookup_bdev(path, &dev);
+ if (error)
+ return ERR_PTR(error);
+
+ handle = bdev_open_by_dev(dev, mode, holder, hops);
+ if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) &&
+ bdev_read_only(handle->bdev)) {
+ bdev_release(handle);
+ return ERR_PTR(-EACCES);
+ }
+
+ return handle;
+}
+EXPORT_SYMBOL(bdev_open_by_path);
+
+void bdev_release(struct bdev_handle *handle)
+{
+ struct block_device *bdev = handle->bdev;
+ struct gendisk *disk = bdev->bd_disk;
+
+ /*
+ * Sync early if it looks like we're the last one. If someone else
+ * opens the block device between now and the decrement of bd_openers
+ * then we did a sync that we didn't need to, but that's not the end
+	 * of the world and we want to avoid long (could be several minutes)
+ * syncs while holding the mutex.
+ */
+ if (atomic_read(&bdev->bd_openers) == 1)
+ sync_blockdev(bdev);
+
+ mutex_lock(&disk->open_mutex);
+ bdev_yield_write_access(bdev, handle->mode);
+
+ if (handle->holder)
+ bd_end_claim(bdev, handle->holder);
+
+ /*
+ * Trigger event checking and tell drivers to flush MEDIA_CHANGE
+ * event. This is to ensure detection of media removal commanded
+ * from userland - e.g. eject(1).
+ */
+ disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE);
+
+ if (bdev_is_partition(bdev))
+ blkdev_put_part(bdev);
+ else
+ blkdev_put_whole(bdev);
+ mutex_unlock(&disk->open_mutex);
+
+ module_put(disk->fops->owner);
+ blkdev_put_no_open(bdev);
+ kfree(handle);
+}
+EXPORT_SYMBOL(bdev_release);
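A caller-side sketch of the handle-based open/release pair above, assuming a hypothetical driver that wants exclusive read/write access to a backing device and uses its own context structure as the holder:

#include <linux/blkdev.h>
#include <linux/err.h>

struct ex_ctx {
	struct bdev_handle *handle;
};

static int ex_attach_backing(struct ex_ctx *ctx, const char *path)
{
	struct bdev_handle *handle;

	handle = bdev_open_by_path(path, BLK_OPEN_READ | BLK_OPEN_WRITE,
				   ctx /* holder */, NULL /* default holder ops */);
	if (IS_ERR(handle))
		return PTR_ERR(handle);

	ctx->handle = handle;	/* I/O then goes through ctx->handle->bdev */
	return 0;
}

static void ex_detach_backing(struct ex_ctx *ctx)
{
	bdev_release(ctx->handle);
	ctx->handle = NULL;
}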
+
+/**
+ * lookup_bdev() - Look up a struct block_device by name.
+ * @pathname: Name of the block device in the filesystem.
+ * @dev: Pointer to the block device's dev_t, if found.
+ *
+ * Lookup the block device's dev_t at @pathname in the current
+ * namespace if possible and return it in @dev.
+ *
+ * Context: May sleep.
+ * Return: 0 if succeeded, negative errno otherwise.
+ */
+int lookup_bdev(const char *pathname, dev_t *dev)
+{
+ struct inode *inode;
+ struct path path;
+ int error;
+
+ if (!pathname || !*pathname)
+ return -EINVAL;
+
+ error = kern_path(pathname, LOOKUP_FOLLOW, &path);
+ if (error)
+ return error;
+
+ inode = d_backing_inode(path.dentry);
+ error = -ENOTBLK;
+ if (!S_ISBLK(inode->i_mode))
+ goto out_path_put;
+ error = -EACCES;
+ if (!may_open_dev(&path))
+ goto out_path_put;
+
+ *dev = inode->i_rdev;
+ error = 0;
+out_path_put:
+ path_put(&path);
+ return error;
+}
+EXPORT_SYMBOL(lookup_bdev);
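lookup_bdev() only resolves a path to a device number and opens nothing; a short sketch:

#include <linux/blkdev.h>
#include <linux/kdev_t.h>
#include <linux/printk.h>

static int ex_resolve_devt(const char *path, dev_t *devt)
{
	int ret = lookup_bdev(path, devt);

	if (ret)
		return ret;	/* e.g. -ENOTBLK if @path is not a block device */

	pr_info("%s is device %u:%u\n", path, MAJOR(*devt), MINOR(*devt));
	return 0;
}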
+
+/**
+ * bdev_mark_dead - mark a block device as dead
+ * @bdev: block device to operate on
+ * @surprise: indicate a surprise removal
+ *
+ * Tell the file system that this device or media is dead. If @surprise is set
+ * to %true the device or media is already gone, if not we are preparing for an
+ * orderly removal.
+ *
+ * This calls into the file system, which then typically syncs out all dirty data
+ * and writes back inodes and then invalidates any cached data in the inodes on
+ * the file system. In addition we also invalidate the block device mapping.
+ */
+void bdev_mark_dead(struct block_device *bdev, bool surprise)
+{
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead)
+ bdev->bd_holder_ops->mark_dead(bdev, surprise);
+ else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ sync_blockdev(bdev);
+ }
+
+ invalidate_bdev(bdev);
+}
+/*
+ * New drivers should not use this directly. There are, however, some drivers
+ * that need this for historical reasons. For example, the DASD driver has
+ * historically had a shutdown to offline mode that doesn't actually remove the
+ * gendisk that otherwise looks a lot like a safe device removal.
+ */
+EXPORT_SYMBOL_GPL(bdev_mark_dead);
+
+void sync_bdevs(bool wait)
+{
+ struct inode *inode, *old_inode = NULL;
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) {
+ struct address_space *mapping = inode->i_mapping;
+ struct block_device *bdev;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) ||
+ mapping->nrpages == 0) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ __iget(inode);
+ spin_unlock(&inode->i_lock);
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+ /*
+ * We hold a reference to 'inode' so it couldn't have been
+ * removed from s_inodes list while we dropped the
+	 * s_inode_list_lock. We cannot iput the inode now as we can
+ * be holding the last reference and we cannot iput it under
+ * s_inode_list_lock. So we keep the reference and iput it
+ * later.
+ */
+ iput(old_inode);
+ old_inode = inode;
+ bdev = I_BDEV(inode);
+
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ if (!atomic_read(&bdev->bd_openers)) {
+ ; /* skip */
+ } else if (wait) {
+ /*
+ * We keep the error status of individual mapping so
+ * that applications can catch the writeback error using
+ * fsync(2). See filemap_fdatawait_keep_errors() for
+ * details.
+ */
+ filemap_fdatawait_keep_errors(inode->i_mapping);
+ } else {
+ filemap_fdatawrite(inode->i_mapping);
+ }
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+
+ spin_lock(&blockdev_superblock->s_inode_list_lock);
+ }
+ spin_unlock(&blockdev_superblock->s_inode_list_lock);
+ iput(old_inode);
+}
+
+/*
+ * Handle STATX_DIOALIGN for block devices.
+ *
+ * Note that the inode passed to this is the inode of a block device node file,
+ * not the block device's internal inode. Therefore it is *not* valid to use
+ * I_BDEV() here; the block device has to be looked up by i_rdev instead.
+ */
+void bdev_statx_dioalign(struct inode *inode, struct kstat *stat)
+{
+ struct block_device *bdev;
+
+ bdev = blkdev_get_no_open(inode->i_rdev);
+ if (!bdev)
+ return;
+
+ stat->dio_mem_align = bdev_dma_alignment(bdev) + 1;
+ stat->dio_offset_align = bdev_logical_block_size(bdev);
+ stat->result_mask |= STATX_DIOALIGN;
+
+ blkdev_put_no_open(bdev);
+}
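From userspace the values filled in above are reported by statx(2). A hedged sketch of querying them for a block device node, assuming a kernel and libc recent enough to expose STATX_DIOALIGN:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/dev/sda";
	struct statx stx;

	if (statx(AT_FDCWD, path, 0, STATX_DIOALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio mem align %u, dio offset align %u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	return 0;
}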
+
+static int __init setup_bdev_allow_write_mounted(char *str)
+{
+ if (kstrtobool(str, &bdev_allow_write_mounted))
+ pr_warn("Invalid option string for bdev_allow_write_mounted:"
+ " '%s'\n", str);
+ return 1;
+}
+__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted);
diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c
index 68882b9b8f11..2c90e5de0acd 100644
--- a/block/bfq-cgroup.c
+++ b/block/bfq-cgroup.c
@@ -6,13 +6,13 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
-#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
#include <linux/sbitmap.h>
#include <linux/delay.h>
+#include "elevator.h"
#include "bfq-iosched.h"
#ifdef CONFIG_BFQ_CGROUP_DEBUG
@@ -220,51 +220,46 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg)
}
void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op)
+ blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, 1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, 1);
bfqg_stats_end_empty_time(&bfqg->stats);
- if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue))
+ if (!(bfqq == bfqg->bfqd->in_service_queue))
bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq));
}
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.queued, op, -1);
+ blkg_rwstat_add(&bfqg->stats.queued, opf, -1);
}
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op)
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf)
{
- blkg_rwstat_add(&bfqg->stats.merged, op, 1);
+ blkg_rwstat_add(&bfqg->stats.merged, opf, 1);
}
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op)
+ u64 io_start_time_ns, blk_opf_t opf)
{
struct bfqg_stats *stats = &bfqg->stats;
u64 now = ktime_get_ns();
if (now > io_start_time_ns)
- blkg_rwstat_add(&stats->service_time, op,
+ blkg_rwstat_add(&stats->service_time, opf,
now - io_start_time_ns);
if (io_start_time_ns > start_time_ns)
- blkg_rwstat_add(&stats->wait_time, op,
+ blkg_rwstat_add(&stats->wait_time, opf,
io_start_time_ns - start_time_ns);
}
#else /* CONFIG_BFQ_CGROUP_DEBUG */
-void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op) { }
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { }
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { }
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { }
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { }
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op) { }
+ u64 io_start_time_ns, blk_opf_t opf) { }
void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { }
-void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { }
-void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { }
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { }
-void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { }
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
@@ -321,18 +316,16 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
static void bfqg_get(struct bfq_group *bfqg)
{
- bfqg->ref++;
+ refcount_inc(&bfqg->ref);
}
static void bfqg_put(struct bfq_group *bfqg)
{
- bfqg->ref--;
-
- if (bfqg->ref == 0)
+ if (refcount_dec_and_test(&bfqg->ref))
kfree(bfqg);
}
-void bfqg_and_blkg_get(struct bfq_group *bfqg)
+static void bfqg_and_blkg_get(struct bfq_group *bfqg)
{
/* see comments in bfq_bic_update_cgroup for why refcounting bfqg */
bfqg_get(bfqg);
@@ -463,7 +456,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
{
if (blkg_rwstat_init(&stats->bytes, gfp) ||
blkg_rwstat_init(&stats->ios, gfp))
- return -ENOMEM;
+ goto error;
#ifdef CONFIG_BFQ_CGROUP_DEBUG
if (blkg_rwstat_init(&stats->merged, gfp) ||
@@ -476,13 +469,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp)
bfq_stat_init(&stats->dequeue, gfp) ||
bfq_stat_init(&stats->group_wait_time, gfp) ||
bfq_stat_init(&stats->idle_time, gfp) ||
- bfq_stat_init(&stats->empty_time, gfp)) {
- bfqg_stats_exit(stats);
- return -ENOMEM;
- }
+ bfq_stat_init(&stats->empty_time, gfp))
+ goto error;
#endif
return 0;
+
+error:
+ bfqg_stats_exit(stats);
+ return -ENOMEM;
}
static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd)
@@ -502,15 +497,9 @@ static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp)
bgd = kzalloc(sizeof(*bgd), gfp);
if (!bgd)
return NULL;
- return &bgd->pd;
-}
-
-static void bfq_cpd_init(struct blkcg_policy_data *cpd)
-{
- struct bfq_group_data *d = cpd_to_bfqgd(cpd);
- d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ?
- CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL;
+ bgd->weight = CGROUP_WEIGHT_DFL;
+ return &bgd->pd;
}
static void bfq_cpd_free(struct blkcg_policy_data *cpd)
@@ -518,12 +507,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd)
kfree(cpd_to_bfqgd(cpd));
}
-static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
- struct blkcg *blkcg)
+static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk,
+ struct blkcg *blkcg, gfp_t gfp)
{
struct bfq_group *bfqg;
- bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node);
+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id);
if (!bfqg)
return NULL;
@@ -533,7 +522,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q,
}
/* see comments in bfq_bic_update_cgroup for why refcounting */
- bfqg_get(bfqg);
+ refcount_set(&bfqg->ref, 1);
return &bfqg->pd;
}
@@ -547,12 +536,15 @@ static void bfq_pd_init(struct blkg_policy_data *pd)
entity->orig_weight = entity->weight = entity->new_weight = d->weight;
entity->my_sched_data = &bfqg->sched_data;
+ entity->last_bfqq_created = NULL;
+
bfqg->my_entity = entity; /*
* the root_group's will be set to NULL
* in bfq_init_queue()
*/
bfqg->bfqd = bfqd;
bfqg->active_entities = 0;
+ bfqg->num_queues_with_pending_reqs = 0;
bfqg->rq_pos_tree = RB_ROOT;
}
@@ -581,28 +573,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg,
entity->sched_data = &parent->sched_data;
}
-static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd,
- struct blkcg *blkcg)
-{
- struct blkcg_gq *blkg;
-
- blkg = blkg_lookup(blkcg, bfqd->queue);
- if (likely(blkg))
- return blkg_to_bfqg(blkg);
- return NULL;
-}
-
-struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
- struct blkcg *blkcg)
+static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
- struct bfq_group *bfqg, *parent;
+ struct bfq_group *parent;
struct bfq_entity *entity;
- bfqg = bfq_lookup_bfqg(bfqd, blkcg);
-
- if (unlikely(!bfqg))
- return NULL;
-
/*
* Update chain of bfq_groups as we might be handling a leaf group
* which, along with some of its relatives, has not been hooked yet
@@ -619,8 +594,28 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
bfq_group_set_parent(curr_bfqg, parent);
}
}
+}
- return bfqg;
+struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+ struct bfq_group *bfqg;
+
+ while (blkg) {
+ if (!blkg->online) {
+ blkg = blkg->parent;
+ continue;
+ }
+ bfqg = blkg_to_bfqg(blkg);
+ if (bfqg->pd.online) {
+ bio_associate_blkg_from_css(bio, &blkg->blkcg->css);
+ return bfqg;
+ }
+ blkg = blkg->parent;
+ }
+ bio_associate_blkg_from_css(bio,
+ &bfqg_to_blkg(bfqd->root_group)->blkcg->css);
+ return bfqd->root_group;
}
/**
@@ -641,13 +636,33 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg)
{
struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_group *old_parent = bfqq_group(bfqq);
+ bool has_pending_reqs = false;
/*
+ * No point to move bfqq to the same group, which can happen when
+ * root group is offlined
+ */
+ if (old_parent == bfqg)
+ return;
+
+ /*
+ * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group
+ * until elevator exit.
+ */
+ if (bfqq == &bfqd->oom_bfqq)
+ return;
+ /*
* Get extra reference to prevent bfqq from being freed in
* next possible expire or deactivate.
*/
bfqq->ref++;
+ if (entity->in_groups_with_pending_reqs) {
+ has_pending_reqs = true;
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ }
+
/* If bfqq is empty, then bfq_bfqq_expire also invokes
* bfq_del_bfqq_busy, thereby removing bfqq and its entity
* from data structures related to current group. Otherwise we
@@ -662,88 +677,126 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
else if (entity->on_st_or_in_serv)
bfq_put_idle_entity(bfq_entity_service_tree(entity), entity);
- bfqg_and_blkg_put(bfqq_group(bfqq));
+ bfqg_and_blkg_put(old_parent);
+
+ if (entity->parent &&
+ entity->parent->last_bfqq_created == bfqq)
+ entity->parent->last_bfqq_created = NULL;
+ else if (bfqd->last_bfqq_created == bfqq)
+ bfqd->last_bfqq_created = NULL;
entity->parent = bfqg->my_entity;
entity->sched_data = &bfqg->sched_data;
/* pin down bfqg and its associated blkg */
bfqg_and_blkg_get(bfqg);
+ if (has_pending_reqs)
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
+
if (bfq_bfqq_busy(bfqq)) {
if (unlikely(!bfqd->nonrot_with_queueing))
bfq_pos_tree_add_move(bfqd, bfqq);
bfq_activate_bfqq(bfqd, bfqq);
}
- if (!bfqd->in_service_queue && !bfqd->rq_in_driver)
+ if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver)
bfq_schedule_dispatch(bfqd);
/* release extra ref taken above, bfqq may happen to be freed now */
bfq_put_queue(bfqq);
}
+static void bfq_sync_bfqq_move(struct bfq_data *bfqd,
+ struct bfq_queue *sync_bfqq,
+ struct bfq_io_cq *bic,
+ struct bfq_group *bfqg,
+ unsigned int act_idx)
+{
+ struct bfq_queue *bfqq;
+
+ if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) {
+ /* We are the only user of this bfqq, just move it */
+ if (sync_bfqq->entity.sched_data != &bfqg->sched_data)
+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ return;
+ }
+
+ /*
+ * The queue was merged to a different queue. Check
+ * that the merge chain still belongs to the same
+ * cgroup.
+ */
+ for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq)
+ if (bfqq->entity.sched_data != &bfqg->sched_data)
+ break;
+ if (bfqq) {
+ /*
+ * Some queue changed cgroup so the merge is not valid
+ * anymore. We cannot easily just cancel the merge (by
+ * clearing new_bfqq) as there may be other processes
+ * using this queue and holding refs to all queues
+ * below sync_bfqq->new_bfqq. Similarly if the merge
+ * already happened, we need to detach from bfqq now
+ * so that we cannot merge bio to a request from the
+ * old cgroup.
+ */
+ bfq_put_cooperator(sync_bfqq);
+ bic_set_bfqq(bic, NULL, true, act_idx);
+ bfq_release_process_ref(bfqd, sync_bfqq);
+ }
+}
+
/**
- * __bfq_bic_change_cgroup - move @bic to @cgroup.
+ * __bfq_bic_change_cgroup - move @bic to @bfqg.
* @bfqd: the queue descriptor.
* @bic: the bic to move.
- * @blkcg: the blk-cgroup to move to.
+ * @bfqg: the group to move to.
*
* Move bic to blkcg, assuming that bfqd->lock is held; which makes
* sure that the reference to cgroup is valid across the call (see
* comments in bfq_bic_update_cgroup on this issue)
- *
- * NOTE: an alternative approach might have been to store the current
- * cgroup in bfqq and getting a reference to it, reducing the lookup
- * time here, at the price of slightly more complex code.
*/
-static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd,
- struct bfq_io_cq *bic,
- struct blkcg *blkcg)
+static void __bfq_bic_change_cgroup(struct bfq_data *bfqd,
+ struct bfq_io_cq *bic,
+ struct bfq_group *bfqg)
{
- struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0);
- struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1);
- struct bfq_group *bfqg;
- struct bfq_entity *entity;
-
- bfqg = bfq_find_set_group(bfqd, blkcg);
+ unsigned int act_idx;
- if (unlikely(!bfqg))
- bfqg = bfqd->root_group;
+ for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) {
+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx);
+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx);
- if (async_bfqq) {
- entity = &async_bfqq->entity;
-
- if (entity->sched_data != &bfqg->sched_data) {
- bic_set_bfqq(bic, NULL, 0);
+ if (async_bfqq &&
+ async_bfqq->entity.sched_data != &bfqg->sched_data) {
+ bic_set_bfqq(bic, NULL, false, act_idx);
bfq_release_process_ref(bfqd, async_bfqq);
}
- }
- if (sync_bfqq) {
- entity = &sync_bfqq->entity;
- if (entity->sched_data != &bfqg->sched_data)
- bfq_bfqq_move(bfqd, sync_bfqq, bfqg);
+ if (sync_bfqq)
+ bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx);
}
-
- return bfqg;
}
void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
{
struct bfq_data *bfqd = bic_to_bfqd(bic);
- struct bfq_group *bfqg = NULL;
+ struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio);
uint64_t serial_nr;
- rcu_read_lock();
- serial_nr = __bio_blkcg(bio)->css.serial_nr;
+ serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr;
/*
* Check whether blkcg has changed. The condition may trigger
* spuriously on a newly created cic but there's no harm.
*/
if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr))
- goto out;
+ return;
- bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio));
+ /*
+ * New cgroup for this process. Make sure it is linked to bfq internal
+ * cgroup hierarchy.
+ */
+ bfq_link_bfqg(bfqd, bfqg);
+ __bfq_bic_change_cgroup(bfqd, bic, bfqg);
/*
* Update blkg_path for bfq_log_* functions. We cache this
* path, and update it here, for the following
@@ -796,8 +849,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio)
*/
blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path));
bic->blkcg_serial_nr = serial_nr;
-out:
- rcu_read_unlock();
}
/**
@@ -817,6 +868,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st)
* @bfqd: the device data structure with the root group.
* @entity: the entity to move, if entity is a leaf; or the parent entity
* of an active leaf entity to move, if entity is not a leaf.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
struct bfq_entity *entity,
@@ -846,6 +898,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd,
* @bfqd: the device data structure with the root group.
* @bfqg: the group to move from.
* @st: the service tree to start the search from.
+ * @ioprio_class: I/O priority class to reparent.
*/
static void bfq_reparent_active_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg,
@@ -1052,9 +1105,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
struct bfq_group *bfqg;
u64 v;
- ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx);
+ blkg_conf_init(&ctx, buf);
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx);
if (ret)
- return ret;
+ goto out;
if (sscanf(ctx.body, "%llu", &v) == 1) {
/* require "default" on dfl */
@@ -1076,7 +1131,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of,
ret = 0;
}
out:
- blkg_conf_finish(&ctx);
+ blkg_conf_exit(&ctx);
return ret ?: nbytes;
}
@@ -1230,7 +1285,7 @@ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
{
int ret;
- ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq);
+ ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq);
if (ret)
return NULL;
@@ -1242,8 +1297,6 @@ struct blkcg_policy blkcg_policy_bfq = {
.legacy_cftypes = bfq_blkcg_legacy_files,
.cpd_alloc_fn = bfq_cpd_alloc,
- .cpd_init_fn = bfq_cpd_init,
- .cpd_bind_fn = bfq_cpd_init,
.cpd_free_fn = bfq_cpd_free,
.pd_alloc_fn = bfq_pd_alloc,
@@ -1414,7 +1467,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd)
bfq_end_wr_async_queues(bfqd, bfqd->root_group);
}
-struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg)
+struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio)
{
return bfqd->root_group;
}
@@ -1424,8 +1477,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq)
return bfqq->bfqd->root_group;
}
-void bfqg_and_blkg_get(struct bfq_group *bfqg) {}
-
void bfqg_and_blkg_put(struct bfq_group *bfqg) {}
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node)
diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c
index 50c8f034c01c..3cce6de464a7 100644
--- a/block/bfq-iosched.c
+++ b/block/bfq-iosched.c
@@ -117,7 +117,6 @@
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/cgroup.h>
-#include <linux/elevator.h>
#include <linux/ktime.h>
#include <linux/rbtree.h>
#include <linux/ioprio.h>
@@ -125,9 +124,11 @@
#include <linux/delay.h>
#include <linux/backing-dev.h>
+#include <trace/events/block.h>
+
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
-#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
#include "bfq-iosched.h"
#include "blk-wbt.h"
@@ -158,10 +159,9 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
-BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS \
-/* Expiration time of sync (0) and async (1) requests, in ns. */
+/* Expiration time of async (0) and sync (1) requests, in ns. */
static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 };
/* Maximum backwards seek (magic number lifted from CFQ), in KiB. */
@@ -363,17 +363,74 @@ static int ref_wr_duration[2];
*/
static const unsigned long max_service_from_wr = 120000;
-#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0])
+/*
+ * Maximum time between the creation of two queues, for stable merge
+ * to be activated (in ms)
+ */
+static const unsigned long bfq_activation_stable_merging = 600;
+/*
+ * Minimum time to be waited before evaluating delayed stable merge (in ms)
+ */
+static const unsigned long bfq_late_stable_merging = 600;
+
+#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0]))
#define RQ_BFQQ(rq) ((rq)->elv.priv[1])
-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync)
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx)
{
- return bic->bfqq[is_sync];
+ if (is_sync)
+ return bic->bfqq[1][actuator_idx];
+
+ return bic->bfqq[0][actuator_idx];
}
-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync)
+static void bfq_put_stable_ref(struct bfq_queue *bfqq);
+
+void bic_set_bfqq(struct bfq_io_cq *bic,
+ struct bfq_queue *bfqq,
+ bool is_sync,
+ unsigned int actuator_idx)
{
- bic->bfqq[is_sync] = bfqq;
+ struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx];
+
+ /*
+ * If bfqq != NULL, then a non-stable queue merge between
+ * bic->bfqq and bfqq is happening here. This causes troubles
+ * in the following case: bic->bfqq has also been scheduled
+ * for a possible stable merge with bic->stable_merge_bfqq,
+ * and bic->stable_merge_bfqq == bfqq happens to
+ * hold. Troubles occur because bfqq may then undergo a split,
+ * thereby becoming eligible for a stable merge. Yet, if
+ * bic->stable_merge_bfqq points exactly to bfqq, then bfqq
+ * would be stably merged with itself. To avoid this anomaly,
+ * we cancel the stable merge if
+ * bic->stable_merge_bfqq == bfqq.
+ */
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx];
+
+ /* Clear bic pointer if bfqq is detached from this bic */
+ if (old_bfqq && old_bfqq->bic == bic)
+ old_bfqq->bic = NULL;
+
+ if (is_sync)
+ bic->bfqq[1][actuator_idx] = bfqq;
+ else
+ bic->bfqq[0][actuator_idx] = bfqq;
+
+ if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) {
+ /*
+ * Actually, these same instructions are executed also
+ * in bfq_setup_cooperator, in case of abort or actual
+ * execution of a stable merge. We could avoid
+ * repeating these instructions there too, but if we
+ * did so, we would nest even more complexity in this
+ * function.
+ */
+ bfq_put_stable_ref(bfqq_data->stable_merge_bfqq);
+
+ bfqq_data->stable_merge_bfqq = NULL;
+ }
}
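To help follow the bic->bfqq changes that recur throughout the patch: the io_cq now keeps one queue pointer per (sync, actuator) pair. The standalone sketch below is not part of the patch; the toy_* names and the NUM_ACTUATORS value are invented for illustration. It only mirrors the [is_sync][actuator_idx] lookup that bic_to_bfqq() and bic_set_bfqq() perform.

#include <stddef.h>
#include <stdio.h>

#define NUM_ACTUATORS 2			/* example value; bfqd->num_actuators in BFQ */

struct toy_bfqq { int id; };

struct toy_bic {
	/* bfqq[0][i]: async queue for actuator i, bfqq[1][i]: sync queue */
	struct toy_bfqq *bfqq[2][NUM_ACTUATORS];
};

static struct toy_bfqq *toy_bic_to_bfqq(struct toy_bic *bic, int is_sync,
					unsigned int actuator_idx)
{
	return bic->bfqq[!!is_sync][actuator_idx];
}

int main(void)
{
	struct toy_bfqq sync_q = { .id = 42 };
	struct toy_bic bic = { { { NULL } } };

	bic.bfqq[1][1] = &sync_q;	/* sync queue serving actuator 1 */

	printf("sync queue on actuator 1: id %d\n",
	       toy_bic_to_bfqq(&bic, 1, 1)->id);
	printf("async queue on actuator 0 present: %s\n",
	       toy_bic_to_bfqq(&bic, 0, 0) ? "yes" : "no");
	return 0;
}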
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic)
@@ -393,26 +450,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq)
/**
* bfq_bic_lookup - search into @ioc a bic associated to @bfqd.
- * @bfqd: the lookup key.
- * @ioc: the io_context of the process doing I/O.
* @q: the request queue.
*/
-static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
- struct io_context *ioc,
- struct request_queue *q)
+static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q)
{
- if (ioc) {
- unsigned long flags;
- struct bfq_io_cq *icq;
+ struct bfq_io_cq *icq;
+ unsigned long flags;
- spin_lock_irqsave(&q->queue_lock, flags);
- icq = icq_to_bic(ioc_lookup_icq(ioc, q));
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ if (!current->io_context)
+ return NULL;
- return icq;
- }
+ spin_lock_irqsave(&q->queue_lock, flags);
+ icq = icq_to_bic(ioc_lookup_icq(q));
+ spin_unlock_irqrestore(&q->queue_lock, flags);
- return NULL;
+ return icq;
}
/*
@@ -421,6 +473,8 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd,
*/
void bfq_schedule_dispatch(struct bfq_data *bfqd)
{
+ lockdep_assert_held(&bfqd->lock);
+
if (bfqd->queued != 0) {
bfq_log(bfqd, "schedule dispatch");
blk_mq_run_hw_queues(bfqd->queue, true);
@@ -525,26 +579,149 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd,
}
}
+#define BFQ_LIMIT_INLINE_DEPTH 16
+
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ struct bfq_data *bfqd = bfqq->bfqd;
+ struct bfq_entity *entity = &bfqq->entity;
+ struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH];
+ struct bfq_entity **entities = inline_entities;
+ int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH;
+ int class_idx = bfqq->ioprio_class - 1;
+ struct bfq_sched_data *sched_data;
+ unsigned long wsum;
+ bool ret = false;
+
+ if (!entity->on_st_or_in_serv)
+ return false;
+
+retry:
+ spin_lock_irq(&bfqd->lock);
+ /* +1 for bfqq entity, root cgroup not included */
+ depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1;
+ if (depth > alloc_depth) {
+ spin_unlock_irq(&bfqd->lock);
+ if (entities != inline_entities)
+ kfree(entities);
+ entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO);
+ if (!entities)
+ return false;
+ alloc_depth = depth;
+ goto retry;
+ }
+
+ sched_data = entity->sched_data;
+ /* Gather our ancestors as we need to traverse them in reverse order */
+ level = 0;
+ for_each_entity(entity) {
+ /*
+ * If at some level entity is not even active, allow request
+ * queueing so that BFQ knows there's work to do and activate
+ * entities.
+ */
+ if (!entity->on_st_or_in_serv)
+ goto out;
+ /* Uh, more parents than cgroup subsystem thinks? */
+ if (WARN_ON_ONCE(level >= depth))
+ break;
+ entities[level++] = entity;
+ }
+ WARN_ON_ONCE(level != depth);
+ for (level--; level >= 0; level--) {
+ entity = entities[level];
+ if (level > 0) {
+ wsum = bfq_entity_service_tree(entity)->wsum;
+ } else {
+ int i;
+ /*
+ * For bfqq itself we take into account service trees
+ * of all higher priority classes and multiply their
+ * weights so that low prio queue from higher class
+ * gets more requests than high prio queue from lower
+ * class.
+ */
+ wsum = 0;
+ for (i = 0; i <= class_idx; i++) {
+ wsum = wsum * IOPRIO_BE_NR +
+ sched_data->service_tree[i].wsum;
+ }
+ }
+ if (!wsum)
+ continue;
+ limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum);
+ if (entity->allocated >= limit) {
+ bfq_log_bfqq(bfqq->bfqd, bfqq,
+ "too many requests: allocated %d limit %d level %d",
+ entity->allocated, limit, level);
+ ret = true;
+ break;
+ }
+ }
+out:
+ spin_unlock_irq(&bfqd->lock);
+ if (entities != inline_entities)
+ kfree(entities);
+ return ret;
+}
+#else
+static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit)
+{
+ return false;
+}
+#endif
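To make the per-level scaling above easier to follow, here is a standalone userspace sketch, not part of the patch, of the limit = DIV_ROUND_CLOSEST(limit * weight, wsum) walk that bfqq_request_over_limit() performs from the outermost group down to the queue. The weights, weight sums and starting limit are invented example numbers, and the extra folding of higher-priority-class service trees into wsum at the leaf level is omitted for brevity.

#include <stddef.h>
#include <stdio.h>

#define DIV_ROUND_CLOSEST(x, d)	(((x) + (d) / 2) / (d))

struct level {
	unsigned int weight;	/* weight of the entity at this level */
	unsigned int wsum;	/* sum of the weights competing at this level */
};

int main(void)
{
	/* cgroup -> nested cgroup -> bfq queue (leaf); made-up weights */
	struct level path[] = {
		{ .weight = 100, .wsum = 300 },	/* group owns 1/3 of the device */
		{ .weight =  50, .wsum = 100 },	/* child group owns 1/2 of that */
		{ .weight =  10, .wsum =  40 },	/* queue owns 1/4 of that */
	};
	int limit = 256;	/* q->nr_requests in this example */
	size_t i;

	for (i = 0; i < sizeof(path) / sizeof(path[0]); i++) {
		limit = DIV_ROUND_CLOSEST(limit * path[i].weight, path[i].wsum);
		printf("level %zu: limit %d\n", i, limit);
	}
	/* A queue with 'limit' or more allocated requests would be flagged
	 * as over its fair share by bfqq_request_over_limit(). */
	return 0;
}

With these made-up numbers the 256-tag budget shrinks to 85, then 43, then 11 requests by the time the leaf queue is reached.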
+
/*
* Async I/O can easily starve sync I/O (both sync reads and sync
* writes), by consuming all tags. Similarly, storms of sync writes,
* such as those that sync(2) may trigger, can starve sync reads.
* Limit depths of async I/O and sync writes so as to counter both
* problems.
+ *
+ * Also if a bfq queue or its parent cgroup consume more tags than would be
+ * appropriate for their weight, we trim the available tag depth to 1. This
+ * avoids a situation where one cgroup can starve another cgroup from tags and
+ * thus block service differentiation among cgroups. Note that because the
+ * queue / cgroup already has many requests allocated and queued, this does not
+ * significantly affect service guarantees coming from the BFQ scheduling
+ * algorithm.
*/
-static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
struct bfq_data *bfqd = data->q->elevator->elevator_data;
+ struct bfq_io_cq *bic = bfq_bic_lookup(data->q);
+ int depth;
+ unsigned limit = data->q->nr_requests;
+ unsigned int act_idx;
+
+ /* Sync reads have full depth available */
+ if (op_is_sync(opf) && !op_is_write(opf)) {
+ depth = 0;
+ } else {
+ depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)];
+ limit = (limit * depth) >> bfqd->full_depth_shift;
+ }
- if (op_is_sync(op) && !op_is_write(op))
- return;
-
- data->shallow_depth =
- bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)];
+ for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) {
+ struct bfq_queue *bfqq =
+ bic_to_bfqq(bic, op_is_sync(opf), act_idx);
+ /*
+ * Does queue (or any parent entity) exceed number of
+ * requests that should be available to it? Heavily
+ * limit depth so that it cannot consume more
+ * available requests and thus starve other entities.
+ */
+ if (bfqq && bfqq_request_over_limit(bfqq, limit)) {
+ depth = 1;
+ break;
+ }
+ }
bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u",
- __func__, bfqd->wr_busy_queues, op_is_sync(op),
- data->shallow_depth);
+ __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth);
+ if (depth)
+ data->shallow_depth = depth;
}
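The control flow of bfq_limit_depth() can be summarized by the small standalone sketch below, not part of the patch: sync reads keep the full depth, async I/O and sync writes get a per-class word depth, and a queue found over its proportional limit is throttled to a single in-flight request. The word-depth value here is an arbitrary placeholder; the real values are computed in bfq_update_depths(), outside this hunk.

#include <stdbool.h>
#include <stdio.h>

static unsigned int pick_depth(bool is_sync, bool is_write, bool over_limit,
			       unsigned int word_depth)
{
	unsigned int depth;

	if (is_sync && !is_write)
		depth = 0;		/* 0: leave shallow_depth untouched */
	else
		depth = word_depth;	/* async I/O or sync writes */

	if (over_limit)
		depth = 1;		/* over fair share: throttle hard */

	return depth;
}

int main(void)
{
	printf("sync read             -> depth %u\n", pick_depth(true,  false, false, 48));
	printf("async write           -> depth %u\n", pick_depth(false, true,  false, 48));
	printf("async write, over lim -> depth %u\n", pick_depth(false, true,  true,  48));
	return 0;
}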
static struct bfq_queue *
@@ -631,7 +808,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
if (!bfqq->next_rq)
return;
- bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree;
__bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root,
blk_rq_pos(bfqq->next_rq), &parent, &p);
if (!__bfqq) {
@@ -669,7 +846,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq)
* much easier to maintain the needed state:
* 1) all active queues have the same weight,
* 2) all active queues belong to the same I/O-priority class,
- * 3) there are no active groups.
+ * 3) there is at most one active group.
* In particular, the last condition is always true if hierarchical
* support or the cgroups interface are not enabled, thus no state
* needs to be maintained in this case.
@@ -701,7 +878,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
return varied_queue_weights || multiple_classes_busy
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- || bfqd->num_groups_with_pending_reqs > 0
+ || bfqd->num_groups_with_pending_reqs > 1
#endif
;
}
@@ -719,9 +896,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd,
* In most scenarios, the rate at which nodes are created/destroyed
* should be low too.
*/
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_add(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree;
struct bfq_entity *entity = &bfqq->entity;
struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL;
bool leftmost = true;
@@ -793,13 +970,14 @@ inc_counter:
* See the comments to the function bfq_weights_tree_add() for considerations
* about overhead.
*/
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root)
+void bfq_weights_tree_remove(struct bfq_queue *bfqq)
{
+ struct rb_root_cached *root;
+
if (!bfqq->weight_counter)
return;
+ root = &bfqq->bfqd->queue_weights_tree;
bfqq->weight_counter->num_active--;
if (bfqq->weight_counter->num_active > 0)
goto reset_entity_pointer;
@@ -813,59 +991,6 @@ reset_entity_pointer:
}
/*
- * Invoke __bfq_weights_tree_remove on bfqq and decrement the number
- * of active groups for each queue's inactive parent entity.
- */
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq)
-{
- struct bfq_entity *entity = bfqq->entity.parent;
-
- for_each_entity(entity) {
- struct bfq_sched_data *sd = entity->my_sched_data;
-
- if (sd->next_in_service || sd->in_service_entity) {
- /*
- * entity is still active, because either
- * next_in_service or in_service_entity is not
- * NULL (see the comments on the definition of
- * next_in_service for details on why
- * in_service_entity must be checked too).
- *
- * As a consequence, its parent entities are
- * active as well, and thus this loop must
- * stop here.
- */
- break;
- }
-
- /*
- * The decrement of num_groups_with_pending_reqs is
- * not performed immediately upon the deactivation of
- * entity, but it is delayed to when it also happens
- * that the first leaf descendant bfqq of entity gets
- * all its pending requests completed. The following
- * instructions perform this delayed decrement, if
- * needed. See the comments on
- * num_groups_with_pending_reqs for details.
- */
- if (entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = false;
- bfqd->num_groups_with_pending_reqs--;
- }
- }
-
- /*
- * Next function is invoked last, because it causes bfqq to be
- * freed if the following holds: bfqq is not in service and
- * has no dispatched request. DO NOT use bfqq after the next
- * function invocation.
- */
- __bfq_weights_tree_remove(bfqd, bfqq,
- &bfqd->queue_weights_tree);
-}
-
-/*
* Return expired entry, or NULL to just start from scratch in rbtree.
*/
static struct request *bfq_check_fifo(struct bfq_queue *bfqq,
@@ -969,9 +1094,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd)
{
u64 dur;
- if (bfqd->bfq_wr_max_time > 0)
- return bfqd->bfq_wr_max_time;
-
dur = bfqd->rate_dur_prod;
do_div(dur, bfqd->peak_rate);
@@ -1011,25 +1133,41 @@ static void
bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
struct bfq_io_cq *bic, bool bfq_already_existing)
{
- unsigned int old_wr_coeff = bfqq->wr_coeff;
+ unsigned int old_wr_coeff = 1;
bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq);
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
- if (bic->saved_has_short_ttime)
+ if (bfqq_data->saved_has_short_ttime)
bfq_mark_bfqq_has_short_ttime(bfqq);
else
bfq_clear_bfqq_has_short_ttime(bfqq);
- if (bic->saved_IO_bound)
+ if (bfqq_data->saved_IO_bound)
bfq_mark_bfqq_IO_bound(bfqq);
else
bfq_clear_bfqq_IO_bound(bfqq);
- bfqq->entity.new_weight = bic->saved_weight;
- bfqq->ttime = bic->saved_ttime;
- bfqq->wr_coeff = bic->saved_wr_coeff;
- bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt;
- bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish;
- bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time;
+ bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns;
+ bfqq->inject_limit = bfqq_data->saved_inject_limit;
+ bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif;
+
+ bfqq->entity.new_weight = bfqq_data->saved_weight;
+ bfqq->ttime = bfqq_data->saved_ttime;
+ bfqq->io_start_time = bfqq_data->saved_io_start_time;
+ bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time;
+ /*
+ * Restore weight coefficient only if low_latency is on
+ */
+ if (bfqd->low_latency) {
+ old_wr_coeff = bfqq->wr_coeff;
+ bfqq->wr_coeff = bfqq_data->saved_wr_coeff;
+ }
+ bfqq->service_from_wr = bfqq_data->saved_service_from_wr;
+ bfqq->wr_start_at_switch_to_srt =
+ bfqq_data->saved_wr_start_at_switch_to_srt;
+ bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish;
+ bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time;
if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) ||
time_is_before_jiffies(bfqq->last_wr_start_finish +
@@ -1060,8 +1198,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd,
static int bfqq_process_refs(struct bfq_queue *bfqq)
{
- return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv -
- (bfqq->weight_counter != NULL);
+ return bfqq->ref - bfqq->entity.allocated -
+ bfqq->entity.on_st_or_in_serv -
+ (bfqq->weight_counter != NULL) - bfqq->stable_ref;
}
/* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */
@@ -1647,6 +1786,35 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq,
return bfqq_weight > in_serv_weight;
}
+/*
+ * Get the index of the actuator that will serve bio.
+ */
+static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio)
+{
+ unsigned int i;
+ sector_t end;
+
+ /* no search needed if one or zero ranges present */
+ if (bfqd->num_actuators == 1)
+ return 0;
+
+ /* bio_end_sector(bio) gives the sector after the last one */
+ end = bio_end_sector(bio) - 1;
+
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ if (end >= bfqd->sector[i] &&
+ end < bfqd->sector[i] + bfqd->nr_sectors[i])
+ return i;
+ }
+
+ WARN_ONCE(true,
+ "bfq_actuator_index: bio sector out of ranges: end=%llu\n",
+ end);
+ return 0;
+}
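As an illustration of the range lookup above, the following standalone sketch, not part of the patch, maps the last sector touched by a bio onto one of the independent access ranges of a multi-actuator drive. The two ranges and the sector numbers are made-up example values.

#include <stdio.h>

typedef unsigned long long sector_t;

struct range { sector_t start, nr_sectors; };

static unsigned int actuator_index(const struct range *r, unsigned int nr,
				   sector_t bio_end_sector)
{
	sector_t end = bio_end_sector - 1;	/* last sector of the bio */
	unsigned int i;

	if (nr == 1)
		return 0;			/* single actuator: no search */

	for (i = 0; i < nr; i++)
		if (end >= r[i].start && end < r[i].start + r[i].nr_sectors)
			return i;

	return 0;				/* out of range: fall back */
}

int main(void)
{
	struct range ranges[] = {
		{ .start = 0,       .nr_sectors = 1 << 20 },
		{ .start = 1 << 20, .nr_sectors = 1 << 20 },
	};

	printf("bio ending at sector 100     -> actuator %u\n",
	       actuator_index(ranges, 2, 100));
	printf("bio ending at sector 1500000 -> actuator %u\n",
	       actuator_index(ranges, 2, 1500000));
	return 0;
}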
+
+static bool bfq_better_to_idle(struct bfq_queue *bfqq);
+
static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
int old_wr_coeff,
@@ -1664,26 +1832,44 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
arrived_in_time = ktime_get_ns() <=
bfqq->ttime.last_end_request +
bfqd->bfq_slice_idle * 3;
-
+ unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio);
+ bool bfqq_non_merged_or_stably_merged =
+ bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged;
/*
* bfqq deserves to be weight-raised if:
* - it is sync,
* - it does not belong to a large burst,
* - it has been idle for enough time or is soft real-time,
- * - is linked to a bfq_io_cq (it is not shared in any sense).
+ * - is linked to a bfq_io_cq (it is not shared in any sense),
+ * - has a default weight (otherwise we assume the user wanted
+ * to control its weight explicitly)
*/
in_burst = bfq_bfqq_in_large_burst(bfqq);
soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 &&
!BFQQ_TOTALLY_SEEKY(bfqq) &&
!in_burst &&
time_is_before_jiffies(bfqq->soft_rt_next_start) &&
- bfqq->dispatched == 0;
- *interactive = !in_burst && idle_for_long_time;
+ bfqq->dispatched == 0 &&
+ bfqq->entity.new_weight == 40;
+ *interactive = !in_burst && idle_for_long_time &&
+ bfqq->entity.new_weight == 40;
+ /*
+ * Merged bfq_queues are kept out of weight-raising
+ * (low-latency) mechanisms. The reason is that these queues
+ * are usually created for non-interactive and
+ * non-soft-real-time tasks. Yet this is not the case for
+ * stably-merged queues. These queues are merged just because
+ * they are created shortly after each other. So they may
+ * easily serve the I/O of an interactive or soft-real-time
+ * application, if the application happens to spawn multiple
+ * processes. So let also stably-merged queues enjoy weight
+ * raising.
+ */
wr_or_deserves_wr = bfqd->low_latency &&
(bfqq->wr_coeff > 1 ||
- (bfq_bfqq_sync(bfqq) &&
- bfqq->bic && (*interactive || soft_rt)));
+ (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged &&
+ (*interactive || soft_rt)));
/*
* Using the last flag, update budget and check whether bfqq
@@ -1717,17 +1903,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfq_clear_bfqq_just_created(bfqq);
-
- if (!bfq_bfqq_IO_bound(bfqq)) {
- if (arrived_in_time) {
- bfqq->requests_within_timer++;
- if (bfqq->requests_within_timer >=
- bfqd->bfq_requests_within_timer)
- bfq_mark_bfqq_IO_bound(bfqq);
- } else
- bfqq->requests_within_timer = 0;
- }
-
if (bfqd->low_latency) {
if (unlikely(time_is_after_jiffies(bfqq->split_time)))
/* wraparound */
@@ -1752,13 +1927,13 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
bfqq->service_from_backlogged = 0;
bfq_clear_bfqq_softrt_update(bfqq);
- bfq_add_bfqq_busy(bfqd, bfqq);
+ bfq_add_bfqq_busy(bfqq);
/*
- * Expire in-service queue only if preemption may be needed
- * for guarantees. In particular, we care only about two
- * cases. The first is that bfqq has to recover a service
- * hole, as explained in the comments on
+ * Expire in-service queue if preemption may be needed for
+ * guarantees or throughput. As for guarantees, we care
+ * explicitly about two cases. The first is that bfqq has to
+ * recover a service hole, as explained in the comments on
* bfq_bfqq_update_budg_for_activation(), i.e., that
* bfqq_wants_to_preempt is true. However, if bfqq does not
* carry time-critical I/O, then bfqq's bandwidth is less
@@ -1785,11 +1960,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd,
* timestamps of the in-service queue would need to be
* updated, and this operation is quite costly (see the
* comments on bfq_bfqq_update_budg_for_activation()).
+ *
+ * As for throughput, we ask bfq_better_to_idle() whether we
+ * still need to plug I/O dispatching. If bfq_better_to_idle()
+ * says no, then plugging is not needed any longer, either to
+ * boost throughput or to preserve service guarantees. Then
+ * the best option is to stop plugging I/O, as not doing so
+ * would certainly lower throughput. We may end up in this
+ * case if: (1) upon a dispatch attempt, we detected that it
+ * was better to plug I/O dispatch, and to wait for a new
+ * request to arrive for the currently in-service queue, but
+ * (2) this switch of bfqq to busy changes the scenario.
*/
if (bfqd->in_service_queue &&
((bfqq_wants_to_preempt &&
bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) ||
- bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) &&
+ bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) ||
+ !bfq_better_to_idle(bfqd->in_service_queue)) &&
next_queue_may_preempt(bfqd))
bfq_bfqq_expire(bfqd, bfqd->in_service_queue,
false, BFQQE_PREEMPTED);
@@ -1861,6 +2048,159 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd,
bfqq->decrease_time_jif = jiffies;
}
+static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns)
+{
+ u64 tot_io_time = now_ns - bfqq->io_start_time;
+
+ if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0)
+ bfqq->tot_idle_time +=
+ now_ns - bfqq->ttime.last_end_request;
+
+ if (unlikely(bfq_bfqq_just_created(bfqq)))
+ return;
+
+ /*
+ * Must be busy for at least about 80% of the time to be
+ * considered I/O bound.
+ */
+ if (bfqq->tot_idle_time * 5 > tot_io_time)
+ bfq_clear_bfqq_IO_bound(bfqq);
+ else
+ bfq_mark_bfqq_IO_bound(bfqq);
+
+ /*
+ * Keep an observation window of at most 200 ms in the past
+ * from now.
+ */
+ if (tot_io_time > 200 * NSEC_PER_MSEC) {
+ bfqq->io_start_time = now_ns - (tot_io_time>>1);
+ bfqq->tot_idle_time >>= 1;
+ }
+}
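A compact way to read the heuristic above: the queue stays marked I/O bound while it has been idle for less than roughly 20% of its observation window, and the window is kept at about 200 ms by halving it once it grows larger. The standalone sketch below is not part of the patch and simply plugs made-up times into that check.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_MSEC 1000000ULL

int main(void)
{
	uint64_t tot_io_time = 240 * NSEC_PER_MSEC;	/* observation window so far */
	uint64_t tot_idle_time = 40 * NSEC_PER_MSEC;	/* idle time within it */

	if (tot_idle_time * 5 > tot_io_time)
		printf("more than ~20%% idle -> not I/O bound\n");
	else
		printf("less than ~20%% idle -> I/O bound\n");

	/* Keep the window around 200 ms: once it grows past that, the
	 * kernel rewinds io_start_time by half the window and halves the
	 * accumulated idle time, which this sketch models by halving both. */
	if (tot_io_time > 200 * NSEC_PER_MSEC) {
		tot_io_time >>= 1;
		tot_idle_time >>= 1;
	}
	printf("window now %llu ms, idle %llu ms\n",
	       (unsigned long long)(tot_io_time / NSEC_PER_MSEC),
	       (unsigned long long)(tot_idle_time / NSEC_PER_MSEC));
	return 0;
}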
+
+/*
+ * Detect whether bfqq's I/O seems synchronized with that of some
+ * other queue, i.e., whether bfqq, after remaining empty, happens to
+ * receive new I/O only right after some I/O request of the other
+ * queue has been completed. We call waker queue the other queue, and
+ * we assume, for simplicity, that bfqq may have at most one waker
+ * queue.
+ *
+ * A remarkable throughput boost can be reached by unconditionally
+ * injecting the I/O of the waker queue, every time a new
+ * bfq_dispatch_request happens to be invoked while I/O is being
+ * plugged for bfqq. In addition to boosting throughput, this
+ * unblocks bfqq's I/O, thereby improving bandwidth and latency for
+ * bfqq. Note that these same results may be achieved with the general
+ * injection mechanism, but less effectively. For details on this
+ * aspect, see the comments on the choice of the queue for injection
+ * in bfq_select_queue().
+ *
+ * Turning back to the detection of a waker queue, a queue Q is deemed as a
+ * waker queue for bfqq if, for three consecutive times, bfqq happens to become
+ * non-empty right after a request of Q has been completed within a given
+ * timeout. In this respect, even if bfqq is empty, we do not check for a waker
+ * if it still has some in-flight I/O. In fact, in this case bfqq is actually
+ * still being served by the drive, and may receive new I/O on the completion
+ * of some of the in-flight requests. In particular, on the first time, Q is
+ * tentatively set as a candidate waker queue, while on the third consecutive
+ * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q
+ * is a waker queue for bfqq. These detection steps are performed only if bfqq
+ * has a long think time, so as to make it more likely that bfqq's I/O is
+ * actually being blocked by a synchronization. This last filter, plus the
+ * above three-times requirement and time limit for detection, make false
+ * positives less likely.
+ *
+ * NOTE
+ *
+ * The sooner a waker queue is detected, the sooner throughput can be
+ * boosted by injecting I/O from the waker queue. Fortunately,
+ * detection is likely to be actually fast, for the following
+ * reasons. While blocked by synchronization, bfqq has a long think
+ * time. This implies that bfqq's inject limit is at least equal to 1
+ * (see the comments in bfq_update_inject_limit()). So, thanks to
+ * injection, the waker queue is likely to be served during the very
+ * first I/O-plugging time interval for bfqq. This triggers the first
+ * step of the detection mechanism. Thanks again to injection, the
+ * candidate waker queue is then likely to be confirmed no later than
+ * during the next I/O-plugging interval for bfqq.
+ *
+ * ISSUE
+ *
+ * On queue merging all waker information is lost.
+ */
+static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ u64 now_ns)
+{
+ char waker_name[MAX_BFQQ_NAME_LENGTH];
+
+ if (!bfqd->last_completed_rq_bfqq ||
+ bfqd->last_completed_rq_bfqq == bfqq ||
+ bfq_bfqq_has_short_ttime(bfqq) ||
+ now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC ||
+ bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq ||
+ bfqq == &bfqd->oom_bfqq)
+ return;
+
+ /*
+ * We also reset the waker detection logic if too much time has passed
+ * since the first detection. If wakeups are rare, pointless idling
+ * doesn't hurt throughput that much. The condition below makes sure
+ * we do not uselessly idle waiting for the blocking waker in more
+ * than 1/64 of the cases.
+ */
+ if (bfqd->last_completed_rq_bfqq !=
+ bfqq->tentative_waker_bfqq ||
+ now_ns > bfqq->waker_detection_started +
+ 128 * (u64)bfqd->bfq_slice_idle) {
+ /*
+ * First synchronization detected with a
+ * candidate waker queue, or with a different
+ * candidate waker queue from the current one.
+ */
+ bfqq->tentative_waker_bfqq =
+ bfqd->last_completed_rq_bfqq;
+ bfqq->num_waker_detections = 1;
+ bfqq->waker_detection_started = now_ns;
+ bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name);
+ } else /* Same tentative waker queue detected again */
+ bfqq->num_waker_detections++;
+
+ if (bfqq->num_waker_detections == 3) {
+ bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+ bfq_bfqq_name(bfqq->waker_bfqq, waker_name,
+ MAX_BFQQ_NAME_LENGTH);
+ bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name);
+
+ /*
+ * If the waker queue disappears, then
+ * bfqq->waker_bfqq must be reset. To
+ * this goal, we maintain in each
+ * waker queue a list, woken_list, of
+ * all the queues that reference the
+ * waker queue through their
+ * waker_bfqq pointer. When the waker
+ * queue exits, the waker_bfqq pointer
+ * of all the queues in the woken_list
+ * is reset.
+ *
+ * In addition, if bfqq is already in
+ * the woken_list of a waker queue,
+ * then, before being inserted into
+ * the woken_list of a new waker
+ * queue, bfqq must be removed from
+ * the woken_list of the old waker
+ * queue.
+ */
+ if (!hlist_unhashed(&bfqq->woken_list_node))
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqd->last_completed_rq_bfqq->woken_list);
+ }
+}
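The detection steps condense into the small standalone state machine below; it is not part of the patch, queue identities are plain integers, and the time-limit reset is left out. It shows how the same candidate must be observed three consecutive times before being promoted to the confirmed waker, and how seeing a different candidate restarts the count.

#include <stdio.h>

struct waker_state {
	int tentative;		/* candidate waker, -1 if none */
	int confirmed;		/* confirmed waker, -1 if none */
	int detections;		/* consecutive detections of 'tentative' */
};

static void see_completion_before_new_io(struct waker_state *s, int last_completed)
{
	if (last_completed != s->tentative) {
		s->tentative = last_completed;	/* new candidate, restart */
		s->detections = 1;
		return;
	}
	if (++s->detections == 3)
		s->confirmed = last_completed;	/* promote to waker */
}

int main(void)
{
	struct waker_state s = { .tentative = -1, .confirmed = -1 };
	int completions[] = { 7, 7, 9, 9, 9 };	/* queue that completed last */
	size_t i;

	for (i = 0; i < sizeof(completions) / sizeof(completions[0]); i++) {
		see_completion_before_new_io(&s, completions[i]);
		printf("step %zu: tentative %d, confirmed %d\n",
		       i, s.tentative, s.confirmed);
	}
	return 0;
}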
+
static void bfq_add_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
@@ -1868,117 +2208,18 @@ static void bfq_add_request(struct request *rq)
struct request *next_rq, *prev;
unsigned int old_wr_coeff = bfqq->wr_coeff;
bool interactive = false;
+ u64 now_ns = ktime_get_ns();
bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq));
bfqq->queued[rq_is_sync(rq)]++;
- bfqd->queued++;
+ /*
+ * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it
+ * may be read without holding the lock in bfq_has_work().
+ */
+ WRITE_ONCE(bfqd->queued, bfqd->queued + 1);
- if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) {
- /*
- * Detect whether bfqq's I/O seems synchronized with
- * that of some other queue, i.e., whether bfqq, after
- * remaining empty, happens to receive new I/O only
- * right after some I/O request of the other queue has
- * been completed. We call waker queue the other
- * queue, and we assume, for simplicity, that bfqq may
- * have at most one waker queue.
- *
- * A remarkable throughput boost can be reached by
- * unconditionally injecting the I/O of the waker
- * queue, every time a new bfq_dispatch_request
- * happens to be invoked while I/O is being plugged
- * for bfqq. In addition to boosting throughput, this
- * unblocks bfqq's I/O, thereby improving bandwidth
- * and latency for bfqq. Note that these same results
- * may be achieved with the general injection
- * mechanism, but less effectively. For details on
- * this aspect, see the comments on the choice of the
- * queue for injection in bfq_select_queue().
- *
- * Turning back to the detection of a waker queue, a
- * queue Q is deemed as a waker queue for bfqq if, for
- * two consecutive times, bfqq happens to become non
- * empty right after a request of Q has been
- * completed. In particular, on the first time, Q is
- * tentatively set as a candidate waker queue, while
- * on the second time, the flag
- * bfq_bfqq_has_waker(bfqq) is set to confirm that Q
- * is a waker queue for bfqq. These detection steps
- * are performed only if bfqq has a long think time,
- * so as to make it more likely that bfqq's I/O is
- * actually being blocked by a synchronization. This
- * last filter, plus the above two-times requirement,
- * make false positives less likely.
- *
- * NOTE
- *
- * The sooner a waker queue is detected, the sooner
- * throughput can be boosted by injecting I/O from the
- * waker queue. Fortunately, detection is likely to be
- * actually fast, for the following reasons. While
- * blocked by synchronization, bfqq has a long think
- * time. This implies that bfqq's inject limit is at
- * least equal to 1 (see the comments in
- * bfq_update_inject_limit()). So, thanks to
- * injection, the waker queue is likely to be served
- * during the very first I/O-plugging time interval
- * for bfqq. This triggers the first step of the
- * detection mechanism. Thanks again to injection, the
- * candidate waker queue is then likely to be
- * confirmed no later than during the next
- * I/O-plugging interval for bfqq.
- */
- if (bfqd->last_completed_rq_bfqq &&
- !bfq_bfqq_has_short_ttime(bfqq) &&
- ktime_get_ns() - bfqd->last_completion <
- 200 * NSEC_PER_USEC) {
- if (bfqd->last_completed_rq_bfqq != bfqq &&
- bfqd->last_completed_rq_bfqq !=
- bfqq->waker_bfqq) {
- /*
- * First synchronization detected with
- * a candidate waker queue, or with a
- * different candidate waker queue
- * from the current one.
- */
- bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq;
-
- /*
- * If the waker queue disappears, then
- * bfqq->waker_bfqq must be reset. To
- * this goal, we maintain in each
- * waker queue a list, woken_list, of
- * all the queues that reference the
- * waker queue through their
- * waker_bfqq pointer. When the waker
- * queue exits, the waker_bfqq pointer
- * of all the queues in the woken_list
- * is reset.
- *
- * In addition, if bfqq is already in
- * the woken_list of a waker queue,
- * then, before being inserted into
- * the woken_list of a new waker
- * queue, bfqq must be removed from
- * the woken_list of the old waker
- * queue.
- */
- if (!hlist_unhashed(&bfqq->woken_list_node))
- hlist_del_init(&bfqq->woken_list_node);
- hlist_add_head(&bfqq->woken_list_node,
- &bfqd->last_completed_rq_bfqq->woken_list);
-
- bfq_clear_bfqq_has_waker(bfqq);
- } else if (bfqd->last_completed_rq_bfqq ==
- bfqq->waker_bfqq &&
- !bfq_bfqq_has_waker(bfqq)) {
- /*
- * synchronization with waker_bfqq
- * seen for the second time
- */
- bfq_mark_bfqq_has_waker(bfqq);
- }
- }
+ if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) {
+ bfq_check_waker(bfqd, bfqq, now_ns);
/*
* Periodically reset inject limit, to make sure that
@@ -2016,9 +2257,9 @@ static void bfq_add_request(struct request *rq)
* elapsed.
*/
if (bfqq == bfqd->in_service_queue &&
- (bfqd->rq_in_driver == 0 ||
+ (bfqd->tot_rq_in_driver == 0 ||
(bfqq->last_serv_time_ns > 0 &&
- bfqd->rqs_injected && bfqd->rq_in_driver > 0)) &&
+ bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) &&
time_is_before_eq_jiffies(bfqq->decrease_time_jif +
msecs_to_jiffies(10))) {
bfqd->last_empty_occupied_ns = ktime_get_ns();
@@ -2042,11 +2283,14 @@ static void bfq_add_request(struct request *rq)
* will be set in case injection is performed
* on bfqq before rq is completed).
*/
- if (bfqd->rq_in_driver == 0)
+ if (bfqd->tot_rq_in_driver == 0)
bfqd->rqs_injected = false;
}
}
+ if (bfq_bfqq_sync(bfqq))
+ bfq_update_io_intensity(bfqq, now_ns);
+
elv_rb_add(&bfqq->sort_list, rq);
/*
@@ -2133,22 +2377,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq)
return 0;
}
-#if 0 /* Still not clear if we can do without next two functions */
-static void bfq_activate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver++;
-}
-
-static void bfq_deactivate_request(struct request_queue *q, struct request *rq)
-{
- struct bfq_data *bfqd = q->elevator->elevator_data;
-
- bfqd->rq_in_driver--;
-}
-#endif
-
static void bfq_remove_request(struct request_queue *q,
struct request *rq)
{
@@ -2164,7 +2392,11 @@ static void bfq_remove_request(struct request_queue *q,
if (rq->queuelist.prev != &rq->queuelist)
list_del_init(&rq->queuelist);
bfqq->queued[sync]--;
- bfqd->queued--;
+ /*
+ * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it
+ * may be read without holding the lock in bfq_has_work().
+ */
+ WRITE_ONCE(bfqd->queued, bfqd->queued - 1);
elv_rb_del(&bfqq->sort_list, rq);
elv_rqhash_del(q, rq);
@@ -2175,7 +2407,7 @@ static void bfq_remove_request(struct request_queue *q,
bfqq->next_rq = NULL;
if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) {
- bfq_del_bfqq_busy(bfqd, bfqq, false);
+ bfq_del_bfqq_busy(bfqq, false);
/*
* bfqq emptied. In normal operation, when
* bfqq is empty, bfqq->entity.service and
@@ -2210,10 +2442,9 @@ static void bfq_remove_request(struct request_queue *q,
}
-static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool bfq_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
struct request *free = NULL;
/*
@@ -2223,22 +2454,30 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
* returned by bfq_bic_lookup does not go away before
* bfqd->lock is taken.
*/
- struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q);
+ struct bfq_io_cq *bic = bfq_bic_lookup(q);
bool ret;
spin_lock_irq(&bfqd->lock);
- if (bic)
- bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf));
- else
+ if (bic) {
+ /*
+ * Make sure cgroup info is uptodate for current process before
+ * considering the merge.
+ */
+ bfq_bic_update_cgroup(bic, bio);
+
+ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf),
+ bfq_actuator_index(bfqd, bio));
+ } else {
bfqd->bio_bfqq = NULL;
+ }
bfqd->bio_bic = bic;
ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
+ spin_unlock_irq(&bfqd->lock);
if (free)
blk_mq_free_request(free);
- spin_unlock_irq(&bfqd->lock);
return ret;
}
@@ -2252,14 +2491,15 @@ static int bfq_request_merge(struct request_queue *q, struct request **req,
__rq = bfq_find_rq_fmerge(bfqd, bio, q);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
+
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_FRONT_MERGE;
}
return ELEVATOR_NO_MERGE;
}
-static struct bfq_queue *bfq_init_rq(struct request *rq);
-
static void bfq_request_merged(struct request_queue *q, struct request *req,
enum elv_merge type)
{
@@ -2268,7 +2508,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
blk_rq_pos(req) <
blk_rq_pos(container_of(rb_prev(&req->rb_node),
struct request, rb_node))) {
- struct bfq_queue *bfqq = bfq_init_rq(req);
+ struct bfq_queue *bfqq = RQ_BFQQ(req);
struct bfq_data *bfqd;
struct request *prev, *next_rq;
@@ -2320,11 +2560,11 @@ static void bfq_request_merged(struct request_queue *q, struct request *req,
static void bfq_requests_merged(struct request_queue *q, struct request *rq,
struct request *next)
{
- struct bfq_queue *bfqq = bfq_init_rq(rq),
- *next_bfqq = bfq_init_rq(next);
+ struct bfq_queue *bfqq = RQ_BFQQ(rq),
+ *next_bfqq = RQ_BFQQ(next);
if (!bfqq)
- return;
+ goto remove;
/*
* If next and rq belong to the same bfq_queue and next is older
@@ -2347,11 +2587,37 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq,
bfqq->next_rq = rq;
bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags);
+remove:
+ /* Merged request may be in the IO scheduler. Remove it. */
+ if (!RB_EMPTY_NODE(&next->rb_node)) {
+ bfq_remove_request(next->q, next);
+ if (next_bfqq)
+ bfqg_stats_update_io_remove(bfqq_group(next_bfqq),
+ next->cmd_flags);
+ }
}
/* Must be called with bfqq != NULL */
static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
{
+ /*
+ * If bfqq has been enjoying interactive weight-raising, then
+ * reset soft_rt_next_start. We do it for the following
+ * reason. bfqq may have been conveying the I/O needed to load
+ * a soft real-time application. Such an application actually
+ * exhibits a soft real-time I/O pattern after it finishes
+ * loading, and finally starts doing its job. But, if bfqq has
+ * been receiving a lot of bandwidth so far (likely to happen
+ * on a fast device), then soft_rt_next_start now contains a
+ * high value. So, without this reset, bfqq would be
+ * prevented from being possibly considered as soft_rt for a
+ * very long time.
+ */
+
+ if (bfqq->wr_cur_max_time !=
+ bfqq->bfqd->bfq_wr_rt_max_time)
+ bfqq->soft_rt_next_start = jiffies;
+
if (bfq_bfqq_busy(bfqq))
bfqq->bfqd->wr_busy_queues--;
bfqq->wr_coeff = 1;
@@ -2367,24 +2633,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq)
void bfq_end_wr_async_queues(struct bfq_data *bfqd,
struct bfq_group *bfqg)
{
- int i, j;
+ int i, j, k;
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
- if (bfqg->async_bfqq[i][j])
- bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]);
- if (bfqg->async_idle_bfqq)
- bfq_bfqq_end_wr(bfqg->async_idle_bfqq);
+ for (k = 0; k < bfqd->num_actuators; k++) {
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
+ if (bfqg->async_bfqq[i][j][k])
+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]);
+ if (bfqg->async_idle_bfqq[k])
+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]);
+ }
}
static void bfq_end_wr(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq;
+ int i;
spin_lock_irq(&bfqd->lock);
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
- bfq_bfqq_end_wr(bfqq);
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+ bfq_bfqq_end_wr(bfqq);
+ }
list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list)
bfq_bfqq_end_wr(bfqq);
bfq_end_wr_async(bfqd);
@@ -2411,7 +2682,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd,
struct bfq_queue *bfqq,
sector_t sector)
{
- struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree;
+ struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree;
struct rb_node *parent, *node;
struct bfq_queue *__bfqq;
@@ -2500,6 +2771,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
if (process_refs == 0 || new_process_refs == 0)
return NULL;
+ /*
+ * Make sure merged queues belong to the same parent. Parents could
+ * have changed since the time we decided the two queues are suitable
+ * for merging.
+ */
+ if (new_bfqq->entity.parent != bfqq->entity.parent)
+ return NULL;
+
bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d",
new_bfqq->pid);
@@ -2524,6 +2803,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq)
* are likely to increase the throughput.
*/
bfqq->new_bfqq = new_bfqq;
+ /*
+ * The above assignment schedules the following redirections:
+ * each time some I/O for bfqq arrives, the process that
+ * generated that I/O is disassociated from bfqq and
+ * associated with new_bfqq. Here we increase new_bfqq->ref
+ * in advance, adding the number of processes that are
+ * expected to be associated with new_bfqq as they happen to
+ * issue I/O.
+ */
new_bfqq->ref += process_refs;
return new_bfqq;
}
@@ -2557,6 +2845,43 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
return true;
}
+static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq);
+
+static struct bfq_queue *
+bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_queue *stable_merge_bfqq,
+ struct bfq_iocq_bfqq_data *bfqq_data)
+{
+ int proc_ref = min(bfqq_process_refs(bfqq),
+ bfqq_process_refs(stable_merge_bfqq));
+ struct bfq_queue *new_bfqq = NULL;
+
+ bfqq_data->stable_merge_bfqq = NULL;
+ if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0)
+ goto out;
+
+ /* next function will take at least one ref */
+ new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq);
+
+ if (new_bfqq) {
+ bfqq_data->stably_merged = true;
+ if (new_bfqq->bic) {
+ unsigned int new_a_idx = new_bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *new_bfqq_data =
+ &new_bfqq->bic->bfqq_data[new_a_idx];
+
+ new_bfqq_data->stably_merged = true;
+ }
+ }
+
+out:
+ /* deschedule stable merge, because done or aborted here */
+ bfq_put_stable_ref(stable_merge_bfqq);
+
+ return new_bfqq;
+}
+
/*
* Attempt to schedule a merge of bfqq with the currently in-service
* queue or with a close queue among the scheduled queues. Return
@@ -2579,9 +2904,46 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq,
*/
static struct bfq_queue *
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- void *io_struct, bool request)
+ void *io_struct, bool request, struct bfq_io_cq *bic)
{
struct bfq_queue *in_service_bfqq, *new_bfqq;
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
+
+ /* if a merge has already been setup, then proceed with that first */
+ if (bfqq->new_bfqq)
+ return bfqq->new_bfqq;
+
+ /*
+ * Check delayed stable merge for rotational or non-queueing
+ * devs. For this branch to be executed, bfqq must not be
+ * currently merged with some other queue (i.e., bfqq->bic
+ * must be non null). If we considered also merged queues,
+ * then we should also check whether bfqq has already been
+ * merged with bic->stable_merge_bfqq. But this would be
+ * costly and complicated.
+ */
+ if (unlikely(!bfqd->nonrot_with_queueing)) {
+ /*
+ * Make sure also that bfqq is sync, because
+ * bic->stable_merge_bfqq may point to some queue (for
+ * stable merging) also if bic is associated with a
+ * sync queue, but this bfqq is async
+ */
+ if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq &&
+ !bfq_bfqq_just_created(bfqq) &&
+ time_is_before_jiffies(bfqq->split_time +
+ msecs_to_jiffies(bfq_late_stable_merging)) &&
+ time_is_before_jiffies(bfqq->creation_time +
+ msecs_to_jiffies(bfq_late_stable_merging))) {
+ struct bfq_queue *stable_merge_bfqq =
+ bfqq_data->stable_merge_bfqq;
+
+ return bfq_setup_stable_merge(bfqd, bfqq,
+ stable_merge_bfqq,
+ bfqq_data);
+ }
+ }
/*
* Do not perform queue merging if the device is non
@@ -2637,9 +2999,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfq_too_late_for_merging(bfqq))
return NULL;
- if (bfqq->new_bfqq)
- return bfqq->new_bfqq;
-
if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq))
return NULL;
@@ -2677,6 +3036,8 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq,
static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
{
struct bfq_io_cq *bic = bfqq->bic;
+ unsigned int a_idx = bfqq->actuator_idx;
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx];
/*
* If !bfqq->bic, the queue is already shared or its requests
@@ -2686,12 +3047,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
if (!bic)
return;
- bic->saved_weight = bfqq->entity.orig_weight;
- bic->saved_ttime = bfqq->ttime;
- bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq);
- bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
- bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
- bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node);
+ bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns;
+ bfqq_data->saved_inject_limit = bfqq->inject_limit;
+ bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif;
+
+ bfqq_data->saved_weight = bfqq->entity.orig_weight;
+ bfqq_data->saved_ttime = bfqq->ttime;
+ bfqq_data->saved_has_short_ttime =
+ bfq_bfqq_has_short_ttime(bfqq);
+ bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq);
+ bfqq_data->saved_io_start_time = bfqq->io_start_time;
+ bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time;
+ bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq);
+ bfqq_data->was_in_burst_list =
+ !hlist_unhashed(&bfqq->burst_list_node);
+
if (unlikely(bfq_bfqq_just_created(bfqq) &&
!bfq_bfqq_in_large_burst(bfqq) &&
bfqq->bfqd->low_latency)) {
@@ -2704,19 +3074,35 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq)
* to bfqq, so that to avoid that bfqq unjustly fails
* to enjoy weight raising if split soon.
*/
- bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
- bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now();
- bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd);
- bic->saved_last_wr_start_finish = jiffies;
+ bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff;
+ bfqq_data->saved_wr_start_at_switch_to_srt =
+ bfq_smallest_from_now();
+ bfqq_data->saved_wr_cur_max_time =
+ bfq_wr_duration(bfqq->bfqd);
+ bfqq_data->saved_last_wr_start_finish = jiffies;
} else {
- bic->saved_wr_coeff = bfqq->wr_coeff;
- bic->saved_wr_start_at_switch_to_srt =
+ bfqq_data->saved_wr_coeff = bfqq->wr_coeff;
+ bfqq_data->saved_wr_start_at_switch_to_srt =
bfqq->wr_start_at_switch_to_srt;
- bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish;
- bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
+ bfqq_data->saved_service_from_wr =
+ bfqq->service_from_wr;
+ bfqq_data->saved_last_wr_start_finish =
+ bfqq->last_wr_start_finish;
+ bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time;
}
}
+
+static void
+bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq)
+{
+ if (cur_bfqq->entity.parent &&
+ cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq)
+ cur_bfqq->entity.parent->last_bfqq_created = new_bfqq;
+ else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq)
+ cur_bfqq->bfqd->last_bfqq_created = new_bfqq;
+}
+
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
{
/*
@@ -2732,7 +3118,9 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq)
*/
if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) &&
bfqq != bfqd->in_service_queue)
- bfq_del_bfqq_busy(bfqd, bfqq, false);
+ bfq_del_bfqq_busy(bfqq, false);
+
+ bfq_reassign_last_bfqq(bfqq, NULL);
bfq_put_queue(bfqq);
}
@@ -2751,6 +3139,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
bfq_clear_bfqq_IO_bound(bfqq);
/*
+ * The processes associated with bfqq are cooperators of the
+ * processes associated with new_bfqq. So, if bfqq has a
+ * waker, then assume that all these processes will be happy
+ * to let bfqq's waker freely inject I/O when they have no
+ * I/O.
+ */
+ if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq &&
+ bfqq->waker_bfqq != new_bfqq) {
+ new_bfqq->waker_bfqq = bfqq->waker_bfqq;
+ new_bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then
+ * new_bfqq->waker_bfqq must be reset. So insert
+ * new_bfqq into the woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ hlist_add_head(&new_bfqq->woken_list_node,
+ &new_bfqq->waker_bfqq->woken_list);
+
+ }
+
+ /*
* If bfqq is weight-raised, then let new_bfqq inherit
* weight-raising. To reduce false positives, neglect the case
* where bfqq has just been created, but has not yet made it
@@ -2783,7 +3194,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
/*
* Merge queues (that is, let bic redirect its requests to new_bfqq)
*/
- bic_set_bfqq(bic, new_bfqq, 1);
+ bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx);
bfq_mark_bfqq_coop(new_bfqq);
/*
* new_bfqq now belongs to at least two bics (it is a shared queue):
@@ -2807,6 +3218,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic,
*/
new_bfqq->pid = -1;
bfqq->bic = NULL;
+
+ bfq_reassign_last_bfqq(bfqq, new_bfqq);
+
bfq_release_process_ref(bfqd, bfqq);
}
@@ -2834,7 +3248,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq,
* We take advantage of this function to perform an early merge
* of the queues of possible cooperating processes.
*/
- new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false);
+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic);
if (new_bfqq) {
/*
* bic still points to bfqq, then it has not yet been
@@ -2937,6 +3351,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd,
}
bfqd->in_service_queue = bfqq;
+ bfqd->in_serv_last_pos = 0;
}
/*
@@ -3197,13 +3612,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq)
* - start a new observation interval with this dispatch
*/
if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC &&
- bfqd->rq_in_driver == 0)
+ bfqd->tot_rq_in_driver == 0)
goto update_rate_and_reset;
/* Update sampling information */
bfqd->peak_rate_samples++;
- if ((bfqd->rq_in_driver > 0 ||
+ if ((bfqd->tot_rq_in_driver > 0 ||
now_ns - bfqd->last_completion < BFQ_MIN_TT)
&& !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq))
bfqd->sequential_samples++;
@@ -3442,20 +3857,36 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq)
* order until all the requests already queued in the device have been
* served. The last sub-condition commented above somewhat mitigates
* this problem for weight-raised queues.
+ *
+ * However, as an additional mitigation for this problem, we preserve
+ * plugging for a special symmetric case that may suddenly turn into
+ * asymmetric: the case where only bfqq is busy. In this case, not
+ * expiring bfqq does not cause any harm to any other queues in terms
+ * of service guarantees. In contrast, it avoids the following unlucky
+ * sequence of events: (1) bfqq is expired, (2) a new queue with a
+ * lower weight than bfqq becomes busy (or more queues), (3) the new
+ * queue is served until a new request arrives for bfqq, (4) when bfqq
+ * is finally served, there are so many requests of the new queue in
+ * the drive that the pending requests for bfqq take a lot of time to
+ * be served. In particular, event (2) may cause even already
+ * dispatched requests of bfqq to be delayed, inside the drive. So, to
+ * avoid this series of events, the scenario is preventively declared
+ * as asymmetric also if bfqq is the only busy queue.
*/
static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
+ int tot_busy_queues = bfq_tot_busy_queues(bfqd);
+
/* No point in idling for bfqq if it won't get requests any longer */
if (unlikely(!bfqq_process_refs(bfqq)))
return false;
return (bfqq->wr_coeff > 1 &&
- (bfqd->wr_busy_queues <
- bfq_tot_busy_queues(bfqd) ||
- bfqd->rq_in_driver >=
- bfqq->dispatched + 4)) ||
- bfq_asymmetric_scenario(bfqd, bfqq);
+ (bfqd->wr_busy_queues < tot_busy_queues ||
+ bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) ||
+ bfq_asymmetric_scenario(bfqd, bfqq) ||
+ tot_busy_queues == 1;
}
static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
@@ -3495,7 +3926,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
*/
bfqq->budget_timeout = jiffies;
- bfq_del_bfqq_busy(bfqd, bfqq, true);
+ bfq_del_bfqq_busy(bfqq, true);
} else {
bfq_requeue_bfqq(bfqd, bfqq, true);
/*
@@ -3719,8 +4150,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd,
* function to evaluate the I/O speed of a process.
*/
static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool compensate, enum bfqq_expiration reason,
- unsigned long *delta_ms)
+ bool compensate, unsigned long *delta_ms)
{
ktime_t delta_ktime;
u32 delta_usecs;
@@ -3916,7 +4346,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
/*
* Check whether the process is slow (see bfq_bfqq_is_slow).
*/
- slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta);
+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta);
/*
* As above explained, charge slow (typically seeky) and
@@ -3939,10 +4369,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
bfq_bfqq_budget_left(bfqq) >= entity->budget / 3)))
bfq_bfqq_charge_time(bfqd, bfqq, delta);
- if (reason == BFQQE_TOO_IDLE &&
- entity->service <= 2 * entity->budget / 10)
- bfq_clear_bfqq_IO_bound(bfqq);
-
if (bfqd->low_latency && bfqq->wr_coeff == 1)
bfqq->last_wr_start_finish = jiffies;
@@ -3952,30 +4378,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd,
* If we get here, and there are no outstanding
* requests, then the request pattern is isochronous
* (see the comments on the function
- * bfq_bfqq_softrt_next_start()). Thus we can compute
- * soft_rt_next_start. And we do it, unless bfqq is in
- * interactive weight raising. We do not do it in the
- * latter subcase, for the following reason. bfqq may
- * be conveying the I/O needed to load a soft
- * real-time application. Such an application will
- * actually exhibit a soft real-time I/O pattern after
- * it finally starts doing its job. But, if
- * soft_rt_next_start is computed here for an
- * interactive bfqq, and bfqq had received a lot of
- * service before remaining with no outstanding
- * request (likely to happen on a fast device), then
- * soft_rt_next_start would be assigned such a high
- * value that, for a very long time, bfqq would be
- * prevented from being possibly considered as soft
- * real time.
+ * bfq_bfqq_softrt_next_start()). Therefore we can
+ * compute soft_rt_next_start.
*
* If, instead, the queue still has outstanding
* requests, then we have to wait for the completion
* of all the outstanding requests to discover whether
* the request pattern is actually isochronous.
*/
- if (bfqq->dispatched == 0 &&
- bfqq->wr_coeff != bfqd->bfq_wr_coeff)
+ if (bfqq->dispatched == 0)
bfqq->soft_rt_next_start =
bfq_bfqq_softrt_next_start(bfqd, bfqq);
else if (bfqq->dispatched > 0) {
@@ -4243,6 +4654,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
{
struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue;
unsigned int limit = in_serv_bfqq->inject_limit;
+ int i;
+
/*
* If
* - bfqq is not weight-raised and therefore does not carry
@@ -4274,7 +4687,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
)
limit = 1;
- if (bfqd->rq_in_driver >= limit)
+ if (bfqd->tot_rq_in_driver >= limit)
return NULL;
/*
@@ -4289,11 +4702,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
* (and re-added only if it gets new requests, but then it
* is assigned again enough budget for its new backlog).
*/
- list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list)
- if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
- (in_serv_always_inject || bfqq->wr_coeff > 1) &&
- bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
- bfq_bfqq_budget_left(bfqq)) {
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list)
+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ (in_serv_always_inject || bfqq->wr_coeff > 1) &&
+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+ bfq_bfqq_budget_left(bfqq)) {
/*
* Allow for only one large in-flight request
* on non-rotational devices, for the
@@ -4313,27 +4727,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd)
*/
if (blk_queue_nonrot(bfqd->queue) &&
blk_rq_sectors(bfqq->next_rq) >=
- BFQQ_SECT_THR_NONROT)
- limit = min_t(unsigned int, 1, limit);
- else
- limit = in_serv_bfqq->inject_limit;
-
- if (bfqd->rq_in_driver < limit) {
+ BFQQ_SECT_THR_NONROT &&
+ bfqd->tot_rq_in_driver >= 1)
+ continue;
+ else {
bfqd->rqs_injected = true;
return bfqq;
}
}
+ }
+
+ return NULL;
+}
+
+static struct bfq_queue *
+bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx)
+{
+ struct bfq_queue *bfqq;
+
+ if (bfqd->in_service_queue &&
+ bfqd->in_service_queue->actuator_idx == idx)
+ return bfqd->in_service_queue;
+
+ list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) {
+ if (!RB_EMPTY_ROOT(&bfqq->sort_list) &&
+ bfq_serv_to_charge(bfqq->next_rq, bfqq) <=
+ bfq_bfqq_budget_left(bfqq)) {
+ return bfqq;
+ }
+ }
+
+ return NULL;
+}
+
+/*
+ * Perform a linear scan of each actuator, until an actuator is found
+ * for which the following three conditions hold: the load of the
+ * actuator is below the threshold (see comments on
+ * actuator_load_threshold for details) and lower than that of the
+ * next actuator (comments on this extra condition below), and there
+ * is a queue that contains I/O for that actuator. On success, return
+ * that queue.
+ *
+ * Performing a plain linear scan entails a prioritization among
+ * actuators. The extra condition above breaks this prioritization and
+ * tends to distribute injection uniformly across actuators.
+ */
+static struct bfq_queue *
+bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd)
+{
+ int i;
+
+ for (i = 0 ; i < bfqd->num_actuators; i++) {
+ if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold &&
+ (i == bfqd->num_actuators - 1 ||
+ bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) {
+ struct bfq_queue *bfqq =
+ bfq_find_active_bfqq_for_actuator(bfqd, i);
+
+ if (bfqq)
+ return bfqq;
+ }
+ }
return NULL;
}
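
A rough userspace sketch of the selection rule above — not part of the patch, with made-up in-flight counts, the threshold of 4 set later in this series, and ignoring the additional requirement that the chosen actuator actually has an active queue with enough budget:

#include <stdio.h>

/*
 * Illustrative only: the same scan as above, but over a plain array of
 * per-actuator in-flight counts. Returns the chosen actuator index, or
 * -1 if no suitable actuator is found.
 */
static int pick_underused_actuator(const int *rq_in_driver,
				   int num_actuators, int load_threshold)
{
	for (int i = 0; i < num_actuators; i++) {
		if (rq_in_driver[i] < load_threshold &&
		    (i == num_actuators - 1 ||
		     rq_in_driver[i] < rq_in_driver[i + 1]))
			return i;
	}
	return -1;
}

int main(void)
{
	int load_a[2] = { 3, 1 };	/* 3 is not < 1: actuator 1 is chosen */
	int load_b[2] = { 1, 3 };	/* 1 < 4 and 1 < 3: actuator 0 is chosen */

	printf("%d %d\n", pick_underused_actuator(load_a, 2, 4),
	       pick_underused_actuator(load_b, 2, 4));	/* prints "1 0" */
	return 0;
}
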
+
/*
* Select a queue for service. If we have a current queue in service,
* check whether to continue servicing it, or retrieve and set a new one.
*/
static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
{
- struct bfq_queue *bfqq;
+ struct bfq_queue *bfqq, *inject_bfqq;
struct request *next_rq;
enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT;
@@ -4356,6 +4823,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd)
check_queue:
/*
+ * If some actuator is underutilized, but the in-service
+ * queue does not contain I/O for that actuator, then try to
+ * inject I/O for that actuator.
+ */
+ inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd);
+ if (inject_bfqq && inject_bfqq != bfqq)
+ return inject_bfqq;
+
+ /*
* This loop is rarely executed more than once. Even when it
* happens, it is much more convenient to re-execute this loop
* than to return NULL and trigger a new dispatch to get a
@@ -4414,14 +4890,21 @@ check_queue:
*/
if (bfq_bfqq_wait_request(bfqq) ||
(bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) {
- struct bfq_queue *async_bfqq =
- bfqq->bic && bfqq->bic->bfqq[0] &&
- bfq_bfqq_busy(bfqq->bic->bfqq[0]) &&
- bfqq->bic->bfqq[0]->next_rq ?
- bfqq->bic->bfqq[0] : NULL;
-
+ unsigned int act_idx = bfqq->actuator_idx;
+ struct bfq_queue *async_bfqq = NULL;
+ struct bfq_queue *blocked_bfqq =
+ !hlist_empty(&bfqq->woken_list) ?
+ container_of(bfqq->woken_list.first,
+ struct bfq_queue,
+ woken_list_node)
+ : NULL;
+
+ if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] &&
+ bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) &&
+ bfqq->bic->bfqq[0][act_idx]->next_rq)
+ async_bfqq = bfqq->bic->bfqq[0][act_idx];
/*
- * The next three mutually-exclusive ifs decide
+ * The next four mutually-exclusive ifs decide
* whether to try injection, and choose the queue to
* pick an I/O request from.
*
@@ -4454,7 +4937,15 @@ check_queue:
* next bfqq's I/O is brought forward dramatically,
* for it is not blocked for milliseconds.
*
- * The third if checks whether bfqq is a queue for
+ * The third if checks whether there is a queue woken
+ * by bfqq, and currently with pending I/O. Such a
+ * woken queue does not steal bandwidth from bfqq,
+ * because it soon runs out of I/O if bfqq is not
+ * served. So there is virtually no risk of loss of
+ * bandwidth for bfqq if this woken queue has I/O
+ * dispatched while bfqq is waiting for new I/O.
+ *
+ * The fourth if checks whether bfqq is a queue for
* which it is better to avoid injection. It is so if
* bfqq delivers more throughput when served without
* any further I/O from other queues in the middle, or
@@ -4474,11 +4965,11 @@ check_queue:
* bfq_update_has_short_ttime(), it is rather likely
* that, if I/O is being plugged for bfqq and the
* waker queue has pending I/O requests that are
- * blocking bfqq's I/O, then the third alternative
+ * blocking bfqq's I/O, then the fourth alternative
* above lets the waker queue get served before the
* I/O-plugging timeout fires. So one may deem the
* second alternative superfluous. It is not, because
- * the third alternative may be way less effective in
+ * the fourth alternative may be way less effective in
* case of a synchronization. For two main
* reasons. First, throughput may be low because the
* inject limit may be too low to guarantee the same
@@ -4487,7 +4978,7 @@ check_queue:
* guarantees (the second alternative unconditionally
* injects a pending I/O request of the waker queue
* for each bfq_dispatch_request()). Second, with the
- * third alternative, the duration of the plugging,
+ * fourth alternative, the duration of the plugging,
* i.e., the time before bfqq finally receives new I/O,
* may not be minimized, because the waker queue may
* happen to be served only after other queues.
@@ -4496,15 +4987,23 @@ check_queue:
icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic &&
bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <=
bfq_bfqq_budget_left(async_bfqq))
- bfqq = bfqq->bic->bfqq[0];
- else if (bfq_bfqq_has_waker(bfqq) &&
+ bfqq = async_bfqq;
+ else if (bfqq->waker_bfqq &&
bfq_bfqq_busy(bfqq->waker_bfqq) &&
- bfqq->next_rq &&
+ bfqq->waker_bfqq->next_rq &&
bfq_serv_to_charge(bfqq->waker_bfqq->next_rq,
bfqq->waker_bfqq) <=
bfq_bfqq_budget_left(bfqq->waker_bfqq)
)
bfqq = bfqq->waker_bfqq;
+ else if (blocked_bfqq &&
+ bfq_bfqq_busy(blocked_bfqq) &&
+ blocked_bfqq->next_rq &&
+ bfq_serv_to_charge(blocked_bfqq->next_rq,
+ blocked_bfqq) <=
+ bfq_bfqq_budget_left(blocked_bfqq)
+ )
+ bfqq = blocked_bfqq;
else if (!idling_boosts_thr_without_issues(bfqd, bfqq) &&
(bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 ||
!bfq_bfqq_has_short_ttime(bfqq)))
@@ -4559,9 +5058,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfqq->wr_cur_max_time)) {
if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time ||
time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
- bfq_wr_duration(bfqd)))
+ bfq_wr_duration(bfqd))) {
+ /*
+ * Either in interactive weight
+ * raising, or in soft_rt weight
+ * raising with the
+ * interactive-weight-raising period
+ * elapsed (so no switch back to
+ * interactive weight raising).
+ */
bfq_bfqq_end_wr(bfqq);
- else {
+ } else { /*
+ * soft_rt finishing while still in
+ * interactive period, switch back to
+ * interactive weight raising
+ */
switch_back_to_interactive_wr(bfqq, bfqd);
bfqq->entity.prio_changed = 1;
}
@@ -4607,7 +5118,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
bfq_dispatch_remove(bfqd->queue, rq);
if (bfqq != bfqd->in_service_queue)
- goto return_rq;
+ return rq;
/*
* If weight raising has to terminate for bfqq, then next
@@ -4627,12 +5138,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd,
* belongs to CLASS_IDLE and other queues are waiting for
* service.
*/
- if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)))
- goto return_rq;
-
- bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
+ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))
+ bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED);
-return_rq:
return rq;
}
@@ -4641,11 +5149,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
/*
- * Avoiding lock: a race on bfqd->busy_queues should cause at
+ * Avoiding lock: a race on bfqd->queued should cause at
* most a call to dispatch for nothing
*/
return !list_empty_careful(&bfqd->dispatch) ||
- bfq_tot_busy_queues(bfqd) > 0;
+ READ_ONCE(bfqd->queued);
}
static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
@@ -4675,11 +5183,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
/*
* We exploit the bfq_finish_requeue_request hook to
- * decrement rq_in_driver, but
+ * decrement tot_rq_in_driver, but
* bfq_finish_requeue_request will not be invoked on
* this request. So, to avoid unbalance, just start
- * this request, without incrementing rq_in_driver. As
- * a negative consequence, rq_in_driver is deceptively
+ * this request, without incrementing tot_rq_in_driver. As
+ * a negative consequence, tot_rq_in_driver is deceptively
* lower than it should be while this request is in
* service. This may cause bfq_schedule_dispatch to be
* invoked uselessly.
@@ -4688,7 +5196,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* bfq_finish_requeue_request hook, if defined, is
* probably invoked also on this request. So, by
* exploiting this hook, we could 1) increment
- * rq_in_driver here, and 2) decrement it in
+ * tot_rq_in_driver here, and 2) decrement it in
* bfq_finish_requeue_request. Such a solution would
* let the value of the counter be always accurate,
* but it would entail using an extra interface
@@ -4714,10 +5222,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
* some unlucky request wait for as long as the device
* wishes.
*
- * Of course, serving one request at at time may cause loss of
+ * Of course, serving one request at a time may cause loss of
* throughput.
*/
- if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0)
+ if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0)
goto exit;
bfqq = bfq_select_queue(bfqd);
@@ -4728,7 +5236,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
if (rq) {
inc_in_driver_start_rq:
- bfqd->rq_in_driver++;
+ bfqd->rq_in_driver[bfqq->actuator_idx]++;
+ bfqd->tot_rq_in_driver++;
start_rq:
rq->rq_flags |= RQF_STARTED;
}
@@ -4793,7 +5302,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct request *rq;
struct bfq_queue *in_serv_queue;
- bool waiting_rq, idle_timer_disabled;
+ bool waiting_rq, idle_timer_disabled = false;
spin_lock_irq(&bfqd->lock);
@@ -4801,14 +5310,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx)
waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue);
rq = __bfq_dispatch_request(hctx);
-
- idle_timer_disabled =
- waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ if (in_serv_queue == bfqd->in_service_queue) {
+ idle_timer_disabled =
+ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue);
+ }
spin_unlock_irq(&bfqd->lock);
-
- bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue,
- idle_timer_disabled);
+ bfq_update_dispatch_stats(hctx->queue, rq,
+ idle_timer_disabled ? in_serv_queue : NULL,
+ idle_timer_disabled);
return rq;
}
@@ -4826,9 +5336,7 @@ void bfq_put_queue(struct bfq_queue *bfqq)
struct hlist_node *n;
struct bfq_group *bfqg = bfqq_group(bfqq);
- if (bfqq->bfqd)
- bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d",
- bfqq, bfqq->ref);
+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref);
bfqq->ref--;
if (bfqq->ref)
@@ -4889,18 +5397,27 @@ void bfq_put_queue(struct bfq_queue *bfqq)
hlist_for_each_entry_safe(item, n, &bfqq->woken_list,
woken_list_node) {
item->waker_bfqq = NULL;
- bfq_clear_bfqq_has_waker(item);
hlist_del_init(&item->woken_list_node);
}
- if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq)
+ if (bfqq->bfqd->last_completed_rq_bfqq == bfqq)
bfqq->bfqd->last_completed_rq_bfqq = NULL;
+ WARN_ON_ONCE(!list_empty(&bfqq->fifo));
+ WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list));
+ WARN_ON_ONCE(bfqq->dispatched);
+
kmem_cache_free(bfq_pool, bfqq);
bfqg_and_blkg_put(bfqg);
}
-static void bfq_put_cooperator(struct bfq_queue *bfqq)
+static void bfq_put_stable_ref(struct bfq_queue *bfqq)
+{
+ bfqq->stable_ref--;
+ bfq_put_queue(bfqq);
+}
+
+void bfq_put_cooperator(struct bfq_queue *bfqq)
{
struct bfq_queue *__bfqq, *next;
@@ -4911,8 +5428,6 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq)
*/
__bfqq = bfqq->new_bfqq;
while (__bfqq) {
- if (__bfqq == bfqq)
- break;
next = __bfqq->new_bfqq;
bfq_put_queue(__bfqq);
__bfqq = next;
@@ -4933,31 +5448,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_release_process_ref(bfqd, bfqq);
}
-static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync)
+static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx)
{
- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx);
struct bfq_data *bfqd;
if (bfqq)
bfqd = bfqq->bfqd; /* NULL if scheduler already exited */
if (bfqq && bfqd) {
- unsigned long flags;
-
- spin_lock_irqsave(&bfqd->lock, flags);
- bfqq->bic = NULL;
+ bic_set_bfqq(bic, NULL, is_sync, actuator_idx);
bfq_exit_bfqq(bfqd, bfqq);
- bic_set_bfqq(bic, NULL, is_sync);
- spin_unlock_irqrestore(&bfqd->lock, flags);
}
}
static void bfq_exit_icq(struct io_cq *icq)
{
struct bfq_io_cq *bic = icq_to_bic(icq);
+ struct bfq_data *bfqd = bic_to_bfqd(bic);
+ unsigned long flags;
+ unsigned int act_idx;
+ /*
+ * If bfqd and thus bfqd->num_actuators is not available any
+ * longer, then cycle over all possible per-actuator bfqqs in
+ * the next loop. We rely on bic being zeroed on creation, and
+ * therefore on its unused per-actuator fields being NULL.
+ */
+ unsigned int num_actuators = BFQ_MAX_ACTUATORS;
+ struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data;
+
+ /*
+ * bfqd is NULL if scheduler already exited, and in that case
+ * this is the last time these queues are accessed.
+ */
+ if (bfqd) {
+ spin_lock_irqsave(&bfqd->lock, flags);
+ num_actuators = bfqd->num_actuators;
+ }
+
+ for (act_idx = 0; act_idx < num_actuators; act_idx++) {
+ if (bfqq_data[act_idx].stable_merge_bfqq)
+ bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq);
+
+ bfq_exit_icq_bfqq(bic, true, act_idx);
+ bfq_exit_icq_bfqq(bic, false, act_idx);
+ }
- bfq_exit_icq_bfqq(bic, true);
- bfq_exit_icq_bfqq(bic, false);
+ if (bfqd)
+ spin_unlock_irqrestore(&bfqd->lock, flags);
}
/*
@@ -4978,9 +5517,9 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
switch (ioprio_class) {
default:
pr_err("bdi %s: bfq: bad prio class %d\n",
- bdi_dev_name(bfqq->bfqd->queue->backing_dev_info),
- ioprio_class);
- /* fall through */
+ bdi_dev_name(bfqq->bfqd->queue->disk->bdi),
+ ioprio_class);
+ fallthrough;
case IOPRIO_CLASS_NONE:
/*
* No prio set, inherit CPU scheduling settings.
@@ -4989,32 +5528,35 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic)
bfqq->new_ioprio_class = task_nice_ioclass(tsk);
break;
case IOPRIO_CLASS_RT:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio);
bfqq->new_ioprio_class = IOPRIO_CLASS_RT;
break;
case IOPRIO_CLASS_BE:
- bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio);
bfqq->new_ioprio_class = IOPRIO_CLASS_BE;
break;
case IOPRIO_CLASS_IDLE:
bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE;
- bfqq->new_ioprio = 7;
+ bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1;
break;
}
- if (bfqq->new_ioprio >= IOPRIO_BE_NR) {
+ if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) {
pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n",
bfqq->new_ioprio);
- bfqq->new_ioprio = IOPRIO_BE_NR;
+ bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1;
}
bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio);
+ bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d",
+ bfqq->new_ioprio, bfqq->entity.new_weight);
bfqq->entity.prio_changed = 1;
}
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic);
+ struct bfq_io_cq *bic,
+ bool respawn);
static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
{
@@ -5031,21 +5573,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio)
bic->ioprio = ioprio;
- bfqq = bic_to_bfqq(bic, false);
+ bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio));
if (bfqq) {
- bfq_release_process_ref(bfqd, bfqq);
- bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic);
- bic_set_bfqq(bic, bfqq, false);
+ struct bfq_queue *old_bfqq = bfqq;
+
+ bfqq = bfq_get_queue(bfqd, bio, false, bic, true);
+ bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio));
+ bfq_release_process_ref(bfqd, old_bfqq);
}
- bfqq = bic_to_bfqq(bic, true);
+ bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio));
if (bfqq)
bfq_set_next_ioprio_data(bfqq, bic);
}
static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct bfq_io_cq *bic, pid_t pid, int is_sync)
+ struct bfq_io_cq *bic, pid_t pid, int is_sync,
+ unsigned int act_idx)
{
+ u64 now_ns = ktime_get_ns();
+
+ bfqq->actuator_idx = act_idx;
RB_CLEAR_NODE(&bfqq->entity.rb_node);
INIT_LIST_HEAD(&bfqq->fifo);
INIT_HLIST_NODE(&bfqq->burst_list_node);
@@ -5073,7 +5621,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_clear_bfqq_sync(bfqq);
/* set end request to minus infinity from now */
- bfqq->ttime.last_end_request = ktime_get_ns() + 1;
+ bfqq->ttime.last_end_request = now_ns + 1;
+
+ bfqq->creation_time = jiffies;
+
+ bfqq->io_start_time = now_ns;
bfq_mark_bfqq_IO_bound(bfqq);
@@ -5101,48 +5653,198 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
/* first request is almost certainly seeky */
bfqq->seek_history = 1;
+
+ bfqq->decrease_time_jif = jiffies;
}
static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd,
struct bfq_group *bfqg,
- int ioprio_class, int ioprio)
+ int ioprio_class, int ioprio, int act_idx)
{
switch (ioprio_class) {
case IOPRIO_CLASS_RT:
- return &bfqg->async_bfqq[0][ioprio];
+ return &bfqg->async_bfqq[0][ioprio][act_idx];
case IOPRIO_CLASS_NONE:
- ioprio = IOPRIO_NORM;
- /* fall through */
+ ioprio = IOPRIO_BE_NORM;
+ fallthrough;
case IOPRIO_CLASS_BE:
- return &bfqg->async_bfqq[1][ioprio];
+ return &bfqg->async_bfqq[1][ioprio][act_idx];
case IOPRIO_CLASS_IDLE:
- return &bfqg->async_idle_bfqq;
+ return &bfqg->async_idle_bfqq[act_idx];
default:
return NULL;
}
}
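
A tiny standalone illustration of the three-dimensional per-group layout that the function above now indexes into; the IOPRIO_NR_LEVELS and BFQ_MAX_ACTUATORS values of 8 are assumptions mirroring the header definitions further below, and the string payload only stands in for a real bfq_queue pointer:

#include <stdio.h>

#define IOPRIO_NR_LEVELS	8	/* assumed to match include/linux/ioprio.h */
#define BFQ_MAX_ACTUATORS	8	/* assumed to match bfq-iosched.h below */

/* Per-group async queues: [class row][ioprio level][actuator index]. */
static const char *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];

int main(void)
{
	/* a best-effort, level-4 request routed to actuator 1 */
	int rt_or_be_row = 1, level = 4, act_idx = 1;

	async_bfqq[rt_or_be_row][level][act_idx] = "BE/4 queue on actuator 1";
	printf("%s\n", async_bfqq[rt_or_be_row][level][act_idx]);
	return 0;
}
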
+static struct bfq_queue *
+bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic,
+ struct bfq_queue *last_bfqq_created)
+{
+ unsigned int a_idx = last_bfqq_created->actuator_idx;
+ struct bfq_queue *new_bfqq =
+ bfq_setup_merge(bfqq, last_bfqq_created);
+
+ if (!new_bfqq)
+ return bfqq;
+
+ if (new_bfqq->bic)
+ new_bfqq->bic->bfqq_data[a_idx].stably_merged = true;
+ bic->bfqq_data[a_idx].stably_merged = true;
+
+ /*
+ * Reusing merge functions. This implies that
+ * bfqq->bic must be set too, for
+ * bfq_merge_bfqqs to correctly save bfqq's
+ * state before killing it.
+ */
+ bfqq->bic = bic;
+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq);
+
+ return new_bfqq;
+}
+
+/*
+ * Many throughput-sensitive workloads are made of several parallel
+ * I/O flows, with all flows generated by the same application, or
+ * more generically by the same task (e.g., system boot). The most
+ * counterproductive action with these workloads is plugging I/O
+ * dispatch when one of the bfq_queues associated with these flows
+ * remains temporarily empty.
+ *
+ * To avoid this plugging, BFQ has been using a burst-handling
+ * mechanism for years now. This mechanism has proven effective for
+ * throughput, and not detrimental for service guarantees. The
+ * following function pushes this mechanism a little bit further,
+ * based on the following two facts.
+ *
+ * First, all the I/O flows of the same application or task
+ * contribute to the execution/completion of that common application
+ * or task. So the performance figures that matter are total
+ * throughput of the flows and task-wide I/O latency. In particular,
+ * these flows do not need to be protected from each other, in terms
+ * of individual bandwidth or latency.
+ *
+ * Second, the above fact holds regardless of the number of flows.
+ *
+ * Putting these two facts together, this commit stably merges the
+ * bfq_queues associated with these I/O flows, i.e., with the
+ * processes that generate these I/O flows, regardless of how many
+ * processes are involved.
+ *
+ * To decide whether a set of bfq_queues is actually associated with
+ * the I/O flows of a common application or task, and to merge these
+ * queues stably, this function operates as follows: given a bfq_queue,
+ * say Q2, currently being created, and the last bfq_queue, say Q1,
+ * created before Q2, Q2 is merged stably with Q1 if
+ * - very little time has elapsed since when Q1 was created
+ * - Q2 has the same ioprio as Q1
+ * - Q2 belongs to the same group as Q1
+ *
+ * Merging bfq_queues also reduces scheduling overhead. A fio test
+ * with ten random readers on /dev/nullb shows a throughput boost of
+ * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of
+ * the total per-request processing time, the above throughput boost
+ * implies that BFQ's overhead is reduced by more than 50%.
+ *
+ * This new mechanism most certainly obsoletes the current
+ * burst-handling heuristics. We keep those heuristics for the moment.
+ */
+static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd,
+ struct bfq_queue *bfqq,
+ struct bfq_io_cq *bic)
+{
+ struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+ &bfqq->entity.parent->last_bfqq_created :
+ &bfqd->last_bfqq_created;
+
+ struct bfq_queue *last_bfqq_created = *source_bfqq;
+
+ /*
+ * If last_bfqq_created has not been set yet, then init it. If
+ * it has been set already, but too long ago, then move it
+ * forward to bfqq. Finally, move also if bfqq belongs to a
+ * different group than last_bfqq_created, or if bfqq has a
+ * different ioprio, ioprio_class or actuator_idx. If none of
+ * these conditions holds true, then try an early stable merge
+ * or schedule a delayed stable merge. As for the condition on
+ * actuator_idx, the reason is that, if queues associated with
+ * different actuators are merged, then control is lost on
+ * each actuator. Therefore some actuator may be
+ * underutilized, and throughput may decrease.
+ *
+ * A delayed merge is scheduled (instead of performing an
+ * early merge), in case bfqq might soon prove to be more
+ * throughput-beneficial if not merged. Currently this is
+ * possible only if bfqd is rotational with no queueing. For
+ * such a drive, not merging bfqq is better for throughput if
+ * bfqq happens to contain sequential I/O. So, we wait a
+ * little bit for enough I/O to flow through bfqq. After that,
+ * if such an I/O is sequential, then the merge is
+ * canceled. Otherwise the merge is finally performed.
+ */
+ if (!last_bfqq_created ||
+ time_before(last_bfqq_created->creation_time +
+ msecs_to_jiffies(bfq_activation_stable_merging),
+ bfqq->creation_time) ||
+ bfqq->entity.parent != last_bfqq_created->entity.parent ||
+ bfqq->ioprio != last_bfqq_created->ioprio ||
+ bfqq->ioprio_class != last_bfqq_created->ioprio_class ||
+ bfqq->actuator_idx != last_bfqq_created->actuator_idx)
+ *source_bfqq = bfqq;
+ else if (time_after_eq(last_bfqq_created->creation_time +
+ bfqd->bfq_burst_interval,
+ bfqq->creation_time)) {
+ if (likely(bfqd->nonrot_with_queueing))
+ /*
+ * With this type of drive, leaving
+ * bfqq alone may provide no
+ * throughput benefits compared with
+ * merging bfqq. So merge bfqq now.
+ */
+ bfqq = bfq_do_early_stable_merge(bfqd, bfqq,
+ bic,
+ last_bfqq_created);
+ else { /* schedule tentative stable merge */
+ /*
+ * get reference on last_bfqq_created,
+ * to prevent it from being freed,
+ * until we decide whether to merge
+ */
+ last_bfqq_created->ref++;
+ /*
+ * need to keep track of stable refs, to
+ * compute process refs correctly
+ */
+ last_bfqq_created->stable_ref++;
+ /*
+ * Record the bfqq to merge to.
+ */
+ bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq =
+ last_bfqq_created;
+ }
+ }
+
+ return bfqq;
+}
+
+
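Condensed into a standalone predicate, the decision taken above looks roughly as follows. This is only a sketch: the struct and function names are hypothetical, times are plain numbers rather than jiffies helpers, and it leaves out the choice between an immediate merge and a scheduled (delayed) one, which the real code bases on nonrot_with_queueing.

#include <stdbool.h>
#include <stdio.h>

/* Hypothetical, flattened view of the fields the decision looks at. */
struct qinfo {
	unsigned long creation_time;		/* in jiffies */
	int ioprio, ioprio_class, actuator_idx;
	const void *parent;			/* group / parent entity */
};

/*
 * True when a newly created queue should be considered for a stable
 * merge with the last queue created before it; false means no stable
 * merge is attempted (typically the last-created pointer just moves
 * forward to the new queue).
 */
static bool stable_merge_candidate(const struct qinfo *last,
				   const struct qinfo *new_q,
				   unsigned long activation_window,
				   unsigned long burst_interval)
{
	if (!last)
		return false;
	if (new_q->creation_time > last->creation_time + activation_window)
		return false;	/* created too long after the last queue */
	if (new_q->parent != last->parent ||
	    new_q->ioprio != last->ioprio ||
	    new_q->ioprio_class != last->ioprio_class ||
	    new_q->actuator_idx != last->actuator_idx)
		return false;	/* different group, priority or actuator */
	return new_q->creation_time <= last->creation_time + burst_interval;
}

int main(void)
{
	/* arbitrary example values, not real jiffies */
	struct qinfo q1 = { 1000, 4, 2, 0, (const void *)0x1 };
	struct qinfo q2 = { 1005, 4, 2, 0, (const void *)0x1 };

	printf("%d\n", stable_merge_candidate(&q1, &q2, 600, 180)); /* 1 */
	return 0;
}
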
static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
struct bio *bio, bool is_sync,
- struct bfq_io_cq *bic)
+ struct bfq_io_cq *bic,
+ bool respawn)
{
- const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio);
+ const int ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio);
const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio);
struct bfq_queue **async_bfqq = NULL;
struct bfq_queue *bfqq;
struct bfq_group *bfqg;
- rcu_read_lock();
-
- bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio));
- if (!bfqg) {
- bfqq = &bfqd->oom_bfqq;
- goto out;
- }
-
+ bfqg = bfq_bio_bfqg(bfqd, bio);
if (!is_sync) {
async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class,
- ioprio);
+ ioprio,
+ bfq_actuator_index(bfqd, bio));
bfqq = *async_bfqq;
if (bfqq)
goto out;
@@ -5154,7 +5856,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
if (bfqq) {
bfq_init_bfqq(bfqd, bfqq, bic, current->pid,
- is_sync);
+ is_sync, bfq_actuator_index(bfqd, bio));
bfq_init_entity(&bfqq->entity, bfqg);
bfq_log_bfqq(bfqd, bfqq, "allocated");
} else {
@@ -5182,8 +5884,9 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
out:
bfqq->ref++; /* get a process reference to this queue */
- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref);
- rcu_read_unlock();
+
+ if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn)
+ bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic);
return bfqq;
}
@@ -5191,11 +5894,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd,
struct bfq_queue *bfqq)
{
struct bfq_ttime *ttime = &bfqq->ttime;
- u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
+ u64 elapsed;
+ /*
+ * We are really interested in how long it takes for the queue to
+ * become busy when there is no outstanding IO for this queue. So
+ * ignore cases when the bfq queue has already IO queued.
+ */
+ if (bfqq->dispatched || bfq_bfqq_busy(bfqq))
+ return;
+ elapsed = ktime_get_ns() - bfqq->ttime.last_end_request;
elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle);
- ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8;
+ ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8;
ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8);
ttime->ttime_mean = div64_ul(ttime->ttime_total + 128,
ttime->ttime_samples);
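
The update above is a fixed-point exponentially weighted average. A small userspace sketch of the same arithmetic (omitting the clamp of elapsed to twice bfq_slice_idle and the early return just added) shows how ttime_mean tracks recent samples once ttime_samples converges to 256:

#include <stdint.h>
#include <stdio.h>

/* Mirrors the decaying-average update performed above. */
struct toy_ttime {
	uint64_t samples;	/* converges to 256 */
	uint64_t total;		/* weighted sum, same scale as samples */
	uint64_t mean;		/* ns, compared against bfq_slice_idle */
};

static void ttime_update(struct toy_ttime *t, uint64_t elapsed_ns)
{
	t->samples = (7 * t->samples + 256) / 8;
	t->total = (7 * t->total + 256 * elapsed_ns) / 8;
	t->mean = (t->total + 128) / t->samples;
}

int main(void)
{
	struct toy_ttime t = { 0, 0, 0 };
	uint64_t samples[] = { 1000000, 1000000, 9000000 }; /* 1ms, 1ms, 9ms */

	for (int i = 0; i < 3; i++) {
		ttime_update(&t, samples[i]);
		printf("after sample %d: mean ~ %llu ns\n", i,
		       (unsigned long long)t.mean);
	}
	return 0;
}
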
@@ -5210,8 +5921,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq,
if (bfqq->wr_coeff > 1 &&
bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time &&
- BFQQ_TOTALLY_SEEKY(bfqq))
- bfq_bfqq_end_wr(bfqq);
+ BFQQ_TOTALLY_SEEKY(bfqq)) {
+ if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt +
+ bfq_wr_duration(bfqd))) {
+ /*
+ * In soft_rt weight raising with the
+ * interactive-weight-raising period
+ * elapsed (so no switch back to
+ * interactive weight raising).
+ */
+ bfq_bfqq_end_wr(bfqq);
+ } else { /*
+ * stopping soft_rt weight raising
+ * while still in interactive period,
+ * switch back to interactive weight
+ * raising
+ */
+ switch_back_to_interactive_wr(bfqq, bfqd);
+ bfqq->entity.prio_changed = 1;
+ }
+ }
}
static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
@@ -5235,12 +5964,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd,
return;
/* Think time is infinite if no process is linked to
- * bfqq. Otherwise check average think time to
- * decide whether to mark as has_short_ttime
+ * bfqq. Otherwise check average think time to decide whether
+ * to mark as has_short_ttime. To this end, compare the average
+ * think time with half the I/O-plugging timeout.
*/
if (atomic_read(&bic->icq.ioc->active_ref) == 0 ||
(bfq_sample_valid(bfqq->ttime.ttime_samples) &&
- bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle))
+ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1))
has_short_ttime = false;
state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq);
@@ -5401,11 +6131,28 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq,
}
}
+static void bfqq_request_allocated(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated++;
+}
+
+static void bfqq_request_freed(struct bfq_queue *bfqq)
+{
+ struct bfq_entity *entity = &bfqq->entity;
+
+ for_each_entity(entity)
+ entity->allocated--;
+}
+
/* returns true if it causes the idle timer to be disabled */
static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq),
- *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true);
+ *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true,
+ RQ_BIC(rq));
bool waiting, idle_timer_disabled = false;
if (new_bfqq) {
@@ -5413,8 +6160,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* Release the request's reference to the old bfqq
* and make sure one is taken to the shared queue.
*/
- new_bfqq->allocated++;
- bfqq->allocated--;
+ bfqq_request_allocated(new_bfqq);
+ bfqq_request_freed(bfqq);
new_bfqq->ref++;
/*
* If the bic associated with the process
@@ -5424,7 +6171,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
* then complete the merge and redirect it to
* new_bfqq.
*/
- if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq)
+ if (bic_to_bfqq(RQ_BIC(rq), true,
+ bfq_actuator_index(bfqd, rq->bio)) == bfqq)
bfq_merge_bfqqs(bfqd, RQ_BIC(rq),
bfqq, new_bfqq);
@@ -5458,7 +6206,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq)
static void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags)
+ blk_opf_t cmd_flags)
{
if (!bfqq)
return;
@@ -5483,39 +6231,39 @@ static void bfq_update_insert_stats(struct request_queue *q,
static inline void bfq_update_insert_stats(struct request_queue *q,
struct bfq_queue *bfqq,
bool idle_timer_disabled,
- unsigned int cmd_flags) {}
+ blk_opf_t cmd_flags) {}
#endif /* CONFIG_BFQ_CGROUP_DEBUG */
+static struct bfq_queue *bfq_init_rq(struct request *rq);
+
static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head)
+ blk_insert_t flags)
{
struct request_queue *q = hctx->queue;
struct bfq_data *bfqd = q->elevator->elevator_data;
struct bfq_queue *bfqq;
bool idle_timer_disabled = false;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
+ LIST_HEAD(free);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio)
bfqg_stats_update_legacy_io(q, rq);
#endif
spin_lock_irq(&bfqd->lock);
- if (blk_mq_sched_try_insert_merge(q, rq)) {
+ bfqq = bfq_init_rq(rq);
+ if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
spin_unlock_irq(&bfqd->lock);
+ blk_mq_free_requests(&free);
return;
}
- spin_unlock_irq(&bfqd->lock);
-
- blk_mq_sched_request_inserted(rq);
+ trace_block_rq_insert(rq);
- spin_lock_irq(&bfqd->lock);
- bfqq = bfq_init_rq(rq);
- if (!bfqq || at_head || blk_rq_is_passthrough(rq)) {
- if (at_head)
- list_add(&rq->queuelist, &bfqd->dispatch);
- else
- list_add_tail(&rq->queuelist, &bfqd->dispatch);
+ if (flags & BLK_MQ_INSERT_AT_HEAD) {
+ list_add(&rq->queuelist, &bfqd->dispatch);
+ } else if (!bfqq) {
+ list_add_tail(&rq->queuelist, &bfqd->dispatch);
} else {
idle_timer_disabled = __bfq_insert_request(bfqd, rq);
/*
@@ -5538,7 +6286,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* merge).
*/
cmd_flags = rq->cmd_flags;
-
spin_unlock_irq(&bfqd->lock);
bfq_update_insert_stats(q, bfqq, idle_timer_disabled,
@@ -5546,14 +6293,15 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
}
static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct list_head *list, bool at_head)
+ struct list_head *list,
+ blk_insert_t flags)
{
while (!list_empty(list)) {
struct request *rq;
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
- bfq_insert_request(hctx, rq, at_head);
+ bfq_insert_request(hctx, rq, flags);
}
}
@@ -5562,7 +6310,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
struct bfq_queue *bfqq = bfqd->in_service_queue;
bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver,
- bfqd->rq_in_driver);
+ bfqd->tot_rq_in_driver);
if (bfqd->hw_tag == 1)
return;
@@ -5573,7 +6321,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
* sum is not exact, as it's not taking into account deactivated
* requests.
*/
- if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
+ if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD)
return;
/*
@@ -5584,7 +6332,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd)
if (bfqq && bfq_bfqq_has_short_ttime(bfqq) &&
bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] <
BFQ_HW_QUEUE_THRESHOLD &&
- bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
+ bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD)
return;
if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES)
@@ -5605,7 +6353,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
bfq_update_hw_tag(bfqd);
- bfqd->rq_in_driver--;
+ bfqd->rq_in_driver[bfqq->actuator_idx]--;
+ bfqd->tot_rq_in_driver--;
bfqq->dispatched--;
if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) {
@@ -5617,7 +6366,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
*/
bfqq->budget_timeout = jiffies;
- bfq_weights_tree_remove(bfqd, bfqq);
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ bfq_weights_tree_remove(bfqq);
}
now_ns = ktime_get_ns();
@@ -5651,7 +6401,19 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
1UL<<(BFQ_RATE_SHIFT - 10))
bfq_update_rate_reset(bfqd, NULL);
bfqd->last_completion = now_ns;
- bfqd->last_completed_rq_bfqq = bfqq;
+ /*
+ * Shared queues are likely to receive I/O at a high
+ * rate. This may deceptively let them be considered as wakers
+ * of other queues. But a false waker will unjustly steal
+ * bandwidth from its supposedly woken queue. So considering
+ * also shared queues in the waking mechanism may cause more
+ * control troubles than throughput benefits. Then reset
+ * last_completed_rq_bfqq if bfqq is a shared queue.
+ */
+ if (!bfq_bfqq_coop(bfqq))
+ bfqd->last_completed_rq_bfqq = bfqq;
+ else
+ bfqd->last_completed_rq_bfqq = NULL;
/*
* If we are waiting to discover whether the request pattern
@@ -5712,17 +6474,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd)
BFQQE_NO_MORE_REQUESTS);
}
- if (!bfqd->rq_in_driver)
+ if (!bfqd->tot_rq_in_driver)
bfq_schedule_dispatch(bfqd);
}
-static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq)
-{
- bfqq->allocated--;
-
- bfq_put_queue(bfqq);
-}
-
/*
* The processes associated with bfqq may happen to generate their
* cumulative I/O at a lower rate than the rate at which the device
@@ -5850,13 +6605,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
* conditions to do it, or we can lower the last base value
* computed.
*
- * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O
+ * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O
* request in flight, because this function is in the code
* path that handles the completion of a request of bfqq, and,
* in particular, this function is executed before
- * bfqd->rq_in_driver is decremented in such a code path.
+ * bfqd->tot_rq_in_driver is decremented in such a code path.
*/
- if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) ||
+ if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) ||
tot_time_ns < bfqq->last_serv_time_ns) {
if (bfqq->last_serv_time_ns == 0) {
/*
@@ -5866,7 +6621,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd,
bfqq->inject_limit = max_t(unsigned int, 1, old_limit);
}
bfqq->last_serv_time_ns = tot_time_ns;
- } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1)
+ } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1)
/*
* No I/O injected and no request still in service in
* the drive: these are the exact conditions for
@@ -5894,18 +6649,7 @@ static void bfq_finish_requeue_request(struct request *rq)
{
struct bfq_queue *bfqq = RQ_BFQQ(rq);
struct bfq_data *bfqd;
-
- /*
- * Requeue and finish hooks are invoked in blk-mq without
- * checking whether the involved request is actually still
- * referenced in the scheduler. To handle this fact, the
- * following two checks make this function exit in case of
- * spurious invocations, for which there is nothing to do.
- *
- * First, check whether rq has nothing to do with an elevator.
- */
- if (unlikely(!(rq->rq_flags & RQF_ELVPRIV)))
- return;
+ unsigned long flags;
/*
* rq either is not associated with any icq, or is an already
@@ -5923,39 +6667,17 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->io_start_time_ns,
rq->cmd_flags);
+ spin_lock_irqsave(&bfqd->lock, flags);
if (likely(rq->rq_flags & RQF_STARTED)) {
- unsigned long flags;
-
- spin_lock_irqsave(&bfqd->lock, flags);
-
if (rq == bfqd->waited_rq)
bfq_update_inject_limit(bfqd, bfqq);
bfq_completed_request(bfqq, bfqd);
- bfq_finish_requeue_request_body(bfqq);
-
- spin_unlock_irqrestore(&bfqd->lock, flags);
- } else {
- /*
- * Request rq may be still/already in the scheduler,
- * in which case we need to remove it (this should
- * never happen in case of requeue). And we cannot
- * defer such a check and removal, to avoid
- * inconsistencies in the time interval from the end
- * of this function to the start of the deferred work.
- * This situation seems to occur only in process
- * context, as a consequence of a merge. In the
- * current version of the code, this implies that the
- * lock is held.
- */
-
- if (!RB_EMPTY_NODE(&rq->rb_node)) {
- bfq_remove_request(rq->q, rq);
- bfqg_stats_update_io_remove(bfqq_group(bfqq),
- rq->cmd_flags);
- }
- bfq_finish_requeue_request_body(bfqq);
}
+ bfqq_request_freed(bfqq);
+ bfq_put_queue(bfqq);
+ RQ_BIC(rq)->requests--;
+ spin_unlock_irqrestore(&bfqd->lock, flags);
/*
* Reset private fields. In case of a requeue, this allows
@@ -5978,6 +6700,16 @@ static void bfq_finish_requeue_request(struct request *rq)
rq->elv.priv[1] = NULL;
}
+static void bfq_finish_request(struct request *rq)
+{
+ bfq_finish_requeue_request(rq);
+
+ if (rq->elv.icq) {
+ put_io_context(rq->elv.icq->ioc);
+ rq->elv.icq = NULL;
+ }
+}
+
/*
* Removes the association between the current task and bfqq, assuming
* that bic points to the bfq iocontext of the task.
@@ -5996,7 +6728,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq)
return bfqq;
}
- bic_set_bfqq(bic, NULL, 1);
+ bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx);
bfq_put_cooperator(bfqq);
@@ -6010,7 +6742,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
bool split, bool is_sync,
bool *new_queue)
{
- struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync);
+ unsigned int act_idx = bfq_actuator_index(bfqd, bio);
+ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx);
+ struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx];
if (likely(bfqq && bfqq != &bfqd->oom_bfqq))
return bfqq;
@@ -6020,16 +6754,16 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
if (bfqq)
bfq_put_queue(bfqq);
- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic);
+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split);
- bic_set_bfqq(bic, bfqq, is_sync);
+ bic_set_bfqq(bic, bfqq, is_sync, act_idx);
if (split && is_sync) {
- if ((bic->was_in_burst_list && bfqd->large_burst) ||
- bic->saved_in_large_burst)
+ if ((bfqq_data->was_in_burst_list && bfqd->large_burst) ||
+ bfqq_data->saved_in_large_burst)
bfq_mark_bfqq_in_large_burst(bfqq);
else {
bfq_clear_bfqq_in_large_burst(bfqq);
- if (bic->was_in_burst_list)
+ if (bfqq_data->was_in_burst_list)
/*
* If bfqq was in the current
* burst list before being
@@ -6075,6 +6809,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd,
*/
static void bfq_prepare_request(struct request *rq)
{
+ rq->elv.icq = ioc_find_get_icq(rq->q);
+
/*
* Regardless of whether we have an icq attached, we have to
* clear the scheduler pointers, as they might point to
@@ -6116,19 +6852,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
struct bfq_queue *bfqq;
bool new_queue = false;
bool bfqq_already_existing = false, split = false;
+ unsigned int a_idx = bfq_actuator_index(bfqd, bio);
if (unlikely(!rq->elv.icq))
return NULL;
/*
- * Assuming that elv.priv[1] is set only if everything is set
+ * Assuming that RQ_BFQQ(rq) is set only if everything is set
* for this rq. This holds true, because this function is
* invoked only for insertion or merging, and, after such
* events, a request cannot be manipulated any longer before
* being removed from bfq.
*/
- if (rq->elv.priv[1])
- return rq->elv.priv[1];
+ if (RQ_BFQQ(rq))
+ return RQ_BFQQ(rq);
bic = icq_to_bic(rq->elv.icq);
@@ -6141,27 +6878,48 @@ static struct bfq_queue *bfq_init_rq(struct request *rq)
if (likely(!new_queue)) {
/* If the queue was seeky for too long, break it apart. */
- if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) {
- bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq");
+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) &&
+ !bic->bfqq_data[a_idx].stably_merged) {
+ struct bfq_queue *old_bfqq = bfqq;
/* Update bic before losing reference to bfqq */
if (bfq_bfqq_in_large_burst(bfqq))
- bic->saved_in_large_burst = true;
+ bic->bfqq_data[a_idx].saved_in_large_burst =
+ true;
bfqq = bfq_split_bfqq(bic, bfqq);
split = true;
- if (!bfqq)
+ if (!bfqq) {
bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio,
true, is_sync,
NULL);
- else
+ if (unlikely(bfqq == &bfqd->oom_bfqq))
+ bfqq_already_existing = true;
+ } else
bfqq_already_existing = true;
+
+ if (!bfqq_already_existing) {
+ bfqq->waker_bfqq = old_bfqq->waker_bfqq;
+ bfqq->tentative_waker_bfqq = NULL;
+
+ /*
+ * If the waker queue disappears, then
+ * new_bfqq->waker_bfqq must be
+ * reset. So insert new_bfqq into the
+ * woken_list of the waker. See
+ * bfq_check_waker for details.
+ */
+ if (bfqq->waker_bfqq)
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+ }
}
}
- bfqq->allocated++;
+ bfqq_request_allocated(bfqq);
bfqq->ref++;
+ bic->requests++;
bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d",
rq, bfqq, bfqq->ref);
@@ -6258,8 +7016,8 @@ bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_bfqq_expire(bfqd, bfqq, true, reason);
schedule_dispatch:
- spin_unlock_irqrestore(&bfqd->lock, flags);
bfq_schedule_dispatch(bfqd);
+ spin_unlock_irqrestore(&bfqd->lock, flags);
}
/*
@@ -6310,24 +7068,26 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd,
*/
void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg)
{
- int i, j;
+ int i, j, k;
- for (i = 0; i < 2; i++)
- for (j = 0; j < IOPRIO_BE_NR; j++)
- __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]);
+ for (k = 0; k < bfqd->num_actuators; k++) {
+ for (i = 0; i < 2; i++)
+ for (j = 0; j < IOPRIO_NR_LEVELS; j++)
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]);
- __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq);
+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]);
+ }
}
/*
* See the comments on bfq_limit_depth for the purpose of
* the depths set in the function. Return minimum shallow depth we'll use.
*/
-static unsigned int bfq_update_depths(struct bfq_data *bfqd,
- struct sbitmap_queue *bt)
+static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt)
{
- unsigned int i, j, min_shallow = UINT_MAX;
+ unsigned int depth = 1U << bt->sb.shift;
+ bfqd->full_depth_shift = bt->sb.shift;
/*
* In-word depths if no bfq_queue is being weight-raised:
* leaving 25% of tags only for sync reads.
@@ -6339,13 +7099,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* limit 'something'.
*/
/* no more than 50% of tags for async I/O */
- bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U);
+ bfqd->word_depths[0][0] = max(depth >> 1, 1U);
/*
* no more than 75% of tags for sync writes (25% extra tags
* w.r.t. async I/O, to prevent async I/O from starving sync
* writes)
*/
- bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U);
+ bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U);
/*
* In-word depths in case some bfq_queue is being weight-
@@ -6355,25 +7115,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd,
* shortage.
*/
/* no more than ~18% of tags for async I/O */
- bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U);
+ bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U);
/* no more than ~37% of tags for sync writes (~20% extra tags) */
- bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U);
-
- for (i = 0; i < 2; i++)
- for (j = 0; j < 2; j++)
- min_shallow = min(min_shallow, bfqd->word_depths[i][j]);
-
- return min_shallow;
+ bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U);
}
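
A worked example of the depth arithmetic above, as a standalone sketch with an assumed sb.shift of 6 (64 tags): the four limits come out to 32, 48, 12 and 24, i.e. the 50%, 75%, ~18% and ~37% fractions mentioned in the comments.

#include <stdio.h>

#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
	unsigned int shift = 6;			/* example sbitmap word shift */
	unsigned int depth = 1U << shift;	/* 64 tags */
	unsigned int wd[2][2];

	/* no bfq_queue weight-raised: 50% async, 75% sync writes */
	wd[0][0] = MAX(depth >> 1, 1U);
	wd[0][1] = MAX((depth * 3) >> 2, 1U);
	/* some bfq_queue weight-raised: ~18% async, ~37% sync writes */
	wd[1][0] = MAX((depth * 3) >> 4, 1U);
	wd[1][1] = MAX((depth * 6) >> 4, 1U);

	printf("%u %u %u %u\n", wd[0][0], wd[0][1], wd[1][0], wd[1][1]);
	/* prints: 32 48 12 24 */
	return 0;
}
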
static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct bfq_data *bfqd = hctx->queue->elevator->elevator_data;
struct blk_mq_tags *tags = hctx->sched_tags;
- unsigned int min_shallow;
- min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags);
- sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow);
+ bfq_update_depths(bfqd, &tags->bitmap_tags);
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1);
}
static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index)
@@ -6386,6 +7139,7 @@ static void bfq_exit_queue(struct elevator_queue *e)
{
struct bfq_data *bfqd = e->elevator_data;
struct bfq_queue *bfqq, *n;
+ unsigned int actuator;
hrtimer_cancel(&bfqd->idle_slice_timer);
@@ -6394,13 +7148,17 @@ static void bfq_exit_queue(struct elevator_queue *e)
bfq_deactivate_bfqq(bfqd, bfqq, false, false);
spin_unlock_irq(&bfqd->lock);
+ for (actuator = 0; actuator < bfqd->num_actuators; actuator++)
+ WARN_ON_ONCE(bfqd->rq_in_driver[actuator]);
+ WARN_ON_ONCE(bfqd->tot_rq_in_driver);
+
hrtimer_cancel(&bfqd->idle_slice_timer);
/* release oom-queue reference to root group */
bfqg_and_blkg_put(bfqd->root_group);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
- blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq);
+ blkcg_deactivate_policy(bfqd->queue->disk, &blkcg_policy_bfq);
#else
spin_lock_irq(&bfqd->lock);
bfq_put_async_queues(bfqd, bfqd->root_group);
@@ -6408,6 +7166,10 @@ static void bfq_exit_queue(struct elevator_queue *e)
spin_unlock_irq(&bfqd->lock);
#endif
+ blk_stat_disable_accounting(bfqd->queue);
+ clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags);
+ wbt_enable_default(bfqd->queue->disk);
+
kfree(bfqd);
}
@@ -6431,6 +7193,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
{
struct bfq_data *bfqd;
struct elevator_queue *eq;
+ unsigned int i;
+ struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges;
eq = elevator_alloc(q, e);
if (!eq)
@@ -6451,8 +7215,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
* Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
* Grab a permanent reference to it, so that the normal code flow
* will not attempt to free it.
+ * Set zero as actuator index: we will pretend that
+ * all I/O requests are for the same actuator.
*/
- bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0);
+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0);
bfqd->oom_bfqq.ref++;
bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO;
bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE;
@@ -6471,6 +7237,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->queue = q;
+ bfqd->num_actuators = 1;
+ /*
+ * If the disk supports multiple actuators, copy independent
+ * access ranges from the request queue structure.
+ */
+ spin_lock_irq(&q->queue_lock);
+ if (ia_ranges) {
+ /*
+ * Check if the disk ia_ranges size exceeds the current bfq
+ * actuator limit.
+ */
+ if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) {
+ pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n",
+ ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS);
+ pr_crit("Falling back to single actuator mode.\n");
+ } else {
+ bfqd->num_actuators = ia_ranges->nr_ia_ranges;
+
+ for (i = 0; i < bfqd->num_actuators; i++) {
+ bfqd->sector[i] = ia_ranges->ia_range[i].sector;
+ bfqd->nr_sectors[i] =
+ ia_ranges->ia_range[i].nr_sectors;
+ }
+ }
+ }
+
+ /* Otherwise use single-actuator dev info */
+ if (bfqd->num_actuators == 1) {
+ bfqd->sector[0] = 0;
+ bfqd->nr_sectors[0] = get_capacity(q->disk);
+ }
+ spin_unlock_irq(&q->queue_lock);
+
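The probing-and-fallback logic above can be restated as a small standalone sketch; the range sizes and the capacity below are invented inputs, and the real code copies sector/nr_sectors under q->queue_lock as shown in the hunk.

#include <stdio.h>

#define BFQ_MAX_ACTUATORS 8	/* assumed to match bfq-iosched.h */

struct range {
	unsigned long long sector;
	unsigned long long nr_sectors;
};

/*
 * Use the disk's independent access ranges if present and within the
 * limit, otherwise fall back to a single range covering the capacity.
 */
static int setup_actuators(const struct range *ia, int nr_ranges,
			   unsigned long long capacity, struct range *out)
{
	if (ia && nr_ranges > 0 && nr_ranges <= BFQ_MAX_ACTUATORS) {
		for (int i = 0; i < nr_ranges; i++)
			out[i] = ia[i];
		return nr_ranges;
	}
	out[0].sector = 0;
	out[0].nr_sectors = capacity;
	return 1;
}

int main(void)
{
	/* e.g. a dual-actuator disk split into two halves (made-up sizes) */
	struct range ia[2] = { { 0, 1000000 }, { 1000000, 1000000 } };
	struct range out[BFQ_MAX_ACTUATORS];
	int n = setup_actuators(ia, 2, 2000000, out);

	printf("actuators: %d, range 0: %llu+%llu\n",
	       n, out[0].sector, out[0].nr_sectors);
	return 0;
}
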
INIT_LIST_HEAD(&bfqd->dispatch);
hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC,
@@ -6478,9 +7277,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->idle_slice_timer.function = bfq_idle_slice_timer;
bfqd->queue_weights_tree = RB_ROOT_CACHED;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
bfqd->num_groups_with_pending_reqs = 0;
+#endif
- INIT_LIST_HEAD(&bfqd->active_list);
+ INIT_LIST_HEAD(&bfqd->active_list[0]);
+ INIT_LIST_HEAD(&bfqd->active_list[1]);
INIT_LIST_HEAD(&bfqd->idle_list);
INIT_HLIST_HEAD(&bfqd->burst_list);
@@ -6496,8 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfqd->bfq_slice_idle = bfq_slice_idle;
bfqd->bfq_timeout = bfq_timeout;
- bfqd->bfq_requests_within_timer = 120;
-
bfqd->bfq_large_burst_thresh = 8;
bfqd->bfq_burst_interval = msecs_to_jiffies(180);
@@ -6508,7 +7308,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
*/
bfqd->bfq_wr_coeff = 30;
bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300);
- bfqd->bfq_wr_max_time = 0;
bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000);
bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500);
bfqd->bfq_wr_max_softrt_rate = 7000; /*
@@ -6527,6 +7326,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
ref_wr_duration[blk_queue_nonrot(bfqd->queue)];
bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3;
+ /* see comments on the definition of the next field inside bfq_data */
+ bfqd->actuator_load_threshold = 4;
+
spin_lock_init(&bfqd->lock);
/*
@@ -6550,7 +7352,13 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e)
bfq_init_root_group(bfqd->root_group, bfqd);
bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group);
- wbt_disable_default(q);
+ /* We dispatch request-queue-wide instead of per hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
+
+ set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags);
+ wbt_disable_default(q->disk);
+ blk_stat_enable_accounting(q);
+
return 0;
out_free:
@@ -6790,7 +7598,7 @@ static struct elevator_type iosched_bfq_mq = {
.limit_depth = bfq_limit_depth,
.prepare_request = bfq_prepare_request,
.requeue_request = bfq_finish_requeue_request,
- .finish_request = bfq_finish_requeue_request,
+ .finish_request = bfq_finish_request,
.exit_icq = bfq_exit_icq,
.insert_requests = bfq_insert_requests,
.dispatch_request = bfq_dispatch_request,
diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h
index cd224aaf9f52..467e8cfc41a2 100644
--- a/block/bfq-iosched.h
+++ b/block/bfq-iosched.h
@@ -8,7 +8,6 @@
#include <linux/blktrace_api.h>
#include <linux/hrtimer.h>
-#include <linux/blk-cgroup.h>
#include "blk-cgroup-rwstat.h"
@@ -21,11 +20,10 @@
#define BFQ_DEFAULT_QUEUE_IOPRIO 4
-#define BFQ_WEIGHT_LEGACY_DFL 100
#define BFQ_DEFAULT_GRP_IOPRIO 0
#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE
-#define MAX_PID_STR_LENGTH 12
+#define MAX_BFQQ_NAME_LENGTH 16
/*
* Soft real-time applications are extremely more latency sensitive
@@ -34,6 +32,14 @@
*/
#define BFQ_SOFTRT_WEIGHT_FACTOR 100
+/*
+ * Maximum number of actuators supported. This constant is used simply
+ * to define the size of the static array that will contain
+ * per-actuator data. The current value is hopefully a good upper
+ * bound on the possible number of actuators of any actual drive.
+ */
+#define BFQ_MAX_ACTUATORS 8
+
struct bfq_entity;
/**
@@ -170,6 +176,9 @@ struct bfq_entity {
/* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */
int budget;
+ /* Number of requests allocated in the subtree of this entity */
+ int allocated;
+
/* device weight, if non-zero, it overrides the default weight of
* bfq_group_data */
int dev_weight;
@@ -195,8 +204,13 @@ struct bfq_entity {
/* flag, set to request a weight, ioprio or ioprio_class change */
int prio_changed;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
/* flag, set if the entity is counted in groups_with_pending_reqs */
bool in_groups_with_pending_reqs;
+#endif
+
+ /* last child queue created under this entity (for non-leaf entities) */
+ struct bfq_queue *last_bfqq_created;
};
struct bfq_group;
@@ -220,16 +234,20 @@ struct bfq_ttime {
* struct bfq_queue - leaf schedulable entity.
*
* A bfq_queue is a leaf request queue; it can be associated with an
- * io_context or more, if it is async or shared between cooperating
- * processes. @cgroup holds a reference to the cgroup, to be sure that it
- * does not disappear while a bfqq still references it (mostly to avoid
- * races between request issuing and task migration followed by cgroup
- * destruction).
- * All the fields are protected by the queue lock of the containing bfqd.
+ * io_context or more, if it is async or shared between cooperating
+ * processes. Besides, it contains I/O requests for only one actuator
+ * (an io_context is associated with a different bfq_queue for each
+ * actuator it generates I/O for). @cgroup holds a reference to the
+ * cgroup, to be sure that it does not disappear while a bfqq still
+ * references it (mostly to avoid races between request issuing and
+ * task migration followed by cgroup destruction). All the fields are
+ * protected by the queue lock of the containing bfqd.
*/
struct bfq_queue {
/* reference counter */
int ref;
+ /* counter of references from other queues for delayed stable merge */
+ int stable_ref;
/* parent bfq_data */
struct bfq_data *bfqd;
@@ -261,8 +279,6 @@ struct bfq_queue {
struct request *next_rq;
/* number of sync and async requests queued */
int queued[2];
- /* number of requests currently allocated */
- int allocated;
/* number of pending metadata requests */
int meta_pending;
/* fifo list of requests in sort_list */
@@ -291,6 +307,11 @@ struct bfq_queue {
/* associated @bfq_ttime struct */
struct bfq_ttime ttime;
+ /* when bfqq started to do I/O within the last observation window */
+ u64 io_start_time;
+ /* how long bfqq has remained empty during the last observ. window */
+ u64 tot_idle_time;
+
/* bit vector: a 1 for each seeky requests in history */
u32 seek_history;
@@ -359,9 +380,7 @@ struct bfq_queue {
unsigned long split_time; /* time of last split */
unsigned long first_IO_time; /* time of first I/O for this queue */
-
- /* max service rate measured so far */
- u32 max_service_rate;
+ unsigned long creation_time; /* when this queue is created */
/*
* Pointer to the waker queue for this queue, i.e., to the
@@ -371,6 +390,13 @@ struct bfq_queue {
* bfq_select_queue().
*/
struct bfq_queue *waker_bfqq;
+ /* pointer to the curr. tentative waker queue, see bfq_check_waker() */
+ struct bfq_queue *tentative_waker_bfqq;
+ /* number of times the same tentative waker has been detected */
+ unsigned int num_waker_detections;
+ /* time when we started considering this waker */
+ u64 waker_detection_started;
+
/* node for woken_list, see below */
struct hlist_node woken_list_node;
/*
@@ -380,24 +406,18 @@ struct bfq_queue {
* the woken queues when this queue exits.
*/
struct hlist_head woken_list;
+
+ /* index of the actuator this queue is associated with */
+ unsigned int actuator_idx;
};
/**
- * struct bfq_io_cq - per (request_queue, io_context) structure.
- */
-struct bfq_io_cq {
- /* associated io_cq structure */
- struct io_cq icq; /* must be the first member */
- /* array of two process queues, the sync and the async */
- struct bfq_queue *bfqq[2];
- /* per (request_queue, blkcg) ioprio */
- int ioprio;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- uint64_t blkcg_serial_nr; /* the current blkcg serial */
-#endif
+* struct bfq_iocq_bfqq_data - bfqq data unique and persistent for the associated bfq_io_cq
+*/
+struct bfq_iocq_bfqq_data {
/*
* Snapshot of the has_short_ttime flag before merging; taken
- * to remember its value while the queue is merged, so as to
+ * to remember its values while the queue is merged, so as to
* be able to restore it in case of split.
*/
bool saved_has_short_ttime;
@@ -407,8 +427,11 @@ struct bfq_io_cq {
*/
bool saved_IO_bound;
+ u64 saved_io_start_time;
+ u64 saved_tot_idle_time;
+
/*
- * Same purpose as the previous fields for the value of the
+ * Same purpose as the previous fields for the values of the
* field keeping the queue's belonging to a large burst
*/
bool saved_in_large_burst;
@@ -432,9 +455,53 @@ struct bfq_io_cq {
*/
unsigned long saved_wr_coeff;
unsigned long saved_last_wr_start_finish;
+ unsigned long saved_service_from_wr;
unsigned long saved_wr_start_at_switch_to_srt;
unsigned int saved_wr_cur_max_time;
struct bfq_ttime saved_ttime;
+
+ /* Save also injection state */
+ u64 saved_last_serv_time_ns;
+ unsigned int saved_inject_limit;
+ unsigned long saved_decrease_time_jif;
+
+ /* candidate queue for a stable merge (due to close creation time) */
+ struct bfq_queue *stable_merge_bfqq;
+
+ bool stably_merged; /* non splittable if true */
+};
+
+/**
+ * struct bfq_io_cq - per (request_queue, io_context) structure.
+ */
+struct bfq_io_cq {
+ /* associated io_cq structure */
+ struct io_cq icq; /* must be the first member */
+ /*
+ * Matrix of associated process queues: first row for async
+ * queues, second row sync queues. Each row contains one
+ * column for each actuator. An I/O request generated by the
+ * process is inserted into the queue pointed by bfqq[i][j] if
+ * the request is to be served by the j-th actuator of the
+ * drive, where i==0 or i==1, depending on whether the request
+ * is async or sync. So there is a distinct queue for each
+ * actuator.
+ */
+ struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS];
+ /* per (request_queue, blkcg) ioprio */
+ int ioprio;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ uint64_t blkcg_serial_nr; /* the current blkcg serial */
+#endif
+
+ /*
+ * Persistent data for associated synchronous process queues
+ * (one queue per actuator, see field bfqq above). In
+ * particular, each of these queues may undergo a merge.
+ */
+ struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS];
+
+ unsigned int requests; /* Number of requests this process has in flight */
};
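
A rough sketch of the lookup this matrix implies (the helper name here is invented; the real accessor is the updated bic_to_bfqq() declared further below):

	/* row 0 holds the async queues, row 1 the sync ones */
	static struct bfq_queue *example_bic_lookup(struct bfq_io_cq *bic,
						    bool is_sync,
						    unsigned int actuator_idx)
	{
		return bic->bfqq[is_sync][actuator_idx];
	}
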
/**
@@ -461,28 +528,29 @@ struct bfq_data {
*/
struct rb_root_cached queue_weights_tree;
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
/*
- * Number of groups with at least one descendant process that
+ * Number of groups with at least one process that
* has at least one request waiting for completion. Note that
* this accounts for also requests already dispatched, but not
* yet completed. Therefore this number of groups may differ
* (be larger) than the number of active groups, as a group is
* considered active only if its corresponding entity has
- * descendant queues with at least one request queued. This
+ * queues with at least one request queued. This
* number is used to decide whether a scenario is symmetric.
* For a detailed explanation see comments on the computation
* of the variable asymmetric_scenario in the function
* bfq_better_to_idle().
*
* However, it is hard to compute this number exactly, for
- * groups with multiple descendant processes. Consider a group
- * that is inactive, i.e., that has no descendant process with
+ * groups with multiple processes. Consider a group
+ * that is inactive, i.e., that has no process with
* pending I/O inside BFQ queues. Then suppose that
* num_groups_with_pending_reqs is still accounting for this
- * group, because the group has descendant processes with some
+ * group, because the group has processes with some
* I/O request still in flight. num_groups_with_pending_reqs
* should be decremented when the in-flight request of the
- * last descendant process is finally completed (assuming that
+ * last process is finally completed (assuming that
* nothing else has changed for the group in the meantime, in
* terms of composition of the group and active/inactive state of child
* groups and processes). To accomplish this, an additional
@@ -491,7 +559,7 @@ struct bfq_data {
* we resort to the following tradeoff between simplicity and
* accuracy: for an inactive group that is still counted in
* num_groups_with_pending_reqs, we decrement
- * num_groups_with_pending_reqs when the first descendant
+ * num_groups_with_pending_reqs when the first
* process of the group remains with no request waiting for
* completion.
*
@@ -499,15 +567,16 @@ struct bfq_data {
* carefulness: to avoid multiple decrements, we flag a group,
* more precisely an entity representing a group, as still
* counted in num_groups_with_pending_reqs when it becomes
- * inactive. Then, when the first descendant queue of the
+ * inactive. Then, when the first queue of the
* entity remains with no request waiting for completion,
* num_groups_with_pending_reqs is decremented, and this flag
* is reset. After this flag is reset for the entity,
* num_groups_with_pending_reqs won't be decremented any
- * longer in case a new descendant queue of the entity remains
+ * longer in case a new queue of the entity remains
* with no request waiting for completion.
*/
unsigned int num_groups_with_pending_reqs;
+#endif
/*
* Per-class (RT, BE, IDLE) number of bfq_queues containing
@@ -520,7 +589,12 @@ struct bfq_data {
/* number of queued requests */
int queued;
/* number of requests dispatched and waiting for completion */
- int rq_in_driver;
+ int tot_rq_in_driver;
+ /*
+ * number of requests dispatched and waiting for completion
+ * for each actuator
+ */
+ int rq_in_driver[BFQ_MAX_ACTUATORS];
/* true if the device is non rotational and performs queueing */
bool nonrot_with_queueing;
@@ -559,6 +633,9 @@ struct bfq_data {
/* bfqq owning the last completed rq */
struct bfq_queue *last_completed_rq_bfqq;
+ /* last bfqq created, among those in the root group */
+ struct bfq_queue *last_bfqq_created;
+
/* time of last transition from empty to non-empty (ns) */
u64 last_empty_occupied_ns;
@@ -611,8 +688,13 @@ struct bfq_data {
/* maximum budget allotted to a bfq_queue before rescheduling */
int bfq_max_budget;
- /* list of all the bfq_queues active on the device */
- struct list_head active_list;
+ /*
+ * List of all the bfq_queues active for a specific actuator
+ * on the device. Keeping active queues separate on a
+ * per-actuator basis helps implement per-actuator
+ * injection more efficiently.
+ */
+ struct list_head active_list[BFQ_MAX_ACTUATORS];
/* list of all the bfq_queues idle on the device */
struct list_head idle_list;
@@ -642,14 +724,6 @@ struct bfq_data {
unsigned int bfq_timeout;
/*
- * Number of consecutive requests that must be issued within
- * the idle time slice to set again idling to a queue which
- * was marked as non-I/O-bound (see the definition of the
- * IO_bound flag for further details).
- */
- unsigned int bfq_requests_within_timer;
-
- /*
* Force device idling whenever needed to provide accurate
* service guarantees, without caring about throughput
* issues. CAVEAT: this may even increase latencies, in case
@@ -694,8 +768,6 @@ struct bfq_data {
* is multiplied.
*/
unsigned int bfq_wr_coeff;
- /* maximum duration of a weight-raising period (jiffies) */
- unsigned int bfq_wr_max_time;
/* Maximum weight-raising duration for soft real-time processes */
unsigned int bfq_wr_rt_max_time;
@@ -742,6 +814,43 @@ struct bfq_data {
* function)
*/
unsigned int word_depths[2][2];
+ unsigned int full_depth_shift;
+
+ /*
+ * Number of independent actuators. This is equal to 1 in
+ * case of single-actuator drives.
+ */
+ unsigned int num_actuators;
+ /*
+ * Disk independent access ranges for each actuator
+ * in this device.
+ */
+ sector_t sector[BFQ_MAX_ACTUATORS];
+ sector_t nr_sectors[BFQ_MAX_ACTUATORS];
+ struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS];
+
+ /*
+ * If the number of I/O requests queued in the device for a
+ * given actuator is below next threshold, then the actuator
+ * is deemed as underutilized. If this condition is found to
+ * hold for some actuator upon a dispatch, but (i) the
+ * in-service queue does not contain I/O for that actuator,
+ * while (ii) some other queue does contain I/O for that
+ * actuator, then the head I/O request of the latter queue is
+ * returned (injected), instead of the head request of the
+ * currently in-service queue.
+ *
+ * We set the threshold, empirically, to the minimum possible
+ * value for which an actuator is fully utilized, or close to
+ * be fully utilized. By doing so, injected I/O 'steals' as
+ * few drive-queue slots as possible to the in-service
+ * queue. This reduces as much as possible the probability
+ * that the service of I/O from the in-service bfq_queue gets
+ * delayed because of slot exhaustion, i.e., because all the
+ * slots of the drive queue are filled with I/O injected from
+ * other queues (NCQ provides for 32 slots).
+ */
+ unsigned int actuator_load_threshold;
};
enum bfqq_state_flags {
@@ -770,7 +879,6 @@ enum bfqq_state_flags {
*/
BFQQF_coop, /* bfqq is shared */
BFQQF_split_coop, /* shared bfqq will be split */
- BFQQF_has_waker /* bfqq has a waker queue */
};
#define BFQ_BFQQ_FNS(name) \
@@ -790,7 +898,6 @@ BFQ_BFQQ_FNS(in_large_burst);
BFQ_BFQQ_FNS(coop);
BFQ_BFQQ_FNS(split_coop);
BFQ_BFQQ_FNS(softrt_update);
-BFQ_BFQQ_FNS(has_waker);
#undef BFQ_BFQQ_FNS
/* Expiration reasons. */
@@ -900,19 +1007,20 @@ struct bfq_group {
char blkg_path[128];
/* reference counter (see comments in bfq_bic_update_cgroup) */
- int ref;
+ refcount_t ref;
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- void *bfqd;
+ struct bfq_data *bfqd;
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
- struct bfq_queue *async_idle_bfqq;
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
struct bfq_entity *my_entity;
int active_entities;
+ int num_queues_with_pending_reqs;
struct rb_root rq_pos_tree;
@@ -924,15 +1032,13 @@ struct bfq_group {
struct bfq_entity entity;
struct bfq_sched_data sched_data;
- struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR];
- struct bfq_queue *async_idle_bfqq;
+ struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS];
+ struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS];
struct rb_root rq_pos_tree;
};
#endif
-struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
-
/* --------------- main algorithm interface ----------------- */
#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \
@@ -940,20 +1046,18 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
extern const int bfq_timeout;
-struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync);
-void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync);
+struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync,
+ unsigned int actuator_idx);
+void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync,
+ unsigned int actuator_idx);
struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic);
void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq);
-void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void __bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq,
- struct rb_root_cached *root);
-void bfq_weights_tree_remove(struct bfq_data *bfqd,
- struct bfq_queue *bfqq);
+void bfq_weights_tree_add(struct bfq_queue *bfqq);
+void bfq_weights_tree_remove(struct bfq_queue *bfqq);
void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool compensate, enum bfqq_expiration reason);
void bfq_put_queue(struct bfq_queue *bfqq);
+void bfq_put_cooperator(struct bfq_queue *bfqq);
void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_schedule_dispatch(struct bfq_data *bfqd);
@@ -964,29 +1068,30 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg);
/* ---------------- cgroups-support interface ---------------- */
void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq);
-void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
- unsigned int op);
-void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op);
-void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op);
+void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf);
+void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf);
void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns,
- u64 io_start_time_ns, unsigned int op);
+ u64 io_start_time_ns, blk_opf_t opf);
void bfqg_stats_update_dequeue(struct bfq_group *bfqg);
-void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
-void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg);
-void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq,
struct bfq_group *bfqg);
+#ifdef CONFIG_BFQ_CGROUP_DEBUG
+void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq,
+ blk_opf_t opf);
+void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg);
+void bfqg_stats_update_idle_time(struct bfq_group *bfqg);
+void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg);
+#endif
+
void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg);
void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio);
void bfq_end_wr_async(struct bfq_data *bfqd);
-struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd,
- struct blkcg *blkcg);
+struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio);
struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg);
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node);
-void bfqg_and_blkg_get(struct bfq_group *bfqg);
void bfqg_and_blkg_put(struct bfq_group *bfqg);
#ifdef CONFIG_BFQ_GROUP_IOSCHED
@@ -1026,7 +1131,6 @@ extern struct blkcg_policy blkcg_policy_bfq;
for (parent = NULL; entity ; entity = parent)
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq);
struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity);
unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd);
struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity);
@@ -1051,50 +1155,50 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq);
void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bool expiration);
-void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool expiration);
-void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq);
+void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration);
+void bfq_add_bfqq_busy(struct bfq_queue *bfqq);
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq);
/* --------------- end of interface of B-WF2Q+ ---------------- */
/* Logging facilities. */
-static inline void bfq_pid_to_str(int pid, char *str, int len)
+static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len)
{
- if (pid != -1)
- snprintf(str, len, "%d", pid);
+ char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A';
+
+ if (bfqq->pid != -1)
+ snprintf(str, len, "bfq%d%c", bfqq->pid, type);
else
- snprintf(str, len, "SHARED-");
+ snprintf(str, len, "bfqSHARED-%c", type);
}
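
For example, with the new helper a sync queue belonging to pid 4242 is logged as "bfq4242S", while a shared async queue shows up as "bfqSHARED-A".
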
#ifdef CONFIG_BFQ_GROUP_IOSCHED
struct bfq_group *bfqq_group(struct bfq_queue *bfqq);
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char pid_str[MAX_PID_STR_LENGTH]; \
+ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
- bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
+ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
blk_add_cgroup_trace_msg((bfqd)->queue, \
- bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \
- "bfq%s%c " fmt, pid_str, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \
+ &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \
+ "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \
blk_add_cgroup_trace_msg((bfqd)->queue, \
- bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \
+ &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \
} while (0)
#else /* CONFIG_BFQ_GROUP_IOSCHED */
#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \
- char pid_str[MAX_PID_STR_LENGTH]; \
+ char pid_str[MAX_BFQQ_NAME_LENGTH]; \
if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \
break; \
- bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \
- blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \
- bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \
- ##args); \
+ bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \
+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \
} while (0)
#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0)
diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c
index eb0e2a6daabe..7941b6f07391 100644
--- a/block/bfq-wf2q.c
+++ b/block/bfq-wf2q.c
@@ -137,24 +137,11 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd,
sd->next_in_service = next_in_service;
- if (!next_in_service)
- return parent_sched_may_change;
-
return parent_sched_may_change;
}
#ifdef CONFIG_BFQ_GROUP_IOSCHED
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
-{
- struct bfq_entity *group_entity = bfqq->entity.parent;
-
- if (!group_entity)
- group_entity = &bfqq->bfqd->root_group->entity;
-
- return container_of(group_entity, struct bfq_group, entity);
-}
-
/*
* Returns true if this budget changes may let next_in_service->parent
* become the next_in_service entity for its parent entity.
@@ -231,13 +218,26 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return false;
}
-#else /* CONFIG_BFQ_GROUP_IOSCHED */
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities++;
+}
-struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq)
+static void bfq_dec_active_entities(struct bfq_entity *entity)
{
- return bfqq->bfqd->root_group;
+ struct bfq_sched_data *sd = entity->sched_data;
+ struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data);
+
+ if (bfqg != bfqg->bfqd->root_group)
+ bfqg->active_entities--;
}
+#else /* CONFIG_BFQ_GROUP_IOSCHED */
+
static bool bfq_update_parent_budget(struct bfq_entity *next_in_service)
{
return false;
@@ -248,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity)
return true;
}
+static void bfq_inc_active_entities(struct bfq_entity *entity)
+{
+}
+
+static void bfq_dec_active_entities(struct bfq_entity *entity)
+{
+}
+
#endif /* CONFIG_BFQ_GROUP_IOSCHED */
/*
@@ -474,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node = &entity->rb_node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
bfq_insert(&st->active, entity);
@@ -489,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st,
bfq_update_active_tree(node);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
- list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities++;
-#endif
+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]);
+
+ bfq_inc_active_entities(entity);
}
/**
@@ -508,7 +504,7 @@ static void bfq_active_insert(struct bfq_service_tree *st,
*/
unsigned short bfq_ioprio_to_weight(int ioprio)
{
- return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
+ return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF;
}
/**
@@ -517,12 +513,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio)
*
* To preserve as much as possible the old only-ioprio user interface,
* 0 is used as an escape ioprio value for weights (numerically) equal or
- * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF.
+ * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF.
*/
static unsigned short bfq_weight_to_ioprio(int weight)
{
return max_t(int, 0,
- IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight);
+ IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF);
}
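
A quick worked check of the corrected inverse, assuming the current header values BFQ_WEIGHT_CONVERSION_COEFF == 10 and IOPRIO_NR_LEVELS == 8:

	/*
	 * ioprio 4  ->  weight  (8 - 4) * 10 = 40
	 * weight 40 ->  ioprio  8 - 40 / 10  = 4   (round-trips correctly)
	 *
	 * The old expression, 8 * 10 - 40 = 40, produced a value far outside
	 * the 0..7 ioprio range.
	 */
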
static void bfq_get_entity(struct bfq_entity *entity)
@@ -533,9 +529,7 @@ static void bfq_get_entity(struct bfq_entity *entity)
bfqq->ref++;
bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d",
bfqq, bfqq->ref);
- } else
- bfqg_and_blkg_get(container_of(entity, struct bfq_group,
- entity));
+ }
}
/**
@@ -578,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st,
{
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
struct rb_node *node;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd = NULL;
- struct bfq_group *bfqg = NULL;
- struct bfq_data *bfqd = NULL;
-#endif
node = bfq_find_deepest(&entity->rb_node);
bfq_extract(&st->active, entity);
if (node)
bfq_update_active_tree(node);
-
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- sd = entity->sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
-#endif
if (bfqq)
list_del(&bfqq->bfqq_list);
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (bfqg != bfqd->root_group)
- bfqg->active_entities--;
-#endif
+
+ bfq_dec_active_entities(entity);
}
/**
@@ -649,14 +630,8 @@ static void bfq_forget_entity(struct bfq_service_tree *st,
entity->on_st_or_in_serv = false;
st->wsum -= entity->weight;
- if (is_in_service)
- return;
-
- if (bfqq)
+ if (bfqq && !is_in_service)
bfq_put_queue(bfqq);
- else
- bfqg_and_blkg_put(container_of(entity, struct bfq_group,
- entity));
}
/**
@@ -732,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
if (entity->prio_changed) {
struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity);
unsigned int prev_weight, new_weight;
- struct bfq_data *bfqd = NULL;
- struct rb_root_cached *root;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- struct bfq_sched_data *sd;
- struct bfq_group *bfqg;
-#endif
-
- if (bfqq)
- bfqd = bfqq->bfqd;
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- else {
- sd = entity->my_sched_data;
- bfqg = container_of(sd, struct bfq_group, sched_data);
- bfqd = (struct bfq_data *)bfqg->bfqd;
- }
-#endif
/* Matches the smp_wmb() in bfq_group_set_weight. */
smp_rmb();
@@ -796,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st,
* queue, remove the entity from its old weight counter (if
* there is a counter associated with the entity).
*/
- if (prev_weight != new_weight && bfqq) {
- root = &bfqd->queue_weights_tree;
- __bfq_weights_tree_remove(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq)
+ bfq_weights_tree_remove(bfqq);
entity->weight = new_weight;
/*
* Add the entity, if it is not a weight-raised queue,
* to the counter associated with its new weight.
*/
- if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) {
- /* If we get here, root has been initialized. */
- bfq_weights_tree_add(bfqd, bfqq, root);
- }
+ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1)
+ bfq_weights_tree_add(bfqq);
new_st->wsum += entity->weight;
@@ -1010,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity,
entity->on_st_or_in_serv = true;
}
-#ifdef CONFIG_BFQ_GROUP_IOSCHED
- if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */
- struct bfq_group *bfqg =
- container_of(entity, struct bfq_group, entity);
- struct bfq_data *bfqd = bfqg->bfqd;
-
- if (!entity->in_groups_with_pending_reqs) {
- entity->in_groups_with_pending_reqs = true;
- bfqd->num_groups_with_pending_reqs++;
- }
- }
-#endif
-
bfq_update_fin_time_enqueue(entity, st, backshifted);
}
@@ -1108,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity)
}
static void __bfq_activate_requeue_entity(struct bfq_entity *entity,
- struct bfq_sched_data *sd,
bool non_blocking_wait_rq)
{
struct bfq_service_tree *st = bfq_entity_service_tree(entity);
- if (sd->in_service_entity == entity || entity->tree == &st->active)
+ if (entity->sched_data->in_service_entity == entity ||
+ entity->tree == &st->active)
/*
* in service or already queued on the active tree,
* requeue or reposition
@@ -1145,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity,
bool non_blocking_wait_rq,
bool requeue, bool expiration)
{
- struct bfq_sched_data *sd;
-
for_each_entity(entity) {
- sd = entity->sched_data;
- __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq);
-
- if (!bfq_update_next_in_service(sd, entity, expiration) &&
- !requeue)
+ __bfq_activate_requeue_entity(entity, non_blocking_wait_rq);
+ if (!bfq_update_next_in_service(entity->sched_data, entity,
+ expiration) && !requeue)
break;
}
}
@@ -1386,6 +1324,8 @@ left:
/**
* __bfq_lookup_next_entity - return the first eligible entity in @st.
* @st: the service tree.
+ * @in_service: whether or not there is an in-service entity for the sched_data
+ * this active tree belongs to.
*
* If there is no in-service entity for the sched_data st belongs to,
* then return the entity that will be set in service if:
@@ -1498,9 +1438,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd,
break;
}
- if (!entity)
- return NULL;
-
return entity;
}
@@ -1673,14 +1610,41 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfqq == bfqd->in_service_queue, expiration);
}
+void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (!entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = true;
+ if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++))
+ bfqq->bfqd->num_groups_with_pending_reqs++;
+ }
+#endif
+}
+
+void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq)
+{
+#ifdef CONFIG_BFQ_GROUP_IOSCHED
+ struct bfq_entity *entity = &bfqq->entity;
+
+ if (entity->in_groups_with_pending_reqs) {
+ entity->in_groups_with_pending_reqs = false;
+ if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs))
+ bfqq->bfqd->num_groups_with_pending_reqs--;
+ }
+#endif
+}
+
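
The intended effect of the pairing above can be illustrated like this (illustrative comment only, not extra kernel code):

	/*
	 * Three queues of the same group becoming busy:
	 *
	 *   bfq_add_bfqq_in_groups_with_pending_reqs(q1);  // group counter 0 -> 1,
	 *                                                  // num_groups_with_pending_reqs++
	 *   bfq_add_bfqq_in_groups_with_pending_reqs(q2);  // 1 -> 2, no group-level change
	 *   bfq_add_bfqq_in_groups_with_pending_reqs(q3);  // 2 -> 3, no group-level change
	 *
	 * num_groups_with_pending_reqs drops again only when the last of the
	 * three queues is removed by bfq_del_bfqq_in_groups_with_pending_reqs().
	 */
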
/*
* Called when the bfqq no longer has requests pending, remove it from
* the service tree. As a special case, it can be invoked during an
* expiration.
*/
-void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
- bool expiration)
+void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration)
{
+ struct bfq_data *bfqd = bfqq->bfqd;
+
bfq_log_bfqq(bfqd, bfqq, "del from busy");
bfq_clear_bfqq_busy(bfqq);
@@ -1694,15 +1658,23 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq,
bfq_deactivate_bfqq(bfqd, bfqq, true, expiration);
- if (!bfqq->dispatched)
- bfq_weights_tree_remove(bfqd, bfqq);
+ if (!bfqq->dispatched) {
+ bfq_del_bfqq_in_groups_with_pending_reqs(bfqq);
+ /*
+ * Next function is invoked last, because it causes bfqq to be
+ * freed. DO NOT use bfqq after the next function invocation.
+ */
+ bfq_weights_tree_remove(bfqq);
+ }
}
/*
* Called when an inactive queue receives a new request.
*/
-void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
+void bfq_add_bfqq_busy(struct bfq_queue *bfqq)
{
+ struct bfq_data *bfqd = bfqq->bfqd;
+
bfq_log_bfqq(bfqd, bfqq, "add to busy");
bfq_activate_bfqq(bfqd, bfqq);
@@ -1710,11 +1682,20 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq)
bfq_mark_bfqq_busy(bfqq);
bfqd->busy_queues[bfqq->ioprio_class - 1]++;
- if (!bfqq->dispatched)
+ if (!bfqq->dispatched) {
+ bfq_add_bfqq_in_groups_with_pending_reqs(bfqq);
if (bfqq->wr_coeff == 1)
- bfq_weights_tree_add(bfqd, bfqq,
- &bfqd->queue_weights_tree);
+ bfq_weights_tree_add(bfqq);
+ }
if (bfqq->wr_coeff > 1)
bfqd->wr_busy_queues++;
+
+ /* Move bfqq to the head of the woken list of its waker */
+ if (!hlist_unhashed(&bfqq->woken_list_node) &&
+ &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) {
+ hlist_del_init(&bfqq->woken_list_node);
+ hlist_add_head(&bfqq->woken_list_node,
+ &bfqq->waker_bfqq->woken_list);
+ }
}
diff --git a/block/bio-integrity.c b/block/bio-integrity.c
index 9ffd7e289554..c9a16fba58b9 100644
--- a/block/bio-integrity.c
+++ b/block/bio-integrity.c
@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/mempool.h>
#include <linux/export.h>
#include <linux/bio.h>
@@ -14,8 +14,6 @@
#include <linux/slab.h>
#include "blk.h"
-#define BIP_INLINE_VECS 4
-
static struct kmem_cache *bip_slab;
static struct workqueue_struct *kintegrityd_wq;
@@ -30,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs,
if (bs && mempool_initialized(&bs->bio_integrity_pool)) {
if (bip->bip_vec)
bvec_free(&bs->bvec_integrity_pool, bip->bip_vec,
- bip->bip_slab);
+ bip->bip_max_vcnt);
mempool_free(bip, &bs->bio_integrity_pool);
} else {
kfree(bip);
@@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
inline_vecs = nr_vecs;
} else {
bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask);
- inline_vecs = BIP_INLINE_VECS;
+ inline_vecs = BIO_INLINE_VECS;
}
if (unlikely(!bip))
@@ -71,18 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio,
memset(bip, 0, sizeof(*bip));
+ /* always report as many vecs as asked explicitly, not inline vecs */
+ bip->bip_max_vcnt = nr_vecs;
if (nr_vecs > inline_vecs) {
- unsigned long idx = 0;
-
- bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx,
- &bs->bvec_integrity_pool);
+ bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool,
+ &bip->bip_max_vcnt, gfp_mask);
if (!bip->bip_vec)
goto err;
- bip->bip_max_vcnt = bvec_nr_vecs(idx);
- bip->bip_slab = idx;
} else {
bip->bip_vec = bip->bip_inline_vecs;
- bip->bip_max_vcnt = inline_vecs;
}
bip->bip_bio = bio;
@@ -96,6 +91,47 @@ err:
}
EXPORT_SYMBOL(bio_integrity_alloc);
+static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs,
+ bool dirty)
+{
+ int i;
+
+ for (i = 0; i < nr_vecs; i++) {
+ if (dirty && !PageCompound(bv[i].bv_page))
+ set_page_dirty_lock(bv[i].bv_page);
+ unpin_user_page(bv[i].bv_page);
+ }
+}
+
+static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip)
+{
+ unsigned short nr_vecs = bip->bip_max_vcnt - 1;
+ struct bio_vec *copy = &bip->bip_vec[1];
+ size_t bytes = bip->bip_iter.bi_size;
+ struct iov_iter iter;
+ int ret;
+
+ iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes);
+ ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter);
+ WARN_ON_ONCE(ret != bytes);
+
+ bio_integrity_unpin_bvec(copy, nr_vecs, true);
+}
+
+static void bio_integrity_unmap_user(struct bio_integrity_payload *bip)
+{
+ bool dirty = bio_data_dir(bip->bip_bio) == READ;
+
+ if (bip->bip_flags & BIP_COPY_USER) {
+ if (dirty)
+ bio_integrity_uncopy_user(bip);
+ kfree(bvec_virt(bip->bip_vec));
+ return;
+ }
+
+ bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty);
+}
+
/**
* bio_integrity_free - Free bio integrity payload
* @bio: bio containing bip to be freed
@@ -109,8 +145,9 @@ void bio_integrity_free(struct bio *bio)
struct bio_set *bs = bio->bi_pool;
if (bip->bip_flags & BIP_BLOCK_INTEGRITY)
- kfree(page_address(bip->bip_vec->bv_page) +
- bip->bip_vec->bv_offset);
+ kfree(bvec_virt(bip->bip_vec));
+ else if (bip->bip_flags & BIP_INTEGRITY_USER)
+ bio_integrity_unmap_user(bip);
__bio_integrity_free(bs, bip);
bio->bi_integrity = NULL;
@@ -129,30 +166,214 @@ void bio_integrity_free(struct bio *bio)
int bio_integrity_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int offset)
{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct bio_vec *iv;
- if (bip->bip_vcnt >= bip->bip_max_vcnt) {
- printk(KERN_ERR "%s: bip_vec full\n", __func__);
+ if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) >
+ queue_max_hw_sectors(q))
return 0;
- }
- iv = bip->bip_vec + bip->bip_vcnt;
+ if (bip->bip_vcnt > 0) {
+ struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1];
+ bool same_page = false;
- if (bip->bip_vcnt &&
- bvec_gap_to_prev(bio->bi_disk->queue,
- &bip->bip_vec[bip->bip_vcnt - 1], offset))
- return 0;
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+ &same_page)) {
+ bip->bip_iter.bi_size += len;
+ return len;
+ }
+
+ if (bip->bip_vcnt >=
+ min(bip->bip_max_vcnt, queue_max_integrity_segments(q)))
+ return 0;
- iv->bv_page = page;
- iv->bv_len = len;
- iv->bv_offset = offset;
+ /*
+ * If the queue doesn't support SG gaps and adding this segment
+ * would create a gap, disallow it.
+ */
+ if (bvec_gap_to_prev(&q->limits, bv, offset))
+ return 0;
+ }
+
+ bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset);
bip->bip_vcnt++;
+ bip->bip_iter.bi_size += len;
return len;
}
EXPORT_SYMBOL(bio_integrity_add_page);
+static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len,
+ unsigned int direction, u32 seed)
+{
+ bool write = direction == ITER_SOURCE;
+ struct bio_integrity_payload *bip;
+ struct iov_iter iter;
+ void *buf;
+ int ret;
+
+ buf = kmalloc(len, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ if (write) {
+ iov_iter_bvec(&iter, direction, bvec, nr_vecs, len);
+ if (!copy_from_iter_full(buf, len, &iter)) {
+ ret = -EFAULT;
+ goto free_buf;
+ }
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, 1);
+ } else {
+ memset(buf, 0, len);
+
+ /*
+ * We need to preserve the original bvec and the number of vecs
+ * in it for completion handling
+ */
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1);
+ }
+
+ if (IS_ERR(bip)) {
+ ret = PTR_ERR(bip);
+ goto free_buf;
+ }
+
+ if (write)
+ bio_integrity_unpin_bvec(bvec, nr_vecs, false);
+ else
+ memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec));
+
+ ret = bio_integrity_add_page(bio, virt_to_page(buf), len,
+ offset_in_page(buf));
+ if (ret != len) {
+ ret = -ENOMEM;
+ goto free_bip;
+ }
+
+ bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER;
+ bip->bip_iter.bi_sector = seed;
+ return 0;
+free_bip:
+ bio_integrity_free(bio);
+free_buf:
+ kfree(buf);
+ return ret;
+}
+
+static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec,
+ int nr_vecs, unsigned int len, u32 seed)
+{
+ struct bio_integrity_payload *bip;
+
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs);
+ if (IS_ERR(bip))
+ return PTR_ERR(bip);
+
+ memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec));
+ bip->bip_flags |= BIP_INTEGRITY_USER;
+ bip->bip_iter.bi_sector = seed;
+ bip->bip_iter.bi_size = len;
+ return 0;
+}
+
+static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages,
+ int nr_vecs, ssize_t bytes, ssize_t offset)
+{
+ unsigned int nr_bvecs = 0;
+ int i, j;
+
+ for (i = 0; i < nr_vecs; i = j) {
+ size_t size = min_t(size_t, bytes, PAGE_SIZE - offset);
+ struct folio *folio = page_folio(pages[i]);
+
+ bytes -= size;
+ for (j = i + 1; j < nr_vecs; j++) {
+ size_t next = min_t(size_t, PAGE_SIZE, bytes);
+
+ if (page_folio(pages[j]) != folio ||
+ pages[j] != pages[j - 1] + 1)
+ break;
+ unpin_user_page(pages[j]);
+ size += next;
+ bytes -= next;
+ }
+
+ bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset);
+ offset = 0;
+ nr_bvecs++;
+ }
+
+ return nr_bvecs;
+}
+
+int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes,
+ u32 seed)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ unsigned int align = q->dma_pad_mask | queue_dma_alignment(q);
+ struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages;
+ struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec;
+ unsigned int direction, nr_bvecs;
+ struct iov_iter iter;
+ int ret, nr_vecs;
+ size_t offset;
+ bool copy;
+
+ if (bio_integrity(bio))
+ return -EINVAL;
+ if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q))
+ return -E2BIG;
+
+ if (bio_data_dir(bio) == READ)
+ direction = ITER_DEST;
+ else
+ direction = ITER_SOURCE;
+
+ iov_iter_ubuf(&iter, direction, ubuf, bytes);
+ nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1);
+ if (nr_vecs > BIO_MAX_VECS)
+ return -E2BIG;
+ if (nr_vecs > UIO_FASTIOV) {
+ bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL);
+ if (!bvec)
+ return -ENOMEM;
+ pages = NULL;
+ }
+
+ copy = !iov_iter_is_aligned(&iter, align, align);
+ ret = iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset);
+ if (unlikely(ret < 0))
+ goto free_bvec;
+
+ nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset);
+ if (pages != stack_pages)
+ kvfree(pages);
+ if (nr_bvecs > queue_max_integrity_segments(q))
+ copy = true;
+
+ if (copy)
+ ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes,
+ direction, seed);
+ else
+ ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed);
+ if (ret)
+ goto release_pages;
+ if (bvec != stack_vec)
+ kfree(bvec);
+
+ return 0;
+
+release_pages:
+ bio_integrity_unpin_bvec(bvec, nr_bvecs, false);
+free_bvec:
+ if (bvec != stack_vec)
+ kfree(bvec);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bio_integrity_map_user);
+
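
A minimal caller sketch for the new export, assuming a passthrough-style driver that has already mapped the data payload (function and variable names are illustrative):

	static int example_attach_user_pi(struct bio *bio, void __user *umeta,
					  ssize_t umeta_len, u32 seed)
	{
		/* pins the user metadata, or bounce-copies it (BIP_COPY_USER) */
		return bio_integrity_map_user(bio, umeta, umeta_len, seed);
	}
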
/**
* bio_integrity_process - Process integrity metadata for a bio
* @bio: bio to generate/verify integrity metadata for
@@ -162,33 +383,30 @@ EXPORT_SYMBOL(bio_integrity_add_page);
static blk_status_t bio_integrity_process(struct bio *bio,
struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct blk_integrity_iter iter;
struct bvec_iter bviter;
struct bio_vec bv;
struct bio_integrity_payload *bip = bio_integrity(bio);
blk_status_t ret = BLK_STS_OK;
- void *prot_buf = page_address(bip->bip_vec->bv_page) +
- bip->bip_vec->bv_offset;
- iter.disk_name = bio->bi_disk->disk_name;
+ iter.disk_name = bio->bi_bdev->bd_disk->disk_name;
iter.interval = 1 << bi->interval_exp;
+ iter.tuple_size = bi->tuple_size;
iter.seed = proc_iter->bi_sector;
- iter.prot_buf = prot_buf;
+ iter.prot_buf = bvec_virt(bip->bip_vec);
__bio_for_each_segment(bv, bio, bviter, *proc_iter) {
- void *kaddr = kmap_atomic(bv.bv_page);
+ void *kaddr = bvec_kmap_local(&bv);
- iter.data_buf = kaddr + bv.bv_offset;
+ iter.data_buf = kaddr;
iter.data_size = bv.bv_len;
-
ret = proc_fn(&iter);
- if (ret) {
- kunmap_atomic(kaddr);
- return ret;
- }
+ kunmap_local(kaddr);
+
+ if (ret)
+ break;
- kunmap_atomic(kaddr);
}
return ret;
}
@@ -208,14 +426,11 @@ static blk_status_t bio_integrity_process(struct bio *bio,
bool bio_integrity_prep(struct bio *bio)
{
struct bio_integrity_payload *bip;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
- struct request_queue *q = bio->bi_disk->queue;
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
void *buf;
unsigned long start, end;
unsigned int len, nr_pages;
unsigned int bytes, offset, i;
- unsigned int intervals;
- blk_status_t status;
if (!bi)
return true;
@@ -239,12 +454,10 @@ bool bio_integrity_prep(struct bio *bio)
!(bi->flags & BLK_INTEGRITY_GENERATE))
return true;
}
- intervals = bio_integrity_intervals(bi, bio_sectors(bio));
/* Allocate kernel buffer for protection data */
- len = intervals * bi->tuple_size;
- buf = kmalloc(len, GFP_NOIO | q->bounce_gfp);
- status = BLK_STS_RESOURCE;
+ len = bio_integrity_bytes(bi, bio_sectors(bio));
+ buf = kmalloc(len, GFP_NOIO);
if (unlikely(buf == NULL)) {
printk(KERN_ERR "could not allocate integrity buffer\n");
goto err_end_io;
@@ -259,12 +472,10 @@ bool bio_integrity_prep(struct bio *bio)
if (IS_ERR(bip)) {
printk(KERN_ERR "could not allocate data integrity bioset\n");
kfree(buf);
- status = BLK_STS_RESOURCE;
goto err_end_io;
}
bip->bip_flags |= BIP_BLOCK_INTEGRITY;
- bip->bip_iter.bi_size = len;
bip_set_seed(bip, bio->bi_iter.bi_sector);
if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM)
@@ -272,28 +483,18 @@ bool bio_integrity_prep(struct bio *bio)
/* Map it */
offset = offset_in_page(buf);
- for (i = 0 ; i < nr_pages ; i++) {
- int ret;
+ for (i = 0; i < nr_pages && len > 0; i++) {
bytes = PAGE_SIZE - offset;
- if (len <= 0)
- break;
-
if (bytes > len)
bytes = len;
- ret = bio_integrity_add_page(bio, virt_to_page(buf),
- bytes, offset);
-
- if (ret == 0) {
+ if (bio_integrity_add_page(bio, virt_to_page(buf),
+ bytes, offset) < bytes) {
printk(KERN_ERR "could not attach integrity payload\n");
- status = BLK_STS_RESOURCE;
goto err_end_io;
}
- if (ret < bytes)
- break;
-
buf += bytes;
len -= bytes;
offset = 0;
@@ -309,10 +510,9 @@ bool bio_integrity_prep(struct bio *bio)
return true;
err_end_io:
- bio->bi_status = status;
+ bio->bi_status = BLK_STS_RESOURCE;
bio_endio(bio);
return false;
-
}
EXPORT_SYMBOL(bio_integrity_prep);
@@ -329,7 +529,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
struct bio_integrity_payload *bip =
container_of(work, struct bio_integrity_payload, bip_work);
struct bio *bio = bip->bip_bio;
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
/*
* At the moment verify is called bio's iterator was advanced
@@ -355,7 +555,7 @@ static void bio_integrity_verify_fn(struct work_struct *work)
*/
bool __bio_integrity_endio(struct bio *bio)
{
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
struct bio_integrity_payload *bip = bio_integrity(bio);
if (bio_op(bio) == REQ_OP_READ && !bio->bi_status &&
@@ -381,10 +581,10 @@ bool __bio_integrity_endio(struct bio *bio)
void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9);
- bip->bip_iter.bi_sector += bytes_done >> 9;
+ bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9);
bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes);
}
@@ -397,7 +597,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done)
void bio_integrity_trim(struct bio *bio)
{
struct bio_integrity_payload *bip = bio_integrity(bio);
- struct blk_integrity *bi = blk_get_integrity(bio->bi_disk);
+ struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk);
bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio));
}
@@ -428,10 +628,10 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src,
bip->bip_vcnt = bip_src->bip_vcnt;
bip->bip_iter = bip_src->bip_iter;
+ bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY;
return 0;
}
-EXPORT_SYMBOL(bio_integrity_clone);
int bioset_integrity_create(struct bio_set *bs, int pool_size)
{
@@ -470,6 +670,6 @@ void __init bio_integrity_init(void)
bip_slab = kmem_cache_create("bio_integrity_payload",
sizeof(struct bio_integrity_payload) +
- sizeof(struct bio_vec) * BIP_INLINE_VECS,
+ sizeof(struct bio_vec) * BIO_INLINE_VECS,
0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
}
diff --git a/block/bio.c b/block/bio.c
index 496aff366767..b9642a41f286 100644
--- a/block/bio.c
+++ b/block/bio.c
@@ -15,31 +15,54 @@
#include <linux/mempool.h>
#include <linux/workqueue.h>
#include <linux/cgroup.h>
-#include <linux/blk-cgroup.h>
#include <linux/highmem.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
+#include <linux/xarray.h>
#include <trace/events/block.h>
#include "blk.h"
#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
-/*
- * Test patch to inline a certain number of bi_io_vec's inside the bio
- * itself, to shrink a bio data allocation from two mempool calls to one
- */
-#define BIO_INLINE_VECS 4
+#define ALLOC_CACHE_THRESHOLD 16
+#define ALLOC_CACHE_MAX 256
-/*
- * if you change this list, also change bvec_alloc or things will
- * break badly! cannot be bigger than what you can fit into an
- * unsigned short
- */
-#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n }
-static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = {
- BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max),
+struct bio_alloc_cache {
+ struct bio *free_list;
+ struct bio *free_list_irq;
+ unsigned int nr;
+ unsigned int nr_irq;
};
-#undef BV
+
+static struct biovec_slab {
+ int nr_vecs;
+ char *name;
+ struct kmem_cache *slab;
+} bvec_slabs[] __read_mostly = {
+ { .nr_vecs = 16, .name = "biovec-16" },
+ { .nr_vecs = 64, .name = "biovec-64" },
+ { .nr_vecs = 128, .name = "biovec-128" },
+ { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" },
+};
+
+static struct biovec_slab *biovec_slab(unsigned short nr_vecs)
+{
+ switch (nr_vecs) {
+ /* smaller bios use inline vecs */
+ case 5 ... 16:
+ return &bvec_slabs[0];
+ case 17 ... 64:
+ return &bvec_slabs[1];
+ case 65 ... 128:
+ return &bvec_slabs[2];
+ case 129 ... BIO_MAX_VECS:
+ return &bvec_slabs[3];
+ default:
+ BUG();
+ return NULL;
+ }
+}
/*
* fs_bio_set is the bio_set containing bio and iovec memory pools used by
@@ -58,184 +81,144 @@ struct bio_slab {
char name[8];
};
static DEFINE_MUTEX(bio_slab_lock);
-static struct bio_slab *bio_slabs;
-static unsigned int bio_slab_nr, bio_slab_max;
+static DEFINE_XARRAY(bio_slabs);
-static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size)
+static struct bio_slab *create_bio_slab(unsigned int size)
{
- unsigned int sz = sizeof(struct bio) + extra_size;
- struct kmem_cache *slab = NULL;
- struct bio_slab *bslab, *new_bio_slabs;
- unsigned int new_bio_slab_max;
- unsigned int i, entry = -1;
+ struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL);
- mutex_lock(&bio_slab_lock);
+ if (!bslab)
+ return NULL;
- i = 0;
- while (i < bio_slab_nr) {
- bslab = &bio_slabs[i];
+ snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size);
+ bslab->slab = kmem_cache_create(bslab->name, size,
+ ARCH_KMALLOC_MINALIGN,
+ SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL);
+ if (!bslab->slab)
+ goto fail_alloc_slab;
- if (!bslab->slab && entry == -1)
- entry = i;
- else if (bslab->slab_size == sz) {
- slab = bslab->slab;
- bslab->slab_ref++;
- break;
- }
- i++;
- }
+ bslab->slab_ref = 1;
+ bslab->slab_size = size;
- if (slab)
- goto out_unlock;
-
- if (bio_slab_nr == bio_slab_max && entry == -1) {
- new_bio_slab_max = bio_slab_max << 1;
- new_bio_slabs = krealloc(bio_slabs,
- new_bio_slab_max * sizeof(struct bio_slab),
- GFP_KERNEL);
- if (!new_bio_slabs)
- goto out_unlock;
- bio_slab_max = new_bio_slab_max;
- bio_slabs = new_bio_slabs;
- }
- if (entry == -1)
- entry = bio_slab_nr++;
+ if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL)))
+ return bslab;
- bslab = &bio_slabs[entry];
+ kmem_cache_destroy(bslab->slab);
- snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry);
- slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN,
- SLAB_HWCACHE_ALIGN, NULL);
- if (!slab)
- goto out_unlock;
+fail_alloc_slab:
+ kfree(bslab);
+ return NULL;
+}
- bslab->slab = slab;
- bslab->slab_ref = 1;
- bslab->slab_size = sz;
-out_unlock:
+static inline unsigned int bs_bio_slab_size(struct bio_set *bs)
+{
+ return bs->front_pad + sizeof(struct bio) + bs->back_pad;
+}
+
+static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs)
+{
+ unsigned int size = bs_bio_slab_size(bs);
+ struct bio_slab *bslab;
+
+ mutex_lock(&bio_slab_lock);
+ bslab = xa_load(&bio_slabs, size);
+ if (bslab)
+ bslab->slab_ref++;
+ else
+ bslab = create_bio_slab(size);
mutex_unlock(&bio_slab_lock);
- return slab;
+
+ if (bslab)
+ return bslab->slab;
+ return NULL;
}
static void bio_put_slab(struct bio_set *bs)
{
struct bio_slab *bslab = NULL;
- unsigned int i;
+ unsigned int slab_size = bs_bio_slab_size(bs);
mutex_lock(&bio_slab_lock);
- for (i = 0; i < bio_slab_nr; i++) {
- if (bs->bio_slab == bio_slabs[i].slab) {
- bslab = &bio_slabs[i];
- break;
- }
- }
-
+ bslab = xa_load(&bio_slabs, slab_size);
if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n"))
goto out;
+ WARN_ON_ONCE(bslab->slab != bs->bio_slab);
+
WARN_ON(!bslab->slab_ref);
if (--bslab->slab_ref)
goto out;
+ xa_erase(&bio_slabs, slab_size);
+
kmem_cache_destroy(bslab->slab);
- bslab->slab = NULL;
+ kfree(bslab);
out:
mutex_unlock(&bio_slab_lock);
}
-unsigned int bvec_nr_vecs(unsigned short idx)
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs)
{
- return bvec_slabs[--idx].nr_vecs;
-}
+ BUG_ON(nr_vecs > BIO_MAX_VECS);
-void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx)
-{
- if (!idx)
- return;
- idx--;
-
- BIO_BUG_ON(idx >= BVEC_POOL_NR);
-
- if (idx == BVEC_POOL_MAX) {
+ if (nr_vecs == BIO_MAX_VECS)
mempool_free(bv, pool);
- } else {
- struct biovec_slab *bvs = bvec_slabs + idx;
+ else if (nr_vecs > BIO_INLINE_VECS)
+ kmem_cache_free(biovec_slab(nr_vecs)->slab, bv);
+}
- kmem_cache_free(bvs->slab, bv);
- }
+/*
+ * Make the first allocation restricted and don't dump info on allocation
+ * failures, since we'll fall back to the mempool in case of failure.
+ */
+static inline gfp_t bvec_alloc_gfp(gfp_t gfp)
+{
+ return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) |
+ __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
}
-struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx,
- mempool_t *pool)
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+ gfp_t gfp_mask)
{
- struct bio_vec *bvl;
+ struct biovec_slab *bvs = biovec_slab(*nr_vecs);
- /*
- * see comment near bvec_array define!
- */
- switch (nr) {
- case 1:
- *idx = 0;
- break;
- case 2 ... 4:
- *idx = 1;
- break;
- case 5 ... 16:
- *idx = 2;
- break;
- case 17 ... 64:
- *idx = 3;
- break;
- case 65 ... 128:
- *idx = 4;
- break;
- case 129 ... BIO_MAX_PAGES:
- *idx = 5;
- break;
- default:
+ if (WARN_ON_ONCE(!bvs))
return NULL;
- }
/*
- * idx now points to the pool we want to allocate from. only the
- * 1-vec entry pool is mempool backed.
+ * Upgrade the nr_vecs request to take full advantage of the allocation.
+ * We also rely on this in the bvec_free path.
*/
- if (*idx == BVEC_POOL_MAX) {
-fallback:
- bvl = mempool_alloc(pool, gfp_mask);
- } else {
- struct biovec_slab *bvs = bvec_slabs + *idx;
- gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO);
+ *nr_vecs = bvs->nr_vecs;
- /*
- * Make this allocation restricted and don't dump info on
- * allocation failures, since we'll fallback to the mempool
- * in case of failure.
- */
- __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN;
+ /*
+ * Try a slab allocation first for all smaller allocations. If that
+ * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool.
+ * The mempool is sized to handle up to BIO_MAX_VECS entries.
+ */
+ if (*nr_vecs < BIO_MAX_VECS) {
+ struct bio_vec *bvl;
- /*
- * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM
- * is set, retry with the 1-entry mempool
- */
- bvl = kmem_cache_alloc(bvs->slab, __gfp_mask);
- if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) {
- *idx = BVEC_POOL_MAX;
- goto fallback;
- }
+ bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask));
+ if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM))
+ return bvl;
+ *nr_vecs = BIO_MAX_VECS;
}
- (*idx)++;
- return bvl;
+ return mempool_alloc(pool, gfp_mask);
}
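
A worked example of the upgrade logic above, derived from the slab table at the top of the file:

	/*
	 * A request for 20 vectors selects the "biovec-64" slab, so *nr_vecs is
	 * bumped to 64; the later bvec_free(pool, bv, 64) then resolves to the
	 * same slab, which is why callers must pass back the upgraded count.
	 */
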
void bio_uninit(struct bio *bio)
{
- bio_disassociate_blkg(bio);
-
+#ifdef CONFIG_BLK_CGROUP
+ if (bio->bi_blkg) {
+ blkg_put(bio->bi_blkg);
+ bio->bi_blkg = NULL;
+ }
+#endif
if (bio_integrity(bio))
bio_integrity_free(bio);
@@ -246,24 +229,13 @@ EXPORT_SYMBOL(bio_uninit);
static void bio_free(struct bio *bio)
{
struct bio_set *bs = bio->bi_pool;
- void *p;
+ void *p = bio;
- bio_uninit(bio);
+ WARN_ON_ONCE(!bs);
- if (bs) {
- bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio));
-
- /*
- * If we have front padding, adjust the bio pointer before freeing
- */
- p = bio;
- p -= bs->front_pad;
-
- mempool_free(p, &bs->bio_pool);
- } else {
- /* Bio was allocated by bio_kmalloc() */
- kfree(bio);
- }
+ bio_uninit(bio);
+ bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs);
+ mempool_free(p - bs->front_pad, &bs->bio_pool);
}
/*
@@ -271,21 +243,53 @@ static void bio_free(struct bio *bio)
* they must remember to pair any call to bio_init() with bio_uninit()
* when IO has completed, or when the bio is released.
*/
-void bio_init(struct bio *bio, struct bio_vec *table,
- unsigned short max_vecs)
+void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table,
+ unsigned short max_vecs, blk_opf_t opf)
{
- memset(bio, 0, sizeof(*bio));
+ bio->bi_next = NULL;
+ bio->bi_bdev = bdev;
+ bio->bi_opf = opf;
+ bio->bi_flags = 0;
+ bio->bi_ioprio = 0;
+ bio->bi_status = 0;
+ bio->bi_iter.bi_sector = 0;
+ bio->bi_iter.bi_size = 0;
+ bio->bi_iter.bi_idx = 0;
+ bio->bi_iter.bi_bvec_done = 0;
+ bio->bi_end_io = NULL;
+ bio->bi_private = NULL;
+#ifdef CONFIG_BLK_CGROUP
+ bio->bi_blkg = NULL;
+ bio->bi_issue.value = 0;
+ if (bdev)
+ bio_associate_blkg(bio);
+#ifdef CONFIG_BLK_CGROUP_IOCOST
+ bio->bi_iocost_cost = 0;
+#endif
+#endif
+#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+ bio->bi_crypt_context = NULL;
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ bio->bi_integrity = NULL;
+#endif
+ bio->bi_vcnt = 0;
+
atomic_set(&bio->__bi_remaining, 1);
atomic_set(&bio->__bi_cnt, 1);
+ bio->bi_cookie = BLK_QC_T_NONE;
- bio->bi_io_vec = table;
bio->bi_max_vecs = max_vecs;
+ bio->bi_io_vec = table;
+ bio->bi_pool = NULL;
}
EXPORT_SYMBOL(bio_init);
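
With the new signature, a typical on-stack user looks roughly like this (a sketch; bdev, sector and page are placeholders supplied by the caller):

	static int example_read_one_page(struct block_device *bdev,
					 sector_t sector, struct page *page)
	{
		struct bio_vec bvec;
		struct bio bio;
		int ret;

		bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
		bio.bi_iter.bi_sector = sector;
		__bio_add_page(&bio, page, PAGE_SIZE, 0);
		ret = submit_bio_wait(&bio);
		bio_uninit(&bio);	/* pairs with bio_init(), as noted above */
		return ret;
	}
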
/**
* bio_reset - reinitialize a bio
* @bio: bio to reset
+ * @bdev: block device to use the bio for
+ * @opf: operation and flags for bio
*
* Description:
* After calling bio_reset(), @bio will be in the same state as a freshly
@@ -293,15 +297,15 @@ EXPORT_SYMBOL(bio_init);
* preserved are the ones that are initialized by bio_alloc_bioset(). See
* comment in struct bio.
*/
-void bio_reset(struct bio *bio)
+void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf)
{
- unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS);
-
bio_uninit(bio);
-
memset(bio, 0, BIO_RESET_BYTES);
- bio->bi_flags = flags;
atomic_set(&bio->__bi_remaining, 1);
+ bio->bi_bdev = bdev;
+ if (bio->bi_bdev)
+ bio_associate_blkg(bio);
+ bio->bi_opf = opf;
}
EXPORT_SYMBOL(bio_reset);
@@ -309,7 +313,7 @@ static struct bio *__bio_chain_endio(struct bio *bio)
{
struct bio *parent = bio->bi_private;
- if (!parent->bi_status)
+ if (bio->bi_status && !parent->bi_status)
parent->bi_status = bio->bi_status;
bio_put(bio);
return parent;
@@ -323,7 +327,7 @@ static void bio_chain_endio(struct bio *bio)
/**
* bio_chain - chain bio completions
* @bio: the target bio
- * @parent: the @bio's parent bio
+ * @parent: the parent bio of @bio
*
* The caller won't have a bi_end_io called when @bio completes - instead,
* @parent's bi_end_io won't be called until both @parent and @bio have
@@ -341,6 +345,20 @@ void bio_chain(struct bio *bio, struct bio *parent)
}
EXPORT_SYMBOL(bio_chain);
+struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev,
+ unsigned int nr_pages, blk_opf_t opf, gfp_t gfp)
+{
+ struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp);
+
+ if (bio) {
+ bio_chain(bio, new);
+ submit_bio(bio);
+ }
+
+ return new;
+}
+EXPORT_SYMBOL_GPL(blk_next_bio);
+
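
blk_next_bio() is meant for chained-submission loops of the kind used by the blk-lib helpers; a hedged sketch (the 1 MiB per-bio cap is arbitrary):

	static struct bio *example_issue_zeroes(struct block_device *bdev,
						sector_t sector, sector_t nr_sects,
						gfp_t gfp_mask)
	{
		struct bio *bio = NULL;

		while (nr_sects) {
			sector_t len = min_t(sector_t, nr_sects, 1 << 11);

			/* submits the previous bio (if any) and chains a new one */
			bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
			bio->bi_iter.bi_sector = sector;
			bio->bi_iter.bi_size = len << SECTOR_SHIFT;
			sector += len;
			nr_sects -= len;
		}
		return bio;	/* caller submits and waits on the last bio */
	}
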
static void bio_alloc_rescue(struct work_struct *work)
{
struct bio_set *bs = container_of(work, struct bio_set, rescue_work);
@@ -354,7 +372,7 @@ static void bio_alloc_rescue(struct work_struct *work)
if (!bio)
break;
- generic_make_request(bio);
+ submit_bio_noacct(bio);
}
}
@@ -395,130 +413,165 @@ static void punt_bios_to_rescuer(struct bio_set *bs)
queue_work(bs->rescue_workqueue, &bs->rescue_work);
}
+static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache)
+{
+ unsigned long flags;
+
+ /* cache->free_list must be empty */
+ if (WARN_ON_ONCE(cache->free_list))
+ return;
+
+ local_irq_save(flags);
+ cache->free_list = cache->free_list_irq;
+ cache->free_list_irq = NULL;
+ cache->nr += cache->nr_irq;
+ cache->nr_irq = 0;
+ local_irq_restore(flags);
+}
+
+static struct bio *bio_alloc_percpu_cache(struct block_device *bdev,
+ unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp,
+ struct bio_set *bs)
+{
+ struct bio_alloc_cache *cache;
+ struct bio *bio;
+
+ cache = per_cpu_ptr(bs->cache, get_cpu());
+ if (!cache->free_list) {
+ if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD)
+ bio_alloc_irq_cache_splice(cache);
+ if (!cache->free_list) {
+ put_cpu();
+ return NULL;
+ }
+ }
+ bio = cache->free_list;
+ cache->free_list = bio->bi_next;
+ cache->nr--;
+ put_cpu();
+
+ bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf);
+ bio->bi_pool = bs;
+ return bio;
+}
+
/**
* bio_alloc_bioset - allocate a bio for I/O
+ * @bdev: block device to allocate the bio for (can be %NULL)
+ * @nr_vecs: number of bvecs to pre-allocate
+ * @opf: operation and flags for bio
* @gfp_mask: the GFP_* mask given to the slab allocator
- * @nr_iovecs: number of iovecs to pre-allocate
* @bs: the bio_set to allocate from.
*
- * Description:
- * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is
- * backed by the @bs's mempool.
+ * Allocate a bio from the mempools in @bs.
*
- * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will
- * always be able to allocate a bio. This is due to the mempool guarantees.
- * To make this work, callers must never allocate more than 1 bio at a time
- * from this pool. Callers that need to allocate more than 1 bio must always
- * submit the previously allocated bio for IO before attempting to allocate
- * a new one. Failure to do so can cause deadlocks under memory pressure.
+ * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to
+ * allocate a bio. This is due to the mempool guarantees. To make this work,
+ * callers must never allocate more than 1 bio at a time from the general pool.
+ * Callers that need to allocate more than 1 bio must always submit the
+ * previously allocated bio for IO before attempting to allocate a new one.
+ * Failure to do so can cause deadlocks under memory pressure.
*
- * Note that when running under generic_make_request() (i.e. any block
- * driver), bios are not submitted until after you return - see the code in
- * generic_make_request() that converts recursion into iteration, to prevent
- * stack overflows.
+ * Note that when running under submit_bio_noacct() (i.e. any block driver),
+ * bios are not submitted until after you return - see the code in
+ * submit_bio_noacct() that converts recursion into iteration, to prevent
+ * stack overflows.
*
- * This would normally mean allocating multiple bios under
- * generic_make_request() would be susceptible to deadlocks, but we have
- * deadlock avoidance code that resubmits any blocked bios from a rescuer
- * thread.
+ * This would normally mean allocating multiple bios under submit_bio_noacct()
+ * would be susceptible to deadlocks, but we have
+ * deadlock avoidance code that resubmits any blocked bios from a rescuer
+ * thread.
*
- * However, we do not guarantee forward progress for allocations from other
- * mempools. Doing multiple allocations from the same mempool under
- * generic_make_request() should be avoided - instead, use bio_set's front_pad
- * for per bio allocations.
+ * However, we do not guarantee forward progress for allocations from other
+ * mempools. Doing multiple allocations from the same mempool under
+ * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad
+ * for per bio allocations.
*
- * RETURNS:
- * Pointer to new bio on success, NULL on failure.
+ * Returns: Pointer to new bio on success, NULL on failure.
*/
-struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs,
+struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs,
+ blk_opf_t opf, gfp_t gfp_mask,
struct bio_set *bs)
{
gfp_t saved_gfp = gfp_mask;
- unsigned front_pad;
- unsigned inline_vecs;
- struct bio_vec *bvl = NULL;
struct bio *bio;
void *p;
- if (!bs) {
- if (nr_iovecs > UIO_MAXIOV)
- return NULL;
-
- p = kmalloc(sizeof(struct bio) +
- nr_iovecs * sizeof(struct bio_vec),
- gfp_mask);
- front_pad = 0;
- inline_vecs = nr_iovecs;
- } else {
- /* should not use nobvec bioset for nr_iovecs > 0 */
- if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) &&
- nr_iovecs > 0))
- return NULL;
- /*
- * generic_make_request() converts recursion to iteration; this
- * means if we're running beneath it, any bios we allocate and
- * submit will not be submitted (and thus freed) until after we
- * return.
- *
- * This exposes us to a potential deadlock if we allocate
- * multiple bios from the same bio_set() while running
- * underneath generic_make_request(). If we were to allocate
- * multiple bios (say a stacking block driver that was splitting
- * bios), we would deadlock if we exhausted the mempool's
- * reserve.
- *
- * We solve this, and guarantee forward progress, with a rescuer
- * workqueue per bio_set. If we go to allocate and there are
- * bios on current->bio_list, we first try the allocation
- * without __GFP_DIRECT_RECLAIM; if that fails, we punt those
- * bios we would be blocking to the rescuer workqueue before
- * we retry with the original gfp_flags.
- */
-
- if (current->bio_list &&
- (!bio_list_empty(&current->bio_list[0]) ||
- !bio_list_empty(&current->bio_list[1])) &&
- bs->rescue_workqueue)
- gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+ /* should not use nobvec bioset for nr_vecs > 0 */
+ if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0))
+ return NULL;
- p = mempool_alloc(&bs->bio_pool, gfp_mask);
- if (!p && gfp_mask != saved_gfp) {
- punt_bios_to_rescuer(bs);
- gfp_mask = saved_gfp;
- p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ if (opf & REQ_ALLOC_CACHE) {
+ if (bs->cache && nr_vecs <= BIO_INLINE_VECS) {
+ bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf,
+ gfp_mask, bs);
+ if (bio)
+ return bio;
+ /*
+ * No cached bio available, bio returned below marked with
+ * REQ_ALLOC_CACHE to participate in per-cpu alloc cache.
+ */
+ } else {
+ opf &= ~REQ_ALLOC_CACHE;
}
-
- front_pad = bs->front_pad;
- inline_vecs = BIO_INLINE_VECS;
}
+ /*
+ * submit_bio_noacct() converts recursion to iteration; this means if
+ * we're running beneath it, any bios we allocate and submit will not be
+ * submitted (and thus freed) until after we return.
+ *
+ * This exposes us to a potential deadlock if we allocate multiple bios
+ * from the same bio_set() while running underneath submit_bio_noacct().
+ * If we were to allocate multiple bios (say a stacking block driver
+ * that was splitting bios), we would deadlock if we exhausted the
+ * mempool's reserve.
+ *
+ * We solve this, and guarantee forward progress, with a rescuer
+ * workqueue per bio_set. If we go to allocate and there are bios on
+ * current->bio_list, we first try the allocation without
+ * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be
+ * blocking to the rescuer workqueue before we retry with the original
+ * gfp_flags.
+ */
+ if (current->bio_list &&
+ (!bio_list_empty(&current->bio_list[0]) ||
+ !bio_list_empty(&current->bio_list[1])) &&
+ bs->rescue_workqueue)
+ gfp_mask &= ~__GFP_DIRECT_RECLAIM;
+
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ if (!p && gfp_mask != saved_gfp) {
+ punt_bios_to_rescuer(bs);
+ gfp_mask = saved_gfp;
+ p = mempool_alloc(&bs->bio_pool, gfp_mask);
+ }
if (unlikely(!p))
return NULL;
+ if (!mempool_is_saturated(&bs->bio_pool))
+ opf &= ~REQ_ALLOC_CACHE;
- bio = p + front_pad;
- bio_init(bio, NULL, 0);
-
- if (nr_iovecs > inline_vecs) {
- unsigned long idx = 0;
+ bio = p + bs->front_pad;
+ if (nr_vecs > BIO_INLINE_VECS) {
+ struct bio_vec *bvl = NULL;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
if (!bvl && gfp_mask != saved_gfp) {
punt_bios_to_rescuer(bs);
gfp_mask = saved_gfp;
- bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool);
+ bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask);
}
-
if (unlikely(!bvl))
goto err_free;
- bio->bi_flags |= idx << BVEC_POOL_OFFSET;
- } else if (nr_iovecs) {
- bvl = bio->bi_inline_vecs;
+ bio_init(bio, bdev, bvl, nr_vecs, opf);
+ } else if (nr_vecs) {
+ bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf);
+ } else {
+ bio_init(bio, bdev, NULL, 0, opf);
}
bio->bi_pool = bs;
- bio->bi_max_vecs = nr_iovecs;
- bio->bi_io_vec = bvl;
return bio;
err_free:
@@ -527,59 +580,41 @@ err_free:
}
EXPORT_SYMBOL(bio_alloc_bioset);
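[Editorial illustration, not from this patch: a caller opting into the per-cpu bio cache passes REQ_ALLOC_CACHE (typically with REQ_POLLED) and allocates from a bio_set created with BIOSET_PERCPU_CACHE. The bio_set and helper below are assumed driver-private names.]

static struct bio_set my_bio_set;	/* assumed: created with BIOSET_PERCPU_CACHE */

static struct bio *my_alloc_polled_read(struct block_device *bdev,
					sector_t sector, struct page *page)
{
	struct bio *bio;

	/* GFP_KERNEL allocations from the general pool are mempool-backed */
	bio = bio_alloc_bioset(bdev, 1,
			       REQ_OP_READ | REQ_POLLED | REQ_ALLOC_CACHE,
			       GFP_KERNEL, &my_bio_set);
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, PAGE_SIZE, 0);
	return bio;
}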
-void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
+/**
+ * bio_kmalloc - kmalloc a bio
+ * @nr_vecs: number of bio_vecs to allocate
+ * @gfp_mask: the GFP_* mask given to the slab allocator
+ *
+ * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized
+ * using bio_init() before use. To free a bio returned from this function use
+ * kfree() after calling bio_uninit(). A bio returned from this function can
+ * be reused by calling bio_uninit() before calling bio_init() again.
+ *
+ * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this
+ * function are not backed by a mempool and can fail. Do not use this function
+ * for allocations in the file system I/O path.
+ *
+ * Returns: Pointer to new bio on success, NULL on failure.
+ */
+struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask)
{
- unsigned long flags;
- struct bio_vec bv;
- struct bvec_iter iter;
+ struct bio *bio;
- __bio_for_each_segment(bv, bio, iter, start) {
- char *data = bvec_kmap_irq(&bv, &flags);
- memset(data, 0, bv.bv_len);
- flush_dcache_page(bv.bv_page);
- bvec_kunmap_irq(data, &flags);
- }
+ if (nr_vecs > UIO_MAXIOV)
+ return NULL;
+ return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask);
}
-EXPORT_SYMBOL(zero_fill_bio_iter);
+EXPORT_SYMBOL(bio_kmalloc);
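[Editorial sketch of the lifecycle described in the comment above (kmalloc, bio_init, use, bio_uninit, kfree); the helper names are illustrative, not part of the patch.]

static struct bio *my_alloc_passthrough_bio(struct block_device *bdev,
					    unsigned short nr_vecs)
{
	struct bio *bio;

	bio = bio_kmalloc(nr_vecs, GFP_KERNEL);
	if (!bio)
		return NULL;
	/* the inline vecs sit right after the bio in the kmalloc'ed buffer */
	bio_init(bio, bdev, bio->bi_inline_vecs, nr_vecs, REQ_OP_READ);
	return bio;
}

static void my_free_passthrough_bio(struct bio *bio)
{
	bio_uninit(bio);
	kfree(bio);
}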
-void bio_truncate(struct bio *bio, unsigned new_size)
+void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start)
{
struct bio_vec bv;
struct bvec_iter iter;
- unsigned int done = 0;
- bool truncated = false;
-
- if (new_size >= bio->bi_iter.bi_size)
- return;
- if (bio_data_dir(bio) != READ)
- goto exit;
-
- bio_for_each_segment(bv, bio, iter) {
- if (done + bv.bv_len > new_size) {
- unsigned offset;
-
- if (!truncated)
- offset = new_size - done;
- else
- offset = 0;
- zero_user(bv.bv_page, offset, bv.bv_len - offset);
- truncated = true;
- }
- done += bv.bv_len;
- }
-
- exit:
- /*
- * Don't touch bvec table here and make it really immutable, since
- * fs bio user has to retrieve all pages via bio_for_each_segment_all
- * in its .end_bio() callback.
- *
- * It is enough to truncate bio by updating .bi_size since we can make
- * correct bvec with the updated .bi_size for drivers.
- */
- bio->bi_iter.bi_size = new_size;
+ __bio_for_each_segment(bv, bio, iter, start)
+ memzero_bvec(&bv);
}
+EXPORT_SYMBOL(zero_fill_bio_iter);
/**
 * bio_truncate - truncate the bio to the smaller size @new_size
@@ -591,7 +626,7 @@ void bio_truncate(struct bio *bio, unsigned new_size)
* REQ_OP_READ, zero the truncated part. This function should only
* be used for handling corner cases, such as bio eod.
*/
-void bio_truncate(struct bio *bio, unsigned new_size)
+static void bio_truncate(struct bio *bio, unsigned new_size)
{
struct bio_vec bv;
struct bvec_iter iter;
@@ -612,7 +647,8 @@ void bio_truncate(struct bio *bio, unsigned new_size)
offset = new_size - done;
else
offset = 0;
- zero_user(bv.bv_page, offset, bv.bv_len - offset);
+ zero_user(bv.bv_page, bv.bv_offset + offset,
+ bv.bv_len - offset);
truncated = true;
}
done += bv.bv_len;
@@ -644,16 +680,7 @@ void bio_truncate(struct bio *bio, unsigned new_size)
*/
void guard_bio_eod(struct bio *bio)
{
- sector_t maxsector;
- struct hd_struct *part;
-
- rcu_read_lock();
- part = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (part)
- maxsector = part_nr_sects_read(part);
- else
- maxsector = get_capacity(bio->bi_disk);
- rcu_read_unlock();
+ sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
if (!maxsector)
return;
@@ -673,6 +700,93 @@ void guard_bio_eod(struct bio *bio)
bio_truncate(bio, maxsector << 9);
}
+static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ unsigned int i = 0;
+ struct bio *bio;
+
+ while ((bio = cache->free_list) != NULL) {
+ cache->free_list = bio->bi_next;
+ cache->nr--;
+ bio_free(bio);
+ if (++i == nr)
+ break;
+ }
+ return i;
+}
+
+static void bio_alloc_cache_prune(struct bio_alloc_cache *cache,
+ unsigned int nr)
+{
+ nr -= __bio_alloc_cache_prune(cache, nr);
+ if (!READ_ONCE(cache->free_list)) {
+ bio_alloc_irq_cache_splice(cache);
+ __bio_alloc_cache_prune(cache, nr);
+ }
+}
+
+static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node)
+{
+ struct bio_set *bs;
+
+ bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead);
+ if (bs->cache) {
+ struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu);
+
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ return 0;
+}
+
+static void bio_alloc_cache_destroy(struct bio_set *bs)
+{
+ int cpu;
+
+ if (!bs->cache)
+ return;
+
+ cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ for_each_possible_cpu(cpu) {
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bs->cache, cpu);
+ bio_alloc_cache_prune(cache, -1U);
+ }
+ free_percpu(bs->cache);
+ bs->cache = NULL;
+}
+
+static inline void bio_put_percpu_cache(struct bio *bio)
+{
+ struct bio_alloc_cache *cache;
+
+ cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu());
+ if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) {
+ put_cpu();
+ bio_free(bio);
+ return;
+ }
+
+ bio_uninit(bio);
+
+ if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) {
+ bio->bi_next = cache->free_list;
+ bio->bi_bdev = NULL;
+ cache->free_list = bio;
+ cache->nr++;
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ bio->bi_next = cache->free_list_irq;
+ cache->free_list_irq = bio;
+ cache->nr_irq++;
+ local_irq_restore(flags);
+ }
+ put_cpu();
+}
+
/**
* bio_put - release a reference to a bio
* @bio: bio to release reference to
@@ -683,112 +797,135 @@ void guard_bio_eod(struct bio *bio)
**/
void bio_put(struct bio *bio)
{
- if (!bio_flagged(bio, BIO_REFFED))
- bio_free(bio);
- else {
- BIO_BUG_ON(!atomic_read(&bio->__bi_cnt));
-
- /*
- * last put frees it
- */
- if (atomic_dec_and_test(&bio->__bi_cnt))
- bio_free(bio);
+ if (unlikely(bio_flagged(bio, BIO_REFFED))) {
+ BUG_ON(!atomic_read(&bio->__bi_cnt));
+ if (!atomic_dec_and_test(&bio->__bi_cnt))
+ return;
}
+ if (bio->bi_opf & REQ_ALLOC_CACHE)
+ bio_put_percpu_cache(bio);
+ else
+ bio_free(bio);
}
EXPORT_SYMBOL(bio_put);
-/**
- * __bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: destination bio
- * @bio_src: bio to clone
- *
- * Clone a &bio. Caller will own the returned bio, but not
- * the actual data it points to. Reference count of returned
- * bio will be one.
- *
- * Caller must ensure that @bio_src is not freed before @bio.
- */
-void __bio_clone_fast(struct bio *bio, struct bio *bio_src)
+static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp)
{
- BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio));
-
- /*
- * most users will be overriding ->bi_disk with a new target,
- * so we don't set nor calculate new physical/hw segment counts here
- */
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_partno = bio_src->bi_partno;
bio_set_flag(bio, BIO_CLONED);
- if (bio_flagged(bio_src, BIO_THROTTLED))
- bio_set_flag(bio, BIO_THROTTLED);
- bio->bi_opf = bio_src->bi_opf;
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter = bio_src->bi_iter;
- bio->bi_io_vec = bio_src->bi_io_vec;
- bio_clone_blkg_association(bio, bio_src);
- blkcg_bio_issue_init(bio);
+ if (bio->bi_bdev) {
+ if (bio->bi_bdev == bio_src->bi_bdev &&
+ bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
+ bio_clone_blkg_association(bio, bio_src);
+ }
+
+ if (bio_crypt_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ if (bio_integrity(bio_src) &&
+ bio_integrity_clone(bio, bio_src, gfp) < 0)
+ return -ENOMEM;
+ return 0;
}
-EXPORT_SYMBOL(__bio_clone_fast);
/**
- * bio_clone_fast - clone a bio that shares the original bio's biovec
- * @bio: bio to clone
- * @gfp_mask: allocation priority
- * @bs: bio_set to allocate from
+ * bio_alloc_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ * @bs: bio_set to allocate from
+ *
+ * Allocate a new bio that is a clone of @bio_src. The caller owns the returned
+ * bio, but not the actual data it points to.
*
- * Like __bio_clone_fast, only also allocates the returned bio
+ * The caller must ensure that the returned bio is not freed before @bio_src.
*/
-struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs)
+struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src,
+ gfp_t gfp, struct bio_set *bs)
{
- struct bio *b;
+ struct bio *bio;
- b = bio_alloc_bioset(gfp_mask, 0, bs);
- if (!b)
+ bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs);
+ if (!bio)
return NULL;
- __bio_clone_fast(b, bio);
-
- bio_crypt_clone(b, bio, gfp_mask);
-
- if (bio_integrity(bio)) {
- int ret;
+ if (__bio_clone(bio, bio_src, gfp) < 0) {
+ bio_put(bio);
+ return NULL;
+ }
+ bio->bi_io_vec = bio_src->bi_io_vec;
- ret = bio_integrity_clone(b, bio, gfp_mask);
+ return bio;
+}
+EXPORT_SYMBOL(bio_alloc_clone);
- if (ret < 0) {
- bio_put(b);
- return NULL;
- }
- }
+/**
+ * bio_init_clone - clone a bio that shares the original bio's biovec
+ * @bdev: block_device to clone onto
+ * @bio: bio to clone into
+ * @bio_src: bio to clone from
+ * @gfp: allocation priority
+ *
+ * Initialize a new bio in caller provided memory that is a clone of @bio_src.
+ * The caller owns the returned bio, but not the actual data it points to.
+ *
+ * The caller must ensure that @bio_src is not freed before @bio.
+ */
+int bio_init_clone(struct block_device *bdev, struct bio *bio,
+ struct bio *bio_src, gfp_t gfp)
+{
+ int ret;
- return b;
+ bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf);
+ ret = __bio_clone(bio, bio_src, gfp);
+ if (ret)
+ bio_uninit(bio);
+ return ret;
}
-EXPORT_SYMBOL(bio_clone_fast);
+EXPORT_SYMBOL(bio_init_clone);
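[Editorial sketch of the common stacking-driver pattern for the bio_alloc_clone() interface documented above: redirect a bio to a lower device and complete the original when the clone finishes. The lower device, end_io handler, and use of fs_bio_set are assumptions for illustration.]

static void my_clone_endio(struct bio *clone)
{
	struct bio *orig = clone->bi_private;

	if (clone->bi_status)
		orig->bi_status = clone->bi_status;
	bio_put(clone);
	bio_endio(orig);
}

static void my_remap_and_submit(struct bio *orig,
				struct block_device *lower_bdev)
{
	struct bio *clone;

	clone = bio_alloc_clone(lower_bdev, orig, GFP_NOIO, &fs_bio_set);
	if (!clone) {
		bio_io_error(orig);
		return;
	}
	clone->bi_private = orig;
	clone->bi_end_io = my_clone_endio;
	submit_bio(clone);
}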
-const char *bio_devname(struct bio *bio, char *buf)
+/**
+ * bio_full - check if the bio is full
+ * @bio: bio to check
+ * @len: length of one segment to be added
+ *
+ * Return true if @bio is full and one segment with @len bytes can't be
+ * added to the bio, otherwise return false
+ */
+static inline bool bio_full(struct bio *bio, unsigned len)
{
- return disk_name(bio->bi_disk, bio->bi_partno, buf);
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return true;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return true;
+ return false;
}
-EXPORT_SYMBOL(bio_devname);
-static inline bool page_is_mergeable(const struct bio_vec *bv,
- struct page *page, unsigned int len, unsigned int off,
- bool *same_page)
+static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page,
+ unsigned int len, unsigned int off, bool *same_page)
{
- phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) +
- bv->bv_offset + bv->bv_len - 1;
+ size_t bv_end = bv->bv_offset + bv->bv_len;
+ phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1;
phys_addr_t page_addr = page_to_phys(page);
if (vec_end_addr + 1 != page_addr + off)
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(bv, page))
return false;
+ if (!zone_device_pages_have_same_pgmap(bv->bv_page, page))
+ return false;
*same_page = ((vec_end_addr & PAGE_MASK) == page_addr);
- if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page)
- return false;
+ if (!*same_page) {
+ if (IS_ENABLED(CONFIG_KMSAN))
+ return false;
+ if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE)
+ return false;
+ }
+
+ bv->bv_len += len;
return true;
}
@@ -797,20 +934,19 @@ static inline bool page_is_mergeable(const struct bio_vec *bv,
* size limit. This is not for normal read/write bios, but for passthrough
* or Zone Append operations that we can't split.
*/
-static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned len,
- unsigned offset, bool *same_page)
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+ struct page *page, unsigned len, unsigned offset,
+ bool *same_page)
{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
unsigned long mask = queue_segment_boundary(q);
phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset;
phys_addr_t addr2 = page_to_phys(page) + offset + len - 1;
if ((addr1 | mask) != (addr2 | mask))
return false;
- if (bv->bv_len + len > queue_max_segment_size(q))
+ if (len > queue_max_segment_size(q) - bv->bv_len)
return false;
- return __bio_try_merge_page(bio, page, len, offset, same_page);
+ return bvec_try_merge_page(bv, page, len, offset, same_page);
}
/**
@@ -830,37 +966,37 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio,
struct page *page, unsigned int len, unsigned int offset,
unsigned int max_sectors, bool *same_page)
{
- struct bio_vec *bvec;
+ unsigned int max_size = max_sectors << SECTOR_SHIFT;
if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
return 0;
- if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors)
+ len = min3(len, max_size, queue_max_segment_size(q));
+ if (len > max_size - bio->bi_iter.bi_size)
return 0;
if (bio->bi_vcnt > 0) {
- if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page))
+ struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+
+ if (bvec_try_merge_hw_page(q, bv, page, len, offset,
+ same_page)) {
+ bio->bi_iter.bi_size += len;
return len;
+ }
+
+ if (bio->bi_vcnt >=
+ min(bio->bi_max_vecs, queue_max_segments(q)))
+ return 0;
/*
* If the queue doesn't support SG gaps and adding this segment
* would create a gap, disallow it.
*/
- bvec = &bio->bi_io_vec[bio->bi_vcnt - 1];
- if (bvec_gap_to_prev(q, bvec, offset))
+ if (bvec_gap_to_prev(&q->limits, bv, offset))
return 0;
}
- if (bio_full(bio, len))
- return 0;
-
- if (bio->bi_vcnt >= queue_max_segments(q))
- return 0;
-
- bvec = &bio->bi_io_vec[bio->bi_vcnt];
- bvec->bv_page = page;
- bvec->bv_len = len;
- bvec->bv_offset = offset;
+ bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset);
bio->bi_vcnt++;
bio->bi_iter.bi_size += len;
return len;
@@ -891,41 +1027,37 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio,
EXPORT_SYMBOL(bio_add_pc_page);
/**
- * __bio_try_merge_page - try appending data to an existing bvec.
+ * bio_add_zone_append_page - attempt to add page to zone-append bio
* @bio: destination bio
- * @page: start page to add
- * @len: length of the data to add
- * @off: offset of the data relative to @page
- * @same_page: return if the segment has been merged inside the same page
- *
- * Try to add the data at @page + @off to the last bvec of @bio. This is a
- * a useful optimisation for file systems with a block size smaller than the
- * page size.
+ * @page: page to add
+ * @len: vec entry length
+ * @offset: vec entry offset
*
- * Warn if (@len, @off) crosses pages in case that @same_page is true.
+ * Attempt to add a page to the bio_vec maplist of a bio that will be submitted
+ * for a zone-append request. This can fail for a number of reasons, such as the
+ * bio being full, the target block device not being a zoned block device, or
+ * other limitations of the target block device. The target block device must
+ * allow bios of up to PAGE_SIZE, so it is always possible to add a single page
+ * to an empty bio.
*
- * Return %true on success or %false on failure.
+ * Returns: number of bytes added to the bio, or 0 in case of a failure.
*/
-bool __bio_try_merge_page(struct bio *bio, struct page *page,
- unsigned int len, unsigned int off, bool *same_page)
+int bio_add_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
{
- if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
- return false;
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ bool same_page = false;
- if (bio->bi_vcnt > 0) {
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1];
+ if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND))
+ return 0;
- if (page_is_mergeable(bv, page, len, off, same_page)) {
- if (bio->bi_iter.bi_size > UINT_MAX - len)
- return false;
- bv->bv_len += len;
- bio->bi_iter.bi_size += len;
- return true;
- }
- }
- return false;
+ if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev)))
+ return 0;
+
+ return bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_zone_append_sectors(q), &same_page);
}
-EXPORT_SYMBOL_GPL(__bio_try_merge_page);
+EXPORT_SYMBOL_GPL(bio_add_zone_append_page);
/**
* __bio_add_page - add page(s) to a bio in a new segment
@@ -940,20 +1072,12 @@ EXPORT_SYMBOL_GPL(__bio_try_merge_page);
void __bio_add_page(struct bio *bio, struct page *page,
unsigned int len, unsigned int off)
{
- struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt];
-
WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED));
WARN_ON_ONCE(bio_full(bio, len));
- bv->bv_page = page;
- bv->bv_offset = off;
- bv->bv_len = len;
-
+ bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off);
bio->bi_iter.bi_size += len;
bio->bi_vcnt++;
-
- if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page)))
- bio_set_flag(bio, BIO_WORKINGSET);
}
EXPORT_SYMBOL_GPL(__bio_add_page);
@@ -972,46 +1096,129 @@ int bio_add_page(struct bio *bio, struct page *page,
{
bool same_page = false;
- if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (bio_full(bio, len))
- return 0;
- __bio_add_page(bio, page, len, offset);
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return 0;
+ if (bio->bi_iter.bi_size > UINT_MAX - len)
+ return 0;
+
+ if (bio->bi_vcnt > 0 &&
+ bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+ page, len, offset, &same_page)) {
+ bio->bi_iter.bi_size += len;
+ return len;
}
+
+ if (bio->bi_vcnt >= bio->bi_max_vecs)
+ return 0;
+ __bio_add_page(bio, page, len, offset);
return len;
}
EXPORT_SYMBOL(bio_add_page);
-void bio_release_pages(struct bio *bio, bool mark_dirty)
+void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len,
+ size_t off)
{
- struct bvec_iter_all iter_all;
- struct bio_vec *bvec;
+ WARN_ON_ONCE(len > UINT_MAX);
+ WARN_ON_ONCE(off > UINT_MAX);
+ __bio_add_page(bio, &folio->page, len, off);
+}
- if (bio_flagged(bio, BIO_NO_PAGE_REF))
- return;
+/**
+ * bio_add_folio - Attempt to add part of a folio to a bio.
+ * @bio: BIO to add to.
+ * @folio: Folio to add.
+ * @len: How many bytes from the folio to add.
+ * @off: First byte in this folio to add.
+ *
+ * Filesystems that use folios can call this function instead of calling
+ * bio_add_page() for each page in the folio. If @off is bigger than
+ * PAGE_SIZE, this function can create a bio_vec that starts in a page
+ * after the bv_page. BIOs do not support folios that are 4GiB or larger.
+ *
+ * Return: Whether the addition was successful.
+ */
+bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len,
+ size_t off)
+{
+ if (len > UINT_MAX || off > UINT_MAX)
+ return false;
+ return bio_add_page(bio, &folio->page, len, off) > 0;
+}
+EXPORT_SYMBOL(bio_add_folio);
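[Editorial illustration of a filesystem-style helper built on the folio interfaces above: try to append a folio to the current bio and start a fresh bio when it is full. The helper name, GFP flags, and the elided completion handling are assumptions; a single folio (smaller than 4GiB) always fits an empty bio, which is why the nofail variant is used there.]

static void my_add_folio_or_submit(struct bio **biop, struct folio *folio,
				   struct block_device *bdev, sector_t sector)
{
	if (*biop && bio_add_folio(*biop, folio, folio_size(folio), 0))
		return;

	if (*biop)
		submit_bio(*biop);

	/* mempool-backed, cannot fail with __GFP_DIRECT_RECLAIM set */
	*biop = bio_alloc(bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
	(*biop)->bi_iter.bi_sector = sector;
	bio_add_folio_nofail(*biop, folio, folio_size(folio), 0);
}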
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (mark_dirty && !PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
- put_page(bvec->bv_page);
+void __bio_release_pages(struct bio *bio, bool mark_dirty)
+{
+ struct folio_iter fi;
+
+ bio_for_each_folio_all(fi, bio) {
+ struct page *page;
+ size_t done = 0;
+
+ if (mark_dirty) {
+ folio_lock(fi.folio);
+ folio_mark_dirty(fi.folio);
+ folio_unlock(fi.folio);
+ }
+ page = folio_page(fi.folio, fi.offset / PAGE_SIZE);
+ do {
+ bio_release_page(bio, page++);
+ done += PAGE_SIZE;
+ } while (done < fi.length);
}
}
-EXPORT_SYMBOL_GPL(bio_release_pages);
+EXPORT_SYMBOL_GPL(__bio_release_pages);
-static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
+void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter)
{
- const struct bio_vec *bv = iter->bvec;
- unsigned int len;
- size_t size;
+ size_t size = iov_iter_count(iter);
- if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len))
- return -EINVAL;
+ WARN_ON_ONCE(bio->bi_max_vecs);
+
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ size_t max_sectors = queue_max_zone_append_sectors(q);
+
+ size = min(size, max_sectors << SECTOR_SHIFT);
+ }
- len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count);
- size = bio_add_page(bio, bv->bv_page, len,
- bv->bv_offset + iter->iov_offset);
- if (unlikely(size != len))
+ bio->bi_vcnt = iter->nr_segs;
+ bio->bi_io_vec = (struct bio_vec *)iter->bvec;
+ bio->bi_iter.bi_bvec_done = iter->iov_offset;
+ bio->bi_iter.bi_size = size;
+ bio_set_flag(bio, BIO_CLONED);
+}
+
+static int bio_iov_add_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ bool same_page = false;
+
+ if (WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len))
+ return -EIO;
+
+ if (bio->bi_vcnt > 0 &&
+ bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1],
+ page, len, offset, &same_page)) {
+ bio->bi_iter.bi_size += len;
+ if (same_page)
+ bio_release_page(bio, page);
+ return 0;
+ }
+ __bio_add_page(bio, page, len, offset);
+ return 0;
+}
+
+static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page,
+ unsigned int len, unsigned int offset)
+{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ bool same_page = false;
+
+ if (bio_add_hw_page(q, bio, page, len, offset,
+ queue_max_zone_append_sectors(q), &same_page) != len)
return -EINVAL;
- iov_iter_advance(iter, size);
+ if (same_page)
+ bio_release_page(bio, page);
return 0;
}
@@ -1022,96 +1229,81 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter)
* @bio: bio to add pages to
* @iter: iov iterator describing the region to be mapped
*
- * Pins pages from *iter and appends them to @bio's bvec array. The
- * pages will have to be released using put_page() when done.
- * For multi-segment *iter, this function only adds pages from the
- * the next non-empty segment of the iov iterator.
+ * Extracts pages from *iter and appends them to @bio's bvec array. The pages
+ * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag.
+ * For a multi-segment *iter, this function only adds pages from the next
+ * non-empty segment of the iov iterator.
*/
static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
+ iov_iter_extraction_t extraction_flags = 0;
unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
struct page **pages = (struct page **)bv;
- bool same_page = false;
ssize_t size, left;
- unsigned len, i;
+ unsigned len, i = 0;
size_t offset;
+ int ret = 0;
/*
* Move page array up in the allocated memory for the bio vecs as far as
* possible so that we can start filling biovecs from the beginning
* without overwriting the temporary page array.
- */
+ */
BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
+ if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue))
+ extraction_flags |= ITER_ALLOW_P2PDMA;
+
+ /*
+ * Each segment in the iov is required to be a block size multiple.
+ * However, we may not be able to get the entire segment if it spans
+ * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the
+ * result to ensure the bio's total size is correct. The remainder of
+ * the iov data will be picked up in the next bio iteration.
+ */
+ size = iov_iter_extract_pages(iter, &pages,
+ UINT_MAX - bio->bi_iter.bi_size,
+ nr_pages, extraction_flags, &offset);
if (unlikely(size <= 0))
return size ? size : -EFAULT;
- for (left = size, i = 0; left > 0; left -= len, i++) {
- struct page *page = pages[i];
-
- len = min_t(size_t, PAGE_SIZE - offset, left);
+ nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE);
- if (__bio_try_merge_page(bio, page, len, offset, &same_page)) {
- if (same_page)
- put_page(page);
- } else {
- if (WARN_ON_ONCE(bio_full(bio, len)))
- return -EINVAL;
- __bio_add_page(bio, page, len, offset);
- }
- offset = 0;
+ if (bio->bi_bdev) {
+ size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1);
+ iov_iter_revert(iter, trim);
+ size -= trim;
}
- iov_iter_advance(iter, size);
- return 0;
-}
-
-static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
-{
- unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt;
- unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt;
- struct request_queue *q = bio->bi_disk->queue;
- unsigned int max_append_sectors = queue_max_zone_append_sectors(q);
- struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt;
- struct page **pages = (struct page **)bv;
- ssize_t size, left;
- unsigned len, i;
- size_t offset;
-
- if (WARN_ON_ONCE(!max_append_sectors))
- return 0;
-
- /*
- * Move page array up in the allocated memory for the bio vecs as far as
- * possible so that we can start filling biovecs from the beginning
- * without overwriting the temporary page array.
- */
- BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2);
- pages += entries_left * (PAGE_PTRS_PER_BVEC - 1);
-
- size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset);
- if (unlikely(size <= 0))
- return size ? size : -EFAULT;
+ if (unlikely(!size)) {
+ ret = -EFAULT;
+ goto out;
+ }
for (left = size, i = 0; left > 0; left -= len, i++) {
struct page *page = pages[i];
- bool same_page = false;
len = min_t(size_t, PAGE_SIZE - offset, left);
- if (bio_add_hw_page(q, bio, page, len, offset,
- max_append_sectors, &same_page) != len)
- return -EINVAL;
- if (same_page)
- put_page(page);
+ if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
+ ret = bio_iov_add_zone_append_page(bio, page, len,
+ offset);
+ if (ret)
+ break;
+ } else
+ bio_iov_add_page(bio, page, len, offset);
+
offset = 0;
}
- iov_iter_advance(iter, size);
- return 0;
+ iov_iter_revert(iter, left);
+out:
+ while (i < nr_pages)
+ bio_release_page(bio, pages[i++]);
+
+ return ret;
}
/**
@@ -1122,41 +1314,37 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter)
* This takes either an iterator pointing to user memory, or one pointing to
* kernel pages (BVEC iterator). If we're adding user pages, we pin them and
* map them into the kernel. On IO completion, the caller should put those
- * pages. If we're adding kernel pages, and the caller told us it's safe to
- * do so, we just have to add the pages to the bio directly. We don't grab an
- * extra reference to those pages (the user should already have that), and we
- * don't put the page on IO completion. The caller needs to check if the bio is
- * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be
- * released.
+ * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided
+ * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs
+ * to ensure the bvecs and pages stay referenced until the submitted I/O is
+ * completed by a call to ->ki_complete() or returns with an error other than
+ * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF
+ * on IO completion. If it isn't, then pages should be released.
*
* The function tries, but does not guarantee, to pin as many pages as
- * fit into the bio, or are requested in *iter, whatever is smaller. If
+ * fit into the bio, or are requested in @iter, whatever is smaller. If
* MM encounters an error pinning the requested pages, it stops. Error
* is returned only if 0 pages could be pinned.
*/
int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter)
{
- const bool is_bvec = iov_iter_is_bvec(iter);
- int ret;
+ int ret = 0;
- if (WARN_ON_ONCE(bio->bi_vcnt))
- return -EINVAL;
+ if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)))
+ return -EIO;
+
+ if (iov_iter_is_bvec(iter)) {
+ bio_iov_bvec_set(bio, iter);
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ return 0;
+ }
+ if (iov_iter_extract_will_pin(iter))
+ bio_set_flag(bio, BIO_PAGE_PINNED);
do {
- if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
- if (WARN_ON_ONCE(is_bvec))
- return -EINVAL;
- ret = __bio_iov_append_get_pages(bio, iter);
- } else {
- if (is_bvec)
- ret = __bio_iov_bvec_add_pages(bio, iter);
- else
- ret = __bio_iov_iter_get_pages(bio, iter);
- }
+ ret = __bio_iov_iter_get_pages(bio, iter);
} while (!ret && iov_iter_count(iter) && !bio_full(bio, 0));
- if (is_bvec)
- bio_set_flag(bio, BIO_NO_PAGE_REF);
return bio->bi_vcnt ? 0 : ret;
}
EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages);
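[Editorial sketch of a synchronous direct-I/O style loop over the interface documented above. Real callers are asynchronous and complete through ->ki_complete(); the helper name below is made up.]

static int my_sync_dio_write(struct block_device *bdev, struct iov_iter *iter,
			     loff_t pos)
{
	struct bio *bio;
	int ret;

	while (iov_iter_count(iter)) {
		bio = bio_alloc(bdev,
				bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS),
				REQ_OP_WRITE, GFP_KERNEL);
		bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;

		ret = bio_iov_iter_get_pages(bio, iter);
		if (ret) {
			bio_put(bio);
			return ret;
		}
		/* remember how far this bio takes us before it completes */
		pos += bio->bi_iter.bi_size;

		ret = submit_bio_wait(bio);
		bio_release_pages(bio, false);
		bio_put(bio);
		if (ret)
			return ret;
	}
	return 0;
}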
@@ -1179,7 +1367,8 @@ static void submit_bio_wait_endio(struct bio *bio)
*/
int submit_bio_wait(struct bio *bio)
{
- DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map);
+ DECLARE_COMPLETION_ONSTACK_MAP(done,
+ bio->bi_bdev->bd_disk->lockdep_map);
unsigned long hang_check;
bio->bi_private = &done;
@@ -1200,18 +1389,7 @@ int submit_bio_wait(struct bio *bio)
}
EXPORT_SYMBOL(submit_bio_wait);
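[Editorial illustration only: a minimal on-stack synchronous read built on submit_bio_wait(); the helper name is an assumption.]

static int my_read_page_sync(struct block_device *bdev, sector_t sector,
			     struct page *page)
{
	struct bio_vec bvec;
	struct bio bio;
	int ret;

	bio_init(&bio, bdev, &bvec, 1, REQ_OP_READ);
	bio.bi_iter.bi_sector = sector;
	__bio_add_page(&bio, page, PAGE_SIZE, 0);

	ret = submit_bio_wait(&bio);
	bio_uninit(&bio);
	return ret;
}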
-/**
- * bio_advance - increment/complete a bio by some number of bytes
- * @bio: bio to advance
- * @bytes: number of bytes to complete
- *
- * This updates bi_sector, bi_size and bi_idx; if the number of bytes to
- * complete doesn't align with a bvec boundary, then bv_len and bv_offset will
- * be updated on the last bvec as well.
- *
- * @bio will then represent the remaining, uncompleted portion of the io.
- */
-void bio_advance(struct bio *bio, unsigned bytes)
+void __bio_advance(struct bio *bio, unsigned bytes)
{
if (bio_integrity(bio))
bio_integrity_advance(bio, bytes);
@@ -1219,35 +1397,25 @@ void bio_advance(struct bio *bio, unsigned bytes)
bio_crypt_advance(bio, bytes);
bio_advance_iter(bio, &bio->bi_iter, bytes);
}
-EXPORT_SYMBOL(bio_advance);
+EXPORT_SYMBOL(__bio_advance);
void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter,
struct bio *src, struct bvec_iter *src_iter)
{
- struct bio_vec src_bv, dst_bv;
- void *src_p, *dst_p;
- unsigned bytes;
-
while (src_iter->bi_size && dst_iter->bi_size) {
- src_bv = bio_iter_iovec(src, *src_iter);
- dst_bv = bio_iter_iovec(dst, *dst_iter);
+ struct bio_vec src_bv = bio_iter_iovec(src, *src_iter);
+ struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter);
+ unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ void *src_buf = bvec_kmap_local(&src_bv);
+ void *dst_buf = bvec_kmap_local(&dst_bv);
- bytes = min(src_bv.bv_len, dst_bv.bv_len);
+ memcpy(dst_buf, src_buf, bytes);
- src_p = kmap_atomic(src_bv.bv_page);
- dst_p = kmap_atomic(dst_bv.bv_page);
+ kunmap_local(dst_buf);
+ kunmap_local(src_buf);
- memcpy(dst_p + dst_bv.bv_offset,
- src_p + src_bv.bv_offset,
- bytes);
-
- kunmap_atomic(dst_p);
- kunmap_atomic(src_p);
-
- flush_dcache_page(dst_bv.bv_page);
-
- bio_advance_iter(src, src_iter, bytes);
- bio_advance_iter(dst, dst_iter, bytes);
+ bio_advance_iter_single(src, src_iter, bytes);
+ bio_advance_iter_single(dst, dst_iter, bytes);
}
}
EXPORT_SYMBOL(bio_copy_data_iter);
@@ -1269,43 +1437,6 @@ void bio_copy_data(struct bio *dst, struct bio *src)
}
EXPORT_SYMBOL(bio_copy_data);
-/**
- * bio_list_copy_data - copy contents of data buffers from one chain of bios to
- * another
- * @src: source bio list
- * @dst: destination bio list
- *
- * Stops when it reaches the end of either the @src list or @dst list - that is,
- * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of
- * bios).
- */
-void bio_list_copy_data(struct bio *dst, struct bio *src)
-{
- struct bvec_iter src_iter = src->bi_iter;
- struct bvec_iter dst_iter = dst->bi_iter;
-
- while (1) {
- if (!src_iter.bi_size) {
- src = src->bi_next;
- if (!src)
- break;
-
- src_iter = src->bi_iter;
- }
-
- if (!dst_iter.bi_size) {
- dst = dst->bi_next;
- if (!dst)
- break;
-
- dst_iter = dst->bi_iter;
- }
-
- bio_copy_data_iter(dst, &dst_iter, src, &src_iter);
- }
-}
-EXPORT_SYMBOL(bio_list_copy_data);
-
void bio_free_pages(struct bio *bio)
{
struct bio_vec *bvec;
@@ -1320,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages);
* bio_set_pages_dirty() and bio_check_pages_dirty() are support functions
* for performing direct-IO in BIOs.
*
- * The problem is that we cannot run set_page_dirty() from interrupt context
+ * The problem is that we cannot run folio_mark_dirty() from interrupt context
* because the required locks are not interrupt-safe. So what we can do is to
* mark the pages dirty _before_ performing IO. And in interrupt context,
* check that the pages are still dirty. If so, fine. If not, redirty them
* in process context.
*
- * We special-case compound pages here: normally this means reads into hugetlb
- * pages. The logic in here doesn't really work right for compound pages
- * because the VM does not uniformly chase down the head page in all cases.
- * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't
- * handle them at all. So we skip compound pages here at an early stage.
- *
* Note that this code is very hard to test under normal circumstances because
* direct-io pins the pages with get_user_pages(). This makes
* is_page_cache_freeable return false, and the VM will not clean the pages.
@@ -1347,14 +1472,15 @@ EXPORT_SYMBOL(bio_free_pages);
*/
void bio_set_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec;
- struct bvec_iter_all iter_all;
+ struct folio_iter fi;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (!PageCompound(bvec->bv_page))
- set_page_dirty_lock(bvec->bv_page);
+ bio_for_each_folio_all(fi, bio) {
+ folio_lock(fi.folio);
+ folio_mark_dirty(fi.folio);
+ folio_unlock(fi.folio);
}
}
+EXPORT_SYMBOL_GPL(bio_set_pages_dirty);
/*
* bio_check_pages_dirty() will check that all the BIO's pages are still dirty.
@@ -1363,8 +1489,8 @@ void bio_set_pages_dirty(struct bio *bio)
* the BIO and re-dirty the pages in process context.
*
* It is expected that bio_check_pages_dirty() will wholly own the BIO from
- * here on. It will run one put_page() against each page and will run one
- * bio_put() against the BIO.
+ * here on. It will unpin each page and will run one bio_put() against the
+ * BIO.
*/
static void bio_dirty_fn(struct work_struct *work);
@@ -1395,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work)
void bio_check_pages_dirty(struct bio *bio)
{
- struct bio_vec *bvec;
+ struct folio_iter fi;
unsigned long flags;
- struct bvec_iter_all iter_all;
- bio_for_each_segment_all(bvec, bio, iter_all) {
- if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page))
+ bio_for_each_folio_all(fi, bio) {
+ if (!folio_test_dirty(fi.folio))
goto defer;
}
@@ -1414,6 +1539,7 @@ defer:
spin_unlock_irqrestore(&bio_dirty_lock, flags);
schedule_work(&bio_dirty_work);
}
+EXPORT_SYMBOL_GPL(bio_check_pages_dirty);
static inline bool bio_remaining_done(struct bio *bio)
{
@@ -1445,8 +1571,7 @@ static inline bool bio_remaining_done(struct bio *bio)
*
* bio_endio() can be called several times on a bio that has been chained
* using bio_chain(). The ->bi_end_io() function will only be called the
- * last time. At this point the BLK_TA_COMPLETE tracing event will be
- * generated if BIO_TRACE_COMPLETION is set.
+ * last time.
**/
void bio_endio(struct bio *bio)
{
@@ -1456,8 +1581,12 @@ again:
if (!bio_integrity_endio(bio))
return;
- if (bio->bi_disk)
- rq_qos_done_bio(bio->bi_disk->queue, bio);
+ rq_qos_done_bio(bio);
+
+ if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio);
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ }
/*
* Need to have a real endio function for chained bios, otherwise
@@ -1472,11 +1601,6 @@ again:
goto again;
}
- if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_complete(bio->bi_disk->queue, bio);
- bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- }
-
blk_throtl_bio_endio(bio);
/* release cgroup info */
bio_uninit(bio);
@@ -1511,7 +1635,7 @@ struct bio *bio_split(struct bio *bio, int sectors,
if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND))
return NULL;
- split = bio_clone_fast(bio, gfp, bs);
+ split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs);
if (!split)
return NULL;
@@ -1534,12 +1658,15 @@ EXPORT_SYMBOL(bio_split);
* @bio: bio to trim
* @offset: number of sectors to trim from the front of @bio
* @size: size we want to trim @bio to, in sectors
+ *
+ * This function is typically used for bios that are cloned and submitted
+ * to the underlying device in parts.
*/
-void bio_trim(struct bio *bio, int offset, int size)
+void bio_trim(struct bio *bio, sector_t offset, sector_t size)
{
- /* 'bio' is a cloned bio which we need to trim to match
- * the given offset and size.
- */
+ if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS ||
+ offset + size > bio_sectors(bio)))
+ return;
size <<= 9;
if (offset == 0 && size == bio->bi_iter.bi_size)
@@ -1550,7 +1677,6 @@ void bio_trim(struct bio *bio, int offset, int size)
if (bio_integrity(bio))
bio_integrity_trim(bio);
-
}
EXPORT_SYMBOL_GPL(bio_trim);
@@ -1560,7 +1686,7 @@ EXPORT_SYMBOL_GPL(bio_trim);
*/
int biovec_init_pool(mempool_t *pool, int pool_entries)
{
- struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX;
+ struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1;
return mempool_init_slab_pool(pool, pool_entries, bp->slab);
}
@@ -1573,6 +1699,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries)
*/
void bioset_exit(struct bio_set *bs)
{
+ bio_alloc_cache_destroy(bs);
if (bs->rescue_workqueue)
destroy_workqueue(bs->rescue_workqueue);
bs->rescue_workqueue = NULL;
@@ -1603,9 +1730,9 @@ EXPORT_SYMBOL(bioset_exit);
* Note that the bio must be embedded at the END of that structure always,
* or things will break badly.
* If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated
- * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast().
- * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to
- * dispatch queued requests when the mempool runs out of space.
+ * for allocating iovecs. This pool is not needed e.g. for bio_init_clone().
+ * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used
+ * to dispatch queued requests when the mempool runs out of space.
*
*/
int bioset_init(struct bio_set *bs,
@@ -1613,15 +1740,17 @@ int bioset_init(struct bio_set *bs,
unsigned int front_pad,
int flags)
{
- unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
-
bs->front_pad = front_pad;
+ if (flags & BIOSET_NEED_BVECS)
+ bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec);
+ else
+ bs->back_pad = 0;
spin_lock_init(&bs->rescue_lock);
bio_list_init(&bs->rescue_list);
INIT_WORK(&bs->rescue_work, bio_alloc_rescue);
- bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad);
+ bs->bio_slab = bio_find_or_create_slab(bs);
if (!bs->bio_slab)
return -ENOMEM;
@@ -1632,12 +1761,18 @@ int bioset_init(struct bio_set *bs,
biovec_init_pool(&bs->bvec_pool, pool_size))
goto bad;
- if (!(flags & BIOSET_NEED_RESCUER))
- return 0;
-
- bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0);
- if (!bs->rescue_workqueue)
- goto bad;
+ if (flags & BIOSET_NEED_RESCUER) {
+ bs->rescue_workqueue = alloc_workqueue("bioset",
+ WQ_MEM_RECLAIM, 0);
+ if (!bs->rescue_workqueue)
+ goto bad;
+ }
+ if (flags & BIOSET_PERCPU_CACHE) {
+ bs->cache = alloc_percpu(struct bio_alloc_cache);
+ if (!bs->cache)
+ goto bad;
+ cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead);
+ }
return 0;
bad:
@@ -1646,194 +1781,27 @@ bad:
}
EXPORT_SYMBOL(bioset_init);
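[Editorial illustration of driver-private bio_set creation and teardown with the flags discussed above, including the new BIOSET_PERCPU_CACHE; the driver and pool names are assumptions.]

static struct bio_set my_bio_set;

static int __init my_driver_init(void)
{
	return bioset_init(&my_bio_set, BIO_POOL_SIZE, 0,
			   BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE);
}

static void __exit my_driver_exit(void)
{
	bioset_exit(&my_bio_set);
}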
-/*
- * Initialize and setup a new bio_set, based on the settings from
- * another bio_set.
- */
-int bioset_init_from_src(struct bio_set *bs, struct bio_set *src)
-{
- int flags;
-
- flags = 0;
- if (src->bvec_pool.min_nr)
- flags |= BIOSET_NEED_BVECS;
- if (src->rescue_workqueue)
- flags |= BIOSET_NEED_RESCUER;
-
- return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags);
-}
-EXPORT_SYMBOL(bioset_init_from_src);
-
-#ifdef CONFIG_BLK_CGROUP
-
-/**
- * bio_disassociate_blkg - puts back the blkg reference if associated
- * @bio: target bio
- *
- * Helper to disassociate the blkg from @bio if a blkg is associated.
- */
-void bio_disassociate_blkg(struct bio *bio)
-{
- if (bio->bi_blkg) {
- blkg_put(bio->bi_blkg);
- bio->bi_blkg = NULL;
- }
-}
-EXPORT_SYMBOL_GPL(bio_disassociate_blkg);
-
-/**
- * __bio_associate_blkg - associate a bio with the a blkg
- * @bio: target bio
- * @blkg: the blkg to associate
- *
- * This tries to associate @bio with the specified @blkg. Association failure
- * is handled by walking up the blkg tree. Therefore, the blkg associated can
- * be anything between @blkg and the root_blkg. This situation only happens
- * when a cgroup is dying and then the remaining bios will spill to the closest
- * alive blkg.
- *
- * A reference will be taken on the @blkg and will be released when @bio is
- * freed.
- */
-static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg)
-{
- bio_disassociate_blkg(bio);
-
- bio->bi_blkg = blkg_tryget_closest(blkg);
-}
-
-/**
- * bio_associate_blkg_from_css - associate a bio with a specified css
- * @bio: target bio
- * @css: target css
- *
- * Associate @bio with the blkg found by combining the css's blkg and the
- * request_queue of the @bio. This falls back to the queue's root_blkg if
- * the association fails with the css.
- */
-void bio_associate_blkg_from_css(struct bio *bio,
- struct cgroup_subsys_state *css)
-{
- struct request_queue *q = bio->bi_disk->queue;
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- if (!css || !css->parent)
- blkg = q->root_blkg;
- else
- blkg = blkg_lookup_create(css_to_blkcg(css), q);
-
- __bio_associate_blkg(bio, blkg);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
-
-#ifdef CONFIG_MEMCG
-/**
- * bio_associate_blkg_from_page - associate a bio with the page's blkg
- * @bio: target bio
- * @page: the page to lookup the blkcg from
- *
- * Associate @bio with the blkg from @page's owning memcg and the respective
- * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's
- * root_blkg.
- */
-void bio_associate_blkg_from_page(struct bio *bio, struct page *page)
-{
- struct cgroup_subsys_state *css;
-
- if (!page->mem_cgroup)
- return;
-
- rcu_read_lock();
-
- css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys);
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-#endif /* CONFIG_MEMCG */
-
-/**
- * bio_associate_blkg - associate a bio with a blkg
- * @bio: target bio
- *
- * Associate @bio with the blkg found from the bio's css and request_queue.
- * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is
- * already associated, the css is reused and association redone as the
- * request_queue may have changed.
- */
-void bio_associate_blkg(struct bio *bio)
-{
- struct cgroup_subsys_state *css;
-
- rcu_read_lock();
-
- if (bio->bi_blkg)
- css = &bio_blkcg(bio)->css;
- else
- css = blkcg_css();
-
- bio_associate_blkg_from_css(bio, css);
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_associate_blkg);
-
-/**
- * bio_clone_blkg_association - clone blkg association from src to dst bio
- * @dst: destination bio
- * @src: source bio
- */
-void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+static int __init init_bio(void)
{
- rcu_read_lock();
-
- if (src->bi_blkg)
- __bio_associate_blkg(dst, src->bi_blkg);
+ int i;
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
-#endif /* CONFIG_BLK_CGROUP */
+ BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags));
-static void __init biovec_init_slabs(void)
-{
- int i;
+ bio_integrity_init();
- for (i = 0; i < BVEC_POOL_NR; i++) {
- int size;
+ for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) {
struct biovec_slab *bvs = bvec_slabs + i;
- if (bvs->nr_vecs <= BIO_INLINE_VECS) {
- bvs->slab = NULL;
- continue;
- }
-
- size = bvs->nr_vecs * sizeof(struct bio_vec);
- bvs->slab = kmem_cache_create(bvs->name, size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
+ bvs->slab = kmem_cache_create(bvs->name,
+ bvs->nr_vecs * sizeof(struct bio_vec), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
}
-}
-
-static int __init init_bio(void)
-{
- bio_slab_max = 2;
- bio_slab_nr = 0;
- bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab),
- GFP_KERNEL);
- BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET);
-
- if (!bio_slabs)
- panic("bio: can't allocate bios\n");
-
- bio_integrity_init();
- biovec_init_slabs();
+ cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL,
+ bio_cpu_dead);
- if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS))
+ if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0,
+ BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE))
panic("bio: can't allocate bios\n");
if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE))
diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c
new file mode 100644
index 000000000000..3ec21333f393
--- /dev/null
+++ b/block/blk-cgroup-fc-appid.c
@@ -0,0 +1,57 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "blk-cgroup.h"
+
+/**
+ * blkcg_set_fc_appid - set the fc_app_id field associated with a blkcg
+ * @app_id: application identifier
+ * @cgrp_id: cgroup id
+ * @app_id_len: size of application identifier
+ */
+int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len)
+{
+ struct cgroup *cgrp;
+ struct cgroup_subsys_state *css;
+ struct blkcg *blkcg;
+ int ret = 0;
+
+ if (app_id_len > FC_APPID_LEN)
+ return -EINVAL;
+
+ cgrp = cgroup_get_from_id(cgrp_id);
+ if (IS_ERR(cgrp))
+ return PTR_ERR(cgrp);
+ css = cgroup_get_e_css(cgrp, &io_cgrp_subsys);
+ if (!css) {
+ ret = -ENOENT;
+ goto out_cgrp_put;
+ }
+ blkcg = css_to_blkcg(css);
+ /*
+ * There is a slight race condition on setting the appid.
+ * Worst case an I/O may not find the right id.
+ * This is no different from the I/O we let pass while obtaining
+ * the vmid from the fabric.
+ * Adding the overhead of a lock is not necessary.
+ */
+ strscpy(blkcg->fc_app_id, app_id, app_id_len);
+ css_put(css);
+out_cgrp_put:
+ cgroup_put(cgrp);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blkcg_set_fc_appid);
+
+/**
+ * blkcg_get_fc_appid - get the fc app identifier associated with a bio
+ * @bio: target bio
+ *
+ * On success return the fc_app_id; on failure return NULL.
+ */
+char *blkcg_get_fc_appid(struct bio *bio)
+{
+ if (!bio->bi_blkg || bio->bi_blkg->blkcg->fc_app_id[0] == '\0')
+ return NULL;
+ return bio->bi_blkg->blkcg->fc_app_id;
+}
+EXPORT_SYMBOL_GPL(blkcg_get_fc_appid);
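[Editorial sketch of how an FC low-level driver might consume the identifier when building a command; the command structure and field are hypothetical, only blkcg_get_fc_appid() and FC_APPID_LEN come from the kernel.]

struct my_fc_cmd {
	char app_id[FC_APPID_LEN];	/* hypothetical per-command field */
};

static void my_fc_set_appid(struct my_fc_cmd *cmd, struct bio *bio)
{
	const char *app_id = blkcg_get_fc_appid(bio);

	if (app_id)
		strscpy(cmd->app_id, app_id, sizeof(cmd->app_id));
}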
diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c
index 85d5790ac49b..3304e841df7c 100644
--- a/block/blk-cgroup-rwstat.c
+++ b/block/blk-cgroup-rwstat.c
@@ -109,6 +109,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
lockdep_assert_held(&blkg->q->queue_lock);
+ memset(sum, 0, sizeof(*sum));
rcu_read_lock();
blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) {
struct blkg_rwstat *rwstat;
@@ -122,7 +123,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
rwstat = (void *)pos_blkg + off;
for (i = 0; i < BLKG_RWSTAT_NR; i++)
- sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i);
+ sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i);
}
rcu_read_unlock();
}
diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h
index ee746919c41f..022527b0b043 100644
--- a/block/blk-cgroup-rwstat.h
+++ b/block/blk-cgroup-rwstat.h
@@ -6,7 +6,7 @@
#ifndef _BLK_CGROUP_RWSTAT_H
#define _BLK_CGROUP_RWSTAT_H
-#include <linux/blk-cgroup.h>
+#include "blk-cgroup.h"
enum blkg_rwstat_type {
BLKG_RWSTAT_READ,
@@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
* caller is responsible for synchronizing calls to this function.
*/
static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
- unsigned int op, uint64_t val)
+ blk_opf_t opf, uint64_t val)
{
struct percpu_counter *cnt;
- if (op_is_discard(op))
+ if (op_is_discard(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
- else if (op_is_write(op))
+ else if (op_is_write(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
- if (op_is_sync(op))
+ if (op_is_sync(opf))
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
else
cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 0ecc897b225c..ff93c385ba5a 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -23,16 +23,18 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/slab.h>
-#include <linux/genhd.h>
#include <linux/delay.h>
#include <linux/atomic.h>
#include <linux/ctype.h>
-#include <linux/blk-cgroup.h>
-#include <linux/tracehook.h>
+#include <linux/resume_user_mode.h>
#include <linux/psi.h>
+#include <linux/part_stat.h>
#include "blk.h"
+#include "blk-cgroup.h"
+#include "blk-ioprio.h"
+#include "blk-throttle.h"
-#define MAX_KEY_LEN 100
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu);
/*
* blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation.
@@ -55,7 +57,58 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS];
static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */
bool blkcg_debug_stats = false;
-static struct workqueue_struct *blkcg_punt_bio_wq;
+
+static DEFINE_RAW_SPINLOCK(blkg_stat_lock);
+
+#define BLKG_DESTROY_BATCH_SIZE 64
+
+/*
+ * Lockless lists for tracking IO stats update
+ *
+ * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg).
+ * There are multiple blkg's (one for each block device) attached to each
+ * blkcg. The rstat code keeps track of which cpu has IO stats updated,
+ * but it doesn't know which blkg has the updated stats. If there are many
+ * block devices in a system, the cost of iterating all the blkg's to flush
+ * out the IO stats can be high. To reduce such overhead, a set of percpu
+ * lockless lists (lhead) per blkcg are used to track the set of recently
+ * updated iostat_cpu's since the last flush. An iostat_cpu will be put
+ * onto the lockless list on the update side [blk_cgroup_bio_start()] if
+ * not there yet and then removed when being flushed [blkcg_rstat_flush()].
+ * References to blkg are gotten and then put back in the process to
+ * protect against blkg removal.
+ *
+ * Return: 0 if successful or -ENOMEM if allocation fails.
+ */
+static int init_blkcg_llists(struct blkcg *blkcg)
+{
+ int cpu;
+
+ blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL);
+ if (!blkcg->lhead)
+ return -ENOMEM;
+
+ for_each_possible_cpu(cpu)
+ init_llist_head(per_cpu_ptr(blkcg->lhead, cpu));
+ return 0;
+}
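As a rough model of the per-CPU lockless-list scheme described above (push on the update path, detach the whole list on the flush path), here is a minimal userspace C11 sketch. It is not kernel code; NR_FAKE_CPUS, struct stat_node, stat_push() and stat_del_all() are illustrative stand-ins for the possible-CPU mask, blkg_iostat_set, llist_add() and llist_del_all().

#include <stdatomic.h>
#include <stdio.h>
#include <stdlib.h>

#define NR_FAKE_CPUS 4                 /* stand-in for the possible-CPU mask */

struct stat_node {                     /* stand-in for blkg_iostat_set */
    struct stat_node *next;
    int value;
};

/* one lockless list head per "CPU", like blkcg->lhead */
static _Atomic(struct stat_node *) lhead[NR_FAKE_CPUS];

/* update side: push a node, analogous to llist_add() */
static void stat_push(int cpu, struct stat_node *node)
{
    struct stat_node *old = atomic_load(&lhead[cpu]);

    do {
        node->next = old;
    } while (!atomic_compare_exchange_weak(&lhead[cpu], &old, node));
}

/* flush side: detach the whole list at once, analogous to llist_del_all() */
static struct stat_node *stat_del_all(int cpu)
{
    return atomic_exchange(&lhead[cpu], NULL);
}

int main(void)
{
    for (int i = 0; i < 8; i++) {
        struct stat_node *n = malloc(sizeof(*n));

        n->value = i;
        stat_push(i % NR_FAKE_CPUS, n);
    }
    for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
        long sum = 0;

        for (struct stat_node *n = stat_del_all(cpu); n; ) {
            struct stat_node *next = n->next;

            sum += n->value;
            free(n);
            n = next;
        }
        printf("cpu %d flushed sum %ld\n", cpu, sum);
    }
    return 0;
}

The property mirrored here is that the flush side detaches each per-CPU list in a single atomic exchange, so it never has to walk every blkg to find the ones with pending updates.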
+
+/**
+ * blkcg_css - find the current css
+ *
+ * Find the css associated with either the kthread or the current task.
+ * This may return a dying css, so it is up to the caller to use tryget logic
+ * to confirm it is alive and well.
+ */
+static struct cgroup_subsys_state *blkcg_css(void)
+{
+ struct cgroup_subsys_state *css;
+
+ css = kthread_blkcg();
+ if (css)
+ return css;
+ return task_css(current, io_cgrp_id);
+}
static bool blkcg_policy_enabled(struct request_queue *q,
const struct blkcg_policy *pol)
@@ -63,6 +116,37 @@ static bool blkcg_policy_enabled(struct request_queue *q,
return pol && test_bit(pol->plid, q->blkcg_pols);
}
+static void blkg_free_workfn(struct work_struct *work)
+{
+ struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
+ free_work);
+ struct request_queue *q = blkg->q;
+ int i;
+
+ /*
+ * pd_free_fn() can also be called from blkcg_deactivate_policy(),
+ * in order to make sure pd_free_fn() is called in order, the deletion
+ * of the list blkg->q_node is delayed to here from blkg_destroy(), and
+ * blkcg_mutex is used to synchronize blkg_free_workfn() and
+ * blkcg_deactivate_policy().
+ */
+ mutex_lock(&q->blkcg_mutex);
+ for (i = 0; i < BLKCG_MAX_POLS; i++)
+ if (blkg->pd[i])
+ blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+ if (blkg->parent)
+ blkg_put(blkg->parent);
+ spin_lock_irq(&q->queue_lock);
+ list_del_init(&blkg->q_node);
+ spin_unlock_irq(&q->queue_lock);
+ mutex_unlock(&q->blkcg_mutex);
+
+ blk_put_queue(q);
+ free_percpu(blkg->iostat_cpu);
+ percpu_ref_exit(&blkg->refcnt);
+ kfree(blkg);
+}
+
/**
* blkg_free - free a blkg
* @blkg: blkg to free
@@ -71,33 +155,37 @@ static bool blkcg_policy_enabled(struct request_queue *q,
*/
static void blkg_free(struct blkcg_gq *blkg)
{
- int i;
-
if (!blkg)
return;
- for (i = 0; i < BLKCG_MAX_POLS; i++)
- if (blkg->pd[i])
- blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
-
- free_percpu(blkg->iostat_cpu);
- percpu_ref_exit(&blkg->refcnt);
- kfree(blkg);
+ /*
+ * Both ->pd_free_fn() and request queue's release handler may
+ * sleep, so free us by scheduling one work func
+ */
+ INIT_WORK(&blkg->free_work, blkg_free_workfn);
+ schedule_work(&blkg->free_work);
}
static void __blkg_release(struct rcu_head *rcu)
{
struct blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head);
+ struct blkcg *blkcg = blkg->blkcg;
+ int cpu;
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
WARN_ON(!bio_list_empty(&blkg->async_bios));
+#endif
+ /*
+ * Flush all the non-empty percpu lockless lists before releasing
+ * us, given that these stats belong to us.
+ *
+ * blkg_stat_lock is for serializing blkg stat updates.
+ */
+ for_each_possible_cpu(cpu)
+ __blkcg_rstat_flush(blkcg, cpu);
/* release the blkcg and parent blkg refs this blkg has been holding */
css_put(&blkg->blkcg->css);
- if (blkg->parent)
- blkg_put(blkg->parent);
-
- wb_congested_put(blkg->wb_congested);
-
blkg_free(blkg);
}
@@ -116,125 +204,184 @@ static void blkg_release(struct percpu_ref *ref)
call_rcu(&blkg->rcu_head, __blkg_release);
}
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
+static struct workqueue_struct *blkcg_punt_bio_wq;
+
static void blkg_async_bio_workfn(struct work_struct *work)
{
struct blkcg_gq *blkg = container_of(work, struct blkcg_gq,
async_bio_work);
struct bio_list bios = BIO_EMPTY_LIST;
struct bio *bio;
+ struct blk_plug plug;
+ bool need_plug = false;
/* as long as there are pending bios, @blkg can't go away */
- spin_lock_bh(&blkg->async_bio_lock);
+ spin_lock(&blkg->async_bio_lock);
bio_list_merge(&bios, &blkg->async_bios);
bio_list_init(&blkg->async_bios);
- spin_unlock_bh(&blkg->async_bio_lock);
+ spin_unlock(&blkg->async_bio_lock);
+ /* start plug only when bio_list contains at least 2 bios */
+ if (bios.head && bios.head->bi_next) {
+ need_plug = true;
+ blk_start_plug(&plug);
+ }
while ((bio = bio_list_pop(&bios)))
submit_bio(bio);
+ if (need_plug)
+ blk_finish_plug(&plug);
+}
+
+/*
+ * When a shared kthread issues a bio for a cgroup, doing so synchronously can
+ * lead to priority inversions as the kthread can be trapped waiting for that
+ * cgroup. Use this helper instead of submit_bio to punt the actual issuing to
+ * a dedicated per-blkcg work item to avoid such priority inversions.
+ */
+void blkcg_punt_bio_submit(struct bio *bio)
+{
+ struct blkcg_gq *blkg = bio->bi_blkg;
+
+ if (blkg->parent) {
+ spin_lock(&blkg->async_bio_lock);
+ bio_list_add(&blkg->async_bios, bio);
+ spin_unlock(&blkg->async_bio_lock);
+ queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
+ } else {
+ /* never bounce for the root cgroup */
+ submit_bio(bio);
+ }
+}
+EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit);
+
+static int __init blkcg_punt_bio_init(void)
+{
+ blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
+ WQ_MEM_RECLAIM | WQ_FREEZABLE |
+ WQ_UNBOUND | WQ_SYSFS, 0);
+ if (!blkcg_punt_bio_wq)
+ return -ENOMEM;
+ return 0;
+}
+subsys_initcall(blkcg_punt_bio_init);
+#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */
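The punt path above follows a common producer/worker shape: queue under a short-lived lock, then have the worker splice out the whole list under the lock and issue everything outside it. A hedged userspace sketch of that shape using pthreads follows; work_item, punt() and drain() are made-up names, not kernel APIs, and queue_work()/plugging are only hinted at in comments.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

struct work_item {                 /* stand-in for a punted bio */
    struct work_item *next;
    int id;
};

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;  /* ~async_bio_lock */
static struct work_item *pending;                         /* ~async_bios */

/* producer side, analogous to blkcg_punt_bio_submit() */
static void punt(struct work_item *item)
{
    pthread_mutex_lock(&lock);
    item->next = pending;
    pending = item;
    pthread_mutex_unlock(&lock);
    /* a real implementation would queue_work() here */
}

/* worker side, analogous to blkg_async_bio_workfn() */
static void *drain(void *arg)
{
    struct work_item *list;

    (void)arg;
    /* splice the whole list under the lock, process it outside the lock */
    pthread_mutex_lock(&lock);
    list = pending;
    pending = NULL;
    pthread_mutex_unlock(&lock);

    /* "plug" only when there are at least two items, as in the workfn */
    if (list && list->next)
        printf("batching submission\n");

    while (list) {
        struct work_item *next = list->next;

        printf("submitting item %d\n", list->id);
        free(list);
        list = next;
    }
    return NULL;
}

int main(void)
{
    pthread_t t;

    for (int i = 0; i < 3; i++) {
        struct work_item *it = malloc(sizeof(*it));

        it->id = i;
        punt(it);
    }
    pthread_create(&t, NULL, drain, NULL);
    pthread_join(t, NULL);
    return 0;
}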
+
+/**
+ * bio_blkcg_css - return the blkcg CSS associated with a bio
+ * @bio: target bio
+ *
+ * This returns the CSS for the blkcg associated with a bio, or %NULL if not
+ * associated. Callers are expected to either handle %NULL or know association
+ * has been done prior to calling this.
+ */
+struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio)
+{
+ if (!bio || !bio->bi_blkg)
+ return NULL;
+ return &bio->bi_blkg->blkcg->css;
+}
+EXPORT_SYMBOL_GPL(bio_blkcg_css);
+
+/**
+ * blkcg_parent - get the parent of a blkcg
+ * @blkcg: blkcg of interest
+ *
+ * Return the parent blkcg of @blkcg. Can be called anytime.
+ */
+static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
+{
+ return css_to_blkcg(blkcg->css.parent);
}
/**
* blkg_alloc - allocate a blkg
* @blkcg: block cgroup the new blkg is associated with
- * @q: request_queue the new blkg is associated with
+ * @disk: gendisk the new blkg is associated with
* @gfp_mask: allocation mask to use
*
- * Allocate a new blkg assocating @blkcg and @q.
+ * Allocate a new blkg associating @blkcg and @disk.
*/
-static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q,
+static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk,
gfp_t gfp_mask)
{
struct blkcg_gq *blkg;
int i, cpu;
/* alloc and init base part */
- blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node);
+ blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node);
if (!blkg)
return NULL;
-
if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask))
- goto err_free;
-
+ goto out_free_blkg;
blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask);
if (!blkg->iostat_cpu)
- goto err_free;
+ goto out_exit_refcnt;
+ if (!blk_get_queue(disk->queue))
+ goto out_free_iostat;
- blkg->q = q;
+ blkg->q = disk->queue;
INIT_LIST_HEAD(&blkg->q_node);
+ blkg->blkcg = blkcg;
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
spin_lock_init(&blkg->async_bio_lock);
bio_list_init(&blkg->async_bios);
INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn);
- blkg->blkcg = blkcg;
+#endif
u64_stats_init(&blkg->iostat.sync);
- for_each_possible_cpu(cpu)
+ for_each_possible_cpu(cpu) {
u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync);
+ per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg;
+ }
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkg_policy_data *pd;
- if (!blkcg_policy_enabled(q, pol))
+ if (!blkcg_policy_enabled(disk->queue, pol))
continue;
/* alloc per-policy data and attach it to blkg */
- pd = pol->pd_alloc_fn(gfp_mask, q, blkcg);
+ pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask);
if (!pd)
- goto err_free;
-
+ goto out_free_pds;
blkg->pd[i] = pd;
pd->blkg = blkg;
pd->plid = i;
+ pd->online = false;
}
return blkg;
-err_free:
- blkg_free(blkg);
- return NULL;
-}
-
-struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
- struct request_queue *q, bool update_hint)
-{
- struct blkcg_gq *blkg;
-
- /*
- * Hint didn't match. Look up from the radix tree. Note that the
- * hint can only be updated under queue_lock as otherwise @blkg
- * could have already been removed from blkg_tree. The caller is
- * responsible for grabbing queue_lock if @update_hint.
- */
- blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
- if (blkg && blkg->q == q) {
- if (update_hint) {
- lockdep_assert_held(&q->queue_lock);
- rcu_assign_pointer(blkcg->blkg_hint, blkg);
- }
- return blkg;
- }
-
+out_free_pds:
+ while (--i >= 0)
+ if (blkg->pd[i])
+ blkcg_policy[i]->pd_free_fn(blkg->pd[i]);
+ blk_put_queue(disk->queue);
+out_free_iostat:
+ free_percpu(blkg->iostat_cpu);
+out_exit_refcnt:
+ percpu_ref_exit(&blkg->refcnt);
+out_free_blkg:
+ kfree(blkg);
return NULL;
}
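The new out_free_pds / out_free_iostat / out_exit_refcnt / out_free_blkg labels replace the single blkg_free() call with the usual reverse-order goto unwind. A small standalone sketch of that idiom is below; widget_alloc() and its fields are invented purely for illustration.

#include <stdlib.h>

struct widget {
    void *a;
    void *b;
    void *c;
};

/* allocate three resources; on failure, unwind in reverse allocation order */
static struct widget *widget_alloc(void)
{
    struct widget *w = malloc(sizeof(*w));

    if (!w)
        return NULL;
    w->a = malloc(16);
    if (!w->a)
        goto out_free_widget;
    w->b = malloc(16);
    if (!w->b)
        goto out_free_a;
    w->c = malloc(16);
    if (!w->c)
        goto out_free_b;
    return w;

out_free_b:
    free(w->b);
out_free_a:
    free(w->a);
out_free_widget:
    free(w);
    return NULL;
}

int main(void)
{
    struct widget *w = widget_alloc();

    if (w) {
        free(w->c);
        free(w->b);
        free(w->a);
        free(w);
    }
    return 0;
}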
-EXPORT_SYMBOL_GPL(blkg_lookup_slowpath);
/*
* If @new_blkg is %NULL, this function tries to allocate a new one as
* necessary using %GFP_NOWAIT. @new_blkg is always consumed on return.
*/
-static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
- struct request_queue *q,
+static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
struct blkcg_gq *new_blkg)
{
struct blkcg_gq *blkg;
- struct bdi_writeback_congested *wb_congested;
int i, ret;
- WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
+ lockdep_assert_held(&disk->queue->queue_lock);
/* request_queue is dying, do not create/recreate a blkg */
- if (blk_queue_dying(q)) {
+ if (blk_queue_dying(disk->queue)) {
ret = -ENODEV;
goto err_free_blkg;
}
@@ -245,31 +392,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
goto err_free_blkg;
}
- wb_congested = wb_congested_get_create(q->backing_dev_info,
- blkcg->css.id,
- GFP_NOWAIT | __GFP_NOWARN);
- if (!wb_congested) {
- ret = -ENOMEM;
- goto err_put_css;
- }
-
/* allocate */
if (!new_blkg) {
- new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN);
+ new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto err_put_congested;
+ goto err_put_css;
}
}
blkg = new_blkg;
- blkg->wb_congested = wb_congested;
/* link parent */
if (blkcg_parent(blkcg)) {
- blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false);
+ blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue);
if (WARN_ON_ONCE(!blkg->parent)) {
ret = -ENODEV;
- goto err_put_congested;
+ goto err_put_css;
}
blkg_get(blkg->parent);
}
@@ -284,16 +422,19 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
/* insert */
spin_lock(&blkcg->lock);
- ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg);
+ ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg);
if (likely(!ret)) {
hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list);
- list_add(&blkg->q_node, &q->blkg_list);
+ list_add(&blkg->q_node, &disk->queue->blkg_list);
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
- if (blkg->pd[i] && pol->pd_online_fn)
- pol->pd_online_fn(blkg->pd[i]);
+ if (blkg->pd[i]) {
+ if (pol->pd_online_fn)
+ pol->pd_online_fn(blkg->pd[i]);
+ blkg->pd[i]->online = true;
+ }
}
}
blkg->online = true;
@@ -306,40 +447,49 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg,
blkg_put(blkg);
return ERR_PTR(ret);
-err_put_congested:
- wb_congested_put(wb_congested);
err_put_css:
css_put(&blkcg->css);
err_free_blkg:
- blkg_free(new_blkg);
+ if (new_blkg)
+ blkg_free(new_blkg);
return ERR_PTR(ret);
}
/**
- * __blkg_lookup_create - lookup blkg, try to create one if not there
+ * blkg_lookup_create - lookup blkg, try to create one if not there
* @blkcg: blkcg of interest
- * @q: request_queue of interest
+ * @disk: gendisk of interest
*
- * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to
+ * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to
* create one. blkg creation is performed recursively from blkcg_root such
* that all non-root blkg's have access to the parent blkg. This function
- * should be called under RCU read lock and @q->queue_lock.
+ * should be called under RCU read lock and takes @disk->queue->queue_lock.
*
* Returns the blkg or the closest blkg if blkg_create() fails as it walks
* down from root.
*/
-struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
+static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
+ struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blkcg_gq *blkg;
+ unsigned long flags;
WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
- blkg = __blkg_lookup(blkcg, q, true);
+ blkg = blkg_lookup(blkcg, q);
if (blkg)
return blkg;
+ spin_lock_irqsave(&q->queue_lock, flags);
+ blkg = blkg_lookup(blkcg, q);
+ if (blkg) {
+ if (blkcg != &blkcg_root &&
+ blkg != rcu_dereference(blkcg->blkg_hint))
+ rcu_assign_pointer(blkcg->blkg_hint, blkg);
+ goto found;
+ }
+
/*
* Create blkgs walking down from blkcg_root to @blkcg, so that all
* non-root blkgs have access to their parents. Returns the closest
@@ -351,7 +501,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
struct blkcg_gq *ret_blkg = q->root_blkg;
while (parent) {
- blkg = __blkg_lookup(parent, q, false);
+ blkg = blkg_lookup(parent, q);
if (blkg) {
/* remember closest blkg */
ret_blkg = blkg;
@@ -361,35 +511,17 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
parent = blkcg_parent(parent);
}
- blkg = blkg_create(pos, q, NULL);
- if (IS_ERR(blkg))
- return ret_blkg;
+ blkg = blkg_create(pos, disk, NULL);
+ if (IS_ERR(blkg)) {
+ blkg = ret_blkg;
+ break;
+ }
if (pos == blkcg)
- return blkg;
- }
-}
-
-/**
- * blkg_lookup_create - find or create a blkg
- * @blkcg: target block cgroup
- * @q: target request_queue
- *
- * This looks up or creates the blkg representing the unique pair
- * of the blkcg and the request_queue.
- */
-struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
- struct request_queue *q)
-{
- struct blkcg_gq *blkg = blkg_lookup(blkcg, q);
-
- if (unlikely(!blkg)) {
- unsigned long flags;
-
- spin_lock_irqsave(&q->queue_lock, flags);
- blkg = __blkg_lookup_create(blkcg, q);
- spin_unlock_irqrestore(&q->queue_lock, flags);
+ break;
}
+found:
+ spin_unlock_irqrestore(&q->queue_lock, flags);
return blkg;
}
@@ -401,21 +533,28 @@ static void blkg_destroy(struct blkcg_gq *blkg)
lockdep_assert_held(&blkg->q->queue_lock);
lockdep_assert_held(&blkcg->lock);
- /* Something wrong if we are trying to remove same group twice */
- WARN_ON_ONCE(list_empty(&blkg->q_node));
- WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node));
+ /*
+ * blkg stays on the queue list until blkg_free_workfn(), see details in
+ * blkg_free_workfn(), hence this function can be called from
+ * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before
+ * blkg_free_workfn().
+ */
+ if (hlist_unhashed(&blkg->blkcg_node))
+ return;
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
- if (blkg->pd[i] && pol->pd_offline_fn)
- pol->pd_offline_fn(blkg->pd[i]);
+ if (blkg->pd[i] && blkg->pd[i]->online) {
+ blkg->pd[i]->online = false;
+ if (pol->pd_offline_fn)
+ pol->pd_offline_fn(blkg->pd[i]);
+ }
}
blkg->online = false;
radix_tree_delete(&blkcg->blkg_tree, blkg->q->id);
- list_del_init(&blkg->q_node);
hlist_del_init_rcu(&blkg->blkcg_node);
/*
@@ -433,23 +572,47 @@ static void blkg_destroy(struct blkcg_gq *blkg)
percpu_ref_kill(&blkg->refcnt);
}
-/**
- * blkg_destroy_all - destroy all blkgs associated with a request_queue
- * @q: request_queue of interest
- *
- * Destroy all blkgs associated with @q.
- */
-static void blkg_destroy_all(struct request_queue *q)
+static void blkg_destroy_all(struct gendisk *disk)
{
- struct blkcg_gq *blkg, *n;
+ struct request_queue *q = disk->queue;
+ struct blkcg_gq *blkg;
+ int count = BLKG_DESTROY_BATCH_SIZE;
+ int i;
+restart:
spin_lock_irq(&q->queue_lock);
- list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
+ list_for_each_entry(blkg, &q->blkg_list, q_node) {
struct blkcg *blkcg = blkg->blkcg;
+ if (hlist_unhashed(&blkg->blkcg_node))
+ continue;
+
spin_lock(&blkcg->lock);
blkg_destroy(blkg);
spin_unlock(&blkcg->lock);
+
+ /*
+ * In order to avoid holding the spin lock for too long, release
+ * it after a batch of blkgs has been destroyed.
+ */
+ if (!(--count)) {
+ count = BLKG_DESTROY_BATCH_SIZE;
+ spin_unlock_irq(&q->queue_lock);
+ cond_resched();
+ goto restart;
+ }
+ }
+
+ /*
+ * Mark policy deactivated since policy offline has been done, and
+ * the free is scheduled, so future blkcg_deactivate_policy() can
+ * be bypassed
+ */
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (pol)
+ __clear_bit(pol->plid, q->blkcg_pols);
}
q->root_blkg = NULL;
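blkg_destroy_all() now drops the queue lock and reschedules after every BLKG_DESTROY_BATCH_SIZE destructions instead of holding the lock for the whole walk. The userspace approximation below shows just that batching pattern, with a mutex standing in for the queue lock and sched_yield() for cond_resched(); all names are illustrative.

#include <pthread.h>
#include <sched.h>
#include <stdio.h>
#include <stdlib.h>

#define DESTROY_BATCH 64               /* mirrors BLKG_DESTROY_BATCH_SIZE */

struct node {
    struct node *next;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
static struct node *head;

/* destroy everything, dropping the lock and yielding after each batch so
 * other lock users (and the scheduler) are not starved */
static void destroy_all(void)
{
    int count = DESTROY_BATCH;

restart:
    pthread_mutex_lock(&list_lock);
    while (head) {
        struct node *victim = head;

        head = victim->next;
        free(victim);

        if (!--count) {
            count = DESTROY_BATCH;
            pthread_mutex_unlock(&list_lock);
            sched_yield();             /* userspace stand-in for cond_resched() */
            goto restart;
        }
    }
    pthread_mutex_unlock(&list_lock);
}

int main(void)
{
    for (int i = 0; i < 1000; i++) {
        struct node *n = malloc(sizeof(*n));

        n->next = head;
        head = n;
    }
    destroy_all();
    printf("done\n");
    return 0;
}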
@@ -476,8 +639,13 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
struct blkg_iostat_set *bis =
per_cpu_ptr(blkg->iostat_cpu, cpu);
memset(bis, 0, sizeof(*bis));
+
+ /* Re-initialize the cleared blkg_iostat_set */
+ u64_stats_init(&bis->sync);
+ bis->blkg = blkg;
}
memset(&blkg->iostat, 0, sizeof(blkg->iostat));
+ u64_stats_init(&blkg->iostat.sync);
for (i = 0; i < BLKCG_MAX_POLS; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
@@ -494,10 +662,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
const char *blkg_dev_name(struct blkcg_gq *blkg)
{
- /* some drivers (floppy) instantiate a queue w/o disk registered */
- if (blkg->q->backing_dev_info->dev)
- return bdi_dev_name(blkg->q->backing_dev_info);
- return NULL;
+ if (!blkg->q->disk)
+ return NULL;
+ return bdi_dev_name(blkg->q->disk->bdi);
}
/**
@@ -547,7 +714,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs);
* @pd: policy private data of interest
* @v: value to print
*
- * Print @v to @sf for the device assocaited with @pd.
+ * Print @v to @sf for the device associated with @pd.
*/
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
{
@@ -561,93 +728,119 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v)
}
EXPORT_SYMBOL_GPL(__blkg_prfill_u64);
-/* Performs queue bypass and policy enabled checks then looks up blkg. */
-static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg,
- const struct blkcg_policy *pol,
- struct request_queue *q)
+/**
+ * blkg_conf_init - initialize a blkg_conf_ctx
+ * @ctx: blkg_conf_ctx to initialize
+ * @input: input string
+ *
+ * Initialize @ctx which can be used to parse blkg config input string @input.
+ * Once initialized, @ctx can be used with blkg_conf_open_bdev() and
+ * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit().
+ */
+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input)
{
- WARN_ON_ONCE(!rcu_read_lock_held());
- lockdep_assert_held(&q->queue_lock);
-
- if (!blkcg_policy_enabled(q, pol))
- return ERR_PTR(-EOPNOTSUPP);
- return __blkg_lookup(blkcg, q, true /* update_hint */);
+ *ctx = (struct blkg_conf_ctx){ .input = input };
}
+EXPORT_SYMBOL_GPL(blkg_conf_init);
/**
- * blkg_conf_prep - parse and prepare for per-blkg config update
- * @inputp: input string pointer
+ * blkg_conf_open_bdev - parse and open bdev for per-blkg config update
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
*
- * Parse the device node prefix part, MAJ:MIN, of per-blkg config update
- * from @input and get and return the matching gendisk. *@inputp is
- * updated to point past the device node prefix. Returns an ERR_PTR()
- * value on error.
+ * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from
+ * @ctx->input and get and store the matching bdev in @ctx->bdev. @ctx->body is
+ * set to point past the device node prefix.
*
- * Use this function iff blkg_conf_prep() can't be used for some reason.
+ * This function may be called multiple times on @ctx and the extra calls become
+ * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function
+ * explicitly if bdev access is needed without resolving the blkcg / policy part
+ * of @ctx->input. Returns -errno on error.
*/
-struct gendisk *blkcg_conf_get_disk(char **inputp)
+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx)
{
- char *input = *inputp;
+ char *input = ctx->input;
unsigned int major, minor;
- struct gendisk *disk;
- int key_len, part;
+ struct block_device *bdev;
+ int key_len;
+
+ if (ctx->bdev)
+ return 0;
if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2)
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
input += key_len;
if (!isspace(*input))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
input = skip_spaces(input);
- disk = get_gendisk(MKDEV(major, minor), &part);
- if (!disk)
- return ERR_PTR(-ENODEV);
- if (part) {
- put_disk_and_module(disk);
- return ERR_PTR(-ENODEV);
+ bdev = blkdev_get_no_open(MKDEV(major, minor));
+ if (!bdev)
+ return -ENODEV;
+ if (bdev_is_partition(bdev)) {
+ blkdev_put_no_open(bdev);
+ return -ENODEV;
+ }
+
+ mutex_lock(&bdev->bd_queue->rq_qos_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ blkdev_put_no_open(bdev);
+ mutex_unlock(&bdev->bd_queue->rq_qos_mutex);
+ return -ENODEV;
}
- *inputp = input;
- return disk;
+ ctx->body = input;
+ ctx->bdev = bdev;
+ return 0;
}
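The MAJ:MIN parsing in blkg_conf_open_bdev() relies on sscanf()'s %n conversion to find where the device prefix ends, then requires and skips whitespace to locate the policy-specific body. A self-contained sketch of just that parsing step; parse_prefix() is a hypothetical helper, not a kernel function.

#include <ctype.h>
#include <stdio.h>

static int parse_prefix(const char *input, unsigned int *major,
                        unsigned int *minor, const char **body)
{
    int key_len;

    /* %n stores how many characters the "MAJ:MIN" prefix consumed */
    if (sscanf(input, "%u:%u%n", major, minor, &key_len) != 2)
        return -1;
    input += key_len;
    if (!isspace((unsigned char)*input))
        return -1;
    while (isspace((unsigned char)*input))   /* like skip_spaces() */
        input++;
    *body = input;
    return 0;
}

int main(void)
{
    unsigned int major, minor;
    const char *body;

    if (parse_prefix("8:16 wbps=1048576", &major, &minor, &body) == 0)
        printf("dev %u:%u body '%s'\n", major, minor, body);
    return 0;
}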
/**
* blkg_conf_prep - parse and prepare for per-blkg config update
* @blkcg: target block cgroup
* @pol: target policy
- * @input: input string
- * @ctx: blkg_conf_ctx to be filled
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
+ *
+ * Parse per-blkg config update from @ctx->input and initialize @ctx
+ * accordingly. On success, @ctx->body points to the part of @ctx->input
+ * following MAJ:MIN, @ctx->bdev points to the target block device and
+ * @ctx->blkg to the blkg being configured.
*
- * Parse per-blkg config update from @input and initialize @ctx with the
- * result. @ctx->blkg points to the blkg to be updated and @ctx->body the
- * part of @input following MAJ:MIN. This function returns with RCU read
- * lock and queue lock held and must be paired with blkg_conf_finish().
+ * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this
+ * function returns with queue lock held and must be followed by
+ * blkg_conf_exit().
*/
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
- char *input, struct blkg_conf_ctx *ctx)
- __acquires(rcu) __acquires(&disk->queue->queue_lock)
+ struct blkg_conf_ctx *ctx)
+ __acquires(&bdev->bd_queue->queue_lock)
{
struct gendisk *disk;
struct request_queue *q;
struct blkcg_gq *blkg;
int ret;
- disk = blkcg_conf_get_disk(&input);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
+ ret = blkg_conf_open_bdev(ctx);
+ if (ret)
+ return ret;
+ disk = ctx->bdev->bd_disk;
q = disk->queue;
- rcu_read_lock();
+ /*
+ * blkcg_deactivate_policy() requires the queue to be frozen, so grab
+ * q_usage_counter to prevent racing with blkcg_deactivate_policy().
+ */
+ ret = blk_queue_enter(q, 0);
+ if (ret)
+ goto fail;
+
spin_lock_irq(&q->queue_lock);
- blkg = blkg_lookup_check(blkcg, pol, q);
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
+ if (!blkcg_policy_enabled(q, pol)) {
+ ret = -EOPNOTSUPP;
goto fail_unlock;
}
+ blkg = blkg_lookup(blkcg, q);
if (blkg)
goto success;
@@ -661,54 +854,62 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
struct blkcg_gq *new_blkg;
parent = blkcg_parent(blkcg);
- while (parent && !__blkg_lookup(parent, q, false)) {
+ while (parent && !blkg_lookup(parent, q)) {
pos = parent;
parent = blkcg_parent(parent);
}
/* Drop locks to do new blkg allocation with GFP_KERNEL. */
spin_unlock_irq(&q->queue_lock);
- rcu_read_unlock();
- new_blkg = blkg_alloc(pos, q, GFP_KERNEL);
+ new_blkg = blkg_alloc(pos, disk, GFP_KERNEL);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
- goto fail;
+ goto fail_exit_queue;
+ }
+
+ if (radix_tree_preload(GFP_KERNEL)) {
+ blkg_free(new_blkg);
+ ret = -ENOMEM;
+ goto fail_exit_queue;
}
- rcu_read_lock();
spin_lock_irq(&q->queue_lock);
- blkg = blkg_lookup_check(pos, pol, q);
- if (IS_ERR(blkg)) {
- ret = PTR_ERR(blkg);
- goto fail_unlock;
+ if (!blkcg_policy_enabled(q, pol)) {
+ blkg_free(new_blkg);
+ ret = -EOPNOTSUPP;
+ goto fail_preloaded;
}
+ blkg = blkg_lookup(pos, q);
if (blkg) {
blkg_free(new_blkg);
} else {
- blkg = blkg_create(pos, q, new_blkg);
+ blkg = blkg_create(pos, disk, new_blkg);
if (IS_ERR(blkg)) {
ret = PTR_ERR(blkg);
- goto fail_unlock;
+ goto fail_preloaded;
}
}
+ radix_tree_preload_end();
+
if (pos == blkcg)
goto success;
}
success:
- ctx->disk = disk;
+ blk_queue_exit(q);
ctx->blkg = blkg;
- ctx->body = input;
return 0;
+fail_preloaded:
+ radix_tree_preload_end();
fail_unlock:
spin_unlock_irq(&q->queue_lock);
- rcu_read_unlock();
+fail_exit_queue:
+ blk_queue_exit(q);
fail:
- put_disk_and_module(disk);
/*
* If queue was bypassing, we should retry. Do so after a
* short msleep(). It isn't strictly necessary but queue
@@ -724,108 +925,251 @@ fail:
EXPORT_SYMBOL_GPL(blkg_conf_prep);
/**
- * blkg_conf_finish - finish up per-blkg config update
- * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep()
+ * blkg_conf_exit - clean up per-blkg config update
+ * @ctx: blkg_conf_ctx initialized with blkg_conf_init()
*
- * Finish up after per-blkg config update. This function must be paired
- * with blkg_conf_prep().
+ * Clean up after per-blkg config update. This function must be called on all
+ * blkg_conf_ctx's initialized with blkg_conf_init().
*/
-void blkg_conf_finish(struct blkg_conf_ctx *ctx)
- __releases(&ctx->disk->queue->queue_lock) __releases(rcu)
+void blkg_conf_exit(struct blkg_conf_ctx *ctx)
+ __releases(&ctx->bdev->bd_queue->queue_lock)
+ __releases(&ctx->bdev->bd_queue->rq_qos_mutex)
{
- spin_unlock_irq(&ctx->disk->queue->queue_lock);
- rcu_read_unlock();
- put_disk_and_module(ctx->disk);
+ if (ctx->blkg) {
+ spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock);
+ ctx->blkg = NULL;
+ }
+
+ if (ctx->bdev) {
+ mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex);
+ blkdev_put_no_open(ctx->bdev);
+ ctx->body = NULL;
+ ctx->bdev = NULL;
+ }
}
-EXPORT_SYMBOL_GPL(blkg_conf_finish);
+EXPORT_SYMBOL_GPL(blkg_conf_exit);
-static int blkcg_print_stat(struct seq_file *sf, void *v)
+static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
{
- struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
- struct blkcg_gq *blkg;
+ int i;
- cgroup_rstat_flush(blkcg->css.cgroup);
- rcu_read_lock();
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] = src->bytes[i];
+ dst->ios[i] = src->ios[i];
+ }
+}
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkg_iostat_set *bis = &blkg->iostat;
- const char *dname;
- char *buf;
- u64 rbytes, wbytes, rios, wios, dbytes, dios;
- size_t size = seq_get_buf(sf, &buf), off = 0;
- int i;
- bool has_stats = false;
- unsigned seq;
+static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
- spin_lock_irq(&blkg->q->queue_lock);
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] += src->bytes[i];
+ dst->ios[i] += src->ios[i];
+ }
+}
- if (!blkg->online)
- goto skip;
+static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+{
+ int i;
- dname = blkg_dev_name(blkg);
- if (!dname)
- goto skip;
+ for (i = 0; i < BLKG_IOSTAT_NR; i++) {
+ dst->bytes[i] -= src->bytes[i];
+ dst->ios[i] -= src->ios[i];
+ }
+}
- /*
- * Hooray string manipulation, count is the size written NOT
- * INCLUDING THE \0, so size is now count+1 less than what we
- * had before, but we want to start writing the next bit from
- * the \0 so we only add count to buf.
- */
- off += scnprintf(buf+off, size-off, "%s ", dname);
+static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur,
+ struct blkg_iostat *last)
+{
+ struct blkg_iostat delta;
+ unsigned long flags;
+
+ /* propagate percpu delta to global */
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&delta, cur);
+ blkg_iostat_sub(&delta, last);
+ blkg_iostat_add(&blkg->iostat.cur, &delta);
+ blkg_iostat_add(last, &delta);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
+}
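blkcg_iostat_update() propagates only the delta accumulated since the previous flush: subtract the stored 'last' snapshot from the current value, add the difference to the aggregate, then advance the snapshot. A tiny numeric sketch of that bookkeeping; struct counter and propagate() are illustrative names only.

#include <stdio.h>

struct counter {
    unsigned long long cur;     /* running per-cpu value */
    unsigned long long last;    /* value at the previous flush */
};

static void propagate(struct counter *src, unsigned long long *aggregate)
{
    unsigned long long delta = src->cur - src->last;

    *aggregate += delta;
    src->last += delta;         /* i.e. src->last = src->cur */
}

int main(void)
{
    struct counter c = { .cur = 0, .last = 0 };
    unsigned long long total = 0;

    c.cur = 100;
    propagate(&c, &total);      /* total = 100 */
    c.cur = 130;
    propagate(&c, &total);      /* adds only the new 30, total = 130 */
    printf("total=%llu last=%llu\n", total, c.last);
    return 0;
}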
+static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu)
+{
+ struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu);
+ struct llist_node *lnode;
+ struct blkg_iostat_set *bisc, *next_bisc;
+ unsigned long flags;
+
+ rcu_read_lock();
+
+ lnode = llist_del_all(lhead);
+ if (!lnode)
+ goto out;
+
+ /*
+ * This covers concurrent parent blkg updates from blkg_release().
+ *
+ * When flushing from cgroup, cgroup_rstat_lock is always held, so
+ * this lock won't cause contention most of the time.
+ */
+ raw_spin_lock_irqsave(&blkg_stat_lock, flags);
+
+ /*
+ * Iterate only the iostat_cpu's queued in the lockless list.
+ */
+ llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) {
+ struct blkcg_gq *blkg = bisc->blkg;
+ struct blkcg_gq *parent = blkg->parent;
+ struct blkg_iostat cur;
+ unsigned int seq;
+
+ WRITE_ONCE(bisc->lqueued, false);
+
+ /* fetch the current per-cpu values */
do {
- seq = u64_stats_fetch_begin(&bis->sync);
-
- rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
- wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
- dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
- rios = bis->cur.ios[BLKG_IOSTAT_READ];
- wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
- dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
- } while (u64_stats_fetch_retry(&bis->sync, seq));
-
- if (rbytes || wbytes || rios || wios) {
- has_stats = true;
- off += scnprintf(buf+off, size-off,
- "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
- rbytes, wbytes, rios, wios,
- dbytes, dios);
- }
+ seq = u64_stats_fetch_begin(&bisc->sync);
+ blkg_iostat_set(&cur, &bisc->cur);
+ } while (u64_stats_fetch_retry(&bisc->sync, seq));
- if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
- has_stats = true;
- off += scnprintf(buf+off, size-off,
- " use_delay=%d delay_nsec=%llu",
- atomic_read(&blkg->use_delay),
- (unsigned long long)atomic64_read(&blkg->delay_nsec));
- }
+ blkcg_iostat_update(blkg, &cur, &bisc->last);
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- size_t written;
+ /* propagate global delta to parent (unless that's root) */
+ if (parent && parent->parent)
+ blkcg_iostat_update(parent, &blkg->iostat.cur,
+ &blkg->iostat.last);
+ }
+ raw_spin_unlock_irqrestore(&blkg_stat_lock, flags);
+out:
+ rcu_read_unlock();
+}
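Both the flush path above and blkcg_print_one_stat() read the 64-bit counters with the u64_stats_fetch_begin()/u64_stats_fetch_retry() pattern: retry the read if the sequence count was odd or changed underneath. The sketch below is a simplified userspace seqcount assuming a single writer; it glosses over the memory-ordering and 32-bit details that the kernel's u64_stats helpers handle, so treat it as an illustration of the retry loop only.

#include <stdatomic.h>
#include <stdio.h>

struct stats {
    atomic_uint seq;
    unsigned long long bytes;
    unsigned long long ios;
};

static void stats_update(struct stats *s, unsigned long long b)
{
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* odd = writing */
    s->bytes += b;
    s->ios += 1;
    atomic_fetch_add_explicit(&s->seq, 1, memory_order_release); /* even = stable */
}

static void stats_read(struct stats *s, unsigned long long *b, unsigned long long *i)
{
    unsigned int start;

    do {
        start = atomic_load_explicit(&s->seq, memory_order_acquire);
        *b = s->bytes;
        *i = s->ios;
    } while ((start & 1) ||
             start != atomic_load_explicit(&s->seq, memory_order_acquire));
}

int main(void)
{
    struct stats s = {0};
    unsigned long long b, i;

    stats_update(&s, 4096);
    stats_read(&s, &b, &i);
    printf("bytes=%llu ios=%llu\n", b, i);
    return 0;
}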
+
+static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+{
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (cgroup_parent(css->cgroup))
+ __blkcg_rstat_flush(css_to_blkcg(css), cpu);
+}
- if (!blkg->pd[i] || !pol->pd_stat_fn)
- continue;
+/*
+ * We source root cgroup stats from the system-wide stats to avoid
+ * tracking the same information twice and incurring overhead when no
+ * cgroups are defined. For that reason, cgroup_rstat_flush in
+ * blkcg_print_stat does not actually fill out the iostat in the root
+ * cgroup's blkcg_gq.
+ *
+ * However, we would like to re-use the printing code between the root and
+ * non-root cgroups to the extent possible. For that reason, we simulate
+ * flushing the root cgroup's stats by explicitly filling in the iostat
+ * with disk level statistics.
+ */
+static void blkcg_fill_root_iostats(void)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg;
+ struct blkg_iostat tmp;
+ int cpu;
+ unsigned long flags;
- written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off);
- if (written)
- has_stats = true;
- off += written;
+ memset(&tmp, 0, sizeof(tmp));
+ for_each_possible_cpu(cpu) {
+ struct disk_stats *cpu_dkstats;
+
+ cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu);
+ tmp.ios[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->ios[STAT_READ];
+ tmp.ios[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->ios[STAT_WRITE];
+ tmp.ios[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->ios[STAT_DISCARD];
+ // convert sectors to bytes
+ tmp.bytes[BLKG_IOSTAT_READ] +=
+ cpu_dkstats->sectors[STAT_READ] << 9;
+ tmp.bytes[BLKG_IOSTAT_WRITE] +=
+ cpu_dkstats->sectors[STAT_WRITE] << 9;
+ tmp.bytes[BLKG_IOSTAT_DISCARD] +=
+ cpu_dkstats->sectors[STAT_DISCARD] << 9;
}
- if (has_stats) {
- if (off < size - 1) {
- off += scnprintf(buf+off, size-off, "\n");
- seq_commit(sf, off);
- } else {
- seq_commit(sf, -1);
- }
- }
- skip:
- spin_unlock_irq(&blkg->q->queue_lock);
+ flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync);
+ blkg_iostat_set(&blkg->iostat.cur, &tmp);
+ u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags);
}
+}
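blkcg_fill_root_iostats() sums the per-CPU disk counters and converts sectors to bytes with a shift by 9 (512-byte sectors). A minimal arithmetic sketch of that aggregation; NR_FAKE_CPUS and struct cpu_stats are illustrative.

#include <stdio.h>

#define NR_FAKE_CPUS 4
#define SECTOR_SHIFT 9          /* 512-byte sectors, as in the << 9 above */

struct cpu_stats {
    unsigned long long ios;
    unsigned long long sectors;
};

int main(void)
{
    struct cpu_stats per_cpu[NR_FAKE_CPUS] = {
        { .ios = 10, .sectors = 80 },
        { .ios = 5,  .sectors = 40 },
        { .ios = 0,  .sectors = 0  },
        { .ios = 7,  .sectors = 56 },
    };
    unsigned long long ios = 0, bytes = 0;

    /* aggregate per-CPU counters, converting sectors to bytes */
    for (int cpu = 0; cpu < NR_FAKE_CPUS; cpu++) {
        ios += per_cpu[cpu].ios;
        bytes += per_cpu[cpu].sectors << SECTOR_SHIFT;
    }
    printf("ios=%llu bytes=%llu\n", ios, bytes);   /* ios=22 bytes=90112 */
    return 0;
}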
+
+static void blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s)
+{
+ struct blkg_iostat_set *bis = &blkg->iostat;
+ u64 rbytes, wbytes, rios, wios, dbytes, dios;
+ const char *dname;
+ unsigned seq;
+ int i;
+
+ if (!blkg->online)
+ return;
+
+ dname = blkg_dev_name(blkg);
+ if (!dname)
+ return;
+
+ seq_printf(s, "%s ", dname);
+ do {
+ seq = u64_stats_fetch_begin(&bis->sync);
+
+ rbytes = bis->cur.bytes[BLKG_IOSTAT_READ];
+ wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE];
+ dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD];
+ rios = bis->cur.ios[BLKG_IOSTAT_READ];
+ wios = bis->cur.ios[BLKG_IOSTAT_WRITE];
+ dios = bis->cur.ios[BLKG_IOSTAT_DISCARD];
+ } while (u64_stats_fetch_retry(&bis->sync, seq));
+
+ if (rbytes || wbytes || rios || wios) {
+ seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu",
+ rbytes, wbytes, rios, wios,
+ dbytes, dios);
+ }
+
+ if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) {
+ seq_printf(s, " use_delay=%d delay_nsec=%llu",
+ atomic_read(&blkg->use_delay),
+ atomic64_read(&blkg->delay_nsec));
+ }
+
+ for (i = 0; i < BLKCG_MAX_POLS; i++) {
+ struct blkcg_policy *pol = blkcg_policy[i];
+
+ if (!blkg->pd[i] || !pol->pd_stat_fn)
+ continue;
+
+ pol->pd_stat_fn(blkg->pd[i], s);
+ }
+
+ seq_puts(s, "\n");
+}
+
+static int blkcg_print_stat(struct seq_file *sf, void *v)
+{
+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
+ struct blkcg_gq *blkg;
+
+ if (!seq_css(sf)->parent)
+ blkcg_fill_root_iostats();
+ else
+ cgroup_rstat_flush(blkcg->css.cgroup);
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
+ spin_lock_irq(&blkg->q->queue_lock);
+ blkcg_print_one_stat(blkg, sf);
+ spin_unlock_irq(&blkg->q->queue_lock);
+ }
rcu_read_unlock();
return 0;
}
@@ -833,7 +1177,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v)
static struct cftype blkcg_files[] = {
{
.name = "stat",
- .flags = CFTYPE_NOT_ON_ROOT,
.seq_show = blkcg_print_stat,
},
{ } /* terminate */
@@ -847,6 +1190,13 @@ static struct cftype blkcg_legacy_files[] = {
{ } /* terminate */
};
+#ifdef CONFIG_CGROUP_WRITEBACK
+struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css)
+{
+ return &css_to_blkcg(css)->cgwb_list;
+}
+#endif
+
/*
* blkcg destruction is a three-stage process.
*
@@ -869,25 +1219,6 @@ static struct cftype blkcg_legacy_files[] = {
*/
/**
- * blkcg_css_offline - cgroup css_offline callback
- * @css: css of interest
- *
- * This function is called when @css is about to go away. Here the cgwbs are
- * offlined first and only once writeback associated with the blkcg has
- * finished do we start step 2 (see above).
- */
-static void blkcg_css_offline(struct cgroup_subsys_state *css)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
-
- /* this prevents anyone from attaching or migrating to this blkcg */
- wb_blkcg_offline(blkcg);
-
- /* put the base online pin allowing step 2 to be triggered */
- blkcg_unpin_online(blkcg);
-}
-
-/**
* blkcg_destroy_blkgs - responsible for shooting down blkgs
* @blkcg: blkcg of interest
*
@@ -898,8 +1229,10 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css)
*
* This is the blkcg counterpart of ioc_release_fn().
*/
-void blkcg_destroy_blkgs(struct blkcg *blkcg)
+static void blkcg_destroy_blkgs(struct blkcg *blkcg)
{
+ might_sleep();
+
spin_lock_irq(&blkcg->lock);
while (!hlist_empty(&blkcg->blkg_list)) {
@@ -907,19 +1240,76 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg)
struct blkcg_gq, blkcg_node);
struct request_queue *q = blkg->q;
- if (spin_trylock(&q->queue_lock)) {
- blkg_destroy(blkg);
- spin_unlock(&q->queue_lock);
- } else {
+ if (need_resched() || !spin_trylock(&q->queue_lock)) {
+ /*
+ * Given that the system can accumulate a huge number
+ * of blkgs in pathological cases, check to see if we
+ * need to reschedule to avoid a softlockup.
+ */
spin_unlock_irq(&blkcg->lock);
- cpu_relax();
+ cond_resched();
spin_lock_irq(&blkcg->lock);
+ continue;
}
+
+ blkg_destroy(blkg);
+ spin_unlock(&q->queue_lock);
}
spin_unlock_irq(&blkcg->lock);
}
+/**
+ * blkcg_pin_online - pin online state
+ * @blkcg_css: blkcg of interest
+ *
+ * While pinned, a blkcg is kept online. This is primarily used to
+ * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline
+ * while an associated cgwb is still active.
+ */
+void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css)
+{
+ refcount_inc(&css_to_blkcg(blkcg_css)->online_pin);
+}
+
+/**
+ * blkcg_unpin_online - unpin online state
+ * @blkcg_css: blkcg of interest
+ *
+ * This is primarily used to impedance-match blkg and cgwb lifetimes so
+ * that blkg doesn't go offline while an associated cgwb is still active.
+ * When this count goes to zero, all active cgwbs have finished so the
+ * blkcg can continue destruction by calling blkcg_destroy_blkgs().
+ */
+void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css)
+{
+ struct blkcg *blkcg = css_to_blkcg(blkcg_css);
+
+ do {
+ if (!refcount_dec_and_test(&blkcg->online_pin))
+ break;
+ blkcg_destroy_blkgs(blkcg);
+ blkcg = blkcg_parent(blkcg);
+ } while (blkcg);
+}
+
+/**
+ * blkcg_css_offline - cgroup css_offline callback
+ * @css: css of interest
+ *
+ * This function is called when @css is about to go away. Here the cgwbs are
+ * offlined first and only once writeback associated with the blkcg has
+ * finished do we start step 2 (see above).
+ */
+static void blkcg_css_offline(struct cgroup_subsys_state *css)
+{
+ /* this prevents anyone from attaching or migrating to this blkcg */
+ wb_blkcg_offline(css);
+
+ /* put the base online pin allowing step 2 to be triggered */
+ blkcg_unpin_online(css);
+}
+
static void blkcg_css_free(struct cgroup_subsys_state *css)
{
struct blkcg *blkcg = css_to_blkcg(css);
@@ -935,6 +1325,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css)
mutex_unlock(&blkcg_pol_mutex);
+ free_percpu(blkcg->lhead);
kfree(blkcg);
}
@@ -942,7 +1333,6 @@ static struct cgroup_subsys_state *
blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
{
struct blkcg *blkcg;
- struct cgroup_subsys_state *ret;
int i;
mutex_lock(&blkcg_pol_mutex);
@@ -951,12 +1341,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
blkcg = &blkcg_root;
} else {
blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL);
- if (!blkcg) {
- ret = ERR_PTR(-ENOMEM);
+ if (!blkcg)
goto unlock;
- }
}
+ if (init_blkcg_llists(blkcg))
+ goto free_blkcg;
+
for (i = 0; i < BLKCG_MAX_POLS ; i++) {
struct blkcg_policy *pol = blkcg_policy[i];
struct blkcg_policy_data *cpd;
@@ -971,15 +1362,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
continue;
cpd = pol->cpd_alloc_fn(GFP_KERNEL);
- if (!cpd) {
- ret = ERR_PTR(-ENOMEM);
+ if (!cpd)
goto free_pd_blkcg;
- }
+
blkcg->cpd[i] = cpd;
cpd->blkcg = blkcg;
cpd->plid = i;
- if (pol->cpd_init_fn)
- pol->cpd_init_fn(cpd);
}
spin_lock_init(&blkcg->lock);
@@ -998,18 +1386,18 @@ free_pd_blkcg:
for (i--; i >= 0; i--)
if (blkcg->cpd[i])
blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]);
-
+ free_percpu(blkcg->lhead);
+free_blkcg:
if (blkcg != &blkcg_root)
kfree(blkcg);
unlock:
mutex_unlock(&blkcg_pol_mutex);
- return ret;
+ return ERR_PTR(-ENOMEM);
}
static int blkcg_css_online(struct cgroup_subsys_state *css)
{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg *parent = blkcg_parent(blkcg);
+ struct blkcg *parent = blkcg_parent(css_to_blkcg(css));
/*
* blkcg_pin_online() is used to delay blkcg offline so that blkgs
@@ -1017,199 +1405,71 @@ static int blkcg_css_online(struct cgroup_subsys_state *css)
* parent so that offline always happens towards the root.
*/
if (parent)
- blkcg_pin_online(parent);
+ blkcg_pin_online(&parent->css);
return 0;
}
-/**
- * blkcg_init_queue - initialize blkcg part of request queue
- * @q: request_queue to initialize
- *
- * Called from __blk_alloc_queue(). Responsible for initializing blkcg
- * part of new request_queue @q.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int blkcg_init_queue(struct request_queue *q)
+int blkcg_init_disk(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blkcg_gq *new_blkg, *blkg;
bool preloaded;
int ret;
- new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL);
+ INIT_LIST_HEAD(&q->blkg_list);
+ mutex_init(&q->blkcg_mutex);
+
+ new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL);
if (!new_blkg)
return -ENOMEM;
preloaded = !radix_tree_preload(GFP_KERNEL);
/* Make sure the root blkg exists. */
- rcu_read_lock();
+ /* spin_lock_irq can serve as RCU read-side critical section. */
spin_lock_irq(&q->queue_lock);
- blkg = blkg_create(&blkcg_root, q, new_blkg);
+ blkg = blkg_create(&blkcg_root, disk, new_blkg);
if (IS_ERR(blkg))
goto err_unlock;
q->root_blkg = blkg;
spin_unlock_irq(&q->queue_lock);
- rcu_read_unlock();
if (preloaded)
radix_tree_preload_end();
- ret = blk_iolatency_init(q);
+ ret = blk_ioprio_init(disk);
if (ret)
goto err_destroy_all;
- ret = blk_throtl_init(q);
+ ret = blk_throtl_init(disk);
if (ret)
- goto err_destroy_all;
+ goto err_ioprio_exit;
+
return 0;
+err_ioprio_exit:
+ blk_ioprio_exit(disk);
err_destroy_all:
- blkg_destroy_all(q);
+ blkg_destroy_all(disk);
return ret;
err_unlock:
spin_unlock_irq(&q->queue_lock);
- rcu_read_unlock();
if (preloaded)
radix_tree_preload_end();
return PTR_ERR(blkg);
}
-/**
- * blkcg_exit_queue - exit and release blkcg part of request_queue
- * @q: request_queue being released
- *
- * Called from blk_exit_queue(). Responsible for exiting blkcg part.
- */
-void blkcg_exit_queue(struct request_queue *q)
-{
- blkg_destroy_all(q);
- blk_throtl_exit(q);
-}
-
-/*
- * We cannot support shared io contexts, as we have no mean to support
- * two tasks with the same ioc in two different groups without major rework
- * of the main cic data structures. For now we allow a task to change
- * its cgroup only if it's the only owner of its ioc.
- */
-static int blkcg_can_attach(struct cgroup_taskset *tset)
-{
- struct task_struct *task;
- struct cgroup_subsys_state *dst_css;
- struct io_context *ioc;
- int ret = 0;
-
- /* task_lock() is needed to avoid races with exit_io_context() */
- cgroup_taskset_for_each(task, dst_css, tset) {
- task_lock(task);
- ioc = task->io_context;
- if (ioc && atomic_read(&ioc->nr_tasks) > 1)
- ret = -EINVAL;
- task_unlock(task);
- if (ret)
- break;
- }
- return ret;
-}
-
-static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] = src->bytes[i];
- dst->ios[i] = src->ios[i];
- }
-}
-
-static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src)
-{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] += src->bytes[i];
- dst->ios[i] += src->ios[i];
- }
-}
-
-static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src)
+void blkcg_exit_disk(struct gendisk *disk)
{
- int i;
-
- for (i = 0; i < BLKG_IOSTAT_NR; i++) {
- dst->bytes[i] -= src->bytes[i];
- dst->ios[i] -= src->ios[i];
- }
-}
-
-static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu)
-{
- struct blkcg *blkcg = css_to_blkcg(css);
- struct blkcg_gq *blkg;
-
- rcu_read_lock();
-
- hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) {
- struct blkcg_gq *parent = blkg->parent;
- struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu);
- struct blkg_iostat cur, delta;
- unsigned seq;
-
- /* fetch the current per-cpu values */
- do {
- seq = u64_stats_fetch_begin(&bisc->sync);
- blkg_iostat_set(&cur, &bisc->cur);
- } while (u64_stats_fetch_retry(&bisc->sync, seq));
-
- /* propagate percpu delta to global */
- u64_stats_update_begin(&blkg->iostat.sync);
- blkg_iostat_set(&delta, &cur);
- blkg_iostat_sub(&delta, &bisc->last);
- blkg_iostat_add(&blkg->iostat.cur, &delta);
- blkg_iostat_add(&bisc->last, &delta);
- u64_stats_update_end(&blkg->iostat.sync);
-
- /* propagate global delta to parent */
- if (parent) {
- u64_stats_update_begin(&parent->iostat.sync);
- blkg_iostat_set(&delta, &blkg->iostat.cur);
- blkg_iostat_sub(&delta, &blkg->iostat.last);
- blkg_iostat_add(&parent->iostat.cur, &delta);
- blkg_iostat_add(&blkg->iostat.last, &delta);
- u64_stats_update_end(&parent->iostat.sync);
- }
- }
-
- rcu_read_unlock();
-}
-
-static void blkcg_bind(struct cgroup_subsys_state *root_css)
-{
- int i;
-
- mutex_lock(&blkcg_pol_mutex);
-
- for (i = 0; i < BLKCG_MAX_POLS; i++) {
- struct blkcg_policy *pol = blkcg_policy[i];
- struct blkcg *blkcg;
-
- if (!pol || !pol->cpd_bind_fn)
- continue;
-
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node)
- if (blkcg->cpd[pol->plid])
- pol->cpd_bind_fn(blkcg->cpd[pol->plid]);
- }
- mutex_unlock(&blkcg_pol_mutex);
+ blkg_destroy_all(disk);
+ blk_throtl_exit(disk);
}
static void blkcg_exit(struct task_struct *tsk)
{
- if (tsk->throttle_queue)
- blk_put_queue(tsk->throttle_queue);
- tsk->throttle_queue = NULL;
+ if (tsk->throttle_disk)
+ put_disk(tsk->throttle_disk);
+ tsk->throttle_disk = NULL;
}
struct cgroup_subsys io_cgrp_subsys = {
@@ -1217,9 +1477,7 @@ struct cgroup_subsys io_cgrp_subsys = {
.css_online = blkcg_css_online,
.css_offline = blkcg_css_offline,
.css_free = blkcg_css_free,
- .can_attach = blkcg_can_attach,
.css_rstat_flush = blkcg_rstat_flush,
- .bind = blkcg_bind,
.dfl_cftypes = blkcg_files,
.legacy_cftypes = blkcg_legacy_files,
.legacy_name = "blkio",
@@ -1236,14 +1494,14 @@ struct cgroup_subsys io_cgrp_subsys = {
EXPORT_SYMBOL_GPL(io_cgrp_subsys);
/**
- * blkcg_activate_policy - activate a blkcg policy on a request_queue
- * @q: request_queue of interest
+ * blkcg_activate_policy - activate a blkcg policy on a gendisk
+ * @disk: gendisk of interest
* @pol: blkcg policy to activate
*
- * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through
+ * Activate @pol on @disk. Requires %GFP_KERNEL context. @disk goes through
* bypass mode to populate its blkgs with policy_data for @pol.
*
- * Activation happens with @q bypassed, so nobody would be accessing blkgs
+ * Activation happens with @disk bypassed, so nobody would be accessing blkgs
* from IO path. Update of each blkg is protected by both queue and blkcg
* locks so that holding either lock and testing blkcg_policy_enabled() is
* always enough for dereferencing policy data.
@@ -1251,9 +1509,9 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys);
* The caller is responsible for synchronizing [de]activations and policy
* [un]registerations. Returns 0 on success, -errno on failure.
*/
-int blkcg_activate_policy(struct request_queue *q,
- const struct blkcg_policy *pol)
+int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
{
+ struct request_queue *q = disk->queue;
struct blkg_policy_data *pd_prealloc = NULL;
struct blkcg_gq *blkg, *pinned_blkg = NULL;
int ret;
@@ -1266,7 +1524,7 @@ int blkcg_activate_policy(struct request_queue *q,
retry:
spin_lock_irq(&q->queue_lock);
- /* blkg_list is pushed at the head, reverse walk to allocate parents first */
+ /* blkg_list is pushed at the head, reverse walk to initialize parents first */
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) {
struct blkg_policy_data *pd;
@@ -1278,8 +1536,8 @@ retry:
pd = pd_prealloc;
pd_prealloc = NULL;
} else {
- pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q,
- blkg->blkcg);
+ pd = pol->pd_alloc_fn(disk, blkg->blkcg,
+ GFP_NOWAIT | __GFP_NOWARN);
}
if (!pd) {
@@ -1296,23 +1554,29 @@ retry:
if (pd_prealloc)
pol->pd_free_fn(pd_prealloc);
- pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q,
- blkg->blkcg);
+ pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg,
+ GFP_KERNEL);
if (pd_prealloc)
goto retry;
else
goto enomem;
}
- blkg->pd[pol->plid] = pd;
+ spin_lock(&blkg->blkcg->lock);
+
pd->blkg = blkg;
pd->plid = pol->plid;
- }
+ blkg->pd[pol->plid] = pd;
- /* all allocated, init in the same order */
- if (pol->pd_init_fn)
- list_for_each_entry_reverse(blkg, &q->blkg_list, q_node)
- pol->pd_init_fn(blkg->pd[pol->plid]);
+ if (pol->pd_init_fn)
+ pol->pd_init_fn(pd);
+
+ if (pol->pd_online_fn)
+ pol->pd_online_fn(pd);
+ pd->online = true;
+
+ spin_unlock(&blkg->blkcg->lock);
+ }
__set_bit(pol->plid, q->blkcg_pols);
ret = 0;
@@ -1328,13 +1592,22 @@ out:
return ret;
enomem:
- /* alloc failed, nothing's initialized yet, free everything */
+ /* alloc failed, take down everything */
spin_lock_irq(&q->queue_lock);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
- if (blkg->pd[pol->plid]) {
- pol->pd_free_fn(blkg->pd[pol->plid]);
+ struct blkcg *blkcg = blkg->blkcg;
+ struct blkg_policy_data *pd;
+
+ spin_lock(&blkcg->lock);
+ pd = blkg->pd[pol->plid];
+ if (pd) {
+ if (pd->online && pol->pd_offline_fn)
+ pol->pd_offline_fn(pd);
+ pd->online = false;
+ pol->pd_free_fn(pd);
blkg->pd[pol->plid] = NULL;
}
+ spin_unlock(&blkcg->lock);
}
spin_unlock_irq(&q->queue_lock);
ret = -ENOMEM;
@@ -1343,16 +1616,17 @@ enomem:
EXPORT_SYMBOL_GPL(blkcg_activate_policy);
/**
- * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue
- * @q: request_queue of interest
+ * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk
+ * @disk: gendisk of interest
* @pol: blkcg policy to deactivate
*
- * Deactivate @pol on @q. Follows the same synchronization rules as
+ * Deactivate @pol on @disk. Follows the same synchronization rules as
* blkcg_activate_policy().
*/
-void blkcg_deactivate_policy(struct request_queue *q,
+void blkcg_deactivate_policy(struct gendisk *disk,
const struct blkcg_policy *pol)
{
+ struct request_queue *q = disk->queue;
struct blkcg_gq *blkg;
if (!blkcg_policy_enabled(q, pol))
@@ -1361,26 +1635,44 @@ void blkcg_deactivate_policy(struct request_queue *q,
if (queue_is_mq(q))
blk_mq_freeze_queue(q);
+ mutex_lock(&q->blkcg_mutex);
spin_lock_irq(&q->queue_lock);
__clear_bit(pol->plid, q->blkcg_pols);
list_for_each_entry(blkg, &q->blkg_list, q_node) {
+ struct blkcg *blkcg = blkg->blkcg;
+
+ spin_lock(&blkcg->lock);
if (blkg->pd[pol->plid]) {
- if (pol->pd_offline_fn)
+ if (blkg->pd[pol->plid]->online && pol->pd_offline_fn)
pol->pd_offline_fn(blkg->pd[pol->plid]);
pol->pd_free_fn(blkg->pd[pol->plid]);
blkg->pd[pol->plid] = NULL;
}
+ spin_unlock(&blkcg->lock);
}
spin_unlock_irq(&q->queue_lock);
+ mutex_unlock(&q->blkcg_mutex);
if (queue_is_mq(q))
blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blkcg_deactivate_policy);
+static void blkcg_free_all_cpd(struct blkcg_policy *pol)
+{
+ struct blkcg *blkcg;
+
+ list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
+ if (blkcg->cpd[pol->plid]) {
+ pol->cpd_free_fn(blkcg->cpd[pol->plid]);
+ blkcg->cpd[pol->plid] = NULL;
+ }
+ }
+}
+
/**
* blkcg_policy_register - register a blkcg policy
* @pol: blkcg policy to register
@@ -1427,8 +1719,6 @@ int blkcg_policy_register(struct blkcg_policy *pol)
blkcg->cpd[pol->plid] = cpd;
cpd->blkcg = blkcg;
cpd->plid = pol->plid;
- if (pol->cpd_init_fn)
- pol->cpd_init_fn(cpd);
}
}
@@ -1445,14 +1735,9 @@ int blkcg_policy_register(struct blkcg_policy *pol)
return 0;
err_free_cpds:
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
err_unlock:
mutex_unlock(&blkcg_pol_mutex);
@@ -1469,8 +1754,6 @@ EXPORT_SYMBOL_GPL(blkcg_policy_register);
*/
void blkcg_policy_unregister(struct blkcg_policy *pol)
{
- struct blkcg *blkcg;
-
mutex_lock(&blkcg_pol_register_mutex);
if (WARN_ON(blkcg_policy[pol->plid] != pol))
@@ -1485,14 +1768,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol)
/* remove cpds and unregister */
mutex_lock(&blkcg_pol_mutex);
- if (pol->cpd_free_fn) {
- list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) {
- if (blkcg->cpd[pol->plid]) {
- pol->cpd_free_fn(blkcg->cpd[pol->plid]);
- blkcg->cpd[pol->plid] = NULL;
- }
- }
- }
+ if (pol->cpd_free_fn)
+ blkcg_free_all_cpd(pol);
+
blkcg_policy[pol->plid] = NULL;
mutex_unlock(&blkcg_pol_mutex);
@@ -1501,25 +1779,6 @@ out_unlock:
}
EXPORT_SYMBOL_GPL(blkcg_policy_unregister);
-bool __blkcg_punt_bio_submit(struct bio *bio)
-{
- struct blkcg_gq *blkg = bio->bi_blkg;
-
- /* consume the flag first */
- bio->bi_opf &= ~REQ_CGROUP_PUNT;
-
- /* never bounce for the root cgroup */
- if (!blkg->parent)
- return false;
-
- spin_lock_bh(&blkg->async_bio_lock);
- bio_list_add(&blkg->async_bios, bio);
- spin_unlock_bh(&blkg->async_bio_lock);
-
- queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work);
- return true;
-}
-
/*
* Scale the accumulated delay based on how long it has been since we updated
* the delay. We only call this when we are adding delay, in case it's been a
@@ -1548,7 +1807,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
* everybody is happy with their IO latencies.
*/
if (time_before64(old + NSEC_PER_SEC, now) &&
- atomic64_cmpxchg(&blkg->delay_start, old, now) == old) {
+ atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) {
u64 cur = atomic64_read(&blkg->delay_nsec);
u64 sub = min_t(u64, blkg->last_delay, now - old);
int cur_use = atomic_read(&blkg->use_delay);
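The atomic64_cmpxchg(...) == old test becomes atomic64_try_cmpxchg(&blkg->delay_start, &old, now), which also writes the current value back into 'old' on failure. The same shape expressed in standard C11 atomics, as a hedged illustration; update_if_stale() is a made-up helper, not kernel code.

#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong delay_start;

static int update_if_stale(unsigned long long now, unsigned long long threshold)
{
    unsigned long long old = atomic_load(&delay_start);

    /* on CAS failure, 'old' is refreshed with the current value */
    if (old + threshold < now &&
        atomic_compare_exchange_strong(&delay_start, &old, now))
        return 1;       /* we won the race and own this update */
    return 0;
}

int main(void)
{
    atomic_store(&delay_start, 100);
    printf("updated: %d, value: %llu\n",
           update_if_stale(2000, 1000),
           (unsigned long long)atomic_load(&delay_start));
    return 0;
}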
@@ -1586,16 +1845,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now)
static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
{
unsigned long pflags;
+ bool clamp;
u64 now = ktime_to_ns(ktime_get());
u64 exp;
u64 delay_nsec = 0;
int tok;
while (blkg->parent) {
- if (atomic_read(&blkg->use_delay)) {
+ int use_delay = atomic_read(&blkg->use_delay);
+
+ if (use_delay) {
+ u64 this_delay;
+
blkcg_scale_delay(blkg, now);
- delay_nsec = max_t(u64, delay_nsec,
- atomic64_read(&blkg->delay_nsec));
+ this_delay = atomic64_read(&blkg->delay_nsec);
+ if (this_delay > delay_nsec) {
+ delay_nsec = this_delay;
+ clamp = use_delay > 0;
+ }
}
blkg = blkg->parent;
}
@@ -1607,10 +1874,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
* Let's not sleep for all eternity if we've amassed a huge delay.
* Swapping or metadata IO can accumulate 10's of seconds worth of
* delay, and we want userspace to be able to do _something_ so cap the
- * delays at 1 second. If there's 10's of seconds worth of delay then
- * the tasks will be delayed for 1 second for every syscall.
+ * delays at 0.25s. If there are tens of seconds worth of delay then the
+ * tasks will be delayed by 0.25 seconds for every syscall. If
+ * blkcg_set_delay() was used as indicated by negative use_delay, the
+ * caller is responsible for regulating the range.
*/
- delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
+ if (clamp)
+ delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC);
if (use_memdelay)
psi_memstall_enter(&pflags);
@@ -1633,35 +1903,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay)
*
* This is only called if we've been marked with set_notify_resume(). Obviously
* we can be set_notify_resume() for reasons other than blkcg throttling, so we
- * check to see if current->throttle_queue is set and if not this doesn't do
+ * check to see if current->throttle_disk is set and if not this doesn't do
* anything. This should only ever be called by the resume code, it's not meant
* to be called by people willy-nilly as it will actually do the work to
* throttle the task if it is setup for throttling.
*/
void blkcg_maybe_throttle_current(void)
{
- struct request_queue *q = current->throttle_queue;
- struct cgroup_subsys_state *css;
+ struct gendisk *disk = current->throttle_disk;
struct blkcg *blkcg;
struct blkcg_gq *blkg;
bool use_memdelay = current->use_memdelay;
- if (!q)
+ if (!disk)
return;
- current->throttle_queue = NULL;
+ current->throttle_disk = NULL;
current->use_memdelay = false;
rcu_read_lock();
- css = kthread_blkcg();
- if (css)
- blkcg = css_to_blkcg(css);
- else
- blkcg = css_to_blkcg(task_css(current, io_cgrp_id));
-
+ blkcg = css_to_blkcg(blkcg_css());
if (!blkcg)
goto out;
- blkg = blkg_lookup(blkcg, q);
+ blkg = blkg_lookup(blkcg, disk->queue);
if (!blkg)
goto out;
if (!blkg_tryget(blkg))
@@ -1670,22 +1934,21 @@ void blkcg_maybe_throttle_current(void)
blkcg_maybe_throttle_blkg(blkg, use_memdelay);
blkg_put(blkg);
- blk_put_queue(q);
+ put_disk(disk);
return;
out:
rcu_read_unlock();
- blk_put_queue(q);
}
/**
* blkcg_schedule_throttle - this task needs to check for throttling
- * @q: the request queue IO was submitted on
+ * @disk: disk to throttle
* @use_memdelay: do we charge this to memory delay for PSI
*
* This is called by the IO controller when we know there's delay accumulated
* for the blkg for this task. We do not pass the blkg because there are places
* we call this that may not have that information, the swapping code for
- * instance will only have a request_queue at that point. This set's the
+ * instance will only have a block_device at that point. This sets the
* notify_resume for the task to check and see if it requires throttling before
* returning to user space.
*
@@ -1694,17 +1957,21 @@ out:
* throttle once. If the task needs to be throttled again it'll need to be
* re-set at the next time we see the task.
*/
-void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay)
+void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay)
{
if (unlikely(current->flags & PF_KTHREAD))
return;
- if (!blk_get_queue(q))
- return;
+ if (current->throttle_disk != disk) {
+ if (test_bit(GD_DEAD, &disk->state))
+ return;
+ get_device(disk_to_dev(disk));
+
+ if (current->throttle_disk)
+ put_disk(current->throttle_disk);
+ current->throttle_disk = disk;
+ }
- if (current->throttle_queue)
- blk_put_queue(current->throttle_queue);
- current->throttle_queue = q;
if (use_memdelay)
current->use_memdelay = use_memdelay;
set_notify_resume(current);
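
The arming scheme described above reduces to "remember one throttle target, consume it once on the next return to user space". A hedged user-space sketch; task_model, schedule_throttle() and maybe_throttle_on_resume() are invented names standing in for the task_struct fields and the set_notify_resume() machinery.

#include <stdbool.h>
#include <stddef.h>

struct task_model {
	void *throttle_target;		/* one-shot: consumed on "return to user space" */
	bool use_memdelay;
};

static void schedule_throttle(struct task_model *t, void *target, bool memdelay)
{
	t->throttle_target = target;	/* a later arming simply replaces the target */
	if (memdelay)
		t->use_memdelay = true;
	/* the kernel additionally calls set_notify_resume() here */
}

static void maybe_throttle_on_resume(struct task_model *t)
{
	void *target = t->throttle_target;

	if (!target)
		return;
	/* clear first so the task is throttled at most once per arming */
	t->throttle_target = NULL;
	t->use_memdelay = false;
	/* ... look up the delay charged against the target and sleep it off ... */
}
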
@@ -1727,16 +1994,172 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta)
atomic64_add(delta, &blkg->delay_nsec);
}
-static int __init blkcg_init(void)
+/**
+ * blkg_tryget_closest - try and get a blkg ref on the closest blkg
+ * @bio: target bio
+ * @css: target css
+ *
+ * As the failure mode here is to walk up the blkg tree, this ensures that the
+ * blkg->parent pointers are always valid. This returns the blkg that it ended
+ * up taking a reference on or %NULL if no reference was taken.
+ */
+static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio,
+ struct cgroup_subsys_state *css)
{
- blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio",
- WQ_MEM_RECLAIM | WQ_FREEZABLE |
- WQ_UNBOUND | WQ_SYSFS, 0);
- if (!blkcg_punt_bio_wq)
- return -ENOMEM;
- return 0;
+ struct blkcg_gq *blkg, *ret_blkg = NULL;
+
+ rcu_read_lock();
+ blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk);
+ while (blkg) {
+ if (blkg_tryget(blkg)) {
+ ret_blkg = blkg;
+ break;
+ }
+ blkg = blkg->parent;
+ }
+ rcu_read_unlock();
+
+ return ret_blkg;
+}
+
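
A minimal sketch of the same fallback-to-ancestor idea, assuming a plain atomic refcount in place of percpu_ref; node and tryget_closest() below are made-up names, not kernel interfaces.

#include <stdatomic.h>
#include <stddef.h>

struct node {
	struct node *parent;
	atomic_int refs;		/* 0 means the node is being torn down */
};

static struct node *tryget_closest(struct node *n)
{
	for (; n; n = n->parent) {
		int old = atomic_load(&n->refs);

		/* like percpu_ref_tryget(): only pin nodes that are still alive */
		while (old > 0) {
			if (atomic_compare_exchange_weak(&n->refs, &old, old + 1))
				return n;
		}
	}
	return NULL;			/* every ancestor was already dying */
}
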
+/**
+ * bio_associate_blkg_from_css - associate a bio with a specified css
+ * @bio: target bio
+ * @css: target css
+ *
+ * Associate @bio with the blkg found by combining the css's blkg and the
+ * request_queue of the @bio. An association failure is handled by walking up
+ * the blkg tree. Therefore, the blkg associated can be anything between the css's blkg
+ * and q->root_blkg. This situation only happens when a cgroup is dying and
+ * then the remaining bios will spill to the closest alive blkg.
+ *
+ * A reference will be taken on the blkg and will be released when @bio is
+ * freed.
+ */
+void bio_associate_blkg_from_css(struct bio *bio,
+ struct cgroup_subsys_state *css)
+{
+ if (bio->bi_blkg)
+ blkg_put(bio->bi_blkg);
+
+ if (css && css->parent) {
+ bio->bi_blkg = blkg_tryget_closest(bio, css);
+ } else {
+ blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg);
+ bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg;
+ }
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css);
+
+/**
+ * bio_associate_blkg - associate a bio with a blkg
+ * @bio: target bio
+ *
+ * Associate @bio with the blkg found from the bio's css and request_queue.
+ * If one is not found, blkg_lookup_create() creates the blkg. If a blkg is
+ * already associated, the css is reused and association redone as the
+ * request_queue may have changed.
+ */
+void bio_associate_blkg(struct bio *bio)
+{
+ struct cgroup_subsys_state *css;
+
+ if (blk_op_is_passthrough(bio->bi_opf))
+ return;
+
+ rcu_read_lock();
+
+ if (bio->bi_blkg)
+ css = bio_blkcg_css(bio);
+ else
+ css = blkcg_css();
+
+ bio_associate_blkg_from_css(bio, css);
+
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(bio_associate_blkg);
+
+/**
+ * bio_clone_blkg_association - clone blkg association from src to dst bio
+ * @dst: destination bio
+ * @src: source bio
+ */
+void bio_clone_blkg_association(struct bio *dst, struct bio *src)
+{
+ if (src->bi_blkg)
+ bio_associate_blkg_from_css(dst, bio_blkcg_css(src));
+}
+EXPORT_SYMBOL_GPL(bio_clone_blkg_association);
+
+static int blk_cgroup_io_type(struct bio *bio)
+{
+ if (op_is_discard(bio->bi_opf))
+ return BLKG_IOSTAT_DISCARD;
+ if (op_is_write(bio->bi_opf))
+ return BLKG_IOSTAT_WRITE;
+ return BLKG_IOSTAT_READ;
+}
+
+void blk_cgroup_bio_start(struct bio *bio)
+{
+ struct blkcg *blkcg = bio->bi_blkg->blkcg;
+ int rwd = blk_cgroup_io_type(bio), cpu;
+ struct blkg_iostat_set *bis;
+ unsigned long flags;
+
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys))
+ return;
+
+ /* Root-level stats are sourced from system-wide IO stats */
+ if (!cgroup_parent(blkcg->css.cgroup))
+ return;
+
+ cpu = get_cpu();
+ bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu);
+ flags = u64_stats_update_begin_irqsave(&bis->sync);
+
+ /*
+ * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split
+ * bio and we would have already accounted for the size of the bio.
+ */
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ bis->cur.bytes[rwd] += bio->bi_iter.bi_size;
+ }
+ bis->cur.ios[rwd]++;
+
+ /*
+ * If the iostat_cpu isn't in a lockless list, put it into the
+ * list to indicate that a stat update is pending.
+ */
+ if (!READ_ONCE(bis->lqueued)) {
+ struct llist_head *lhead = this_cpu_ptr(blkcg->lhead);
+
+ llist_add(&bis->lnode, lhead);
+ WRITE_ONCE(bis->lqueued, true);
+ }
+
+ u64_stats_update_end_irqrestore(&bis->sync, flags);
+ cgroup_rstat_updated(blkcg->css.cgroup, cpu);
+ put_cpu();
+}
+
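
The lqueued handling above boils down to "count, then enqueue the stat set at most once until the next flush". A hedged single-threaded model with invented names and none of the per-cpu or u64_stats machinery:

#include <stdbool.h>
#include <stddef.h>

struct stat_set {
	unsigned long long bytes, ios;
	bool queued;			/* already on the pending-flush list? */
	struct stat_set *next;
};

static void account_io(struct stat_set *s, struct stat_set **pending,
			unsigned int size, bool already_counted)
{
	if (!already_counted)		/* split bios were sized up front */
		s->bytes += size;
	s->ios++;

	if (!s->queued) {		/* enqueue at most once per flush cycle */
		s->next = *pending;
		*pending = s;
		s->queued = true;
	}
}
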
+bool blk_cgroup_congested(void)
+{
+ struct cgroup_subsys_state *css;
+ bool ret = false;
+
+ rcu_read_lock();
+ for (css = blkcg_css(); css; css = css->parent) {
+ if (atomic_read(&css->cgroup->congestion_count)) {
+ ret = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
-subsys_initcall(blkcg_init);
module_param(blkcg_debug_stats, bool, 0644);
MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not");
diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h
new file mode 100644
index 000000000000..b927a4a0ad03
--- /dev/null
+++ b/block/blk-cgroup.h
@@ -0,0 +1,508 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _BLK_CGROUP_PRIVATE_H
+#define _BLK_CGROUP_PRIVATE_H
+/*
+ * block cgroup private header
+ *
+ * Based on ideas and code from CFQ, CFS and BFQ:
+ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
+ *
+ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
+ * Paolo Valente <paolo.valente@unimore.it>
+ *
+ * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
+ * Nauman Rafique <nauman@google.com>
+ */
+
+#include <linux/blk-cgroup.h>
+#include <linux/cgroup.h>
+#include <linux/kthread.h>
+#include <linux/blk-mq.h>
+#include <linux/llist.h>
+
+struct blkcg_gq;
+struct blkg_policy_data;
+
+
+/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */
+#define BLKG_STAT_CPU_BATCH (INT_MAX / 2)
+
+#ifdef CONFIG_BLK_CGROUP
+
+enum blkg_iostat_type {
+ BLKG_IOSTAT_READ,
+ BLKG_IOSTAT_WRITE,
+ BLKG_IOSTAT_DISCARD,
+
+ BLKG_IOSTAT_NR,
+};
+
+struct blkg_iostat {
+ u64 bytes[BLKG_IOSTAT_NR];
+ u64 ios[BLKG_IOSTAT_NR];
+};
+
+struct blkg_iostat_set {
+ struct u64_stats_sync sync;
+ struct blkcg_gq *blkg;
+ struct llist_node lnode;
+ int lqueued; /* queued in llist */
+ struct blkg_iostat cur;
+ struct blkg_iostat last;
+};
+
+/* association between a blk cgroup and a request queue */
+struct blkcg_gq {
+ /* Pointer to the associated request_queue */
+ struct request_queue *q;
+ struct list_head q_node;
+ struct hlist_node blkcg_node;
+ struct blkcg *blkcg;
+
+ /* all non-root blkcg_gq's are guaranteed to have access to parent */
+ struct blkcg_gq *parent;
+
+ /* reference count */
+ struct percpu_ref refcnt;
+
+ /* is this blkg online? protected by both blkcg and q locks */
+ bool online;
+
+ struct blkg_iostat_set __percpu *iostat_cpu;
+ struct blkg_iostat_set iostat;
+
+ struct blkg_policy_data *pd[BLKCG_MAX_POLS];
+#ifdef CONFIG_BLK_CGROUP_PUNT_BIO
+ spinlock_t async_bio_lock;
+ struct bio_list async_bios;
+#endif
+ union {
+ struct work_struct async_bio_work;
+ struct work_struct free_work;
+ };
+
+ atomic_t use_delay;
+ atomic64_t delay_nsec;
+ atomic64_t delay_start;
+ u64 last_delay;
+ int last_use;
+
+ struct rcu_head rcu_head;
+};
+
+struct blkcg {
+ struct cgroup_subsys_state css;
+ spinlock_t lock;
+ refcount_t online_pin;
+
+ struct radix_tree_root blkg_tree;
+ struct blkcg_gq __rcu *blkg_hint;
+ struct hlist_head blkg_list;
+
+ struct blkcg_policy_data *cpd[BLKCG_MAX_POLS];
+
+ struct list_head all_blkcgs_node;
+
+ /*
+ * List of updated percpu blkg_iostat_set's since the last flush.
+ */
+ struct llist_head __percpu *lhead;
+
+#ifdef CONFIG_BLK_CGROUP_FC_APPID
+ char fc_app_id[FC_APPID_LEN];
+#endif
+#ifdef CONFIG_CGROUP_WRITEBACK
+ struct list_head cgwb_list;
+#endif
+};
+
+static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
+{
+ return css ? container_of(css, struct blkcg, css) : NULL;
+}
+
+/*
+ * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
+ * request_queue (q). This is used by blkcg policies which need to track
+ * information per blkcg - q pair.
+ *
+ * There can be multiple active blkcg policies and each blkg:policy pair is
+ * represented by a blkg_policy_data which is allocated and freed by each
+ * policy's pd_alloc/free_fn() methods. A policy can allocate private data
+ * area by allocating larger data structure which embeds blkg_policy_data
+ * at the beginning.
+ */
+struct blkg_policy_data {
+ /* the blkg and policy id this per-policy data belongs to */
+ struct blkcg_gq *blkg;
+ int plid;
+ bool online;
+};
+
+/*
+ * Policies that need to keep per-blkcg data which is independent from any
+ * request_queue associated to it should implement cpd_alloc/free_fn()
+ * methods. A policy can allocate private data area by allocating larger
+ * data structure which embeds blkcg_policy_data at the beginning.
+ * cpd_init() is invoked to let each policy handle per-blkcg data.
+ */
+struct blkcg_policy_data {
+ /* the blkcg and policy id this per-policy data belongs to */
+ struct blkcg *blkcg;
+ int plid;
+};
+
+typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
+typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
+typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk,
+ struct blkcg *blkcg, gfp_t gfp);
+typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
+typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd,
+ struct seq_file *s);
+
+struct blkcg_policy {
+ int plid;
+ /* cgroup files for the policy */
+ struct cftype *dfl_cftypes;
+ struct cftype *legacy_cftypes;
+
+ /* operations */
+ blkcg_pol_alloc_cpd_fn *cpd_alloc_fn;
+ blkcg_pol_free_cpd_fn *cpd_free_fn;
+
+ blkcg_pol_alloc_pd_fn *pd_alloc_fn;
+ blkcg_pol_init_pd_fn *pd_init_fn;
+ blkcg_pol_online_pd_fn *pd_online_fn;
+ blkcg_pol_offline_pd_fn *pd_offline_fn;
+ blkcg_pol_free_pd_fn *pd_free_fn;
+ blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn;
+ blkcg_pol_stat_pd_fn *pd_stat_fn;
+};
+
+extern struct blkcg blkcg_root;
+extern bool blkcg_debug_stats;
+
+int blkcg_init_disk(struct gendisk *disk);
+void blkcg_exit_disk(struct gendisk *disk);
+
+/* Blkio controller policy registration */
+int blkcg_policy_register(struct blkcg_policy *pol);
+void blkcg_policy_unregister(struct blkcg_policy *pol);
+int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol);
+void blkcg_deactivate_policy(struct gendisk *disk,
+ const struct blkcg_policy *pol);
+
+const char *blkg_dev_name(struct blkcg_gq *blkg);
+void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
+ u64 (*prfill)(struct seq_file *,
+ struct blkg_policy_data *, int),
+ const struct blkcg_policy *pol, int data,
+ bool show_total);
+u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
+
+struct blkg_conf_ctx {
+ char *input;
+ char *body;
+ struct block_device *bdev;
+ struct blkcg_gq *blkg;
+};
+
+void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input);
+int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx);
+int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
+ struct blkg_conf_ctx *ctx);
+void blkg_conf_exit(struct blkg_conf_ctx *ctx);
+
+/**
+ * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
+ * @return: true if this bio needs to be submitted with the root blkg context.
+ *
+ * In order to avoid priority inversions we sometimes need to issue a bio as if
+ * it were attached to the root blkg, and then backcharge to the actual owning
+ * blkg. The idea is we do bio_blkcg_css() to look up the actual context for
+ * the bio and attach the appropriate blkg to the bio. Then we call this helper
+ * and if it is true run with the root blkg for that queue and then do any
+ * backcharging to the originating cgroup once the io is complete.
+ */
+static inline bool bio_issue_as_root_blkg(struct bio *bio)
+{
+ return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
+}
+
+/**
+ * blkg_lookup - lookup blkg for the specified blkcg - q pair
+ * @blkcg: blkcg of interest
+ * @q: request_queue of interest
+ *
+ * Lookup blkg for the @blkcg - @q pair.
+ *
+ * Must be called in an RCU critical section.
+ */
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
+ struct request_queue *q)
+{
+ struct blkcg_gq *blkg;
+
+ if (blkcg == &blkcg_root)
+ return q->root_blkg;
+
+ blkg = rcu_dereference_check(blkcg->blkg_hint,
+ lockdep_is_held(&q->queue_lock));
+ if (blkg && blkg->q == q)
+ return blkg;
+
+ blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id);
+ if (blkg && blkg->q != q)
+ blkg = NULL;
+ return blkg;
+}
+
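
The hint-then-slow-path shape of blkg_lookup() can be sketched without RCU or a radix tree; table, entry and lookup() below are illustrative stand-ins only.

#include <stddef.h>

struct entry {
	int key;
};

struct table {
	struct entry *hint;		/* last entry that matched, like blkg_hint */
	struct entry *slots[64];	/* stand-in for the radix tree keyed by q->id */
};

static struct entry *lookup(struct table *t, int key)
{
	struct entry *e = t->hint;

	if (e && e->key == key)		/* fast path: the hint still matches */
		return e;

	e = t->slots[key % 64];		/* slow path, then verify the key */
	if (e && e->key != key)
		e = NULL;
	return e;
}
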
+/**
+ * blkg_to_pd - get policy private data
+ * @blkg: blkg of interest
+ * @pol: policy of interest
+ *
+ * Return pointer to private data associated with the @blkg-@pol pair.
+ */
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol)
+{
+ return blkg ? blkg->pd[pol->plid] : NULL;
+}
+
+static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
+ struct blkcg_policy *pol)
+{
+ return blkcg ? blkcg->cpd[pol->plid] : NULL;
+}
+
+/**
+ * pd_to_blkg - get blkg associated with policy private data
+ * @pd: policy private data of interest
+ *
+ * @pd is policy private data. Determine the blkg it's associated with.
+ */
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
+{
+ return pd ? pd->blkg : NULL;
+}
+
+static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
+{
+ return cpd ? cpd->blkcg : NULL;
+}
+
+/**
+ * blkg_path - format cgroup path of blkg
+ * @blkg: blkg of interest
+ * @buf: target buffer
+ * @buflen: target buffer length
+ *
+ * Format the path of the cgroup of @blkg into @buf.
+ */
+static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
+{
+ return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
+}
+
+/**
+ * blkg_get - get a blkg reference
+ * @blkg: blkg to get
+ *
+ * The caller should be holding an existing reference.
+ */
+static inline void blkg_get(struct blkcg_gq *blkg)
+{
+ percpu_ref_get(&blkg->refcnt);
+}
+
+/**
+ * blkg_tryget - try and get a blkg reference
+ * @blkg: blkg to get
+ *
+ * This is for use when doing an RCU lookup of the blkg. We may be in the midst
+ * of freeing this blkg, so we can only use it if the refcnt is not zero.
+ */
+static inline bool blkg_tryget(struct blkcg_gq *blkg)
+{
+ return blkg && percpu_ref_tryget(&blkg->refcnt);
+}
+
+/**
+ * blkg_put - put a blkg reference
+ * @blkg: blkg to put
+ */
+static inline void blkg_put(struct blkcg_gq *blkg)
+{
+ percpu_ref_put(&blkg->refcnt);
+}
+
+/**
+ * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Walk @d_blkg through the descendants of @p_blkg. Must be used with RCU
+ * read locked. If called under either blkcg or queue lock, the iteration
+ * is guaranteed to include all and only online blkgs. The caller may
+ * update @pos_css by calling css_rightmost_descendant() to skip subtree.
+ * @p_blkg is included in the iteration and the first node to be visited.
+ */
+#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q)))
+
+/**
+ * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
+ * @d_blkg: loop cursor pointing to the current descendant
+ * @pos_css: used for iteration
+ * @p_blkg: target blkg to walk descendants of
+ *
+ * Similar to blkg_for_each_descendant_pre() but performs post-order
+ * traversal instead. Synchronization rules are the same. @p_blkg is
+ * included in the iteration and the last node to be visited.
+ */
+#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \
+ css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \
+ if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \
+ (p_blkg)->q)))
+
+static inline void blkcg_bio_issue_init(struct bio *bio)
+{
+ bio_issue_init(&bio->bi_issue, bio_sectors(bio));
+}
+
+static inline void blkcg_use_delay(struct blkcg_gq *blkg)
+{
+ if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0))
+ return;
+ if (atomic_add_return(1, &blkg->use_delay) == 1)
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ if (WARN_ON_ONCE(old < 0))
+ return 0;
+ if (old == 0)
+ return 0;
+
+ /*
+ * We do this song and dance because we can race with somebody else
+ * adding or removing delay. If we just did an atomic_dec we'd end up
+ * negative and we'd already be in trouble. We need to subtract 1 and
+ * then check to see if we were the last delay so we can drop the
+ * congestion count on the cgroup.
+ */
+ while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1))
+ ;
+
+ if (old == 0)
+ return 0;
+ if (old == 1)
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+ return 1;
+}
+
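
The compare-and-exchange loop above, restated with C11 atomics as a rough model (unuse_delay() here is not the kernel helper):

#include <stdatomic.h>

static int unuse_delay(atomic_int *use_delay, atomic_int *congestion_count)
{
	int old = atomic_load(use_delay);

	/* decrement only if somebody still holds a delay reference */
	while (old > 0 &&
	       !atomic_compare_exchange_weak(use_delay, &old, old - 1))
		;

	if (old <= 0)
		return 0;
	if (old == 1)			/* we dropped the last reference */
		atomic_fetch_sub(congestion_count, 1);
	return 1;
}
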
+/**
+ * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount
+ * @blkg: target blkg
+ * @delay: delay duration in nsecs
+ *
+ * When enabled with this function, the delay is not decayed and must be
+ * explicitly cleared with blkcg_clear_delay(). Must not be mixed with
+ * blkcg_[un]use_delay() and blkcg_add_delay() usages.
+ */
+static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person setting the congestion count for this blkg. */
+ if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1))
+ atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
+
+ atomic64_set(&blkg->delay_nsec, delay);
+}
+
+/**
+ * blkcg_clear_delay - Disable allocator delay mechanism
+ * @blkg: target blkg
+ *
+ * Disable use_delay mechanism. See blkcg_set_delay().
+ */
+static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
+{
+ int old = atomic_read(&blkg->use_delay);
+
+ /* We only want 1 person clearing the congestion count for this blkg. */
+ if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0))
+ atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
+}
+
+/**
+ * blk_cgroup_mergeable - Determine whether to allow or disallow merges
+ * @rq: request to merge into
+ * @bio: bio to merge
+ *
+ * @bio and @rq should belong to the same cgroup and their issue_as_root should
+ * match. The latter is necessary as we don't want to throttle e.g. a metadata
+ * update because it happens to be next to a regular IO.
+ */
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
+{
+ return rq->bio->bi_blkg == bio->bi_blkg &&
+ bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
+}
+
+void blk_cgroup_bio_start(struct bio *bio);
+void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
+#else /* CONFIG_BLK_CGROUP */
+
+struct blkg_policy_data {
+};
+
+struct blkcg_policy_data {
+};
+
+struct blkcg_policy {
+};
+
+struct blkcg {
+};
+
+static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
+static inline int blkcg_init_disk(struct gendisk *disk) { return 0; }
+static inline void blkcg_exit_disk(struct gendisk *disk) { }
+static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
+static inline int blkcg_activate_policy(struct gendisk *disk,
+ const struct blkcg_policy *pol) { return 0; }
+static inline void blkcg_deactivate_policy(struct gendisk *disk,
+ const struct blkcg_policy *pol) { }
+
+static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
+ struct blkcg_policy *pol) { return NULL; }
+static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
+static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
+static inline void blkg_get(struct blkcg_gq *blkg) { }
+static inline void blkg_put(struct blkcg_gq *blkg) { }
+static inline void blkcg_bio_issue_init(struct bio *bio) { }
+static inline void blk_cgroup_bio_start(struct bio *bio) { }
+static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
+
+#define blk_queue_for_each_rl(rl, q) \
+ for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
+
+#endif /* CONFIG_BLK_CGROUP */
+
+#endif /* _BLK_CGROUP_PRIVATE_H */
diff --git a/block/blk-core.c b/block/blk-core.c
index 03252af8c82c..de771093b526 100644
--- a/block/blk-core.c
+++ b/block/blk-core.c
@@ -14,10 +14,10 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
+#include <linux/blk-pm.h>
+#include <linux/blk-integrity.h>
#include <linux/highmem.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
@@ -34,11 +34,10 @@
#include <linux/delay.h>
#include <linux/ratelimit.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <linux/t10-pi.h>
#include <linux/debugfs.h>
#include <linux/bpf.h>
-#include <linux/psi.h>
+#include <linux/part_stat.h>
#include <linux/sched/sysctl.h>
#include <linux/blk-crypto.h>
@@ -46,27 +45,27 @@
#include <trace/events/block.h>
#include "blk.h"
-#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
-#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
+#include "blk-throttle.h"
+#include "blk-ioprio.h"
-#ifdef CONFIG_DEBUG_FS
struct dentry *blk_debugfs_root;
-#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_split);
EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug);
+EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert);
-DEFINE_IDA(blk_queue_ida);
+static DEFINE_IDA(blk_queue_ida);
/*
* For queue allocation
*/
-struct kmem_cache *blk_requestq_cachep;
+static struct kmem_cache *blk_requestq_cachep;
/*
* Controlling structure to kblockd
@@ -109,24 +108,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set);
-void blk_rq_init(struct request_queue *q, struct request *rq)
-{
- memset(rq, 0, sizeof(*rq));
-
- INIT_LIST_HEAD(&rq->queuelist);
- rq->q = q;
- rq->__sector = (sector_t) -1;
- INIT_HLIST_NODE(&rq->hash);
- RB_CLEAR_NODE(&rq->rb_node);
- rq->tag = -1;
- rq->internal_tag = -1;
- rq->start_time_ns = ktime_get_ns();
- rq->part = NULL;
- refcount_set(&rq->ref, 1);
- blk_crypto_rq_set_defaults(rq);
-}
-EXPORT_SYMBOL(blk_rq_init);
-
#define REQ_OP_NAME(name) [REQ_OP_##name] = #name
static const char *const blk_op_name[] = {
REQ_OP_NAME(READ),
@@ -140,10 +121,7 @@ static const char *const blk_op_name[] = {
REQ_OP_NAME(ZONE_CLOSE),
REQ_OP_NAME(ZONE_FINISH),
REQ_OP_NAME(ZONE_APPEND),
- REQ_OP_NAME(WRITE_SAME),
REQ_OP_NAME(WRITE_ZEROES),
- REQ_OP_NAME(SCSI_IN),
- REQ_OP_NAME(SCSI_OUT),
REQ_OP_NAME(DRV_IN),
REQ_OP_NAME(DRV_OUT),
};
@@ -157,7 +135,7 @@ static const char *const blk_op_name[] = {
* string format. Useful in the debugging and tracing bio or request. For
* invalid REQ_OP_XXX it returns string "UNKNOWN".
*/
-inline const char *blk_op_str(unsigned int op)
+inline const char *blk_op_str(enum req_op op)
{
const char *op_str = "UNKNOWN";
@@ -178,16 +156,24 @@ static const struct {
[BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" },
[BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" },
[BLK_STS_TARGET] = { -EREMOTEIO, "critical target" },
- [BLK_STS_NEXUS] = { -EBADE, "critical nexus" },
+ [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" },
[BLK_STS_MEDIUM] = { -ENODATA, "critical medium" },
[BLK_STS_PROTECTION] = { -EILSEQ, "protection" },
[BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" },
[BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" },
[BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" },
+ [BLK_STS_OFFLINE] = { -ENODEV, "device offline" },
/* device mapper special case, should not leak out: */
[BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" },
+ /* zone device specific errors */
+ [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" },
+ [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" },
+
+ /* Command duration limit device-side timeout */
+ [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" },
+
/* everything else not covered above: */
[BLK_STS_IOERR] = { -EIO, "I/O" },
};
@@ -215,65 +201,15 @@ int blk_status_to_errno(blk_status_t status)
}
EXPORT_SYMBOL_GPL(blk_status_to_errno);
-static void print_req_error(struct request *req, blk_status_t status,
- const char *caller)
+const char *blk_status_to_str(blk_status_t status)
{
int idx = (__force int)status;
if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors)))
- return;
-
- printk_ratelimited(KERN_ERR
- "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
- "phys_seg %u prio class %u\n",
- caller, blk_errors[idx].name,
- req->rq_disk ? req->rq_disk->disk_name : "?",
- blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)),
- req->cmd_flags & ~REQ_OP_MASK,
- req->nr_phys_segments,
- IOPRIO_PRIO_CLASS(req->ioprio));
-}
-
-static void req_bio_endio(struct request *rq, struct bio *bio,
- unsigned int nbytes, blk_status_t error)
-{
- if (error)
- bio->bi_status = error;
-
- if (unlikely(rq->rq_flags & RQF_QUIET))
- bio_set_flag(bio, BIO_QUIET);
-
- bio_advance(bio, nbytes);
-
- if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) {
- /*
- * Partial zone append completions cannot be supported as the
- * BIO fragments may end up not being written sequentially.
- */
- if (bio->bi_iter.bi_size)
- bio->bi_status = BLK_STS_IOERR;
- else
- bio->bi_iter.bi_sector = rq->__sector;
- }
-
- /* don't actually finish bio if it's part of flush sequence */
- if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
- bio_endio(bio);
+ return "<null>";
+ return blk_errors[idx].name;
}
-
-void blk_dump_rq_flags(struct request *rq, char *msg)
-{
- printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
- rq->rq_disk ? rq->rq_disk->disk_name : "?",
- (unsigned long long) rq->cmd_flags);
-
- printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
- (unsigned long long)blk_rq_pos(rq),
- blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
- printk(KERN_INFO " bio %p, biotail %p, len %u\n",
- rq->bio, rq->biotail, blk_rq_bytes(rq));
-}
-EXPORT_SYMBOL(blk_dump_rq_flags);
+EXPORT_SYMBOL_GPL(blk_status_to_str);
/**
* blk_sync_queue - cancel any pending callbacks on a queue
@@ -285,7 +221,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags);
* A block device may call blk_sync_queue to ensure that any
* such activity is cancelled, thus allowing it to release resources
* that the callbacks might use. The caller must already have made sure
- * that its ->make_request_fn will not re-add plugging prior to calling
+ * that its ->submit_bio will not re-add plugging prior to calling
* this function.
*
* This function does not cancel any asynchronous activity arising
@@ -321,156 +257,117 @@ void blk_clear_pm_only(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_clear_pm_only);
-void blk_put_queue(struct request_queue *q)
+static void blk_free_queue_rcu(struct rcu_head *rcu_head)
{
- kobject_put(&q->kobj);
+ struct request_queue *q = container_of(rcu_head,
+ struct request_queue, rcu_head);
+
+ percpu_ref_exit(&q->q_usage_counter);
+ kmem_cache_free(blk_requestq_cachep, q);
}
-EXPORT_SYMBOL(blk_put_queue);
-void blk_set_queue_dying(struct request_queue *q)
+static void blk_free_queue(struct request_queue *q)
{
- blk_queue_flag_set(QUEUE_FLAG_DYING, q);
-
- /*
- * When queue DYING flag is set, we need to block new req
- * entering queue, so we call blk_freeze_queue_start() to
- * prevent I/O from crossing blk_queue_enter().
- */
- blk_freeze_queue_start(q);
-
+ blk_free_queue_stats(q->stats);
if (queue_is_mq(q))
- blk_mq_wake_waiters(q);
+ blk_mq_release(q);
- /* Make blk_queue_enter() reexamine the DYING flag. */
- wake_up_all(&q->mq_freeze_wq);
+ ida_free(&blk_queue_ida, q->id);
+ call_rcu(&q->rcu_head, blk_free_queue_rcu);
}
-EXPORT_SYMBOL_GPL(blk_set_queue_dying);
/**
- * blk_cleanup_queue - shutdown a request queue
- * @q: request queue to shutdown
+ * blk_put_queue - decrement the request_queue refcount
+ * @q: the request_queue structure to decrement the refcount for
*
- * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and
- * put it. All future requests will be failed immediately with -ENODEV.
+ * Decrements the refcount of the request_queue and free it when the refcount
+ * reaches 0.
*/
-void blk_cleanup_queue(struct request_queue *q)
+void blk_put_queue(struct request_queue *q)
{
- WARN_ON_ONCE(blk_queue_registered(q));
-
- /* mark @q DYING, no new request or merges will be allowed afterwards */
- blk_set_queue_dying(q);
-
- blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q);
- blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q);
+ if (refcount_dec_and_test(&q->refs))
+ blk_free_queue(q);
+}
+EXPORT_SYMBOL(blk_put_queue);
+void blk_queue_start_drain(struct request_queue *q)
+{
/*
- * Drain all requests queued before DYING marking. Set DEAD flag to
- * prevent that blk_mq_run_hw_queues() accesses the hardware queues
- * after draining finished.
+ * When queue DYING flag is set, we need to block new req
+ * entering queue, so we call blk_freeze_queue_start() to
+ * prevent I/O from crossing blk_queue_enter().
*/
- blk_freeze_queue(q);
-
- rq_qos_exit(q);
-
- blk_queue_flag_set(QUEUE_FLAG_DEAD, q);
-
- /* for synchronous bio-based driver finish in-flight integrity i/o */
- blk_flush_integrity();
-
- /* @q won't process any more request, flush async actions */
- del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer);
- blk_sync_queue(q);
-
+ blk_freeze_queue_start(q);
if (queue_is_mq(q))
- blk_mq_exit_queue(q);
-
- /*
- * In theory, request pool of sched_tags belongs to request queue.
- * However, the current implementation requires tag_set for freeing
- * requests, so free the pool now.
- *
- * Queue has become frozen, there can't be any in-queue requests, so
- * it is safe to free requests now.
- */
- mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- blk_mq_sched_free_requests(q);
- mutex_unlock(&q->sysfs_lock);
-
- percpu_ref_exit(&q->q_usage_counter);
-
- /* @q is and will stay empty, shutdown and put */
- blk_put_queue(q);
+ blk_mq_wake_waiters(q);
+ /* Make blk_queue_enter() reexamine the DYING flag. */
+ wake_up_all(&q->mq_freeze_wq);
}
-EXPORT_SYMBOL(blk_cleanup_queue);
/**
* blk_queue_enter() - try to increase q->q_usage_counter
* @q: request queue pointer
- * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT
+ * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM
*/
int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags)
{
- const bool pm = flags & BLK_MQ_REQ_PREEMPT;
-
- while (true) {
- bool success = false;
-
- rcu_read_lock();
- if (percpu_ref_tryget_live(&q->q_usage_counter)) {
- /*
- * The code that increments the pm_only counter is
- * responsible for ensuring that that counter is
- * globally visible before the queue is unfrozen.
- */
- if (pm || !blk_queue_pm_only(q)) {
- success = true;
- } else {
- percpu_ref_put(&q->q_usage_counter);
- }
- }
- rcu_read_unlock();
-
- if (success)
- return 0;
+ const bool pm = flags & BLK_MQ_REQ_PM;
+ while (!blk_try_enter_queue(q, pm)) {
if (flags & BLK_MQ_REQ_NOWAIT)
- return -EBUSY;
+ return -EAGAIN;
/*
- * read pair of barrier in blk_freeze_queue_start(),
- * we need to order reading __PERCPU_REF_DEAD flag of
- * .q_usage_counter and reading .mq_freeze_depth or
- * queue dying flag, otherwise the following wait may
- * never return if the two reads are reordered.
+ * read pair of barrier in blk_freeze_queue_start(), we need to
+ * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
+ * reading .mq_freeze_depth or queue dying flag, otherwise the
+ * following wait may never return if the two reads are
+ * reordered.
*/
smp_rmb();
-
wait_event(q->mq_freeze_wq,
(!q->mq_freeze_depth &&
- (pm || (blk_pm_request_resume(q),
- !blk_queue_pm_only(q)))) ||
+ blk_pm_resume_queue(pm, q)) ||
blk_queue_dying(q));
if (blk_queue_dying(q))
return -ENODEV;
}
+
+ return 0;
}
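
A hedged pthread sketch of the enter loop: try to get in, otherwise sleep until the freeze is lifted or the queue is marked dying. gate and gate_enter() are invented; the kernel relies on q_usage_counter, memory barriers and mq_freeze_wq instead of a mutex and condition variable.

#include <pthread.h>
#include <stdbool.h>

struct gate {
	pthread_mutex_t lock;
	pthread_cond_t wake;
	int freeze_depth;		/* > 0 while the queue is frozen */
	bool dying;
};

static int gate_enter(struct gate *g)
{
	pthread_mutex_lock(&g->lock);
	while (g->freeze_depth && !g->dying)
		pthread_cond_wait(&g->wake, &g->lock);	/* like mq_freeze_wq */
	if (g->dying) {
		pthread_mutex_unlock(&g->lock);
		return -1;		/* the kernel returns -ENODEV here */
	}
	pthread_mutex_unlock(&g->lock);
	return 0;
}
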
-static inline int bio_queue_enter(struct bio *bio)
+int __bio_queue_enter(struct request_queue *q, struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
- bool nowait = bio->bi_opf & REQ_NOWAIT;
- int ret;
+ while (!blk_try_enter_queue(q, false)) {
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
- ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0);
- if (unlikely(ret)) {
- if (nowait && !blk_queue_dying(q))
+ if (bio->bi_opf & REQ_NOWAIT) {
+ if (test_bit(GD_DEAD, &disk->state))
+ goto dead;
bio_wouldblock_error(bio);
- else
- bio_io_error(bio);
+ return -EAGAIN;
+ }
+
+ /*
+ * read pair of barrier in blk_freeze_queue_start(), we need to
+ * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and
+ * reading .mq_freeze_depth or queue dying flag, otherwise the
+ * following wait may never return if the two reads are
+ * reordered.
+ */
+ smp_rmb();
+ wait_event(q->mq_freeze_wq,
+ (!q->mq_freeze_depth &&
+ blk_pm_resume_queue(false, q)) ||
+ test_bit(GD_DEAD, &disk->state));
+ if (test_bit(GD_DEAD, &disk->state))
+ goto dead;
}
- return ret;
+ return 0;
+dead:
+ bio_io_error(bio);
+ return -ENODEV;
}
void blk_queue_exit(struct request_queue *q)
@@ -497,54 +394,38 @@ static void blk_timeout_work(struct work_struct *work)
{
}
-struct request_queue *__blk_alloc_queue(int node_id)
+struct request_queue *blk_alloc_queue(int node_id)
{
struct request_queue *q;
- int ret;
- q = kmem_cache_alloc_node(blk_requestq_cachep,
- GFP_KERNEL | __GFP_ZERO, node_id);
+ q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO,
+ node_id);
if (!q)
return NULL;
q->last_merge = NULL;
- q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL);
+ q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL);
if (q->id < 0)
goto fail_q;
- ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS);
- if (ret)
- goto fail_id;
-
- q->backing_dev_info = bdi_alloc(node_id);
- if (!q->backing_dev_info)
- goto fail_split;
-
q->stats = blk_alloc_queue_stats();
if (!q->stats)
- goto fail_stats;
+ goto fail_id;
- q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES;
- q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK;
q->node = node_id;
- timer_setup(&q->backing_dev_info->laptop_mode_wb_timer,
- laptop_mode_timer_fn, 0);
+ atomic_set(&q->nr_active_requests_shared_tags, 0);
+
timer_setup(&q->timeout, blk_rq_timed_out_timer, 0);
INIT_WORK(&q->timeout_work, blk_timeout_work);
INIT_LIST_HEAD(&q->icq_list);
-#ifdef CONFIG_BLK_CGROUP
- INIT_LIST_HEAD(&q->blkg_list);
-#endif
- kobject_init(&q->kobj, &blk_queue_ktype);
-
-#ifdef CONFIG_BLK_DEV_IO_TRACE
- mutex_init(&q->blk_trace_mutex);
-#endif
+ refcount_set(&q->refs, 1);
+ mutex_init(&q->debugfs_mutex);
mutex_init(&q->sysfs_lock);
mutex_init(&q->sysfs_dir_lock);
+ mutex_init(&q->rq_qos_mutex);
spin_lock_init(&q->queue_lock);
init_waitqueue_head(&q->mq_freeze_wq);
@@ -557,252 +438,38 @@ struct request_queue *__blk_alloc_queue(int node_id)
if (percpu_ref_init(&q->q_usage_counter,
blk_queue_usage_counter_release,
PERCPU_REF_INIT_ATOMIC, GFP_KERNEL))
- goto fail_bdi;
-
- if (blkcg_init_queue(q))
- goto fail_ref;
+ goto fail_stats;
- blk_queue_dma_alignment(q, 511);
blk_set_default_limits(&q->limits);
+ q->nr_requests = BLKDEV_DEFAULT_RQ;
return q;
-fail_ref:
- percpu_ref_exit(&q->q_usage_counter);
-fail_bdi:
- blk_free_queue_stats(q->stats);
fail_stats:
- bdi_put(q->backing_dev_info);
-fail_split:
- bioset_exit(&q->bio_split);
+ blk_free_queue_stats(q->stats);
fail_id:
- ida_simple_remove(&blk_queue_ida, q->id);
+ ida_free(&blk_queue_ida, q->id);
fail_q:
kmem_cache_free(blk_requestq_cachep, q);
return NULL;
}
-struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id)
-{
- struct request_queue *q;
-
- if (WARN_ON_ONCE(!make_request))
- return NULL;
-
- q = __blk_alloc_queue(node_id);
- if (!q)
- return NULL;
- q->make_request_fn = make_request;
- q->nr_requests = BLKDEV_MAX_RQ;
- return q;
-}
-EXPORT_SYMBOL(blk_alloc_queue);
-
-bool blk_get_queue(struct request_queue *q)
-{
- if (likely(!blk_queue_dying(q))) {
- __blk_get_queue(q);
- return true;
- }
-
- return false;
-}
-EXPORT_SYMBOL(blk_get_queue);
-
-/**
- * blk_get_request - allocate a request
- * @q: request queue to allocate a request for
- * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC.
- * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT.
- */
-struct request *blk_get_request(struct request_queue *q, unsigned int op,
- blk_mq_req_flags_t flags)
-{
- struct request *req;
-
- WARN_ON_ONCE(op & REQ_NOWAIT);
- WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT));
-
- req = blk_mq_alloc_request(q, op, flags);
- if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn)
- q->mq_ops->initialize_rq_fn(req);
-
- return req;
-}
-EXPORT_SYMBOL(blk_get_request);
-
-void blk_put_request(struct request *req)
-{
- blk_mq_free_request(req);
-}
-EXPORT_SYMBOL(blk_put_request);
-
-static void blk_account_io_merge_bio(struct request *req)
-{
- if (!blk_do_io_stat(req))
- return;
-
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
-}
-
-bool bio_attempt_back_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_back_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_backmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_free_ctx(bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_front_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs)
-{
- const int ff = bio->bi_opf & REQ_FAILFAST_MASK;
-
- if (!ll_front_merge_fn(req, bio, nr_segs))
- return false;
-
- trace_block_bio_frontmerge(req->q, req, bio);
- rq_qos_merge(req->q, req, bio);
-
- if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
- blk_rq_set_mixed_merge(req);
-
- bio->bi_next = req->bio;
- req->bio = bio;
-
- req->__sector = bio->bi_iter.bi_sector;
- req->__data_len += bio->bi_iter.bi_size;
-
- bio_crypt_do_front_merge(req, bio);
-
- blk_account_io_merge_bio(req);
- return true;
-}
-
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
- struct bio *bio)
-{
- unsigned short segments = blk_rq_nr_discard_segments(req);
-
- if (segments >= queue_max_discard_segments(q))
- goto no_merge;
- if (blk_rq_sectors(req) + bio_sectors(bio) >
- blk_rq_get_max_sectors(req, blk_rq_pos(req)))
- goto no_merge;
-
- rq_qos_merge(q, req, bio);
-
- req->biotail->bi_next = bio;
- req->biotail = bio;
- req->__data_len += bio->bi_iter.bi_size;
- req->nr_phys_segments = segments + 1;
-
- blk_account_io_merge_bio(req);
- return true;
-no_merge:
- req_set_nomerge(q, req);
- return false;
-}
-
/**
- * blk_attempt_plug_merge - try to merge with %current's plugged list
- * @q: request_queue new bio is being queued at
- * @bio: new bio being queued
- * @nr_segs: number of segments in @bio
- * @same_queue_rq: pointer to &struct request that gets filled in when
- * another request associated with @q is found on the plug list
- * (optional, may be %NULL)
- *
- * Determine whether @bio being queued on @q can be merged with a request
- * on %current's plugged list. Returns %true if merge was successful,
- * otherwise %false.
+ * blk_get_queue - increment the request_queue refcount
+ * @q: the request_queue structure to increment the refcount for
*
- * Plugging coalesces IOs from the same issuer for the same purpose without
- * going through @q->queue_lock. As such it's more of an issuing mechanism
- * than scheduling, and the request, while may have elvpriv data, is not
- * added on the elevator at this point. In addition, we don't have
- * reliable access to the elevator outside queue lock. Only check basic
- * merging parameters without querying the elevator.
+ * Increment the refcount of the request_queue kobject.
*
- * Caller must ensure !blk_queue_nomerges(q) beforehand.
+ * Context: Any context.
*/
-bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq)
+bool blk_get_queue(struct request_queue *q)
{
- struct blk_plug *plug;
- struct request *rq;
- struct list_head *plug_list;
-
- plug = blk_mq_plug(q, bio);
- if (!plug)
+ if (unlikely(blk_queue_dying(q)))
return false;
-
- plug_list = &plug->mq_list;
-
- list_for_each_entry_reverse(rq, plug_list, queuelist) {
- bool merged = false;
-
- if (rq->q == q && same_queue_rq) {
- /*
- * Only blk-mq multiple hardware queues case checks the
- * rq in the same queue, there should be only one such
- * rq in a queue
- **/
- *same_queue_rq = rq;
- }
-
- if (rq->q != q || !blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- merged = bio_attempt_back_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_FRONT_MERGE:
- merged = bio_attempt_front_merge(rq, bio, nr_segs);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- break;
- }
-
- if (merged)
- return true;
- }
-
- return false;
-}
-
-static void handle_bad_sector(struct bio *bio, sector_t maxsector)
-{
- char b[BDEVNAME_SIZE];
-
- printk(KERN_INFO "attempt to access beyond end of device\n");
- printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n",
- bio_devname(bio, b), bio->bi_opf,
- (unsigned long long)bio_end_sector(bio),
- (long long)maxsector);
+ refcount_inc(&q->refs);
+ return true;
}
+EXPORT_SYMBOL(blk_get_queue);
#ifdef CONFIG_FAIL_MAKE_REQUEST
@@ -814,9 +481,9 @@ static int __init setup_fail_make_request(char *str)
}
__setup("fail_make_request=", setup_fail_make_request);
-static bool should_fail_request(struct hd_struct *part, unsigned int bytes)
+bool should_fail_request(struct block_device *part, unsigned int bytes)
{
- return part->make_it_fail && should_fail(&fail_make_request, bytes);
+ return part->bd_make_it_fail && should_fail(&fail_make_request, bytes);
}
static int __init fail_make_request_debugfs(void)
@@ -828,41 +495,30 @@ static int __init fail_make_request_debugfs(void)
}
late_initcall(fail_make_request_debugfs);
-
-#else /* CONFIG_FAIL_MAKE_REQUEST */
-
-static inline bool should_fail_request(struct hd_struct *part,
- unsigned int bytes)
-{
- return false;
-}
-
#endif /* CONFIG_FAIL_MAKE_REQUEST */
-static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part)
+static inline void bio_check_ro(struct bio *bio)
{
- const int op = bio_op(bio);
-
- if (part->policy && op_is_write(op)) {
- char b[BDEVNAME_SIZE];
-
+ if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
- return false;
+ return;
- WARN_ONCE(1,
- "generic_make_request: Trying to write "
- "to read-only block-device %s (partno %d)\n",
- bio_devname(bio, b), part->partno);
- /* Older lvm-tools actually trigger this */
- return false;
- }
+ if (bio->bi_bdev->bd_ro_warned)
+ return;
- return false;
+ bio->bi_bdev->bd_ro_warned = true;
+ /*
+ * Using an ioctl to set the underlying disk of a raid/dm device
+ * to read-only will trigger this.
+ */
+ pr_warn("Trying to write to read-only block-device %pg\n",
+ bio->bi_bdev);
+ }
}
static noinline int should_fail_bio(struct bio *bio)
{
- if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size))
+ if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size))
return -EIO;
return 0;
}
@@ -873,14 +529,18 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO);
* This may well happen - the kernel calls bread() without checking the size of
* the device, e.g., when mounting a file system.
*/
-static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
+static inline int bio_check_eod(struct bio *bio)
{
+ sector_t maxsector = bdev_nr_sectors(bio->bi_bdev);
unsigned int nr_sectors = bio_sectors(bio);
- if (nr_sectors && maxsector &&
+ if (nr_sectors &&
(nr_sectors > maxsector ||
bio->bi_iter.bi_sector > maxsector - nr_sectors)) {
- handle_bad_sector(bio, maxsector);
+ pr_info_ratelimited("%s: attempt to access beyond end of device\n"
+ "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n",
+ current->comm, bio->bi_bdev, bio->bi_opf,
+ bio->bi_iter.bi_sector, nr_sectors, maxsector);
return -EIO;
}
return 0;
@@ -889,32 +549,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector)
/*
* Remap block n of partition p to block n+start(p) of the disk.
*/
-static inline int blk_partition_remap(struct bio *bio)
+static int blk_partition_remap(struct bio *bio)
{
- struct hd_struct *p;
- int ret = -EIO;
+ struct block_device *p = bio->bi_bdev;
- rcu_read_lock();
- p = __disk_get_part(bio->bi_disk, bio->bi_partno);
- if (unlikely(!p))
- goto out;
if (unlikely(should_fail_request(p, bio->bi_iter.bi_size)))
- goto out;
- if (unlikely(bio_check_ro(bio, p)))
- goto out;
-
+ return -EIO;
if (bio_sectors(bio)) {
- if (bio_check_eod(bio, part_nr_sects_read(p)))
- goto out;
- bio->bi_iter.bi_sector += p->start_sect;
- trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p),
- bio->bi_iter.bi_sector - p->start_sect);
+ bio->bi_iter.bi_sector += p->bd_start_sect;
+ trace_block_bio_remap(bio, p->bd_dev,
+ bio->bi_iter.bi_sector -
+ p->bd_start_sect);
}
- bio->bi_partno = 0;
- ret = 0;
-out:
- rcu_read_unlock();
- return ret;
+ bio_set_flag(bio, BIO_REMAPPED);
+ return 0;
}
/*
@@ -923,16 +571,15 @@ out:
static inline blk_status_t blk_check_zone_append(struct request_queue *q,
struct bio *bio)
{
- sector_t pos = bio->bi_iter.bi_sector;
int nr_sectors = bio_sectors(bio);
/* Only applicable to zoned block devices */
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
return BLK_STS_NOTSUPP;
/* The bio sector must point to the start of a sequential zone */
- if (pos & (blk_queue_zone_sectors(q) - 1) ||
- !blk_queue_zone_is_seq(q, pos))
+ if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) ||
+ !bio_zone_is_seq(bio))
return BLK_STS_IOERR;
/*
@@ -952,73 +599,195 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q,
return BLK_STS_OK;
}
-static noinline_for_stack bool
-generic_make_request_checks(struct bio *bio)
+static void __submit_bio(struct bio *bio)
{
- struct request_queue *q;
- int nr_sectors = bio_sectors(bio);
- blk_status_t status = BLK_STS_IOERR;
- char b[BDEVNAME_SIZE];
+ if (unlikely(!blk_crypto_bio_prep(&bio)))
+ return;
- might_sleep();
+ if (!bio->bi_bdev->bd_has_submit_bio) {
+ blk_mq_submit_bio(bio);
+ } else if (likely(bio_queue_enter(bio) == 0)) {
+ struct gendisk *disk = bio->bi_bdev->bd_disk;
- q = bio->bi_disk->queue;
- if (unlikely(!q)) {
- printk(KERN_ERR
- "generic_make_request: Trying to access "
- "nonexistent block-device %s (%Lu)\n",
- bio_devname(bio, b), (long long)bio->bi_iter.bi_sector);
- goto end_io;
+ disk->fops->submit_bio(bio);
+ blk_queue_exit(disk->queue);
+ }
+}
+
+/*
+ * The loop in this function may be a bit non-obvious, and so deserves some
+ * explanation:
+ *
+ * - Before entering the loop, bio->bi_next is NULL (as all callers ensure
+ * that), so we have a list with a single bio.
+ * - We pretend that we have just taken it off a longer list, so we assign
+ * bio_list to a pointer to the bio_list_on_stack, thus initialising the
+ * bio_list of new bios to be added. ->submit_bio() may indeed add some more
+ * bios through a recursive call to submit_bio_noacct. If it did, we find a
+ * non-NULL value in bio_list and re-enter the loop from the top.
+ * - In this case we really did just take the bio of the top of the list (no
+ * pretending) and so remove it from bio_list, and call into ->submit_bio()
+ * again.
+ *
+ * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio.
+ * bio_list_on_stack[1] contains bios that were submitted before the current
+ * ->submit_bio, but that haven't been processed yet.
+ */
+static void __submit_bio_noacct(struct bio *bio)
+{
+ struct bio_list bio_list_on_stack[2];
+
+ BUG_ON(bio->bi_next);
+
+ bio_list_init(&bio_list_on_stack[0]);
+ current->bio_list = bio_list_on_stack;
+
+ do {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct bio_list lower, same;
+
+ /*
+ * Create a fresh bio_list for all subordinate requests.
+ */
+ bio_list_on_stack[1] = bio_list_on_stack[0];
+ bio_list_init(&bio_list_on_stack[0]);
+
+ __submit_bio(bio);
+
+ /*
+ * Sort new bios into those for a lower level and those for the
+ * same level.
+ */
+ bio_list_init(&lower);
+ bio_list_init(&same);
+ while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
+ if (q == bdev_get_queue(bio->bi_bdev))
+ bio_list_add(&same, bio);
+ else
+ bio_list_add(&lower, bio);
+
+ /*
+ * Now assemble so we handle the lowest level first.
+ */
+ bio_list_merge(&bio_list_on_stack[0], &lower);
+ bio_list_merge(&bio_list_on_stack[0], &same);
+ bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
+ } while ((bio = bio_list_pop(&bio_list_on_stack[0])));
+
+ current->bio_list = NULL;
+}
+
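
The recursion-avoidance idea in the comment block above, shrunk to a runnable toy: would-be recursive submissions are appended to one flat list and drained by a single loop, so stack depth stays bounded regardless of how deeply devices are stacked. The item/queue names are made up and the lower/same sorting is omitted.

#include <stdio.h>

#define MAX_ITEMS 16

struct item {
	int level;			/* how many more times this item re-submits */
};

int main(void)
{
	struct item queue[MAX_ITEMS] = { { .level = 3 } };
	int head = 0, tail = 1;

	while (head < tail) {
		struct item it = queue[head++];

		printf("handling item at level %d\n", it.level);
		/* what would have been a recursive submission becomes an append */
		if (it.level > 0 && tail < MAX_ITEMS)
			queue[tail++] = (struct item){ .level = it.level - 1 };
	}
	return 0;
}
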
+static void __submit_bio_noacct_mq(struct bio *bio)
+{
+ struct bio_list bio_list[2] = { };
+
+ current->bio_list = bio_list;
+
+ do {
+ __submit_bio(bio);
+ } while ((bio = bio_list_pop(&bio_list[0])));
+
+ current->bio_list = NULL;
+}
+
+void submit_bio_noacct_nocheck(struct bio *bio)
+{
+ blk_cgroup_bio_start(bio);
+ blkcg_bio_issue_init(bio);
+
+ if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
+ trace_block_bio_queue(bio);
+ /*
+ * Now that enqueuing has been traced, we need to trace
+ * completion as well.
+ */
+ bio_set_flag(bio, BIO_TRACE_COMPLETION);
}
/*
+ * We only want one ->submit_bio to be active at a time, else stack
+ * usage with stacked devices could be a problem. Use current->bio_list
+	 * to collect a list of requests submitted by a ->submit_bio method while
+	 * it is active, and then process them after it has returned.
+ */
+ if (current->bio_list)
+ bio_list_add(&current->bio_list[0], bio);
+ else if (!bio->bi_bdev->bd_has_submit_bio)
+ __submit_bio_noacct_mq(bio);
+ else
+ __submit_bio_noacct(bio);
+}
+
+/**
+ * submit_bio_noacct - re-submit a bio to the block device layer for I/O
+ * @bio: The bio describing the location in memory and on the device.
+ *
+ * This is a version of submit_bio() that shall only be used for I/O that is
+ * resubmitted to lower level drivers by stacking block drivers. All file
+ * systems and other upper level users of the block layer should use
+ * submit_bio() instead.
+ */
+void submit_bio_noacct(struct bio *bio)
+{
+ struct block_device *bdev = bio->bi_bdev;
+ struct request_queue *q = bdev_get_queue(bdev);
+ blk_status_t status = BLK_STS_IOERR;
+
+ might_sleep();
+
+ /*
* For a REQ_NOWAIT based request, return -EOPNOTSUPP
- * if queue is not a request based queue.
+ * if queue does not support NOWAIT.
*/
- if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q))
+ if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev))
goto not_supported;
if (should_fail_bio(bio))
goto end_io;
-
- if (bio->bi_partno) {
- if (unlikely(blk_partition_remap(bio)))
- goto end_io;
- } else {
- if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0)))
+ bio_check_ro(bio);
+ if (!bio_flagged(bio, BIO_REMAPPED)) {
+ if (unlikely(bio_check_eod(bio)))
goto end_io;
- if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk))))
+ if (bdev->bd_partno && unlikely(blk_partition_remap(bio)))
goto end_io;
}
/*
- * Filter flush bio's early so that make_request based
- * drivers without flush support don't have to worry
- * about them.
+ * Filter flush bio's early so that bio based drivers without flush
+ * support don't have to worry about them.
*/
- if (op_is_flush(bio->bi_opf) &&
- !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
- bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
- if (!nr_sectors) {
- status = BLK_STS_OK;
+ if (op_is_flush(bio->bi_opf)) {
+ if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE &&
+ bio_op(bio) != REQ_OP_ZONE_APPEND))
goto end_io;
+ if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) {
+ bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA);
+ if (!bio_sectors(bio)) {
+ status = BLK_STS_OK;
+ goto end_io;
+ }
}
}
if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
- bio->bi_opf &= ~REQ_HIPRI;
+ bio_clear_polled(bio);
switch (bio_op(bio)) {
+ case REQ_OP_READ:
+ case REQ_OP_WRITE:
+ break;
+ case REQ_OP_FLUSH:
+ /*
+ * REQ_OP_FLUSH can't be submitted through bios, it is only
+ * synthetized in struct request by the flush state machine.
+ */
+ goto not_supported;
case REQ_OP_DISCARD:
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(bdev))
goto not_supported;
break;
case REQ_OP_SECURE_ERASE:
- if (!blk_queue_secure_erase(q))
- goto not_supported;
- break;
- case REQ_OP_WRITE_SAME:
- if (!q->limits.max_write_same_sectors)
+ if (!bdev_max_secure_erase_sectors(bdev))
goto not_supported;
break;
case REQ_OP_ZONE_APPEND:
@@ -1026,186 +795,52 @@ generic_make_request_checks(struct bio *bio)
if (status != BLK_STS_OK)
goto end_io;
break;
+ case REQ_OP_WRITE_ZEROES:
+ if (!q->limits.max_write_zeroes_sectors)
+ goto not_supported;
+ break;
case REQ_OP_ZONE_RESET:
case REQ_OP_ZONE_OPEN:
case REQ_OP_ZONE_CLOSE:
case REQ_OP_ZONE_FINISH:
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bio->bi_bdev))
goto not_supported;
break;
case REQ_OP_ZONE_RESET_ALL:
- if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q))
- goto not_supported;
- break;
- case REQ_OP_WRITE_ZEROES:
- if (!q->limits.max_write_zeroes_sectors)
+ if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q))
goto not_supported;
break;
+ case REQ_OP_DRV_IN:
+ case REQ_OP_DRV_OUT:
+ /*
+ * Driver private operations are only used with passthrough
+ * requests.
+ */
+ fallthrough;
default:
- break;
+ goto not_supported;
}
- /*
- * Various block parts want %current->io_context, so allocate it up
- * front rather than dealing with lots of pain to allocate it only
- * where needed. This may fail and the block layer knows how to live
- * with it.
- */
- if (unlikely(!current->io_context))
- create_task_io_context(current, GFP_ATOMIC, q->node);
-
- if (!blkcg_bio_issue_check(q, bio))
- return false;
-
- if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) {
- trace_block_bio_queue(q, bio);
- /* Now that enqueuing has been traced, we need to trace
- * completion as well.
- */
- bio_set_flag(bio, BIO_TRACE_COMPLETION);
- }
- return true;
+ if (blk_throtl_bio(bio))
+ return;
+ submit_bio_noacct_nocheck(bio);
+ return;
not_supported:
status = BLK_STS_NOTSUPP;
end_io:
bio->bi_status = status;
bio_endio(bio);
- return false;
-}
-
-static blk_qc_t do_make_request(struct bio *bio)
-{
- struct request_queue *q = bio->bi_disk->queue;
- blk_qc_t ret = BLK_QC_T_NONE;
-
- if (blk_crypto_bio_prep(&bio)) {
- if (!q->make_request_fn)
- return blk_mq_make_request(q, bio);
- ret = q->make_request_fn(q, bio);
- }
- blk_queue_exit(q);
- return ret;
-}
-
-/**
- * generic_make_request - re-submit a bio to the block device layer for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * This is a version of submit_bio() that shall only be used for I/O that is
- * resubmitted to lower level drivers by stacking block drivers. All file
- * systems and other upper level users of the block layer should use
- * submit_bio() instead.
- */
-blk_qc_t generic_make_request(struct bio *bio)
-{
- /*
- * bio_list_on_stack[0] contains bios submitted by the current
- * make_request_fn.
- * bio_list_on_stack[1] contains bios that were submitted before
- * the current make_request_fn, but that haven't been processed
- * yet.
- */
- struct bio_list bio_list_on_stack[2];
- blk_qc_t ret = BLK_QC_T_NONE;
-
- if (!generic_make_request_checks(bio))
- goto out;
-
- /*
- * We only want one ->make_request_fn to be active at a time, else
- * stack usage with stacked devices could be a problem. So use
- * current->bio_list to keep a list of requests submited by a
- * make_request_fn function. current->bio_list is also used as a
- * flag to say if generic_make_request is currently active in this
- * task or not. If it is NULL, then no make_request is active. If
- * it is non-NULL, then a make_request is active, and new requests
- * should be added at the tail
- */
- if (current->bio_list) {
- bio_list_add(&current->bio_list[0], bio);
- goto out;
- }
-
- /* following loop may be a bit non-obvious, and so deserves some
- * explanation.
- * Before entering the loop, bio->bi_next is NULL (as all callers
- * ensure that) so we have a list with a single bio.
- * We pretend that we have just taken it off a longer list, so
- * we assign bio_list to a pointer to the bio_list_on_stack,
- * thus initialising the bio_list of new bios to be
- * added. ->make_request() may indeed add some more bios
- * through a recursive call to generic_make_request. If it
- * did, we find a non-NULL value in bio_list and re-enter the loop
- * from the top. In this case we really did just take the bio
- * of the top of the list (no pretending) and so remove it from
- * bio_list, and call into ->make_request() again.
- */
- BUG_ON(bio->bi_next);
- bio_list_init(&bio_list_on_stack[0]);
- current->bio_list = bio_list_on_stack;
- do {
- struct request_queue *q = bio->bi_disk->queue;
-
- if (likely(bio_queue_enter(bio) == 0)) {
- struct bio_list lower, same;
-
- /* Create a fresh bio_list for all subordinate requests */
- bio_list_on_stack[1] = bio_list_on_stack[0];
- bio_list_init(&bio_list_on_stack[0]);
- ret = do_make_request(bio);
-
- /* sort new bios into those for a lower level
- * and those for the same level
- */
- bio_list_init(&lower);
- bio_list_init(&same);
- while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL)
- if (q == bio->bi_disk->queue)
- bio_list_add(&same, bio);
- else
- bio_list_add(&lower, bio);
- /* now assemble so we handle the lowest level first */
- bio_list_merge(&bio_list_on_stack[0], &lower);
- bio_list_merge(&bio_list_on_stack[0], &same);
- bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]);
- }
- bio = bio_list_pop(&bio_list_on_stack[0]);
- } while (bio);
- current->bio_list = NULL; /* deactivate */
-
-out:
- return ret;
}
-EXPORT_SYMBOL(generic_make_request);
+EXPORT_SYMBOL(submit_bio_noacct);
-/**
- * direct_make_request - hand a buffer directly to its device driver for I/O
- * @bio: The bio describing the location in memory and on the device.
- *
- * This function behaves like generic_make_request(), but does not protect
- * against recursion. Must only be used if the called driver is known
- * to be blk-mq based.
- */
-blk_qc_t direct_make_request(struct bio *bio)
+static void bio_set_ioprio(struct bio *bio)
{
- struct request_queue *q = bio->bi_disk->queue;
-
- if (WARN_ON_ONCE(q->make_request_fn)) {
- bio_io_error(bio);
- return BLK_QC_T_NONE;
- }
- if (!generic_make_request_checks(bio))
- return BLK_QC_T_NONE;
- if (unlikely(bio_queue_enter(bio)))
- return BLK_QC_T_NONE;
- if (!blk_crypto_bio_prep(&bio)) {
- blk_queue_exit(q);
- return BLK_QC_T_NONE;
- }
- return blk_mq_make_request(q, bio);
+ /* Nobody set ioprio so far? Initialize it based on task's nice value */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE)
+ bio->bi_ioprio = get_current_ioprio();
+ blkcg_set_ioprio(bio);
}
-EXPORT_SYMBOL_GPL(direct_make_request);
/**
* submit_bio - submit a bio to the block device layer for I/O
@@ -1213,429 +848,189 @@ EXPORT_SYMBOL_GPL(direct_make_request);
*
* submit_bio() is used to submit I/O requests to block devices. It is passed a
* fully set up &struct bio that describes the I/O that needs to be done. The
- * bio will be send to the device described by the bi_disk and bi_partno fields.
+ * bio will be sent to the device described by the bi_bdev field.
*
* The success/failure status of the request, along with notification of
* completion, is delivered asynchronously through the ->bi_end_io() callback
- * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has
+ * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has
* been called.
*/
-blk_qc_t submit_bio(struct bio *bio)
+void submit_bio(struct bio *bio)
{
- if (blkcg_punt_bio_submit(bio))
- return BLK_QC_T_NONE;
-
- /*
- * If it's a regular read/write or a barrier with data attached,
- * go through the normal accounting stuff before submission.
- */
- if (bio_has_data(bio)) {
- unsigned int count;
-
- if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME))
- count = queue_logical_block_size(bio->bi_disk->queue) >> 9;
- else
- count = bio_sectors(bio);
-
- if (op_is_write(bio_op(bio))) {
- count_vm_events(PGPGOUT, count);
- } else {
- task_io_account_read(bio->bi_iter.bi_size);
- count_vm_events(PGPGIN, count);
- }
-
- if (unlikely(block_dump)) {
- char b[BDEVNAME_SIZE];
- printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n",
- current->comm, task_pid_nr(current),
- op_is_write(bio_op(bio)) ? "WRITE" : "READ",
- (unsigned long long)bio->bi_iter.bi_sector,
- bio_devname(bio, b), count);
- }
- }
-
- /*
- * If we're reading data that is part of the userspace workingset, count
- * submission time as memory stall. When the device is congested, or
- * the submitting cgroup IO-throttled, submission can be a significant
- * part of overall IO time.
- */
- if (unlikely(bio_op(bio) == REQ_OP_READ &&
- bio_flagged(bio, BIO_WORKINGSET))) {
- unsigned long pflags;
- blk_qc_t ret;
-
- psi_memstall_enter(&pflags);
- ret = generic_make_request(bio);
- psi_memstall_leave(&pflags);
-
- return ret;
+ if (bio_op(bio) == REQ_OP_READ) {
+ task_io_account_read(bio->bi_iter.bi_size);
+ count_vm_events(PGPGIN, bio_sectors(bio));
+ } else if (bio_op(bio) == REQ_OP_WRITE) {
+ count_vm_events(PGPGOUT, bio_sectors(bio));
}
- return generic_make_request(bio);
+ bio_set_ioprio(bio);
+ submit_bio_noacct(bio);
}
EXPORT_SYMBOL(submit_bio);
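
The kerneldoc above describes an asynchronous contract. Below is a minimal illustrative sketch (not part of this diff) of a synchronous single-sector read built on submit_bio() and ->bi_end_io; my_read_sector() and struct my_sync_io are invented names.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/completion.h>

struct my_sync_io {
	struct completion done;
	blk_status_t status;
};

static void my_sync_end_io(struct bio *bio)
{
	struct my_sync_io *io = bio->bi_private;

	io->status = bio->bi_status;	/* record the result before dropping the bio */
	bio_put(bio);
	complete(&io->done);
}

static int my_read_sector(struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	struct my_sync_io io;
	struct bio *bio;

	init_completion(&io.done);
	bio = bio_alloc(bdev, 1, REQ_OP_READ, GFP_KERNEL);	/* cannot fail with GFP_KERNEL */
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, SECTOR_SIZE, 0);
	bio->bi_private = &io;
	bio->bi_end_io = my_sync_end_io;

	submit_bio(bio);
	/* per the rule above, do not touch the bio until ->bi_end_io has run */
	wait_for_completion(&io.done);
	return blk_status_to_errno(io.status);
}

For this exact wait-for-completion pattern the kernel already provides submit_bio_wait(); the open-coded form is shown only to make the ->bi_end_io rule explicit.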
/**
- * blk_cloned_rq_check_limits - Helper function to check a cloned request
- * for the new queue limits
- * @q: the queue
- * @rq: the request being checked
+ * bio_poll - poll for BIO completions
+ * @bio: bio to poll for
+ * @iob: batches of IO
+ * @flags: BLK_POLL_* flags that control the behavior
*
- * Description:
- * @rq may have been made based on weaker limitations of upper-level queues
- * in request stacking drivers, and it may violate the limitation of @q.
- * Since the block layer and the underlying device driver trust @rq
- * after it is inserted to @q, it should be checked against @q before
- * the insertion using this generic function.
+ * Poll for completions on the queue associated with the bio. Returns the number of
+ * completed entries found.
*
- * Request stacking drivers like request-based dm may change the queue
- * limits when retrying requests on other queues. Those requests need
- * to be checked against the new queue limits again during dispatch.
- */
-static int blk_cloned_rq_check_limits(struct request_queue *q,
- struct request *rq)
-{
- if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) {
- printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
- __func__, blk_rq_sectors(rq),
- blk_queue_get_max_sectors(q, req_op(rq)));
- return -EIO;
- }
-
- /*
- * queue's settings related to segment counting like q->bounce_pfn
- * may differ from that of other stacking queues.
- * Recalculate it to check the request correctly on this queue's
- * limitation.
- */
- rq->nr_phys_segments = blk_recalc_rq_segments(rq);
- if (rq->nr_phys_segments > queue_max_segments(q)) {
- printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n",
- __func__, rq->nr_phys_segments, queue_max_segments(q));
- return -EIO;
- }
-
- return 0;
-}
-
-/**
- * blk_insert_cloned_request - Helper for stacking drivers to submit a request
- * @q: the queue to submit the request
- * @rq: the request being queued
+ * Note: the caller must either be the context that submitted @bio, or
+ * be in an RCU critical section to prevent freeing of @bio.
*/
-blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq)
+int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags)
{
- if (blk_cloned_rq_check_limits(q, rq))
- return BLK_STS_IOERR;
+ blk_qc_t cookie = READ_ONCE(bio->bi_cookie);
+ struct block_device *bdev;
+ struct request_queue *q;
+ int ret = 0;
- if (rq->rq_disk &&
- should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq)))
- return BLK_STS_IOERR;
+ bdev = READ_ONCE(bio->bi_bdev);
+ if (!bdev)
+ return 0;
- if (blk_crypto_insert_cloned_request(rq))
- return BLK_STS_IOERR;
+ q = bdev_get_queue(bdev);
+ if (cookie == BLK_QC_T_NONE ||
+ !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ return 0;
- if (blk_queue_io_stat(q))
- blk_account_io_start(rq);
+ /*
+ * As the requests that require a zone lock are not plugged in the
+ * first place, directly accessing the plug instead of using
+ * blk_mq_plug() should not have any consequences during flushing for
+ * zoned devices.
+ */
+ blk_flush_plug(current->plug, false);
/*
- * Since we have a scheduler attached on the top device,
- * bypass a potential scheduler on the bottom device for
- * insert.
+ * We need to be able to enter a frozen queue, similar to how
+ * timeouts also need to do that. If that is blocked, then we can
+ * have pending IO when a queue freeze is started, and then the
+ * wait for the freeze to finish will wait for polled requests to
+	 * time out as the poller is prevented from entering the queue and
+ * completing them. As long as we prevent new IO from being queued,
+ * that should be all that matters.
*/
- return blk_mq_request_issue_directly(rq, true);
+ if (!percpu_ref_tryget(&q->q_usage_counter))
+ return 0;
+ if (queue_is_mq(q)) {
+ ret = blk_mq_poll(q, cookie, iob, flags);
+ } else {
+ struct gendisk *disk = q->disk;
+
+ if (disk && disk->fops->poll_bio)
+ ret = disk->fops->poll_bio(bio, iob, flags);
+ }
+ blk_queue_exit(q);
+ return ret;
}
-EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
+EXPORT_SYMBOL_GPL(bio_poll);
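
A hedged usage sketch (not part of this diff) of the polling loop a submitter might run: the bio is submitted with REQ_POLLED and then reaped by calling bio_poll() from the submitting context, as the note above requires. my_polled_read() and the on-stack done flag are invented for illustration.

#include <linux/bio.h>
#include <linux/blkdev.h>

static void my_poll_end_io(struct bio *bio)
{
	/* pairs with the acquire in the polling loop below */
	smp_store_release((bool *)bio->bi_private, true);
}

static int my_polled_read(struct block_device *bdev, sector_t sector,
			  struct page *page)
{
	bool done = false;
	struct bio *bio;

	bio = bio_alloc(bdev, 1, REQ_OP_READ | REQ_POLLED, GFP_KERNEL);
	bio->bi_iter.bi_sector = sector;
	__bio_add_page(bio, page, SECTOR_SIZE, 0);
	bio->bi_private = &done;
	bio->bi_end_io = my_poll_end_io;
	submit_bio(bio);

	/* reap completions directly instead of sleeping on an interrupt */
	while (!smp_load_acquire(&done))
		bio_poll(bio, NULL, 0);

	/* the submitting context still owns the bio, as required above */
	return blk_status_to_errno(bio->bi_status);
}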
-/**
- * blk_rq_err_bytes - determine number of bytes till the next failure boundary
- * @rq: request to examine
- *
- * Description:
- * A request could be merge of IOs which require different failure
- * handling. This function determines the number of bytes which
- * can be failed from the beginning of the request without
- * crossing into area which need to be retried further.
- *
- * Return:
- * The number of bytes to fail.
+/*
+ * Helper to implement file_operations.iopoll. Requires the bio to be stored
+ * in iocb->private, and cleared before freeing the bio.
*/
-unsigned int blk_rq_err_bytes(const struct request *rq)
+int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob,
+ unsigned int flags)
{
- unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
- unsigned int bytes = 0;
struct bio *bio;
-
- if (!(rq->rq_flags & RQF_MIXED_MERGE))
- return blk_rq_bytes(rq);
+ int ret = 0;
/*
- * Currently the only 'mixing' which can happen is between
- * different fastfail types. We can safely fail portions
- * which have all the failfast bits that the first one has -
- * the ones which are at least as eager to fail as the first
- * one.
+ * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can
+ * point to a freshly allocated bio at this point. If that happens
+ * we have a few cases to consider:
+ *
+	 * 1) the bio is being initialized and bi_bdev is NULL. We can simply
+	 *    do nothing in this case
+ * 2) the bio points to a not poll enabled device. bio_poll will catch
+ * this and return 0
+ * 3) the bio points to a poll capable device, including but not
+ * limited to the one that the original bio pointed to. In this
+ * case we will call into the actual poll method and poll for I/O,
+ * even if we don't need to, but it won't cause harm either.
+ *
+ * For cases 2) and 3) above the RCU grace period ensures that bi_bdev
+ * is still allocated. Because partitions hold a reference to the whole
+ * device bdev and thus disk, the disk is also still valid. Grabbing
+ * a reference to the queue in bio_poll() ensures the hctxs and requests
+ * are still valid as well.
*/
- for (bio = rq->bio; bio; bio = bio->bi_next) {
- if ((bio->bi_opf & ff) != ff)
- break;
- bytes += bio->bi_iter.bi_size;
- }
+ rcu_read_lock();
+ bio = READ_ONCE(kiocb->private);
+ if (bio)
+ ret = bio_poll(bio, iob, flags);
+ rcu_read_unlock();
- /* this could lead to infinite loop */
- BUG_ON(blk_rq_bytes(rq) && !bytes);
- return bytes;
+ return ret;
}
-EXPORT_SYMBOL_GPL(blk_rq_err_bytes);
+EXPORT_SYMBOL_GPL(iocb_bio_iopoll);
-static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end)
+void update_io_ticks(struct block_device *part, unsigned long now, bool end)
{
unsigned long stamp;
again:
- stamp = READ_ONCE(part->stamp);
- if (unlikely(stamp != now)) {
- if (likely(cmpxchg(&part->stamp, stamp, now) == stamp))
+ stamp = READ_ONCE(part->bd_stamp);
+ if (unlikely(time_after(now, stamp))) {
+ if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now)))
__part_stat_add(part, io_ticks, end ? now - stamp : 1);
}
- if (part->partno) {
- part = &part_to_disk(part)->part0;
+ if (part->bd_partno) {
+ part = bdev_whole(part);
goto again;
}
}
-static void blk_account_io_completion(struct request *req, unsigned int bytes)
-{
- if (req->part && blk_do_io_stat(req)) {
- const int sgrp = op_stat_group(req_op(req));
- struct hd_struct *part;
-
- part_stat_lock();
- part = req->part;
- part_stat_add(part, sectors[sgrp], bytes >> 9);
- part_stat_unlock();
- }
-}
-
-void blk_account_io_done(struct request *req, u64 now)
+unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op,
+ unsigned long start_time)
{
- /*
- * Account IO completion. flush_rq isn't accounted as a
- * normal IO on queueing nor completion. Accounting the
- * containing request is enough.
- */
- if (req->part && blk_do_io_stat(req) &&
- !(req->rq_flags & RQF_FLUSH_SEQ)) {
- const int sgrp = op_stat_group(req_op(req));
- struct hd_struct *part;
-
- part_stat_lock();
- part = req->part;
-
- update_io_ticks(part, jiffies, true);
- part_stat_inc(part, ios[sgrp]);
- part_stat_add(part, nsecs[sgrp], now - req->start_time_ns);
- part_stat_unlock();
-
- hd_struct_put(part);
- }
-}
-
-void blk_account_io_start(struct request *rq)
-{
- if (!blk_do_io_stat(rq))
- return;
-
- rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq));
-
part_stat_lock();
- update_io_ticks(rq->part, jiffies, false);
+ update_io_ticks(bdev, start_time, false);
+ part_stat_local_inc(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
+
+ return start_time;
}
+EXPORT_SYMBOL(bdev_start_io_acct);
-unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors,
- unsigned int op)
+/**
+ * bio_start_io_acct - start I/O accounting for bio based drivers
+ * @bio: bio to start account for
+ *
+ * Returns the start time that should be passed back to bio_end_io_acct().
+ */
+unsigned long bio_start_io_acct(struct bio *bio)
{
- struct hd_struct *part = &disk->part0;
- const int sgrp = op_stat_group(op);
- unsigned long now = READ_ONCE(jiffies);
-
- part_stat_lock();
- update_io_ticks(part, now, false);
- part_stat_inc(part, ios[sgrp]);
- part_stat_add(part, sectors[sgrp], sectors);
- part_stat_local_inc(part, in_flight[op_is_write(op)]);
- part_stat_unlock();
-
- return now;
+ return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
}
-EXPORT_SYMBOL(disk_start_io_acct);
+EXPORT_SYMBOL_GPL(bio_start_io_acct);
-void disk_end_io_acct(struct gendisk *disk, unsigned int op,
- unsigned long start_time)
+void bdev_end_io_acct(struct block_device *bdev, enum req_op op,
+ unsigned int sectors, unsigned long start_time)
{
- struct hd_struct *part = &disk->part0;
const int sgrp = op_stat_group(op);
unsigned long now = READ_ONCE(jiffies);
unsigned long duration = now - start_time;
part_stat_lock();
- update_io_ticks(part, now, true);
- part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration));
- part_stat_local_dec(part, in_flight[op_is_write(op)]);
+ update_io_ticks(bdev, now, true);
+ part_stat_inc(bdev, ios[sgrp]);
+ part_stat_add(bdev, sectors[sgrp], sectors);
+ part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration));
+ part_stat_local_dec(bdev, in_flight[op_is_write(op)]);
part_stat_unlock();
}
-EXPORT_SYMBOL(disk_end_io_acct);
-
-/*
- * Steal bios from a request and add them to a bio list.
- * The request must not have been partially completed before.
- */
-void blk_steal_bios(struct bio_list *list, struct request *rq)
-{
- if (rq->bio) {
- if (list->tail)
- list->tail->bi_next = rq->bio;
- else
- list->head = rq->bio;
- list->tail = rq->biotail;
-
- rq->bio = NULL;
- rq->biotail = NULL;
- }
-
- rq->__data_len = 0;
-}
-EXPORT_SYMBOL_GPL(blk_steal_bios);
-
-/**
- * blk_update_request - Special helper function for request stacking drivers
- * @req: the request being processed
- * @error: block status code
- * @nr_bytes: number of bytes to complete @req
- *
- * Description:
- * Ends I/O on a number of bytes attached to @req, but doesn't complete
- * the request structure even if @req doesn't have leftover.
- * If @req has leftover, sets it up for the next range of segments.
- *
- * This special helper function is only for request stacking drivers
- * (e.g. request-based dm) so that they can handle partial completion.
- * Actual device drivers should use blk_mq_end_request instead.
- *
- * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
- * %false return from this function.
- *
- * Note:
- * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both
- * blk_rq_bytes() and in blk_update_request().
- *
- * Return:
- * %false - this request doesn't have any more data
- * %true - this request has more data
- **/
-bool blk_update_request(struct request *req, blk_status_t error,
- unsigned int nr_bytes)
-{
- int total_bytes;
-
- trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes);
-
- if (!req->bio)
- return false;
-
-#ifdef CONFIG_BLK_DEV_INTEGRITY
- if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
- error == BLK_STS_OK)
- req->q->integrity.profile->complete_fn(req, nr_bytes);
-#endif
-
- if (unlikely(error && !blk_rq_is_passthrough(req) &&
- !(req->rq_flags & RQF_QUIET)))
- print_req_error(req, error, __func__);
-
- blk_account_io_completion(req, nr_bytes);
-
- total_bytes = 0;
- while (req->bio) {
- struct bio *bio = req->bio;
- unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
-
- if (bio_bytes == bio->bi_iter.bi_size)
- req->bio = bio->bi_next;
-
- /* Completion has already been traced */
- bio_clear_flag(bio, BIO_TRACE_COMPLETION);
- req_bio_endio(req, bio, bio_bytes, error);
-
- total_bytes += bio_bytes;
- nr_bytes -= bio_bytes;
-
- if (!nr_bytes)
- break;
- }
-
- /*
- * completely done
- */
- if (!req->bio) {
- /*
- * Reset counters so that the request stacking driver
- * can find how many bytes remain in the request
- * later.
- */
- req->__data_len = 0;
- return false;
- }
-
- req->__data_len -= total_bytes;
-
- /* update sector only for requests with clear definition of sector */
- if (!blk_rq_is_passthrough(req))
- req->__sector += total_bytes >> 9;
-
- /* mixed attributes always follow the first bio */
- if (req->rq_flags & RQF_MIXED_MERGE) {
- req->cmd_flags &= ~REQ_FAILFAST_MASK;
- req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
- }
-
- if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
- /*
- * If total number of sectors is less than the first segment
- * size, something has gone terribly wrong.
- */
- if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
- blk_dump_rq_flags(req, "request botched");
- req->__data_len = blk_rq_cur_bytes(req);
- }
-
- /* recalculate the number of segments */
- req->nr_phys_segments = blk_recalc_rq_segments(req);
- }
-
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_update_request);
+EXPORT_SYMBOL(bdev_end_io_acct);
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE
-/**
- * rq_flush_dcache_pages - Helper function to flush all pages in a request
- * @rq: the request to be flushed
- *
- * Description:
- * Flush all pages in @rq.
- */
-void rq_flush_dcache_pages(struct request *rq)
+void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time,
+ struct block_device *orig_bdev)
{
- struct req_iterator iter;
- struct bio_vec bvec;
-
- rq_for_each_segment(bvec, rq, iter)
- flush_dcache_page(bvec.bv_page);
+ bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time);
}
-EXPORT_SYMBOL_GPL(rq_flush_dcache_pages);
-#endif
+EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped);
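
To show how the accounting helpers above fit together, here is a hedged sketch (not part of this diff) of a bio-based driver recording in-flight state at submission and finishing the accounting from its completion path; struct my_io and the my_io_* helpers are invented. The sector count is captured up front because the bio's iterator is typically consumed by the time completion runs; bio_start_io_acct(bio) is just the common-case shorthand for the bdev_start_io_acct() call used here.

#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/jiffies.h>
#include <linux/slab.h>

struct my_io {
	struct block_device	*part;
	unsigned int		sectors;
	unsigned long		start_time;
	enum req_op		op;
};

/* called when the driver starts servicing @bio */
static struct my_io *my_io_start(struct bio *bio)
{
	struct my_io *io = kmalloc(sizeof(*io), GFP_NOIO);

	if (!io)
		return NULL;
	io->part = bio->bi_bdev;
	io->op = bio_op(bio);
	/* capture the size now: the bio's iterator is consumed by completion time */
	io->sectors = bio_sectors(bio);
	io->start_time = bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies);
	return io;
}

/* called from the driver's completion path */
static void my_io_done(struct my_io *io)
{
	bdev_end_io_acct(io->part, io->op, io->sectors, io->start_time);
	kfree(io);
}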
/**
* blk_lld_busy - Check if underlying low-level drivers of a device are busy
@@ -1665,91 +1060,6 @@ int blk_lld_busy(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_lld_busy);
-/**
- * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
- * @rq: the clone request to be cleaned up
- *
- * Description:
- * Free all bios in @rq for a cloned request.
- */
-void blk_rq_unprep_clone(struct request *rq)
-{
- struct bio *bio;
-
- while ((bio = rq->bio) != NULL) {
- rq->bio = bio->bi_next;
-
- bio_put(bio);
- }
-}
-EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
-
-/**
- * blk_rq_prep_clone - Helper function to setup clone request
- * @rq: the request to be setup
- * @rq_src: original request to be cloned
- * @bs: bio_set that bios for clone are allocated from
- * @gfp_mask: memory allocation mask for bio
- * @bio_ctr: setup function to be called for each clone bio.
- * Returns %0 for success, non %0 for failure.
- * @data: private data to be passed to @bio_ctr
- *
- * Description:
- * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
- * Also, pages which the original bios are pointing to are not copied
- * and the cloned bios just point same pages.
- * So cloned bios must be completed before original bios, which means
- * the caller must complete @rq before @rq_src.
- */
-int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
- struct bio_set *bs, gfp_t gfp_mask,
- int (*bio_ctr)(struct bio *, struct bio *, void *),
- void *data)
-{
- struct bio *bio, *bio_src;
-
- if (!bs)
- bs = &fs_bio_set;
-
- __rq_for_each_bio(bio_src, rq_src) {
- bio = bio_clone_fast(bio_src, gfp_mask, bs);
- if (!bio)
- goto free_and_out;
-
- if (bio_ctr && bio_ctr(bio, bio_src, data))
- goto free_and_out;
-
- if (rq->bio) {
- rq->biotail->bi_next = bio;
- rq->biotail = bio;
- } else
- rq->bio = rq->biotail = bio;
- }
-
- /* Copy attributes of the original request to the clone request. */
- rq->__sector = blk_rq_pos(rq_src);
- rq->__data_len = blk_rq_bytes(rq_src);
- if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
- rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
- rq->special_vec = rq_src->special_vec;
- }
- rq->nr_phys_segments = rq_src->nr_phys_segments;
- rq->ioprio = rq_src->ioprio;
-
- if (rq->bio)
- blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask);
-
- return 0;
-
-free_and_out:
- if (bio)
- bio_put(bio);
- blk_rq_unprep_clone(rq);
-
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
-
int kblockd_schedule_work(struct work_struct *work)
{
return queue_work(kblockd_workqueue, work);
@@ -1763,6 +1073,31 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork,
}
EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
+void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios)
+{
+ struct task_struct *tsk = current;
+
+ /*
+ * If this is a nested plug, don't actually assign it.
+ */
+ if (tsk->plug)
+ return;
+
+ plug->mq_list = NULL;
+ plug->cached_rq = NULL;
+ plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT);
+ plug->rq_count = 0;
+ plug->multiple_queues = false;
+ plug->has_elevator = false;
+ INIT_LIST_HEAD(&plug->cb_list);
+
+ /*
+ * Store ordering should not be needed here, since a potential
+ * preempt will imply a full memory barrier
+ */
+ tsk->plug = plug;
+}
+
/**
* blk_start_plug - initialize blk_plug and track it inside the task_struct
* @plug: The &struct blk_plug that needs to be initialized
@@ -1788,24 +1123,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on);
*/
void blk_start_plug(struct blk_plug *plug)
{
- struct task_struct *tsk = current;
-
- /*
- * If this is a nested plug, don't actually assign it.
- */
- if (tsk->plug)
- return;
-
- INIT_LIST_HEAD(&plug->mq_list);
- INIT_LIST_HEAD(&plug->cb_list);
- plug->rq_count = 0;
- plug->multiple_queues = false;
-
- /*
- * Store ordering should not be needed here, since a potential
- * preempt will imply a full memory barrier
- */
- tsk->plug = plug;
+ blk_start_plug_nr_ios(plug, 1);
}
EXPORT_SYMBOL(blk_start_plug);
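
A short, hypothetical usage sketch (not part of this diff): batching several submissions under one on-stack plug so they can be merged and dispatched together when the plug is finished. If the task sleeps while the plug is active, the scheduler flushes it on the task's behalf.

#include <linux/bio.h>
#include <linux/blkdev.h>

/* my_submit_batch() is an invented name; bios[] is assumed to be fully set up */
static void my_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);
	blk_finish_plug(&plug);		/* dispatches everything that was plugged */
}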
@@ -1851,12 +1169,19 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data,
}
EXPORT_SYMBOL(blk_check_plugged);
-void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+void __blk_flush_plug(struct blk_plug *plug, bool from_schedule)
{
- flush_plug_callbacks(plug, from_schedule);
-
- if (!list_empty(&plug->mq_list))
- blk_mq_flush_plug_list(plug, from_schedule);
+ if (!list_empty(&plug->cb_list))
+ flush_plug_callbacks(plug, from_schedule);
+ blk_mq_flush_plug_list(plug, from_schedule);
+ /*
+ * Unconditionally flush out cached requests, even if the unplug
+	 * event came from schedule. Since we now hold references to the
+ * queue for cached requests, we don't want a blocked task holding
+ * up a queue freeze/quiesce event.
+ */
+ if (unlikely(!rq_list_empty(plug->cached_rq)))
+ blk_mq_free_plug_rqs(plug);
}
/**
@@ -1871,11 +1196,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule)
*/
void blk_finish_plug(struct blk_plug *plug)
{
- if (plug != current->plug)
- return;
- blk_flush_plug_list(plug, false);
-
- current->plug = NULL;
+ if (plug == current->plug) {
+ __blk_flush_plug(plug, false);
+ current->plug = NULL;
+ }
}
EXPORT_SYMBOL(blk_finish_plug);
@@ -1893,7 +1217,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule);
int __init blk_dev_init(void)
{
- BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS));
+ BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
sizeof_field(struct request, cmd_flags));
BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 *
@@ -1908,9 +1232,7 @@ int __init blk_dev_init(void)
blk_requestq_cachep = kmem_cache_create("request_queue",
sizeof(struct request_queue), 0, SLAB_PANIC, NULL);
-#ifdef CONFIG_DEBUG_FS
blk_debugfs_root = debugfs_create_dir("block", NULL);
-#endif
return 0;
}
diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c
index 6e49688a2d80..e6468eab2681 100644
--- a/block/blk-crypto-fallback.c
+++ b/block/blk-crypto-fallback.c
@@ -10,15 +10,16 @@
#define pr_fmt(fmt) "blk-crypto-fallback: " fmt
#include <crypto/skcipher.h>
-#include <linux/blk-cgroup.h>
#include <linux/blk-crypto.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>
#include <linux/crypto.h>
-#include <linux/keyslot-manager.h>
#include <linux/mempool.h>
#include <linux/module.h>
#include <linux/random.h>
+#include <linux/scatterlist.h>
+#include "blk-cgroup.h"
#include "blk-crypto-internal.h"
static unsigned int num_prealloc_bounce_pg = 32;
@@ -72,14 +73,15 @@ static mempool_t *bio_fallback_crypt_ctx_pool;
static DEFINE_MUTEX(tfms_init_lock);
static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX];
-static struct blk_crypto_keyslot {
+static struct blk_crypto_fallback_keyslot {
enum blk_crypto_mode_num crypto_mode;
struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX];
} *blk_crypto_keyslots;
-static struct blk_keyslot_manager blk_crypto_ksm;
+static struct blk_crypto_profile *blk_crypto_fallback_profile;
static struct workqueue_struct *blk_crypto_wq;
static mempool_t *blk_crypto_bounce_page_pool;
+static struct bio_set crypto_bio_split;
/*
* This is the key we set when evicting a keyslot. This *should* be the all 0's
@@ -87,9 +89,9 @@ static mempool_t *blk_crypto_bounce_page_pool;
*/
static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE];
-static void blk_crypto_evict_keyslot(unsigned int slot)
+static void blk_crypto_fallback_evict_keyslot(unsigned int slot)
{
- struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode;
int err;
@@ -102,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot)
slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID;
}
-static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot)
+static int
+blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
{
- struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot];
+ struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot];
const enum blk_crypto_mode_num crypto_mode =
key->crypto_cfg.crypto_mode;
int err;
if (crypto_mode != slotp->crypto_mode &&
slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID)
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
slotp->crypto_mode = crypto_mode;
err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw,
key->size);
if (err) {
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
return err;
}
return 0;
}
-static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- unsigned int slot)
+static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ unsigned int slot)
{
- blk_crypto_evict_keyslot(slot);
+ blk_crypto_fallback_evict_keyslot(slot);
return 0;
}
-/*
- * The crypto API fallback KSM ops - only used for a bio when it specifies a
- * blk_crypto_key that was not supported by the device's inline encryption
- * hardware.
- */
-static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = {
- .keyslot_program = blk_crypto_keyslot_program,
- .keyslot_evict = blk_crypto_keyslot_evict,
+static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = {
+ .keyslot_program = blk_crypto_fallback_keyslot_program,
+ .keyslot_evict = blk_crypto_fallback_keyslot_evict,
};
static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
@@ -154,23 +152,26 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio)
src_bio->bi_status = enc_bio->bi_status;
- bio_put(enc_bio);
+ bio_uninit(enc_bio);
+ kfree(enc_bio);
bio_endio(src_bio);
}
-static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
+static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src)
{
+ unsigned int nr_segs = bio_segments(bio_src);
struct bvec_iter iter;
struct bio_vec bv;
struct bio *bio;
- bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL);
+ bio = bio_kmalloc(nr_segs, GFP_NOIO);
if (!bio)
return NULL;
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_opf = bio_src->bi_opf;
+ bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs,
+ bio_src->bi_opf);
+ if (bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -178,18 +179,18 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
bio_clone_blkg_association(bio, bio_src);
- blkcg_bio_issue_init(bio);
return bio;
}
-static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
- struct skcipher_request **ciph_req_ret,
- struct crypto_wait *wait)
+static bool
+blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot,
+ struct skcipher_request **ciph_req_ret,
+ struct crypto_wait *wait)
{
struct skcipher_request *ciph_req;
- const struct blk_crypto_keyslot *slotp;
- int keyslot_idx = blk_ksm_get_slot_idx(slot);
+ const struct blk_crypto_fallback_keyslot *slotp;
+ int keyslot_idx = blk_crypto_keyslot_index(slot);
slotp = &blk_crypto_keyslots[keyslot_idx];
ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode],
@@ -206,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot,
return true;
}
-static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
+static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr)
{
struct bio *bio = *bio_ptr;
unsigned int i = 0;
@@ -216,19 +217,20 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr)
bio_for_each_segment(bv, bio, iter) {
num_sectors += bv.bv_len >> SECTOR_SHIFT;
- if (++i == BIO_MAX_PAGES)
+ if (++i == BIO_MAX_VECS)
break;
}
if (num_sectors < bio_sectors(bio)) {
struct bio *split_bio;
- split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL);
+ split_bio = bio_split(bio, num_sectors, GFP_NOIO,
+ &crypto_bio_split);
if (!split_bio) {
bio->bi_status = BLK_STS_RESOURCE;
return false;
}
bio_chain(split_bio, bio);
- generic_make_request(bio);
+ submit_bio_noacct(bio);
*bio_ptr = split_bio;
}
@@ -260,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
{
struct bio *src_bio, *enc_bio;
struct bio_crypt_ctx *bc;
- struct blk_ksm_keyslot *slot;
+ struct blk_crypto_keyslot *slot;
int data_unit_size;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
@@ -272,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
blk_status_t blk_st;
/* Split the bio if it's too big for single page bvec */
- if (!blk_crypto_split_bio_if_needed(bio_ptr))
+ if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr))
return false;
src_bio = *bio_ptr;
@@ -280,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr)
data_unit_size = bc->bc_key->crypto_cfg.data_unit_size;
/* Allocate bounce bio for encryption */
- enc_bio = blk_crypto_clone_bio(src_bio);
+ enc_bio = blk_crypto_fallback_clone_bio(src_bio);
if (!enc_bio) {
src_bio->bi_status = BLK_STS_RESOURCE;
return false;
}
/*
- * Use the crypto API fallback keyslot manager to get a crypto_skcipher
- * for the algorithm and key specified for this bio.
+ * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+ * this bio's algorithm and key.
*/
- blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
+ bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
src_bio->bi_status = blk_st;
goto out_put_enc_bio;
}
/* and then allocate an skcipher_request for it */
- if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
src_bio->bi_status = BLK_STS_RESOURCE;
goto out_release_keyslot;
}
@@ -358,11 +361,11 @@ out_free_bounce_pages:
out_free_ciph_req:
skcipher_request_free(ciph_req);
out_release_keyslot:
- blk_ksm_put_slot(slot);
+ blk_crypto_put_keyslot(slot);
out_put_enc_bio:
if (enc_bio)
- bio_put(enc_bio);
-
+ bio_uninit(enc_bio);
+ kfree(enc_bio);
return ret;
}
@@ -376,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
container_of(work, struct bio_fallback_crypt_ctx, work);
struct bio *bio = f_ctx->bio;
struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx;
- struct blk_ksm_keyslot *slot;
+ struct blk_crypto_keyslot *slot;
struct skcipher_request *ciph_req = NULL;
DECLARE_CRYPTO_WAIT(wait);
u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE];
@@ -389,17 +392,18 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
blk_status_t blk_st;
/*
- * Use the crypto API fallback keyslot manager to get a crypto_skcipher
- * for the algorithm and key specified for this bio.
+ * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for
+ * this bio's algorithm and key.
*/
- blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot);
+ blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile,
+ bc->bc_key, &slot);
if (blk_st != BLK_STS_OK) {
bio->bi_status = blk_st;
goto out_no_keyslot;
}
/* and then allocate an skcipher_request for it */
- if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) {
+ if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) {
bio->bi_status = BLK_STS_RESOURCE;
goto out;
}
@@ -430,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work)
out:
skcipher_request_free(ciph_req);
- blk_ksm_put_slot(slot);
+ blk_crypto_put_keyslot(slot);
out_no_keyslot:
mempool_free(f_ctx, bio_fallback_crypt_ctx_pool);
bio_endio(bio);
@@ -469,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio)
* @bio_ptr: pointer to the bio to prepare
*
* If bio is doing a WRITE operation, this splits the bio into two parts if it's
- * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio
- * for the first part, encrypts it, and update bio_ptr to point to the bounce
- * bio.
+ * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a
+ * bounce bio for the first part, encrypts it, and updates bio_ptr to point to
+ * the bounce bio.
*
* For a READ operation, we mark the bio for decryption by using bi_private and
* bi_end_io.
@@ -495,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
return false;
}
- if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm,
- &bc->bc_key->crypto_cfg)) {
+ if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile,
+ &bc->bc_key->crypto_cfg)) {
bio->bi_status = BLK_STS_NOTSUPP;
return false;
}
@@ -522,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr)
int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key)
{
- return blk_ksm_evict_key(&blk_crypto_ksm, key);
+ return __blk_crypto_evict_key(blk_crypto_fallback_profile, key);
}
static bool blk_crypto_fallback_inited;
@@ -534,26 +538,39 @@ static int blk_crypto_fallback_init(void)
if (blk_crypto_fallback_inited)
return 0;
- prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
+ get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE);
- err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots);
+ err = bioset_init(&crypto_bio_split, 64, 0, 0);
if (err)
goto out;
+
+ /* Dynamic allocation is needed because of lockdep_register_key(). */
+ blk_crypto_fallback_profile =
+ kzalloc(sizeof(*blk_crypto_fallback_profile), GFP_KERNEL);
+ if (!blk_crypto_fallback_profile) {
+ err = -ENOMEM;
+ goto fail_free_bioset;
+ }
+
+ err = blk_crypto_profile_init(blk_crypto_fallback_profile,
+ blk_crypto_num_keyslots);
+ if (err)
+ goto fail_free_profile;
err = -ENOMEM;
- blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops;
- blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
+ blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops;
+ blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE;
/* All blk-crypto modes have a crypto API fallback. */
for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++)
- blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF;
- blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
+ blk_crypto_fallback_profile->modes_supported[i] = 0xFFFFFFFF;
+ blk_crypto_fallback_profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0;
blk_crypto_wq = alloc_workqueue("blk_crypto_wq",
WQ_UNBOUND | WQ_HIGHPRI |
WQ_MEM_RECLAIM, num_online_cpus());
if (!blk_crypto_wq)
- goto fail_free_ksm;
+ goto fail_destroy_profile;
blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots,
sizeof(blk_crypto_keyslots[0]),
@@ -587,8 +604,12 @@ fail_free_keyslots:
kfree(blk_crypto_keyslots);
fail_free_wq:
destroy_workqueue(blk_crypto_wq);
-fail_free_ksm:
- blk_ksm_destroy(&blk_crypto_ksm);
+fail_destroy_profile:
+ blk_crypto_profile_destroy(blk_crypto_fallback_profile);
+fail_free_profile:
+ kfree(blk_crypto_fallback_profile);
+fail_free_bioset:
+ bioset_exit(&crypto_bio_split);
out:
return err;
}
@@ -600,7 +621,7 @@ out:
int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num)
{
const char *cipher_str = blk_crypto_modes[mode_num].cipher_str;
- struct blk_crypto_keyslot *slotp;
+ struct blk_crypto_fallback_keyslot *slotp;
unsigned int i;
int err = 0;
diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h
index d2b0f565d83c..93a141979694 100644
--- a/block/blk-crypto-internal.h
+++ b/block/blk-crypto-internal.h
@@ -7,10 +7,11 @@
#define __LINUX_BLK_CRYPTO_INTERNAL_H
#include <linux/bio.h>
-#include <linux/blkdev.h>
+#include <linux/blk-mq.h>
/* Represents a crypto mode supported by blk-crypto */
struct blk_crypto_mode {
+ const char *name; /* name of this mode, shown in sysfs */
const char *cipher_str; /* crypto API name (for fallback case) */
unsigned int keysize; /* key size in bytes */
unsigned int ivsize; /* iv size in bytes */
@@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[];
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
+int blk_crypto_sysfs_register(struct gendisk *disk);
+
+void blk_crypto_sysfs_unregister(struct gendisk *disk);
+
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
unsigned int inc);
@@ -60,8 +65,34 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return rq->crypt_ctx;
}
+static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
+{
+ return rq->crypt_keyslot;
+}
+
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr);
+
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot);
+
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key);
+
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg);
+
#else /* CONFIG_BLK_INLINE_ENCRYPTION */
+static inline int blk_crypto_sysfs_register(struct gendisk *disk)
+{
+ return 0;
+}
+
+static inline void blk_crypto_sysfs_unregister(struct gendisk *disk)
+{
+}
+
static inline bool bio_crypt_rq_ctx_compatible(struct request *rq,
struct bio *bio)
{
@@ -93,6 +124,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq)
return false;
}
+static inline bool blk_crypto_rq_has_keyslot(struct request *rq)
+{
+ return false;
+}
+
#endif /* CONFIG_BLK_INLINE_ENCRYPTION */
void __bio_crypt_advance(struct bio *bio, unsigned int bytes);
@@ -127,14 +163,21 @@ static inline bool blk_crypto_bio_prep(struct bio **bio_ptr)
return true;
}
-blk_status_t __blk_crypto_init_request(struct request *rq);
-static inline blk_status_t blk_crypto_init_request(struct request *rq)
+blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq);
+static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq)
{
if (blk_crypto_rq_is_encrypted(rq))
- return __blk_crypto_init_request(rq);
+ return __blk_crypto_rq_get_keyslot(rq);
return BLK_STS_OK;
}
+void __blk_crypto_rq_put_keyslot(struct request *rq);
+static inline void blk_crypto_rq_put_keyslot(struct request *rq)
+{
+ if (blk_crypto_rq_has_keyslot(rq))
+ __blk_crypto_rq_put_keyslot(rq);
+}
+
void __blk_crypto_free_request(struct request *rq);
static inline void blk_crypto_free_request(struct request *rq)
{
@@ -142,28 +185,24 @@ static inline void blk_crypto_free_request(struct request *rq)
__blk_crypto_free_request(rq);
}
-void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
- gfp_t gfp_mask);
-static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
- gfp_t gfp_mask)
-{
- if (bio_has_crypt_ctx(bio))
- __blk_crypto_rq_bio_prep(rq, bio, gfp_mask);
-}
-
+int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask);
/**
- * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted
- * into a request queue.
- * @rq: the request being queued
+ * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio
+ * is inserted
+ * @rq: The request to prepare
+ * @bio: The first bio being inserted into the request
+ * @gfp_mask: Memory allocation flags
*
- * Return: BLK_STS_OK on success, nonzero on error.
+ * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if
+ * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM.
*/
-static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq)
+static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
{
-
- if (blk_crypto_rq_is_encrypted(rq))
- return blk_crypto_init_request(rq);
- return BLK_STS_OK;
+ if (bio_has_crypt_ctx(bio))
+ return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask);
+ return 0;
}
#ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK
diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c
new file mode 100644
index 000000000000..7fabc883e39f
--- /dev/null
+++ b/block/blk-crypto-profile.c
@@ -0,0 +1,559 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2019 Google LLC
+ */
+
+/**
+ * DOC: blk-crypto profiles
+ *
+ * 'struct blk_crypto_profile' contains all generic inline encryption-related
+ * state for a particular inline encryption device. blk_crypto_profile serves
+ * as the way that drivers for inline encryption hardware expose their crypto
+ * capabilities and certain functions (e.g., functions to program and evict
+ * keys) to upper layers. Device drivers that want to support inline encryption
+ * construct a crypto profile, then associate it with the disk's request_queue.
+ *
+ * If the device has keyslots, then its blk_crypto_profile also handles managing
+ * these keyslots in a device-independent way, using the driver-provided
+ * functions to program and evict keys as needed. This includes keeping track
+ * of which key and how many I/O requests are using each keyslot, getting
+ * keyslots for I/O requests, and handling key eviction requests.
+ *
+ * For more information, see Documentation/block/inline-encryption.rst.
+ */
+
+#define pr_fmt(fmt) "blk-crypto: " fmt
+
+#include <linux/blk-crypto-profile.h>
+#include <linux/device.h>
+#include <linux/atomic.h>
+#include <linux/mutex.h>
+#include <linux/pm_runtime.h>
+#include <linux/wait.h>
+#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_keyslot {
+ atomic_t slot_refs;
+ struct list_head idle_slot_node;
+ struct hlist_node hash_node;
+ const struct blk_crypto_key *key;
+ struct blk_crypto_profile *profile;
+};
+
+static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile)
+{
+ /*
+ * Calling into the driver requires profile->lock held and the device
+ * resumed. But we must resume the device first, since that can acquire
+ * and release profile->lock via blk_crypto_reprogram_all_keys().
+ */
+ if (profile->dev)
+ pm_runtime_get_sync(profile->dev);
+ down_write(&profile->lock);
+}
+
+static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile)
+{
+ up_write(&profile->lock);
+ if (profile->dev)
+ pm_runtime_put_sync(profile->dev);
+}
+
+/**
+ * blk_crypto_profile_init() - Initialize a blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Storage drivers must call this when starting to set up a blk_crypto_profile,
+ * before filling in additional fields.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int blk_crypto_profile_init(struct blk_crypto_profile *profile,
+ unsigned int num_slots)
+{
+ unsigned int slot;
+ unsigned int i;
+ unsigned int slot_hashtable_size;
+
+ memset(profile, 0, sizeof(*profile));
+
+ /*
+ * profile->lock of an underlying device can nest inside profile->lock
+ * of a device-mapper device, so use a dynamic lock class to avoid
+ * false-positive lockdep reports.
+ */
+ lockdep_register_key(&profile->lockdep_key);
+ __init_rwsem(&profile->lock, "&profile->lock", &profile->lockdep_key);
+
+ if (num_slots == 0)
+ return 0;
+
+ /* Initialize keyslot management data. */
+
+ profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]),
+ GFP_KERNEL);
+ if (!profile->slots)
+ goto err_destroy;
+
+ profile->num_slots = num_slots;
+
+ init_waitqueue_head(&profile->idle_slots_wait_queue);
+ INIT_LIST_HEAD(&profile->idle_slots);
+
+ for (slot = 0; slot < num_slots; slot++) {
+ profile->slots[slot].profile = profile;
+ list_add_tail(&profile->slots[slot].idle_slot_node,
+ &profile->idle_slots);
+ }
+
+ spin_lock_init(&profile->idle_slots_lock);
+
+ slot_hashtable_size = roundup_pow_of_two(num_slots);
+ /*
+ * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2
+ * buckets. This only makes a difference when there is only 1 keyslot.
+ */
+ if (slot_hashtable_size < 2)
+ slot_hashtable_size = 2;
+
+ profile->log_slot_ht_size = ilog2(slot_hashtable_size);
+ profile->slot_hashtable =
+ kvmalloc_array(slot_hashtable_size,
+ sizeof(profile->slot_hashtable[0]), GFP_KERNEL);
+ if (!profile->slot_hashtable)
+ goto err_destroy;
+ for (i = 0; i < slot_hashtable_size; i++)
+ INIT_HLIST_HEAD(&profile->slot_hashtable[i]);
+
+ return 0;
+
+err_destroy:
+ blk_crypto_profile_destroy(profile);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_init);
+
+static void blk_crypto_profile_destroy_callback(void *profile)
+{
+ blk_crypto_profile_destroy(profile);
+}
+
+/**
+ * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init()
+ * @dev: the device which owns the blk_crypto_profile
+ * @profile: the blk_crypto_profile to initialize
+ * @num_slots: the number of keyslots
+ *
+ * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be
+ * called automatically on driver detach.
+ *
+ * Return: 0 on success, or else a negative error code.
+ */
+int devm_blk_crypto_profile_init(struct device *dev,
+ struct blk_crypto_profile *profile,
+ unsigned int num_slots)
+{
+ int err = blk_crypto_profile_init(profile, num_slots);
+
+ if (err)
+ return err;
+
+ return devm_add_action_or_reset(dev,
+ blk_crypto_profile_destroy_callback,
+ profile);
+}
+EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init);
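
A hedged sketch (not part of this diff) of how a driver with inline-encryption hardware might use the two initializers above; every my_* name, MY_NR_KEYSLOTS, and the hardware calls are hypothetical. The finished profile is handed to the block layer with blk_crypto_register(), which appears further down in this file.

#include <linux/blk-crypto-profile.h>
#include <linux/blkdev.h>

#define MY_NR_KEYSLOTS	32	/* hypothetical hardware limit */

struct my_dev {
	struct device			*dev;
	struct blk_crypto_profile	crypto_profile;
};

/* hypothetical hardware interface */
int my_hw_program_key(struct device *dev, unsigned int slot,
		      const u8 *raw, unsigned int size);
int my_hw_evict_key(struct device *dev, unsigned int slot);

static int my_keyslot_program(struct blk_crypto_profile *profile,
			      const struct blk_crypto_key *key,
			      unsigned int slot)
{
	return my_hw_program_key(profile->dev, slot, key->raw, key->size);
}

static int my_keyslot_evict(struct blk_crypto_profile *profile,
			    const struct blk_crypto_key *key,
			    unsigned int slot)
{
	return my_hw_evict_key(profile->dev, slot);
}

static const struct blk_crypto_ll_ops my_crypto_ll_ops = {
	.keyslot_program	= my_keyslot_program,
	.keyslot_evict		= my_keyslot_evict,
};

static int my_init_crypto(struct my_dev *md, struct request_queue *q)
{
	struct blk_crypto_profile *profile = &md->crypto_profile;
	int err;

	err = devm_blk_crypto_profile_init(md->dev, profile, MY_NR_KEYSLOTS);
	if (err)
		return err;

	profile->ll_ops = my_crypto_ll_ops;
	profile->dev = md->dev;
	profile->max_dun_bytes_supported = 8;
	/* advertise AES-256-XTS with a 4096-byte data unit size */
	profile->modes_supported[BLK_ENCRYPTION_MODE_AES_256_XTS] = 4096;

	if (!blk_crypto_register(profile, q))
		return -EINVAL;		/* e.g. an integrity profile is already active */
	return 0;
}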
+
+static inline struct hlist_head *
+blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ return &profile->slot_hashtable[
+ hash_ptr(key, profile->log_slot_ht_size)];
+}
+
+static void
+blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot)
+{
+ struct blk_crypto_profile *profile = slot->profile;
+ unsigned long flags;
+
+ spin_lock_irqsave(&profile->idle_slots_lock, flags);
+ list_del(&slot->idle_slot_node);
+ spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ const struct hlist_head *head =
+ blk_crypto_hash_bucket_for_key(profile, key);
+ struct blk_crypto_keyslot *slotp;
+
+ hlist_for_each_entry(slotp, head, hash_node) {
+ if (slotp->key == key)
+ return slotp;
+ }
+ return NULL;
+}
+
+static struct blk_crypto_keyslot *
+blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ struct blk_crypto_keyslot *slot;
+
+ slot = blk_crypto_find_keyslot(profile, key);
+ if (!slot)
+ return NULL;
+ if (atomic_inc_return(&slot->slot_refs) == 1) {
+ /* Took first reference to this slot; remove it from LRU list */
+ blk_crypto_remove_slot_from_lru_list(slot);
+ }
+ return slot;
+}
+
+/**
+ * blk_crypto_keyslot_index() - Get the index of a keyslot
+ * @slot: a keyslot that blk_crypto_get_keyslot() returned
+ *
+ * Return: the 0-based index of the keyslot within the device's keyslots.
+ */
+unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot)
+{
+ return slot - slot->profile->slots;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index);
+
+/**
+ * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed.
+ * @profile: the crypto profile of the device the key will be used on
+ * @key: the key that will be used
+ * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct
+ * will be stored here. blk_crypto_put_keyslot() must be called
+ * later to release it. Otherwise, NULL will be stored here.
+ *
+ * If the device has keyslots, this gets a keyslot that's been programmed with
+ * the specified key. If the key is already in a slot, this reuses it;
+ * otherwise this waits for a slot to become idle and programs the key into it.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or
+ * one wasn't needed; or a blk_status_t error on failure.
+ */
+blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key,
+ struct blk_crypto_keyslot **slot_ptr)
+{
+ struct blk_crypto_keyslot *slot;
+ int slot_idx;
+ int err;
+
+ *slot_ptr = NULL;
+
+ /*
+ * If the device has no concept of "keyslots", then there is no need to
+ * get one.
+ */
+ if (profile->num_slots == 0)
+ return BLK_STS_OK;
+
+ down_read(&profile->lock);
+ slot = blk_crypto_find_and_grab_keyslot(profile, key);
+ up_read(&profile->lock);
+ if (slot)
+ goto success;
+
+ for (;;) {
+ blk_crypto_hw_enter(profile);
+ slot = blk_crypto_find_and_grab_keyslot(profile, key);
+ if (slot) {
+ blk_crypto_hw_exit(profile);
+ goto success;
+ }
+
+ /*
+ * If we're here, that means there wasn't a slot that was
+ * already programmed with the key. So try to program it.
+ */
+ if (!list_empty(&profile->idle_slots))
+ break;
+
+ blk_crypto_hw_exit(profile);
+ wait_event(profile->idle_slots_wait_queue,
+ !list_empty(&profile->idle_slots));
+ }
+
+ slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot,
+ idle_slot_node);
+ slot_idx = blk_crypto_keyslot_index(slot);
+
+ err = profile->ll_ops.keyslot_program(profile, key, slot_idx);
+ if (err) {
+ wake_up(&profile->idle_slots_wait_queue);
+ blk_crypto_hw_exit(profile);
+ return errno_to_blk_status(err);
+ }
+
+ /* Move this slot to the hash list for the new key. */
+ if (slot->key)
+ hlist_del(&slot->hash_node);
+ slot->key = key;
+ hlist_add_head(&slot->hash_node,
+ blk_crypto_hash_bucket_for_key(profile, key));
+
+ atomic_set(&slot->slot_refs, 1);
+
+ blk_crypto_remove_slot_from_lru_list(slot);
+
+ blk_crypto_hw_exit(profile);
+success:
+ *slot_ptr = slot;
+ return BLK_STS_OK;
+}
+
+/**
+ * blk_crypto_put_keyslot() - Release a reference to a keyslot
+ * @slot: The keyslot to release the reference of
+ *
+ * Context: Any context.
+ */
+void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot)
+{
+ struct blk_crypto_profile *profile = slot->profile;
+ unsigned long flags;
+
+ if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
+ &profile->idle_slots_lock, flags)) {
+ list_add_tail(&slot->idle_slot_node, &profile->idle_slots);
+ spin_unlock_irqrestore(&profile->idle_slots_lock, flags);
+ wake_up(&profile->idle_slots_wait_queue);
+ }
+}
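
For completeness, a hedged sketch (not part of this diff) of the get/use/put pattern these two helpers implement; my_program_and_do_io() and my_do_encrypted_io() are invented names.

static void my_do_encrypted_io(int keyslot_idx);	/* hypothetical */

static blk_status_t my_program_and_do_io(struct blk_crypto_profile *profile,
					 const struct blk_crypto_key *key)
{
	struct blk_crypto_keyslot *slot;
	blk_status_t ret;

	ret = blk_crypto_get_keyslot(profile, key, &slot);
	if (ret != BLK_STS_OK)
		return ret;

	if (slot) {
		/* the key stays programmed in this slot while the reference is held */
		my_do_encrypted_io(blk_crypto_keyslot_index(slot));
		blk_crypto_put_keyslot(slot);
	} else {
		/* no keyslots on this device; the hardware takes the key directly */
		my_do_encrypted_io(-1);
	}
	return BLK_STS_OK;
}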
+
+/**
+ * __blk_crypto_cfg_supported() - Check whether the given crypto profile
+ * supports the given crypto configuration.
+ * @profile: the crypto profile to check
+ * @cfg: the crypto configuration to check for
+ *
+ * Return: %true if @profile supports the given @cfg.
+ */
+bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile,
+ const struct blk_crypto_config *cfg)
+{
+ if (!profile)
+ return false;
+ if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size))
+ return false;
+ if (profile->max_dun_bytes_supported < cfg->dun_bytes)
+ return false;
+ return true;
+}
+
+/*
+ * This is an internal function that evicts a key from an inline encryption
+ * device that can be either a real device or the blk-crypto-fallback "device".
+ * It is used only by blk_crypto_evict_key(); see that function for details.
+ */
+int __blk_crypto_evict_key(struct blk_crypto_profile *profile,
+ const struct blk_crypto_key *key)
+{
+ struct blk_crypto_keyslot *slot;
+ int err;
+
+ if (profile->num_slots == 0) {
+ if (profile->ll_ops.keyslot_evict) {
+ blk_crypto_hw_enter(profile);
+ err = profile->ll_ops.keyslot_evict(profile, key, -1);
+ blk_crypto_hw_exit(profile);
+ return err;
+ }
+ return 0;
+ }
+
+ blk_crypto_hw_enter(profile);
+ slot = blk_crypto_find_keyslot(profile, key);
+ if (!slot) {
+ /*
+ * Not an error, since a key not in use by I/O is not guaranteed
+ * to be in a keyslot. There can be more keys than keyslots.
+ */
+ err = 0;
+ goto out;
+ }
+
+ if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
+ /* BUG: key is still in use by I/O */
+ err = -EBUSY;
+ goto out_remove;
+ }
+ err = profile->ll_ops.keyslot_evict(profile, key,
+ blk_crypto_keyslot_index(slot));
+out_remove:
+ /*
+ * Callers free the key even on error, so unlink the key from the hash
+ * table and clear slot->key even on error.
+ */
+ hlist_del(&slot->hash_node);
+ slot->key = NULL;
+out:
+ blk_crypto_hw_exit(profile);
+ return err;
+}
+
+/**
+ * blk_crypto_reprogram_all_keys() - Re-program all keyslots.
+ * @profile: The crypto profile
+ *
+ * Re-program all keyslots that are supposed to have a key programmed. This is
+ * intended only for use by drivers for hardware that loses its keys on reset.
+ *
+ * Context: Process context. Takes and releases profile->lock.
+ */
+void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile)
+{
+ unsigned int slot;
+
+ if (profile->num_slots == 0)
+ return;
+
+ /* This is for device initialization, so don't resume the device */
+ down_write(&profile->lock);
+ for (slot = 0; slot < profile->num_slots; slot++) {
+ const struct blk_crypto_key *key = profile->slots[slot].key;
+ int err;
+
+ if (!key)
+ continue;
+
+ err = profile->ll_ops.keyslot_program(profile, key, slot);
+ WARN_ON(err);
+ }
+ up_write(&profile->lock);
+}
+EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys);
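
[Editor's note] A hedged driver-side sketch of when this would be called (driver name and state structure are hypothetical): hardware that drops its keyslots across a controller reset reloads them before restarting I/O.

struct example_host {				/* hypothetical driver state */
	struct blk_crypto_profile crypto_profile;
};

static void example_host_reset_done(struct example_host *host)
{
	/* The controller lost its keys on reset; reload every active slot. */
	blk_crypto_reprogram_all_keys(&host->crypto_profile);
}
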
+
+void blk_crypto_profile_destroy(struct blk_crypto_profile *profile)
+{
+ if (!profile)
+ return;
+ lockdep_unregister_key(&profile->lockdep_key);
+ kvfree(profile->slot_hashtable);
+ kvfree_sensitive(profile->slots,
+ sizeof(profile->slots[0]) * profile->num_slots);
+ memzero_explicit(profile, sizeof(*profile));
+}
+EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy);
+
+bool blk_crypto_register(struct blk_crypto_profile *profile,
+ struct request_queue *q)
+{
+ if (blk_integrity_queue_supports_integrity(q)) {
+ pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
+ return false;
+ }
+ q->crypto_profile = profile;
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_register);
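
[Editor's note] A minimal registration sketch, assuming the driver has already initialized the profile (e.g. with blk_crypto_profile_init()) and filled in its ll_ops and capability fields; the wrapper name is hypothetical.

static int example_register_inline_crypto(struct blk_crypto_profile *profile,
					  struct request_queue *q)
{
	if (!blk_crypto_register(profile, q))
		return -EINVAL;	/* e.g. integrity is enabled on this queue */
	return 0;
}
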
+
+/**
+ * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities
+ * by child device
+ * @parent: the crypto profile for the parent device
+ * @child: the crypto profile for the child device, or NULL
+ *
+ * This clears all crypto capabilities in @parent that aren't set in @child. If
+ * @child is NULL, then this clears all parent capabilities.
+ *
+ * Only use this when setting up the crypto profile for a layered device,
+ * before it has been exposed.
+ */
+void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent,
+ const struct blk_crypto_profile *child)
+{
+ if (child) {
+ unsigned int i;
+
+ parent->max_dun_bytes_supported =
+ min(parent->max_dun_bytes_supported,
+ child->max_dun_bytes_supported);
+ for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++)
+ parent->modes_supported[i] &= child->modes_supported[i];
+ } else {
+ parent->max_dun_bytes_supported = 0;
+ memset(parent->modes_supported, 0,
+ sizeof(parent->modes_supported));
+ }
+}
+EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities);
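
[Editor's note] A hedged sketch of the intended caller, a stacking driver computing its exposed capabilities as the intersection across all underlying devices (helper name hypothetical).

static void example_build_stacked_profile(struct blk_crypto_profile *parent,
					  struct block_device **children,
					  unsigned int nr_children)
{
	unsigned int i;

	for (i = 0; i < nr_children; i++)
		blk_crypto_intersect_capabilities(parent,
				bdev_get_queue(children[i])->crypto_profile);
}
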
+
+/**
+ * blk_crypto_has_capabilities() - Check whether @target supports at least all
+ * the crypto capabilities that @reference does.
+ * @target: the target profile
+ * @reference: the reference profile
+ *
+ * Return: %true if @target supports all the crypto capabilities of @reference.
+ */
+bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target,
+ const struct blk_crypto_profile *reference)
+{
+ int i;
+
+ if (!reference)
+ return true;
+
+ if (!target)
+ return false;
+
+ for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) {
+ if (reference->modes_supported[i] & ~target->modes_supported[i])
+ return false;
+ }
+
+ if (reference->max_dun_bytes_supported >
+ target->max_dun_bytes_supported)
+ return false;
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities);
+
+/**
+ * blk_crypto_update_capabilities() - Update the capabilities of a crypto
+ * profile to match those of another crypto
+ * profile.
+ * @dst: The crypto profile whose capabilities to update.
+ * @src: The crypto profile whose capabilities @dst's capabilities will be
+ *	 updated to match.
+ *
+ * Blk-crypto requires that crypto capabilities that were
+ * advertised when a bio was created continue to be supported by the
+ * device until that bio is ended. This in turn means that a device cannot
+ * shrink its advertised crypto capabilities without any explicit
+ * synchronization with upper layers. So if there's no such explicit
+ * synchronization, @src must support all the crypto capabilities that
+ * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)).
+ *
+ * Note also that as long as the crypto capabilities are being expanded, the
+ * order of updates becoming visible is not important because it's alright
+ * for blk-crypto to see stale values - they only cause blk-crypto to
+ * believe that a crypto capability isn't supported when it actually is (which
+ * might result in blk-crypto-fallback being used if available, or the bio being
+ * failed).
+ */
+void blk_crypto_update_capabilities(struct blk_crypto_profile *dst,
+ const struct blk_crypto_profile *src)
+{
+ memcpy(dst->modes_supported, src->modes_supported,
+ sizeof(dst->modes_supported));
+
+ dst->max_dun_bytes_supported = src->max_dun_bytes_supported;
+}
+EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities);
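
[Editor's note] Taken together, the two helpers above suggest the following pattern for a layered driver that reloads its configuration and wants to refresh its advertised capabilities without shrinking them under in-flight I/O. This is only a sketch of that pattern, with a hypothetical name.

static int example_refresh_capabilities(struct blk_crypto_profile *dst,
					const struct blk_crypto_profile *src)
{
	/* Refuse updates that would shrink what upper layers already rely on. */
	if (!blk_crypto_has_capabilities(src, dst))
		return -EINVAL;

	blk_crypto_update_capabilities(dst, src);
	return 0;
}
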
diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c
new file mode 100644
index 000000000000..a304434489ba
--- /dev/null
+++ b/block/blk-crypto-sysfs.c
@@ -0,0 +1,173 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Copyright 2021 Google LLC
+ *
+ * sysfs support for blk-crypto. This file contains the code which exports the
+ * crypto capabilities of devices via /sys/block/$disk/queue/crypto/.
+ */
+
+#include <linux/blk-crypto-profile.h>
+
+#include "blk-crypto-internal.h"
+
+struct blk_crypto_kobj {
+ struct kobject kobj;
+ struct blk_crypto_profile *profile;
+};
+
+struct blk_crypto_attr {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page);
+};
+
+static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj)
+{
+ return container_of(kobj, struct blk_crypto_kobj, kobj)->profile;
+}
+
+static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr)
+{
+ return container_of(attr, struct blk_crypto_attr, attr);
+}
+
+static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported);
+}
+
+static ssize_t num_keyslots_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ return sysfs_emit(page, "%u\n", profile->num_slots);
+}
+
+#define BLK_CRYPTO_RO_ATTR(_name) \
+ static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name)
+
+BLK_CRYPTO_RO_ATTR(max_dun_bits);
+BLK_CRYPTO_RO_ATTR(num_keyslots);
+
+static struct attribute *blk_crypto_attrs[] = {
+ &max_dun_bits_attr.attr,
+ &num_keyslots_attr.attr,
+ NULL,
+};
+
+static const struct attribute_group blk_crypto_attr_group = {
+ .attrs = blk_crypto_attrs,
+};
+
+/*
+ * The encryption mode attributes. To avoid hard-coding the list of encryption
+ * modes, these are initialized at boot time by blk_crypto_sysfs_init().
+ */
+static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX];
+static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1];
+
+static umode_t blk_crypto_mode_is_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+ int mode_num = a - __blk_crypto_mode_attrs;
+
+ if (profile->modes_supported[mode_num])
+ return 0444;
+ return 0;
+}
+
+static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile,
+ struct blk_crypto_attr *attr, char *page)
+{
+ int mode_num = attr - __blk_crypto_mode_attrs;
+
+ return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]);
+}
+
+static const struct attribute_group blk_crypto_modes_attr_group = {
+ .name = "modes",
+ .attrs = blk_crypto_mode_attrs,
+ .is_visible = blk_crypto_mode_is_visible,
+};
+
+static const struct attribute_group *blk_crypto_attr_groups[] = {
+ &blk_crypto_attr_group,
+ &blk_crypto_modes_attr_group,
+ NULL,
+};
+
+static ssize_t blk_crypto_attr_show(struct kobject *kobj,
+ struct attribute *attr, char *page)
+{
+ struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj);
+ struct blk_crypto_attr *a = attr_to_crypto_attr(attr);
+
+ return a->show(profile, a, page);
+}
+
+static const struct sysfs_ops blk_crypto_attr_ops = {
+ .show = blk_crypto_attr_show,
+};
+
+static void blk_crypto_release(struct kobject *kobj)
+{
+ kfree(container_of(kobj, struct blk_crypto_kobj, kobj));
+}
+
+static const struct kobj_type blk_crypto_ktype = {
+ .default_groups = blk_crypto_attr_groups,
+ .sysfs_ops = &blk_crypto_attr_ops,
+ .release = blk_crypto_release,
+};
+
+/*
+ * If the request_queue has a blk_crypto_profile, create the "crypto"
+ * subdirectory in sysfs (/sys/block/$disk/queue/crypto/).
+ */
+int blk_crypto_sysfs_register(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_crypto_kobj *obj;
+ int err;
+
+ if (!q->crypto_profile)
+ return 0;
+
+ obj = kzalloc(sizeof(*obj), GFP_KERNEL);
+ if (!obj)
+ return -ENOMEM;
+ obj->profile = q->crypto_profile;
+
+ err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype,
+ &disk->queue_kobj, "crypto");
+ if (err) {
+ kobject_put(&obj->kobj);
+ return err;
+ }
+ q->crypto_kobject = &obj->kobj;
+ return 0;
+}
+
+void blk_crypto_sysfs_unregister(struct gendisk *disk)
+{
+ kobject_put(disk->queue->crypto_kobject);
+}
+
+static int __init blk_crypto_sysfs_init(void)
+{
+ int i;
+
+ BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0);
+ for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) {
+ struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i];
+
+ attr->attr.name = blk_crypto_modes[i].name;
+ attr->attr.mode = 0444;
+ attr->show = blk_crypto_mode_show;
+ blk_crypto_mode_attrs[i - 1] = &attr->attr;
+ }
+ return 0;
+}
+subsys_initcall(blk_crypto_sysfs_init);
diff --git a/block/blk-crypto.c b/block/blk-crypto.c
index 6533c9b36ab8..4d760b092deb 100644
--- a/block/blk-crypto.c
+++ b/block/blk-crypto.c
@@ -11,28 +11,38 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/keyslot-manager.h>
+#include <linux/blk-crypto-profile.h>
#include <linux/module.h>
+#include <linux/ratelimit.h>
#include <linux/slab.h>
#include "blk-crypto-internal.h"
const struct blk_crypto_mode blk_crypto_modes[] = {
[BLK_ENCRYPTION_MODE_AES_256_XTS] = {
+ .name = "AES-256-XTS",
.cipher_str = "xts(aes)",
.keysize = 64,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = {
+ .name = "AES-128-CBC-ESSIV",
.cipher_str = "essiv(cbc(aes),sha256)",
.keysize = 16,
.ivsize = 16,
},
[BLK_ENCRYPTION_MODE_ADIANTUM] = {
+ .name = "Adiantum",
.cipher_str = "adiantum(xchacha12,aes)",
.keysize = 32,
.ivsize = 32,
},
+ [BLK_ENCRYPTION_MODE_SM4_XTS] = {
+ .name = "SM4-XTS",
+ .cipher_str = "xts(sm4)",
+ .keysize = 32,
+ .ivsize = 16,
+ },
};
/*
@@ -81,7 +91,15 @@ subsys_initcall(bio_crypt_ctx_init);
void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key,
const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask)
{
- struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ struct bio_crypt_ctx *bc;
+
+ /*
+ * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so
+ * that the mempool_alloc() can't fail.
+ */
+ WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM));
+
+ bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
bc->bc_key = key;
memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun));
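
[Editor's note] A hedged caller-side sketch: the gfp_mask passed in must include __GFP_DIRECT_RECLAIM (GFP_NOIO or GFP_KERNEL qualify), which is what makes the unchecked mempool_alloc() above safe.

static void example_encrypt_bio(struct bio *bio,
				const struct blk_crypto_key *key,
				u64 first_dun)
{
	u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE] = { first_dun };

	/* GFP_NOIO contains __GFP_DIRECT_RECLAIM, so this cannot fail. */
	bio_crypt_set_ctx(bio, key, dun, GFP_NOIO);
}
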
@@ -95,12 +113,14 @@ void __bio_crypt_free_ctx(struct bio *bio)
bio->bi_crypt_context = NULL;
}
-void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
+int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask)
{
dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ if (!dst->bi_crypt_context)
+ return -ENOMEM;
*dst->bi_crypt_context = *src->bi_crypt_context;
+ return 0;
}
-EXPORT_SYMBOL_GPL(__bio_crypt_clone);
/* Increments @dun by @inc, treating @dun as a multi-limb integer. */
void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE],
@@ -205,26 +225,27 @@ static bool bio_crypt_check_alignment(struct bio *bio)
return true;
}
-blk_status_t __blk_crypto_init_request(struct request *rq)
+blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq)
{
- return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key,
- &rq->crypt_keyslot);
+ return blk_crypto_get_keyslot(rq->q->crypto_profile,
+ rq->crypt_ctx->bc_key,
+ &rq->crypt_keyslot);
+}
+
+void __blk_crypto_rq_put_keyslot(struct request *rq)
+{
+ blk_crypto_put_keyslot(rq->crypt_keyslot);
+ rq->crypt_keyslot = NULL;
}
-/**
- * __blk_crypto_free_request - Uninitialize the crypto fields of a request.
- *
- * @rq: The request whose crypto fields to uninitialize.
- *
- * Completely uninitializes the crypto fields of a request. If a keyslot has
- * been programmed into some inline encryption hardware, that keyslot is
- * released. The rq->crypt_ctx is also freed.
- */
void __blk_crypto_free_request(struct request *rq)
{
- blk_ksm_put_slot(rq->crypt_keyslot);
+ /* The keyslot, if one was needed, should have been released earlier. */
+ if (WARN_ON_ONCE(rq->crypt_keyslot))
+ __blk_crypto_rq_put_keyslot(rq);
+
mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool);
- blk_crypto_rq_set_defaults(rq);
+ rq->crypt_ctx = NULL;
}
/**
@@ -239,7 +260,7 @@ void __blk_crypto_free_request(struct request *rq)
* kernel crypto API. When the crypto API fallback is used for encryption,
* blk-crypto may choose to split the bio into 2 - the first one that will
* continue to be processed and the second one that will be resubmitted via
- * generic_make_request. A bounce bio will be allocated to encrypt the contents
+ * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents
* of the aforementioned "first one", and *bio_ptr will be updated to this
* bounce bio.
*
@@ -269,10 +290,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr)
* Success if device supports the encryption context, or if we succeeded
* in falling back to the crypto API.
*/
- if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm,
- &bc_key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bio->bi_bdev,
+ &bc_key->crypto_cfg))
return true;
-
if (blk_crypto_fallback_bio_prep(bio_ptr))
return true;
fail:
@@ -280,20 +300,16 @@ fail:
return false;
}
-/**
- * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio
- * is inserted
- *
- * @rq: The request to prepare
- * @bio: The first bio being inserted into the request
- * @gfp_mask: gfp mask
- */
-void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
- gfp_t gfp_mask)
+int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio,
+ gfp_t gfp_mask)
{
- if (!rq->crypt_ctx)
+ if (!rq->crypt_ctx) {
rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask);
+ if (!rq->crypt_ctx)
+ return -ENOMEM;
+ }
*rq->crypt_ctx = *bio->bi_crypt_context;
+ return 0;
}
/**
@@ -325,7 +341,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
if (mode->keysize == 0)
return -EINVAL;
- if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE)
+ if (dun_bytes == 0 || dun_bytes > mode->ivsize)
return -EINVAL;
if (!is_power_of_2(data_unit_size))
@@ -341,22 +357,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key,
return 0;
}
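
[Editor's note] For reference, a hedged example of initializing a key against these checks (constants illustrative, and the trailing parameter order of blk_crypto_init_key is assumed to be dun_bytes then data_unit_size): AES-256-XTS takes 64 raw key bytes, the DUN must fit in the mode's 16-byte IV, and the data unit size must be a power of two.

static int example_prepare_key(struct blk_crypto_key *blk_key,
			       const u8 raw_key[64])
{
	return blk_crypto_init_key(blk_key, raw_key,
				   BLK_ENCRYPTION_MODE_AES_256_XTS,
				   8,		/* dun_bytes */
				   4096);	/* data_unit_size */
}
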
+bool blk_crypto_config_supported_natively(struct block_device *bdev,
+ const struct blk_crypto_config *cfg)
+{
+ return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile,
+ cfg);
+}
+
/*
* Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the
- * request queue it's submitted to supports inline crypto, or the
+ * block_device it's submitted to supports inline crypto, or the
* blk-crypto-fallback is enabled and supports the cfg).
*/
-bool blk_crypto_config_supported(struct request_queue *q,
+bool blk_crypto_config_supported(struct block_device *bdev,
const struct blk_crypto_config *cfg)
{
return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) ||
- blk_ksm_crypto_cfg_supported(q->ksm, cfg);
+ blk_crypto_config_supported_natively(bdev, cfg);
}
/**
* blk_crypto_start_using_key() - Start using a blk_crypto_key on a device
+ * @bdev: block device to operate on
* @key: A key to use on the device
- * @q: the request queue for the device
*
* Upper layers must call this function to ensure that either the hardware
* supports the key's crypto settings, or the crypto API fallback has transforms
@@ -368,37 +391,48 @@ bool blk_crypto_config_supported(struct request_queue *q,
* blk-crypto-fallback is either disabled or the needed algorithm
* is disabled in the crypto API; or another -errno code.
*/
-int blk_crypto_start_using_key(const struct blk_crypto_key *key,
- struct request_queue *q)
+int blk_crypto_start_using_key(struct block_device *bdev,
+ const struct blk_crypto_key *key)
{
- if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
return 0;
return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode);
}
/**
- * blk_crypto_evict_key() - Evict a key from any inline encryption hardware
- * it may have been programmed into
- * @q: The request queue who's associated inline encryption hardware this key
- * might have been programmed into
- * @key: The key to evict
+ * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device
+ * @bdev: a block_device on which I/O using the key may have been done
+ * @key: the key to evict
+ *
+ * For a given block_device, this function removes the given blk_crypto_key from
+ * the keyslot management structures and evicts it from any underlying hardware
+ * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into.
*
- * Upper layers (filesystems) must call this function to ensure that a key is
- * evicted from any hardware that it might have been programmed into. The key
- * must not be in use by any in-flight IO when this function is called.
+ * Upper layers must call this before freeing the blk_crypto_key. It must be
+ * called for every block_device the key may have been used on. The key must no
+ * longer be in use by any I/O when this function is called.
*
- * Return: 0 on success or if key is not present in the q's ksm, -err on error.
+ * Context: May sleep.
*/
-int blk_crypto_evict_key(struct request_queue *q,
- const struct blk_crypto_key *key)
+void blk_crypto_evict_key(struct block_device *bdev,
+ const struct blk_crypto_key *key)
{
- if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg))
- return blk_ksm_evict_key(q->ksm, key);
+ struct request_queue *q = bdev_get_queue(bdev);
+ int err;
+ if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg))
+ err = __blk_crypto_evict_key(q->crypto_profile, key);
+ else
+ err = blk_crypto_fallback_evict_key(key);
/*
- * If the request queue's associated inline encryption hardware didn't
- * have support for the key, then the key might have been programmed
- * into the fallback keyslot manager, so try to evict from there.
+ * An error can only occur here if the key failed to be evicted from a
+ * keyslot (due to a hardware or driver issue) or is allegedly still in
+ * use by I/O (due to a kernel bug). Even in these cases, the key is
+ * still unlinked from the keyslot management structures, and the caller
+ * is allowed and expected to free it right away. There's nothing
+ * callers can do to handle errors, so just log them and return void.
*/
- return blk_crypto_fallback_evict_key(key);
+ if (err)
+ pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err);
}
+EXPORT_SYMBOL_GPL(blk_crypto_evict_key);
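
[Editor's note] A hedged end-to-end sketch of the key lifecycle implied by the updated API: start using the key on a bdev, tag bios with it, then evict it from every bdev before freeing. Function names are hypothetical.

static int example_start(struct block_device *bdev, struct blk_crypto_key *key)
{
	/* Fails only if neither the hardware nor the fallback can handle it. */
	return blk_crypto_start_using_key(bdev, key);
}

static void example_stop(struct block_device *bdev, struct blk_crypto_key *key)
{
	/* Must run for every bdev the key touched, before the key is freed. */
	blk_crypto_evict_key(bdev, key);
	memzero_explicit(key, sizeof(*key));
}
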
diff --git a/block/blk-exec.c b/block/blk-exec.c
deleted file mode 100644
index 85324d53d072..000000000000
--- a/block/blk-exec.c
+++ /dev/null
@@ -1,95 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to setting various queue properties from drivers
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/sched/sysctl.h>
-
-#include "blk.h"
-#include "blk-mq-sched.h"
-
-/**
- * blk_end_sync_rq - executes a completion event on a request
- * @rq: request to complete
- * @error: end I/O status of the request
- */
-static void blk_end_sync_rq(struct request *rq, blk_status_t error)
-{
- struct completion *waiting = rq->end_io_data;
-
- rq->end_io_data = NULL;
-
- /*
- * complete last, if this is a stack request the process (and thus
- * the rq pointer) could be invalid right after this complete()
- */
- complete(waiting);
-}
-
-/**
- * blk_execute_rq_nowait - insert a request into queue for execution
- * @q: queue to insert the request in
- * @bd_disk: matching gendisk
- * @rq: request to insert
- * @at_head: insert request at head or tail of queue
- * @done: I/O completion handler
- *
- * Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
- * for execution. Don't wait for completion.
- *
- * Note:
- * This function will invoke @done directly if the queue is dead.
- */
-void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk,
- struct request *rq, int at_head,
- rq_end_io_fn *done)
-{
- WARN_ON(irqs_disabled());
- WARN_ON(!blk_rq_is_passthrough(rq));
-
- rq->rq_disk = bd_disk;
- rq->end_io = done;
-
- blk_account_io_start(rq);
-
- /*
- * don't check dying flag for MQ because the request won't
- * be reused after dying flag is set
- */
- blk_mq_sched_insert_request(rq, at_head, true, false);
-}
-EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
-
-/**
- * blk_execute_rq - insert a request into queue for execution
- * @q: queue to insert the request in
- * @bd_disk: matching gendisk
- * @rq: request to insert
- * @at_head: insert request at head or tail of queue
- *
- * Description:
- * Insert a fully prepared request at the back of the I/O scheduler queue
- * for execution and wait for completion.
- */
-void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk,
- struct request *rq, int at_head)
-{
- DECLARE_COMPLETION_ONSTACK(wait);
- unsigned long hang_check;
-
- rq->end_io_data = &wait;
- blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq);
-
- /* Prevent hang_check timer from firing at us during very long I/O */
- hang_check = sysctl_hung_task_timeout_secs;
- if (hang_check)
- while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2)));
- else
- wait_for_completion_io(&wait);
-}
-EXPORT_SYMBOL(blk_execute_rq);
diff --git a/block/blk-flush.c b/block/blk-flush.c
index 15ae0155ec07..3f4d41952ef2 100644
--- a/block/blk-flush.c
+++ b/block/blk-flush.c
@@ -68,12 +68,10 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/gfp.h>
-#include <linux/blk-mq.h>
-#include <linux/lockdep.h>
+#include <linux/part_stat.h>
#include "blk.h"
#include "blk-mq.h"
-#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
/* PREFLUSH/FUA sequences */
@@ -94,7 +92,13 @@ enum {
};
static void blk_kick_flush(struct request_queue *q,
- struct blk_flush_queue *fq, unsigned int flags);
+ struct blk_flush_queue *fq, blk_opf_t flags);
+
+static inline struct blk_flush_queue *
+blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
+{
+ return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+}
static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq)
{
@@ -132,14 +136,9 @@ static void blk_flush_restore_request(struct request *rq)
rq->end_io = rq->flush.saved_end_io;
}
-static void blk_flush_queue_rq(struct request *rq, bool add_front)
-{
- blk_mq_add_to_requeue_list(rq, add_front, true);
-}
-
static void blk_account_io_flush(struct request *rq)
{
- struct hd_struct *part = &rq->rq_disk->part0;
+ struct block_device *part = rq->q->disk->part0;
part_stat_lock();
part_stat_inc(part, ios[STAT_FLUSH]);
@@ -167,7 +166,7 @@ static void blk_flush_complete_seq(struct request *rq,
{
struct request_queue *q = rq->q;
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
BUG_ON(rq->flush.seq & seq);
rq->flush.seq |= seq;
@@ -184,12 +183,15 @@ static void blk_flush_complete_seq(struct request *rq,
/* queue for flush */
if (list_empty(pending))
fq->flush_pending_since = jiffies;
- list_move_tail(&rq->flush.list, pending);
+ list_move_tail(&rq->queuelist, pending);
break;
case REQ_FSEQ_DATA:
- list_move_tail(&rq->flush.list, &fq->flush_data_in_flight);
- blk_flush_queue_rq(rq, true);
+ fq->flush_data_in_flight++;
+ spin_lock(&q->requeue_lock);
+ list_move(&rq->queuelist, &q->requeue_list);
+ spin_unlock(&q->requeue_lock);
+ blk_mq_kick_requeue_list(q);
break;
case REQ_FSEQ_DONE:
@@ -199,8 +201,7 @@ static void blk_flush_complete_seq(struct request *rq,
* flush data request completion path. Restore @rq for
* normal completion and end it.
*/
- BUG_ON(!list_empty(&rq->queuelist));
- list_del_init(&rq->flush.list);
+ list_del_init(&rq->queuelist);
blk_flush_restore_request(rq);
blk_mq_end_request(rq, error);
break;
@@ -212,36 +213,41 @@ static void blk_flush_complete_seq(struct request *rq,
blk_kick_flush(q, fq, cmd_flags);
}
-static void flush_end_io(struct request *flush_rq, blk_status_t error)
+static enum rq_end_io_ret flush_end_io(struct request *flush_rq,
+ blk_status_t error)
{
struct request_queue *q = flush_rq->q;
struct list_head *running;
struct request *rq, *n;
unsigned long flags = 0;
struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx);
- struct blk_mq_hw_ctx *hctx;
-
- blk_account_io_flush(flush_rq);
/* release the tag's ownership to the req cloned from */
spin_lock_irqsave(&fq->mq_flush_lock, flags);
- if (!refcount_dec_and_test(&flush_rq->ref)) {
+ if (!req_ref_put_and_test(flush_rq)) {
fq->rq_status = error;
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
- return;
+ return RQ_END_IO_NONE;
}
- if (fq->rq_status != BLK_STS_OK)
+ blk_account_io_flush(flush_rq);
+ /*
+ * Flush request has to be marked as IDLE when it is really ended
+ * because its .end_io() is called from timeout code path too for
+ * avoiding use-after-free.
+ */
+ WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE);
+ if (fq->rq_status != BLK_STS_OK) {
error = fq->rq_status;
+ fq->rq_status = BLK_STS_OK;
+ }
- hctx = flush_rq->mq_hctx;
if (!q->elevator) {
- blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq);
- flush_rq->tag = -1;
+ flush_rq->tag = BLK_MQ_NO_TAG;
} else {
blk_mq_put_driver_tag(flush_rq);
- flush_rq->internal_tag = -1;
+ flush_rq->internal_tag = BLK_MQ_NO_TAG;
}
running = &fq->flush_queue[fq->flush_running_idx];
@@ -251,7 +257,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
fq->flush_running_idx ^= 1;
/* and push the waiting requests to the next stage */
- list_for_each_entry_safe(rq, n, running, flush.list) {
+ list_for_each_entry_safe(rq, n, running, queuelist) {
unsigned int seq = blk_flush_cur_seq(rq);
BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH);
@@ -259,6 +265,12 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
}
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
+ return RQ_END_IO_NONE;
+}
+
+bool is_flush_rq(struct request *rq)
+{
+ return rq->end_io == flush_end_io;
}
/**
@@ -275,24 +287,19 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error)
*
*/
static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
- unsigned int flags)
+ blk_opf_t flags)
{
struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx];
struct request *first_rq =
- list_first_entry(pending, struct request, flush.list);
+ list_first_entry(pending, struct request, queuelist);
struct request *flush_rq = fq->flush_rq;
/* C1 described at the top of this file */
if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending))
return;
- /* C2 and C3
- *
- * For blk-mq + scheduling, we can risk having all driver tags
- * assigned to empty flushes, and we deadlock if we are expecting
- * other requests to make progress. Don't defer for that case.
- */
- if (!list_empty(&fq->flush_data_in_flight) && q->elevator &&
+ /* C2 and C3 */
+ if (fq->flush_data_in_flight &&
time_before(jiffies,
fq->flush_pending_since + FLUSH_PENDING_TIMEOUT))
return;
@@ -316,24 +323,33 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq,
flush_rq->mq_ctx = first_rq->mq_ctx;
flush_rq->mq_hctx = first_rq->mq_hctx;
- if (!q->elevator) {
- fq->orig_rq = first_rq;
+ if (!q->elevator)
flush_rq->tag = first_rq->tag;
- blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq);
- } else {
+ else
flush_rq->internal_tag = first_rq->internal_tag;
- }
flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH;
flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK);
flush_rq->rq_flags |= RQF_FLUSH_SEQ;
- flush_rq->rq_disk = first_rq->rq_disk;
flush_rq->end_io = flush_end_io;
+ /*
+ * Order the WRITE of ->end_io against the WRITE of rq->ref. Its pair
+ * is the barrier implied by refcount_inc_not_zero() called from
+ * blk_mq_find_and_get_req(), which orders the WRITE/READ of
+ * flush_rq->ref against the READ of flush_rq->end_io.
+ */
+ smp_wmb();
+ req_ref_set(flush_rq, 1);
- blk_flush_queue_rq(flush_rq, false);
+ spin_lock(&q->requeue_lock);
+ list_add_tail(&flush_rq->queuelist, &q->flush_list);
+ spin_unlock(&q->requeue_lock);
+
+ blk_mq_kick_requeue_list(q);
}
-static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
+static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq,
+ blk_status_t error)
{
struct request_queue *q = rq->q;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
@@ -351,28 +367,42 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error)
* the comment in flush_end_io().
*/
spin_lock_irqsave(&fq->mq_flush_lock, flags);
+ fq->flush_data_in_flight--;
+ /*
+ * rq->queuelist may have been corrupted by rq->rq_next reuse, so
+ * re-initialize it before reusing it here.
+ */
+ INIT_LIST_HEAD(&rq->queuelist);
blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error);
spin_unlock_irqrestore(&fq->mq_flush_lock, flags);
blk_mq_sched_restart(hctx);
+ return RQ_END_IO_NONE;
}
-/**
- * blk_insert_flush - insert a new PREFLUSH/FUA request
- * @rq: request to insert
- *
- * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions.
- * or __blk_mq_run_hw_queue() to dispatch request.
- * @rq is being submitted. Analyze what needs to be done and put it on the
- * right queue.
+static void blk_rq_init_flush(struct request *rq)
+{
+ rq->flush.seq = 0;
+ rq->rq_flags |= RQF_FLUSH_SEQ;
+ rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
+ rq->end_io = mq_flush_data_end_io;
+}
+
+/*
+ * Insert a PREFLUSH/FUA request into the flush state machine.
+ * Returns true if the request has been consumed by the flush state machine,
+ * or false if the caller should continue to process it.
*/
-void blk_insert_flush(struct request *rq)
+bool blk_insert_flush(struct request *rq)
{
struct request_queue *q = rq->q;
unsigned long fflags = q->queue_flags; /* may change, cache */
unsigned int policy = blk_flush_policy(fflags, rq);
struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx);
+ /* FLUSH/FUA request must never be merged */
+ WARN_ON_ONCE(rq->bio != rq->biotail);
+
/*
* @policy now records what operations need to be done. Adjust
* REQ_PREFLUSH and FUA for the driver.
@@ -388,66 +418,60 @@ void blk_insert_flush(struct request *rq)
*/
rq->cmd_flags |= REQ_SYNC;
- /*
- * An empty flush handed down from a stacking driver may
- * translate into nothing if the underlying device does not
- * advertise a write-back cache. In this case, simply
- * complete the request.
- */
- if (!policy) {
+ switch (policy) {
+ case 0:
+ /*
+ * An empty flush handed down from a stacking driver may
+ * translate into nothing if the underlying device does not
+ * advertise a write-back cache. In this case, simply
+ * complete the request.
+ */
blk_mq_end_request(rq, 0);
- return;
- }
-
- BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */
-
- /*
- * If there's data but flush is not necessary, the request can be
- * processed directly without going through flush machinery. Queue
- * for normal execution.
- */
- if ((policy & REQ_FSEQ_DATA) &&
- !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) {
- blk_mq_request_bypass_insert(rq, false, false);
- return;
+ return true;
+ case REQ_FSEQ_DATA:
+ /*
+ * If there's data, but no flush is necessary, the request can
+ * be processed directly without going through flush machinery.
+ * Queue for normal execution.
+ */
+ return false;
+ case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH:
+ /*
+ * Initialize the flush fields and completion handler to trigger
+ * the post flush, and then just pass the command on.
+ */
+ blk_rq_init_flush(rq);
+ rq->flush.seq |= REQ_FSEQ_PREFLUSH;
+ spin_lock_irq(&fq->mq_flush_lock);
+ fq->flush_data_in_flight++;
+ spin_unlock_irq(&fq->mq_flush_lock);
+ return false;
+ default:
+ /*
+ * Mark the request as part of a flush sequence and submit it
+ * for further processing to the flush state machine.
+ */
+ blk_rq_init_flush(rq);
+ spin_lock_irq(&fq->mq_flush_lock);
+ blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
+ spin_unlock_irq(&fq->mq_flush_lock);
+ return true;
}
-
- /*
- * @rq should go through flush machinery. Mark it part of flush
- * sequence and submit for further processing.
- */
- memset(&rq->flush, 0, sizeof(rq->flush));
- INIT_LIST_HEAD(&rq->flush.list);
- rq->rq_flags |= RQF_FLUSH_SEQ;
- rq->flush.saved_end_io = rq->end_io; /* Usually NULL */
-
- rq->end_io = mq_flush_data_end_io;
-
- spin_lock_irq(&fq->mq_flush_lock);
- blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0);
- spin_unlock_irq(&fq->mq_flush_lock);
}
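
[Editor's note] For context, the policy bits handled by the switch above come from blk_flush_policy() near the top of this file; the hedged restatement below mirrors that logic so the cases can be read against it.

static unsigned int example_flush_policy(unsigned long fflags, struct request *rq)
{
	unsigned int policy = 0;

	if (blk_rq_sectors(rq))
		policy |= REQ_FSEQ_DATA;

	if (fflags & (1UL << QUEUE_FLAG_WC)) {
		/* A preflush is only needed if the device has a write cache. */
		if (rq->cmd_flags & REQ_PREFLUSH)
			policy |= REQ_FSEQ_PREFLUSH;
		/* Emulate FUA with a postflush if the device lacks native FUA. */
		if (!(fflags & (1UL << QUEUE_FLAG_FUA)) &&
		    (rq->cmd_flags & REQ_FUA))
			policy |= REQ_FSEQ_POSTFLUSH;
	}
	return policy;
}
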
/**
* blkdev_issue_flush - queue a flush
* @bdev: blockdev to issue flush for
- * @gfp_mask: memory allocation flags (for bio_alloc)
*
* Description:
* Issue a flush for the block device in question.
*/
-int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask)
+int blkdev_issue_flush(struct block_device *bdev)
{
- struct bio *bio;
- int ret = 0;
+ struct bio bio;
- bio = bio_alloc(gfp_mask, 0);
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH;
-
- ret = submit_bio_wait(bio);
- bio_put(bio);
- return ret;
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH);
+ return submit_bio_wait(&bio);
}
EXPORT_SYMBOL(blkdev_issue_flush);
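
[Editor's note] Since the bio now lives on the stack, usage is trivial; a caller only needs the block_device (wrapper name hypothetical).

static int example_sync_write_cache(struct block_device *bdev)
{
	/* Flush the device's volatile write cache and wait for completion. */
	return blkdev_issue_flush(bdev);
}
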
@@ -470,10 +494,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
INIT_LIST_HEAD(&fq->flush_queue[0]);
INIT_LIST_HEAD(&fq->flush_queue[1]);
- INIT_LIST_HEAD(&fq->flush_data_in_flight);
-
- lockdep_register_key(&fq->key);
- lockdep_set_class(&fq->mq_flush_lock, &fq->key);
return fq;
@@ -489,7 +509,31 @@ void blk_free_flush_queue(struct blk_flush_queue *fq)
if (!fq)
return;
- lockdep_unregister_key(&fq->key);
kfree(fq->flush_rq);
kfree(fq);
}
+
+/*
+ * Allow a driver to set its own lock class for fq->mq_flush_lock to avoid
+ * lockdep complaints.
+ *
+ * flush_end_io() may be called recursively from some drivers, such as
+ * nvme-loop, so lockdep may complain about 'possible recursive locking'
+ * because all 'struct blk_flush_queue' instances share the same
+ * mq_flush_lock lock class key. Such drivers need to assign a different
+ * lock class to their fq->mq_flush_lock to avoid the lockdep warning.
+ *
+ * Using a dynamically allocated lock class key for each 'blk_flush_queue'
+ * instance would be overkill, and worse, it introduces a horrible boot delay,
+ * because synchronize_rcu() is implied in lockdep_unregister_key(), which is
+ * called for each hctx release. SCSI probing may synchronously create and
+ * destroy lots of MQ request_queues for non-existent devices, and some robot
+ * test kernels always enable the lockdep option; more than half an hour has
+ * been observed during SCSI MQ probe with a per-fq lock class.
+ */
+void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx,
+ struct lock_class_key *key)
+{
+ lockdep_set_class(&hctx->fq->mq_flush_lock, key);
+}
+EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class);
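
[Editor's note] A hedged sketch of the intended consumer (nvme-loop is the in-tree example named above; the names below are hypothetical): the driver provides one static lock class key and applies it from its ->init_hctx() callback.

static struct lock_class_key example_fq_lock_key;

static int example_init_hctx(struct blk_mq_hw_ctx *hctx, void *driver_data,
			     unsigned int hctx_idx)
{
	/* Give this driver's flush queues their own lockdep class. */
	blk_mq_hctx_set_fq_lock_class(hctx, &example_fq_lock_key);
	return 0;
}
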
diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c
new file mode 100644
index 000000000000..c9eb4241e048
--- /dev/null
+++ b/block/blk-ia-ranges.c
@@ -0,0 +1,318 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Block device concurrent positioning ranges.
+ *
+ * Copyright (C) 2021 Western Digital Corporation or its Affiliates.
+ */
+#include <linux/kernel.h>
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+#include <linux/init.h>
+
+#include "blk.h"
+
+static ssize_t
+blk_ia_range_sector_show(struct blk_independent_access_range *iar,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n", iar->sector);
+}
+
+static ssize_t
+blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar,
+ char *buf)
+{
+ return sprintf(buf, "%llu\n", iar->nr_sectors);
+}
+
+struct blk_ia_range_sysfs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct blk_independent_access_range *iar, char *buf);
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = {
+ .attr = { .name = "sector", .mode = 0444 },
+ .show = blk_ia_range_sector_show,
+};
+
+static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = {
+ .attr = { .name = "nr_sectors", .mode = 0444 },
+ .show = blk_ia_range_nr_sectors_show,
+};
+
+static struct attribute *blk_ia_range_attrs[] = {
+ &blk_ia_range_sector_entry.attr,
+ &blk_ia_range_nr_sectors_entry.attr,
+ NULL,
+};
+ATTRIBUTE_GROUPS(blk_ia_range);
+
+static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj,
+ struct attribute *attr, char *buf)
+{
+ struct blk_ia_range_sysfs_entry *entry =
+ container_of(attr, struct blk_ia_range_sysfs_entry, attr);
+ struct blk_independent_access_range *iar =
+ container_of(kobj, struct blk_independent_access_range, kobj);
+
+ return entry->show(iar, buf);
+}
+
+static const struct sysfs_ops blk_ia_range_sysfs_ops = {
+ .show = blk_ia_range_sysfs_show,
+};
+
+/*
+ * Independent access range entries are not freed individually, but all together
+ * with struct blk_independent_access_ranges and its array of ranges. Since
+ * kobject_add() takes a reference on the parent kobject contained in
+ * struct blk_independent_access_ranges, the array of independent access range
+ * entries cannot be freed until kobject_del() is called for all entries.
+ * So we do not need to do anything here, but still need this no-op release
+ * operation to avoid complaints from the kobject code.
+ */
+static void blk_ia_range_sysfs_nop_release(struct kobject *kobj)
+{
+}
+
+static const struct kobj_type blk_ia_range_ktype = {
+ .sysfs_ops = &blk_ia_range_sysfs_ops,
+ .default_groups = blk_ia_range_groups,
+ .release = blk_ia_range_sysfs_nop_release,
+};
+
+/*
+ * This will be executed only after all independent access range entries are
+ * removed with kobject_del(), at which point, it is safe to free everything,
+ * including the array of ranges.
+ */
+static void blk_ia_ranges_sysfs_release(struct kobject *kobj)
+{
+ struct blk_independent_access_ranges *iars =
+ container_of(kobj, struct blk_independent_access_ranges, kobj);
+
+ kfree(iars);
+}
+
+static const struct kobj_type blk_ia_ranges_ktype = {
+ .release = blk_ia_ranges_sysfs_release,
+};
+
+/**
+ * disk_register_independent_access_ranges - register with sysfs a set of
+ * independent access ranges
+ * @disk: Target disk
+ *
+ * Register with sysfs a set of independent access ranges for @disk.
+ */
+int disk_register_independent_access_ranges(struct gendisk *disk)
+{
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
+ struct request_queue *q = disk->queue;
+ int i, ret;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (!iars)
+ return 0;
+
+ /*
+ * At this point, iars is the new set of sector access ranges that needs
+ * to be registered with sysfs.
+ */
+ WARN_ON(iars->sysfs_registered);
+ ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype,
+ &disk->queue_kobj, "%s",
+ "independent_access_ranges");
+ if (ret) {
+ disk->ia_ranges = NULL;
+ kobject_put(&iars->kobj);
+ return ret;
+ }
+
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ ret = kobject_init_and_add(&iars->ia_range[i].kobj,
+ &blk_ia_range_ktype, &iars->kobj,
+ "%d", i);
+ if (ret) {
+ while (--i >= 0)
+ kobject_del(&iars->ia_range[i].kobj);
+ kobject_del(&iars->kobj);
+ kobject_put(&iars->kobj);
+ return ret;
+ }
+ }
+
+ iars->sysfs_registered = true;
+
+ return 0;
+}
+
+void disk_unregister_independent_access_ranges(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct blk_independent_access_ranges *iars = disk->ia_ranges;
+ int i;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (!iars)
+ return;
+
+ if (iars->sysfs_registered) {
+ for (i = 0; i < iars->nr_ia_ranges; i++)
+ kobject_del(&iars->ia_range[i].kobj);
+ kobject_del(&iars->kobj);
+ kobject_put(&iars->kobj);
+ } else {
+ kfree(iars);
+ }
+
+ disk->ia_ranges = NULL;
+}
+
+static struct blk_independent_access_range *
+disk_find_ia_range(struct blk_independent_access_ranges *iars,
+ sector_t sector)
+{
+ struct blk_independent_access_range *iar;
+ int i;
+
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ iar = &iars->ia_range[i];
+ if (sector >= iar->sector &&
+ sector < iar->sector + iar->nr_sectors)
+ return iar;
+ }
+
+ return NULL;
+}
+
+static bool disk_check_ia_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *iars)
+{
+ struct blk_independent_access_range *iar, *tmp;
+ sector_t capacity = get_capacity(disk);
+ sector_t sector = 0;
+ int i;
+
+ if (WARN_ON_ONCE(!iars->nr_ia_ranges))
+ return false;
+
+ /*
+ * While sorting the ranges in increasing LBA order, check that the
+ * ranges do not overlap, that there are no sector holes and that all
+ * sectors belong to one range.
+ */
+ for (i = 0; i < iars->nr_ia_ranges; i++) {
+ tmp = disk_find_ia_range(iars, sector);
+ if (!tmp || tmp->sector != sector) {
+ pr_warn("Invalid non-contiguous independent access ranges\n");
+ return false;
+ }
+
+ iar = &iars->ia_range[i];
+ if (tmp != iar) {
+ swap(iar->sector, tmp->sector);
+ swap(iar->nr_sectors, tmp->nr_sectors);
+ }
+
+ sector += iar->nr_sectors;
+ }
+
+ if (sector != capacity) {
+ pr_warn("Independent access ranges do not match disk capacity\n");
+ return false;
+ }
+
+ return true;
+}
+
+static bool disk_ia_ranges_changed(struct gendisk *disk,
+ struct blk_independent_access_ranges *new)
+{
+ struct blk_independent_access_ranges *old = disk->ia_ranges;
+ int i;
+
+ if (!old)
+ return true;
+
+ if (old->nr_ia_ranges != new->nr_ia_ranges)
+ return true;
+
+ for (i = 0; i < old->nr_ia_ranges; i++) {
+ if (new->ia_range[i].sector != old->ia_range[i].sector ||
+ new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors)
+ return true;
+ }
+
+ return false;
+}
+
+/**
+ * disk_alloc_independent_access_ranges - Allocate an independent access ranges
+ * data structure
+ * @disk: target disk
+ * @nr_ia_ranges: Number of independent access ranges
+ *
+ * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges
+ * access range descriptors.
+ */
+struct blk_independent_access_ranges *
+disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges)
+{
+ struct blk_independent_access_ranges *iars;
+
+ iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges),
+ GFP_KERNEL, disk->queue->node);
+ if (iars)
+ iars->nr_ia_ranges = nr_ia_ranges;
+ return iars;
+}
+EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges);
+
+/**
+ * disk_set_independent_access_ranges - Set a disk independent access ranges
+ * @disk: target disk
+ * @iars: independent access ranges structure
+ *
+ * Set the independent access ranges information of the request queue
+ * of @disk to @iars. If @iars is NULL, any independent access ranges
+ * structure already set is cleared. If there are no differences between
+ * @iars and the independent access ranges structure already set, @iars
+ * is freed.
+ */
+void disk_set_independent_access_ranges(struct gendisk *disk,
+ struct blk_independent_access_ranges *iars)
+{
+ struct request_queue *q = disk->queue;
+
+ mutex_lock(&q->sysfs_dir_lock);
+ mutex_lock(&q->sysfs_lock);
+ if (iars && !disk_check_ia_ranges(disk, iars)) {
+ kfree(iars);
+ iars = NULL;
+ }
+ if (iars && !disk_ia_ranges_changed(disk, iars)) {
+ kfree(iars);
+ goto unlock;
+ }
+
+ /*
+ * This may be called for a registered queue. E.g. during a device
+ * revalidation. If that is the case, we need to unregister the old
+ * set of independent access ranges and register the new set. If the
+ * queue is not registered, registration of the device request queue
+ * will register the independent access ranges.
+ */
+ disk_unregister_independent_access_ranges(disk);
+ disk->ia_ranges = iars;
+ if (blk_queue_registered(q))
+ disk_register_independent_access_ranges(disk);
+unlock:
+ mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
+}
+EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges);
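
[Editor's note] A hedged sketch of how a driver for, say, a dual-actuator disk would describe two equal halves of the LBA space (values illustrative); disk_set_independent_access_ranges() then validates the set and, if the queue is already registered, re-registers the sysfs entries.

static int example_set_two_ranges(struct gendisk *disk)
{
	sector_t capacity = get_capacity(disk);
	struct blk_independent_access_ranges *iars;

	iars = disk_alloc_independent_access_ranges(disk, 2);
	if (!iars)
		return -ENOMEM;

	iars->ia_range[0].sector = 0;
	iars->ia_range[0].nr_sectors = capacity / 2;
	iars->ia_range[1].sector = capacity / 2;
	iars->ia_range[1].nr_sectors = capacity - capacity / 2;

	disk_set_independent_access_ranges(disk, iars);
	return 0;
}
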
diff --git a/block/blk-integrity.c b/block/blk-integrity.c
index c03705cbb9c9..d4e9b4556d14 100644
--- a/block/blk-integrity.c
+++ b/block/blk-integrity.c
@@ -6,7 +6,7 @@
* Written by: Martin K. Petersen <martin.petersen@oracle.com>
*/
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/backing-dev.h>
#include <linux/mempool.h>
#include <linux/bio.h>
@@ -183,7 +183,6 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req,
return true;
}
-EXPORT_SYMBOL(blk_integrity_merge_rq);
bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
struct bio *bio)
@@ -212,63 +211,45 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req,
return true;
}
-EXPORT_SYMBOL(blk_integrity_merge_bio);
-struct integrity_sysfs_entry {
- struct attribute attr;
- ssize_t (*show)(struct blk_integrity *, char *);
- ssize_t (*store)(struct blk_integrity *, const char *, size_t);
-};
-
-static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr,
- char *page)
+static inline struct blk_integrity *dev_to_bi(struct device *dev)
{
- struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
- struct blk_integrity *bi = &disk->queue->integrity;
- struct integrity_sysfs_entry *entry =
- container_of(attr, struct integrity_sysfs_entry, attr);
-
- return entry->show(bi, page);
+ return &dev_to_disk(dev)->queue->integrity;
}
-static ssize_t integrity_attr_store(struct kobject *kobj,
- struct attribute *attr, const char *page,
- size_t count)
+static ssize_t format_show(struct device *dev, struct device_attribute *attr,
+ char *page)
{
- struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj);
- struct blk_integrity *bi = &disk->queue->integrity;
- struct integrity_sysfs_entry *entry =
- container_of(attr, struct integrity_sysfs_entry, attr);
- ssize_t ret = 0;
+ struct blk_integrity *bi = dev_to_bi(dev);
- if (entry->store)
- ret = entry->store(bi, page, count);
-
- return ret;
-}
-
-static ssize_t integrity_format_show(struct blk_integrity *bi, char *page)
-{
if (bi->profile && bi->profile->name)
- return sprintf(page, "%s\n", bi->profile->name);
- else
- return sprintf(page, "none\n");
+ return sysfs_emit(page, "%s\n", bi->profile->name);
+ return sysfs_emit(page, "none\n");
}
-static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page)
+static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr,
+ char *page)
{
- return sprintf(page, "%u\n", bi->tag_size);
+ struct blk_integrity *bi = dev_to_bi(dev);
+
+ return sysfs_emit(page, "%u\n", bi->tag_size);
}
-static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page)
+static ssize_t protection_interval_bytes_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
{
- return sprintf(page, "%u\n",
- bi->interval_exp ? 1 << bi->interval_exp : 0);
+ struct blk_integrity *bi = dev_to_bi(dev);
+
+ return sysfs_emit(page, "%u\n",
+ bi->interval_exp ? 1 << bi->interval_exp : 0);
}
-static ssize_t integrity_verify_store(struct blk_integrity *bi,
- const char *page, size_t count)
+static ssize_t read_verify_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t count)
{
+ struct blk_integrity *bi = dev_to_bi(dev);
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
@@ -280,14 +261,20 @@ static ssize_t integrity_verify_store(struct blk_integrity *bi,
return count;
}
-static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page)
+static ssize_t read_verify_show(struct device *dev,
+ struct device_attribute *attr, char *page)
{
- return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0);
+ struct blk_integrity *bi = dev_to_bi(dev);
+
+ return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY));
}
-static ssize_t integrity_generate_store(struct blk_integrity *bi,
- const char *page, size_t count)
+static ssize_t write_generate_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *page, size_t count)
{
+ struct blk_integrity *bi = dev_to_bi(dev);
+
char *p = (char *) page;
unsigned long val = simple_strtoul(p, &p, 10);
@@ -299,68 +286,44 @@ static ssize_t integrity_generate_store(struct blk_integrity *bi,
return count;
}
-static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page)
+static ssize_t write_generate_show(struct device *dev,
+ struct device_attribute *attr, char *page)
{
- return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0);
-}
+ struct blk_integrity *bi = dev_to_bi(dev);
-static ssize_t integrity_device_show(struct blk_integrity *bi, char *page)
-{
- return sprintf(page, "%u\n",
- (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0);
+ return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE));
}
-static struct integrity_sysfs_entry integrity_format_entry = {
- .attr = { .name = "format", .mode = 0444 },
- .show = integrity_format_show,
-};
-
-static struct integrity_sysfs_entry integrity_tag_size_entry = {
- .attr = { .name = "tag_size", .mode = 0444 },
- .show = integrity_tag_size_show,
-};
-
-static struct integrity_sysfs_entry integrity_interval_entry = {
- .attr = { .name = "protection_interval_bytes", .mode = 0444 },
- .show = integrity_interval_show,
-};
-
-static struct integrity_sysfs_entry integrity_verify_entry = {
- .attr = { .name = "read_verify", .mode = 0644 },
- .show = integrity_verify_show,
- .store = integrity_verify_store,
-};
+static ssize_t device_is_integrity_capable_show(struct device *dev,
+ struct device_attribute *attr,
+ char *page)
+{
+ struct blk_integrity *bi = dev_to_bi(dev);
-static struct integrity_sysfs_entry integrity_generate_entry = {
- .attr = { .name = "write_generate", .mode = 0644 },
- .show = integrity_generate_show,
- .store = integrity_generate_store,
-};
+ return sysfs_emit(page, "%u\n",
+ !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE));
+}
-static struct integrity_sysfs_entry integrity_device_entry = {
- .attr = { .name = "device_is_integrity_capable", .mode = 0444 },
- .show = integrity_device_show,
-};
+static DEVICE_ATTR_RO(format);
+static DEVICE_ATTR_RO(tag_size);
+static DEVICE_ATTR_RO(protection_interval_bytes);
+static DEVICE_ATTR_RW(read_verify);
+static DEVICE_ATTR_RW(write_generate);
+static DEVICE_ATTR_RO(device_is_integrity_capable);
static struct attribute *integrity_attrs[] = {
- &integrity_format_entry.attr,
- &integrity_tag_size_entry.attr,
- &integrity_interval_entry.attr,
- &integrity_verify_entry.attr,
- &integrity_generate_entry.attr,
- &integrity_device_entry.attr,
- NULL,
-};
-ATTRIBUTE_GROUPS(integrity);
-
-static const struct sysfs_ops integrity_ops = {
- .show = &integrity_attr_show,
- .store = &integrity_attr_store,
+ &dev_attr_format.attr,
+ &dev_attr_tag_size.attr,
+ &dev_attr_protection_interval_bytes.attr,
+ &dev_attr_read_verify.attr,
+ &dev_attr_write_generate.attr,
+ &dev_attr_device_is_integrity_capable.attr,
+ NULL
};
-static struct kobj_type integrity_ktype = {
- .default_groups = integrity_groups,
- .sysfs_ops = &integrity_ops,
+const struct attribute_group blk_integrity_attr_group = {
+ .name = "integrity",
+ .attrs = integrity_attrs,
};
static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter)
@@ -408,12 +371,12 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template
bi->tuple_size = template->tuple_size;
bi->tag_size = template->tag_size;
- disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES;
+ blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue);
#ifdef CONFIG_BLK_INLINE_ENCRYPTION
- if (disk->queue->ksm) {
+ if (disk->queue->crypto_profile) {
pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- blk_ksm_unregister(disk->queue);
+ disk->queue->crypto_profile = NULL;
}
#endif
}
@@ -428,23 +391,14 @@ EXPORT_SYMBOL(blk_integrity_register);
*/
void blk_integrity_unregister(struct gendisk *disk)
{
- disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES;
- memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity));
-}
-EXPORT_SYMBOL(blk_integrity_unregister);
+ struct blk_integrity *bi = &disk->queue->integrity;
-void blk_integrity_add(struct gendisk *disk)
-{
- if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype,
- &disk_to_dev(disk)->kobj, "%s", "integrity"))
+ if (!bi->profile)
return;
- kobject_uevent(&disk->integrity_kobj, KOBJ_ADD);
-}
-
-void blk_integrity_del(struct gendisk *disk)
-{
- kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE);
- kobject_del(&disk->integrity_kobj);
- kobject_put(&disk->integrity_kobj);
+ /* ensure all bios are off the integrity workqueue */
+ blk_flush_integrity();
+ blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue);
+ memset(bi, 0, sizeof(*bi));
}
+EXPORT_SYMBOL(blk_integrity_unregister);
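
[Editor's note] For completeness, a hedged sketch of the registering side these sysfs attributes describe; the template values are illustrative and use the T10 PI Type 1 profile from <linux/t10-pi.h>.

static void example_register_t10_pi(struct gendisk *disk)
{
	struct blk_integrity bi = {
		.profile	= &t10_pi_type1_crc,
		.tuple_size	= sizeof(struct t10_pi_tuple),
		.tag_size	= 0,
		.interval_exp	= 0,	/* 0: default to the logical block size */
		.flags		= BLK_INTEGRITY_DEVICE_CAPABLE,
	};

	blk_integrity_register(disk, &bi);
}
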
diff --git a/block/blk-ioc.c b/block/blk-ioc.c
index 9df50fb507ca..25dd4db11121 100644
--- a/block/blk-ioc.c
+++ b/block/blk-ioc.c
@@ -8,22 +8,25 @@
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
+#include <linux/security.h>
#include <linux/sched/task.h>
#include "blk.h"
+#include "blk-mq-sched.h"
/*
* For io context allocations
*/
static struct kmem_cache *iocontext_cachep;
+#ifdef CONFIG_BLK_ICQ
/**
* get_io_context - increment reference count to io_context
* @ioc: io_context to get
*
* Increment reference count to @ioc.
*/
-void get_io_context(struct io_context *ioc)
+static void get_io_context(struct io_context *ioc)
{
BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
atomic_long_inc(&ioc->refcount);
@@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq)
icq->flags |= ICQ_EXITED;
}
+static void ioc_exit_icqs(struct io_context *ioc)
+{
+ struct io_cq *icq;
+
+ spin_lock_irq(&ioc->lock);
+ hlist_for_each_entry(icq, &ioc->icq_list, ioc_node)
+ ioc_exit_icq(icq);
+ spin_unlock_irq(&ioc->lock);
+}
+
/*
* Release an icq. Called with ioc locked for blk-mq, and with both ioc
* and queue locked for legacy.
@@ -64,6 +77,10 @@ static void ioc_destroy_icq(struct io_cq *icq)
struct elevator_type *et = q->elevator->type;
lockdep_assert_held(&ioc->lock);
+ lockdep_assert_held(&q->queue_lock);
+
+ if (icq->flags & ICQ_DESTROYED)
+ return;
radix_tree_delete(&ioc->icq_tree, icq->q->id);
hlist_del_init(&icq->ioc_node);
@@ -96,15 +113,7 @@ static void ioc_release_fn(struct work_struct *work)
{
struct io_context *ioc = container_of(work, struct io_context,
release_work);
- unsigned long flags;
-
- /*
- * Exiting icq may call into put_io_context() through elevator
- * which will trigger lockdep warning. The ioc's are guaranteed to
- * be different, use a different locking subclass here. Use
- * irqsave variant as there's no spin_lock_irq_nested().
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ spin_lock_irq(&ioc->lock);
while (!hlist_empty(&ioc->icq_list)) {
struct io_cq *icq = hlist_entry(ioc->icq_list.first,
@@ -115,85 +124,91 @@ static void ioc_release_fn(struct work_struct *work)
ioc_destroy_icq(icq);
spin_unlock(&q->queue_lock);
} else {
- spin_unlock_irqrestore(&ioc->lock, flags);
- cpu_relax();
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
+ /* Make sure q and icq cannot be freed. */
+ rcu_read_lock();
+
+ /* Re-acquire the locks in the correct order. */
+ spin_unlock(&ioc->lock);
+ spin_lock(&q->queue_lock);
+ spin_lock(&ioc->lock);
+
+ ioc_destroy_icq(icq);
+
+ spin_unlock(&q->queue_lock);
+ rcu_read_unlock();
}
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&ioc->lock);
kmem_cache_free(iocontext_cachep, ioc);
}
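
The relocking dance in ioc_release_fn() above is the classic answer to needing locks in the opposite order from the one currently held: drop the inner lock, take the outer one, then retake the inner one. A minimal userspace sketch of just that pattern, using two pthread mutexes as stand-ins for q->queue_lock and ioc->lock (the kernel version also takes rcu_read_lock() first so the queue and icq cannot be freed while the ioc lock is dropped; that part is omitted here):

/* Illustrative userspace analogue only, not kernel code. */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t queue_lock = PTHREAD_MUTEX_INITIALIZER; /* outer lock */
static pthread_mutex_t ioc_lock   = PTHREAD_MUTEX_INITIALIZER; /* inner lock */

/* Called with ioc_lock held; returns with both locks held in the correct
 * (queue_lock then ioc_lock) order. */
static void relock_in_order(void)
{
	pthread_mutex_unlock(&ioc_lock);
	pthread_mutex_lock(&queue_lock);
	pthread_mutex_lock(&ioc_lock);
}

int main(void)
{
	pthread_mutex_lock(&ioc_lock);
	relock_in_order();
	printf("holding queue_lock then ioc_lock\n");
	pthread_mutex_unlock(&ioc_lock);
	pthread_mutex_unlock(&queue_lock);
	return 0;
}
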
-/**
- * put_io_context - put a reference of io_context
- * @ioc: io_context to put
- *
- * Decrement reference count of @ioc and release it if the count reaches
- * zero.
+/*
+ * Releasing icqs requires reverse order double locking and we may already be
+ * holding a queue_lock. Do it asynchronously from a workqueue.
*/
-void put_io_context(struct io_context *ioc)
+static bool ioc_delay_free(struct io_context *ioc)
{
unsigned long flags;
- bool free_ioc = false;
-
- if (ioc == NULL)
- return;
-
- BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
- /*
- * Releasing ioc requires reverse order double locking and we may
- * already be holding a queue_lock. Do it asynchronously from wq.
- */
- if (atomic_long_dec_and_test(&ioc->refcount)) {
- spin_lock_irqsave(&ioc->lock, flags);
- if (!hlist_empty(&ioc->icq_list))
- queue_work(system_power_efficient_wq,
- &ioc->release_work);
- else
- free_ioc = true;
+ spin_lock_irqsave(&ioc->lock, flags);
+ if (!hlist_empty(&ioc->icq_list)) {
+ queue_work(system_power_efficient_wq, &ioc->release_work);
spin_unlock_irqrestore(&ioc->lock, flags);
+ return true;
}
-
- if (free_ioc)
- kmem_cache_free(iocontext_cachep, ioc);
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ return false;
}
/**
- * put_io_context_active - put active reference on ioc
- * @ioc: ioc of interest
+ * ioc_clear_queue - break any ioc association with the specified queue
+ * @q: request_queue being cleared
*
- * Undo get_io_context_active(). If active reference reaches zero after
- * put, @ioc can never issue further IOs and ioscheds are notified.
+ * Walk @q->icq_list and exit all io_cq's.
*/
-void put_io_context_active(struct io_context *ioc)
+void ioc_clear_queue(struct request_queue *q)
{
- unsigned long flags;
- struct io_cq *icq;
-
- if (!atomic_dec_and_test(&ioc->active_ref)) {
- put_io_context(ioc);
- return;
- }
-
- /*
- * Need ioc lock to walk icq_list and q lock to exit icq. Perform
- * reverse double locking. Read comment in ioc_release_fn() for
- * explanation on the nested locking annotation.
- */
- spin_lock_irqsave_nested(&ioc->lock, flags, 1);
- hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) {
- if (icq->flags & ICQ_EXITED)
- continue;
-
- ioc_exit_icq(icq);
+ spin_lock_irq(&q->queue_lock);
+ while (!list_empty(&q->icq_list)) {
+ struct io_cq *icq =
+ list_first_entry(&q->icq_list, struct io_cq, q_node);
+
+ /*
+		 * Other contexts won't hold the ioc lock to wait for queue_lock, see
+ * details in ioc_release_fn().
+ */
+ spin_lock(&icq->ioc->lock);
+ ioc_destroy_icq(icq);
+ spin_unlock(&icq->ioc->lock);
}
- spin_unlock_irqrestore(&ioc->lock, flags);
+ spin_unlock_irq(&q->queue_lock);
+}
+#else /* CONFIG_BLK_ICQ */
+static inline void ioc_exit_icqs(struct io_context *ioc)
+{
+}
+static inline bool ioc_delay_free(struct io_context *ioc)
+{
+ return false;
+}
+#endif /* CONFIG_BLK_ICQ */
- put_io_context(ioc);
+/**
+ * put_io_context - put a reference of io_context
+ * @ioc: io_context to put
+ *
+ * Decrement reference count of @ioc and release it if the count reaches
+ * zero.
+ */
+void put_io_context(struct io_context *ioc)
+{
+ BUG_ON(atomic_long_read(&ioc->refcount) <= 0);
+ if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc))
+ kmem_cache_free(iocontext_cachep, ioc);
}
+EXPORT_SYMBOL_GPL(put_io_context);
/* Called by the exiting task */
void exit_io_context(struct task_struct *task)
@@ -205,132 +220,110 @@ void exit_io_context(struct task_struct *task)
task->io_context = NULL;
task_unlock(task);
- atomic_dec(&ioc->nr_tasks);
- put_io_context_active(ioc);
-}
-
-static void __ioc_clear_queue(struct list_head *icq_list)
-{
- unsigned long flags;
-
- rcu_read_lock();
- while (!list_empty(icq_list)) {
- struct io_cq *icq = list_entry(icq_list->next,
- struct io_cq, q_node);
- struct io_context *ioc = icq->ioc;
-
- spin_lock_irqsave(&ioc->lock, flags);
- if (icq->flags & ICQ_DESTROYED) {
- spin_unlock_irqrestore(&ioc->lock, flags);
- continue;
- }
- ioc_destroy_icq(icq);
- spin_unlock_irqrestore(&ioc->lock, flags);
+ if (atomic_dec_and_test(&ioc->active_ref)) {
+ ioc_exit_icqs(ioc);
+ put_io_context(ioc);
}
- rcu_read_unlock();
}
-/**
- * ioc_clear_queue - break any ioc association with the specified queue
- * @q: request_queue being cleared
- *
- * Walk @q->icq_list and exit all io_cq's.
- */
-void ioc_clear_queue(struct request_queue *q)
-{
- LIST_HEAD(icq_list);
-
- spin_lock_irq(&q->queue_lock);
- list_splice_init(&q->icq_list, &icq_list);
- spin_unlock_irq(&q->queue_lock);
-
- __ioc_clear_queue(&icq_list);
-}
-
-int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node)
+static struct io_context *alloc_io_context(gfp_t gfp_flags, int node)
{
struct io_context *ioc;
- int ret;
ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO,
node);
if (unlikely(!ioc))
- return -ENOMEM;
+ return NULL;
- /* initialize */
atomic_long_set(&ioc->refcount, 1);
- atomic_set(&ioc->nr_tasks, 1);
atomic_set(&ioc->active_ref, 1);
+#ifdef CONFIG_BLK_ICQ
spin_lock_init(&ioc->lock);
INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC);
INIT_HLIST_HEAD(&ioc->icq_list);
INIT_WORK(&ioc->release_work, ioc_release_fn);
+#endif
+ ioc->ioprio = IOPRIO_DEFAULT;
- /*
- * Try to install. ioc shouldn't be installed if someone else
- * already did or @task, which isn't %current, is exiting. Note
- * that we need to allow ioc creation on exiting %current as exit
- * path may issue IOs from e.g. exit_files(). The exit path is
- * responsible for not issuing IO after exit_io_context().
- */
- task_lock(task);
- if (!task->io_context &&
- (task == current || !(task->flags & PF_EXITING)))
- task->io_context = ioc;
- else
- kmem_cache_free(iocontext_cachep, ioc);
+ return ioc;
+}
- ret = task->io_context ? 0 : -EBUSY;
+int set_task_ioprio(struct task_struct *task, int ioprio)
+{
+ int err;
+ const struct cred *cred = current_cred(), *tcred;
- task_unlock(task);
+ rcu_read_lock();
+ tcred = __task_cred(task);
+ if (!uid_eq(tcred->uid, cred->euid) &&
+ !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
+ rcu_read_unlock();
+ return -EPERM;
+ }
+ rcu_read_unlock();
- return ret;
-}
+ err = security_task_setioprio(task, ioprio);
+ if (err)
+ return err;
-/**
- * get_task_io_context - get io_context of a task
- * @task: task of interest
- * @gfp_flags: allocation flags, used if allocation is necessary
- * @node: allocation node, used if allocation is necessary
- *
- * Return io_context of @task. If it doesn't exist, it is created with
- * @gfp_flags and @node. The returned io_context has its reference count
- * incremented.
- *
- * This function always goes through task_lock() and it's better to use
- * %current->io_context + get_io_context() for %current.
- */
-struct io_context *get_task_io_context(struct task_struct *task,
- gfp_t gfp_flags, int node)
-{
- struct io_context *ioc;
+ task_lock(task);
+ if (unlikely(!task->io_context)) {
+ struct io_context *ioc;
+
+ task_unlock(task);
- might_sleep_if(gfpflags_allow_blocking(gfp_flags));
+ ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE);
+ if (!ioc)
+ return -ENOMEM;
- do {
task_lock(task);
- ioc = task->io_context;
- if (likely(ioc)) {
- get_io_context(ioc);
- task_unlock(task);
- return ioc;
+ if (task->flags & PF_EXITING) {
+ kmem_cache_free(iocontext_cachep, ioc);
+ goto out;
}
- task_unlock(task);
- } while (!create_task_io_context(task, gfp_flags, node));
+ if (task->io_context)
+ kmem_cache_free(iocontext_cachep, ioc);
+ else
+ task->io_context = ioc;
+ }
+ task->io_context->ioprio = ioprio;
+out:
+ task_unlock(task);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(set_task_ioprio);
+
+int __copy_io(unsigned long clone_flags, struct task_struct *tsk)
+{
+ struct io_context *ioc = current->io_context;
- return NULL;
+ /*
+ * Share io context with parent, if CLONE_IO is set
+ */
+ if (clone_flags & CLONE_IO) {
+ atomic_inc(&ioc->active_ref);
+ tsk->io_context = ioc;
+ } else if (ioprio_valid(ioc->ioprio)) {
+ tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE);
+ if (!tsk->io_context)
+ return -ENOMEM;
+ tsk->io_context->ioprio = ioc->ioprio;
+ }
+
+ return 0;
}
+#ifdef CONFIG_BLK_ICQ
/**
* ioc_lookup_icq - lookup io_cq from ioc
- * @ioc: the associated io_context
* @q: the associated request_queue
*
* Look up io_cq associated with @ioc - @q pair from @ioc. Must be called
* with @q->queue_lock held.
*/
-struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q)
+struct io_cq *ioc_lookup_icq(struct request_queue *q)
{
+ struct io_context *ioc = current->io_context;
struct io_cq *icq;
lockdep_assert_held(&q->queue_lock);
@@ -359,9 +352,7 @@ EXPORT_SYMBOL(ioc_lookup_icq);
/**
* ioc_create_icq - create and link io_cq
- * @ioc: io_context of interest
* @q: request_queue of interest
- * @gfp_mask: allocation mask
*
* Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they
* will be created using @gfp_mask.
@@ -369,19 +360,19 @@ EXPORT_SYMBOL(ioc_lookup_icq);
* The caller is responsible for ensuring @ioc won't go away and @q is
* alive and will stay alive until this function returns.
*/
-struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
- gfp_t gfp_mask)
+static struct io_cq *ioc_create_icq(struct request_queue *q)
{
+ struct io_context *ioc = current->io_context;
struct elevator_type *et = q->elevator->type;
struct io_cq *icq;
/* allocate stuff */
- icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO,
+ icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO,
q->node);
if (!icq)
return NULL;
- if (radix_tree_maybe_preload(gfp_mask) < 0) {
+ if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) {
kmem_cache_free(et->icq_cache, icq);
return NULL;
}
@@ -402,7 +393,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
et->ops.init_icq(icq);
} else {
kmem_cache_free(et->icq_cache, icq);
- icq = ioc_lookup_icq(ioc, q);
+ icq = ioc_lookup_icq(q);
if (!icq)
printk(KERN_ERR "cfq: icq link failed!\n");
}
@@ -413,6 +404,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
return icq;
}
+struct io_cq *ioc_find_get_icq(struct request_queue *q)
+{
+ struct io_context *ioc = current->io_context;
+ struct io_cq *icq = NULL;
+
+ if (unlikely(!ioc)) {
+ ioc = alloc_io_context(GFP_ATOMIC, q->node);
+ if (!ioc)
+ return NULL;
+
+ task_lock(current);
+ if (current->io_context) {
+ kmem_cache_free(iocontext_cachep, ioc);
+ ioc = current->io_context;
+ } else {
+ current->io_context = ioc;
+ }
+
+ get_io_context(ioc);
+ task_unlock(current);
+ } else {
+ get_io_context(ioc);
+
+ spin_lock_irq(&q->queue_lock);
+ icq = ioc_lookup_icq(q);
+ spin_unlock_irq(&q->queue_lock);
+ }
+
+ if (!icq) {
+ icq = ioc_create_icq(q);
+ if (!icq) {
+ put_io_context(ioc);
+ return NULL;
+ }
+ }
+ return icq;
+}
+EXPORT_SYMBOL_GPL(ioc_find_get_icq);
+#endif /* CONFIG_BLK_ICQ */
+
static int __init blk_ioc_init(void)
{
iocontext_cachep = kmem_cache_create("blkdev_ioc",
diff --git a/block/blk-iocost.c b/block/blk-iocost.c
index 8ac4aad66ebc..04d44f0bcbc8 100644
--- a/block/blk-iocost.c
+++ b/block/blk-iocost.c
@@ -39,7 +39,7 @@
* On top of that, a size cost proportional to the length of the IO is
* added. While simple, this model captures the operational
* characteristics of a wide varienty of devices well enough. Default
- * paramters for several different classes of devices are provided and the
+ * parameters for several different classes of devices are provided and the
* parameters can be configured from userspace via
* /sys/fs/cgroup/io.cost.model.
*
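
As a rough sketch of the linear model described above (a fixed per-IO cost that depends on whether the access is sequential or random, plus a size cost proportional to the number of pages), consider the standalone calculation below. The coefficient layout loosely mirrors the LCOEF_* parameters, but the names and numbers are invented for illustration; this is not the kernel's costing code or its default parameters.

/* Illustrative only: a linear cost model of the shape described above. */
#include <stdint.h>
#include <stdio.h>

struct lcoefs {
	uint64_t page;		/* cost per 4k page of data */
	uint64_t seqio;		/* fixed cost of a sequential IO */
	uint64_t randio;	/* fixed cost of a random IO */
};

static uint64_t io_abs_cost(const struct lcoefs *c, uint64_t pages, int is_seq)
{
	/* fixed per-IO cost depending on the access pattern ... */
	uint64_t cost = is_seq ? c->seqio : c->randio;

	/* ... plus a size cost proportional to the length of the IO */
	return cost + pages * c->page;
}

int main(void)
{
	struct lcoefs c = { .page = 10, .seqio = 50, .randio = 400 };

	printf("64k seq IO cost:  %llu\n",
	       (unsigned long long)io_abs_cost(&c, 16, 1));
	printf("4k rand IO cost:  %llu\n",
	       (unsigned long long)io_abs_cost(&c, 1, 0));
	return 0;
}
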
@@ -68,7 +68,7 @@
* gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
* 12.5% each. The distribution mechanism only cares about these flattened
* shares. They're called hweights (hierarchical weights) and always add
- * upto 1 (HWEIGHT_WHOLE).
+ * upto 1 (WEIGHT_ONE).
*
* A given cgroup's vtime runs slower in inverse proportion to its hweight.
* For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
@@ -77,7 +77,7 @@
*
* This constitutes the basis of IO capacity distribution. Each cgroup's
* vtime is running at a rate determined by its hweight. A cgroup tracks
- * the vtime consumed by past IOs and can issue a new IO iff doing so
+ * the vtime consumed by past IOs and can issue a new IO if doing so
* wouldn't outrun the current device vtime. Otherwise, the IO is
* suspended until the vtime has progressed enough to cover it.
*
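
The two paragraphs above are the heart of the controller: nested weights are flattened into hweights that always sum to WEIGHT_ONE, and an IO is admitted only while charging its cost keeps the cgroup's vtime behind the device vtime. Below is a minimal standalone sketch of both ideas, reusing the A0/A1/B numbers from the example above; everything else (the helper names, the flat sibling-sum arrays) is invented and is not the kernel's current_hweight() or activation logic.

/* Hypothetical sketch, not kernel code. */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE	(1 << 16)

/* weights[i] is the node's weight at level i, sums[i] the sum of all sibling
 * weights (including its own) at that level */
static uint32_t flatten_hweight(const uint32_t *weights, const uint32_t *sums,
				int levels)
{
	uint64_t hw = WEIGHT_ONE;
	int lvl;

	for (lvl = 0; lvl < levels; lvl++)
		hw = hw * weights[lvl] / sums[lvl];
	return (uint32_t)hw;
}

/* issue only if charging the cost keeps this group behind the device vtime */
static bool may_issue(uint64_t iocg_vtime, uint64_t cost, uint64_t vnow)
{
	return iocg_vtime + cost <= vnow;
}

int main(void)
{
	/* B gets 300 of (100 + 300) = 75%; A0's parent holds 100 of 400 at
	 * the top level and A0 holds 100 of 200 below it = 12.5% */
	uint32_t b_w[]  = { 300 },	b_s[]  = { 400 };
	uint32_t a0_w[] = { 100, 100 },	a0_s[] = { 400, 200 };

	printf("B = %.1f%%, A0 = %.1f%%\n",
	       100.0 * flatten_hweight(b_w, b_s, 1) / WEIGHT_ONE,
	       100.0 * flatten_hweight(a0_w, a0_s, 2) / WEIGHT_ONE);
	printf("IO of cost 50 with vtime 900, vnow 1000: %s\n",
	       may_issue(900, 50, 1000) ? "issue" : "wait");
	return 0;
}
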
@@ -111,7 +111,7 @@
* busy signal.
*
* As devices can have deep queues and be unfair in how the queued commands
- * are executed, soley depending on rq wait may not result in satisfactory
+ * are executed, solely depending on rq wait may not result in satisfactory
* control quality. For a better control quality, completion latency QoS
* parameters can be configured so that the device is considered saturated
* if N'th percentile completion latency rises above the set point.
@@ -155,7 +155,7 @@
* Instead of debugfs or other clumsy monitoring mechanisms, this
* controller uses a drgn based monitoring script -
* tools/cgroup/iocost_monitor.py. For details on drgn, please see
- * https://github.com/osandov/drgn. The ouput looks like the following.
+ * https://github.com/osandov/drgn. The output looks like the following.
*
* sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
* active weight hweight% inflt% dbt delay usages%
@@ -178,10 +178,12 @@
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
-#include <linux/blk-cgroup.h>
+#include <asm/local.h>
+#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
#ifdef CONFIG_TRACEPOINTS
@@ -215,37 +217,24 @@ enum {
MAX_PERIOD = USEC_PER_SEC,
/*
- * A cgroup's vtime can run 50% behind the device vtime, which
+ * iocg->vtime is targeted at 50% behind the device vtime, which
* serves as its IO credit buffer. Surplus weight adjustment is
* immediately canceled if the vtime margin runs below 10%.
*/
- MARGIN_PCT = 50,
- INUSE_MARGIN_PCT = 10,
+ MARGIN_MIN_PCT = 10,
+ MARGIN_LOW_PCT = 20,
+ MARGIN_TARGET_PCT = 50,
- /* Have some play in waitq timer operations */
- WAITQ_TIMER_MARGIN_PCT = 5,
+ INUSE_ADJ_STEP_PCT = 25,
- /*
- * vtime can wrap well within a reasonable uptime when vrate is
- * consistently raised. Don't trust recorded cgroup vtime if the
- * period counter indicates that it's older than 5mins.
- */
- VTIME_VALID_DUR = 300 * USEC_PER_SEC,
-
- /*
- * Remember the past three non-zero usages and use the max for
- * surplus calculation. Three slots guarantee that we remember one
- * full period usage from the last active stretch even after
- * partial deactivation and re-activation periods. Don't start
- * giving away weight before collecting two data points to prevent
- * hweight adjustments based on one partial activation period.
- */
- NR_USAGE_SLOTS = 3,
- MIN_VALID_USAGES = 2,
+ /* Have some play in timer operations */
+ TIMER_SLACK_PCT = 1,
/* 1/64k is granular enough and can easily be handled w/ u32 */
- HWEIGHT_WHOLE = 1 << 16,
+ WEIGHT_ONE = 1 << 16,
+};
+enum {
/*
* As vtime is used to calculate the cost of each IO, it needs to
* be fairly high precision. For example, it should be able to
@@ -269,25 +258,48 @@ enum {
VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
VRATE_CLAMP_ADJ_PCT = 4,
+ /* switch iff the conditions are met for longer than this */
+ AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
+};
+
+enum {
/* if IOs end up waiting for requests, issue less */
RQ_WAIT_BUSY_PCT = 5,
/* unbusy hysterisis */
UNBUSY_THR_PCT = 75,
- /* don't let cmds which take a very long time pin lagging for too long */
- MAX_LAGGING_PERIODS = 10,
-
/*
- * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
- * donate the surplus.
+ * The effect of delay is indirect and non-linear and a huge amount of
+ * future debt can accumulate abruptly while unthrottled. Linearly scale
+ * up delay as debt is going up and then let it decay exponentially.
+ * This gives us quick ramp ups while delay is accumulating and long
+	 * tails which can help reduce the frequency of debt explosions on
+ * unthrottle. The parameters are experimentally determined.
+ *
+ * The delay mechanism provides adequate protection and behavior in many
+	 * cases. However, this is far from ideal and falls short on both
+ * fronts. The debtors are often throttled too harshly costing a
+ * significant level of fairness and possibly total work while the
+ * protection against their impacts on the system can be choppy and
+ * unreliable.
+ *
+ * The shortcoming primarily stems from the fact that, unlike for page
+ * cache, the kernel doesn't have well-defined back-pressure propagation
+ * mechanism and policies for anonymous memory. Fully addressing this
+ * issue will likely require substantial improvements in the area.
*/
- SURPLUS_SCALE_PCT = 125, /* * 125% */
- SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */
- SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */
+ MIN_DELAY_THR_PCT = 500,
+ MAX_DELAY_THR_PCT = 25000,
+ MIN_DELAY = 250,
+ MAX_DELAY = 250 * USEC_PER_MSEC,
- /* switch iff the conditions are met for longer than this */
- AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC,
+ /* halve debts if avg usage over 100ms is under 50% */
+ DFGV_USAGE_PCT = 50,
+ DFGV_PERIOD = 100 * USEC_PER_MSEC,
+
+ /* don't let cmds which take a very long time pin lagging for too long */
+ MAX_LAGGING_PERIODS = 10,
/*
* Count IO size in 4k pages. The 12bit shift helps keeping
@@ -362,8 +374,6 @@ enum {
AUTOP_SSD_FAST,
};
-struct ioc_gq;
-
struct ioc_params {
u32 qos[NR_QOS_PARAMS];
u64 i_lcoefs[NR_I_LCOEFS];
@@ -372,9 +382,15 @@ struct ioc_params {
u32 too_slow_vrate_pct;
};
+struct ioc_margins {
+ s64 min;
+ s64 low;
+ s64 target;
+};
+
struct ioc_missed {
- u32 nr_met;
- u32 nr_missed;
+ local_t nr_met;
+ local_t nr_missed;
u32 last_met;
u32 last_missed;
};
@@ -382,7 +398,7 @@ struct ioc_missed {
struct ioc_pcpu_stat {
struct ioc_missed missed[2];
- u64 rq_wait_ns;
+ local64_t rq_wait_ns;
u64 last_rq_wait_ns;
};
@@ -393,8 +409,9 @@ struct ioc {
bool enabled;
struct ioc_params params;
+ struct ioc_margins margins;
u32 period_us;
- u32 margin_us;
+ u32 timer_slack_ns;
u64 vrate_min;
u64 vrate_max;
@@ -405,18 +422,24 @@ struct ioc {
enum ioc_running running;
atomic64_t vtime_rate;
+ u64 vtime_base_rate;
+ s64 vtime_err;
- seqcount_t period_seqcount;
- u32 period_at; /* wallclock starttime */
+ seqcount_spinlock_t period_seqcount;
+ u64 period_at; /* wallclock starttime */
u64 period_at_vtime; /* vtime starttime */
atomic64_t cur_period; /* inc'd each period */
int busy_level; /* saturation history */
- u64 inuse_margin_vtime;
bool weights_updated;
atomic_t hweight_gen; /* for lazy hweights */
+	/* debt forgiveness */
+ u64 dfgv_period_at;
+ u64 dfgv_period_rem;
+ u64 dfgv_usage_us_sum;
+
u64 autop_too_fast_at;
u64 autop_too_slow_at;
int autop_idx;
@@ -424,6 +447,17 @@ struct ioc {
bool user_cost_model:1;
};
+struct iocg_pcpu_stat {
+ local64_t abs_vusage;
+};
+
+struct iocg_stat {
+ u64 usage_us;
+ u64 wait_us;
+ u64 indebt_us;
+ u64 indelay_us;
+};
+
/* per device-cgroup pair */
struct ioc_gq {
struct blkg_policy_data pd;
@@ -443,32 +477,37 @@ struct ioc_gq {
*
* `last_inuse` remembers `inuse` while an iocg is idle to persist
* surplus adjustments.
+ *
+ * `inuse` may be adjusted dynamically during period. `saved_*` are used
+ * to determine and track adjustments.
*/
u32 cfg_weight;
u32 weight;
u32 active;
u32 inuse;
+
u32 last_inuse;
+ s64 saved_margin;
sector_t cursor; /* to detect randio */
/*
* `vtime` is this iocg's vtime cursor which progresses as IOs are
* issued. If lagging behind device vtime, the delta represents
- * the currently available IO budget. If runnning ahead, the
+ * the currently available IO budget. If running ahead, the
* overage.
*
* `vtime_done` is the same but progressed on completion rather
* than issue. The delta behind `vtime` represents the cost of
* currently in-flight IOs.
- *
- * `last_vtime` is used to remember `vtime` at the end of the last
- * period to calculate utilization.
*/
atomic64_t vtime;
atomic64_t done_vtime;
u64 abs_vdebt;
- u64 last_vtime;
+
+ /* current delay in effect and when it started */
+ u64 delay;
+ u64 delay_at;
/*
* The period this iocg was last active in. Used for deactivation
@@ -477,21 +516,34 @@ struct ioc_gq {
atomic64_t active_period;
struct list_head active_list;
- /* see __propagate_active_weight() and current_hweight() for details */
+ /* see __propagate_weights() and current_hweight() for details */
u64 child_active_sum;
u64 child_inuse_sum;
+ u64 child_adjusted_sum;
int hweight_gen;
u32 hweight_active;
u32 hweight_inuse;
- bool has_surplus;
+ u32 hweight_donating;
+ u32 hweight_after_donation;
+
+ struct list_head walk_list;
+ struct list_head surplus_list;
struct wait_queue_head waitq;
struct hrtimer waitq_timer;
- struct hrtimer delay_timer;
- /* usage is recorded as fractions of HWEIGHT_WHOLE */
- int usage_idx;
- u32 usages[NR_USAGE_SLOTS];
+ /* timestamp at the latest activation */
+ u64 activated_at;
+
+ /* statistics */
+ struct iocg_pcpu_stat __percpu *pcpu_stat;
+ struct iocg_stat stat;
+ struct iocg_stat last_stat;
+ u64 last_stat_abs_vusage;
+ u64 usage_delta_us;
+ u64 wait_since;
+ u64 indebt_since;
+ u64 indelay_since;
/* this iocg's depth in the hierarchy and ancestors including self */
int level;
@@ -506,9 +558,8 @@ struct ioc_cgrp {
struct ioc_now {
u64 now_ns;
- u32 now;
+ u64 now;
u64 vnow;
- u64 vrate;
};
struct iocg_wait {
@@ -616,17 +667,13 @@ static struct ioc *q_to_ioc(struct request_queue *q)
return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
}
-static const char *q_name(struct request_queue *q)
-{
- if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
- return kobject_name(q->kobj.parent);
- else
- return "<unknown>";
-}
-
static const char __maybe_unused *ioc_name(struct ioc *ioc)
{
- return q_name(ioc->rqos.q);
+ struct gendisk *disk = ioc->rqos.disk;
+
+ if (!disk)
+ return "<unknown>";
+ return disk->disk_name;
}
static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
@@ -656,7 +703,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
*/
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
+ return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
}
/*
@@ -664,18 +711,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
*/
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
- return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
+ return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
}
-static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
+static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
+ u64 abs_cost, u64 cost)
{
+ struct iocg_pcpu_stat *gcs;
+
bio->bi_iocost_cost = cost;
atomic64_add(cost, &iocg->vtime);
+
+ gcs = get_cpu_ptr(iocg->pcpu_stat);
+ local64_add(abs_cost, &gcs->abs_vusage);
+ put_cpu_ptr(gcs);
+}
+
+static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
+{
+ if (lock_ioc) {
+ spin_lock_irqsave(&iocg->ioc->lock, *flags);
+ spin_lock(&iocg->waitq.lock);
+ } else {
+ spin_lock_irqsave(&iocg->waitq.lock, *flags);
+ }
+}
+
+static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
+{
+ if (unlock_ioc) {
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
+ } else {
+ spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
+ }
}
#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>
+static void ioc_refresh_margins(struct ioc *ioc)
+{
+ struct ioc_margins *margins = &ioc->margins;
+ u32 period_us = ioc->period_us;
+ u64 vrate = ioc->vtime_base_rate;
+
+ margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
+ margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
+ margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
+}
+
/* latency Qos params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{
@@ -709,12 +794,17 @@ static void ioc_refresh_period_us(struct ioc *ioc)
/* calculate dependent params */
ioc->period_us = period_us;
- ioc->margin_us = period_us * MARGIN_PCT / 100;
- ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
- period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
+ ioc->timer_slack_ns = div64_u64(
+ (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
+ 100);
+ ioc_refresh_margins(ioc);
}
-static int ioc_autop_idx(struct ioc *ioc)
+/*
+ * ioc->rqos.disk isn't initialized when this function is called from
+ * the init path.
+ */
+static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
{
int idx = ioc->autop_idx;
const struct ioc_params *p = &autop[idx];
@@ -722,11 +812,11 @@ static int ioc_autop_idx(struct ioc *ioc)
u64 now_ns;
/* rotational? */
- if (!blk_queue_nonrot(ioc->rqos.q))
+ if (!blk_queue_nonrot(disk->queue))
return AUTOP_HDD;
/* handle SATA SSDs w/ broken NCQ */
- if (blk_queue_depth(ioc->rqos.q) == 1)
+ if (blk_queue_depth(disk->queue) == 1)
return AUTOP_SSD_QD1;
/* use one of the normal ssd sets */
@@ -738,8 +828,7 @@ static int ioc_autop_idx(struct ioc *ioc)
return idx;
/* step up/down based on the vrate */
- vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
- VTIME_PER_USEC);
+ vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
now_ns = ktime_get_ns();
if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
@@ -783,9 +872,14 @@ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
*page = *seqio = *randio = 0;
- if (bps)
- *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
- DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
+ if (bps) {
+ u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
+
+ if (bps_pages)
+ *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
+ else
+ *page = 1;
+ }
if (seqiops) {
v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
@@ -811,21 +905,28 @@ static void ioc_refresh_lcoefs(struct ioc *ioc)
&c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
}
-static bool ioc_refresh_params(struct ioc *ioc, bool force)
+/*
+ * struct gendisk is required as an argument because ioc->rqos.disk
+ * is not properly initialized when called from the init path.
+ */
+static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
+ struct gendisk *disk)
{
const struct ioc_params *p;
int idx;
lockdep_assert_held(&ioc->lock);
- idx = ioc_autop_idx(ioc);
+ idx = ioc_autop_idx(ioc, disk);
p = &autop[idx];
if (idx == ioc->autop_idx && !force)
return false;
- if (idx != ioc->autop_idx)
+ if (idx != ioc->autop_idx) {
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
+ ioc->vtime_base_rate = VTIME_PER_USEC;
+ }
ioc->autop_idx = idx;
ioc->autop_too_fast_at = 0;
@@ -841,20 +942,111 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force)
ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
VTIME_PER_USEC, MILLION);
- ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
- VTIME_PER_USEC, MILLION);
+ ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
+ VTIME_PER_USEC, MILLION);
return true;
}
+static bool ioc_refresh_params(struct ioc *ioc, bool force)
+{
+ return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
+}
+
+/*
+ * When an iocg accumulates too much vtime or gets deactivated, we throw away
+ * some vtime, which lowers the overall device utilization. As the exact amount
+ * which is being thrown away is known, we can compensate by accelerating the
+ * vrate accordingly so that the extra vtime generated in the current period
+ * matches what got lost.
+ */
+static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
+{
+ s64 pleft = ioc->period_at + ioc->period_us - now->now;
+ s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
+ s64 vcomp, vcomp_min, vcomp_max;
+
+ lockdep_assert_held(&ioc->lock);
+
+ /* we need some time left in this period */
+ if (pleft <= 0)
+ goto done;
+
+ /*
+ * Calculate how much vrate should be adjusted to offset the error.
+ * Limit the amount of adjustment and deduct the adjusted amount from
+ * the error.
+ */
+ vcomp = -div64_s64(ioc->vtime_err, pleft);
+ vcomp_min = -(ioc->vtime_base_rate >> 1);
+ vcomp_max = ioc->vtime_base_rate;
+ vcomp = clamp(vcomp, vcomp_min, vcomp_max);
+
+ ioc->vtime_err += vcomp * pleft;
+
+ atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
+done:
+ /* bound how much error can accumulate */
+ ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
+}
+
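
A hedged, standalone sketch of the compensation idea implemented by ioc_refresh_vrate() above: the accumulated vtime error is spread over the time left in the period, clamped so that the effective rate stays within [base/2, 2*base], and whatever could not be compensated is carried forward. The helper below is a simplification with invented numbers, not the kernel function.

/* Illustrative only. */
#include <stdint.h>
#include <stdio.h>

static int64_t clamp_s64(int64_t v, int64_t lo, int64_t hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Returns the adjusted rate and leaves the uncompensated remainder in
 * *vtime_err (negative err means vtime was thrown away earlier). */
static uint64_t compensated_vrate(uint64_t base_rate, int64_t *vtime_err,
				  int64_t period_left_us)
{
	int64_t vcomp, vcomp_min, vcomp_max;

	if (period_left_us <= 0)
		return base_rate;

	vcomp = -(*vtime_err) / period_left_us;
	vcomp_min = -((int64_t)base_rate >> 1);	/* never below half speed */
	vcomp_max = base_rate;			/* never above double speed */
	vcomp = clamp_s64(vcomp, vcomp_min, vcomp_max);

	*vtime_err += vcomp * period_left_us;
	return base_rate + vcomp;
}

int main(void)
{
	int64_t err = -2000000;	/* vtime thrown away earlier in the period */
	uint64_t rate = compensated_vrate(1000, &err, 1000);

	printf("adjusted rate=%llu, remaining err=%lld\n",
	       (unsigned long long)rate, (long long)err);
	return 0;
}
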
+static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
+ int nr_lagging, int nr_shortages,
+ int prev_busy_level, u32 *missed_ppm)
+{
+ u64 vrate = ioc->vtime_base_rate;
+ u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
+
+ if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
+ if (ioc->busy_level != prev_busy_level || nr_lagging)
+ trace_iocost_ioc_vrate_adj(ioc, vrate,
+ missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages);
+
+ return;
+ }
+
+ /*
+ * If vrate is out of bounds, apply clamp gradually as the
+ * bounds can change abruptly. Otherwise, apply busy_level
+ * based adjustment.
+ */
+ if (vrate < vrate_min) {
+ vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
+ vrate = min(vrate, vrate_min);
+ } else if (vrate > vrate_max) {
+ vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
+ vrate = max(vrate, vrate_max);
+ } else {
+ int idx = min_t(int, abs(ioc->busy_level),
+ ARRAY_SIZE(vrate_adj_pct) - 1);
+ u32 adj_pct = vrate_adj_pct[idx];
+
+ if (ioc->busy_level > 0)
+ adj_pct = 100 - adj_pct;
+ else
+ adj_pct = 100 + adj_pct;
+
+ vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
+ vrate_min, vrate_max);
+ }
+
+ trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
+ nr_lagging, nr_shortages);
+
+ ioc->vtime_base_rate = vrate;
+ ioc_refresh_margins(ioc);
+}
+
/* take a snapshot of the current [v]time and vrate */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
unsigned seq;
+ u64 vrate;
now->now_ns = ktime_get();
now->now = ktime_to_us(now->now_ns);
- now->vrate = atomic64_read(&ioc->vtime_rate);
+ vrate = atomic64_read(&ioc->vtime_rate);
/*
* The current vtime is
@@ -867,13 +1059,12 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now)
do {
seq = read_seqcount_begin(&ioc->period_seqcount);
now->vnow = ioc->period_at_vtime +
- (now->now - ioc->period_at) * now->vrate;
+ (now->now - ioc->period_at) * vrate;
} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
{
- lockdep_assert_held(&ioc->lock);
WARN_ON_ONCE(ioc->running != IOC_RUNNING);
write_seqcount_begin(&ioc->period_seqcount);
@@ -887,16 +1078,35 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
/*
* Update @iocg's `active` and `inuse` to @active and @inuse, update level
- * weight sums and propagate upwards accordingly.
+ * weight sums and propagate upwards accordingly. If @save, the current margin
+ * is saved to be used as reference for later inuse in-period adjustments.
*/
-static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
+ bool save, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
int lvl;
lockdep_assert_held(&ioc->lock);
- inuse = min(active, inuse);
+ /*
+ * For an active leaf node, its inuse shouldn't be zero or exceed
+ * @active. An active internal node's inuse is solely determined by the
+ * inuse to active ratio of its children regardless of @inuse.
+ */
+ if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
+ inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
+ iocg->child_active_sum);
+ } else {
+ inuse = clamp_t(u32, inuse, 1, active);
+ }
+
+ iocg->last_inuse = iocg->inuse;
+ if (save)
+ iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
+
+ if (active == iocg->active && inuse == iocg->inuse)
+ return;
for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
struct ioc_gq *parent = iocg->ancestors[lvl];
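
A small hypothetical sketch of the inuse rules spelled out in the comment in the hunk above: a leaf's inuse is clamped to [1, active], while an inner node's inuse simply follows its children's inuse/active ratio regardless of what was requested. Standalone and simplified; this is not __propagate_weights() itself.

/* Illustrative only. */
#include <stdint.h>
#include <stdio.h>

static uint32_t clamp_u32(uint32_t v, uint32_t lo, uint32_t hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

static uint32_t effective_inuse(int is_inner, uint32_t active, uint32_t inuse,
				uint64_t child_active_sum,
				uint64_t child_inuse_sum)
{
	if (is_inner && child_active_sum)
		/* inner node: inherit the children's inuse/active ratio,
		 * rounded up */
		return (uint32_t)(((uint64_t)active * child_inuse_sum +
				   child_active_sum - 1) / child_active_sum);

	/* leaf: never zero, never above active */
	return clamp_u32(inuse, 1, active);
}

int main(void)
{
	printf("leaf asking for 0:   %u\n", effective_inuse(0, 100, 0, 0, 0));
	printf("inner, children 1/4: %u\n",
	       effective_inuse(1, 100, 100, 200, 50));
	return 0;
}
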
@@ -906,13 +1116,13 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse
/* update the level sums */
parent->child_active_sum += (s32)(active - child->active);
parent->child_inuse_sum += (s32)(inuse - child->inuse);
- /* apply the udpates */
+ /* apply the updates */
child->active = active;
child->inuse = inuse;
/*
* The delta between inuse and active sums indicates that
- * that much of weight is being given away. Parent's inuse
+ * much of weight is being given away. Parent's inuse
* and active should reflect the ratio.
*/
if (parent->child_active_sum) {
@@ -934,7 +1144,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse
ioc->weights_updated = true;
}
-static void commit_active_weights(struct ioc *ioc)
+static void commit_weights(struct ioc *ioc)
{
lockdep_assert_held(&ioc->lock);
@@ -946,10 +1156,11 @@ static void commit_active_weights(struct ioc *ioc)
}
}
-static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
+static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
+ bool save, struct ioc_now *now)
{
- __propagate_active_weight(iocg, active, inuse);
- commit_active_weights(iocg->ioc);
+ __propagate_weights(iocg, active, inuse, save, now);
+ commit_weights(iocg->ioc);
}
static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
@@ -965,9 +1176,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
goto out;
/*
- * Paired with wmb in commit_active_weights(). If we saw the
- * updated hweight_gen, all the weight updates from
- * __propagate_active_weight() are visible too.
+ * Paired with wmb in commit_weights(). If we saw the updated
+ * hweight_gen, all the weight updates from __propagate_weights() are
+ * visible too.
*
* We can race with weight updates during calculation and get it
* wrong. However, hweight_gen would have changed and a future
@@ -976,12 +1187,12 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
*/
smp_rmb();
- hwa = hwi = HWEIGHT_WHOLE;
+ hwa = hwi = WEIGHT_ONE;
for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
struct ioc_gq *parent = iocg->ancestors[lvl];
struct ioc_gq *child = iocg->ancestors[lvl + 1];
- u32 active_sum = READ_ONCE(parent->child_active_sum);
- u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
+ u64 active_sum = READ_ONCE(parent->child_active_sum);
+ u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
u32 active = READ_ONCE(child->active);
u32 inuse = READ_ONCE(child->inuse);
@@ -989,11 +1200,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep
if (!active_sum || !inuse_sum)
continue;
- active_sum = max(active, active_sum);
- hwa = hwa * active / active_sum; /* max 16bits * 10000 */
+ active_sum = max_t(u64, active, active_sum);
+ hwa = div64_u64((u64)hwa * active, active_sum);
- inuse_sum = max(inuse, inuse_sum);
- hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */
+ inuse_sum = max_t(u64, inuse, inuse_sum);
+ hwi = div64_u64((u64)hwi * inuse, inuse_sum);
}
iocg->hweight_active = max_t(u32, hwa, 1);
@@ -1006,7 +1217,33 @@ out:
*hw_inusep = iocg->hweight_inuse;
}
-static void weight_updated(struct ioc_gq *iocg)
+/*
+ * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
+ * other weights stay unchanged.
+ */
+static u32 current_hweight_max(struct ioc_gq *iocg)
+{
+ u32 hwm = WEIGHT_ONE;
+ u32 inuse = iocg->active;
+ u64 child_inuse_sum;
+ int lvl;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ struct ioc_gq *parent = iocg->ancestors[lvl];
+ struct ioc_gq *child = iocg->ancestors[lvl + 1];
+
+ child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
+ hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
+ inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
+ parent->child_active_sum);
+ }
+
+ return max_t(u32, hwm, 1);
+}
+
+static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct blkcg_gq *blkg = iocg_to_blkg(iocg);
@@ -1017,16 +1254,15 @@ static void weight_updated(struct ioc_gq *iocg)
weight = iocg->cfg_weight ?: iocc->dfl_weight;
if (weight != iocg->weight && iocg->active)
- propagate_active_weight(iocg, weight,
- DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
+ propagate_weights(iocg, weight, iocg->inuse, true, now);
iocg->weight = weight;
}
static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
- u64 last_period, cur_period, max_period_delta;
- u64 vtime, vmargin, vmin;
+ u64 __maybe_unused last_period, cur_period;
+ u64 vtime, vtarget;
int i;
/*
@@ -1065,22 +1301,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
goto fail_unlock;
/*
- * vtime may wrap when vrate is raised substantially due to
- * underestimated IO costs. Look at the period and ignore its
- * vtime if the iocg has been idle for too long. Also, cap the
- * budget it can start with to the margin.
+ * Always start with the target budget. On deactivation, we throw away
+ * anything above it.
*/
- max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
+ vtarget = now->vnow - ioc->margins.target;
vtime = atomic64_read(&iocg->vtime);
- vmargin = ioc->margin_us * now->vrate;
- vmin = now->vnow - vmargin;
- if (last_period + max_period_delta < cur_period ||
- time_before64(vtime, vmin)) {
- atomic64_add(vmin - vtime, &iocg->vtime);
- atomic64_add(vmin - vtime, &iocg->done_vtime);
- vtime = vmin;
- }
+ atomic64_add(vtarget - vtime, &iocg->vtime);
+ atomic64_add(vtarget - vtime, &iocg->done_vtime);
+ vtime = vtarget;
/*
* Activate, propagate weight and start period timer if not
@@ -1089,16 +1318,19 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
*/
iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
list_add(&iocg->active_list, &ioc->active_iocgs);
- propagate_active_weight(iocg, iocg->weight,
- iocg->last_inuse ?: iocg->weight);
+
+ propagate_weights(iocg, iocg->weight,
+ iocg->last_inuse ?: iocg->weight, true, now);
TRACE_IOCG_PATH(iocg_activate, iocg, now,
last_period, cur_period, vtime);
- iocg->last_vtime = vtime;
+ iocg->activated_at = now->now;
if (ioc->running == IOC_IDLE) {
ioc->running = IOC_RUNNING;
+ ioc->dfgv_period_at = now->now;
+ ioc->dfgv_period_rem = 0;
ioc_start_period(ioc, now);
}
@@ -1111,11 +1343,122 @@ fail_unlock:
return false;
}
+static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct blkcg_gq *blkg = iocg_to_blkg(iocg);
+ u64 tdelta, delay, new_delay;
+ s64 vover, vover_pct;
+ u32 hwa;
+
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /*
+ * If the delay is set by another CPU, we may be in the past. No need to
+ * change anything if so. This avoids decay calculation underflow.
+ */
+ if (time_before64(now->now, iocg->delay_at))
+ return false;
+
+ /* calculate the current delay in effect - 1/2 every second */
+ tdelta = now->now - iocg->delay_at;
+ if (iocg->delay)
+ delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
+ else
+ delay = 0;
+
+ /* calculate the new delay from the debt amount */
+ current_hweight(iocg, &hwa, NULL);
+ vover = atomic64_read(&iocg->vtime) +
+ abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
+ vover_pct = div64_s64(100 * vover,
+ ioc->period_us * ioc->vtime_base_rate);
+
+ if (vover_pct <= MIN_DELAY_THR_PCT)
+ new_delay = 0;
+ else if (vover_pct >= MAX_DELAY_THR_PCT)
+ new_delay = MAX_DELAY;
+ else
+ new_delay = MIN_DELAY +
+ div_u64((MAX_DELAY - MIN_DELAY) *
+ (vover_pct - MIN_DELAY_THR_PCT),
+ MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
+
+ /* pick the higher one and apply */
+ if (new_delay > delay) {
+ iocg->delay = new_delay;
+ iocg->delay_at = now->now;
+ delay = new_delay;
+ }
+
+ if (delay >= MIN_DELAY) {
+ if (!iocg->indelay_since)
+ iocg->indelay_since = now->now;
+ blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
+ return true;
+ } else {
+ if (iocg->indelay_since) {
+ iocg->stat.indelay_us += now->now - iocg->indelay_since;
+ iocg->indelay_since = 0;
+ }
+ iocg->delay = 0;
+ blkcg_clear_delay(blkg);
+ return false;
+ }
+}
+
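
A rough standalone sketch of the two halves of iocg_kick_delay() above: the delay currently in effect halves for every elapsed second, and a fresh delay is linearly interpolated from how far the debt-adjusted vtime overshoots the device vtime; the larger of the two is applied. The constants mirror MIN_DELAY/MAX_DELAY and the *_THR_PCT thresholds, but the code is an illustration, not the kernel implementation.

/* Illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define USEC_PER_SEC		1000000ULL
#define MIN_DELAY_THR_PCT	500
#define MAX_DELAY_THR_PCT	25000
#define MIN_DELAY		250ULL			/* usecs */
#define MAX_DELAY		(250ULL * 1000)		/* usecs */

/* the delay in effect decays by half for every elapsed second */
static uint64_t decayed_delay(uint64_t delay, uint64_t elapsed_us)
{
	uint64_t secs = elapsed_us / USEC_PER_SEC;

	return (delay && secs < 64) ? delay >> secs : 0;
}

/* map the vtime overage (in percent of one period) to a delay */
static uint64_t delay_from_overage(int64_t vover_pct)
{
	if (vover_pct <= MIN_DELAY_THR_PCT)
		return 0;
	if (vover_pct >= MAX_DELAY_THR_PCT)
		return MAX_DELAY;
	return MIN_DELAY + (MAX_DELAY - MIN_DELAY) *
		(uint64_t)(vover_pct - MIN_DELAY_THR_PCT) /
		(MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
}

int main(void)
{
	uint64_t old = decayed_delay(8000, 2 * USEC_PER_SEC);	/* -> 2000 */
	uint64_t fresh = delay_from_overage(5000);		/* ~46ms */

	printf("decayed=%llu fresh=%llu applied=%llu\n",
	       (unsigned long long)old, (unsigned long long)fresh,
	       (unsigned long long)(fresh > old ? fresh : old));
	return 0;
}
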
+static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
+ struct ioc_now *now)
+{
+ struct iocg_pcpu_stat *gcs;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+ lockdep_assert_held(&iocg->waitq.lock);
+ WARN_ON_ONCE(list_empty(&iocg->active_list));
+
+ /*
+ * Once in debt, debt handling owns inuse. @iocg stays at the minimum
+	 * inuse, donating all of its share to others until its debt is paid off.
+ */
+ if (!iocg->abs_vdebt && abs_cost) {
+ iocg->indebt_since = now->now;
+ propagate_weights(iocg, iocg->active, 0, false, now);
+ }
+
+ iocg->abs_vdebt += abs_cost;
+
+ gcs = get_cpu_ptr(iocg->pcpu_stat);
+ local64_add(abs_cost, &gcs->abs_vusage);
+ put_cpu_ptr(gcs);
+}
+
+static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
+ struct ioc_now *now)
+{
+ lockdep_assert_held(&iocg->ioc->lock);
+ lockdep_assert_held(&iocg->waitq.lock);
+
+ /* make sure that nobody messed with @iocg */
+ WARN_ON_ONCE(list_empty(&iocg->active_list));
+ WARN_ON_ONCE(iocg->inuse > 1);
+
+ iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
+
+ /* if debt is paid in full, restore inuse */
+ if (!iocg->abs_vdebt) {
+ iocg->stat.indebt_us += now->now - iocg->indebt_since;
+ iocg->indebt_since = 0;
+
+ propagate_weights(iocg, iocg->active, iocg->last_inuse,
+ false, now);
+ }
+}
+
static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
int flags, void *key)
{
struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
- struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
+ struct iocg_wake_ctx *ctx = key;
u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
ctx->vbudget -= cost;
@@ -1123,146 +1466,121 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
if (ctx->vbudget < 0)
return -1;
- iocg_commit_bio(ctx->iocg, wait->bio, cost);
+ iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
+ wait->committed = true;
/*
* autoremove_wake_function() removes the wait entry only when it
- * actually changed the task state. We want the wait always
- * removed. Remove explicitly and use default_wake_function().
+ * actually changed the task state. We want the wait always removed.
+ * Remove explicitly and use default_wake_function(). Note that the
+ * order of operations is important as finish_wait() tests whether
+ * @wq_entry is removed without grabbing the lock.
*/
- list_del_init(&wq_entry->entry);
- wait->committed = true;
-
default_wake_function(wq_entry, mode, flags, key);
+ list_del_init_careful(&wq_entry->entry);
return 0;
}
-static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
+/*
+ * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
+ * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
+ * addition to iocg->waitq.lock.
+ */
+static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
+ struct ioc_now *now)
{
struct ioc *ioc = iocg->ioc;
struct iocg_wake_ctx ctx = { .iocg = iocg };
- u64 margin_ns = (u64)(ioc->period_us *
- WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
- u64 vdebt, vshortage, expires, oexpires;
+ u64 vshortage, expires, oexpires;
s64 vbudget;
- u32 hw_inuse;
+ u32 hwa;
lockdep_assert_held(&iocg->waitq.lock);
- current_hweight(iocg, NULL, &hw_inuse);
+ current_hweight(iocg, &hwa, NULL);
vbudget = now->vnow - atomic64_read(&iocg->vtime);
/* pay off debt */
- vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
- if (vdebt && vbudget > 0) {
- u64 delta = min_t(u64, vbudget, vdebt);
- u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
- iocg->abs_vdebt);
+ if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
+ u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
+ u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
+ u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
+
+ lockdep_assert_held(&ioc->lock);
- atomic64_add(delta, &iocg->vtime);
- atomic64_add(delta, &iocg->done_vtime);
- iocg->abs_vdebt -= abs_delta;
+ atomic64_add(vpay, &iocg->vtime);
+ atomic64_add(vpay, &iocg->done_vtime);
+ iocg_pay_debt(iocg, abs_vpay, now);
+ vbudget -= vpay;
}
+ if (iocg->abs_vdebt || iocg->delay)
+ iocg_kick_delay(iocg, now);
+
/*
- * Wake up the ones which are due and see how much vtime we'll need
- * for the next one.
+ * Debt can still be outstanding if we haven't paid all yet or the
+ * caller raced and called without @pay_debt. Shouldn't wake up waiters
+ * under debt. Make sure @vbudget reflects the outstanding amount and is
+ * not positive.
*/
- ctx.hw_inuse = hw_inuse;
- ctx.vbudget = vbudget - vdebt;
+ if (iocg->abs_vdebt) {
+ s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
+ vbudget = min_t(s64, 0, vbudget - vdebt);
+ }
+
+ /*
+ * Wake up the ones which are due and see how much vtime we'll need for
+ * the next one. As paying off debt restores hw_inuse, it must be read
+ * after the above debt payment.
+ */
+ ctx.vbudget = vbudget;
+ current_hweight(iocg, NULL, &ctx.hw_inuse);
+
__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
- if (!waitqueue_active(&iocg->waitq))
+
+ if (!waitqueue_active(&iocg->waitq)) {
+ if (iocg->wait_since) {
+ iocg->stat.wait_us += now->now - iocg->wait_since;
+ iocg->wait_since = 0;
+ }
return;
+ }
+
+ if (!iocg->wait_since)
+ iocg->wait_since = now->now;
+
if (WARN_ON_ONCE(ctx.vbudget >= 0))
return;
- /* determine next wakeup, add a quarter margin to guarantee chunking */
+ /* determine next wakeup, add a timer margin to guarantee chunking */
vshortage = -ctx.vbudget;
expires = now->now_ns +
- DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
- expires += margin_ns / 4;
+ DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
+ NSEC_PER_USEC;
+ expires += ioc->timer_slack_ns;
/* if already active and close enough, don't bother */
oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
if (hrtimer_is_queued(&iocg->waitq_timer) &&
- abs(oexpires - expires) <= margin_ns / 4)
+ abs(oexpires - expires) <= ioc->timer_slack_ns)
return;
hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
- margin_ns / 4, HRTIMER_MODE_ABS);
+ ioc->timer_slack_ns, HRTIMER_MODE_ABS);
}
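
For illustration, a standalone sketch of the waitq accounting performed by iocg_wake_fn() and iocg_kick_waitq() above, under the simplifying assumption that waiter costs are already expressed in vtime units: waiters are charged in FIFO order until the budget goes negative, the one that overdraws stays queued, and the resulting shortage is what the wait timer has to cover. Not the kernel code.

/* Illustrative only. */
#include <stdint.h>
#include <stdio.h>

/* Charge costs[0..n) against *vbudget in order; returns how many waiters were
 * woken and leaves the (possibly negative) remaining budget in *vbudget. */
static int wake_due(const int64_t *costs, int n, int64_t *vbudget)
{
	int i, woken = 0;

	for (i = 0; i < n; i++) {
		*vbudget -= costs[i];
		if (*vbudget < 0)
			break;		/* this waiter stays queued */
		woken++;
	}
	return woken;
}

int main(void)
{
	int64_t costs[] = { 100, 300, 500 };
	int64_t vbudget = 450;
	int woken = wake_due(costs, 3, &vbudget);

	/* the shortage drives when the waitq timer should fire next */
	printf("woken=%d shortage=%lld\n", woken, (long long)-vbudget);
	return 0;
}
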
static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
{
struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
+ bool pay_debt = READ_ONCE(iocg->abs_vdebt);
struct ioc_now now;
unsigned long flags;
ioc_now(iocg->ioc, &now);
- spin_lock_irqsave(&iocg->waitq.lock, flags);
- iocg_kick_waitq(iocg, &now);
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
-
- return HRTIMER_NORESTART;
-}
-
-static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
-{
- struct ioc *ioc = iocg->ioc;
- struct blkcg_gq *blkg = iocg_to_blkg(iocg);
- u64 vtime = atomic64_read(&iocg->vtime);
- u64 vmargin = ioc->margin_us * now->vrate;
- u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
- u64 delta_ns, expires, oexpires;
- u32 hw_inuse;
-
- lockdep_assert_held(&iocg->waitq.lock);
-
- /* debt-adjust vtime */
- current_hweight(iocg, NULL, &hw_inuse);
- vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
-
- /*
- * Clear or maintain depending on the overage. Non-zero vdebt is what
- * guarantees that @iocg is online and future iocg_kick_delay() will
- * clear use_delay. Don't leave it on when there's no vdebt.
- */
- if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
- blkcg_clear_delay(blkg);
- return false;
- }
- if (!atomic_read(&blkg->use_delay) &&
- time_before_eq64(vtime, now->vnow + vmargin))
- return false;
-
- /* use delay */
- delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
- now->vrate) * NSEC_PER_USEC;
- blkcg_set_delay(blkg, delta_ns);
- expires = now->now_ns + delta_ns;
-
- /* if already active and close enough, don't bother */
- oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
- if (hrtimer_is_queued(&iocg->delay_timer) &&
- abs(oexpires - expires) <= margin_ns / 4)
- return true;
-
- hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
- margin_ns / 4, HRTIMER_MODE_ABS);
- return true;
-}
-
-static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
-{
- struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
- struct ioc_now now;
- unsigned long flags;
-
- spin_lock_irqsave(&iocg->waitq.lock, flags);
- ioc_now(iocg->ioc, &now);
- iocg_kick_delay(iocg, &now);
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+ iocg_lock(iocg, pay_debt, &flags);
+ iocg_kick_waitq(iocg, pay_debt, &now);
+ iocg_unlock(iocg, pay_debt, &flags);
return HRTIMER_NORESTART;
}
@@ -1279,8 +1597,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
u64 this_rq_wait_ns;
for (rw = READ; rw <= WRITE; rw++) {
- u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
- u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
+ u32 this_met = local_read(&stat->missed[rw].nr_met);
+ u32 this_missed = local_read(&stat->missed[rw].nr_missed);
nr_met[rw] += this_met - stat->missed[rw].last_met;
nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
@@ -1288,7 +1606,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p
stat->missed[rw].last_missed = this_missed;
}
- this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
+ this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
stat->last_rq_wait_ns = this_rq_wait_ns;
}
@@ -1323,78 +1641,634 @@ static bool iocg_is_idle(struct ioc_gq *iocg)
return true;
}
-/* returns usage with margin added if surplus is large enough */
-static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
+/*
+ * Call this function on the target leaf @iocg's to build pre-order traversal
+ * list of all the ancestors in @inner_walk. The inner nodes are linked through
+ * ->walk_list and the caller is responsible for dissolving the list after use.
+ */
+static void iocg_build_inner_walk(struct ioc_gq *iocg,
+ struct list_head *inner_walk)
{
- /* add margin */
- usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
- usage += SURPLUS_SCALE_ABS;
+ int lvl;
- /* don't bother if the surplus is too small */
- if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
- return 0;
+ WARN_ON_ONCE(!list_empty(&iocg->walk_list));
- return usage;
+ /* find the first ancestor which hasn't been visited yet */
+ for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
+ if (!list_empty(&iocg->ancestors[lvl]->walk_list))
+ break;
+ }
+
+ /* walk down and visit the inner nodes to get pre-order traversal */
+ while (++lvl <= iocg->level - 1) {
+ struct ioc_gq *inner = iocg->ancestors[lvl];
+
+ /* record traversal order */
+ list_add_tail(&inner->walk_list, inner_walk);
+ }
}
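
A minimal standalone analogue of iocg_build_inner_walk() above, assuming a toy node type whose queued flag stands in for the kernel's !list_empty(&walk_list) test: for each leaf, skip the ancestors that an earlier leaf already queued, then append the remaining ones top-down so the combined list comes out in pre-order. Illustrative only.

/* Illustrative only. */
#include <stdbool.h>
#include <stdio.h>

#define MAX_WALK	8

struct node {
	const char *name;
	bool queued;		/* stands in for !list_empty(&walk_list) */
};

static void build_inner_walk(struct node **ancestors, int level,
			     struct node **walk, int *nr_walk)
{
	int lvl;

	/* find the deepest ancestor that has already been visited */
	for (lvl = level - 1; lvl >= 0; lvl--)
		if (ancestors[lvl]->queued)
			break;

	/* append the not-yet-visited inner nodes top-down (pre-order) */
	while (++lvl <= level - 1) {
		ancestors[lvl]->queued = true;
		walk[(*nr_walk)++] = ancestors[lvl];
	}
}

int main(void)
{
	struct node root = { "root" }, a = { "A" }, b = { "B" };
	struct node *leaf_a0[] = { &root, &a };	/* ancestors of leaf A0 */
	struct node *leaf_b0[] = { &root, &b };	/* ancestors of leaf B0 */
	struct node *walk[MAX_WALK];
	int i, nr = 0;

	build_inner_walk(leaf_a0, 2, walk, &nr);	/* queues root, A */
	build_inner_walk(leaf_b0, 2, walk, &nr);	/* queues only B */

	for (i = 0; i < nr; i++)
		printf("%s\n", walk[i]->name);	/* root, A, B */
	return 0;
}
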
-static void ioc_timer_fn(struct timer_list *timer)
+/* propagate the deltas to the parent */
+static void iocg_flush_stat_upward(struct ioc_gq *iocg)
{
- struct ioc *ioc = container_of(timer, struct ioc, timer);
+ if (iocg->level > 0) {
+ struct iocg_stat *parent_stat =
+ &iocg->ancestors[iocg->level - 1]->stat;
+
+ parent_stat->usage_us +=
+ iocg->stat.usage_us - iocg->last_stat.usage_us;
+ parent_stat->wait_us +=
+ iocg->stat.wait_us - iocg->last_stat.wait_us;
+ parent_stat->indebt_us +=
+ iocg->stat.indebt_us - iocg->last_stat.indebt_us;
+ parent_stat->indelay_us +=
+ iocg->stat.indelay_us - iocg->last_stat.indelay_us;
+ }
+
+ iocg->last_stat = iocg->stat;
+}
+
+/* collect per-cpu counters and propagate the deltas to the parent */
+static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 abs_vusage = 0;
+ u64 vusage_delta;
+ int cpu;
+
+ lockdep_assert_held(&iocg->ioc->lock);
+
+ /* collect per-cpu counters */
+ for_each_possible_cpu(cpu) {
+ abs_vusage += local64_read(
+ per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
+ }
+ vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
+ iocg->last_stat_abs_vusage = abs_vusage;
+
+ iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
+ iocg->stat.usage_us += iocg->usage_delta_us;
+
+ iocg_flush_stat_upward(iocg);
+}
+
+/* get stat counters ready for reading on all active iocgs */
+static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
+{
+ LIST_HEAD(inner_walk);
struct ioc_gq *iocg, *tiocg;
- struct ioc_now now;
- int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
- u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
- u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
- u32 missed_ppm[2], rq_wait_pct;
- u64 period_vtime;
- int prev_busy_level, i;
- /* how were the latencies during the period? */
- ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+ /* flush leaves and build inner node walk list */
+ list_for_each_entry(iocg, target_iocgs, active_list) {
+ iocg_flush_stat_leaf(iocg, now);
+ iocg_build_inner_walk(iocg, &inner_walk);
+ }
- /* take care of active iocgs */
- spin_lock_irq(&ioc->lock);
+ /* keep flushing upwards by walking the inner list backwards */
+ list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
+ iocg_flush_stat_upward(iocg);
+ list_del_init(&iocg->walk_list);
+ }
+}
- ioc_now(ioc, &now);
+/*
+ * Determine what @iocg's hweight_inuse should be after donating unused
+ * capacity. @hwm is the upper bound and used to signal no donation. This
+ * function also throws away @iocg's excess budget.
+ */
+static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
+ u32 usage, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ u64 vtime = atomic64_read(&iocg->vtime);
+ s64 excess, delta, target, new_hwi;
+
+ /* debt handling owns inuse for debtors */
+ if (iocg->abs_vdebt)
+ return 1;
+
+ /* see whether minimum margin requirement is met */
+ if (waitqueue_active(&iocg->waitq) ||
+ time_after64(vtime, now->vnow - ioc->margins.min))
+ return hwm;
+
+ /* throw away excess above target */
+ excess = now->vnow - vtime - ioc->margins.target;
+ if (excess > 0) {
+ atomic64_add(excess, &iocg->vtime);
+ atomic64_add(excess, &iocg->done_vtime);
+ vtime += excess;
+ ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
+ }
- period_vtime = now.vnow - ioc->period_at_vtime;
- if (WARN_ON_ONCE(!period_vtime)) {
- spin_unlock_irq(&ioc->lock);
+ /*
+ * Let's say the distance between iocg's and device's vtimes as a
+ * fraction of period duration is delta. Assuming that the iocg will
+ * consume the usage determined above, we want to determine new_hwi so
+ * that delta equals MARGIN_TARGET at the end of the next period.
+ *
+ * We need to execute usage worth of IOs while spending the sum of the
+ * new budget (1 - MARGIN_TARGET) and the leftover from the last period
+ * (delta):
+ *
+ * usage = (1 - MARGIN_TARGET + delta) * new_hwi
+ *
+ * Therefore, the new_hwi is:
+ *
+ * new_hwi = usage / (1 - MARGIN_TARGET + delta)
+ */
+ delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
+ now->vnow - ioc->period_at_vtime);
+ target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
+ new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
+
+ return clamp_t(s64, new_hwi, 1, hwm);
+}
+
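
A worked, fixed-point version of the new_hwi formula derived in the comment above, with every quantity expressed as a fraction of WEIGHT_ONE and the inputs invented: a group that used 10% of the device and currently sits 25% of a period behind should snap its hweight_inuse back to roughly 13.3% so that it ends the next period exactly MARGIN_TARGET behind.

/* Illustrative only. */
#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE		(1 << 16)
#define MARGIN_TARGET_PCT	50

int main(void)
{
	int64_t usage  = WEIGHT_ONE / 10;	/* used 10% of the device */
	int64_t delta  = WEIGHT_ONE / 4;	/* 25% of a period behind */
	int64_t target = (int64_t)WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
	int64_t new_hwi = (int64_t)WEIGHT_ONE * usage /
			  (WEIGHT_ONE - target + delta);

	printf("new_hwi = %lld (%.1f%% of WEIGHT_ONE)\n",
	       (long long)new_hwi, 100.0 * new_hwi / WEIGHT_ONE);
	return 0;
}
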
+/*
+ * For work-conservation, an iocg which isn't using all of its share should
+ * donate the leftover to other iocgs. There are two ways to achieve this - 1.
+ * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
+ *
+ * #1 is mathematically simpler but has the drawback of requiring synchronous
+ * global hweight_inuse updates when idle iocg's get activated or inuse weights
+ * change due to donation snapbacks as it has the possibility of grossly
+ * overshooting what's allowed by the model and vrate.
+ *
+ * #2 is inherently safe with local operations. The donating iocg can easily
+ * snap back to higher weights when needed without worrying about impacts on
+ * other nodes as the impacts will be inherently correct. This also makes idle
+ * iocg activations safe. The only effect activations have is decreasing
+ * hweight_inuse of others, the right solution to which is for those iocgs to
+ * snap back to higher weights.
+ *
+ * So, we go with #2. The challenge is calculating how each donating iocg's
+ * inuse should be adjusted to achieve the target donation amounts. This is done
+ * using Andy's method described in the following pdf.
+ *
+ * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
+ *
+ * Given the weights and target after-donation hweight_inuse values, Andy's
+ * method determines what the proportional distribution should look like at each
+ * sibling level to maintain the relative relationship between all non-donating
+ * pairs. To roughly summarize, it divides the tree into donating and
+ * non-donating parts, calculates global donation rate which is used to
+ * determine the target hweight_inuse for each node, and then derives per-level
+ * proportions.
+ *
+ * The following pdf shows that global distribution calculated this way can be
+ * achieved by scaling inuse weights of donating leaves and propagating the
+ * adjustments upwards proportionally.
+ *
+ * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
+ *
+ * Combining the above two, we can determine how each leaf iocg's inuse should
+ * be adjusted to achieve the target donation.
+ *
+ * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
+ *
+ * The inline comments use symbols from the last pdf.
+ *
+ * b is the sum of the absolute budgets in the subtree. 1 for the root node.
+ * f is the sum of the absolute budgets of non-donating nodes in the subtree.
+ * t is the sum of the absolute budgets of donating nodes in the subtree.
+ * w is the weight of the node. w = w_f + w_t
+ * w_f is the non-donating portion of w. w_f = w * f / b
+ * w_t is the donating portion of w. w_t = w * t / b
+ * s is the sum of all sibling weights. s = Sum(w) for siblings
+ * s_f and s_t are the non-donating and donating portions of s.
+ *
+ * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
+ * w_pt is the donating portion of the parent's weight and w'_pt the same value
+ * after adjustments. Subscript r denotes the root node's values.
+ */
+static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
+{
+ LIST_HEAD(over_hwa);
+ LIST_HEAD(inner_walk);
+ struct ioc_gq *iocg, *tiocg, *root_iocg;
+ u32 after_sum, over_sum, over_target, gamma;
+
+ /*
+ * It's pretty unlikely but possible for the total sum of
+ * hweight_after_donation values to be higher than WEIGHT_ONE, which will
+ * confuse the following calculations. If such a condition is detected,
+ * scale down everyone over their full share equally to keep the sum below
+ * WEIGHT_ONE.
+ */
+ after_sum = 0;
+ over_sum = 0;
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ u32 hwa;
+
+ current_hweight(iocg, &hwa, NULL);
+ after_sum += iocg->hweight_after_donation;
+
+ if (iocg->hweight_after_donation > hwa) {
+ over_sum += iocg->hweight_after_donation;
+ list_add(&iocg->walk_list, &over_hwa);
+ }
+ }
+
+ if (after_sum >= WEIGHT_ONE) {
+ /*
+ * The delta should be deducted from the over_sum; calculate the
+ * target over_sum value.
+ */
+ u32 over_delta = after_sum - (WEIGHT_ONE - 1);
+ WARN_ON_ONCE(over_sum <= over_delta);
+ over_target = over_sum - over_delta;
+ } else {
+ over_target = 0;
+ }
+
+ list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
+ if (over_target)
+ iocg->hweight_after_donation =
+ div_u64((u64)iocg->hweight_after_donation *
+ over_target, over_sum);
+ list_del_init(&iocg->walk_list);
+ }
+
+ /*
+ * Build pre-order inner node walk list and prepare for donation
+ * adjustment calculations.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ iocg_build_inner_walk(iocg, &inner_walk);
+ }
+
+ root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
+ WARN_ON_ONCE(root_iocg->level > 0);
+
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ iocg->child_adjusted_sum = 0;
+ iocg->hweight_donating = 0;
+ iocg->hweight_after_donation = 0;
+ }
+
+ /*
+ * Propagate the donating budget (b_t) and after donation budget (b'_t)
+ * up the hierarchy.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ parent->hweight_donating += iocg->hweight_donating;
+ parent->hweight_after_donation += iocg->hweight_after_donation;
+ }
+
+ list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
+ if (iocg->level > 0) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ parent->hweight_donating += iocg->hweight_donating;
+ parent->hweight_after_donation += iocg->hweight_after_donation;
+ }
+ }
+
+ /*
+ * Calculate inner hwa's (b) and make sure the donation values are
+ * within the accepted ranges as we're doing low res calculations with
+ * roundups.
+ */
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ if (iocg->level) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+
+ iocg->hweight_active = DIV64_U64_ROUND_UP(
+ (u64)parent->hweight_active * iocg->active,
+ parent->child_active_sum);
+
+ }
+
+ iocg->hweight_donating = min(iocg->hweight_donating,
+ iocg->hweight_active);
+ iocg->hweight_after_donation = min(iocg->hweight_after_donation,
+ iocg->hweight_donating - 1);
+ if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
+ iocg->hweight_donating <= 1 ||
+ iocg->hweight_after_donation == 0)) {
+ pr_warn("iocg: invalid donation weights in ");
+ pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
+ pr_cont(": active=%u donating=%u after=%u\n",
+ iocg->hweight_active, iocg->hweight_donating,
+ iocg->hweight_after_donation);
+ }
+ }
+
+ /*
+ * Calculate the global donation rate (gamma) - the rate to adjust
+ * non-donating budgets by.
+ *
+ * No need to use 64bit multiplication here as the first operand is
+ * guaranteed to be smaller than WEIGHT_ONE (1<<16).
+ *
+ * We know that there are beneficiary nodes and the sum of the donating
+ * hweights can't be whole; however, due to the round-ups during hweight
+ * calculations, root_iocg->hweight_donating might still end up equal to
+ * or greater than whole. Limit the range when calculating the divider.
+ *
+ * gamma = (1 - t_r') / (1 - t_r)
+ */
+ gamma = DIV_ROUND_UP(
+ (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
+ WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
+
+ /*
+ * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
+ * nodes.
+ */
+ list_for_each_entry(iocg, &inner_walk, walk_list) {
+ struct ioc_gq *parent;
+ u32 inuse, wpt, wptp;
+ u64 st, sf;
+
+ if (iocg->level == 0) {
+ /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
+ iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
+ iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
+ WEIGHT_ONE - iocg->hweight_after_donation);
+ continue;
+ }
+
+ parent = iocg->ancestors[iocg->level - 1];
+
+ /* b' = gamma * b_f + b_t' */
+ iocg->hweight_inuse = DIV64_U64_ROUND_UP(
+ (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
+ WEIGHT_ONE) + iocg->hweight_after_donation;
+
+ /* w' = s' * b' / b'_p */
+ inuse = DIV64_U64_ROUND_UP(
+ (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
+ parent->hweight_inuse);
+
+ /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
+ st = DIV64_U64_ROUND_UP(
+ iocg->child_active_sum * iocg->hweight_donating,
+ iocg->hweight_active);
+ sf = iocg->child_active_sum - st;
+ wpt = DIV64_U64_ROUND_UP(
+ (u64)iocg->active * iocg->hweight_donating,
+ iocg->hweight_active);
+ wptp = DIV64_U64_ROUND_UP(
+ (u64)inuse * iocg->hweight_after_donation,
+ iocg->hweight_inuse);
+
+ iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
+ }
+
+ /*
+ * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
+ * we can finally determine leaf adjustments.
+ */
+ list_for_each_entry(iocg, surpluses, surplus_list) {
+ struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
+ u32 inuse;
+
+ /*
+ * In-debt iocgs participated in the donation calculation with
+ * the minimum target hweight_inuse. Configuring inuse
+ * accordingly would work fine but debt handling expects
+ * @iocg->inuse to stay at the minimum and we don't want to
+ * interfere.
+ */
+ if (iocg->abs_vdebt) {
+ WARN_ON_ONCE(iocg->inuse > 1);
+ continue;
+ }
+
+ /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
+ inuse = DIV64_U64_ROUND_UP(
+ parent->child_adjusted_sum * iocg->hweight_after_donation,
+ parent->hweight_inuse);
+
+ TRACE_IOCG_PATH(inuse_transfer, iocg, now,
+ iocg->inuse, inuse,
+ iocg->hweight_inuse,
+ iocg->hweight_after_donation);
+
+ __propagate_weights(iocg, iocg->active, inuse, true, now);
+ }
+
+ /* walk list should be dissolved after use */
+ list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
+ list_del_init(&iocg->walk_list);
+}
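
As a rough illustration of the global donation rate used in transfer_surpluses() above, the following stand-alone sketch computes gamma = (1 - t_r') / (1 - t_r) in the same WEIGHT_ONE fixed point; the donor percentages are invented and the calc_gamma() helper is hypothetical:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE (1u << 16)

/* gamma = (1 - t_r') / (1 - t_r), both expressed in WEIGHT_ONE fixed point */
static uint32_t calc_gamma(uint32_t t_r_prime, uint32_t t_r)
{
	uint64_t num = (uint64_t)(WEIGHT_ONE - t_r_prime) * WEIGHT_ONE;
	uint32_t den = WEIGHT_ONE - (t_r < WEIGHT_ONE - 1 ? t_r : WEIGHT_ONE - 1);

	return (num + den - 1) / den;	/* round up like DIV_ROUND_UP */
}

int main(void)
{
	/* donors currently hold 30% of the device and keep 10% after donating */
	uint32_t t_r  = WEIGHT_ONE * 30 / 100;
	uint32_t t_rp = WEIGHT_ONE * 10 / 100;

	/* gamma = 0.9 / 0.7 ~= 1.286: non-donating budgets grow by ~28.6% */
	printf("gamma = %.3f\n", (double)calc_gamma(t_rp, t_r) / WEIGHT_ONE);
	return 0;
}
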
+
+/*
+ * A low weight iocg can amass a large amount of debt, for example, when
+ * anonymous memory gets reclaimed aggressively. If the system has a lot of
+ * memory paired with a slow IO device, the debt can span multiple seconds or
+ * more. If there are no other subsequent IO issuers, the in-debt iocg may end
+ * up blocked paying its debt while the IO device is idle.
+ *
+ * The following protects against such cases. If the device has been
+ * sufficiently idle for a while, the debts are halved and delays are
+ * recalculated.
+ */
+static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
+ struct ioc_now *now)
+{
+ struct ioc_gq *iocg;
+ u64 dur, usage_pct, nr_cycles;
+
+ /* if no debtor, reset the cycle */
+ if (!nr_debtors) {
+ ioc->dfgv_period_at = now->now;
+ ioc->dfgv_period_rem = 0;
+ ioc->dfgv_usage_us_sum = 0;
return;
}
/*
- * Waiters determine the sleep durations based on the vrate they
- * saw at the time of sleep. If vrate has increased, some waiters
- * could be sleeping for too long. Wake up tardy waiters which
- * should have woken up in the last period and expire idle iocgs.
+ * Debtors can pass through a lot of writes choking the device and we
+ * don't want to be forgiving debts while the device is struggling from
+ * write bursts. If we're missing latency targets, consider the device
+ * fully utilized.
+ */
+ if (ioc->busy_level > 0)
+ usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
+
+ ioc->dfgv_usage_us_sum += usage_us_sum;
+ if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
+ return;
+
+ /*
+ * At least DFGV_PERIOD has passed since the last period. Calculate the
+ * average usage and reset the period counters.
*/
+ dur = now->now - ioc->dfgv_period_at;
+ usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
+
+ ioc->dfgv_period_at = now->now;
+ ioc->dfgv_usage_us_sum = 0;
+
+ /* if it was too busy, reset everything */
+ if (usage_pct > DFGV_USAGE_PCT) {
+ ioc->dfgv_period_rem = 0;
+ return;
+ }
+
+ /*
+ * Usage is lower than threshold. Let's forgive some debts. Debt
+ * forgiveness runs off of the usual ioc timer but its period usually
+ * doesn't match ioc's. Compensate the difference by performing the
+ * reduction as many times as would fit in the duration since the last
+ * run and carrying over the left-over duration in @ioc->dfgv_period_rem
+ * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
+ * reductions is doubled.
+ */
+ nr_cycles = dur + ioc->dfgv_period_rem;
+ ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
+
+ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
+ u64 __maybe_unused old_debt, __maybe_unused old_delay;
+
+ if (!iocg->abs_vdebt && !iocg->delay)
+ continue;
+
+ spin_lock(&iocg->waitq.lock);
+
+ old_debt = iocg->abs_vdebt;
+ old_delay = iocg->delay;
+
+ if (iocg->abs_vdebt)
+ iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
+ if (iocg->delay)
+ iocg->delay = iocg->delay >> nr_cycles ?: 1;
+
+ iocg_kick_waitq(iocg, true, now);
+
+ TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
+ old_debt, iocg->abs_vdebt,
+ old_delay, iocg->delay);
+
+ spin_unlock(&iocg->waitq.lock);
+ }
+}
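
The per-debtor reduction in ioc_forgive_debts() halves debt and delay once per elapsed forgiveness cycle while never letting them reach zero. A minimal user-space sketch of that shift-with-floor behavior (the starting debt value is arbitrary):

#include <stdint.h>
#include <stdio.h>

/* Halve 'val' once per forgiveness cycle, but never let it reach zero,
 * mirroring "val = val >> nr_cycles ?: 1". */
static uint64_t forgive(uint64_t val, unsigned int nr_cycles)
{
	uint64_t v = val >> nr_cycles;

	return v ? v : 1;
}

int main(void)
{
	uint64_t debt = 1000000;	/* arbitrary absolute vtime debt */

	for (int cycles = 1; cycles <= 4; cycles++)
		printf("after %d idle cycle(s): %llu\n", cycles,
		       (unsigned long long)forgive(debt, cycles));
	return 0;
}
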
+
+/*
+ * Check the active iocgs' state to avoid oversleeping and deactivate
+ * idle iocgs.
+ *
+ * Since waiters determine the sleep durations based on the vrate
+ * they saw at the time of sleep, if vrate has increased, some
+ * waiters could be sleeping for too long. Wake up tardy waiters
+ * which should have woken up in the last period and expire idle
+ * iocgs.
+ */
+static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
+{
+ int nr_debtors = 0;
+ struct ioc_gq *iocg, *tiocg;
+
list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
- if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt &&
- !iocg_is_idle(iocg))
+ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
+ !iocg->delay && !iocg_is_idle(iocg))
continue;
spin_lock(&iocg->waitq.lock);
- if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
+ /* flush wait and indebt stat deltas */
+ if (iocg->wait_since) {
+ iocg->stat.wait_us += now->now - iocg->wait_since;
+ iocg->wait_since = now->now;
+ }
+ if (iocg->indebt_since) {
+ iocg->stat.indebt_us +=
+ now->now - iocg->indebt_since;
+ iocg->indebt_since = now->now;
+ }
+ if (iocg->indelay_since) {
+ iocg->stat.indelay_us +=
+ now->now - iocg->indelay_since;
+ iocg->indelay_since = now->now;
+ }
+
+ if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
+ iocg->delay) {
/* might be oversleeping vtime / hweight changes, kick */
- iocg_kick_waitq(iocg, &now);
- iocg_kick_delay(iocg, &now);
+ iocg_kick_waitq(iocg, true, now);
+ if (iocg->abs_vdebt || iocg->delay)
+ nr_debtors++;
} else if (iocg_is_idle(iocg)) {
/* no waiter and idle, deactivate */
- iocg->last_inuse = iocg->inuse;
- __propagate_active_weight(iocg, 0, 0);
+ u64 vtime = atomic64_read(&iocg->vtime);
+ s64 excess;
+
+ /*
+ * @iocg has been inactive for a full duration and will
+ * have a high budget. Account anything above target as
+ * error and throw it away. On reactivation, it'll start
+ * with the target budget.
+ */
+ excess = now->vnow - vtime - ioc->margins.target;
+ if (excess > 0) {
+ u32 old_hwi;
+
+ current_hweight(iocg, NULL, &old_hwi);
+ ioc->vtime_err -= div64_u64(excess * old_hwi,
+ WEIGHT_ONE);
+ }
+
+ TRACE_IOCG_PATH(iocg_idle, iocg, now,
+ atomic64_read(&iocg->active_period),
+ atomic64_read(&ioc->cur_period), vtime);
+ __propagate_weights(iocg, 0, 0, false, now);
list_del_init(&iocg->active_list);
}
spin_unlock(&iocg->waitq.lock);
}
- commit_active_weights(ioc);
- /* calc usages and see whether some weights need to be moved around */
+ commit_weights(ioc);
+ return nr_debtors;
+}
+
+static void ioc_timer_fn(struct timer_list *timer)
+{
+ struct ioc *ioc = container_of(timer, struct ioc, timer);
+ struct ioc_gq *iocg, *tiocg;
+ struct ioc_now now;
+ LIST_HEAD(surpluses);
+ int nr_debtors, nr_shortages = 0, nr_lagging = 0;
+ u64 usage_us_sum = 0;
+ u32 ppm_rthr;
+ u32 ppm_wthr;
+ u32 missed_ppm[2], rq_wait_pct;
+ u64 period_vtime;
+ int prev_busy_level;
+
+ /* how were the latencies during the period? */
+ ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
+
+ /* take care of active iocgs */
+ spin_lock_irq(&ioc->lock);
+
+ ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
+ ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
+ ioc_now(ioc, &now);
+
+ period_vtime = now.vnow - ioc->period_at_vtime;
+ if (WARN_ON_ONCE(!period_vtime)) {
+ spin_unlock_irq(&ioc->lock);
+ return;
+ }
+
+ nr_debtors = ioc_check_iocgs(ioc, &now);
+
+ /*
+ * Wait and indebt stats are flushed above and the donation calculation
+ * below needs updated usage stats. Let's bring the stats up to date.
+ */
+ iocg_flush_stat(&ioc->active_iocgs, &now);
+
+ /* calc usage and see whether some weights need to be moved around */
list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
- u64 vdone, vtime, vusage, vmargin, vmin;
- u32 hw_active, hw_inuse, usage;
+ u64 vdone, vtime, usage_us;
+ u32 hw_active, hw_inuse;
/*
* Collect unused and wind vtime closer to vnow to prevent
@@ -1418,116 +2292,92 @@ static void ioc_timer_fn(struct timer_list *timer)
time_before64(vdone, now.vnow - period_vtime))
nr_lagging++;
- if (waitqueue_active(&iocg->waitq))
- vusage = now.vnow - iocg->last_vtime;
- else if (time_before64(iocg->last_vtime, vtime))
- vusage = vtime - iocg->last_vtime;
- else
- vusage = 0;
-
- iocg->last_vtime += vusage;
/*
- * Factor in in-flight vtime into vusage to avoid
- * high-latency completions appearing as idle. This should
- * be done after the above ->last_time adjustment.
+ * Determine absolute usage factoring in in-flight IOs to avoid
+ * high-latency completions appearing as idle.
*/
- vusage = max(vusage, vtime - vdone);
-
- /* calculate hweight based usage ratio and record */
- if (vusage) {
- usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
- period_vtime);
- iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
- iocg->usages[iocg->usage_idx] = usage;
- } else {
- usage = 0;
- }
+ usage_us = iocg->usage_delta_us;
+ usage_us_sum += usage_us;
/* see whether there's surplus vtime */
- vmargin = ioc->margin_us * now.vrate;
- vmin = now.vnow - vmargin;
-
- iocg->has_surplus = false;
-
- if (!waitqueue_active(&iocg->waitq) &&
- time_before64(vtime, vmin)) {
- u64 delta = vmin - vtime;
-
- /* throw away surplus vtime */
- atomic64_add(delta, &iocg->vtime);
- atomic64_add(delta, &iocg->done_vtime);
- iocg->last_vtime += delta;
- /* if usage is sufficiently low, maybe it can donate */
- if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
- iocg->has_surplus = true;
- nr_surpluses++;
- }
- } else if (hw_inuse < hw_active) {
- u32 new_hwi, new_inuse;
-
- /* was donating but might need to take back some */
- if (waitqueue_active(&iocg->waitq)) {
- new_hwi = hw_active;
- } else {
- new_hwi = max(hw_inuse,
- usage * SURPLUS_SCALE_PCT / 100 +
- SURPLUS_SCALE_ABS);
+ WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+ if (hw_inuse < hw_active ||
+ (!waitqueue_active(&iocg->waitq) &&
+ time_before64(vtime, now.vnow - ioc->margins.low))) {
+ u32 hwa, old_hwi, hwm, new_hwi, usage;
+ u64 usage_dur;
+
+ if (vdone != vtime) {
+ u64 inflight_us = DIV64_U64_ROUND_UP(
+ cost_to_abs_cost(vtime - vdone, hw_inuse),
+ ioc->vtime_base_rate);
+
+ usage_us = max(usage_us, inflight_us);
}
- new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
- hw_inuse);
- new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
+ /* convert to hweight based usage ratio */
+ if (time_after64(iocg->activated_at, ioc->period_at))
+ usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
+ else
+ usage_dur = max_t(u64, now.now - ioc->period_at, 1);
+
+ usage = clamp_t(u32,
+ DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
+ usage_dur),
+ 1, WEIGHT_ONE);
- if (new_inuse > iocg->inuse) {
- TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
- iocg->inuse, new_inuse,
- hw_inuse, new_hwi);
- __propagate_active_weight(iocg, iocg->weight,
- new_inuse);
+ /*
+ * Already donating or accumulated enough to start.
+ * Determine the donation amount.
+ */
+ current_hweight(iocg, &hwa, &old_hwi);
+ hwm = current_hweight_max(iocg);
+ new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
+ usage, &now);
+ /*
+ * Donation calculation assumes hweight_after_donation
+ * to be positive, a condition that a donor w/ hwa < 2
+ * can't meet. Don't bother with donation if hwa is
+ * below 2. It's not going to make a meaningful difference
+ * anyway.
+ */
+ if (new_hwi < hwm && hwa >= 2) {
+ iocg->hweight_donating = hwa;
+ iocg->hweight_after_donation = new_hwi;
+ list_add(&iocg->surplus_list, &surpluses);
+ } else if (!iocg->abs_vdebt) {
+ /*
+ * @iocg doesn't have enough to donate. Reset
+ * its inuse to active.
+ *
+ * Don't reset debtors as their inuse's are
+ * owned by debt handling. This shouldn't affect
+ * donation calculation in any meaningful way
+ * as @iocg doesn't have a meaningful amount of
+ * share anyway.
+ */
+ TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
+ iocg->inuse, iocg->active,
+ iocg->hweight_inuse, new_hwi);
+
+ __propagate_weights(iocg, iocg->active,
+ iocg->active, true, &now);
+ nr_shortages++;
}
} else {
- /* genuninely out of vtime */
+ /* genuinely short on vtime */
nr_shortages++;
}
}
- if (!nr_shortages || !nr_surpluses)
- goto skip_surplus_transfers;
+ if (!list_empty(&surpluses) && nr_shortages)
+ transfer_surpluses(&surpluses, &now);
- /* there are both shortages and surpluses, transfer surpluses */
- list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
- u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
- int nr_valid = 0;
+ commit_weights(ioc);
- if (!iocg->has_surplus)
- continue;
-
- /* base the decision on max historical usage */
- for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
- if (iocg->usages[i]) {
- usage = max(usage, iocg->usages[i]);
- nr_valid++;
- }
- }
- if (nr_valid < MIN_VALID_USAGES)
- continue;
-
- current_hweight(iocg, &hw_active, &hw_inuse);
- new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
- if (!new_hwi)
- continue;
-
- new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
- hw_inuse);
- if (new_inuse < iocg->inuse) {
- TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
- iocg->inuse, new_inuse,
- hw_inuse, new_hwi);
- __propagate_active_weight(iocg, iocg->weight, new_inuse);
- }
- }
-skip_surplus_transfers:
- commit_active_weights(ioc);
+ /* surplus list should be dissolved after use */
+ list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
+ list_del_init(&iocg->surplus_list);
/*
* If q is getting clogged or we're missing too much, we're issuing
@@ -1555,11 +2405,9 @@ skip_surplus_transfers:
/*
* If there are IOs spanning multiple periods, wait
- * them out before pushing the device harder. If
- * there are surpluses, let redistribution work it
- * out first.
+ * them out before pushing the device harder.
*/
- if (!nr_lagging && !nr_surpluses)
+ if (!nr_lagging)
ioc->busy_level--;
} else {
/*
@@ -1577,56 +2425,13 @@ skip_surplus_transfers:
ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
- if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
- u64 vrate = atomic64_read(&ioc->vtime_rate);
- u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
-
- /* rq_wait signal is always reliable, ignore user vrate_min */
- if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
- vrate_min = VRATE_MIN;
-
- /*
- * If vrate is out of bounds, apply clamp gradually as the
- * bounds can change abruptly. Otherwise, apply busy_level
- * based adjustment.
- */
- if (vrate < vrate_min) {
- vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
- 100);
- vrate = min(vrate, vrate_min);
- } else if (vrate > vrate_max) {
- vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
- 100);
- vrate = max(vrate, vrate_max);
- } else {
- int idx = min_t(int, abs(ioc->busy_level),
- ARRAY_SIZE(vrate_adj_pct) - 1);
- u32 adj_pct = vrate_adj_pct[idx];
-
- if (ioc->busy_level > 0)
- adj_pct = 100 - adj_pct;
- else
- adj_pct = 100 + adj_pct;
-
- vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
- vrate_min, vrate_max);
- }
-
- trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
- nr_lagging, nr_shortages,
- nr_surpluses);
-
- atomic64_set(&ioc->vtime_rate, vrate);
- ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
- ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
- } else if (ioc->busy_level != prev_busy_level || nr_lagging) {
- trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
- missed_ppm, rq_wait_pct, nr_lagging,
- nr_shortages, nr_surpluses);
- }
+ ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
+ prev_busy_level, missed_ppm);
ioc_refresh_params(ioc, false);
+ ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
+
/*
* This period is done. Move onto the next one. If nothing's
* going on with the device, stop the timer.
@@ -1638,13 +2443,77 @@ skip_surplus_transfers:
ioc_start_period(ioc, &now);
} else {
ioc->busy_level = 0;
+ ioc->vtime_err = 0;
ioc->running = IOC_IDLE;
}
+
+ ioc_refresh_vrate(ioc, &now);
}
spin_unlock_irq(&ioc->lock);
}
+static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
+ u64 abs_cost, struct ioc_now *now)
+{
+ struct ioc *ioc = iocg->ioc;
+ struct ioc_margins *margins = &ioc->margins;
+ u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
+ u32 hwi, adj_step;
+ s64 margin;
+ u64 cost, new_inuse;
+ unsigned long flags;
+
+ current_hweight(iocg, NULL, &hwi);
+ old_hwi = hwi;
+ cost = abs_cost_to_cost(abs_cost, hwi);
+ margin = now->vnow - vtime - cost;
+
+ /* debt handling owns inuse for debtors */
+ if (iocg->abs_vdebt)
+ return cost;
+
+ /*
+ * We only increase inuse during period and do so if the margin has
+ * deteriorated since the previous adjustment.
+ */
+ if (margin >= iocg->saved_margin || margin >= margins->low ||
+ iocg->inuse == iocg->active)
+ return cost;
+
+ spin_lock_irqsave(&ioc->lock, flags);
+
+ /* we own inuse only when @iocg is in the normal active state */
+ if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
+ spin_unlock_irqrestore(&ioc->lock, flags);
+ return cost;
+ }
+
+ /*
+ * Bump up inuse till @abs_cost fits in the existing budget.
+ * adj_step must be determined after acquiring ioc->lock - we might
+ * have raced and lost to another thread for activation and could
+ * be reading iocg->active as 0 before taking ioc->lock, which would
+ * lead to an infinite loop.
+ */
+ new_inuse = iocg->inuse;
+ adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
+ do {
+ new_inuse = new_inuse + adj_step;
+ propagate_weights(iocg, iocg->active, new_inuse, true, now);
+ current_hweight(iocg, NULL, &hwi);
+ cost = abs_cost_to_cost(abs_cost, hwi);
+ } while (time_after64(vtime + cost, now->vnow) &&
+ iocg->inuse != iocg->active);
+
+ spin_unlock_irqrestore(&ioc->lock, flags);
+
+ TRACE_IOCG_PATH(inuse_adjust, iocg, now,
+ old_inuse, iocg->inuse, old_hwi, hwi);
+
+ return cost;
+}
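
The adjustment loop above can be pictured as growing inuse in fixed percentage steps of active until the absolute cost fits the remaining budget. Below is a simplified, lock-free sketch of that idea; the INUSE_ADJ_STEP_PCT value, the toy linear hweight model and all sample numbers are assumptions, not the kernel's exact math:

#include <stdint.h>
#include <stdio.h>

#define WEIGHT_ONE         (1 << 16)
#define INUSE_ADJ_STEP_PCT 25		/* assumed step size */

/* Toy model: hweight_inuse grows linearly with inuse (the real code walks
 * the cgroup tree); charged cost shrinks as hweight grows. */
static uint64_t cost_for(uint64_t abs_cost, uint32_t inuse, uint32_t active)
{
	uint32_t hwi = (uint64_t)WEIGHT_ONE * inuse / active;

	return abs_cost * WEIGHT_ONE / hwi;
}

int main(void)
{
	uint32_t active = 100, inuse = 20;
	uint64_t abs_cost = 1000, budget = 3000;
	uint32_t adj_step = (active * INUSE_ADJ_STEP_PCT + 99) / 100;
	uint64_t cost = cost_for(abs_cost, inuse, active);

	/* bump inuse step by step until the cost fits or inuse hits active */
	while (cost > budget && inuse != active) {
		inuse = inuse + adj_step > active ? active : inuse + adj_step;
		cost = cost_for(abs_cost, inuse, active);
		printf("inuse=%u cost=%llu\n", inuse, (unsigned long long)cost);
	}
	return 0;
}
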
+
static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
bool is_merge, u64 *costp)
{
@@ -1654,6 +2523,10 @@ static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
u64 seek_pages = 0;
u64 cost = 0;
+ /* Can't calculate cost for empty bio */
+ if (!bio->bi_iter.bi_size)
+ goto out;
+
switch (bio_op(bio)) {
case REQ_OP_READ:
coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
@@ -1726,15 +2599,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
struct ioc_gq *iocg = blkg_to_iocg(blkg);
struct ioc_now now;
struct iocg_wait wait;
- u32 hw_active, hw_inuse;
u64 abs_cost, cost, vtime;
+ bool use_debt, ioc_locked;
+ unsigned long flags;
- /* bypass IOs if disabled or for root cgroup */
- if (!ioc->enabled || !iocg->level)
- return;
-
- /* always activate so that even 0 cost IOs get protected to some level */
- if (!iocg_activate(iocg, &now))
+ /* bypass IOs if disabled, still initializing, or for root cgroup */
+ if (!ioc->enabled || !iocg || !iocg->level)
return;
/* calculate the absolute vtime cost */
@@ -1742,22 +2612,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
if (!abs_cost)
return;
- iocg->cursor = bio_end_sector(bio);
+ if (!iocg_activate(iocg, &now))
+ return;
+ iocg->cursor = bio_end_sector(bio);
vtime = atomic64_read(&iocg->vtime);
- current_hweight(iocg, &hw_active, &hw_inuse);
-
- if (hw_inuse < hw_active &&
- time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
- TRACE_IOCG_PATH(inuse_reset, iocg, &now,
- iocg->inuse, iocg->weight, hw_inuse, hw_active);
- spin_lock_irq(&ioc->lock);
- propagate_active_weight(iocg, iocg->weight, iocg->weight);
- spin_unlock_irq(&ioc->lock);
- current_hweight(iocg, &hw_active, &hw_inuse);
- }
-
- cost = abs_cost_to_cost(abs_cost, hw_inuse);
+ cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
/*
* If no one's waiting and within budget, issue right away. The
@@ -1766,21 +2626,32 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
*/
if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
time_before_eq64(vtime + cost, now.vnow)) {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
/*
- * We activated above but w/o any synchronization. Deactivation is
- * synchronized with waitq.lock and we won't get deactivated as long
- * as we're waiting or has debt, so we're good if we're activated
- * here. In the unlikely case that we aren't, just issue the IO.
+ * We're over budget. This can be handled in two ways. IOs which may
+ * cause priority inversions are punted to @ioc->aux_iocg and charged as
+ * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
+ * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
+ * whether debt handling is needed and acquire locks accordingly.
*/
- spin_lock_irq(&iocg->waitq.lock);
+ use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
+ ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
+retry_lock:
+ iocg_lock(iocg, ioc_locked, &flags);
+ /*
+ * @iocg must stay activated for debt and waitq handling. Deactivation
+ * is synchronized against both ioc->lock and waitq.lock and we won't
+ * get deactivated as long as we're waiting or have debt, so we're good
+ * if we're activated here. In the unlikely case that we aren't, just
+ * issue the IO.
+ */
if (unlikely(list_empty(&iocg->active_list))) {
- spin_unlock_irq(&iocg->waitq.lock);
- iocg_commit_bio(iocg, bio, cost);
+ iocg_unlock(iocg, ioc_locked, &flags);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
@@ -1801,15 +2672,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
* clear them and leave @iocg inactive w/ dangling use_delay heavily
* penalizing the cgroup and its descendants.
*/
- if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
- iocg->abs_vdebt += abs_cost;
+ if (use_debt) {
+ iocg_incur_debt(iocg, abs_cost, &now);
if (iocg_kick_delay(iocg, &now))
- blkcg_schedule_throttle(rqos->q,
+ blkcg_schedule_throttle(rqos->disk,
(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
- spin_unlock_irq(&iocg->waitq.lock);
+ iocg_unlock(iocg, ioc_locked, &flags);
return;
}
+ /* guarantee that iocgs w/ waiters have maximum inuse */
+ if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
+ if (!ioc_locked) {
+ iocg_unlock(iocg, false, &flags);
+ ioc_locked = true;
+ goto retry_lock;
+ }
+ propagate_weights(iocg, iocg->active, iocg->active, true,
+ &now);
+ }
+
/*
* Append self to the waitq and schedule the wakeup timer if we're
* the first waiter. The timer duration is calculated based on the
@@ -1830,9 +2712,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
wait.committed = false; /* will be set true by waker */
__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
- iocg_kick_waitq(iocg, &now);
+ iocg_kick_waitq(iocg, ioc_locked, &now);
- spin_unlock_irq(&iocg->waitq.lock);
+ iocg_unlock(iocg, ioc_locked, &flags);
while (true) {
set_current_state(TASK_UNINTERRUPTIBLE);
@@ -1849,15 +2731,14 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
struct bio *bio)
{
struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
- struct ioc *ioc = iocg->ioc;
+ struct ioc *ioc = rqos_to_ioc(rqos);
sector_t bio_end = bio_end_sector(bio);
struct ioc_now now;
- u32 hw_inuse;
- u64 abs_cost, cost;
+ u64 vtime, abs_cost, cost;
unsigned long flags;
- /* bypass if disabled or for root cgroup */
- if (!ioc->enabled || !iocg->level)
+ /* bypass if disabled, still initializing, or for root cgroup */
+ if (!ioc->enabled || !iocg || !iocg->level)
return;
abs_cost = calc_vtime_cost(bio, iocg, true);
@@ -1865,8 +2746,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
return;
ioc_now(ioc, &now);
- current_hweight(iocg, NULL, &hw_inuse);
- cost = abs_cost_to_cost(abs_cost, hw_inuse);
+
+ vtime = atomic64_read(&iocg->vtime);
+ cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
/* update cursor if backmerging into the request at the cursor */
if (blk_rq_pos(rq) < bio_end &&
@@ -1879,7 +2761,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
*/
if (rq->bio && rq->bio->bi_iocost_cost &&
time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
return;
}
@@ -1888,14 +2770,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
* be for the vast majority of cases. See debt handling in
* ioc_rqos_throttle() for details.
*/
- spin_lock_irqsave(&iocg->waitq.lock, flags);
+ spin_lock_irqsave(&ioc->lock, flags);
+ spin_lock(&iocg->waitq.lock);
+
if (likely(!list_empty(&iocg->active_list))) {
- iocg->abs_vdebt += abs_cost;
- iocg_kick_delay(iocg, &now);
+ iocg_incur_debt(iocg, abs_cost, &now);
+ if (iocg_kick_delay(iocg, &now))
+ blkcg_schedule_throttle(rqos->disk,
+ (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
} else {
- iocg_commit_bio(iocg, bio, cost);
+ iocg_commit_bio(iocg, bio, abs_cost, cost);
}
- spin_unlock_irqrestore(&iocg->waitq.lock, flags);
+
+ spin_unlock(&iocg->waitq.lock);
+ spin_unlock_irqrestore(&ioc->lock, flags);
}
static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
@@ -1909,13 +2797,14 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
{
struct ioc *ioc = rqos_to_ioc(rqos);
+ struct ioc_pcpu_stat *ccs;
u64 on_q_ns, rq_wait_ns, size_nsec;
int pidx, rw;
if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
return;
- switch (req_op(rq) & REQ_OP_MASK) {
+ switch (req_op(rq)) {
case REQ_OP_READ:
pidx = QOS_RLAT;
rw = READ;
@@ -1932,13 +2821,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
+ ccs = get_cpu_ptr(ioc->pcpu_stat);
+
if (on_q_ns <= size_nsec ||
on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
- this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
+ local_inc(&ccs->missed[rw].nr_met);
else
- this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
+ local_inc(&ccs->missed[rw].nr_missed);
- this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
+ local64_add(rq_wait_ns, &ccs->rq_wait_ns);
+
+ put_cpu_ptr(ccs);
}
static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
@@ -1954,18 +2847,18 @@ static void ioc_rqos_exit(struct rq_qos *rqos)
{
struct ioc *ioc = rqos_to_ioc(rqos);
- blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
+ blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
spin_lock_irq(&ioc->lock);
ioc->running = IOC_STOP;
spin_unlock_irq(&ioc->lock);
- del_timer_sync(&ioc->timer);
+ timer_shutdown_sync(&ioc->timer);
free_percpu(ioc->pcpu_stat);
kfree(ioc);
}
-static struct rq_qos_ops ioc_rqos_ops = {
+static const struct rq_qos_ops ioc_rqos_ops = {
.throttle = ioc_rqos_throttle,
.merge = ioc_rqos_merge,
.done_bio = ioc_rqos_done_bio,
@@ -1974,11 +2867,10 @@ static struct rq_qos_ops ioc_rqos_ops = {
.exit = ioc_rqos_exit,
};
-static int blk_iocost_init(struct request_queue *q)
+static int blk_iocost_init(struct gendisk *disk)
{
struct ioc *ioc;
- struct rq_qos *rqos;
- int ret;
+ int i, cpu, ret;
ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
if (!ioc)
@@ -1990,36 +2882,54 @@ static int blk_iocost_init(struct request_queue *q)
return -ENOMEM;
}
- rqos = &ioc->rqos;
- rqos->id = RQ_QOS_COST;
- rqos->ops = &ioc_rqos_ops;
- rqos->q = q;
+ for_each_possible_cpu(cpu) {
+ struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
+
+ for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
+ local_set(&ccs->missed[i].nr_met, 0);
+ local_set(&ccs->missed[i].nr_missed, 0);
+ }
+ local64_set(&ccs->rq_wait_ns, 0);
+ }
spin_lock_init(&ioc->lock);
timer_setup(&ioc->timer, ioc_timer_fn, 0);
INIT_LIST_HEAD(&ioc->active_iocgs);
ioc->running = IOC_IDLE;
+ ioc->vtime_base_rate = VTIME_PER_USEC;
atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
- seqcount_init(&ioc->period_seqcount);
+ seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
ioc->period_at = ktime_to_us(ktime_get());
atomic64_set(&ioc->cur_period, 0);
atomic_set(&ioc->hweight_gen, 0);
spin_lock_irq(&ioc->lock);
ioc->autop_idx = AUTOP_INVALID;
- ioc_refresh_params(ioc, true);
+ ioc_refresh_params_disk(ioc, true, disk);
spin_unlock_irq(&ioc->lock);
- rq_qos_add(q, rqos);
- ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
- if (ret) {
- rq_qos_del(q, rqos);
- free_percpu(ioc->pcpu_stat);
- kfree(ioc);
- return ret;
- }
+ /*
+ * rqos must be added before activation to allow ioc_pd_init() to
+ * look up the ioc from q. This means that the rqos methods may get
+ * called before policy activation completes, so they can't assume that
+ * the target bio has an iocg associated and need to test for a NULL iocg.
+ */
+ ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
+ if (ret)
+ goto err_free_ioc;
+
+ ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
+ if (ret)
+ goto err_del_qos;
return 0;
+
+err_del_qos:
+ rq_qos_del(&ioc->rqos);
+err_free_ioc:
+ free_percpu(ioc->pcpu_stat);
+ kfree(ioc);
+ return ret;
}
static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
@@ -2030,7 +2940,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
if (!iocc)
return NULL;
- iocc->dfl_weight = CGROUP_WEIGHT_DFL;
+ iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
return &iocc->cpd;
}
@@ -2039,17 +2949,23 @@ static void ioc_cpd_free(struct blkcg_policy_data *cpd)
kfree(container_of(cpd, struct ioc_cgrp, cpd));
}
-static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
- struct blkcg *blkcg)
+static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
+ struct blkcg *blkcg, gfp_t gfp)
{
int levels = blkcg->css.cgroup->level + 1;
struct ioc_gq *iocg;
- iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
- gfp, q->node);
+ iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
+ disk->node_id);
if (!iocg)
return NULL;
+ iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
+ if (!iocg->pcpu_stat) {
+ kfree(iocg);
+ return NULL;
+ }
+
return &iocg->pd;
}
@@ -2069,14 +2985,14 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
atomic64_set(&iocg->done_vtime, now.vnow);
atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
INIT_LIST_HEAD(&iocg->active_list);
- iocg->hweight_active = HWEIGHT_WHOLE;
- iocg->hweight_inuse = HWEIGHT_WHOLE;
+ INIT_LIST_HEAD(&iocg->walk_list);
+ INIT_LIST_HEAD(&iocg->surplus_list);
+ iocg->hweight_active = WEIGHT_ONE;
+ iocg->hweight_inuse = WEIGHT_ONE;
init_waitqueue_head(&iocg->waitq);
hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
iocg->waitq_timer.function = iocg_waitq_timer_fn;
- hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
- iocg->delay_timer.function = iocg_delay_timer_fn;
iocg->level = blkg->blkcg->css.cgroup->level;
@@ -2086,7 +3002,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd)
}
spin_lock_irqsave(&ioc->lock, flags);
- weight_updated(iocg);
+ weight_updated(iocg, &now);
spin_unlock_irqrestore(&ioc->lock, flags);
}
@@ -2094,21 +3010,54 @@ static void ioc_pd_free(struct blkg_policy_data *pd)
{
struct ioc_gq *iocg = pd_to_iocg(pd);
struct ioc *ioc = iocg->ioc;
+ unsigned long flags;
if (ioc) {
- spin_lock(&ioc->lock);
+ spin_lock_irqsave(&ioc->lock, flags);
+
if (!list_empty(&iocg->active_list)) {
- propagate_active_weight(iocg, 0, 0);
+ struct ioc_now now;
+
+ ioc_now(ioc, &now);
+ propagate_weights(iocg, 0, 0, false, &now);
list_del_init(&iocg->active_list);
}
- spin_unlock(&ioc->lock);
+
+ WARN_ON_ONCE(!list_empty(&iocg->walk_list));
+ WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
+
+ spin_unlock_irqrestore(&ioc->lock, flags);
hrtimer_cancel(&iocg->waitq_timer);
- hrtimer_cancel(&iocg->delay_timer);
}
+ free_percpu(iocg->pcpu_stat);
kfree(iocg);
}
+static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
+{
+ struct ioc_gq *iocg = pd_to_iocg(pd);
+ struct ioc *ioc = iocg->ioc;
+
+ if (!ioc->enabled)
+ return;
+
+ if (iocg->level == 0) {
+ unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
+ ioc->vtime_base_rate * 10000,
+ VTIME_PER_USEC);
+ seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
+ }
+
+ seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
+
+ if (blkcg_debug_stats)
+ seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
+ iocg->last_stat.wait_us,
+ iocg->last_stat.indebt_us,
+ iocg->last_stat.indelay_us);
+}
+
static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
int off)
{
@@ -2116,7 +3065,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
struct ioc_gq *iocg = pd_to_iocg(pd);
if (dname && iocg->cfg_weight)
- seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
+ seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
return 0;
}
@@ -2126,7 +3075,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v)
struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
- seq_printf(sf, "default %u\n", iocc->dfl_weight);
+ seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
&blkcg_policy_iocost, seq_cft(sf)->private, false);
return 0;
@@ -2138,6 +3087,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
struct blkcg *blkcg = css_to_blkcg(of_css(of));
struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
struct blkg_conf_ctx ctx;
+ struct ioc_now now;
struct ioc_gq *iocg;
u32 v;
int ret;
@@ -2151,25 +3101,28 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
return -EINVAL;
- spin_lock(&blkcg->lock);
- iocc->dfl_weight = v;
+ spin_lock_irq(&blkcg->lock);
+ iocc->dfl_weight = v * WEIGHT_ONE;
hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
struct ioc_gq *iocg = blkg_to_iocg(blkg);
if (iocg) {
- spin_lock_irq(&iocg->ioc->lock);
- weight_updated(iocg);
- spin_unlock_irq(&iocg->ioc->lock);
+ spin_lock(&iocg->ioc->lock);
+ ioc_now(iocg->ioc, &now);
+ weight_updated(iocg, &now);
+ spin_unlock(&iocg->ioc->lock);
}
}
- spin_unlock(&blkcg->lock);
+ spin_unlock_irq(&blkcg->lock);
return nbytes;
}
- ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
+ blkg_conf_init(&ctx, buf);
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
if (ret)
- return ret;
+ goto err;
iocg = blkg_to_iocg(ctx.blkg);
@@ -2183,16 +3136,19 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
}
spin_lock(&iocg->ioc->lock);
- iocg->cfg_weight = v;
- weight_updated(iocg);
+ iocg->cfg_weight = v * WEIGHT_ONE;
+ ioc_now(iocg->ioc, &now);
+ weight_updated(iocg, &now);
spin_unlock(&iocg->ioc->lock);
- blkg_conf_finish(&ctx);
+ blkg_conf_exit(&ctx);
return nbytes;
einval:
- blkg_conf_finish(&ctx);
- return -EINVAL;
+ ret = -EINVAL;
+err:
+ blkg_conf_exit(&ctx);
+ return ret;
}
static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
@@ -2204,6 +3160,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
if (!dname)
return 0;
+ spin_lock_irq(&ioc->lock);
seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
ioc->params.qos[QOS_RPPM] / 10000,
@@ -2216,6 +3173,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
ioc->params.qos[QOS_MIN] % 10000 / 100,
ioc->params.qos[QOS_MAX] / 10000,
ioc->params.qos[QOS_MAX] % 10000 / 100);
+ spin_unlock_irq(&ioc->lock);
return 0;
}
@@ -2247,32 +3205,44 @@ static const match_table_t qos_tokens = {
static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
+ struct blkg_conf_ctx ctx;
struct gendisk *disk;
struct ioc *ioc;
u32 qos[NR_QOS_PARAMS];
bool enable, user;
- char *p;
+ char *body, *p;
int ret;
- disk = blkcg_conf_get_disk(&input);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
+ blkg_conf_init(&ctx, input);
+
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto err;
+
+ body = ctx.body;
+ disk = ctx.bdev->bd_disk;
+ if (!queue_is_mq(disk->queue)) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
ioc = q_to_ioc(disk->queue);
if (!ioc) {
- ret = blk_iocost_init(disk->queue);
+ ret = blk_iocost_init(disk);
if (ret)
goto err;
ioc = q_to_ioc(disk->queue);
}
+ blk_mq_freeze_queue(disk->queue);
+ blk_mq_quiesce_queue(disk->queue);
+
spin_lock_irq(&ioc->lock);
memcpy(qos, ioc->params.qos, sizeof(qos));
enable = ioc->enabled;
user = ioc->user_qos_params;
- spin_unlock_irq(&ioc->lock);
- while ((p = strsep(&input, " \t\n"))) {
+ while ((p = strsep(&body, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
char buf[32];
int tok;
@@ -2283,7 +3253,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
switch (match_token(p, qos_ctrl_tokens, args)) {
case QOS_ENABLE:
- match_u64(&args[0], &v);
+ if (match_u64(&args[0], &v))
+ goto einval;
enable = v;
continue;
case QOS_CTRL:
@@ -2337,14 +3308,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
if (qos[QOS_MIN] > qos[QOS_MAX])
goto einval;
- spin_lock_irq(&ioc->lock);
-
- if (enable) {
- blk_stat_enable_accounting(ioc->rqos.q);
- blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ if (enable && !ioc->enabled) {
+ blk_stat_enable_accounting(disk->queue);
+ blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = true;
- } else {
- blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
+ } else if (!enable && ioc->enabled) {
+ blk_stat_disable_accounting(disk->queue);
+ blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
ioc->enabled = false;
}
@@ -2358,12 +3328,25 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
- put_disk_and_module(disk);
+ if (enable)
+ wbt_disable_default(disk);
+ else
+ wbt_enable_default(disk);
+
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
+ blkg_conf_exit(&ctx);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(disk->queue);
+ blk_mq_unfreeze_queue(disk->queue);
+
ret = -EINVAL;
err:
- put_disk_and_module(disk);
+ blkg_conf_exit(&ctx);
return ret;
}
@@ -2377,12 +3360,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf,
if (!dname)
return 0;
+ spin_lock_irq(&ioc->lock);
seq_printf(sf, "%s ctrl=%s model=linear "
"rbps=%llu rseqiops=%llu rrandiops=%llu "
"wbps=%llu wseqiops=%llu wrandiops=%llu\n",
dname, ioc->user_cost_model ? "user" : "auto",
u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
+ spin_unlock_irq(&ioc->lock);
return 0;
}
@@ -2414,31 +3399,43 @@ static const match_table_t i_lcoef_tokens = {
static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
size_t nbytes, loff_t off)
{
- struct gendisk *disk;
+ struct blkg_conf_ctx ctx;
+ struct request_queue *q;
struct ioc *ioc;
u64 u[NR_I_LCOEFS];
bool user;
- char *p;
+ char *body, *p;
int ret;
- disk = blkcg_conf_get_disk(&input);
- if (IS_ERR(disk))
- return PTR_ERR(disk);
+ blkg_conf_init(&ctx, input);
- ioc = q_to_ioc(disk->queue);
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto err;
+
+ body = ctx.body;
+ q = bdev_get_queue(ctx.bdev);
+ if (!queue_is_mq(q)) {
+ ret = -EOPNOTSUPP;
+ goto err;
+ }
+
+ ioc = q_to_ioc(q);
if (!ioc) {
- ret = blk_iocost_init(disk->queue);
+ ret = blk_iocost_init(ctx.bdev->bd_disk);
if (ret)
goto err;
- ioc = q_to_ioc(disk->queue);
+ ioc = q_to_ioc(q);
}
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
spin_lock_irq(&ioc->lock);
memcpy(u, ioc->params.i_lcoefs, sizeof(u));
user = ioc->user_cost_model;
- spin_unlock_irq(&ioc->lock);
- while ((p = strsep(&input, " \t\n"))) {
+ while ((p = strsep(&body, " \t\n"))) {
substring_t args[MAX_OPT_ARGS];
char buf[32];
int tok;
@@ -2473,7 +3470,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
user = true;
}
- spin_lock_irq(&ioc->lock);
if (user) {
memcpy(ioc->params.i_lcoefs, u, sizeof(u));
ioc->user_cost_model = true;
@@ -2483,13 +3479,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
ioc_refresh_params(ioc, true);
spin_unlock_irq(&ioc->lock);
- put_disk_and_module(disk);
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
+ blkg_conf_exit(&ctx);
return nbytes;
einval:
+ spin_unlock_irq(&ioc->lock);
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
+
ret = -EINVAL;
err:
- put_disk_and_module(disk);
+ blkg_conf_exit(&ctx);
return ret;
}
@@ -2522,6 +3526,7 @@ static struct blkcg_policy blkcg_policy_iocost = {
.pd_alloc_fn = ioc_pd_alloc,
.pd_init_fn = ioc_pd_init,
.pd_free_fn = ioc_pd_free,
+ .pd_stat_fn = ioc_pd_stat,
};
static int __init ioc_init(void)
@@ -2531,7 +3536,7 @@ static int __init ioc_init(void)
static void __exit ioc_exit(void)
{
- return blkcg_policy_unregister(&blkcg_policy_iocost);
+ blkcg_policy_unregister(&blkcg_policy_iocost);
}
module_init(ioc_init);
diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c
index c128d50cb410..c1a6aba1d59e 100644
--- a/block/blk-iolatency.c
+++ b/block/blk-iolatency.c
@@ -76,6 +76,7 @@
#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
+#include "blk-cgroup.h"
#include "blk.h"
#define DEFAULT_SCALE_COOKIE 1000000U
@@ -86,7 +87,17 @@ struct iolatency_grp;
struct blk_iolatency {
struct rq_qos rqos;
struct timer_list timer;
- atomic_t enabled;
+
+ /*
+ * ->enabled is the master enable switch gating the throttling logic and
+ * inflight tracking. The number of cgroups which have iolat enabled is
+ * tracked in ->enable_cnt, and ->enabled is flipped on/off accordingly
+ * from ->enable_work with the request_queue frozen. For details, see
+ * blkiolatency_enable_work_fn().
+ */
+ bool enabled;
+ atomic_t enable_cnt;
+ struct work_struct enable_work;
};
static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
@@ -94,11 +105,6 @@ static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
return container_of(rqos, struct blk_iolatency, rqos);
}
-static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
-{
- return atomic_read(&blkiolat->enabled) > 0;
-}
-
struct child_latency_info {
spinlock_t lock;
@@ -135,7 +141,7 @@ struct iolatency_grp {
struct latency_stat __percpu *stats;
struct latency_stat cur_stat;
struct blk_iolatency *blkiolat;
- struct rq_depth rq_depth;
+ unsigned int max_depth;
struct rq_wait rq_wait;
atomic64_t window_start;
atomic_t scale_cookie;
@@ -274,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
struct iolatency_grp *iolat = private_data;
- return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
+ return rq_wait_inc_below(rqw, iolat->max_depth);
}
static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
@@ -286,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
if (use_delay)
- blkcg_schedule_throttle(rqos->q, use_memdelay);
+ blkcg_schedule_throttle(rqos->disk, use_memdelay);
/*
* To avoid priority inversions we want to just take a slot if we are
@@ -324,7 +330,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
struct child_latency_info *lat_info,
bool up)
{
- unsigned long qd = blkiolat->rqos.q->nr_requests;
+ unsigned long qd = blkiolat->rqos.disk->queue->nr_requests;
unsigned long scale = scale_amount(qd, up);
unsigned long old = atomic_read(&lat_info->scale_cookie);
unsigned long max_scale = qd << 1;
@@ -358,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat,
}
/*
- * Change the queue depth of the iolatency_grp. We add/subtract 1/16th of the
+ * Change the queue depth of the iolatency_grp. We add 1/16th of the
* queue depth at a time so we don't get wild swings and hopefully dial in to
- * fairer distribution of the overall queue depth.
+ * fairer distribution of the overall queue depth. We halve the queue depth
+ * at a time so we can scale down queue depth quickly from default unlimited
+ * to target.
*/
static void scale_change(struct iolatency_grp *iolat, bool up)
{
- unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
+ unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests;
unsigned long scale = scale_amount(qd, up);
- unsigned long old = iolat->rq_depth.max_depth;
+ unsigned long old = iolat->max_depth;
if (old > qd)
old = qd;
@@ -378,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up)
if (old < qd) {
old += scale;
old = min(old, qd);
- iolat->rq_depth.max_depth = old;
+ iolat->max_depth = old;
wake_up_all(&iolat->rq_wait.wait);
}
} else {
old >>= 1;
- iolat->rq_depth.max_depth = max(old, 1UL);
+ iolat->max_depth = max(old, 1UL);
}
}
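
To make the asymmetric scaling of scale_change() above concrete, here is a stand-alone sketch; the queue depth, the 1/16th scale-up step described in the comment and the loop counts are illustrative assumptions (the real scale_amount() may round differently):

#include <stdio.h>

int main(void)
{
	unsigned long qd = 128;		/* nr_requests of the queue */
	unsigned long step = qd / 16;	/* scale-up increment per event */
	unsigned long depth = -1UL;	/* starts effectively unlimited */

	/* scaling down halves the allowed depth, quickly pulling it from
	 * "unlimited" down toward and below the queue depth */
	for (int i = 0; i < 3; i++) {
		depth = (depth > qd ? qd : depth) >> 1;
		if (!depth)
			depth = 1;
		printf("down -> %lu\n", depth);
	}

	/* scaling back up creeps up by qd/16 at a time, capped at qd */
	depth += step;
	if (depth > qd)
		depth = qd;
	printf("up   -> %lu\n", depth);
	return 0;
}
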
@@ -395,12 +403,8 @@ static void check_scale_change(struct iolatency_grp *iolat)
unsigned int cur_cookie;
unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
u64 scale_lat;
- unsigned int old;
int direction = 0;
- if (lat_to_blkg(iolat)->parent == NULL)
- return;
-
parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
if (!parent)
return;
@@ -416,11 +420,10 @@ static void check_scale_change(struct iolatency_grp *iolat)
else
return;
- old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);
-
- /* Somebody beat us to the punch, just bail. */
- if (old != our_cookie)
+ if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) {
+ /* Somebody beat us to the punch, just bail. */
return;
+ }
if (direction < 0 && iolat->min_lat_nsec) {
u64 samples_thresh;
@@ -441,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
}
/* We're as low as we can go. */
- if (iolat->rq_depth.max_depth == 1 && direction < 0) {
+ if (iolat->max_depth == 1 && direction < 0) {
blkcg_use_delay(lat_to_blkg(iolat));
return;
}
@@ -449,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat)
/* We're back to the default cookie, unthrottle all the things. */
if (cur_cookie == DEFAULT_SCALE_COOKIE) {
blkcg_clear_delay(lat_to_blkg(iolat));
- iolat->rq_depth.max_depth = UINT_MAX;
+ iolat->max_depth = UINT_MAX;
wake_up_all(&iolat->rq_wait.wait);
return;
}
@@ -463,7 +466,7 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
struct blkcg_gq *blkg = bio->bi_blkg;
bool issue_as_root = bio_issue_as_root_blkg(bio);
- if (!blk_iolatency_enabled(blkiolat))
+ if (!blkiolat->enabled)
return;
while (blkg && blkg->parent) {
@@ -504,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat,
* We don't want to count issue_as_root bio's in the cgroups latency
* statistics as it could skew the numbers downwards.
*/
- if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
+ if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) {
u64 sub = iolat->min_lat_nsec;
if (req_time < sub)
blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
@@ -591,23 +594,22 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
struct rq_wait *rqw;
struct iolatency_grp *iolat;
u64 window_start;
- u64 now = ktime_to_ns(ktime_get());
+ u64 now;
bool issue_as_root = bio_issue_as_root_blkg(bio);
- bool enabled = false;
int inflight = 0;
blkg = bio->bi_blkg;
- if (!blkg || !bio_flagged(bio, BIO_TRACKED))
+ if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED))
return;
iolat = blkg_to_lat(bio->bi_blkg);
if (!iolat)
return;
- enabled = blk_iolatency_enabled(iolat->blkiolat);
- if (!enabled)
+ if (!iolat->blkiolat->enabled)
return;
+ now = ktime_to_ns(ktime_get());
while (blkg && blkg->parent) {
iolat = blkg_to_lat(blkg);
if (!iolat) {
@@ -628,8 +630,8 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
window_start = atomic64_read(&iolat->window_start);
if (now > window_start &&
(now - window_start) >= iolat->cur_win_nsec) {
- if (atomic64_cmpxchg(&iolat->window_start,
- window_start, now) == window_start)
+ if (atomic64_try_cmpxchg(&iolat->window_start,
+ &window_start, now))
iolatency_check_latencies(iolat, now);
}
}
@@ -642,12 +644,13 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
- del_timer_sync(&blkiolat->timer);
- blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
+ timer_shutdown_sync(&blkiolat->timer);
+ flush_work(&blkiolat->enable_work);
+ blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency);
kfree(blkiolat);
}
-static struct rq_qos_ops blkcg_iolatency_ops = {
+static const struct rq_qos_ops blkcg_iolatency_ops = {
.throttle = blkcg_iolatency_throttle,
.done_bio = blkcg_iolatency_done_bio,
.exit = blkcg_iolatency_exit,
@@ -662,7 +665,7 @@ static void blkiolatency_timer_fn(struct timer_list *t)
rcu_read_lock();
blkg_for_each_descendant_pre(blkg, pos_css,
- blkiolat->rqos.q->root_blkg) {
+ blkiolat->rqos.disk->queue->root_blkg) {
struct iolatency_grp *iolat;
struct child_latency_info *lat_info;
unsigned long flags;
@@ -714,42 +717,77 @@ next:
rcu_read_unlock();
}
-int blk_iolatency_init(struct request_queue *q)
+/**
+ * blkiolatency_enable_work_fn - Enable or disable iolatency on the device
+ * @work: enable_work of the blk_iolatency of interest
+ *
+ * iolatency needs to keep track of the number of in-flight IOs per cgroup. This
+ * is relatively expensive as it involves walking up the hierarchy twice for
+ * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we
+ * want to disable the in-flight tracking.
+ *
+ * We have to make sure that the counting is balanced - we don't want to leak
+ * the in-flight counts by disabling accounting in the completion path while IOs
+ * are in flight. This is achieved by ensuring that no IO is in flight by
+ * freezing the queue while flipping ->enabled. As this requires a sleepable
+ * context, ->enabled flipping is punted to this work function.
+ */
+static void blkiolatency_enable_work_fn(struct work_struct *work)
+{
+ struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency,
+ enable_work);
+ bool enabled;
+
+ /*
+ * There can only be one instance of this function running for @blkiolat
+ * and it's guaranteed to be executed at least once after the latest
+	 * ->enable_cnt modification. Acting on the latest ->enable_cnt is
+ * sufficient.
+ *
+ * Also, we know @blkiolat is safe to access as ->enable_work is flushed
+ * in blkcg_iolatency_exit().
+ */
+ enabled = atomic_read(&blkiolat->enable_cnt);
+ if (enabled != blkiolat->enabled) {
+ blk_mq_freeze_queue(blkiolat->rqos.disk->queue);
+ blkiolat->enabled = enabled;
+ blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue);
+ }
+}
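
As a rough, hedged illustration of the pattern described above (a userspace sketch, not kernel code): an atomic enable count whose 0<->1 transitions trigger a deferred step, which then acts only on the latest value and flips the cached flag while no I/O can be in flight. All names and latency values below are invented for the example; schedule_work() and the queue freeze are reduced to comments.

/* Userspace sketch of the enable_cnt -> enable_work pattern. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int enable_cnt;
static bool enabled;			/* only flipped in the deferred step */

static void enable_work_fn(void)
{
	bool want = atomic_load(&enable_cnt) > 0;

	if (want != enabled) {
		/* kernel: blk_mq_freeze_queue() so no IO is in flight */
		enabled = want;
		/* kernel: blk_mq_unfreeze_queue() */
	}
}

static void set_min_lat(unsigned long long old_ns, unsigned long long new_ns)
{
	/* only the 0->1 and 1->0 transitions schedule the deferred step */
	if (!old_ns && new_ns && atomic_fetch_add(&enable_cnt, 1) == 0)
		enable_work_fn();	/* kernel: schedule_work() */
	if (old_ns && !new_ns && atomic_fetch_sub(&enable_cnt, 1) == 1)
		enable_work_fn();
}

int main(void)
{
	set_min_lat(0, 5000000);	/* first cgroup enables -> flip on */
	set_min_lat(0, 2000000);	/* second enable: no flip needed */
	set_min_lat(2000000, 0);
	set_min_lat(5000000, 0);	/* last disable -> flip off */
	printf("enabled=%d cnt=%d\n", enabled, atomic_load(&enable_cnt));
	return 0;
}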
+
+static int blk_iolatency_init(struct gendisk *disk)
{
struct blk_iolatency *blkiolat;
- struct rq_qos *rqos;
int ret;
blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
if (!blkiolat)
return -ENOMEM;
- rqos = &blkiolat->rqos;
- rqos->id = RQ_QOS_LATENCY;
- rqos->ops = &blkcg_iolatency_ops;
- rqos->q = q;
-
- rq_qos_add(q, rqos);
-
- ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
- if (ret) {
- rq_qos_del(q, rqos);
- kfree(blkiolat);
- return ret;
- }
+ ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY,
+ &blkcg_iolatency_ops);
+ if (ret)
+ goto err_free;
+ ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency);
+ if (ret)
+ goto err_qos_del;
timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);
+ INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn);
return 0;
+
+err_qos_del:
+ rq_qos_del(&blkiolat->rqos);
+err_free:
+ kfree(blkiolat);
+ return ret;
}
-/*
- * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise
- * return 0.
- */
-static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
+static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
struct iolatency_grp *iolat = blkg_to_lat(blkg);
+ struct blk_iolatency *blkiolat = iolat->blkiolat;
u64 oldval = iolat->min_lat_nsec;
iolat->min_lat_nsec = val;
@@ -757,13 +795,15 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
BLKIOLATENCY_MAX_WIN_SIZE);
- if (!oldval && val)
- return 1;
+ if (!oldval && val) {
+ if (atomic_inc_return(&blkiolat->enable_cnt) == 1)
+ schedule_work(&blkiolat->enable_work);
+ }
if (oldval && !val) {
blkcg_clear_delay(blkg);
- return -1;
+ if (atomic_dec_return(&blkiolat->enable_cnt) == 0)
+ schedule_work(&blkiolat->enable_work);
}
- return 0;
}
static void iolatency_clear_scaling(struct blkcg_gq *blkg)
@@ -795,11 +835,26 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
u64 lat_val = 0;
u64 oldval;
int ret;
- int enable = 0;
- ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
+ blkg_conf_init(&ctx, buf);
+
+ ret = blkg_conf_open_bdev(&ctx);
+ if (ret)
+ goto out;
+
+ /*
+	 * blk_iolatency_init() may fail after rq_qos_add() succeeds, which can
+	 * confuse the iolat_rq_qos() test. Make the test and the init atomic.
+ */
+ lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex);
+ if (!iolat_rq_qos(ctx.bdev->bd_queue))
+ ret = blk_iolatency_init(ctx.bdev->bd_disk);
+ if (ret)
+ goto out;
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx);
if (ret)
- return ret;
+ goto out;
iolat = blkg_to_lat(ctx.blkg);
p = ctx.body;
@@ -830,37 +885,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
blkg = ctx.blkg;
oldval = iolat->min_lat_nsec;
- enable = iolatency_set_min_lat_nsec(blkg, lat_val);
- if (enable) {
- WARN_ON_ONCE(!blk_get_queue(blkg->q));
- blkg_get(blkg);
- }
-
- if (oldval != iolat->min_lat_nsec) {
+ iolatency_set_min_lat_nsec(blkg, lat_val);
+ if (oldval != iolat->min_lat_nsec)
iolatency_clear_scaling(blkg);
- }
-
ret = 0;
out:
- blkg_conf_finish(&ctx);
- if (ret == 0 && enable) {
- struct iolatency_grp *tmp = blkg_to_lat(blkg);
- struct blk_iolatency *blkiolat = tmp->blkiolat;
-
- blk_mq_freeze_queue(blkg->q);
-
- if (enable == 1)
- atomic_inc(&blkiolat->enabled);
- else if (enable == -1)
- atomic_dec(&blkiolat->enabled);
- else
- WARN_ON_ONCE(1);
-
- blk_mq_unfreeze_queue(blkg->q);
-
- blkg_put(blkg);
- blk_put_queue(blkg->q);
- }
+ blkg_conf_exit(&ctx);
return ret ?: nbytes;
}
@@ -885,8 +915,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v)
return 0;
}
-static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
- size_t size)
+static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s)
{
struct latency_stat stat;
int cpu;
@@ -900,47 +929,45 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
}
preempt_enable();
- if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total);
- return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
- (unsigned long long)stat.ps.missed,
- (unsigned long long)stat.ps.total,
- iolat->rq_depth.max_depth);
+ if (iolat->max_depth == UINT_MAX)
+ seq_printf(s, " missed=%llu total=%llu depth=max",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total);
+ else
+ seq_printf(s, " missed=%llu total=%llu depth=%u",
+ (unsigned long long)stat.ps.missed,
+ (unsigned long long)stat.ps.total,
+ iolat->max_depth);
}
-static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
- size_t size)
+static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
unsigned long long avg_lat;
unsigned long long cur_win;
if (!blkcg_debug_stats)
- return 0;
+ return;
if (iolat->ssd)
- return iolatency_ssd_stat(iolat, buf, size);
+ return iolatency_ssd_stat(iolat, s);
avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
- if (iolat->rq_depth.max_depth == UINT_MAX)
- return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
- avg_lat, cur_win);
-
- return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
- iolat->rq_depth.max_depth, avg_lat, cur_win);
+ if (iolat->max_depth == UINT_MAX)
+ seq_printf(s, " depth=max avg_lat=%llu win=%llu",
+ avg_lat, cur_win);
+ else
+ seq_printf(s, " depth=%u avg_lat=%llu win=%llu",
+ iolat->max_depth, avg_lat, cur_win);
}
-
-static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp,
- struct request_queue *q,
- struct blkcg *blkcg)
+static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk,
+ struct blkcg *blkcg, gfp_t gfp)
{
struct iolatency_grp *iolat;
- iolat = kzalloc_node(sizeof(*iolat), gfp, q->node);
+ iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id);
if (!iolat)
return NULL;
iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
@@ -956,7 +983,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
struct blkcg_gq *blkg = lat_to_blkg(iolat);
- struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
+ struct rq_qos *rqos = iolat_rq_qos(blkg->q);
struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
u64 now = ktime_to_ns(ktime_get());
int cpu;
@@ -975,9 +1002,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd)
latency_stat_init(iolat, &iolat->cur_stat);
rq_wait_init(&iolat->rq_wait);
spin_lock_init(&iolat->child_lat.lock);
- iolat->rq_depth.queue_depth = blkg->q->nr_requests;
- iolat->rq_depth.max_depth = UINT_MAX;
- iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
+ iolat->max_depth = UINT_MAX;
iolat->blkiolat = blkiolat;
iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
atomic64_set(&iolat->window_start, now);
@@ -1001,14 +1026,8 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
struct iolatency_grp *iolat = pd_to_lat(pd);
struct blkcg_gq *blkg = lat_to_blkg(iolat);
- struct blk_iolatency *blkiolat = iolat->blkiolat;
- int ret;
- ret = iolatency_set_min_lat_nsec(blkg, 0);
- if (ret == 1)
- atomic_inc(&blkiolat->enabled);
- if (ret == -1)
- atomic_dec(&blkiolat->enabled);
+ iolatency_set_min_lat_nsec(blkg, 0);
iolatency_clear_scaling(blkg);
}
@@ -1045,7 +1064,7 @@ static int __init iolatency_init(void)
static void __exit iolatency_exit(void)
{
- return blkcg_policy_unregister(&blkcg_policy_iolatency);
+ blkcg_policy_unregister(&blkcg_policy_iolatency);
}
module_init(iolatency_init);
diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c
new file mode 100644
index 000000000000..4051fada01f1
--- /dev/null
+++ b/block/blk-ioprio.c
@@ -0,0 +1,243 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Block rq-qos policy for assigning an I/O priority class to requests.
+ *
+ * Using an rq-qos policy for assigning I/O priority class has two advantages
+ * over using the ioprio_set() system call:
+ *
+ * - This policy is cgroup based so it has all the advantages of cgroups.
+ * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos
+ * controller affects page cache writeback I/O for filesystems that support
+ *   associating a cgroup with writeback I/O. See also
+ * Documentation/admin-guide/cgroup-v2.rst.
+ */
+
+#include <linux/blk-mq.h>
+#include <linux/blk_types.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include "blk-cgroup.h"
+#include "blk-ioprio.h"
+#include "blk-rq-qos.h"
+
+/**
+ * enum prio_policy - I/O priority class policy.
+ * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class.
+ * @POLICY_PROMOTE_TO_RT: promote non-IOPRIO_CLASS_RT I/O to IOPRIO_CLASS_RT.
+ * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into
+ * IOPRIO_CLASS_BE.
+ * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE.
+ * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT.
+ *
+ * See also <linux/ioprio.h>.
+ */
+enum prio_policy {
+ POLICY_NO_CHANGE = 0,
+ POLICY_PROMOTE_TO_RT = 1,
+ POLICY_RESTRICT_TO_BE = 2,
+ POLICY_ALL_TO_IDLE = 3,
+ POLICY_NONE_TO_RT = 4,
+};
+
+static const char *policy_name[] = {
+ [POLICY_NO_CHANGE] = "no-change",
+ [POLICY_PROMOTE_TO_RT] = "promote-to-rt",
+ [POLICY_RESTRICT_TO_BE] = "restrict-to-be",
+ [POLICY_ALL_TO_IDLE] = "idle",
+ [POLICY_NONE_TO_RT] = "none-to-rt",
+};
+
+static struct blkcg_policy ioprio_policy;
+
+/**
+ * struct ioprio_blkg - Per (cgroup, request queue) data.
+ * @pd: blkg_policy_data structure.
+ */
+struct ioprio_blkg {
+ struct blkg_policy_data pd;
+};
+
+/**
+ * struct ioprio_blkcg - Per cgroup data.
+ * @cpd: blkcg_policy_data structure.
+ * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>.
+ */
+struct ioprio_blkcg {
+ struct blkcg_policy_data cpd;
+ enum prio_policy prio_policy;
+};
+
+static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct ioprio_blkg, pd) : NULL;
+}
+
+static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg)
+{
+ return container_of(blkcg_to_cpd(blkcg, &ioprio_policy),
+ struct ioprio_blkcg, cpd);
+}
+
+static struct ioprio_blkcg *
+ioprio_blkcg_from_css(struct cgroup_subsys_state *css)
+{
+ return blkcg_to_ioprio_blkcg(css_to_blkcg(css));
+}
+
+static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio)
+{
+ struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy);
+
+ if (!pd)
+ return NULL;
+
+ return blkcg_to_ioprio_blkcg(pd->blkg->blkcg);
+}
+
+static int ioprio_show_prio_policy(struct seq_file *sf, void *v)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf));
+
+ seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]);
+ return 0;
+}
+
+static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf,
+ size_t nbytes, loff_t off)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of));
+ int ret;
+
+ if (off != 0)
+ return -EIO;
+ /* kernfs_fop_write_iter() terminates 'buf' with '\0'. */
+ ret = sysfs_match_string(policy_name, buf);
+ if (ret < 0)
+ return ret;
+ blkcg->prio_policy = ret;
+ return nbytes;
+}
+
+static struct blkg_policy_data *
+ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp)
+{
+ struct ioprio_blkg *ioprio_blkg;
+
+ ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp);
+ if (!ioprio_blkg)
+ return NULL;
+
+ return &ioprio_blkg->pd;
+}
+
+static void ioprio_free_pd(struct blkg_policy_data *pd)
+{
+ struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd);
+
+ kfree(ioprio_blkg);
+}
+
+static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp)
+{
+ struct ioprio_blkcg *blkcg;
+
+ blkcg = kzalloc(sizeof(*blkcg), gfp);
+ if (!blkcg)
+ return NULL;
+ blkcg->prio_policy = POLICY_NO_CHANGE;
+ return &blkcg->cpd;
+}
+
+static void ioprio_free_cpd(struct blkcg_policy_data *cpd)
+{
+ struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd);
+
+ kfree(blkcg);
+}
+
+#define IOPRIO_ATTRS \
+ { \
+ .name = "prio.class", \
+ .seq_show = ioprio_show_prio_policy, \
+ .write = ioprio_set_prio_policy, \
+ }, \
+ { } /* sentinel */
+
+/* cgroup v2 attributes */
+static struct cftype ioprio_files[] = {
+ IOPRIO_ATTRS
+};
+
+/* cgroup v1 attributes */
+static struct cftype ioprio_legacy_files[] = {
+ IOPRIO_ATTRS
+};
+
+static struct blkcg_policy ioprio_policy = {
+ .dfl_cftypes = ioprio_files,
+ .legacy_cftypes = ioprio_legacy_files,
+
+ .cpd_alloc_fn = ioprio_alloc_cpd,
+ .cpd_free_fn = ioprio_free_cpd,
+
+ .pd_alloc_fn = ioprio_alloc_pd,
+ .pd_free_fn = ioprio_free_pd,
+};
+
+void blkcg_set_ioprio(struct bio *bio)
+{
+ struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio);
+ u16 prio;
+
+ if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE)
+ return;
+
+ if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT ||
+ blkcg->prio_policy == POLICY_NONE_TO_RT) {
+ /*
+		 * For RT threads the default priority level is 4 because
+		 * task_nice is 0. Promote non-RT I/O to the RT class at level
+		 * 4; tasks whose I/O is already RT class and needs a higher
+		 * priority can still raise it with ioprio_set().
+ */
+ if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT)
+ bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4);
+ return;
+ }
+
+ /*
+ * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers
+ * correspond to a lower priority. Hence, the max_t() below selects
+ * the lower priority of bi_ioprio and the cgroup I/O priority class.
+ * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O
+ * priority is assigned to the bio.
+ */
+ prio = max_t(u16, bio->bi_ioprio,
+ IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0));
+ if (prio > bio->bi_ioprio)
+ bio->bi_ioprio = prio;
+}
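
A minimal userspace sketch of the clamp blkcg_set_ioprio() performs. It relies on the fact that POLICY_RESTRICT_TO_BE and POLICY_ALL_TO_IDLE numerically equal IOPRIO_CLASS_BE and IOPRIO_CLASS_IDLE; the macros below are simplified stand-ins for include/uapi/linux/ioprio.h and the priorities are made-up examples.

#include <stdio.h>

#define IOPRIO_CLASS_SHIFT		13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_PRIO_CLASS(ioprio)	((ioprio) >> IOPRIO_CLASS_SHIFT)

enum { IOPRIO_CLASS_NONE, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE };

static unsigned short apply_policy(unsigned short bi_ioprio, int policy_class)
{
	unsigned short prio = IOPRIO_PRIO_VALUE(policy_class, 0);

	/* higher numeric class means lower priority, so take the max */
	return prio > bi_ioprio ? prio : bi_ioprio;
}

int main(void)
{
	/* a BE bio under the "idle" policy is demoted to IDLE (prints 3) ... */
	printf("%d\n", IOPRIO_PRIO_CLASS(apply_policy(
		IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4), IOPRIO_CLASS_IDLE)));
	/* ... while an unset (NONE) bio inherits the cgroup class (prints 2) */
	printf("%d\n", IOPRIO_PRIO_CLASS(apply_policy(
		IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, 0), IOPRIO_CLASS_BE)));
	return 0;
}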
+
+void blk_ioprio_exit(struct gendisk *disk)
+{
+ blkcg_deactivate_policy(disk, &ioprio_policy);
+}
+
+int blk_ioprio_init(struct gendisk *disk)
+{
+ return blkcg_activate_policy(disk, &ioprio_policy);
+}
+
+static int __init ioprio_init(void)
+{
+ return blkcg_policy_register(&ioprio_policy);
+}
+
+static void __exit ioprio_exit(void)
+{
+ blkcg_policy_unregister(&ioprio_policy);
+}
+
+module_init(ioprio_init);
+module_exit(ioprio_exit);
diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h
new file mode 100644
index 000000000000..b6afb8e80de0
--- /dev/null
+++ b/block/blk-ioprio.h
@@ -0,0 +1,28 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+
+#ifndef _BLK_IOPRIO_H_
+#define _BLK_IOPRIO_H_
+
+#include <linux/kconfig.h>
+
+struct request_queue;
+struct bio;
+
+#ifdef CONFIG_BLK_CGROUP_IOPRIO
+int blk_ioprio_init(struct gendisk *disk);
+void blk_ioprio_exit(struct gendisk *disk);
+void blkcg_set_ioprio(struct bio *bio);
+#else
+static inline int blk_ioprio_init(struct gendisk *disk)
+{
+ return 0;
+}
+static inline void blk_ioprio_exit(struct gendisk *disk)
+{
+}
+static inline void blkcg_set_ioprio(struct bio *bio)
+{
+}
+#endif
+
+#endif /* _BLK_IOPRIO_H_ */
diff --git a/block/blk-lib.c b/block/blk-lib.c
index 5f2c429d4378..e59c3069e835 100644
--- a/block/blk-lib.c
+++ b/block/blk-lib.c
@@ -10,41 +10,47 @@
#include "blk.h"
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp)
+static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector)
{
- struct bio *new = bio_alloc(gfp, nr_pages);
-
- if (bio) {
- bio_chain(bio, new);
- submit_bio(bio);
- }
-
- return new;
+ unsigned int discard_granularity = bdev_discard_granularity(bdev);
+ sector_t granularity_aligned_sector;
+
+ if (bdev_is_partition(bdev))
+ sector += bdev->bd_start_sect;
+
+ granularity_aligned_sector =
+ round_up(sector, discard_granularity >> SECTOR_SHIFT);
+
+ /*
+	 * Make sure subsequent bios start aligned to the discard granularity if
+	 * the request needs to be split.
+ */
+ if (granularity_aligned_sector != sector)
+ return granularity_aligned_sector - sector;
+
+ /*
+ * Align the bio size to the discard granularity to make splitting the bio
+ * at discard granularity boundaries easier in the driver if needed.
+ */
+ return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT;
}
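
A small runnable sketch of the alignment arithmetic above, with invented sector and granularity values; round_up()/round_down() are spelled out in plain C.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

static uint64_t round_up_to(uint64_t x, uint64_t step)
{
	return ((x + step - 1) / step) * step;
}

static uint64_t discard_limit(uint64_t sector, unsigned int granularity_bytes)
{
	uint64_t gran_sects = granularity_bytes >> SECTOR_SHIFT;
	uint64_t aligned = round_up_to(sector, gran_sects);

	if (aligned != sector)		/* first bio only runs up to the boundary */
		return aligned - sector;
	/* already aligned: as much as fits in bi_size, kept granularity-aligned */
	return (UINT_MAX - UINT_MAX % granularity_bytes) >> SECTOR_SHIFT;
}

int main(void)
{
	/* 1 MiB granularity (2048 sectors), start sector 3000 -> 1096 sectors */
	printf("%llu\n", (unsigned long long)discard_limit(3000, 1 << 20));
	/* granularity-aligned start -> close to the UINT_MAX-byte cap, in sectors */
	printf("%llu\n", (unsigned long long)discard_limit(4096, 1 << 20));
	return 0;
}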
int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, int flags,
- struct bio **biop)
+ sector_t nr_sects, gfp_t gfp_mask, struct bio **biop)
{
- struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
- unsigned int op;
sector_t bs_mask;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
+ if (!bdev_max_discard_sectors(bdev))
+ return -EOPNOTSUPP;
- if (flags & BLKDEV_DISCARD_SECURE) {
- if (!blk_queue_secure_erase(q))
- return -EOPNOTSUPP;
- op = REQ_OP_SECURE_ERASE;
- } else {
- if (!blk_queue_discard(q))
- return -EOPNOTSUPP;
- op = REQ_OP_DISCARD;
+	/* In case the discard granularity isn't set by a buggy device driver */
+ if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) {
+ pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n",
+ bdev);
+ return -EOPNOTSUPP;
}
bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
@@ -55,16 +61,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector,
return -EINVAL;
while (nr_sects) {
- sector_t req_sects = min_t(sector_t, nr_sects,
- bio_allowed_max_sectors(q));
-
- WARN_ON_ONCE((req_sects << 9) > UINT_MAX);
+ sector_t req_sects =
+ min(nr_sects, bio_discard_limit(bdev, sector));
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, op, 0);
-
bio->bi_iter.bi_size = req_sects << 9;
sector += req_sects;
nr_sects -= req_sects;
@@ -89,21 +90,19 @@ EXPORT_SYMBOL(__blkdev_issue_discard);
* @sector: start sector
* @nr_sects: number of sectors to discard
* @gfp_mask: memory allocation flags (for bio_alloc)
- * @flags: BLKDEV_DISCARD_* flags to control behaviour
*
* Description:
* Issue a discard request for the sectors in question.
*/
int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, unsigned long flags)
+ sector_t nr_sects, gfp_t gfp_mask)
{
struct bio *bio = NULL;
struct blk_plug plug;
int ret;
blk_start_plug(&plug);
- ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags,
- &bio);
+ ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio);
if (!ret && bio) {
ret = submit_bio_wait(bio);
if (ret == -EOPNOTSUPP)
@@ -116,109 +115,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL(blkdev_issue_discard);
-/**
- * __blkdev_issue_write_same - generate number of bios with same page
- * @bdev: target blockdev
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @page: page containing data to write
- * @biop: pointer to anchor bio
- *
- * Description:
- * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page.
- */
-static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask, struct page *page,
- struct bio **biop)
-{
- struct request_queue *q = bdev_get_queue(bdev);
- unsigned int max_write_same_sectors;
- struct bio *bio = *biop;
- sector_t bs_mask;
-
- if (!q)
- return -ENXIO;
-
- if (bdev_read_only(bdev))
- return -EPERM;
-
- bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
- if ((sector | nr_sects) & bs_mask)
- return -EINVAL;
-
- if (!bdev_write_same(bdev))
- return -EOPNOTSUPP;
-
- /* Ensure that max_write_same_sectors doesn't overflow bi_size */
- max_write_same_sectors = bio_allowed_max_sectors(q);
-
- while (nr_sects) {
- bio = blk_next_bio(bio, 1, gfp_mask);
- bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_vcnt = 1;
- bio->bi_io_vec->bv_page = page;
- bio->bi_io_vec->bv_offset = 0;
- bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0);
-
- if (nr_sects > max_write_same_sectors) {
- bio->bi_iter.bi_size = max_write_same_sectors << 9;
- nr_sects -= max_write_same_sectors;
- sector += max_write_same_sectors;
- } else {
- bio->bi_iter.bi_size = nr_sects << 9;
- nr_sects = 0;
- }
- cond_resched();
- }
-
- *biop = bio;
- return 0;
-}
-
-/**
- * blkdev_issue_write_same - queue a write same operation
- * @bdev: target blockdev
- * @sector: start sector
- * @nr_sects: number of sectors to write
- * @gfp_mask: memory allocation flags (for bio_alloc)
- * @page: page containing data
- *
- * Description:
- * Issue a write same request for the sectors in question.
- */
-int blkdev_issue_write_same(struct block_device *bdev, sector_t sector,
- sector_t nr_sects, gfp_t gfp_mask,
- struct page *page)
-{
- struct bio *bio = NULL;
- struct blk_plug plug;
- int ret;
-
- blk_start_plug(&plug);
- ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page,
- &bio);
- if (ret == 0 && bio) {
- ret = submit_bio_wait(bio);
- bio_put(bio);
- }
- blk_finish_plug(&plug);
- return ret;
-}
-EXPORT_SYMBOL(blkdev_issue_write_same);
-
static int __blkdev_issue_write_zeroes(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop, unsigned flags)
{
struct bio *bio = *biop;
unsigned int max_write_zeroes_sectors;
- struct request_queue *q = bdev_get_queue(bdev);
-
- if (!q)
- return -ENXIO;
if (bdev_read_only(bdev))
return -EPERM;
@@ -230,10 +132,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev,
return -EOPNOTSUPP;
while (nr_sects) {
- bio = blk_next_bio(bio, 0, gfp_mask);
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio->bi_opf = REQ_OP_WRITE_ZEROES;
if (flags & BLKDEV_ZERO_NOUNMAP)
bio->bi_opf |= REQ_NOUNMAP;
@@ -262,30 +162,24 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects)
{
sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512);
- return min(pages, (sector_t)BIO_MAX_PAGES);
+ return min(pages, (sector_t)BIO_MAX_VECS);
}
static int __blkdev_issue_zero_pages(struct block_device *bdev,
sector_t sector, sector_t nr_sects, gfp_t gfp_mask,
struct bio **biop)
{
- struct request_queue *q = bdev_get_queue(bdev);
struct bio *bio = *biop;
int bi_size = 0;
unsigned int sz;
- if (!q)
- return -ENXIO;
-
if (bdev_read_only(bdev))
return -EPERM;
while (nr_sects != 0) {
- bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects),
- gfp_mask);
+ bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects),
+ REQ_OP_WRITE, gfp_mask);
bio->bi_iter.bi_sector = sector;
- bio_set_dev(bio, bdev);
- bio_set_op_attrs(bio, REQ_OP_WRITE, 0);
while (nr_sects != 0) {
sz = min((sector_t) PAGE_SIZE, nr_sects << 9);
@@ -405,3 +299,47 @@ retry:
return ret;
}
EXPORT_SYMBOL(blkdev_issue_zeroout);
+
+int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector,
+ sector_t nr_sects, gfp_t gfp)
+{
+ sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1;
+ unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev);
+ struct bio *bio = NULL;
+ struct blk_plug plug;
+ int ret = 0;
+
+ /* make sure that "len << SECTOR_SHIFT" doesn't overflow */
+ if (max_sectors > UINT_MAX >> SECTOR_SHIFT)
+ max_sectors = UINT_MAX >> SECTOR_SHIFT;
+ max_sectors &= ~bs_mask;
+
+ if (max_sectors == 0)
+ return -EOPNOTSUPP;
+ if ((sector | nr_sects) & bs_mask)
+ return -EINVAL;
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ blk_start_plug(&plug);
+ for (;;) {
+ unsigned int len = min_t(sector_t, nr_sects, max_sectors);
+
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp);
+ bio->bi_iter.bi_sector = sector;
+ bio->bi_iter.bi_size = len << SECTOR_SHIFT;
+
+ sector += len;
+ nr_sects -= len;
+ if (!nr_sects) {
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ break;
+ }
+ cond_resched();
+ }
+ blk_finish_plug(&plug);
+
+ return ret;
+}
+EXPORT_SYMBOL(blkdev_issue_secure_erase);
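
A hedged userspace walk-through of the per-bio length cap used above; the 4 KiB logical block size, the device limit and the request size are example values only.

#include <limits.h>
#include <stdint.h>
#include <stdio.h>

#define SECTOR_SHIFT 9

int main(void)
{
	unsigned int bs_mask = (4096 >> SECTOR_SHIFT) - 1;	/* 4 KiB blocks */
	unsigned int max_sectors = 1 << 24;			/* device limit */
	uint64_t nr_sects = 5ULL << 21;				/* erase 5 GiB */

	/* keep "len << SECTOR_SHIFT" within the 32-bit bi_size ... */
	if (max_sectors > UINT_MAX >> SECTOR_SHIFT)
		max_sectors = UINT_MAX >> SECTOR_SHIFT;
	/* ... and keep every bio logical-block aligned */
	max_sectors &= ~bs_mask;

	while (nr_sects) {
		uint64_t len = nr_sects < max_sectors ? nr_sects : max_sectors;

		printf("bio: %llu sectors\n", (unsigned long long)len);
		nr_sects -= len;
	}
	return 0;
}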
diff --git a/block/blk-map.c b/block/blk-map.c
index 6e804892d5ec..71210cdb3442 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -12,7 +12,8 @@
#include "blk.h"
struct bio_map_data {
- int is_our_pages;
+ bool is_our_pages : 1;
+ bool is_null_mapped : 1;
struct iov_iter iter;
struct iovec iov[];
};
@@ -28,9 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data,
bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask);
if (!bmd)
return NULL;
- memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs);
bmd->iter = *data;
- bmd->iter.iov = bmd->iov;
+ if (iter_is_iovec(data)) {
+ memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs);
+ bmd->iter.__iov = bmd->iov;
+ }
return bmd;
}
@@ -108,7 +111,7 @@ static int bio_uncopy_user(struct bio *bio)
struct bio_map_data *bmd = bio->bi_private;
int ret = 0;
- if (!bio_flagged(bio, BIO_NULL_MAPPED)) {
+ if (!bmd->is_null_mapped) {
/*
* if we're in a workqueue, the request is orphaned, so
* don't copy into a random user address space, just free
@@ -122,24 +125,11 @@ static int bio_uncopy_user(struct bio *bio)
bio_free_pages(bio);
}
kfree(bmd);
- bio_put(bio);
return ret;
}
-/**
- * bio_copy_user_iov - copy user data to bio
- * @q: destination block queue
- * @map_data: pointer to the rq_map_data holding pages (if necessary)
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Prepares and returns a bio for indirect user io, bouncing data
- * to/from kernel pages as necessary. Must be paired with
- * call bio_uncopy_user() on io completion.
- */
-static struct bio *bio_copy_user_iov(struct request_queue *q,
- struct rq_map_data *map_data, struct iov_iter *iter,
- gfp_t gfp_mask)
+static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data,
+ struct iov_iter *iter, gfp_t gfp_mask)
{
struct bio_map_data *bmd;
struct page *page;
@@ -151,28 +141,26 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
bmd = bio_alloc_map_data(iter, gfp_mask);
if (!bmd)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
/*
* We need to do a deep copy of the iov_iter including the iovecs.
* The caller provided iov might point to an on-stack or otherwise
* shortlived one.
*/
- bmd->is_our_pages = map_data ? 0 : 1;
+ bmd->is_our_pages = !map_data;
+ bmd->is_null_mapped = (map_data && map_data->null_mapped);
- nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE);
- if (nr_pages > BIO_MAX_PAGES)
- nr_pages = BIO_MAX_PAGES;
+ nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE));
ret = -ENOMEM;
- bio = bio_kmalloc(gfp_mask, nr_pages);
+ bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
goto out_bmd;
-
- ret = 0;
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq));
if (map_data) {
- nr_pages = 1 << map_data->page_order;
+ nr_pages = 1U << map_data->page_order;
i = map_data->offset / PAGE_SIZE;
}
while (len) {
@@ -186,7 +174,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
if (map_data) {
if (i == map_data->nr_entries * nr_pages) {
ret = -ENOMEM;
- break;
+ goto cleanup;
}
page = map_data->pages[i / nr_pages];
@@ -194,14 +182,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
i++;
} else {
- page = alloc_page(q->bounce_gfp | gfp_mask);
+ page = alloc_page(GFP_NOIO | gfp_mask);
if (!page) {
ret = -ENOMEM;
- break;
+ goto cleanup;
}
}
- if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) {
+ if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) {
if (!map_data)
__free_page(page);
break;
@@ -211,21 +199,25 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
offset = 0;
}
- if (ret)
- goto cleanup;
-
if (map_data)
map_data->offset += bio->bi_iter.bi_size;
/*
* success
*/
- if ((iov_iter_rw(iter) == WRITE &&
- (!map_data || !map_data->null_mapped)) ||
- (map_data && map_data->from_user)) {
+ if (iov_iter_rw(iter) == WRITE &&
+ (!map_data || !map_data->null_mapped)) {
ret = bio_copy_from_iter(bio, iter);
if (ret)
goto cleanup;
+ } else if (map_data && map_data->from_user) {
+ struct iov_iter iter2 = *iter;
+
+ /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */
+ iter2.data_source = ITER_SOURCE;
+ ret = bio_copy_from_iter(bio, &iter2);
+ if (ret)
+ goto cleanup;
} else {
if (bmd->is_our_pages)
zero_fill_bio(bio);
@@ -233,49 +225,84 @@ static struct bio *bio_copy_user_iov(struct request_queue *q,
}
bio->bi_private = bmd;
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
- return bio;
+
+ ret = blk_rq_append_bio(rq, bio);
+ if (ret)
+ goto cleanup;
+ return 0;
cleanup:
if (!map_data)
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
out_bmd:
kfree(bmd);
- return ERR_PTR(ret);
+ return ret;
}
-/**
- * bio_map_user_iov - map user iovec into bio
- * @q: the struct request_queue for the bio
- * @iter: iovec iterator
- * @gfp_mask: memory allocation flags
- *
- * Map the user space address into a bio suitable for io to a block
- * device. Returns an error pointer in case of error.
- */
-static struct bio *bio_map_user_iov(struct request_queue *q,
- struct iov_iter *iter, gfp_t gfp_mask)
+static void blk_mq_map_bio_put(struct bio *bio)
+{
+ if (bio->bi_opf & REQ_ALLOC_CACHE) {
+ bio_put(bio);
+ } else {
+ bio_uninit(bio);
+ kfree(bio);
+ }
+}
+
+static struct bio *blk_rq_map_bio_alloc(struct request *rq,
+ unsigned int nr_vecs, gfp_t gfp_mask)
{
- unsigned int max_sectors = queue_max_hw_sectors(q);
- int j;
+ struct bio *bio;
+
+ if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) {
+ bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask,
+ &fs_bio_set);
+ if (!bio)
+ return NULL;
+ } else {
+ bio = bio_kmalloc(nr_vecs, gfp_mask);
+ if (!bio)
+ return NULL;
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq));
+ }
+ return bio;
+}
+
+static int bio_map_user_iov(struct request *rq, struct iov_iter *iter,
+ gfp_t gfp_mask)
+{
+ iov_iter_extraction_t extraction_flags = 0;
+ unsigned int max_sectors = queue_max_hw_sectors(rq->q);
+ unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS);
struct bio *bio;
int ret;
+ int j;
if (!iov_iter_count(iter))
- return ERR_PTR(-EINVAL);
+ return -EINVAL;
- bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES));
- if (!bio)
- return ERR_PTR(-ENOMEM);
+ bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask);
+ if (bio == NULL)
+ return -ENOMEM;
+
+ if (blk_queue_pci_p2pdma(rq->q))
+ extraction_flags |= ITER_ALLOW_P2PDMA;
+ if (iov_iter_extract_will_pin(iter))
+ bio_set_flag(bio, BIO_PAGE_PINNED);
while (iov_iter_count(iter)) {
- struct page **pages;
+ struct page *stack_pages[UIO_FASTIOV];
+ struct page **pages = stack_pages;
ssize_t bytes;
- size_t offs, added = 0;
+ size_t offs;
int npages;
- bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs);
+ if (nr_vecs > ARRAY_SIZE(stack_pages))
+ pages = NULL;
+
+ bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX,
+ nr_vecs, extraction_flags, &offs);
if (unlikely(bytes <= 0)) {
ret = bytes ? bytes : -EFAULT;
goto out_unmap;
@@ -283,10 +310,9 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE);
- if (unlikely(offs & queue_dma_alignment(q))) {
- ret = -EINVAL;
+ if (unlikely(offs & queue_dma_alignment(rq->q)))
j = 0;
- } else {
+ else {
for (j = 0; j < npages; j++) {
struct page *page = pages[j];
unsigned int n = PAGE_SIZE - offs;
@@ -295,66 +321,44 @@ static struct bio *bio_map_user_iov(struct request_queue *q,
if (n > bytes)
n = bytes;
- if (!bio_add_hw_page(q, bio, page, n, offs,
- max_sectors, &same_page)) {
- if (same_page)
- put_page(page);
+ if (!bio_add_hw_page(rq->q, bio, page, n, offs,
+ max_sectors, &same_page))
break;
- }
- added += n;
+ if (same_page)
+ bio_release_page(bio, page);
bytes -= n;
offs = 0;
}
- iov_iter_advance(iter, added);
}
/*
* release the pages we didn't map into the bio, if any
*/
while (j < npages)
- put_page(pages[j++]);
- kvfree(pages);
+ bio_release_page(bio, pages[j++]);
+ if (pages != stack_pages)
+ kvfree(pages);
/* couldn't stuff something into bio? */
- if (bytes)
+ if (bytes) {
+ iov_iter_revert(iter, bytes);
break;
+ }
}
- bio_set_flag(bio, BIO_USER_MAPPED);
-
- /*
- * subtle -- if bio_map_user_iov() ended up bouncing a bio,
- * it would normally disappear when its bi_end_io is run.
- * however, we need it for the unmap, so grab an extra
- * reference to it
- */
- bio_get(bio);
- return bio;
+ ret = blk_rq_append_bio(rq, bio);
+ if (ret)
+ goto out_unmap;
+ return 0;
out_unmap:
bio_release_pages(bio, false);
- bio_put(bio);
- return ERR_PTR(ret);
-}
-
-/**
- * bio_unmap_user - unmap a bio
- * @bio: the bio being unmapped
- *
- * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from
- * process context.
- *
- * bio_unmap_user() may sleep.
- */
-static void bio_unmap_user(struct bio *bio)
-{
- bio_release_pages(bio, bio_data_dir(bio) == READ);
- bio_put(bio);
- bio_put(bio);
+ blk_mq_map_bio_put(bio);
+ return ret;
}
static void bio_invalidate_vmalloc_pages(struct bio *bio)
{
-#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE
+#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE
if (bio->bi_private && !op_is_write(bio_op(bio))) {
unsigned long i, len = 0;
@@ -368,7 +372,8 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio)
static void bio_map_kern_endio(struct bio *bio)
{
bio_invalidate_vmalloc_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
}
/**
@@ -393,9 +398,10 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data,
int offset, i;
struct bio *bio;
- bio = bio_kmalloc(gfp_mask, nr_pages);
+ bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
if (is_vmalloc) {
flush_kernel_vmap_range(data, len);
@@ -419,7 +425,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data,
if (bio_add_pc_page(q, bio, page, bytes,
offset) < bytes) {
/* we don't support partial mappings */
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
return ERR_PTR(-EINVAL);
}
@@ -435,7 +442,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data,
static void bio_copy_kern_endio(struct bio *bio)
{
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
}
static void bio_copy_kern_endio_read(struct bio *bio)
@@ -445,7 +453,7 @@ static void bio_copy_kern_endio_read(struct bio *bio)
struct bvec_iter_all iter_all;
bio_for_each_segment_all(bvec, bio, iter_all) {
- memcpy(p, page_address(bvec->bv_page), bvec->bv_len);
+ memcpy_from_bvec(p, bvec);
p += bvec->bv_len;
}
@@ -480,9 +488,10 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
return ERR_PTR(-EINVAL);
nr_pages = end - start;
- bio = bio_kmalloc(gfp_mask, nr_pages);
+ bio = bio_kmalloc(nr_pages, gfp_mask);
if (!bio)
return ERR_PTR(-ENOMEM);
+ bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0);
while (len) {
struct page *page;
@@ -491,7 +500,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
if (bytes > len)
bytes = len;
- page = alloc_page(q->bounce_gfp | gfp_mask);
+ page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask);
if (!page)
goto cleanup;
@@ -516,7 +525,8 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data,
cleanup:
bio_free_pages(bio);
- bio_put(bio);
+ bio_uninit(bio);
+ kfree(bio);
return ERR_PTR(-ENOMEM);
}
@@ -524,86 +534,84 @@ cleanup:
* Append a bio to a passthrough request. Only works if the bio can be merged
* into the request based on the driver constraints.
*/
-int blk_rq_append_bio(struct request *rq, struct bio **bio)
+int blk_rq_append_bio(struct request *rq, struct bio *bio)
{
- struct bio *orig_bio = *bio;
struct bvec_iter iter;
struct bio_vec bv;
unsigned int nr_segs = 0;
- blk_queue_bounce(rq->q, bio);
-
- bio_for_each_bvec(bv, *bio, iter)
+ bio_for_each_bvec(bv, bio, iter)
nr_segs++;
if (!rq->bio) {
- blk_rq_bio_prep(rq, *bio, nr_segs);
+ blk_rq_bio_prep(rq, bio, nr_segs);
} else {
- if (!ll_back_merge_fn(rq, *bio, nr_segs)) {
- if (orig_bio != *bio) {
- bio_put(*bio);
- *bio = orig_bio;
- }
+ if (!ll_back_merge_fn(rq, bio, nr_segs))
return -EINVAL;
- }
-
- rq->biotail->bi_next = *bio;
- rq->biotail = *bio;
- rq->__data_len += (*bio)->bi_iter.bi_size;
- bio_crypt_free_ctx(*bio);
+ rq->biotail->bi_next = bio;
+ rq->biotail = bio;
+ rq->__data_len += (bio)->bi_iter.bi_size;
+ bio_crypt_free_ctx(bio);
}
return 0;
}
EXPORT_SYMBOL(blk_rq_append_bio);
-static int __blk_rq_unmap_user(struct bio *bio)
-{
- int ret = 0;
-
- if (bio) {
- if (bio_flagged(bio, BIO_USER_MAPPED))
- bio_unmap_user(bio);
- else
- ret = bio_uncopy_user(bio);
- }
-
- return ret;
-}
-
-static int __blk_rq_map_user_iov(struct request *rq,
- struct rq_map_data *map_data, struct iov_iter *iter,
- gfp_t gfp_mask, bool copy)
+/* Prepare bio for passthrough IO given ITER_BVEC iter */
+static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter)
{
struct request_queue *q = rq->q;
- struct bio *bio, *orig_bio;
- int ret;
+ size_t nr_iter = iov_iter_count(iter);
+ size_t nr_segs = iter->nr_segs;
+ struct bio_vec *bvecs, *bvprvp = NULL;
+ const struct queue_limits *lim = &q->limits;
+ unsigned int nsegs = 0, bytes = 0;
+ struct bio *bio;
+ size_t i;
- if (copy)
- bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
- else
- bio = bio_map_user_iov(q, iter, gfp_mask);
+ if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q))
+ return -EINVAL;
+ if (nr_segs > queue_max_segments(q))
+ return -EINVAL;
- if (IS_ERR(bio))
- return PTR_ERR(bio);
+ /* no iovecs to alloc, as we already have a BVEC iterator */
+ bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL);
+ if (bio == NULL)
+ return -ENOMEM;
- bio->bi_opf &= ~REQ_OP_MASK;
- bio->bi_opf |= req_op(rq);
+ bio_iov_bvec_set(bio, (struct iov_iter *)iter);
+ blk_rq_bio_prep(rq, bio, nr_segs);
- orig_bio = bio;
+ /* loop to perform a bunch of sanity checks */
+ bvecs = (struct bio_vec *)iter->bvec;
+ for (i = 0; i < nr_segs; i++) {
+ struct bio_vec *bv = &bvecs[i];
- /*
- * We link the bounce buffer in and could have to traverse it
- * later so we have to get a ref to prevent it from being freed
- */
- ret = blk_rq_append_bio(rq, &bio);
- if (ret) {
- __blk_rq_unmap_user(orig_bio);
- return ret;
+ /*
+ * If the queue doesn't support SG gaps and adding this
+		 * offset would create a gap, fall back to copying.
+ */
+ if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) {
+ blk_mq_map_bio_put(bio);
+ return -EREMOTEIO;
+ }
+ /* check full condition */
+ if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len)
+ goto put_bio;
+ if (bytes + bv->bv_len > nr_iter)
+ goto put_bio;
+ if (bv->bv_offset + bv->bv_len > PAGE_SIZE)
+ goto put_bio;
+
+ nsegs++;
+ bytes += bv->bv_len;
+ bvprvp = bv;
}
- bio_get(bio);
-
return 0;
+put_bio:
+ blk_mq_map_bio_put(bio);
+ return -EINVAL;
}
/**
@@ -620,36 +628,46 @@ static int __blk_rq_map_user_iov(struct request *rq,
*
* A matching blk_rq_unmap_user() must be issued at the end of I/O, while
* still in process context.
- *
- * Note: The mapped bio may need to be bounced through blk_queue_bounce()
- * before being submitted to the device, as pages mapped may be out of
- * reach. It's the callers responsibility to make sure this happens. The
- * original bio must be passed back in to blk_rq_unmap_user() for proper
- * unmapping.
*/
int blk_rq_map_user_iov(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- bool copy = false;
+ bool copy = false, map_bvec = false;
unsigned long align = q->dma_pad_mask | queue_dma_alignment(q);
struct bio *bio = NULL;
struct iov_iter i;
int ret = -EINVAL;
- if (!iter_is_iovec(iter))
- goto fail;
-
if (map_data)
copy = true;
+ else if (blk_queue_may_bounce(q))
+ copy = true;
else if (iov_iter_alignment(iter) & align)
copy = true;
+ else if (iov_iter_is_bvec(iter))
+ map_bvec = true;
+ else if (!user_backed_iter(iter))
+ copy = true;
else if (queue_virt_boundary(q))
copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter);
+ if (map_bvec) {
+ ret = blk_rq_map_user_bvec(rq, iter);
+ if (!ret)
+ return 0;
+ if (ret != -EREMOTEIO)
+ goto fail;
+ /* fall back to copying the data on limits mismatches */
+ copy = true;
+ }
+
i = *iter;
do {
- ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+ if (copy)
+ ret = bio_copy_user_iov(rq, map_data, &i, gfp_mask);
+ else
+ ret = bio_map_user_iov(rq, &i, gfp_mask);
if (ret)
goto unmap_rq;
if (!bio)
@@ -670,9 +688,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
struct rq_map_data *map_data, void __user *ubuf,
unsigned long len, gfp_t gfp_mask)
{
- struct iovec iov;
struct iov_iter i;
- int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i);
+ int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i);
if (unlikely(ret < 0))
return ret;
@@ -681,6 +698,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(blk_rq_map_user);
+int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data,
+ void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask,
+ bool vec, int iov_count, bool check_iter_count, int rw)
+{
+ int ret = 0;
+
+ if (vec) {
+ struct iovec fast_iov[UIO_FASTIOV];
+ struct iovec *iov = fast_iov;
+ struct iov_iter iter;
+
+ ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len,
+ UIO_FASTIOV, &iov, &iter);
+ if (ret < 0)
+ return ret;
+
+ if (iov_count) {
+ /* SG_IO howto says that the shorter of the two wins */
+ iov_iter_truncate(&iter, buf_len);
+ if (check_iter_count && !iov_iter_count(&iter)) {
+ kfree(iov);
+ return -EINVAL;
+ }
+ }
+
+ ret = blk_rq_map_user_iov(req->q, req, map_data, &iter,
+ gfp_mask);
+ kfree(iov);
+ } else if (buf_len) {
+ ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len,
+ gfp_mask);
+ }
+ return ret;
+}
+EXPORT_SYMBOL(blk_rq_map_user_io);
+
/**
* blk_rq_unmap_user - unmap a request with user data
* @bio: start of bio list
@@ -692,21 +745,21 @@ EXPORT_SYMBOL(blk_rq_map_user);
*/
int blk_rq_unmap_user(struct bio *bio)
{
- struct bio *mapped_bio;
+ struct bio *next_bio;
int ret = 0, ret2;
while (bio) {
- mapped_bio = bio;
- if (unlikely(bio_flagged(bio, BIO_BOUNCED)))
- mapped_bio = bio->bi_private;
-
- ret2 = __blk_rq_unmap_user(mapped_bio);
- if (ret2 && !ret)
- ret = ret2;
+ if (bio->bi_private) {
+ ret2 = bio_uncopy_user(bio);
+ if (ret2 && !ret)
+ ret = ret2;
+ } else {
+ bio_release_pages(bio, bio_data_dir(bio) == READ);
+ }
- mapped_bio = bio;
+ next_bio = bio;
bio = bio->bi_next;
- bio_put(mapped_bio);
+ blk_mq_map_bio_put(next_bio);
}
return ret;
@@ -731,7 +784,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
{
int reading = rq_data_dir(rq) == READ;
unsigned long addr = (unsigned long) kbuf;
- struct bio *bio, *orig_bio;
+ struct bio *bio;
int ret;
if (len > (queue_max_hw_sectors(q) << 9))
@@ -739,7 +792,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
if (!len || !kbuf)
return -EINVAL;
- if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf))
+ if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) ||
+ blk_queue_may_bounce(q))
bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading);
else
bio = bio_map_kern(q, kbuf, len, gfp_mask);
@@ -750,14 +804,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf,
bio->bi_opf &= ~REQ_OP_MASK;
bio->bi_opf |= req_op(rq);
- orig_bio = bio;
- ret = blk_rq_append_bio(rq, &bio);
+ ret = blk_rq_append_bio(rq, bio);
if (unlikely(ret)) {
- /* request is too big */
- bio_put(orig_bio);
- return ret;
+ bio_uninit(bio);
+ kfree(bio);
}
-
- return 0;
+ return ret;
}
EXPORT_SYMBOL(blk_rq_map_kern);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index f0b0bae075a0..2d470cf2173e 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -6,11 +6,48 @@
#include <linux/module.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/scatterlist.h>
+#include <linux/part_stat.h>
+#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
+#include "blk-throttle.h"
+
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ int idx;
+
+ bio_get_first_bvec(bio, bv);
+ if (bv->bv_len == bio->bi_iter.bi_size)
+ return; /* this bio only has a single bvec */
+
+ bio_advance_iter(bio, &iter, iter.bi_size);
+
+ if (!iter.bi_bvec_done)
+ idx = iter.bi_idx - 1;
+ else /* in the middle of bvec */
+ idx = iter.bi_idx;
+
+ *bv = bio->bi_io_vec[idx];
+
+ /*
+	 * iter.bi_bvec_done records the actual length of the last bvec
+	 * if this bio ends in the middle of an io vector
+ */
+ if (iter.bi_bvec_done)
+ bv->bv_len = iter.bi_bvec_done;
+}
static inline bool bio_will_gap(struct request_queue *q,
struct request *prev_rq, struct bio *prev, struct bio *next)
@@ -45,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q,
bio_get_first_bvec(next, &nb);
if (biovec_phys_mergeable(q, &pb, &nb))
return false;
- return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+ return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset);
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
@@ -58,29 +95,33 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio)
return bio_will_gap(req->q, NULL, bio, req->bio);
}
-static struct bio *blk_bio_discard_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *nsegs)
+/*
+ * The max size one bio can handle is UINT_MAX because bvec_iter.bi_size
+ * is defined as 'unsigned int', and it also has to be aligned to the
+ * logical block size, which is the minimum unit accepted by the hardware.
+ */
+static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim)
+{
+ return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT;
+}
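
For a concrete number, assuming a 4096-byte logical block size, the cap computed above works out as follows (plain userspace arithmetic, not kernel code):

#include <limits.h>
#include <stdio.h>

int main(void)
{
	unsigned int lbs = 4096;	/* logical block size in bytes */
	unsigned int max_bytes = UINT_MAX - UINT_MAX % lbs;

	printf("%u bytes, i.e. %u sectors per bio\n", max_bytes, max_bytes >> 9);
	return 0;
}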
+
+static struct bio *bio_split_discard(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
unsigned int max_discard_sectors, granularity;
- int alignment;
sector_t tmp;
unsigned split_sectors;
*nsegs = 1;
- /* Zero-sector (unknown) and one-sector granularities are the same. */
- granularity = max(q->limits.discard_granularity >> 9, 1U);
+ granularity = max(lim->discard_granularity >> 9, 1U);
- max_discard_sectors = min(q->limits.max_discard_sectors,
- bio_allowed_max_sectors(q));
+ max_discard_sectors =
+ min(lim->max_discard_sectors, bio_allowed_max_sectors(lim));
max_discard_sectors -= max_discard_sectors % granularity;
-
- if (unlikely(!max_discard_sectors)) {
- /* XXX: warn */
+ if (unlikely(!max_discard_sectors))
return NULL;
- }
if (bio_sectors(bio) <= max_discard_sectors)
return NULL;
@@ -91,9 +132,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
* If the next starting sector would be misaligned, stop the discard at
* the previous aligned sector.
*/
- alignment = (q->limits.discard_alignment >> 9) % granularity;
-
- tmp = bio->bi_iter.bi_sector + split_sectors - alignment;
+ tmp = bio->bi_iter.bi_sector + split_sectors -
+ ((lim->discard_alignment >> 9) % granularity);
tmp = sector_div(tmp, granularity);
if (split_sectors > tmp)
@@ -102,34 +142,16 @@ static struct bio *blk_bio_discard_split(struct request_queue *q,
return bio_split(bio, split_sectors, GFP_NOIO, bs);
}
-static struct bio *blk_bio_write_zeroes_split(struct request_queue *q,
- struct bio *bio, struct bio_set *bs, unsigned *nsegs)
+static struct bio *bio_split_write_zeroes(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned *nsegs, struct bio_set *bs)
{
*nsegs = 0;
-
- if (!q->limits.max_write_zeroes_sectors)
- return NULL;
-
- if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors)
- return NULL;
-
- return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs);
-}
-
-static struct bio *blk_bio_write_same_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *nsegs)
-{
- *nsegs = 1;
-
- if (!q->limits.max_write_same_sectors)
+ if (!lim->max_write_zeroes_sectors)
return NULL;
-
- if (bio_sectors(bio) <= q->limits.max_write_same_sectors)
+ if (bio_sectors(bio) <= lim->max_write_zeroes_sectors)
return NULL;
-
- return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs);
+ return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs);
}
/*
@@ -140,51 +162,60 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q,
* requests that are submitted to a block device if the start of a bio is not
* aligned to a physical block boundary.
*/
-static inline unsigned get_max_io_size(struct request_queue *q,
- struct bio *bio)
+static inline unsigned get_max_io_size(struct bio *bio,
+ const struct queue_limits *lim)
{
- unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector);
- unsigned max_sectors = sectors;
- unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT;
- unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT;
- unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1);
-
- max_sectors += start_offset;
- max_sectors &= ~(pbs - 1);
- if (max_sectors > start_offset)
- return max_sectors - start_offset;
+ unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT;
+ unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT;
+ unsigned max_sectors = lim->max_sectors, start, end;
+
+ if (lim->chunk_sectors) {
+ max_sectors = min(max_sectors,
+ blk_chunk_sectors_left(bio->bi_iter.bi_sector,
+ lim->chunk_sectors));
+ }
- return sectors & (lbs - 1);
+ start = bio->bi_iter.bi_sector & (pbs - 1);
+ end = (start + max_sectors) & ~(pbs - 1);
+ if (end > start)
+ return end - start;
+ return max_sectors & ~(lbs - 1);
}
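
A small sketch of the physical-block alignment step in get_max_io_size(), with made-up limits and the chunk_sectors handling omitted:

#include <stdio.h>

static unsigned int max_io_sectors(unsigned long long bi_sector,
				   unsigned int max_sectors,
				   unsigned int pbs, unsigned int lbs)
{
	unsigned int start = bi_sector & (pbs - 1);
	unsigned int end = (start + max_sectors) & ~(pbs - 1);

	if (end > start)		/* keep the split physical-block aligned */
		return end - start;
	return max_sectors & ~(lbs - 1);	/* tiny cap: only logical alignment */
}

int main(void)
{
	/* 4 KiB physical blocks (8 sectors), 512 B logical (1 sector),
	 * a 1280-sector cap, and a bio starting 3 sectors into a block */
	printf("%u\n", max_io_sectors(3, 1280, 8, 1));	/* 1277 */
	printf("%u\n", max_io_sectors(0, 1280, 8, 1));	/* 1280 */
	return 0;
}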
-static inline unsigned get_max_segment_size(const struct request_queue *q,
- struct page *start_page,
- unsigned long offset)
+/**
+ * get_max_segment_size() - maximum number of bytes to add as a single segment
+ * @lim: Request queue limits.
+ * @start_page: Page at which the segment to be added starts.
+ * @offset: Offset from @start_page at which to add the segment.
+ *
+ * Returns the maximum number of bytes that can be added as a single segment.
+ */
+static inline unsigned get_max_segment_size(const struct queue_limits *lim,
+ struct page *start_page, unsigned long offset)
{
- unsigned long mask = queue_segment_boundary(q);
+ unsigned long mask = lim->seg_boundary_mask;
offset = mask & (page_to_phys(start_page) + offset);
/*
- * overflow may be triggered in case of zero page physical address
- * on 32bit arch, use queue's max segment size when that happens.
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1
+ * after having calculated the minimum.
*/
- return min_not_zero(mask - offset + 1,
- (unsigned long)queue_max_segment_size(q));
+ return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1;
}
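
A runnable sketch of the boundary arithmetic in get_max_segment_size(), including the min-then-add-1 ordering that avoids an overflow when the mask is ULONG_MAX and the offset is 0; the addresses and limits are invented:

#include <stdio.h>

static unsigned long min_ul(unsigned long a, unsigned long b)
{
	return a < b ? a : b;
}

static unsigned long max_seg_bytes(unsigned long seg_boundary_mask,
				   unsigned long max_segment_size,
				   unsigned long phys_addr)
{
	unsigned long offset = seg_boundary_mask & phys_addr;

	/* take the min first, then add 1, so ULONG_MAX - 0 + 1 never occurs */
	return min_ul(seg_boundary_mask - offset, max_segment_size - 1) + 1;
}

int main(void)
{
	/* 64 KiB boundary, 64 KiB max segment, buffer starting 4 KiB in */
	printf("%lu\n", max_seg_bytes(0xffff, 0x10000, 0x1000));	/* 61440 */
	/* no boundary at all: limited purely by the max segment size */
	printf("%lu\n", max_seg_bytes(~0UL, 0x10000, 0));		/* 65536 */
	return 0;
}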
/**
* bvec_split_segs - verify whether or not a bvec should be split in the middle
- * @q: [in] request queue associated with the bio associated with @bv
+ * @lim: [in] queue limits to split based on
* @bv: [in] bvec to examine
* @nsegs: [in,out] Number of segments in the bio being built. Incremented
* by the number of segments from @bv that may be appended to that
* bio without exceeding @max_segs
- * @sectors: [in,out] Number of sectors in the bio being built. Incremented
- * by the number of sectors from @bv that may be appended to that
- * bio without exceeding @max_sectors
+ * @bytes: [in,out] Number of bytes in the bio being built. Incremented
+ * by the number of bytes from @bv that may be appended to that
+ * bio without exceeding @max_bytes
* @max_segs: [in] upper bound for *@nsegs
- * @max_sectors: [in] upper bound for *@sectors
+ * @max_bytes: [in] upper bound for *@bytes
*
* When splitting a bio, it can happen that a bvec is encountered that is too
* big to fit in a single segment and hence that it has to be split in the
@@ -193,18 +224,17 @@ static inline unsigned get_max_segment_size(const struct request_queue *q,
 * *@nsegs segments and *@bytes bytes would make that bio unacceptable for
* the block driver.
*/
-static bool bvec_split_segs(const struct request_queue *q,
- const struct bio_vec *bv, unsigned *nsegs,
- unsigned *sectors, unsigned max_segs,
- unsigned max_sectors)
+static bool bvec_split_segs(const struct queue_limits *lim,
+ const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes,
+ unsigned max_segs, unsigned max_bytes)
{
- unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9;
+ unsigned max_len = min(max_bytes, UINT_MAX) - *bytes;
unsigned len = min(bv->bv_len, max_len);
unsigned total_len = 0;
unsigned seg_size = 0;
while (len && *nsegs < max_segs) {
- seg_size = get_max_segment_size(q, bv->bv_page,
+ seg_size = get_max_segment_size(lim, bv->bv_page,
bv->bv_offset + total_len);
seg_size = min(seg_size, len);
@@ -212,27 +242,28 @@ static bool bvec_split_segs(const struct request_queue *q,
total_len += seg_size;
len -= seg_size;
- if ((bv->bv_offset + total_len) & queue_virt_boundary(q))
+ if ((bv->bv_offset + total_len) & lim->virt_boundary_mask)
break;
}
- *sectors += total_len >> 9;
+ *bytes += total_len;
/* tell the caller to split the bvec if it is too big to fit */
return len > 0 || bv->bv_len > max_len;
}
/**
- * blk_bio_segment_split - split a bio in two bios
- * @q: [in] request queue pointer
+ * bio_split_rw - split a bio in two bios
* @bio: [in] bio to be split
- * @bs: [in] bio set to allocate the clone from
+ * @lim: [in] queue limits to split based on
* @segs: [out] number of segments in the bio with the first half of the sectors
+ * @bs: [in] bio set to allocate the clone from
+ * @max_bytes: [in] maximum number of bytes per bio
*
* Clone @bio, update the bi_iter of the clone to represent the first sectors
* of @bio and update @bio->bi_iter to represent the remaining sectors. The
* following is guaranteed for the cloned bio:
- * - That it has at most get_max_io_size(@q, @bio) sectors.
+ * - That it has at most @max_bytes worth of data
* - That it has at most queue_max_segments(@q) segments.
*
* Except for discard requests the cloned bio will point at the bi_io_vec of
@@ -241,33 +272,30 @@ static bool bvec_split_segs(const struct request_queue *q,
* responsible for ensuring that @bs is only destroyed after processing of the
* split bio has finished.
*/
-static struct bio *blk_bio_segment_split(struct request_queue *q,
- struct bio *bio,
- struct bio_set *bs,
- unsigned *segs)
+struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim,
+ unsigned *segs, struct bio_set *bs, unsigned max_bytes)
{
struct bio_vec bv, bvprv, *bvprvp = NULL;
struct bvec_iter iter;
- unsigned nsegs = 0, sectors = 0;
- const unsigned max_sectors = get_max_io_size(q, bio);
- const unsigned max_segs = queue_max_segments(q);
+ unsigned nsegs = 0, bytes = 0;
bio_for_each_bvec(bv, bio, iter) {
/*
* If the queue doesn't support SG gaps and adding this
* offset would create a gap, disallow it.
*/
- if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset))
+ if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset))
goto split;
- if (nsegs < max_segs &&
- sectors + (bv.bv_len >> 9) <= max_sectors &&
+ if (nsegs < lim->max_segments &&
+ bytes + bv.bv_len <= max_bytes &&
bv.bv_offset + bv.bv_len <= PAGE_SIZE) {
nsegs++;
- sectors += bv.bv_len >> 9;
- } else if (bvec_split_segs(q, &bv, &nsegs, &sectors, max_segs,
- max_sectors)) {
- goto split;
+ bytes += bv.bv_len;
+ } else {
+ if (bvec_split_segs(lim, &bv, &nsegs, &bytes,
+ lim->max_segments, max_bytes))
+ goto split;
}
bvprv = bv;
@@ -277,95 +305,110 @@ static struct bio *blk_bio_segment_split(struct request_queue *q,
*segs = nsegs;
return NULL;
split:
+ /*
+ * We can't sanely support splitting for a REQ_NOWAIT bio. End it
+ * with EAGAIN if splitting is required and return an error pointer.
+ */
+ if (bio->bi_opf & REQ_NOWAIT) {
+ bio->bi_status = BLK_STS_AGAIN;
+ bio_endio(bio);
+ return ERR_PTR(-EAGAIN);
+ }
+
*segs = nsegs;
- return bio_split(bio, sectors, GFP_NOIO, bs);
+
+ /*
+ * Individual bvecs might not be logical block aligned. Round down the
+ * split size so that each bio is properly block size aligned, even if
+ * we do not use the full hardware limits.
+ */
+ bytes = ALIGN_DOWN(bytes, lim->logical_block_size);
+
+ /*
+ * Bio splitting may cause subtle trouble such as a hang when doing
+ * sync iopoll in the direct I/O routine. Given that the performance
+ * gain of iopoll for big I/O can be trivial, disable iopoll when a
+ * split is needed.
+ */
+ bio_clear_polled(bio);
+ return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs);
}
+EXPORT_SYMBOL_GPL(bio_split_rw);
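A minimal sketch of how a caller might use the new helper, mirroring the logic of __bio_split_to_limits() below; this is illustrative only and not part of the patch, with "lim", "bs" and "max_bytes" assumed to be set up by the caller:

	struct bio *split;
	unsigned int nsegs;

	split = bio_split_rw(bio, lim, &nsegs, bs, max_bytes);
	if (IS_ERR(split))
		return;			/* REQ_NOWAIT bio was already ended with -EAGAIN */
	if (split) {
		bio_chain(split, bio);
		submit_bio_noacct(bio);	/* re-queue the shortened remainder */
		bio = split;		/* process the first half */
	}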
/**
- * __blk_queue_split - split a bio and submit the second half
- * @q: [in] request queue pointer
- * @bio: [in, out] bio to be split
- * @nr_segs: [out] number of segments in the first bio
+ * __bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ * @lim: queue limits to split based on
+ * @nr_segs: returns the number of segments in the returned bio
*
- * Split a bio into two bios, chain the two bios, submit the second half and
- * store a pointer to the first half in *@bio. If the second bio is still too
- * big it will be split by a recursive call to this function. Since this
- * function may allocate a new bio from @q->bio_split, it is the responsibility
- * of the caller to ensure that @q is only released after processing of the
- * split bio has finished.
+ * Check if @bio needs splitting based on the queue limits, and if so split off
+ * a bio fitting the limits from the beginning of @bio and return it. @bio is
+ * shortened to the remainder and re-submitted.
+ *
+ * The split bio is allocated from the per-disk bio_split bio_set, which is
+ * provided by the block layer.
*/
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs)
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs)
{
- struct bio *split = NULL;
+ struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split;
+ struct bio *split;
- switch (bio_op(*bio)) {
+ switch (bio_op(bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
- split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_discard(bio, lim, nr_segs, bs);
break;
case REQ_OP_WRITE_ZEROES:
- split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split,
- nr_segs);
- break;
- case REQ_OP_WRITE_SAME:
- split = blk_bio_write_same_split(q, *bio, &q->bio_split,
- nr_segs);
+ split = bio_split_write_zeroes(bio, lim, nr_segs, bs);
break;
default:
- /*
- * All drivers must accept single-segments bios that are <=
- * PAGE_SIZE. This is a quick and dirty check that relies on
- * the fact that bi_io_vec[0] is always valid if a bio has data.
- * The check might lead to occasional false negatives when bios
- * are cloned, but compared to the performance impact of cloned
- * bios themselves the loop below doesn't matter anyway.
- */
- if (!q->limits.chunk_sectors &&
- (*bio)->bi_vcnt == 1 &&
- ((*bio)->bi_io_vec[0].bv_len +
- (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) {
- *nr_segs = 1;
- break;
- }
- split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs);
+ split = bio_split_rw(bio, lim, nr_segs, bs,
+ get_max_io_size(bio, lim) << SECTOR_SHIFT);
+ if (IS_ERR(split))
+ return NULL;
break;
}
if (split) {
- /* there isn't chance to merge the splitted bio */
+ /* there is no chance to merge the split bio */
split->bi_opf |= REQ_NOMERGE;
- bio_chain(split, *bio);
- trace_block_split(q, split, (*bio)->bi_iter.bi_sector);
- generic_make_request(*bio);
- *bio = split;
+ blkcg_bio_issue_init(split);
+ bio_chain(split, bio);
+ trace_block_split(split, bio->bi_iter.bi_sector);
+ submit_bio_noacct(bio);
+ return split;
}
+ return bio;
}
/**
- * blk_queue_split - split a bio and submit the second half
- * @q: [in] request queue pointer
- * @bio: [in, out] bio to be split
+ * bio_split_to_limits - split a bio to fit the queue limits
+ * @bio: bio to be split
+ *
+ * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and
+ * if so split off a bio fitting the limits from the beginning of @bio and
+ * return it. @bio is shortened to the remainder and re-submitted.
*
- * Split a bio into two bios, chains the two bios, submit the second half and
- * store a pointer to the first half in *@bio. Since this function may allocate
- * a new bio from @q->bio_split, it is the responsibility of the caller to
- * ensure that @q is only released after processing of the split bio has
- * finished.
+ * The split bio is allocated from the per-disk bio_split bio_set, which is
+ * provided by the block layer.
*/
-void blk_queue_split(struct request_queue *q, struct bio **bio)
+struct bio *bio_split_to_limits(struct bio *bio)
{
+ const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits;
unsigned int nr_segs;
- __blk_queue_split(q, bio, &nr_segs);
+ if (bio_may_exceed_limits(bio, lim))
+ return __bio_split_to_limits(bio, lim, &nr_segs);
+ return bio;
}
-EXPORT_SYMBOL(blk_queue_split);
+EXPORT_SYMBOL(bio_split_to_limits);
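For bio-based drivers the conversion from the old interface is mechanical; a hedged sketch of a ->submit_bio handler under the new interface (the "example" names are placeholders, not from this patch):

static void example_submit_bio(struct bio *bio)
{
	/*
	 * Split off anything that exceeds the queue limits; the remainder
	 * is re-submitted by the block layer. NULL means the bio was
	 * already ended (e.g. a REQ_NOWAIT split failure).
	 */
	bio = bio_split_to_limits(bio);
	if (!bio)
		return;

	/* ... handle the (possibly shortened) bio ... */
}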
unsigned int blk_recalc_rq_segments(struct request *rq)
{
unsigned int nr_phys_segs = 0;
- unsigned int nr_sectors = 0;
+ unsigned int bytes = 0;
struct req_iterator iter;
struct bio_vec bv;
@@ -375,14 +418,22 @@ unsigned int blk_recalc_rq_segments(struct request *rq)
switch (bio_op(rq->bio)) {
case REQ_OP_DISCARD:
case REQ_OP_SECURE_ERASE:
+ if (queue_max_discard_segments(rq->q) > 1) {
+ struct bio *bio = rq->bio;
+
+ for_each_bio(bio)
+ nr_phys_segs++;
+ return nr_phys_segs;
+ }
+ return 1;
case REQ_OP_WRITE_ZEROES:
return 0;
- case REQ_OP_WRITE_SAME:
- return 1;
+ default:
+ break;
}
rq_for_each_bvec(bv, rq, iter)
- bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors,
+ bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes,
UINT_MAX, UINT_MAX);
return nr_phys_segs;
}
@@ -413,8 +464,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q,
while (nbytes > 0) {
unsigned offset = bvec->bv_offset + total;
- unsigned len = min(get_max_segment_size(q, bvec->bv_page,
- offset), nbytes);
+ unsigned len = min(get_max_segment_size(&q->limits,
+ bvec->bv_page, offset), nbytes);
struct page *page = bvec->bv_page;
/*
@@ -473,7 +524,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio,
struct scatterlist *sglist,
struct scatterlist **sg)
{
- struct bio_vec uninitialized_var(bvec), bvprv = { NULL };
+ struct bio_vec bvec, bvprv = { NULL };
struct bvec_iter iter;
int nsegs = 0;
bool new_bio = false;
@@ -516,8 +567,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
if (rq->rq_flags & RQF_SPECIAL_PAYLOAD)
nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg);
- else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME)
- nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg);
else if (rq->bio)
nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg);
@@ -534,15 +583,40 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq,
}
EXPORT_SYMBOL(__blk_rq_map_sg);
+static inline unsigned int blk_rq_get_max_sectors(struct request *rq,
+ sector_t offset)
+{
+ struct request_queue *q = rq->q;
+ unsigned int max_sectors;
+
+ if (blk_rq_is_passthrough(rq))
+ return q->limits.max_hw_sectors;
+
+ max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ if (!q->limits.chunk_sectors ||
+ req_op(rq) == REQ_OP_DISCARD ||
+ req_op(rq) == REQ_OP_SECURE_ERASE)
+ return max_sectors;
+ return min(max_sectors,
+ blk_chunk_sectors_left(offset, q->limits.chunk_sectors));
+}
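A short worked example of the chunk clamping above, assuming blk_chunk_sectors_left() returns the number of sectors left before the next chunk boundary for a power-of-two chunk size:

/*
 * chunk_sectors = 256, offset = 300:
 *   offset within chunk   = 300 & (256 - 1) = 44
 *   sectors left in chunk = 256 - 44        = 212
 *
 * The request is therefore capped at min(max_sectors, 212) so it never
 * crosses the chunk boundary.
 */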
+
static inline int ll_new_hw_segment(struct request *req, struct bio *bio,
unsigned int nr_phys_segs)
{
- if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q))
+ if (!blk_cgroup_mergeable(req, bio))
goto no_merge;
if (blk_integrity_merge_bio(req->q, req, bio) == false)
goto no_merge;
+ /* a discard request merge won't add a new segment */
+ if (req_op(req) == REQ_OP_DISCARD)
+ return 1;
+
+ if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req))
+ goto no_merge;
+
/*
* This will form the start of a new hw segment. Bump both
* counters.
@@ -573,7 +647,8 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
return ll_new_hw_segment(req, bio, nr_segs);
}
-int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs)
+static int ll_front_merge_fn(struct request *req, struct bio *bio,
+ unsigned int nr_segs)
{
if (req_gap_front_merge(req, bio))
return 0;
@@ -625,7 +700,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
return 0;
total_phys_segments = req->nr_phys_segments + next->nr_phys_segments;
- if (total_phys_segments > queue_max_segments(q))
+ if (total_phys_segments > blk_rq_get_max_segments(req))
+ return 0;
+
+ if (!blk_cgroup_mergeable(req, next->bio))
return 0;
if (blk_integrity_merge_rq(q, req, next) == false)
@@ -650,7 +728,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req,
*/
void blk_rq_set_mixed_merge(struct request *rq)
{
- unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK;
+ blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK;
struct bio *bio;
if (rq->rq_flags & RQF_MIXED_MERGE)
@@ -669,31 +747,40 @@ void blk_rq_set_mixed_merge(struct request *rq)
rq->rq_flags |= RQF_MIXED_MERGE;
}
-static void blk_account_io_merge_request(struct request *req)
+static inline blk_opf_t bio_failfast(const struct bio *bio)
{
- if (blk_do_io_stat(req)) {
- part_stat_lock();
- part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
- part_stat_unlock();
+ if (bio->bi_opf & REQ_RAHEAD)
+ return REQ_FAILFAST_MASK;
- hd_struct_put(req->part);
- }
+ return bio->bi_opf & REQ_FAILFAST_MASK;
}
/*
- * Two cases of handling DISCARD merge:
- * If max_discard_segments > 1, the driver takes every bio
- * as a range and send them to controller together. The ranges
- * needn't to be contiguous.
- * Otherwise, the bios/requests will be handled as same as
- * others which should be contiguous.
+ * After the request is marked as MIXED_MERGE, any new read-ahead (RA) bio
+ * has to be marked as failfast, and the request's failfast flags have to
+ * be updated in case of a front merge.
*/
-static inline bool blk_discard_mergable(struct request *req)
+static inline void blk_update_mixed_merge(struct request *req,
+ struct bio *bio, bool front_merge)
{
- if (req_op(req) == REQ_OP_DISCARD &&
- queue_max_discard_segments(req->q) > 1)
- return true;
- return false;
+ if (req->rq_flags & RQF_MIXED_MERGE) {
+ if (bio->bi_opf & REQ_RAHEAD)
+ bio->bi_opf |= REQ_FAILFAST_MASK;
+
+ if (front_merge) {
+ req->cmd_flags &= ~REQ_FAILFAST_MASK;
+ req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK;
+ }
+ }
+}
+
+static void blk_account_io_merge_request(struct request *req)
+{
+ if (blk_do_io_stat(req)) {
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+ }
}
static enum elv_merge blk_try_req_merge(struct request *req,
@@ -720,19 +807,7 @@ static struct request *attempt_merge(struct request_queue *q,
if (req_op(req) != req_op(next))
return NULL;
- if (rq_data_dir(req) != rq_data_dir(next)
- || req->rq_disk != next->rq_disk)
- return NULL;
-
- if (req_op(req) == REQ_OP_WRITE_SAME &&
- !blk_write_same_mergeable(req->bio, next->bio))
- return NULL;
-
- /*
- * Don't allow merge of different write hints, or for a hint with
- * non-hint IO.
- */
- if (req->write_hint != next->write_hint)
+ if (rq_data_dir(req) != rq_data_dir(next))
return NULL;
if (req->ioprio != next->ioprio)
@@ -788,11 +863,15 @@ static struct request *attempt_merge(struct request_queue *q,
if (!blk_discard_mergable(req))
elv_merge_requests(q, req, next);
+ blk_crypto_rq_put_keyslot(next);
+
/*
* 'next' is going away, so update stats accordingly
*/
blk_account_io_merge_request(next);
+ trace_block_rq_merge(next);
+
/*
* ownership of bio passed from next to req, return 'next' for
* the caller to free
@@ -801,7 +880,8 @@ static struct request *attempt_merge(struct request_queue *q,
return next;
}
-struct request *attempt_back_merge(struct request_queue *q, struct request *rq)
+static struct request *attempt_back_merge(struct request_queue *q,
+ struct request *rq)
{
struct request *next = elv_latter_request(q, rq);
@@ -811,7 +891,8 @@ struct request *attempt_back_merge(struct request_queue *q, struct request *rq)
return NULL;
}
-struct request *attempt_front_merge(struct request_queue *q, struct request *rq)
+static struct request *attempt_front_merge(struct request_queue *q,
+ struct request *rq)
{
struct request *prev = elv_former_request(q, rq);
@@ -821,18 +902,15 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq)
return NULL;
}
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
- struct request *next)
+/*
+ * Try to merge 'next' into 'rq'. Return true if the merge happened, false
+ * otherwise. The caller is responsible for freeing 'next' if the merge
+ * happened.
+ */
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+ struct request *next)
{
- struct request *free;
-
- free = attempt_merge(q, rq, next);
- if (free) {
- blk_put_request(free);
- return 1;
- }
-
- return 0;
+ return attempt_merge(q, rq, next);
}
bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
@@ -847,8 +925,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (bio_data_dir(bio) != rq_data_dir(rq))
return false;
- /* must be same device */
- if (rq->rq_disk != bio->bi_disk)
+ /* don't merge across cgroup boundaries */
+ if (!blk_cgroup_mergeable(rq, bio))
return false;
/* only merge integrity protected bio into ditto rq */
@@ -859,18 +937,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio)
if (!bio_crypt_rq_ctx_compatible(rq, bio))
return false;
- /* must be using the same buffer */
- if (req_op(rq) == REQ_OP_WRITE_SAME &&
- !blk_write_same_mergeable(rq->bio, bio))
- return false;
-
- /*
- * Don't allow merge of different write hints, or for a hint with
- * non-hint IO.
- */
- if (rq->write_hint != bio->bi_write_hint)
- return false;
-
if (rq->ioprio != bio_prio(bio))
return false;
@@ -887,3 +953,234 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio)
return ELEVATOR_FRONT_MERGE;
return ELEVATOR_NO_MERGE;
}
+
+static void blk_account_io_merge_bio(struct request *req)
+{
+ if (!blk_do_io_stat(req))
+ return;
+
+ part_stat_lock();
+ part_stat_inc(req->part, merges[op_stat_group(req_op(req))]);
+ part_stat_unlock();
+}
+
+enum bio_merge_status {
+ BIO_MERGE_OK,
+ BIO_MERGE_NONE,
+ BIO_MERGE_FAILED,
+};
+
+static enum bio_merge_status bio_attempt_back_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs)
+{
+ const blk_opf_t ff = bio_failfast(bio);
+
+ if (!ll_back_merge_fn(req, bio, nr_segs))
+ return BIO_MERGE_FAILED;
+
+ trace_block_bio_backmerge(bio);
+ rq_qos_merge(req->q, req, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ blk_update_mixed_merge(req, bio, false);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_iter.bi_size;
+
+ bio_crypt_free_ctx(bio);
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+}
+
+static enum bio_merge_status bio_attempt_front_merge(struct request *req,
+ struct bio *bio, unsigned int nr_segs)
+{
+ const blk_opf_t ff = bio_failfast(bio);
+
+ if (!ll_front_merge_fn(req, bio, nr_segs))
+ return BIO_MERGE_FAILED;
+
+ trace_block_bio_frontmerge(bio);
+ rq_qos_merge(req->q, req, bio);
+
+ if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff)
+ blk_rq_set_mixed_merge(req);
+
+ blk_update_mixed_merge(req, bio, true);
+
+ bio->bi_next = req->bio;
+ req->bio = bio;
+
+ req->__sector = bio->bi_iter.bi_sector;
+ req->__data_len += bio->bi_iter.bi_size;
+
+ bio_crypt_do_front_merge(req, bio);
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+}
+
+static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q,
+ struct request *req, struct bio *bio)
+{
+ unsigned short segments = blk_rq_nr_discard_segments(req);
+
+ if (segments >= queue_max_discard_segments(q))
+ goto no_merge;
+ if (blk_rq_sectors(req) + bio_sectors(bio) >
+ blk_rq_get_max_sectors(req, blk_rq_pos(req)))
+ goto no_merge;
+
+ rq_qos_merge(q, req, bio);
+
+ req->biotail->bi_next = bio;
+ req->biotail = bio;
+ req->__data_len += bio->bi_iter.bi_size;
+ req->nr_phys_segments = segments + 1;
+
+ blk_account_io_merge_bio(req);
+ return BIO_MERGE_OK;
+no_merge:
+ req_set_nomerge(q, req);
+ return BIO_MERGE_FAILED;
+}
+
+static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q,
+ struct request *rq,
+ struct bio *bio,
+ unsigned int nr_segs,
+ bool sched_allow_merge)
+{
+ if (!blk_rq_merge_ok(rq, bio))
+ return BIO_MERGE_NONE;
+
+ switch (blk_try_merge(rq, bio)) {
+ case ELEVATOR_BACK_MERGE:
+ if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
+ return bio_attempt_back_merge(rq, bio, nr_segs);
+ break;
+ case ELEVATOR_FRONT_MERGE:
+ if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio))
+ return bio_attempt_front_merge(rq, bio, nr_segs);
+ break;
+ case ELEVATOR_DISCARD_MERGE:
+ return bio_attempt_discard_merge(q, rq, bio);
+ default:
+ return BIO_MERGE_NONE;
+ }
+
+ return BIO_MERGE_FAILED;
+}
+
+/**
+ * blk_attempt_plug_merge - try to merge with %current's plugged list
+ * @q: request_queue new bio is being queued at
+ * @bio: new bio being queued
+ * @nr_segs: number of segments in @bio
+ *
+ * Determine whether @bio being queued on @q can be merged with the previous
+ * request on %current's plugged list. Returns %true if the merge was
+ * successful, otherwise %false.
+ *
+ * Plugging coalesces I/Os from the same issuer for the same purpose without
+ * going through @q->queue_lock. As such it's more of an issuing mechanism
+ * than scheduling, and the request, while it may have elvpriv data, is not
+ * added to the elevator at this point. In addition, we don't have
+ * reliable access to the elevator outside the queue lock. Only check basic
+ * merging parameters without querying the elevator.
+ *
+ * Caller must ensure !blk_queue_nomerges(q) beforehand.
+ */
+bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs)
+{
+ struct blk_plug *plug;
+ struct request *rq;
+
+ plug = blk_mq_plug(bio);
+ if (!plug || rq_list_empty(plug->mq_list))
+ return false;
+
+ rq_list_for_each(&plug->mq_list, rq) {
+ if (rq->q == q) {
+ if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) ==
+ BIO_MERGE_OK)
+ return true;
+ break;
+ }
+
+ /*
+ * Only keep iterating the plug list for merges if we have multiple
+ * queues.
+ */
+ if (!plug->multiple_queues)
+ break;
+ }
+ return false;
+}
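The kernel-doc above describes plugging purely as an issuing mechanism; a hedged illustration of a submitter batching bios under a plug so that blk_attempt_plug_merge() gets a chance to merge them ("bios" and "nr_bios" are placeholders, not from this patch):

	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr_bios; i++)
		submit_bio(bios[i]);	/* merge candidates sit on plug->mq_list */
	blk_finish_plug(&plug);		/* flushes the plugged requests */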
+
+/*
+ * Iterate list of requests and see if we can merge this bio with any
+ * of them.
+ */
+bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio, unsigned int nr_segs)
+{
+ struct request *rq;
+ int checked = 8;
+
+ list_for_each_entry_reverse(rq, list, queuelist) {
+ if (!checked--)
+ break;
+
+ switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) {
+ case BIO_MERGE_NONE:
+ continue;
+ case BIO_MERGE_OK:
+ return true;
+ case BIO_MERGE_FAILED:
+ return false;
+ }
+
+ }
+
+ return false;
+}
+EXPORT_SYMBOL_GPL(blk_bio_list_merge);
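The helper is exported for I/O schedulers; a hypothetical ->bio_merge hook using it against a scheduler-private list (the "example_sched_data" structure and its lock are assumptions, loosely modelled on how an elevator might use the helper):

static bool example_bio_merge(struct request_queue *q, struct bio *bio,
			      unsigned int nr_segs)
{
	struct example_sched_data *sd = q->elevator->elevator_data;
	bool merged;

	spin_lock(&sd->lock);
	merged = blk_bio_list_merge(q, &sd->rq_list, bio, nr_segs);
	spin_unlock(&sd->lock);

	return merged;
}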
+
+bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
+ unsigned int nr_segs, struct request **merged_request)
+{
+ struct request *rq;
+
+ switch (elv_merge(q, &rq, bio)) {
+ case ELEVATOR_BACK_MERGE:
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ return false;
+ if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
+ return false;
+ *merged_request = attempt_back_merge(q, rq);
+ if (!*merged_request)
+ elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
+ return true;
+ case ELEVATOR_FRONT_MERGE:
+ if (!blk_mq_sched_allow_merge(q, rq, bio))
+ return false;
+ if (bio_attempt_front_merge(rq, bio, nr_segs) != BIO_MERGE_OK)
+ return false;
+ *merged_request = attempt_front_merge(q, rq);
+ if (!*merged_request)
+ elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
+ return true;
+ case ELEVATOR_DISCARD_MERGE:
+ return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK;
+ default:
+ return false;
+ }
+}
+EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c
index 0157f2b3485a..9638b25fd521 100644
--- a/block/blk-mq-cpumap.c
+++ b/block/blk-mq-cpumap.c
@@ -10,68 +10,28 @@
#include <linux/mm.h>
#include <linux/smp.h>
#include <linux/cpu.h>
+#include <linux/group_cpus.h>
-#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
-static int queue_index(struct blk_mq_queue_map *qmap,
- unsigned int nr_queues, const int q)
+void blk_mq_map_queues(struct blk_mq_queue_map *qmap)
{
- return qmap->queue_offset + (q % nr_queues);
-}
-
-static int get_first_sibling(unsigned int cpu)
-{
- unsigned int ret;
-
- ret = cpumask_first(topology_sibling_cpumask(cpu));
- if (ret < nr_cpu_ids)
- return ret;
-
- return cpu;
-}
-
-int blk_mq_map_queues(struct blk_mq_queue_map *qmap)
-{
- unsigned int *map = qmap->mq_map;
- unsigned int nr_queues = qmap->nr_queues;
- unsigned int cpu, first_sibling, q = 0;
-
- for_each_possible_cpu(cpu)
- map[cpu] = -1;
-
- /*
- * Spread queues among present CPUs first for minimizing
- * count of dead queues which are mapped by all un-present CPUs
- */
- for_each_present_cpu(cpu) {
- if (q >= nr_queues)
- break;
- map[cpu] = queue_index(qmap, nr_queues, q++);
+ const struct cpumask *masks;
+ unsigned int queue, cpu;
+
+ masks = group_cpus_evenly(qmap->nr_queues);
+ if (!masks) {
+ for_each_possible_cpu(cpu)
+ qmap->mq_map[cpu] = qmap->queue_offset;
+ return;
}
- for_each_possible_cpu(cpu) {
- if (map[cpu] != -1)
- continue;
- /*
- * First do sequential mapping between CPUs and queues.
- * In case we still have CPUs to map, and we have some number of
- * threads per cores then map sibling threads to the same queue
- * for performance optimizations.
- */
- if (q < nr_queues) {
- map[cpu] = queue_index(qmap, nr_queues, q++);
- } else {
- first_sibling = get_first_sibling(cpu);
- if (first_sibling == cpu)
- map[cpu] = queue_index(qmap, nr_queues, q++);
- else
- map[cpu] = map[first_sibling];
- }
+ for (queue = 0; queue < qmap->nr_queues; queue++) {
+ for_each_cpu(cpu, &masks[queue])
+ qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
-
- return 0;
+ kfree(masks);
}
EXPORT_SYMBOL_GPL(blk_mq_map_queues);
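A hedged sketch of a driver's ->map_queues callback relying on the generic spreading above (the "example" names are placeholders; the void-returning callback convention matches the rest of this series):

static void example_map_queues(struct blk_mq_tag_set *set)
{
	/* No device-specific affinity: spread queues evenly over the CPUs. */
	blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}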
@@ -89,7 +49,7 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index)
for_each_possible_cpu(i) {
if (index == qmap->mq_map[i])
- return local_memory_node(cpu_to_node(i));
+ return cpu_to_node(i);
}
return NUMA_NO_NODE;
diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c
index 038cb627c868..a77b099c34b7 100644
--- a/block/blk-mq-debugfs-zoned.c
+++ b/block/blk-mq-debugfs-zoned.c
@@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m)
struct request_queue *q = data;
unsigned int i;
- if (!q->seq_zones_wlock)
+ if (!q->disk->seq_zones_wlock)
return 0;
- for (i = 0; i < q->nr_zones; i++)
- if (test_bit(i, q->seq_zones_wlock))
+ for (i = 0; i < q->disk->nr_zones; i++)
+ if (test_bit(i, q->disk->seq_zones_wlock))
seq_printf(m, "%u\n", i);
return 0;
diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c
index e0b2bc131bf5..94668e72ab09 100644
--- a/block/blk-mq-debugfs.c
+++ b/block/blk-mq-debugfs.c
@@ -7,37 +7,14 @@
#include <linux/blkdev.h>
#include <linux/debugfs.h>
-#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
-#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-static void print_stat(struct seq_file *m, struct blk_rq_stat *stat)
-{
- if (stat->nr_samples) {
- seq_printf(m, "samples=%d, mean=%llu, min=%llu, max=%llu",
- stat->nr_samples, stat->mean, stat->min, stat->max);
- } else {
- seq_puts(m, "samples=0");
- }
-}
-
static int queue_poll_stat_show(void *data, struct seq_file *m)
{
- struct request_queue *q = data;
- int bucket;
-
- for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) {
- seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket));
- print_stat(m, &q->poll_stat[2 * bucket]);
- seq_puts(m, "\n");
-
- seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket));
- print_stat(m, &q->poll_stat[2 * bucket + 1]);
- seq_puts(m, "\n");
- }
return 0;
}
@@ -109,25 +86,26 @@ static const char *const blk_queue_flag_name[] = {
QUEUE_FLAG_NAME(FAIL_IO),
QUEUE_FLAG_NAME(NONROT),
QUEUE_FLAG_NAME(IO_STAT),
- QUEUE_FLAG_NAME(DISCARD),
QUEUE_FLAG_NAME(NOXMERGES),
QUEUE_FLAG_NAME(ADD_RANDOM),
- QUEUE_FLAG_NAME(SECERASE),
+ QUEUE_FLAG_NAME(SYNCHRONOUS),
QUEUE_FLAG_NAME(SAME_FORCE),
- QUEUE_FLAG_NAME(DEAD),
QUEUE_FLAG_NAME(INIT_DONE),
+ QUEUE_FLAG_NAME(STABLE_WRITES),
QUEUE_FLAG_NAME(POLL),
QUEUE_FLAG_NAME(WC),
QUEUE_FLAG_NAME(FUA),
QUEUE_FLAG_NAME(DAX),
QUEUE_FLAG_NAME(STATS),
- QUEUE_FLAG_NAME(POLL_STATS),
QUEUE_FLAG_NAME(REGISTERED),
- QUEUE_FLAG_NAME(SCSI_PASSTHROUGH),
QUEUE_FLAG_NAME(QUIESCED),
QUEUE_FLAG_NAME(PCI_P2PDMA),
QUEUE_FLAG_NAME(ZONE_RESETALL),
QUEUE_FLAG_NAME(RQ_ALLOC_TIME),
+ QUEUE_FLAG_NAME(HCTX_ACTIVE),
+ QUEUE_FLAG_NAME(NOWAIT),
+ QUEUE_FLAG_NAME(SQ_SCHED),
+ QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE),
};
#undef QUEUE_FLAG_NAME
@@ -148,11 +126,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf,
char opbuf[16] = { }, *op;
/*
- * The "state" attribute is removed after blk_cleanup_queue() has called
- * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid
- * triggering a use-after-free.
+ * The "state" attribute is removed when the queue is removed. Don't
+ * allow setting the state on a dying queue to avoid a use-after-free.
*/
- if (blk_queue_dead(q))
+ if (blk_queue_dying(q))
return -ENOENT;
if (count >= sizeof(opbuf)) {
@@ -178,35 +155,11 @@ inval:
return count;
}
-static int queue_write_hint_show(void *data, struct seq_file *m)
-{
- struct request_queue *q = data;
- int i;
-
- for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
- seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]);
-
- return 0;
-}
-
-static ssize_t queue_write_hint_store(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct request_queue *q = data;
- int i;
-
- for (i = 0; i < BLK_MAX_WRITE_HINTS; i++)
- q->write_hints[i] = 0;
-
- return count;
-}
-
static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = {
{ "poll_stat", 0400, queue_poll_stat_show },
{ "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops },
{ "pm_only", 0600, queue_pm_only_show, NULL },
{ "state", 0600, queue_state_show, queue_state_write },
- { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store },
{ "zone_wlock", 0400, queue_zone_wlock_show, NULL },
{ },
};
@@ -240,10 +193,11 @@ static const char *const alloc_policy_name[] = {
#define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name
static const char *const hctx_flag_name[] = {
HCTX_FLAG_NAME(SHOULD_MERGE),
- HCTX_FLAG_NAME(TAG_SHARED),
+ HCTX_FLAG_NAME(TAG_QUEUE_SHARED),
HCTX_FLAG_NAME(BLOCKING),
HCTX_FLAG_NAME(NO_SCHED),
HCTX_FLAG_NAME(STACKING),
+ HCTX_FLAG_NAME(TAG_HCTX_SHARED),
};
#undef HCTX_FLAG_NAME
@@ -283,31 +237,28 @@ static const char *const cmd_flag_name[] = {
CMD_FLAG_NAME(BACKGROUND),
CMD_FLAG_NAME(NOWAIT),
CMD_FLAG_NAME(NOUNMAP),
- CMD_FLAG_NAME(HIPRI),
+ CMD_FLAG_NAME(POLLED),
};
#undef CMD_FLAG_NAME
#define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name
static const char *const rqf_name[] = {
- RQF_NAME(SORTED),
RQF_NAME(STARTED),
- RQF_NAME(SOFTBARRIER),
RQF_NAME(FLUSH_SEQ),
RQF_NAME(MIXED_MERGE),
- RQF_NAME(MQ_INFLIGHT),
RQF_NAME(DONTPREP),
- RQF_NAME(PREEMPT),
+ RQF_NAME(SCHED_TAGS),
+ RQF_NAME(USE_SCHED),
RQF_NAME(FAILED),
RQF_NAME(QUIET),
- RQF_NAME(ELVPRIV),
RQF_NAME(IO_STAT),
- RQF_NAME(ALLOCED),
RQF_NAME(PM),
RQF_NAME(HASHED),
RQF_NAME(STATS),
RQF_NAME(SPECIAL_PAYLOAD),
RQF_NAME(ZONE_WRITE_LOCKED),
- RQF_NAME(MQ_POLL_SLEPT),
+ RQF_NAME(TIMED_OUT),
+ RQF_NAME(RESV),
};
#undef RQF_NAME
@@ -328,7 +279,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state)
int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
{
const struct blk_mq_ops *const mq_ops = rq->q->mq_ops;
- const unsigned int op = req_op(rq);
+ const enum req_op op = req_op(rq);
const char *op_str = blk_op_str(op);
seq_printf(m, "%p {.op=", rq);
@@ -337,8 +288,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq)
else
seq_printf(m, "%s", op_str);
seq_puts(m, ", .cmd_flags=");
- blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name,
- ARRAY_SIZE(cmd_flag_name));
+ blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK),
+ cmd_flag_name, ARRAY_SIZE(cmd_flag_name));
seq_puts(m, ", .rq_flags=");
blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name,
ARRAY_SIZE(rqf_name));
@@ -399,13 +350,12 @@ struct show_busy_params {
* e.g. due to a concurrent blk_mq_finish_request() call. Returns true to
* keep iterating requests.
*/
-static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved)
+static bool hctx_show_busy_rq(struct request *rq, void *data)
{
const struct show_busy_params *params = data;
if (rq->mq_hctx == params->hctx)
- __blk_mq_debugfs_rq_show(params->m,
- list_entry_rq(&rq->queuelist));
+ __blk_mq_debugfs_rq_show(params->m, rq);
return true;
}
@@ -450,7 +400,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m,
seq_printf(m, "nr_tags=%u\n", tags->nr_tags);
seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags);
seq_printf(m, "active_queues=%d\n",
- atomic_read(&tags->active_queues));
+ READ_ONCE(tags->active_queues));
seq_puts(m, "\nbitmap_tags:\n");
sbitmap_queue_show(&tags->bitmap_tags, m);
@@ -529,92 +479,11 @@ out:
return res;
}
-static int hctx_io_poll_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "considered=%lu\n", hctx->poll_considered);
- seq_printf(m, "invoked=%lu\n", hctx->poll_invoked);
- seq_printf(m, "success=%lu\n", hctx->poll_success);
- return 0;
-}
-
-static ssize_t hctx_io_poll_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0;
- return count;
-}
-
-static int hctx_dispatched_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
- int i;
-
- seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]);
-
- for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) {
- unsigned int d = 1U << (i - 1);
-
- seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]);
- }
-
- seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]);
- return 0;
-}
-
-static ssize_t hctx_dispatched_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
- int i;
-
- for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++)
- hctx->dispatched[i] = 0;
- return count;
-}
-
-static int hctx_queued_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->queued);
- return 0;
-}
-
-static ssize_t hctx_queued_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->queued = 0;
- return count;
-}
-
-static int hctx_run_show(void *data, struct seq_file *m)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- seq_printf(m, "%lu\n", hctx->run);
- return 0;
-}
-
-static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count,
- loff_t *ppos)
-{
- struct blk_mq_hw_ctx *hctx = data;
-
- hctx->run = 0;
- return count;
-}
-
static int hctx_active_show(void *data, struct seq_file *m)
{
struct blk_mq_hw_ctx *hctx = data;
- seq_printf(m, "%d\n", atomic_read(&hctx->nr_active));
+ seq_printf(m, "%d\n", __blk_mq_active_requests(hctx));
return 0;
}
@@ -663,57 +532,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT);
CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ);
CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL);
-static int ctx_dispatched_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]);
- return 0;
-}
-
-static ssize_t ctx_dispatched_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0;
- return count;
-}
-
-static int ctx_merged_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu\n", ctx->rq_merged);
- return 0;
-}
-
-static ssize_t ctx_merged_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_merged = 0;
- return count;
-}
-
-static int ctx_completed_show(void *data, struct seq_file *m)
-{
- struct blk_mq_ctx *ctx = data;
-
- seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]);
- return 0;
-}
-
-static ssize_t ctx_completed_write(void *data, const char __user *buf,
- size_t count, loff_t *ppos)
-{
- struct blk_mq_ctx *ctx = data;
-
- ctx->rq_completed[0] = ctx->rq_completed[1] = 0;
- return count;
-}
-
static int blk_mq_debugfs_show(struct seq_file *m, void *v)
{
const struct blk_mq_debugfs_attr *attr = m->private;
@@ -789,10 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = {
{"tags_bitmap", 0400, hctx_tags_bitmap_show},
{"sched_tags", 0400, hctx_sched_tags_show},
{"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show},
- {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write},
- {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write},
- {"queued", 0600, hctx_queued_show, hctx_queued_write},
- {"run", 0600, hctx_run_show, hctx_run_write},
{"active", 0400, hctx_active_show},
{"dispatch_busy", 0400, hctx_dispatch_busy_show},
{"type", 0400, hctx_type_show},
@@ -803,9 +617,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = {
{"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops},
{"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops},
{"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops},
- {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write},
- {"merged", 0600, ctx_merged_show, ctx_merged_write},
- {"completed", 0600, ctx_completed_show, ctx_completed_write},
{},
};
@@ -825,10 +636,7 @@ static void debugfs_create_files(struct dentry *parent, void *data,
void blk_mq_debugfs_register(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
-
- q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent),
- blk_debugfs_root);
+ unsigned long i;
debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs);
@@ -858,13 +666,6 @@ void blk_mq_debugfs_register(struct request_queue *q)
}
}
-void blk_mq_debugfs_unregister(struct request_queue *q)
-{
- debugfs_remove_recursive(q->debugfs_dir);
- q->sched_debugfs_dir = NULL;
- q->debugfs_dir = NULL;
-}
-
static void blk_mq_debugfs_register_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *ctx)
{
@@ -884,6 +685,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
char name[20];
int i;
+ if (!q->debugfs_dir)
+ return;
+
snprintf(name, sizeof(name), "hctx%u", hctx->queue_num);
hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir);
@@ -895,6 +699,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
{
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->debugfs_dir);
hctx->sched_debugfs_dir = NULL;
hctx->debugfs_dir = NULL;
@@ -903,7 +709,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx)
void blk_mq_debugfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_register_hctx(q, hctx);
@@ -912,7 +718,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q)
void blk_mq_debugfs_unregister_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_debugfs_unregister_hctx(hctx);
@@ -922,6 +728,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
/*
* If the parent directory has not been created yet, return, we will be
* called again later on and the directory/files will be created then.
@@ -939,21 +747,42 @@ void blk_mq_debugfs_register_sched(struct request_queue *q)
void blk_mq_debugfs_unregister_sched(struct request_queue *q)
{
+ lockdep_assert_held(&q->debugfs_mutex);
+
debugfs_remove_recursive(q->sched_debugfs_dir);
q->sched_debugfs_dir = NULL;
}
+static const char *rq_qos_id_to_name(enum rq_qos_id id)
+{
+ switch (id) {
+ case RQ_QOS_WBT:
+ return "wbt";
+ case RQ_QOS_LATENCY:
+ return "latency";
+ case RQ_QOS_COST:
+ return "cost";
+ }
+ return "unknown";
+}
+
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
+ lockdep_assert_held(&rqos->disk->queue->debugfs_mutex);
+
+ if (!rqos->disk->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(rqos->debugfs_dir);
rqos->debugfs_dir = NULL;
}
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
{
- struct request_queue *q = rqos->q;
+ struct request_queue *q = rqos->disk->queue;
const char *dir_name = rq_qos_id_to_name(rqos->id);
+ lockdep_assert_held(&q->debugfs_mutex);
+
if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs)
return;
@@ -961,23 +790,25 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
q->rqos_debugfs_dir = debugfs_create_dir("rqos",
q->debugfs_dir);
- rqos->debugfs_dir = debugfs_create_dir(dir_name,
- rqos->q->rqos_debugfs_dir);
-
+ rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir);
debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs);
}
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
- debugfs_remove_recursive(q->rqos_debugfs_dir);
- q->rqos_debugfs_dir = NULL;
-}
-
void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
struct elevator_type *e = q->elevator->type;
+ lockdep_assert_held(&q->debugfs_mutex);
+
+ /*
+ * If the parent debugfs directory has not been created yet, return;
+ * we will be called again later on with the appropriate parent debugfs
+ * directory from blk_register_queue().
+ */
+ if (!hctx->debugfs_dir)
+ return;
+
if (!e->hctx_debugfs_attrs)
return;
@@ -989,6 +820,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q,
void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx)
{
+ lockdep_assert_held(&hctx->queue->debugfs_mutex);
+
+ if (!hctx->queue->debugfs_dir)
+ return;
debugfs_remove_recursive(hctx->sched_debugfs_dir);
hctx->sched_debugfs_dir = NULL;
}
diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h
index a68aa6041a10..9c7d4b6117d4 100644
--- a/block/blk-mq-debugfs.h
+++ b/block/blk-mq-debugfs.h
@@ -6,6 +6,8 @@
#include <linux/seq_file.h>
+struct blk_mq_hw_ctx;
+
struct blk_mq_debugfs_attr {
const char *name;
umode_t mode;
@@ -19,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq);
int blk_mq_debugfs_rq_show(struct seq_file *m, void *v);
void blk_mq_debugfs_register(struct request_queue *q);
-void blk_mq_debugfs_unregister(struct request_queue *q);
void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx);
@@ -34,16 +35,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx);
void blk_mq_debugfs_register_rqos(struct rq_qos *rqos);
void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos);
-void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q);
#else
static inline void blk_mq_debugfs_register(struct request_queue *q)
{
}
-static inline void blk_mq_debugfs_unregister(struct request_queue *q)
-{
-}
-
static inline void blk_mq_debugfs_register_hctx(struct request_queue *q,
struct blk_mq_hw_ctx *hctx)
{
@@ -85,10 +81,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos)
static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos)
{
}
-
-static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q)
-{
-}
#endif
#ifdef CONFIG_BLK_DEBUG_FS_ZONED
diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c
index b595a94c4d16..d47b5c73c9eb 100644
--- a/block/blk-mq-pci.c
+++ b/block/blk-mq-pci.c
@@ -4,7 +4,6 @@
*/
#include <linux/kobject.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
#include <linux/blk-mq-pci.h>
#include <linux/pci.h>
#include <linux/module.h>
@@ -23,8 +22,8 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
*/
-int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
- int offset)
+void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
+ int offset)
{
const struct cpumask *mask;
unsigned int queue, cpu;
@@ -38,11 +37,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev,
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
- return 0;
+ return;
fallback:
WARN_ON_ONCE(qmap->nr_queues > 1);
blk_mq_clear_mq_map(qmap);
- return 0;
}
EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues);
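For PCI drivers the affinity-based variant is typically wired up the same way; a hypothetical example ("example_dev" and its "pdev" member are placeholders, not from this patch):

static void example_pci_map_queues(struct blk_mq_tag_set *set)
{
	struct example_dev *dev = set->driver_data;

	/* Map each hw queue to the CPUs with IRQ affinity for its vector. */
	blk_mq_pci_map_queues(&set->map[HCTX_TYPE_DEFAULT], dev->pdev, 0);
}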
diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c
deleted file mode 100644
index 14f968e58b8f..000000000000
--- a/block/blk-mq-rdma.c
+++ /dev/null
@@ -1,44 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (c) 2017 Sagi Grimberg.
- */
-#include <linux/blk-mq.h>
-#include <linux/blk-mq-rdma.h>
-#include <rdma/ib_verbs.h>
-
-/**
- * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device
- * @map: CPU to hardware queue map.
- * @dev: rdma device to provide a mapping for.
- * @first_vec: first interrupt vectors to use for queues (usually 0)
- *
- * This function assumes the rdma device @dev has at least as many available
- * interrupt vetors as @set has queues. It will then query it's affinity mask
- * and built queue mapping that maps a queue to the CPUs that have irq affinity
- * for the corresponding vector.
- *
- * In case either the driver passed a @dev with less vectors than
- * @set->nr_hw_queues, or @dev does not provide an affinity mask for a
- * vector, we fallback to the naive mapping.
- */
-int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map,
- struct ib_device *dev, int first_vec)
-{
- const struct cpumask *mask;
- unsigned int queue, cpu;
-
- for (queue = 0; queue < map->nr_queues; queue++) {
- mask = ib_get_vector_affinity(dev, first_vec + queue);
- if (!mask)
- goto fallback;
-
- for_each_cpu(cpu, mask)
- map->mq_map[cpu] = map->queue_offset + queue;
- }
-
- return 0;
-
-fallback:
- return blk_mq_map_queues(map);
-}
-EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues);
diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c
index fdcc2c1dd178..451a2c1f1f32 100644
--- a/block/blk-mq-sched.c
+++ b/block/blk-mq-sched.c
@@ -6,7 +6,7 @@
*/
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/blk-mq.h>
+#include <linux/list_sort.h>
#include <trace/events/block.h>
@@ -14,70 +14,64 @@
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
-#include "blk-mq-tag.h"
#include "blk-wbt.h"
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *))
+/*
+ * Mark a hardware queue as needing a restart.
+ */
+void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
+ if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ return;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (exit && hctx->sched_data)
- exit(hctx);
- kfree(hctx->sched_data);
- hctx->sched_data = NULL;
- }
+ set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
-EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
+EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
-void blk_mq_sched_assign_ioc(struct request *rq)
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
- struct request_queue *q = rq->q;
- struct io_context *ioc;
- struct io_cq *icq;
+ clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
/*
- * May not have an IO context if it's a passthrough request
+ * Order clearing SCHED_RESTART against the list_empty_careful(&hctx->dispatch)
+ * check in blk_mq_run_hw_queue(). Its pair is the barrier in
+ * blk_mq_dispatch_rq_list(). Without it the dispatch code might not see
+ * SCHED_RESTART while a new request added to hctx->dispatch is missed by
+ * the check in blk_mq_run_hw_queue().
*/
- ioc = current->io_context;
- if (!ioc)
- return;
+ smp_mb();
- spin_lock_irq(&q->queue_lock);
- icq = ioc_lookup_icq(ioc, q);
- spin_unlock_irq(&q->queue_lock);
-
- if (!icq) {
- icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
- if (!icq)
- return;
- }
- get_io_context(icq->ioc);
- rq->elv.icq = icq;
+ blk_mq_run_hw_queue(hctx, true);
}
-/*
- * Mark a hardware queue as needing a restart. For shared queues, maintain
- * a count of how many hardware queues are marked for restart.
- */
-void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
+static int sched_rq_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
- if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return;
+ struct request *rqa = container_of(a, struct request, queuelist);
+ struct request *rqb = container_of(b, struct request, queuelist);
- set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ return rqa->mq_hctx > rqb->mq_hctx;
}
-EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
+static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
- if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
- return;
- clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
+ struct blk_mq_hw_ctx *hctx =
+ list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
+ struct request *rq;
+ LIST_HEAD(hctx_list);
+ unsigned int count = 0;
- blk_mq_run_hw_queue(hctx, true);
+ list_for_each_entry(rq, rq_list, queuelist) {
+ if (rq->mq_hctx != hctx) {
+ list_cut_before(&hctx_list, rq_list, &rq->queuelist);
+ goto dispatch;
+ }
+ count++;
+ }
+ list_splice_tail_init(rq_list, &hctx_list);
+
+dispatch:
+ return blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
}
#define BLK_MQ_BUDGET_DELAY 3 /* ms units */
@@ -85,35 +79,45 @@ void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
* be run again. This is necessary to avoid starving flushes.
*/
-static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct elevator_queue *e = q->elevator;
+ bool multi_hctxs = false, run_queue = false;
+ bool dispatched = false, busy = false;
+ unsigned int max_dispatch;
LIST_HEAD(rq_list);
- int ret = 0;
+ int count = 0;
+
+ if (hctx->dispatch_busy)
+ max_dispatch = 1;
+ else
+ max_dispatch = hctx->queue->nr_requests;
do {
struct request *rq;
+ int budget_token;
if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
break;
if (!list_empty_careful(&hctx->dispatch)) {
- ret = -EAGAIN;
+ busy = true;
break;
}
- if (!blk_mq_get_dispatch_budget(hctx))
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
break;
rq = e->type->ops.dispatch_request(hctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -121,17 +125,70 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
* no guarantee anyone will kick the queue. Kick it
* ourselves.
*/
- blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ run_queue = true;
break;
}
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
* in blk_mq_dispatch_rq_list().
*/
- list_add(&rq->queuelist, &rq_list);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ list_add_tail(&rq->queuelist, &rq_list);
+ count++;
+ if (rq->mq_hctx != hctx)
+ multi_hctxs = true;
+
+ /*
+ * If we cannot get tag for the request, stop dequeueing
+ * requests from the IO scheduler. We are unlikely to be able
+ * to submit them anyway and it creates false impression for
+ * scheduling heuristics that the device can take more IO.
+ */
+ if (!blk_mq_get_driver_tag(rq))
+ break;
+ } while (count < max_dispatch);
+
+ if (!count) {
+ if (run_queue)
+ blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
+ } else if (multi_hctxs) {
+ /*
+ * Requests from different hctxs may be dequeued by some
+ * schedulers, such as bfq and deadline.
+ *
+ * Sort the requests in the list according to their hctx, then
+ * dispatch batches of requests from the same hctx at a time.
+ */
+ list_sort(NULL, &rq_list, sched_rq_cmp);
+ do {
+ dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
+ } while (!list_empty(&rq_list));
+ } else {
+ dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
+ }
+
+ if (busy)
+ return -EAGAIN;
+ return !!dispatched;
+}
+
+static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
+{
+ unsigned long end = jiffies + HZ;
+ int ret;
+
+ do {
+ ret = __blk_mq_do_dispatch_sched(hctx);
+ if (ret != 1)
+ break;
+ if (need_resched() || time_is_before_jiffies(end)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ break;
+ }
+ } while (1);
return ret;
}
@@ -150,10 +207,10 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
/*
* Only SCSI implements .get_budget and .put_budget, and SCSI restarts
* its queue by itself in its completion handler, so we don't need to
- * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
+ * restart queue if .get_budget() fails to get the budget.
*
* Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to
- * to be run again. This is necessary to avoid starving flushes.
+ * be run again. This is necessary to avoid starving flushes.
*/
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
@@ -161,9 +218,10 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
LIST_HEAD(rq_list);
struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
int ret = 0;
+ struct request *rq;
do {
- struct request *rq;
+ int budget_token;
if (!list_empty_careful(&hctx->dispatch)) {
ret = -EAGAIN;
@@ -173,12 +231,13 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
if (!sbitmap_any_bit_set(&hctx->ctx_map))
break;
- if (!blk_mq_get_dispatch_budget(hctx))
+ budget_token = blk_mq_get_dispatch_budget(q);
+ if (budget_token < 0)
break;
rq = blk_mq_dequeue_from_ctx(hctx, ctx);
if (!rq) {
- blk_mq_put_dispatch_budget(hctx);
+ blk_mq_put_dispatch_budget(q, budget_token);
/*
* We're releasing without dispatching. Holding the
* budget could have blocked any "hctx"s with the
@@ -190,6 +249,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
break;
}
+ blk_mq_set_rq_budget_token(rq, budget_token);
+
/*
* Now this rq owns the budget which has to be released
* if this rq won't be queued to driver via .queue_rq()
@@ -200,7 +261,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
/* round robin for fair dispatch */
ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);
- } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
+ } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));
WRITE_ONCE(hctx->dispatch_from, ctx);
return ret;
@@ -208,10 +269,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
- struct request_queue *q = hctx->queue;
- struct elevator_queue *e = q->elevator;
- const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
- int ret = 0;
+ bool need_dispatch = false;
LIST_HEAD(rq_list);
/*
@@ -240,23 +298,22 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
*/
if (!list_empty(&rq_list)) {
blk_mq_sched_mark_restart_hctx(hctx);
- if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
- if (has_sched_dispatch)
- ret = blk_mq_do_dispatch_sched(hctx);
- else
- ret = blk_mq_do_dispatch_ctx(hctx);
- }
- } else if (has_sched_dispatch) {
- ret = blk_mq_do_dispatch_sched(hctx);
- } else if (hctx->dispatch_busy) {
- /* dequeue request one by one from sw queue if queue is busy */
- ret = blk_mq_do_dispatch_ctx(hctx);
+ if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0))
+ return 0;
+ need_dispatch = true;
} else {
- blk_mq_flush_busy_ctxs(hctx, &rq_list);
- blk_mq_dispatch_rq_list(q, &rq_list, false);
+ need_dispatch = hctx->dispatch_busy;
}
- return ret;
+ if (hctx->queue->elevator)
+ return blk_mq_do_dispatch_sched(hctx);
+
+ /* dequeue request one by one from sw queue if queue is busy */
+ if (need_dispatch)
+ return blk_mq_do_dispatch_ctx(hctx);
+ blk_mq_flush_busy_ctxs(hctx, &rq_list);
+ blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
+ return 0;
}
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
@@ -267,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
return;
- hctx->run++;
-
/*
* A return of -EAGAIN is an indication that hctx->dispatch is not
* empty and we must run again in order to avoid starving flushes.
@@ -279,355 +334,168 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
}
}
-bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **merged_request)
-{
- struct request *rq;
-
- switch (elv_merge(q, &rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (!blk_mq_sched_allow_merge(q, rq, bio))
- return false;
- if (!bio_attempt_back_merge(rq, bio, nr_segs))
- return false;
- *merged_request = attempt_back_merge(q, rq);
- if (!*merged_request)
- elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
- return true;
- case ELEVATOR_FRONT_MERGE:
- if (!blk_mq_sched_allow_merge(q, rq, bio))
- return false;
- if (!bio_attempt_front_merge(rq, bio, nr_segs))
- return false;
- *merged_request = attempt_front_merge(q, rq);
- if (!*merged_request)
- elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
- return true;
- case ELEVATOR_DISCARD_MERGE:
- return bio_attempt_discard_merge(q, rq, bio);
- default:
- return false;
- }
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
-
-/*
- * Iterate list of requests and see if we can merge this bio with any
- * of them.
- */
-bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
- struct bio *bio, unsigned int nr_segs)
-{
- struct request *rq;
- int checked = 8;
-
- list_for_each_entry_reverse(rq, list, queuelist) {
- bool merged = false;
-
- if (!checked--)
- break;
-
- if (!blk_rq_merge_ok(rq, bio))
- continue;
-
- switch (blk_try_merge(rq, bio)) {
- case ELEVATOR_BACK_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_back_merge(rq, bio,
- nr_segs);
- break;
- case ELEVATOR_FRONT_MERGE:
- if (blk_mq_sched_allow_merge(q, rq, bio))
- merged = bio_attempt_front_merge(rq, bio,
- nr_segs);
- break;
- case ELEVATOR_DISCARD_MERGE:
- merged = bio_attempt_discard_merge(q, rq, bio);
- break;
- default:
- continue;
- }
-
- return merged;
- }
-
- return false;
-}
-EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);
-
-/*
- * Reverse check our software queue for entries that we could potentially
- * merge with. Currently includes a hand-wavy stop count of 8, to not spend
- * too much time checking for merges.
- */
-static bool blk_mq_attempt_merge(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx, struct bio *bio,
- unsigned int nr_segs)
-{
- enum hctx_type type = hctx->type;
-
- lockdep_assert_held(&ctx->lock);
-
- if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
- ctx->rq_merged++;
- return true;
- }
-
- return false;
-}
-
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
- struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
+ struct blk_mq_ctx *ctx;
+ struct blk_mq_hw_ctx *hctx;
bool ret = false;
enum hctx_type type;
- if (e && e->type->ops.bio_merge)
- return e->type->ops.bio_merge(hctx, bio, nr_segs);
+ if (e && e->type->ops.bio_merge) {
+ ret = e->type->ops.bio_merge(q, bio, nr_segs);
+ goto out_put;
+ }
+ ctx = blk_mq_get_ctx(q);
+ hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
type = hctx->type;
- if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
- !list_empty_careful(&ctx->rq_lists[type])) {
- /* default per sw-queue merge */
- spin_lock(&ctx->lock);
- ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
- spin_unlock(&ctx->lock);
- }
+ if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) ||
+ list_empty_careful(&ctx->rq_lists[type]))
+ goto out_put;
+
+ /* default per sw-queue merge */
+ spin_lock(&ctx->lock);
+ /*
+ * Reverse check our software queue for entries that we could
+ * potentially merge with. Currently includes a hand-wavy stop
+ * count of 8, to not spend too much time checking for merges.
+ */
+ if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs))
+ ret = true;
+ spin_unlock(&ctx->lock);
+out_put:
return ret;
}
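
The per-sw-queue merge path above keeps its old bounded reverse scan, now open-coded under ctx->lock. A standalone C sketch of that idea follows; it is not kernel code: sketch_rq, sketch_bio and sketch_try_back_merge are invented stand-ins, and only back merges are modelled.

/* Standalone sketch (not kernel code): bounded reverse scan of a software
 * queue for a back-merge candidate, mirroring the "hand-wavy stop count
 * of 8" described in the comment above. */
#include <stdbool.h>

struct sketch_rq {
	unsigned long long sector;	/* start sector */
	unsigned int nr_sectors;	/* length in sectors */
	struct sketch_rq *prev;		/* newest entry is the list tail */
};

struct sketch_bio {
	unsigned long long sector;
	unsigned int nr_sectors;
};

/* Walk backwards from the newest request; give up after 8 checks so the
 * submission path never spends unbounded time hunting for merges. */
static bool sketch_try_back_merge(struct sketch_rq *tail, struct sketch_bio *bio)
{
	int checked = 8;

	for (struct sketch_rq *rq = tail; rq && checked--; rq = rq->prev) {
		if (rq->sector + rq->nr_sectors == bio->sector) {
			rq->nr_sectors += bio->nr_sectors;	/* back merge */
			return true;
		}
	}
	return false;
}

int main(void)
{
	struct sketch_rq r2 = { 100, 8, NULL };		/* older request */
	struct sketch_rq r1 = { 200, 8, &r2 };		/* newest request */
	struct sketch_bio bio = { 108, 8 };		/* contiguous with r2 */

	return sketch_try_back_merge(&r1, &bio) ? 0 : 1;
}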
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free)
{
- return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
+ return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);
-void blk_mq_sched_request_inserted(struct request *rq)
+static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q,
+ struct blk_mq_hw_ctx *hctx,
+ unsigned int hctx_idx)
{
- trace_block_rq_insert(rq->q, rq);
-}
-EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
-
-static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
- bool has_sched,
- struct request *rq)
-{
- /*
- * dispatch flush and passthrough rq directly
- *
- * passthrough request has to be added to hctx->dispatch directly.
- * For some reason, device may be in one situation which can't
- * handle FS request, so STS_RESOURCE is always returned and the
- * FS request will be added to hctx->dispatch. However passthrough
- * request may be required at that time for fixing the problem. If
- * passthrough request is added to scheduler queue, there isn't any
- * chance to dispatch it given we prioritize requests in hctx->dispatch.
- */
- if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
- return true;
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ hctx->sched_tags = q->sched_shared_tags;
+ return 0;
+ }
- if (has_sched)
- rq->rq_flags |= RQF_SORTED;
+ hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx,
+ q->nr_requests);
- return false;
+ if (!hctx->sched_tags)
+ return -ENOMEM;
+ return 0;
}
-void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async)
+static void blk_mq_exit_sched_shared_tags(struct request_queue *queue)
{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- /* flush rq in flush machinery need to be dispatched directly */
- if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
- blk_insert_flush(rq);
- goto run;
- }
-
- WARN_ON(e && (rq->tag != -1));
-
- if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) {
- /*
- * Firstly normal IO request is inserted to scheduler queue or
- * sw queue, meantime we add flush request to dispatch queue(
- * hctx->dispatch) directly and there is at most one in-flight
- * flush request for each hw queue, so it doesn't matter to add
- * flush request to tail or front of the dispatch queue.
- *
- * Secondly in case of NCQ, flush request belongs to non-NCQ
- * command, and queueing it will fail when there is any
- * in-flight normal IO request(NCQ command). When adding flush
- * rq to the front of hctx->dispatch, it is easier to introduce
- * extra time to flush rq's latency because of S_SCHED_RESTART
- * compared with adding to the tail of dispatch queue, then
- * chance of flush merge is increased, and less flush requests
- * will be issued to controller. It is observed that ~10% time
- * is saved in blktests block/004 on disk attached to AHCI/NCQ
- * drive when adding flush rq to the front of hctx->dispatch.
- *
- * Simply queue flush rq to the front of hctx->dispatch so that
- * intensive flush workloads can benefit in case of NCQ HW.
- */
- at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head;
- blk_mq_request_bypass_insert(rq, at_head, false);
- goto run;
- }
-
- if (e && e->type->ops.insert_requests) {
- LIST_HEAD(list);
-
- list_add(&rq->queuelist, &list);
- e->type->ops.insert_requests(hctx, &list, at_head);
- } else {
- spin_lock(&ctx->lock);
- __blk_mq_insert_request(hctx, rq, at_head);
- spin_unlock(&ctx->lock);
- }
-
-run:
- if (run_queue)
- blk_mq_run_hw_queue(hctx, async);
+ blk_mq_free_rq_map(queue->sched_shared_tags);
+ queue->sched_shared_tags = NULL;
}
-void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx,
- struct list_head *list, bool run_queue_async)
+/* called in queue's release handler, tagset has gone away */
+static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags)
{
- struct elevator_queue *e;
- struct request_queue *q = hctx->queue;
-
- /*
- * blk_mq_sched_insert_requests() is called from flush plug
- * context only, and hold one usage counter to prevent queue
- * from being released.
- */
- percpu_ref_get(&q->q_usage_counter);
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
- e = hctx->queue->elevator;
- if (e && e->type->ops.insert_requests)
- e->type->ops.insert_requests(hctx, list, false);
- else {
- /*
- * try to issue requests directly if the hw queue isn't
- * busy in case of 'none' scheduler, and this way may save
- * us one extra enqueue & dequeue to sw queue.
- */
- if (!hctx->dispatch_busy && !e && !run_queue_async) {
- blk_mq_try_issue_list_directly(hctx, list);
- if (list_empty(list))
- goto out;
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_tags) {
+ if (!blk_mq_is_shared_tags(flags))
+ blk_mq_free_rq_map(hctx->sched_tags);
+ hctx->sched_tags = NULL;
}
- blk_mq_insert_requests(hctx, ctx, list);
}
- blk_mq_run_hw_queue(hctx, run_queue_async);
- out:
- percpu_ref_put(&q->q_usage_counter);
+ if (blk_mq_is_shared_tags(flags))
+ blk_mq_exit_sched_shared_tags(q);
}
-static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
+static int blk_mq_init_sched_shared_tags(struct request_queue *queue)
{
- if (hctx->sched_tags) {
- blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
-}
+ struct blk_mq_tag_set *set = queue->tag_set;
-static int blk_mq_sched_alloc_tags(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx,
- unsigned int hctx_idx)
-{
- struct blk_mq_tag_set *set = q->tag_set;
- int ret;
-
- hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
- set->reserved_tags);
- if (!hctx->sched_tags)
+ /*
+ * Set initial depth at max so that we don't need to reallocate for
+ * updating nr_requests.
+ */
+ queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set,
+ BLK_MQ_NO_HCTX_IDX,
+ MAX_SCHED_RQ);
+ if (!queue->sched_shared_tags)
return -ENOMEM;
- ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
- if (ret)
- blk_mq_sched_free_tags(set, hctx, hctx_idx);
+ blk_mq_tag_update_sched_shared_tags(queue);
- return ret;
-}
-
-/* called in queue's release handler, tagset has gone away */
-static void blk_mq_sched_tags_teardown(struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags) {
- blk_mq_free_rq_map(hctx->sched_tags);
- hctx->sched_tags = NULL;
- }
- }
+ return 0;
}
+/* caller must have a reference to @e, will grab another one if successful */
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
+ unsigned int flags = q->tag_set->flags;
struct blk_mq_hw_ctx *hctx;
struct elevator_queue *eq;
- unsigned int i;
+ unsigned long i;
int ret;
- if (!e) {
- q->elevator = NULL;
- q->nr_requests = q->tag_set->queue_depth;
- return 0;
- }
-
/*
* Default to double of smaller one between hw queue_depth and 128,
* since we don't split into sync/async like the old code did.
* Additionally, this is a per-hw queue depth.
*/
q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
- BLKDEV_MAX_RQ);
+ BLKDEV_DEFAULT_RQ);
+
+ if (blk_mq_is_shared_tags(flags)) {
+ ret = blk_mq_init_sched_shared_tags(q);
+ if (ret)
+ return ret;
+ }
queue_for_each_hw_ctx(q, hctx, i) {
- ret = blk_mq_sched_alloc_tags(q, hctx, i);
+ ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i);
if (ret)
- goto err;
+ goto err_free_map_and_rqs;
}
ret = e->ops.init_sched(q, e);
if (ret)
- goto err;
+ goto err_free_map_and_rqs;
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
queue_for_each_hw_ctx(q, hctx, i) {
if (e->ops.init_hctx) {
ret = e->ops.init_hctx(hctx, i);
if (ret) {
eq = q->elevator;
- blk_mq_sched_free_requests(q);
+ blk_mq_sched_free_rqs(q);
blk_mq_exit_sched(q, eq);
kobject_put(&eq->kobj);
return ret;
}
}
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_register_sched_hctx(q, hctx);
+ mutex_unlock(&q->debugfs_mutex);
}
return 0;
-err:
- blk_mq_sched_free_requests(q);
- blk_mq_sched_tags_teardown(q);
+err_free_map_and_rqs:
+ blk_mq_sched_free_rqs(q);
+ blk_mq_sched_tags_teardown(q, flags);
+
q->elevator = NULL;
return ret;
}
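
The comment in blk_mq_init_sched() above sets the default scheduler depth to double the smaller of the hardware queue depth and BLKDEV_DEFAULT_RQ. A small standalone sketch of that calculation, assuming BLKDEV_DEFAULT_RQ is 128 as in current kernels (sketch_* names are invented):

/* Standalone sketch (not kernel code): the default nr_requests picked when
 * attaching an I/O scheduler. */
#include <stdio.h>

#define SKETCH_BLKDEV_DEFAULT_RQ 128u

static unsigned int sketch_default_nr_requests(unsigned int hw_queue_depth)
{
	unsigned int smaller = hw_queue_depth < SKETCH_BLKDEV_DEFAULT_RQ ?
			       hw_queue_depth : SKETCH_BLKDEV_DEFAULT_RQ;

	return 2 * smaller;	/* double of the smaller one, per hw queue */
}

int main(void)
{
	/* a shallow SATA-ish depth of 32 gives 64; a deep queue caps at 256 */
	printf("%u %u\n", sketch_default_nr_requests(32),
	       sketch_default_nr_requests(1024));
	return 0;
}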
@@ -636,32 +504,47 @@ err:
* called in either blk_queue_cleanup or elevator_switch, tagset
* is required for freeing requests
*/
-void blk_mq_sched_free_requests(struct request_queue *q)
+void blk_mq_sched_free_rqs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->sched_tags)
- blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ blk_mq_free_rqs(q->tag_set, q->sched_shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ } else {
+ queue_for_each_hw_ctx(q, hctx, i) {
+ if (hctx->sched_tags)
+ blk_mq_free_rqs(q->tag_set,
+ hctx->sched_tags, i);
+ }
}
}
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
+ unsigned int flags = 0;
queue_for_each_hw_ctx(q, hctx, i) {
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched_hctx(hctx);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_hctx && hctx->sched_data) {
e->type->ops.exit_hctx(hctx, i);
hctx->sched_data = NULL;
}
+ flags = hctx->flags;
}
+
+ mutex_lock(&q->debugfs_mutex);
blk_mq_debugfs_unregister_sched(q);
+ mutex_unlock(&q->debugfs_mutex);
+
if (e->type->ops.exit_sched)
e->type->ops.exit_sched(e);
- blk_mq_sched_tags_teardown(q);
+ blk_mq_sched_tags_teardown(q, flags);
q->elevator = NULL;
}
diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h
index 126021fc3a11..1326526bb733 100644
--- a/block/blk-mq-sched.h
+++ b/block/blk-mq-sched.h
@@ -2,72 +2,69 @@
#ifndef BLK_MQ_SCHED_H
#define BLK_MQ_SCHED_H
+#include "elevator.h"
#include "blk-mq.h"
-#include "blk-mq-tag.h"
-void blk_mq_sched_free_hctx_data(struct request_queue *q,
- void (*exit)(struct blk_mq_hw_ctx *));
+#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ)
-void blk_mq_sched_assign_ioc(struct request *rq);
-
-void blk_mq_sched_request_inserted(struct request *rq);
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs, struct request **merged_request);
-bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
+bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs);
-bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq);
+bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free);
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx);
-void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
-
-void blk_mq_sched_insert_request(struct request *rq, bool at_head,
- bool run_queue, bool async);
-void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_ctx *ctx,
- struct list_head *list, bool run_queue_async);
+void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx);
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx);
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e);
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e);
-void blk_mq_sched_free_requests(struct request_queue *q);
+void blk_mq_sched_free_rqs(struct request_queue *q);
-static inline bool
-blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs)
+static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
- if (blk_queue_nomerges(q) || !bio_mergeable(bio))
- return false;
+ if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
+ __blk_mq_sched_restart(hctx);
+}
- return __blk_mq_sched_bio_merge(q, bio, nr_segs);
+static inline bool bio_mergeable(struct bio *bio)
+{
+ return !(bio->bi_opf & REQ_NOMERGE_FLAGS);
}
static inline bool
blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- struct elevator_queue *e = q->elevator;
-
- if (e && e->type->ops.allow_merge)
- return e->type->ops.allow_merge(q, rq, bio);
+ if (rq->rq_flags & RQF_USE_SCHED) {
+ struct elevator_queue *e = q->elevator;
+ if (e->type->ops.allow_merge)
+ return e->type->ops.allow_merge(q, rq, bio);
+ }
return true;
}
static inline void blk_mq_sched_completed_request(struct request *rq, u64 now)
{
- struct elevator_queue *e = rq->q->elevator;
+ if (rq->rq_flags & RQF_USE_SCHED) {
+ struct elevator_queue *e = rq->q->elevator;
- if (e && e->type->ops.completed_request)
- e->type->ops.completed_request(rq, now);
+ if (e->type->ops.completed_request)
+ e->type->ops.completed_request(rq, now);
+ }
}
static inline void blk_mq_sched_requeue_request(struct request *rq)
{
- struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
+ if (rq->rq_flags & RQF_USE_SCHED) {
+ struct request_queue *q = rq->q;
+ struct elevator_queue *e = q->elevator;
- if (e && e->type->ops.requeue_request)
- e->type->ops.requeue_request(rq);
+ if (e->type->ops.requeue_request)
+ e->type->ops.requeue_request(rq);
+ }
}
static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx)
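
The reworked inline helpers above only dereference the elevator ops for requests that carry RQF_USE_SCHED. A standalone sketch of that gating pattern follows; it is not the kernel implementation, and the sketch_* names and the single-hook ops table are invented stand-ins.

/* Standalone sketch (not kernel code): gate an optional scheduler callback on
 * a per-request flag, as the inline helpers above do with RQF_USE_SCHED. */
#include <stdio.h>

#define SKETCH_RQF_USE_SCHED (1u << 0)

struct sketch_sched_ops {
	void (*completed_request)(int tag);	/* optional hook */
};

struct sketch_request {
	unsigned int rq_flags;
	int tag;
	const struct sketch_sched_ops *sched;
};

static void sketch_completed(struct sketch_request *rq)
{
	/* Only requests that went through the scheduler consult its ops;
	 * plain driver-tag requests skip the indirection entirely. */
	if ((rq->rq_flags & SKETCH_RQF_USE_SCHED) &&
	    rq->sched->completed_request)
		rq->sched->completed_request(rq->tag);
}

static void report(int tag)
{
	printf("scheduler saw completion of tag %d\n", tag);
}

int main(void)
{
	const struct sketch_sched_ops ops = { .completed_request = report };
	struct sketch_request sched_rq = { SKETCH_RQF_USE_SCHED, 7, &ops };
	struct sketch_request plain_rq = { 0, 8, &ops };

	sketch_completed(&sched_rq);	/* prints */
	sketch_completed(&plain_rq);	/* silently skips the hook */
	return 0;
}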
diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c
index 062229395a50..156e9bb07abf 100644
--- a/block/blk-mq-sysfs.c
+++ b/block/blk-mq-sysfs.c
@@ -10,10 +10,8 @@
#include <linux/workqueue.h>
#include <linux/smp.h>
-#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
-#include "blk-mq-tag.h"
static void blk_mq_sysfs_release(struct kobject *kobj)
{
@@ -36,10 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx,
kobj);
- cancel_delayed_work_sync(&hctx->run_work);
-
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- cleanup_srcu_struct(hctx->srcu);
blk_free_flush_queue(hctx->fq);
sbitmap_free(&hctx->ctx_map);
free_cpumask_var(hctx->cpumask);
@@ -47,60 +41,11 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj)
kfree(hctx);
}
-struct blk_mq_ctx_sysfs_entry {
- struct attribute attr;
- ssize_t (*show)(struct blk_mq_ctx *, char *);
- ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t);
-};
-
struct blk_mq_hw_ctx_sysfs_entry {
struct attribute attr;
ssize_t (*show)(struct blk_mq_hw_ctx *, char *);
- ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t);
};
-static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr,
- char *page)
-{
- struct blk_mq_ctx_sysfs_entry *entry;
- struct blk_mq_ctx *ctx;
- struct request_queue *q;
- ssize_t res;
-
- entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
- ctx = container_of(kobj, struct blk_mq_ctx, kobj);
- q = ctx->queue;
-
- if (!entry->show)
- return -EIO;
-
- mutex_lock(&q->sysfs_lock);
- res = entry->show(ctx, page);
- mutex_unlock(&q->sysfs_lock);
- return res;
-}
-
-static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr,
- const char *page, size_t length)
-{
- struct blk_mq_ctx_sysfs_entry *entry;
- struct blk_mq_ctx *ctx;
- struct request_queue *q;
- ssize_t res;
-
- entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr);
- ctx = container_of(kobj, struct blk_mq_ctx, kobj);
- q = ctx->queue;
-
- if (!entry->store)
- return -EIO;
-
- mutex_lock(&q->sysfs_lock);
- res = entry->store(ctx, page, length);
- mutex_unlock(&q->sysfs_lock);
- return res;
-}
-
static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
struct attribute *attr, char *page)
{
@@ -122,28 +67,6 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj,
return res;
}
-static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj,
- struct attribute *attr, const char *page,
- size_t length)
-{
- struct blk_mq_hw_ctx_sysfs_entry *entry;
- struct blk_mq_hw_ctx *hctx;
- struct request_queue *q;
- ssize_t res;
-
- entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr);
- hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj);
- q = hctx->queue;
-
- if (!entry->store)
- return -EIO;
-
- mutex_lock(&q->sysfs_lock);
- res = entry->store(hctx, page, length);
- mutex_unlock(&q->sysfs_lock);
- return res;
-}
-
static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx,
char *page)
{
@@ -200,27 +123,19 @@ static struct attribute *default_hw_ctx_attrs[] = {
};
ATTRIBUTE_GROUPS(default_hw_ctx);
-static const struct sysfs_ops blk_mq_sysfs_ops = {
- .show = blk_mq_sysfs_show,
- .store = blk_mq_sysfs_store,
-};
-
static const struct sysfs_ops blk_mq_hw_sysfs_ops = {
.show = blk_mq_hw_sysfs_show,
- .store = blk_mq_hw_sysfs_store,
};
-static struct kobj_type blk_mq_ktype = {
- .sysfs_ops = &blk_mq_sysfs_ops,
+static const struct kobj_type blk_mq_ktype = {
.release = blk_mq_sysfs_release,
};
-static struct kobj_type blk_mq_ctx_ktype = {
- .sysfs_ops = &blk_mq_sysfs_ops,
+static const struct kobj_type blk_mq_ctx_ktype = {
.release = blk_mq_ctx_sysfs_release,
};
-static struct kobj_type blk_mq_hw_ktype = {
+static const struct kobj_type blk_mq_hw_ktype = {
.sysfs_ops = &blk_mq_hw_sysfs_ops,
.default_groups = default_hw_ctx_groups,
.release = blk_mq_hw_sysfs_release,
@@ -244,7 +159,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
{
struct request_queue *q = hctx->queue;
struct blk_mq_ctx *ctx;
- int i, ret;
+ int i, j, ret;
if (!hctx->nr_ctx)
return 0;
@@ -256,29 +171,19 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx)
hctx_for_each_ctx(hctx, ctx, i) {
ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu);
if (ret)
- break;
+ goto out;
}
+ return 0;
+out:
+ hctx_for_each_ctx(hctx, ctx, j) {
+ if (j < i)
+ kobject_del(&ctx->kobj);
+ }
+ kobject_del(&hctx->kobj);
return ret;
}
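
blk_mq_register_hctx() above now unwinds only the ctx kobjects it managed to add before a failure. The same register-then-unwind shape in a standalone sketch; sketch_add() and sketch_del() are invented helpers and the third add is made to fail on purpose.

/* Standalone sketch (not kernel code): add N children, and on the first
 * failure delete exactly the ones that were added before bailing out,
 * mirroring the "if (j < i) kobject_del(...)" loop above. */
#include <stdbool.h>
#include <stdio.h>

#define SKETCH_NR_CTX 4

static bool sketch_add(int i)
{
	printf("add %d\n", i);
	return i != 2;			/* pretend the third add fails */
}

static void sketch_del(int i)
{
	printf("del %d\n", i);
}

static int sketch_register_all(void)
{
	int i, j;

	for (i = 0; i < SKETCH_NR_CTX; i++) {
		if (!sketch_add(i))
			goto out;
	}
	return 0;
out:
	for (j = 0; j < i; j++)		/* unwind only what was added */
		sketch_del(j);
	return -1;
}

int main(void)
{
	return sketch_register_all() ? 1 : 0;
}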
-void blk_mq_unregister_dev(struct device *dev, struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- lockdep_assert_held(&q->sysfs_dir_lock);
-
- queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_unregister_hctx(hctx);
-
- kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
- kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
-
- q->mq_sysfs_init_done = false;
-}
-
void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx)
{
kobject_init(&hctx->kobj, &blk_mq_hw_ktype);
@@ -311,15 +216,16 @@ void blk_mq_sysfs_init(struct request_queue *q)
}
}
-int __blk_mq_register_dev(struct device *dev, struct request_queue *q)
+int blk_mq_sysfs_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
- int ret, i;
+ unsigned long i, j;
+ int ret;
- WARN_ON_ONCE(!q->kobj.parent);
lockdep_assert_held(&q->sysfs_dir_lock);
- ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq");
+ ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq");
if (ret < 0)
goto out;
@@ -337,19 +243,37 @@ out:
return ret;
unreg:
- while (--i >= 0)
- blk_mq_unregister_hctx(q->queue_hw_ctx[i]);
+ queue_for_each_hw_ctx(q, hctx, j) {
+ if (j < i)
+ blk_mq_unregister_hctx(hctx);
+ }
kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
kobject_del(q->mq_kobj);
- kobject_put(&dev->kobj);
return ret;
}
-void blk_mq_sysfs_unregister(struct request_queue *q)
+void blk_mq_sysfs_unregister(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
+
+ lockdep_assert_held(&q->sysfs_dir_lock);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ blk_mq_unregister_hctx(hctx);
+
+ kobject_uevent(q->mq_kobj, KOBJ_REMOVE);
+ kobject_del(q->mq_kobj);
+
+ q->mq_sysfs_init_done = false;
+}
+
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
@@ -362,10 +286,11 @@ unlock:
mutex_unlock(&q->sysfs_dir_lock);
}
-int blk_mq_sysfs_register(struct request_queue *q)
+int blk_mq_sysfs_register_hctxs(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i, ret = 0;
+ unsigned long i;
+ int ret = 0;
mutex_lock(&q->sysfs_dir_lock);
if (!q->mq_sysfs_init_done)
diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c
index ae722f8b13fb..cc57e2dd9a0b 100644
--- a/block/blk-mq-tag.c
+++ b/block/blk-mq-tag.c
@@ -9,11 +9,25 @@
#include <linux/kernel.h>
#include <linux/module.h>
-#include <linux/blk-mq.h>
#include <linux/delay.h>
#include "blk.h"
#include "blk-mq.h"
-#include "blk-mq-tag.h"
+#include "blk-mq-sched.h"
+
+/*
+ * Recalculate wakeup batch when tag is shared by hctx.
+ */
+static void blk_mq_update_wake_batch(struct blk_mq_tags *tags,
+ unsigned int users)
+{
+ if (!users)
+ return;
+
+ sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags,
+ users);
+ sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags,
+ users);
+}
/*
* If a previously inactive queue goes active, bump the active user count.
@@ -21,13 +35,32 @@
* to get tag when first time, the other shared-tag users could reserve
* budget for it.
*/
-bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
{
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) &&
- !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- atomic_inc(&hctx->tags->active_queues);
+ unsigned int users;
+ struct blk_mq_tags *tags = hctx->tags;
- return true;
+ /*
+ * calling test_bit() prior to test_and_set_bit() is intentional,
+ * it avoids dirtying the cacheline if the queue is already active.
+ */
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
+ if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) ||
+ test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ return;
+ } else {
+ if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) ||
+ test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ }
+
+ spin_lock_irq(&tags->lock);
+ users = tags->active_queues + 1;
+ WRITE_ONCE(tags->active_queues, users);
+ blk_mq_update_wake_batch(tags, users);
+ spin_unlock_irq(&tags->lock);
}
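
__blk_mq_tag_busy() above deliberately calls test_bit() before test_and_set_bit() so that already-active queues never issue the read-modify-write that would dirty the shared cacheline. A standalone userspace sketch of the same idiom, with C11 atomics standing in for the kernel bitops; the sketch_* names are invented.

/* Standalone sketch (not kernel code): plain read first, atomic RMW only on
 * the inactive-to-active transition. */
#include <stdatomic.h>
#include <stdbool.h>

#define SKETCH_ACTIVE_BIT (1u << 0)

/* Returns true only for the transition from inactive to active. */
static bool sketch_mark_active(atomic_uint *state)
{
	if (atomic_load_explicit(state, memory_order_relaxed) & SKETCH_ACTIVE_BIT)
		return false;	/* already active: cheap read, no RMW */

	return !(atomic_fetch_or(state, SKETCH_ACTIVE_BIT) & SKETCH_ACTIVE_BIT);
}

int main(void)
{
	atomic_uint state = 0;

	(void)sketch_mark_active(&state);	/* first caller flips the bit */
	(void)sketch_mark_active(&state);	/* later callers take the read-only path */
	return 0;
}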
/*
@@ -47,58 +80,56 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve)
void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
{
struct blk_mq_tags *tags = hctx->tags;
+ unsigned int users;
- if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return;
-
- atomic_dec(&tags->active_queues);
-
- blk_mq_tag_wakeup_all(tags, false);
-}
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
-/*
- * For shared tag users, we track the number of currently active users
- * and attempt to provide a fair share of the tag depth for each of them.
- */
-static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
- struct sbitmap_queue *bt)
-{
- unsigned int depth, users;
-
- if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED))
- return true;
- if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
- return true;
+ if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE,
+ &q->queue_flags))
+ return;
+ } else {
+ if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return;
+ }
- /*
- * Don't try dividing an ant
- */
- if (bt->sb.depth == 1)
- return true;
+ spin_lock_irq(&tags->lock);
+ users = tags->active_queues - 1;
+ WRITE_ONCE(tags->active_queues, users);
+ blk_mq_update_wake_batch(tags, users);
+ spin_unlock_irq(&tags->lock);
- users = atomic_read(&hctx->tags->active_queues);
- if (!users)
- return true;
-
- /*
- * Allow at least some tags
- */
- depth = max((bt->sb.depth + users - 1) / users, 4U);
- return atomic_read(&hctx->nr_active) < depth;
+ blk_mq_tag_wakeup_all(tags, false);
}
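
The hctx_may_queue() logic removed from this file above (it lives on elsewhere in blk-mq) divides the tag depth among the currently active shared users, rounding up and never handing out fewer than four tags. A standalone sketch of that fair-share check, with invented sketch_* names:

/* Standalone sketch (not kernel code): fair share of a shared tag depth. */
#include <stdbool.h>
#include <stdio.h>

static bool sketch_may_queue(unsigned int total_depth, unsigned int users,
			     unsigned int nr_active)
{
	unsigned int share;

	if (total_depth == 1 || users == 0)
		return true;		/* nothing sensible to divide */

	share = (total_depth + users - 1) / users;	/* round up */
	if (share < 4)
		share = 4;		/* always allow at least some tags */

	return nr_active < share;
}

int main(void)
{
	/* 64 tags shared by 8 active queues: each may hold 8 in flight. */
	printf("%d %d\n", sketch_may_queue(64, 8, 7), sketch_may_queue(64, 8, 8));
	return 0;
}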
static int __blk_mq_get_tag(struct blk_mq_alloc_data *data,
struct sbitmap_queue *bt)
{
- if (!(data->flags & BLK_MQ_REQ_INTERNAL) &&
- !hctx_may_queue(data->hctx, bt))
+ if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) &&
+ !hctx_may_queue(data->hctx, bt))
return BLK_MQ_NO_TAG;
+
if (data->shallow_depth)
- return __sbitmap_queue_get_shallow(bt, data->shallow_depth);
+ return sbitmap_queue_get_shallow(bt, data->shallow_depth);
else
return __sbitmap_queue_get(bt);
}
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+ unsigned int *offset)
+{
+ struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct sbitmap_queue *bt = &tags->bitmap_tags;
+ unsigned long ret;
+
+ if (data->shallow_depth || data->flags & BLK_MQ_REQ_RESERVED ||
+ data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ return 0;
+ ret = __sbitmap_queue_get_batch(bt, nr_tags, offset);
+ *offset += tags->nr_reserved_tags;
+ return ret;
+}
+
unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
{
struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
@@ -172,7 +203,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data)
* other allocations on previous queue won't be starved.
*/
if (bt != bt_prev)
- sbitmap_queue_wake_up(bt_prev);
+ sbitmap_queue_wake_up(bt_prev, 1);
ws = bt_wait_ptr(bt, data->hctx);
} while (1);
@@ -191,33 +222,6 @@ found_tag:
return tag + tag_offset;
}
-bool __blk_mq_get_driver_tag(struct request *rq)
-{
- struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
- unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
- bool shared = blk_mq_tag_busy(rq->mq_hctx);
- int tag;
-
- if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
- bt = &rq->mq_hctx->tags->breserved_tags;
- tag_offset = 0;
- }
-
- if (!hctx_may_queue(rq->mq_hctx, bt))
- return false;
- tag = __sbitmap_queue_get(bt);
- if (tag == BLK_MQ_NO_TAG)
- return false;
-
- rq->tag = tag + tag_offset;
- if (shared) {
- rq->rq_flags |= RQF_MQ_INFLIGHT;
- atomic_inc(&rq->mq_hctx->nr_active);
- }
- rq->mq_hctx->tags->rqs[rq->tag] = rq;
- return true;
-}
-
void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
unsigned int tag)
{
@@ -227,42 +231,73 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
BUG_ON(real_tag >= tags->nr_tags);
sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu);
} else {
- BUG_ON(tag >= tags->nr_reserved_tags);
sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu);
}
}
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags)
+{
+ sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags,
+ tag_array, nr_tags);
+}
+
struct bt_iter_data {
struct blk_mq_hw_ctx *hctx;
- busy_iter_fn *fn;
+ struct request_queue *q;
+ busy_tag_iter_fn *fn;
void *data;
bool reserved;
};
+static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags,
+ unsigned int bitnr)
+{
+ struct request *rq;
+ unsigned long flags;
+
+ spin_lock_irqsave(&tags->lock, flags);
+ rq = tags->rqs[bitnr];
+ if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq))
+ rq = NULL;
+ spin_unlock_irqrestore(&tags->lock, flags);
+ return rq;
+}
+
static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_iter_data *iter_data = data;
struct blk_mq_hw_ctx *hctx = iter_data->hctx;
- struct blk_mq_tags *tags = hctx->tags;
- bool reserved = iter_data->reserved;
+ struct request_queue *q = iter_data->q;
+ struct blk_mq_tag_set *set = q->tag_set;
+ struct blk_mq_tags *tags;
struct request *rq;
+ bool ret = true;
- if (!reserved)
- bitnr += tags->nr_reserved_tags;
- rq = tags->rqs[bitnr];
+ if (blk_mq_is_shared_tags(set->flags))
+ tags = set->shared_tags;
+ else
+ tags = hctx->tags;
+ if (!iter_data->reserved)
+ bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (rq && rq->q == hctx->queue)
- return iter_data->fn(hctx, rq, iter_data->data, reserved);
- return true;
+ rq = blk_mq_find_and_get_req(tags, bitnr);
+ if (!rq)
+ return true;
+
+ if (rq->q == q && (!hctx || rq->mq_hctx == hctx))
+ ret = iter_data->fn(rq, iter_data->data);
+ blk_mq_put_rq_ref(rq);
+ return ret;
}
/**
* bt_for_each - iterate over the requests associated with a hardware queue
* @hctx: Hardware queue to examine.
+ * @q: Request queue to examine.
* @bt: sbitmap to examine. This is either the breserved_tags member
* or the bitmap_tags member of struct blk_mq_tags.
* @fn: Pointer to the function that will be called for each request
@@ -274,14 +309,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
* @reserved: Indicates whether @bt is the breserved_tags member or the
* bitmap_tags member of struct blk_mq_tags.
*/
-static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt,
- busy_iter_fn *fn, void *data, bool reserved)
+static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q,
+ struct sbitmap_queue *bt, busy_tag_iter_fn *fn,
+ void *data, bool reserved)
{
struct bt_iter_data iter_data = {
.hctx = hctx,
.fn = fn,
.data = data,
.reserved = reserved,
+ .q = q,
};
sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data);
@@ -302,26 +339,30 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data)
{
struct bt_tags_iter_data *iter_data = data;
struct blk_mq_tags *tags = iter_data->tags;
- bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED;
struct request *rq;
+ bool ret = true;
+ bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS);
- if (!reserved)
+ if (!(iter_data->flags & BT_TAG_ITER_RESERVED))
bitnr += tags->nr_reserved_tags;
/*
* We can hit rq == NULL here, because the tagging functions
* test and set the bit before assigning ->rqs[].
*/
- if (iter_data->flags & BT_TAG_ITER_STATIC_RQS)
+ if (iter_static_rqs)
rq = tags->static_rqs[bitnr];
else
- rq = tags->rqs[bitnr];
+ rq = blk_mq_find_and_get_req(tags, bitnr);
if (!rq)
return true;
- if ((iter_data->flags & BT_TAG_ITER_STARTED) &&
- !blk_mq_request_started(rq))
- return true;
- return iter_data->fn(rq, iter_data->data, reserved);
+
+ if (!(iter_data->flags & BT_TAG_ITER_STARTED) ||
+ blk_mq_request_started(rq))
+ ret = iter_data->fn(rq, iter_data->data);
+ if (!iter_static_rqs)
+ blk_mq_put_rq_ref(rq);
+ return ret;
}
/**
@@ -388,13 +429,19 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
* indicates whether or not @rq is a reserved request. Return
* true to continue iterating tags, false to stop.
* @priv: Will be passed as second argument to @fn.
+ *
+ * We grab one request reference before calling @fn and release it after
+ * @fn returns.
*/
void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
busy_tag_iter_fn *fn, void *priv)
{
- int i;
+ unsigned int flags = tagset->flags;
+ int i, nr_tags;
+
+ nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues;
- for (i = 0; i < tagset->nr_hw_queues; i++) {
+ for (i = 0; i < nr_tags; i++) {
if (tagset->tags && tagset->tags[i])
__blk_mq_all_tag_iter(tagset->tags[i], fn, priv,
BT_TAG_ITER_STARTED);
@@ -402,8 +449,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset,
}
EXPORT_SYMBOL(blk_mq_tagset_busy_iter);
-static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
- void *data, bool reserved)
+static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data)
{
unsigned *count = data;
@@ -413,8 +459,8 @@ static bool blk_mq_tagset_count_completed_rqs(struct request *rq,
}
/**
- * blk_mq_tagset_wait_completed_request - wait until all completed req's
- * complete funtion is run
+ * blk_mq_tagset_wait_completed_request - Wait until all scheduled request
+ * completions have finished.
* @tagset: Tag set to drain completed request
*
* Note: This function has to be run after all IO queues are shutdown
@@ -447,35 +493,45 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request);
* called for all requests on all queues that share that tag set and not only
* for requests associated with @q.
*/
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
void *priv)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
/*
- * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx
+ * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table
* while the queue is frozen. So we can use q_usage_counter to avoid
- * racing with it. __blk_mq_update_nr_hw_queues() uses
- * synchronize_rcu() to ensure this function left the critical section
- * below.
+ * racing with it.
*/
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- queue_for_each_hw_ctx(q, hctx, i) {
- struct blk_mq_tags *tags = hctx->tags;
-
- /*
- * If no software queues are currently mapped to this
- * hardware queue, there's nothing to check
- */
- if (!blk_mq_hw_queue_mapped(hctx))
- continue;
+ if (blk_mq_is_shared_tags(q->tag_set->flags)) {
+ struct blk_mq_tags *tags = q->tag_set->shared_tags;
+ struct sbitmap_queue *bresv = &tags->breserved_tags;
+ struct sbitmap_queue *btags = &tags->bitmap_tags;
if (tags->nr_reserved_tags)
- bt_for_each(hctx, &tags->breserved_tags, fn, priv, true);
- bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false);
+ bt_for_each(NULL, q, bresv, fn, priv, true);
+ bt_for_each(NULL, q, btags, fn, priv, false);
+ } else {
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ queue_for_each_hw_ctx(q, hctx, i) {
+ struct blk_mq_tags *tags = hctx->tags;
+ struct sbitmap_queue *bresv = &tags->breserved_tags;
+ struct sbitmap_queue *btags = &tags->bitmap_tags;
+
+ /*
+ * If no software queues are currently mapped to this
+ * hardware queue, there's nothing to check
+ */
+ if (!blk_mq_hw_queue_mapped(hctx))
+ continue;
+
+ if (tags->nr_reserved_tags)
+ bt_for_each(hctx, q, bresv, fn, priv, true);
+ bt_for_each(hctx, q, btags, fn, priv, false);
+ }
}
blk_queue_exit(q);
}
@@ -487,24 +543,24 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth,
node);
}
-static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags,
- int node, int alloc_policy)
+int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
+ struct sbitmap_queue *breserved_tags,
+ unsigned int queue_depth, unsigned int reserved,
+ int node, int alloc_policy)
{
- unsigned int depth = tags->nr_tags - tags->nr_reserved_tags;
+ unsigned int depth = queue_depth - reserved;
bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR;
- if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node))
- goto free_tags;
- if (bt_alloc(&tags->breserved_tags, tags->nr_reserved_tags, round_robin,
- node))
+ if (bt_alloc(bitmap_tags, depth, round_robin, node))
+ return -ENOMEM;
+ if (bt_alloc(breserved_tags, reserved, round_robin, node))
goto free_bitmap_tags;
- return tags;
+ return 0;
+
free_bitmap_tags:
- sbitmap_queue_free(&tags->bitmap_tags);
-free_tags:
- kfree(tags);
- return NULL;
+ sbitmap_queue_free(bitmap_tags);
+ return -ENOMEM;
}
struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
@@ -524,8 +580,15 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags,
tags->nr_tags = total_tags;
tags->nr_reserved_tags = reserved_tags;
+ spin_lock_init(&tags->lock);
- return blk_mq_init_bitmap_tags(tags, node, alloc_policy);
+ if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags,
+ total_tags, reserved_tags, node,
+ alloc_policy) < 0) {
+ kfree(tags);
+ return NULL;
+ }
+ return tags;
}
void blk_mq_free_tags(struct blk_mq_tags *tags)
@@ -551,7 +614,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
if (tdepth > tags->nr_tags) {
struct blk_mq_tag_set *set = hctx->queue->tag_set;
struct blk_mq_tags *new;
- bool ret;
if (!can_grow)
return -EINVAL;
@@ -560,21 +622,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
* We need some sort of upper limit, set it high enough that
* no valid use cases should require more.
*/
- if (tdepth > 16 * BLKDEV_MAX_RQ)
+ if (tdepth > MAX_SCHED_RQ)
return -EINVAL;
- new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth,
- tags->nr_reserved_tags);
+ /*
+ * Only the sbitmap needs resizing since we allocated the max
+ * initially.
+ */
+ if (blk_mq_is_shared_tags(set->flags))
+ return 0;
+
+ new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth);
if (!new)
return -ENOMEM;
- ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth);
- if (ret) {
- blk_mq_free_rq_map(new);
- return -ENOMEM;
- }
- blk_mq_free_rqs(set, *tagsptr, hctx->queue_num);
- blk_mq_free_rq_map(*tagsptr);
+ blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num);
*tagsptr = new;
} else {
/*
@@ -588,6 +650,19 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
return 0;
}
+void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size)
+{
+ struct blk_mq_tags *tags = set->shared_tags;
+
+ sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags);
+}
+
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q)
+{
+ sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags,
+ q->nr_requests - q->tag_set->reserved_tags);
+}
+
/**
* blk_mq_unique_tag() - return a tag that is unique queue-wide
* @rq: request for which to compute a unique tag
diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h
deleted file mode 100644
index 2e4ef51cdb32..000000000000
--- a/block/blk-mq-tag.h
+++ /dev/null
@@ -1,99 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-#ifndef INT_BLK_MQ_TAG_H
-#define INT_BLK_MQ_TAG_H
-
-#include "blk-mq.h"
-
-/*
- * Tag address space map.
- */
-struct blk_mq_tags {
- unsigned int nr_tags;
- unsigned int nr_reserved_tags;
-
- atomic_t active_queues;
-
- struct sbitmap_queue bitmap_tags;
- struct sbitmap_queue breserved_tags;
-
- struct request **rqs;
- struct request **static_rqs;
- struct list_head page_list;
-};
-
-
-extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy);
-extern void blk_mq_free_tags(struct blk_mq_tags *tags);
-
-extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
-extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
- unsigned int tag);
-extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
- struct blk_mq_tags **tags,
- unsigned int depth, bool can_grow);
-extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
-void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn,
- void *priv);
-void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
- void *priv);
-
-static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
- struct blk_mq_hw_ctx *hctx)
-{
- if (!hctx)
- return &bt->ws[0];
- return sbq_wait_ptr(bt, &hctx->wait_index);
-}
-
-enum {
- BLK_MQ_NO_TAG = -1U,
- BLK_MQ_TAG_MIN = 1,
- BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
-};
-
-bool __blk_mq_get_driver_tag(struct request *rq);
-static inline bool blk_mq_get_driver_tag(struct request *rq)
-{
- if (rq->tag != BLK_MQ_NO_TAG)
- return true;
- return __blk_mq_get_driver_tag(rq);
-}
-
-extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
-extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
-
-static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
-{
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
- return false;
-
- return __blk_mq_tag_busy(hctx);
-}
-
-static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
-{
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED))
- return;
-
- __blk_mq_tag_idle(hctx);
-}
-
-/*
- * This helper should only be used for flush request to share tag
- * with the request cloned from, and both the two requests can't be
- * in flight at the same time. The caller has to make sure the tag
- * can't be freed.
- */
-static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx,
- unsigned int tag, struct request *rq)
-{
- hctx->tags->rqs[tag] = rq;
-}
-
-static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
- unsigned int tag)
-{
- return tag < tags->nr_reserved_tags;
-}
-
-#endif
diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c
index 7b8a42c35102..68d0945c0b08 100644
--- a/block/blk-mq-virtio.c
+++ b/block/blk-mq-virtio.c
@@ -3,7 +3,6 @@
* Copyright (c) 2016 Christoph Hellwig.
*/
#include <linux/device.h>
-#include <linux/blk-mq.h>
#include <linux/blk-mq-virtio.h>
#include <linux/virtio_config.h>
#include <linux/module.h>
@@ -21,7 +20,7 @@
* that maps a queue to the CPUs that have irq affinity for the corresponding
* vector.
*/
-int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
+void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
struct virtio_device *vdev, int first_vec)
{
const struct cpumask *mask;
@@ -39,8 +38,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap,
qmap->mq_map[cpu] = qmap->queue_offset + queue;
}
- return 0;
+ return;
+
fallback:
- return blk_mq_map_queues(qmap);
+ blk_mq_map_queues(qmap);
}
EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues);
diff --git a/block/blk-mq.c b/block/blk-mq.c
index 4e0d173beaa3..2dc01551e27c 100644
--- a/block/blk-mq.c
+++ b/block/blk-mq.c
@@ -10,14 +10,15 @@
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/kmemleak.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
+#include <linux/interrupt.h>
#include <linux/llist.h>
-#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
@@ -27,39 +28,29 @@
#include <linux/crash_dump.h>
#include <linux/prefetch.h>
#include <linux/blk-crypto.h>
+#include <linux/part_stat.h>
#include <trace/events/block.h>
-#include <linux/blk-mq.h>
#include <linux/t10-pi.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
-#include "blk-mq-tag.h"
#include "blk-pm.h"
#include "blk-stat.h"
#include "blk-mq-sched.h"
#include "blk-rq-qos.h"
-static void blk_mq_poll_stats_start(struct request_queue *q);
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb);
+static DEFINE_PER_CPU(struct llist_head, blk_cpu_done);
+static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd);
-static int blk_mq_poll_stats_bkt(const struct request *rq)
-{
- int ddir, sectors, bucket;
-
- ddir = rq_data_dir(rq);
- sectors = blk_rq_stats_sectors(rq);
-
- bucket = ddir + 2 * ilog2(sectors);
-
- if (bucket < 0)
- return -1;
- else if (bucket >= BLK_MQ_POLL_STATS_BKTS)
- return ddir + BLK_MQ_POLL_STATS_BKTS - 2;
-
- return bucket;
-}
+static void blk_mq_insert_request(struct request *rq, blk_insert_t flags);
+static void blk_mq_request_bypass_insert(struct request *rq,
+ blk_insert_t flags);
+static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+ struct list_head *list);
+static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+ struct io_comp_batch *iob, unsigned int flags);
/*
* Check if any of the ctx, dispatch list or elevator
@@ -93,23 +84,24 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx,
}
struct mq_inflight {
- struct hd_struct *part;
+ struct block_device *part;
unsigned int inflight[2];
};
-static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv,
- bool reserved)
+static bool blk_mq_check_inflight(struct request *rq, void *priv)
{
struct mq_inflight *mi = priv;
- if (rq->part == mi->part)
+ if (rq->part && blk_do_io_stat(rq) &&
+ (!mi->part->bd_partno || rq->part == mi->part) &&
+ blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT)
mi->inflight[rq_data_dir(rq)]++;
return true;
}
-unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
+unsigned int blk_mq_in_flight(struct request_queue *q,
+ struct block_device *part)
{
struct mq_inflight mi = { .part = part };
@@ -118,8 +110,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part)
return mi.inflight[0] + mi.inflight[1];
}
-void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2])
+void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
+ unsigned int inflight[2])
{
struct mq_inflight mi = { .part = part };
@@ -184,9 +176,11 @@ void blk_mq_freeze_queue(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
-void blk_mq_unfreeze_queue(struct request_queue *q)
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
{
mutex_lock(&q->mq_freeze_lock);
+ if (force_atomic)
+ q->q_usage_counter.data->force_atomic = true;
q->mq_freeze_depth--;
WARN_ON_ONCE(q->mq_freeze_depth < 0);
if (!q->mq_freeze_depth) {
@@ -195,6 +189,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q)
}
mutex_unlock(&q->mq_freeze_lock);
}
+
+void blk_mq_unfreeze_queue(struct request_queue *q)
+{
+ __blk_mq_unfreeze_queue(q, false);
+}
EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
/*
@@ -203,11 +202,34 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
*/
void blk_mq_quiesce_queue_nowait(struct request_queue *q)
{
- blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->queue_lock, flags);
+ if (!q->quiesce_depth++)
+ blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
+ spin_unlock_irqrestore(&q->queue_lock, flags);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
/**
+ * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done
+ * @set: tag_set to wait on
+ *
+ * Note: it is the driver's responsibility to make sure that quiesce has
+ * been started on one or more of the request_queues of the tag_set. This
+ * function only waits for the quiesce on those request_queues that had
+ * the quiesce flag set using blk_mq_quiesce_queue_nowait.
+ */
+void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set)
+{
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ synchronize_srcu(set->srcu);
+ else
+ synchronize_rcu();
+}
+EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done);
+
+/**
* blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
* @q: request queue.
*
@@ -218,20 +240,10 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
*/
void blk_mq_quiesce_queue(struct request_queue *q)
{
- struct blk_mq_hw_ctx *hctx;
- unsigned int i;
- bool rcu = false;
-
blk_mq_quiesce_queue_nowait(q);
-
- queue_for_each_hw_ctx(q, hctx, i) {
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- synchronize_srcu(hctx->srcu);
- else
- rcu = true;
- }
- if (rcu)
- synchronize_rcu();
+ /* nothing to wait for non-mq queues */
+ if (queue_is_mq(q))
+ blk_mq_wait_quiesce_done(q->tag_set);
}
EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
@@ -244,114 +256,189 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue);
*/
void blk_mq_unquiesce_queue(struct request_queue *q)
{
- blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ unsigned long flags;
+ bool run_queue = false;
+
+ spin_lock_irqsave(&q->queue_lock, flags);
+ if (WARN_ON_ONCE(q->quiesce_depth <= 0)) {
+ ;
+ } else if (!--q->quiesce_depth) {
+ blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q);
+ run_queue = true;
+ }
+ spin_unlock_irqrestore(&q->queue_lock, flags);
/* dispatch requests which are inserted during quiescing */
- blk_mq_run_hw_queues(q, true);
+ if (run_queue)
+ blk_mq_run_hw_queues(q, true);
}
EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue);
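
The quiesce_depth counter added above makes quiesce/unquiesce nest: only the 0 to 1 transition sets QUEUE_FLAG_QUIESCED, and only the final unquiesce clears it and re-runs the queues. A standalone sketch of that counting, with a pthread mutex standing in for q->queue_lock and invented sketch_* names:

/* Standalone sketch (not kernel code): nested quiesce with a depth counter. */
#include <pthread.h>
#include <stdbool.h>

struct sketch_queue {
	pthread_mutex_t lock;
	int quiesce_depth;
	bool quiesced;
};

static void sketch_quiesce_nowait(struct sketch_queue *q)
{
	pthread_mutex_lock(&q->lock);
	if (!q->quiesce_depth++)
		q->quiesced = true;		/* only on 0 -> 1 */
	pthread_mutex_unlock(&q->lock);
}

static void sketch_unquiesce(struct sketch_queue *q)
{
	bool run_queue = false;

	pthread_mutex_lock(&q->lock);
	if (q->quiesce_depth > 0 && !--q->quiesce_depth) {
		q->quiesced = false;		/* only on 1 -> 0 */
		run_queue = true;
	}
	pthread_mutex_unlock(&q->lock);

	if (run_queue) {
		/* this is where requests held back during quiesce would be dispatched */
	}
}

int main(void)
{
	struct sketch_queue q = { PTHREAD_MUTEX_INITIALIZER, 0, false };

	sketch_quiesce_nowait(&q);	/* caller A */
	sketch_quiesce_nowait(&q);	/* caller B nests */
	sketch_unquiesce(&q);		/* still quiesced for A */
	sketch_unquiesce(&q);		/* now dispatch resumes */
	return 0;
}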
+void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_quiesce_queue_nowait(q);
+ }
+ blk_mq_wait_quiesce_done(set);
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset);
+
+void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set)
+{
+ struct request_queue *q;
+
+ mutex_lock(&set->tag_list_lock);
+ list_for_each_entry(q, &set->tag_list, tag_set_list) {
+ if (!blk_queue_skip_tagset_quiesce(q))
+ blk_mq_unquiesce_queue(q);
+ }
+ mutex_unlock(&set->tag_list_lock);
+}
+EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset);
+
void blk_mq_wake_waiters(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_wakeup_all(hctx->tags, true);
}
-/*
- * Only need start/end time stamping if we have iostat or
- * blk stats enabled, or using an IO scheduler.
- */
-static inline bool blk_mq_need_time_stamp(struct request *rq)
+void blk_rq_init(struct request_queue *q, struct request *rq)
+{
+ memset(rq, 0, sizeof(*rq));
+
+ INIT_LIST_HEAD(&rq->queuelist);
+ rq->q = q;
+ rq->__sector = (sector_t) -1;
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+ rq->tag = BLK_MQ_NO_TAG;
+ rq->internal_tag = BLK_MQ_NO_TAG;
+ rq->start_time_ns = ktime_get_ns();
+ rq->part = NULL;
+ blk_crypto_rq_set_defaults(rq);
+}
+EXPORT_SYMBOL(blk_rq_init);
+
+/* Set start and alloc time when the allocated request is actually used */
+static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns)
{
- return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator;
+ if (blk_mq_need_time_stamp(rq))
+ rq->start_time_ns = ktime_get_ns();
+ else
+ rq->start_time_ns = 0;
+
+#ifdef CONFIG_BLK_RQ_ALLOC_TIME
+ if (blk_queue_rq_alloc_time(rq->q))
+ rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns;
+ else
+ rq->alloc_time_ns = 0;
+#endif
}
static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data,
- unsigned int tag, u64 alloc_time_ns)
+ struct blk_mq_tags *tags, unsigned int tag)
{
- struct blk_mq_tags *tags = blk_mq_tags_from_data(data);
+ struct blk_mq_ctx *ctx = data->ctx;
+ struct blk_mq_hw_ctx *hctx = data->hctx;
+ struct request_queue *q = data->q;
struct request *rq = tags->static_rqs[tag];
- req_flags_t rq_flags = 0;
- if (data->flags & BLK_MQ_REQ_INTERNAL) {
+ rq->q = q;
+ rq->mq_ctx = ctx;
+ rq->mq_hctx = hctx;
+ rq->cmd_flags = data->cmd_flags;
+
+ if (data->flags & BLK_MQ_REQ_PM)
+ data->rq_flags |= RQF_PM;
+ if (blk_queue_io_stat(q))
+ data->rq_flags |= RQF_IO_STAT;
+ rq->rq_flags = data->rq_flags;
+
+ if (data->rq_flags & RQF_SCHED_TAGS) {
rq->tag = BLK_MQ_NO_TAG;
rq->internal_tag = tag;
} else {
- if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) {
- rq_flags = RQF_MQ_INFLIGHT;
- atomic_inc(&data->hctx->nr_active);
- }
rq->tag = tag;
rq->internal_tag = BLK_MQ_NO_TAG;
- data->hctx->tags->rqs[rq->tag] = rq;
}
+ rq->timeout = 0;
- /* csd/requeue_work/fifo_time is initialized before use */
- rq->q = data->q;
- rq->mq_ctx = data->ctx;
- rq->mq_hctx = data->hctx;
- rq->rq_flags = rq_flags;
- rq->cmd_flags = data->cmd_flags;
- if (data->flags & BLK_MQ_REQ_PREEMPT)
- rq->rq_flags |= RQF_PREEMPT;
- if (blk_queue_io_stat(data->q))
- rq->rq_flags |= RQF_IO_STAT;
- INIT_LIST_HEAD(&rq->queuelist);
- INIT_HLIST_NODE(&rq->hash);
- RB_CLEAR_NODE(&rq->rb_node);
- rq->rq_disk = NULL;
rq->part = NULL;
-#ifdef CONFIG_BLK_RQ_ALLOC_TIME
- rq->alloc_time_ns = alloc_time_ns;
-#endif
- if (blk_mq_need_time_stamp(rq))
- rq->start_time_ns = ktime_get_ns();
- else
- rq->start_time_ns = 0;
rq->io_start_time_ns = 0;
rq->stats_sectors = 0;
rq->nr_phys_segments = 0;
#if defined(CONFIG_BLK_DEV_INTEGRITY)
rq->nr_integrity_segments = 0;
#endif
- blk_crypto_rq_set_defaults(rq);
- /* tag was already set */
- WRITE_ONCE(rq->deadline, 0);
-
- rq->timeout = 0;
-
rq->end_io = NULL;
rq->end_io_data = NULL;
- data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++;
- refcount_set(&rq->ref, 1);
+ blk_crypto_rq_set_defaults(rq);
+ INIT_LIST_HEAD(&rq->queuelist);
+ /* tag was already set */
+ WRITE_ONCE(rq->deadline, 0);
+ req_ref_set(rq, 1);
- if (!op_is_flush(data->cmd_flags)) {
+ if (rq->rq_flags & RQF_USE_SCHED) {
struct elevator_queue *e = data->q->elevator;
- rq->elv.icq = NULL;
- if (e && e->type->ops.prepare_request) {
- if (e->type->icq_cache)
- blk_mq_sched_assign_ioc(rq);
+ INIT_HLIST_NODE(&rq->hash);
+ RB_CLEAR_NODE(&rq->rb_node);
+ if (e->type->ops.prepare_request)
e->type->ops.prepare_request(rq);
- rq->rq_flags |= RQF_ELVPRIV;
- }
}
- data->hctx->queued++;
return rq;
}
-static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
+static inline struct request *
+__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data)
+{
+ unsigned int tag, tag_offset;
+ struct blk_mq_tags *tags;
+ struct request *rq;
+ unsigned long tag_mask;
+ int i, nr = 0;
+
+ tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset);
+ if (unlikely(!tag_mask))
+ return NULL;
+
+ tags = blk_mq_tags_from_data(data);
+ for (i = 0; tag_mask; i++) {
+ if (!(tag_mask & (1UL << i)))
+ continue;
+ tag = tag_offset + i;
+ prefetch(tags->static_rqs[tag]);
+ tag_mask &= ~(1UL << i);
+ rq = blk_mq_rq_ctx_init(data, tags, tag);
+ rq_list_add(data->cached_rq, rq);
+ nr++;
+ }
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_add_active_requests(data->hctx, nr);
+ /* caller already holds a reference, add for remainder */
+ percpu_ref_get_many(&data->q->q_usage_counter, nr - 1);
+ data->nr_tags -= nr;
+
+ return rq_list_pop(data->cached_rq);
+}
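
__blk_mq_alloc_requests_batch() above walks the bitmask returned by the batched tag grab and initialises one request per set bit. The bit-walking loop on its own, as a standalone sketch with a plain 64-bit word in place of the sbitmap batch API (sketch_* names are invented):

/* Standalone sketch (not kernel code): turn a batch mask into tag numbers. */
#include <stdio.h>

static void sketch_consume_batch(unsigned long tag_mask, unsigned int tag_offset)
{
	for (unsigned int i = 0; tag_mask; i++) {
		if (!(tag_mask & (1UL << i)))
			continue;
		tag_mask &= ~(1UL << i);
		printf("init request for tag %u\n", tag_offset + i);
	}
}

int main(void)
{
	/* pretend the allocator handed us bits 0, 1 and 5 starting at tag 32 */
	sketch_consume_batch(0x23UL, 32);	/* tags 32, 33 and 37 */
	return 0;
}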
+
+static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data)
{
struct request_queue *q = data->q;
- struct elevator_queue *e = q->elevator;
u64 alloc_time_ns = 0;
+ struct request *rq;
unsigned int tag;
/* alloc_time includes depth and tag waits */
@@ -361,26 +448,50 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data)
if (data->cmd_flags & REQ_NOWAIT)
data->flags |= BLK_MQ_REQ_NOWAIT;
- if (e) {
- data->flags |= BLK_MQ_REQ_INTERNAL;
+ if (q->elevator) {
+ /*
+ * All requests use scheduler tags when an I/O scheduler is
+ * enabled for the queue.
+ */
+ data->rq_flags |= RQF_SCHED_TAGS;
/*
- * Flush requests are special and go directly to the
- * dispatch list. Don't include reserved tags in the
- * limiting, as it isn't useful.
+ * Flush/passthrough requests are special and go directly to the
+ * dispatch list.
*/
- if (!op_is_flush(data->cmd_flags) &&
- e->type->ops.limit_depth &&
- !(data->flags & BLK_MQ_REQ_RESERVED))
- e->type->ops.limit_depth(data->cmd_flags, data);
+ if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH &&
+ !blk_op_is_passthrough(data->cmd_flags)) {
+ struct elevator_mq_ops *ops = &q->elevator->type->ops;
+
+ WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED);
+
+ data->rq_flags |= RQF_USE_SCHED;
+ if (ops->limit_depth)
+ ops->limit_depth(data->cmd_flags, data);
+ }
}
retry:
data->ctx = blk_mq_get_ctx(q);
data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx);
- if (!(data->flags & BLK_MQ_REQ_INTERNAL))
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
blk_mq_tag_busy(data->hctx);
+ if (data->flags & BLK_MQ_REQ_RESERVED)
+ data->rq_flags |= RQF_RESV;
+
+ /*
+ * Try batched alloc if we want more than 1 tag.
+ */
+ if (data->nr_tags > 1) {
+ rq = __blk_mq_alloc_requests_batch(data);
+ if (rq) {
+ blk_mq_rq_time_init(rq, alloc_time_ns);
+ return rq;
+ }
+ data->nr_tags = 1;
+ }
+
/*
* Waiting allocations only fail because of an inactive hctx. In that
* case just retry the hctx assignment and tag allocation as CPU hotplug
@@ -390,36 +501,106 @@ retry:
if (tag == BLK_MQ_NO_TAG) {
if (data->flags & BLK_MQ_REQ_NOWAIT)
return NULL;
-
/*
- * Give up the CPU and sleep for a random short time to ensure
- * that thread using a realtime scheduling class are migrated
- * off the the CPU, and thus off the hctx that is going away.
+ * Give up the CPU and sleep for a random short time to
+ * ensure that threads using a realtime scheduling class
+ * are migrated off the CPU, and thus off the hctx that
+ * is going away.
*/
msleep(3);
goto retry;
}
- return blk_mq_rq_ctx_init(data, tag, alloc_time_ns);
+
+ if (!(data->rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data->hctx);
+ rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag);
+ blk_mq_rq_time_init(rq, alloc_time_ns);
+ return rq;
}
-struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op,
- blk_mq_req_flags_t flags)
+static struct request *blk_mq_rq_cache_fill(struct request_queue *q,
+ struct blk_plug *plug,
+ blk_opf_t opf,
+ blk_mq_req_flags_t flags)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
+ .nr_tags = plug->nr_ios,
+ .cached_rq = &plug->cached_rq,
};
struct request *rq;
- int ret;
- ret = blk_queue_enter(q, flags);
- if (ret)
- return ERR_PTR(ret);
+ if (blk_queue_enter(q, flags))
+ return NULL;
- rq = __blk_mq_alloc_request(&data);
- if (!rq)
- goto out_queue_exit;
+ plug->nr_ios = 1;
+
+ rq = __blk_mq_alloc_requests(&data);
+ if (unlikely(!rq))
+ blk_queue_exit(q);
+ return rq;
+}
+
+static struct request *blk_mq_alloc_cached_request(struct request_queue *q,
+ blk_opf_t opf,
+ blk_mq_req_flags_t flags)
+{
+ struct blk_plug *plug = current->plug;
+ struct request *rq;
+
+ if (!plug)
+ return NULL;
+
+ if (rq_list_empty(plug->cached_rq)) {
+ if (plug->nr_ios == 1)
+ return NULL;
+ rq = blk_mq_rq_cache_fill(q, plug, opf, flags);
+ if (!rq)
+ return NULL;
+ } else {
+ rq = rq_list_peek(&plug->cached_rq);
+ if (!rq || rq->q != q)
+ return NULL;
+
+ if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type)
+ return NULL;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(opf))
+ return NULL;
+
+ plug->cached_rq = rq_list_next(rq);
+ blk_mq_rq_time_init(rq, 0);
+ }
+
+ rq->cmd_flags = opf;
+ INIT_LIST_HEAD(&rq->queuelist);
+ return rq;
+}
+
+struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf,
+ blk_mq_req_flags_t flags)
+{
+ struct request *rq;
+
+ rq = blk_mq_alloc_cached_request(q, opf, flags);
+ if (!rq) {
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .flags = flags,
+ .cmd_flags = opf,
+ .nr_tags = 1,
+ };
+ int ret;
+
+ ret = blk_queue_enter(q, flags);
+ if (ret)
+ return ERR_PTR(ret);
+
+ rq = __blk_mq_alloc_requests(&data);
+ if (!rq)
+ goto out_queue_exit;
+ }
rq->__data_len = 0;
rq->__sector = (sector_t) -1;
rq->bio = rq->biotail = NULL;
@@ -431,14 +612,16 @@ out_queue_exit:
EXPORT_SYMBOL(blk_mq_alloc_request);
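
A minimal caller-side sketch (not part of the patch) of the blk_opf_t-based blk_mq_alloc_request() above, assuming a driver allocating a passthrough request; note that failures come back as ERR_PTR() values, never NULL:

#include <linux/blk-mq.h>

/* Sketch only: allocate a driver passthrough request without sleeping. */
static struct request *example_alloc_pt_rq(struct request_queue *q)
{
	struct request *rq;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, BLK_MQ_REQ_NOWAIT);
	if (IS_ERR(rq))
		return NULL;	/* e.g. -EAGAIN when no tag is free */
	return rq;
}
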
struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
- unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx)
+ blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx)
{
struct blk_mq_alloc_data data = {
.q = q,
.flags = flags,
- .cmd_flags = op,
+ .cmd_flags = opf,
+ .nr_tags = 1,
};
u64 alloc_time_ns = 0;
+ struct request *rq;
unsigned int cpu;
unsigned int tag;
int ret;
@@ -453,7 +636,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* allocator for this for the rare use case of a command tied to
* a specific queue.
*/
- if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED))))
+ if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) ||
+ WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED)))
return ERR_PTR(-EINVAL);
if (hctx_idx >= q->nr_hw_queues)
@@ -468,22 +652,34 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q,
* If not tell the caller that it should skip this queue.
*/
ret = -EXDEV;
- data.hctx = q->queue_hw_ctx[hctx_idx];
+ data.hctx = xa_load(&q->hctx_table, hctx_idx);
if (!blk_mq_hw_queue_mapped(data.hctx))
goto out_queue_exit;
cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask);
+ if (cpu >= nr_cpu_ids)
+ goto out_queue_exit;
data.ctx = __blk_mq_get_ctx(q, cpu);
if (q->elevator)
- data.flags |= BLK_MQ_REQ_INTERNAL;
+ data.rq_flags |= RQF_SCHED_TAGS;
else
blk_mq_tag_busy(data.hctx);
+ if (flags & BLK_MQ_REQ_RESERVED)
+ data.rq_flags |= RQF_RESV;
+
ret = -EWOULDBLOCK;
tag = blk_mq_get_tag(&data);
if (tag == BLK_MQ_NO_TAG)
goto out_queue_exit;
- return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns);
+ if (!(data.rq_flags & RQF_SCHED_TAGS))
+ blk_mq_inc_active_requests(data.hctx);
+ rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag);
+ blk_mq_rq_time_init(rq, alloc_time_ns);
+ rq->__data_len = 0;
+ rq->__sector = (sector_t) -1;
+ rq->bio = rq->biotail = NULL;
+ return rq;
out_queue_exit:
blk_queue_exit(q);
@@ -491,6 +687,21 @@ out_queue_exit:
}
EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx);
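
Given the stricter WARN_ON_ONCE() checks above, a hedged sketch of a caller tied to one hardware queue: both BLK_MQ_REQ_NOWAIT and BLK_MQ_REQ_RESERVED are now required, and the helper name below is hypothetical.

#include <linux/blk-mq.h>

static int example_alloc_on_hctx(struct request_queue *q, unsigned int hctx_idx)
{
	struct request *rq;

	rq = blk_mq_alloc_request_hctx(q, REQ_OP_DRV_OUT,
				       BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED,
				       hctx_idx);
	if (IS_ERR(rq))
		return PTR_ERR(rq);	/* -EINVAL, -EXDEV or -EWOULDBLOCK */

	/* ... set up and issue the command here ... */
	blk_mq_free_request(rq);
	return 0;
}
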
+static void blk_mq_finish_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+
+ if (rq->rq_flags & RQF_USE_SCHED) {
+ q->elevator->type->ops.finish_request(rq);
+ /*
+ * For postflush request that may need to be
+ * completed twice, we should clear this flag
+ * to avoid double finish_request() on the rq.
+ */
+ rq->rq_flags &= ~RQF_USE_SCHED;
+ }
+}
+
static void __blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
@@ -501,8 +712,11 @@ static void __blk_mq_free_request(struct request *rq)
blk_crypto_free_request(rq);
blk_pm_mark_last_busy(rq);
rq->mq_hctx = NULL;
- if (rq->tag != BLK_MQ_NO_TAG)
+
+ if (rq->tag != BLK_MQ_NO_TAG) {
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, ctx, rq->tag);
+ }
if (sched_tag != BLK_MQ_NO_TAG)
blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag);
blk_mq_sched_restart(hctx);
@@ -512,54 +726,329 @@ static void __blk_mq_free_request(struct request *rq)
void blk_mq_free_request(struct request *rq)
{
struct request_queue *q = rq->q;
- struct elevator_queue *e = q->elevator;
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
-
- if (rq->rq_flags & RQF_ELVPRIV) {
- if (e && e->type->ops.finish_request)
- e->type->ops.finish_request(rq);
- if (rq->elv.icq) {
- put_io_context(rq->elv.icq->ioc);
- rq->elv.icq = NULL;
- }
- }
- ctx->rq_completed[rq_is_sync(rq)]++;
- if (rq->rq_flags & RQF_MQ_INFLIGHT)
- atomic_dec(&hctx->nr_active);
+ blk_mq_finish_request(rq);
if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq)))
- laptop_io_completion(q->backing_dev_info);
+ laptop_io_completion(q->disk->bdi);
rq_qos_done(q, rq);
WRITE_ONCE(rq->state, MQ_RQ_IDLE);
- if (refcount_dec_and_test(&rq->ref))
+ if (req_ref_put_and_test(rq))
__blk_mq_free_request(rq);
}
EXPORT_SYMBOL_GPL(blk_mq_free_request);
-inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+void blk_mq_free_plug_rqs(struct blk_plug *plug)
{
- u64 now = 0;
+ struct request *rq;
- if (blk_mq_need_time_stamp(rq))
- now = ktime_get_ns();
+ while ((rq = rq_list_pop(&plug->cached_rq)) != NULL)
+ blk_mq_free_request(rq);
+}
- if (rq->rq_flags & RQF_STATS) {
- blk_mq_poll_stats_start(rq->q);
- blk_stat_add(rq, now);
+void blk_dump_rq_flags(struct request *rq, char *msg)
+{
+ printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg,
+ rq->q->disk ? rq->q->disk->disk_name : "?",
+ (__force unsigned long long) rq->cmd_flags);
+
+ printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n",
+ (unsigned long long)blk_rq_pos(rq),
+ blk_rq_sectors(rq), blk_rq_cur_sectors(rq));
+ printk(KERN_INFO " bio %p, biotail %p, len %u\n",
+ rq->bio, rq->biotail, blk_rq_bytes(rq));
+}
+EXPORT_SYMBOL(blk_dump_rq_flags);
+
+static void req_bio_endio(struct request *rq, struct bio *bio,
+ unsigned int nbytes, blk_status_t error)
+{
+ if (unlikely(error)) {
+ bio->bi_status = error;
+ } else if (req_op(rq) == REQ_OP_ZONE_APPEND) {
+ /*
+ * Partial zone append completions cannot be supported as the
+ * BIO fragments may end up not being written sequentially.
+ * For such case, force the completed nbytes to be equal to
+ * the BIO size so that bio_advance() sets the BIO remaining
+ * size to 0 and we end up calling bio_endio() before returning.
+ */
+ if (bio->bi_iter.bi_size != nbytes) {
+ bio->bi_status = BLK_STS_IOERR;
+ nbytes = bio->bi_iter.bi_size;
+ } else {
+ bio->bi_iter.bi_sector = rq->__sector;
+ }
+ }
+
+ bio_advance(bio, nbytes);
+
+ if (unlikely(rq->rq_flags & RQF_QUIET))
+ bio_set_flag(bio, BIO_QUIET);
+ /* don't actually finish bio if it's part of flush sequence */
+ if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ))
+ bio_endio(bio);
+}
+
+static void blk_account_io_completion(struct request *req, unsigned int bytes)
+{
+ if (req->part && blk_do_io_stat(req)) {
+ const int sgrp = op_stat_group(req_op(req));
+
+ part_stat_lock();
+ part_stat_add(req->part, sectors[sgrp], bytes >> 9);
+ part_stat_unlock();
+ }
+}
+
+static void blk_print_req_error(struct request *req, blk_status_t status)
+{
+ printk_ratelimited(KERN_ERR
+ "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x "
+ "phys_seg %u prio class %u\n",
+ blk_status_to_str(status),
+ req->q->disk ? req->q->disk->disk_name : "?",
+ blk_rq_pos(req), (__force u32)req_op(req),
+ blk_op_str(req_op(req)),
+ (__force u32)(req->cmd_flags & ~REQ_OP_MASK),
+ req->nr_phys_segments,
+ IOPRIO_PRIO_CLASS(req->ioprio));
+}
+
+/*
+ * Fully end IO on a request. Does not support partial completions, or
+ * errors.
+ */
+static void blk_complete_request(struct request *req)
+{
+ const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0;
+ int total_bytes = blk_rq_bytes(req);
+ struct bio *bio = req->bio;
+
+ trace_block_rq_complete(req, BLK_STS_OK, total_bytes);
+
+ if (!bio)
+ return;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ)
+ req->q->integrity.profile->complete_fn(req, total_bytes);
+#endif
+
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ blk_crypto_rq_put_keyslot(req);
+
+ blk_account_io_completion(req, total_bytes);
+
+ do {
+ struct bio *next = bio->bi_next;
+
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+
+ if (req_op(req) == REQ_OP_ZONE_APPEND)
+ bio->bi_iter.bi_sector = req->__sector;
+
+ if (!is_flush)
+ bio_endio(bio);
+ bio = next;
+ } while (bio);
+
+ /*
+ * Reset counters so that the request stacking driver
+ * can find how many bytes remain in the request
+ * later.
+ */
+ if (!req->end_io) {
+ req->bio = NULL;
+ req->__data_len = 0;
+ }
+}
+
+/**
+ * blk_update_request - Complete multiple bytes without completing the request
+ * @req: the request being processed
+ * @error: block status code
+ * @nr_bytes: number of bytes to complete for @req
+ *
+ * Description:
+ * Ends I/O on a number of bytes attached to @req, but doesn't complete
+ * the request structure even if @req doesn't have leftover.
+ * If @req has leftover, sets it up for the next range of segments.
+ *
+ * Passing the result of blk_rq_bytes() as @nr_bytes guarantees
+ * %false return from this function.
+ *
+ * Note:
+ * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function
+ * except in the consistency check at the end of this function.
+ *
+ * Return:
+ * %false - this request doesn't have any more data
+ * %true - this request has more data
+ **/
+bool blk_update_request(struct request *req, blk_status_t error,
+ unsigned int nr_bytes)
+{
+ int total_bytes;
+
+ trace_block_rq_complete(req, error, nr_bytes);
+
+ if (!req->bio)
+ return false;
+
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ &&
+ error == BLK_STS_OK)
+ req->q->integrity.profile->complete_fn(req, nr_bytes);
+#endif
+
+ /*
+ * Upper layers may call blk_crypto_evict_key() anytime after the last
+ * bio_endio(). Therefore, the keyslot must be released before that.
+ */
+ if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req))
+ __blk_crypto_rq_put_keyslot(req);
+
+ if (unlikely(error && !blk_rq_is_passthrough(req) &&
+ !(req->rq_flags & RQF_QUIET)) &&
+ !test_bit(GD_DEAD, &req->q->disk->state)) {
+ blk_print_req_error(req, error);
+ trace_block_rq_error(req, error, nr_bytes);
+ }
+
+ blk_account_io_completion(req, nr_bytes);
+
+ total_bytes = 0;
+ while (req->bio) {
+ struct bio *bio = req->bio;
+ unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes);
+
+ if (bio_bytes == bio->bi_iter.bi_size)
+ req->bio = bio->bi_next;
+
+ /* Completion has already been traced */
+ bio_clear_flag(bio, BIO_TRACE_COMPLETION);
+ req_bio_endio(req, bio, bio_bytes, error);
+
+ total_bytes += bio_bytes;
+ nr_bytes -= bio_bytes;
+
+ if (!nr_bytes)
+ break;
+ }
+
+ /*
+ * completely done
+ */
+ if (!req->bio) {
+ /*
+ * Reset counters so that the request stacking driver
+ * can find how many bytes remain in the request
+ * later.
+ */
+ req->__data_len = 0;
+ return false;
+ }
+
+ req->__data_len -= total_bytes;
+
+ /* update sector only for requests with clear definition of sector */
+ if (!blk_rq_is_passthrough(req))
+ req->__sector += total_bytes >> 9;
+
+ /* mixed attributes always follow the first bio */
+ if (req->rq_flags & RQF_MIXED_MERGE) {
+ req->cmd_flags &= ~REQ_FAILFAST_MASK;
+ req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK;
+ }
+
+ if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) {
+ /*
+ * If total number of sectors is less than the first segment
+ * size, something has gone terribly wrong.
+ */
+ if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) {
+ blk_dump_rq_flags(req, "request botched");
+ req->__data_len = blk_rq_cur_bytes(req);
+ }
+
+ /* recalculate the number of segments */
+ req->nr_phys_segments = blk_recalc_rq_segments(req);
+ }
+
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_update_request);
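
A sketch of the usual driver-side partial-completion pattern built on blk_update_request(), assuming "error" and "bytes" come from the hardware completion; when the function returns false the request has no more data and can be ended:

static void example_complete_bytes(struct request *rq, blk_status_t error,
				   unsigned int bytes)
{
	if (blk_update_request(rq, error, bytes))
		return;		/* more segments remain on the request */

	__blk_mq_end_request(rq, error);	/* all bytes accounted, end it */
}
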
+
+static inline void blk_account_io_done(struct request *req, u64 now)
+{
+ trace_block_io_done(req);
+
+ /*
+ * Account IO completion. flush_rq isn't accounted as a
+ * normal IO on queueing nor completion. Accounting the
+ * containing request is enough.
+ */
+ if (blk_do_io_stat(req) && req->part &&
+ !(req->rq_flags & RQF_FLUSH_SEQ)) {
+ const int sgrp = op_stat_group(req_op(req));
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, true);
+ part_stat_inc(req->part, ios[sgrp]);
+ part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns);
+ part_stat_unlock();
+ }
+}
+
+static inline void blk_account_io_start(struct request *req)
+{
+ trace_block_io_start(req);
+
+ if (blk_do_io_stat(req)) {
+ /*
+ * All non-passthrough requests are created from a bio with one
+ * exception: when a flush command that is part of a flush sequence
+ * generated by the state machine in blk-flush.c is cloned onto the
+ * lower device by dm-multipath we can get here without a bio.
+ */
+ if (req->bio)
+ req->part = req->bio->bi_bdev;
+ else
+ req->part = req->q->disk->part0;
+
+ part_stat_lock();
+ update_io_ticks(req->part, jiffies, false);
+ part_stat_unlock();
}
+}
- if (rq->internal_tag != BLK_MQ_NO_TAG)
- blk_mq_sched_completed_request(rq, now);
+static inline void __blk_mq_end_request_acct(struct request *rq, u64 now)
+{
+ if (rq->rq_flags & RQF_STATS)
+ blk_stat_add(rq, now);
+ blk_mq_sched_completed_request(rq, now);
blk_account_io_done(rq, now);
+}
+
+inline void __blk_mq_end_request(struct request *rq, blk_status_t error)
+{
+ if (blk_mq_need_time_stamp(rq))
+ __blk_mq_end_request_acct(rq, ktime_get_ns());
+
+ blk_mq_finish_request(rq);
if (rq->end_io) {
rq_qos_done(rq->q, rq);
- rq->end_io(rq, error);
+ if (rq->end_io(rq, error) == RQ_END_IO_FREE)
+ blk_mq_free_request(rq);
} else {
blk_mq_free_request(rq);
}
@@ -574,106 +1063,178 @@ void blk_mq_end_request(struct request *rq, blk_status_t error)
}
EXPORT_SYMBOL(blk_mq_end_request);
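
With the rq_end_io_ret convention used above (compare blk_end_sync_rq() further down), an end_io handler now tells blk-mq whether it kept ownership of the request. A hedged sketch:

static enum rq_end_io_ret example_end_io(struct request *rq, blk_status_t err)
{
	struct completion *waiting = rq->end_io_data;

	complete(waiting);	/* wake the submitter, which still owns rq */
	return RQ_END_IO_NONE;	/* RQ_END_IO_FREE would make blk-mq free it */
}
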
-static void __blk_mq_complete_request_remote(void *data)
+#define TAG_COMP_BATCH 32
+
+static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx,
+ int *tag_array, int nr_tags)
{
- struct request *rq = data;
- struct request_queue *q = rq->q;
+ struct request_queue *q = hctx->queue;
+
+ blk_mq_sub_active_requests(hctx, nr_tags);
- q->mq_ops->complete(rq);
+ blk_mq_put_tags(hctx->tags, tag_array, nr_tags);
+ percpu_ref_put_many(&q->q_usage_counter, nr_tags);
}
-/**
- * blk_mq_force_complete_rq() - Force complete the request, bypassing any error
- * injection that could drop the completion.
- * @rq: Request to be force completed
- *
- * Drivers should use blk_mq_complete_request() to complete requests in their
- * normal IO path. For timeout error recovery, drivers may call this forced
- * completion routine after they've reclaimed timed out requests to bypass
- * potentially subsequent fake timeouts.
- */
-void blk_mq_force_complete_rq(struct request *rq)
+void blk_mq_end_request_batch(struct io_comp_batch *iob)
{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- struct request_queue *q = rq->q;
- bool shared = false;
- int cpu;
+ int tags[TAG_COMP_BATCH], nr_tags = 0;
+ struct blk_mq_hw_ctx *cur_hctx = NULL;
+ struct request *rq;
+ u64 now = 0;
- WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
- /*
- * Most of single queue controllers, there is only one irq vector
- * for handling IO completion, and the only irq's affinity is set
- * as all possible CPUs. On most of ARCHs, this affinity means the
- * irq is handled on one specific CPU.
- *
- * So complete IO reqeust in softirq context in case of single queue
- * for not degrading IO performance by irqsoff latency.
- */
- if (q->nr_hw_queues == 1) {
- __blk_complete_request(rq);
- return;
+ if (iob->need_ts)
+ now = ktime_get_ns();
+
+ while ((rq = rq_list_pop(&iob->req_list)) != NULL) {
+ prefetch(rq->bio);
+ prefetch(rq->rq_next);
+
+ blk_complete_request(rq);
+ if (iob->need_ts)
+ __blk_mq_end_request_acct(rq, now);
+
+ blk_mq_finish_request(rq);
+
+ rq_qos_done(rq->q, rq);
+
+ /*
+ * If end_io handler returns NONE, then it still has
+ * ownership of the request.
+ */
+ if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE)
+ continue;
+
+ WRITE_ONCE(rq->state, MQ_RQ_IDLE);
+ if (!req_ref_put_and_test(rq))
+ continue;
+
+ blk_crypto_free_request(rq);
+ blk_pm_mark_last_busy(rq);
+
+ if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) {
+ if (cur_hctx)
+ blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+ nr_tags = 0;
+ cur_hctx = rq->mq_hctx;
+ }
+ tags[nr_tags++] = rq->tag;
}
+ if (nr_tags)
+ blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags);
+}
+EXPORT_SYMBOL_GPL(blk_mq_end_request_batch);
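
A sketch of how a polling driver might feed blk_mq_end_request_batch(), assuming the blk_mq_add_to_batch() helper from <linux/blk-mq.h> and a hypothetical example_poll_hw() that returns completed requests:

static int example_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct request *rq;
	int found = 0;

	while ((rq = example_poll_hw(hctx->driver_data)) != NULL) {
		found++;
		/* fall back to per-request completion if batching is refused */
		if (!blk_mq_add_to_batch(rq, iob, 0, blk_mq_end_request_batch))
			blk_mq_complete_request(rq);
	}
	return found;
}
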
+
+static void blk_complete_reqs(struct llist_head *list)
+{
+ struct llist_node *entry = llist_reverse_order(llist_del_all(list));
+ struct request *rq, *next;
+
+ llist_for_each_entry_safe(rq, next, entry, ipi_list)
+ rq->q->mq_ops->complete(rq);
+}
+
+static __latent_entropy void blk_done_softirq(struct softirq_action *h)
+{
+ blk_complete_reqs(this_cpu_ptr(&blk_cpu_done));
+}
+
+static int blk_softirq_cpu_dead(unsigned int cpu)
+{
+ blk_complete_reqs(&per_cpu(blk_cpu_done, cpu));
+ return 0;
+}
+
+static void __blk_mq_complete_request_remote(void *data)
+{
+ __raise_softirq_irqoff(BLOCK_SOFTIRQ);
+}
+
+static inline bool blk_mq_complete_need_ipi(struct request *rq)
+{
+ int cpu = raw_smp_processor_id();
+
+ if (!IS_ENABLED(CONFIG_SMP) ||
+ !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags))
+ return false;
/*
- * For a polled request, always complete locallly, it's pointless
- * to redirect the completion.
+ * With force threaded interrupts enabled, raising softirq from an SMP
+ * function call will always result in waking the ksoftirqd thread.
+ * This is probably worse than completing the request on a different
+ * cache domain.
*/
- if ((rq->cmd_flags & REQ_HIPRI) ||
- !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) {
- q->mq_ops->complete(rq);
- return;
- }
+ if (force_irqthreads())
+ return false;
- cpu = get_cpu();
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ctx->cpu);
+ /* same CPU or cache domain? Complete locally */
+ if (cpu == rq->mq_ctx->cpu ||
+ (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
+ cpus_share_cache(cpu, rq->mq_ctx->cpu)))
+ return false;
- if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) {
- rq->csd.func = __blk_mq_complete_request_remote;
- rq->csd.info = rq;
- rq->csd.flags = 0;
- smp_call_function_single_async(ctx->cpu, &rq->csd);
- } else {
- q->mq_ops->complete(rq);
- }
- put_cpu();
+ /* don't try to IPI to an offline CPU */
+ return cpu_online(rq->mq_ctx->cpu);
}
-EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq);
-static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx)
- __releases(hctx->srcu)
+static void blk_mq_complete_send_ipi(struct request *rq)
{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING))
- rcu_read_unlock();
- else
- srcu_read_unlock(hctx->srcu, srcu_idx);
+ unsigned int cpu;
+
+ cpu = rq->mq_ctx->cpu;
+ if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu)))
+ smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu));
+}
+
+static void blk_mq_raise_softirq(struct request *rq)
+{
+ struct llist_head *list;
+
+ preempt_disable();
+ list = this_cpu_ptr(&blk_cpu_done);
+ if (llist_add(&rq->ipi_list, list))
+ raise_softirq(BLOCK_SOFTIRQ);
+ preempt_enable();
}
-static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx)
- __acquires(hctx->srcu)
+bool blk_mq_complete_request_remote(struct request *rq)
{
- if (!(hctx->flags & BLK_MQ_F_BLOCKING)) {
- /* shut up gcc false positive */
- *srcu_idx = 0;
- rcu_read_lock();
- } else
- *srcu_idx = srcu_read_lock(hctx->srcu);
+ WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
+
+ /*
+	 * For a request whose hctx has only one ctx mapping, or for a
+	 * polled request, always complete locally; it's pointless to
+	 * redirect the completion.
+ */
+ if ((rq->mq_hctx->nr_ctx == 1 &&
+ rq->mq_ctx->cpu == raw_smp_processor_id()) ||
+ rq->cmd_flags & REQ_POLLED)
+ return false;
+
+ if (blk_mq_complete_need_ipi(rq)) {
+ blk_mq_complete_send_ipi(rq);
+ return true;
+ }
+
+ if (rq->q->nr_hw_queues == 1) {
+ blk_mq_raise_softirq(rq);
+ return true;
+ }
+ return false;
}
+EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
/**
* blk_mq_complete_request - end I/O on a request
* @rq: the request being processed
*
* Description:
- * Ends all I/O on a request. It does not handle partial completions.
- * The actual completion happens out-of-order, through a IPI handler.
+ * Complete a request by scheduling the ->complete_rq operation.
**/
-bool blk_mq_complete_request(struct request *rq)
+void blk_mq_complete_request(struct request *rq)
{
- if (unlikely(blk_should_fake_timeout(rq->q)))
- return false;
- blk_mq_force_complete_rq(rq);
- return true;
+ if (!blk_mq_complete_request_remote(rq))
+ rq->q->mq_ops->complete(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);
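
Sketch of the split completion model implied above, assuming a driver whose hard-irq handler only marks requests complete while the queue's ->complete() callback (run locally, via IPI, or via the BLOCK_SOFTIRQ path) ends them; example_pop_completed() and example_rq_status() are stand-ins:

#include <linux/interrupt.h>
#include <linux/blk-mq.h>

static irqreturn_t example_irq(int irq, void *data)
{
	struct request *rq;

	while ((rq = example_pop_completed(data)) != NULL)
		blk_mq_complete_request(rq);	/* may bounce to another CPU */
	return IRQ_HANDLED;
}

/* wired up as blk_mq_ops::complete */
static void example_complete_rq(struct request *rq)
{
	blk_mq_end_request(rq, example_rq_status(rq));
}
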
@@ -689,9 +1250,10 @@ void blk_mq_start_request(struct request *rq)
{
struct request_queue *q = rq->q;
- trace_block_rq_issue(q, rq);
+ trace_block_rq_issue(rq);
- if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) {
+ if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) &&
+ !blk_rq_is_passthrough(rq)) {
rq->io_start_time_ns = ktime_get_ns();
rq->stats_sectors = blk_rq_sectors(rq);
rq->rq_flags |= RQF_STATS;
@@ -702,21 +1264,179 @@ void blk_mq_start_request(struct request *rq)
blk_add_timer(rq);
WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT);
+ rq->mq_hctx->tags->rqs[rq->tag] = rq;
#ifdef CONFIG_BLK_DEV_INTEGRITY
if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE)
q->integrity.profile->prepare_fn(rq);
#endif
+ if (rq->bio && rq->bio->bi_opf & REQ_POLLED)
+ WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num);
}
EXPORT_SYMBOL(blk_mq_start_request);
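
Skeleton ->queue_rq() showing where blk_mq_start_request() now sits: it must run before the command reaches the hardware so that the timeout timer, the tags->rqs[] lookup and (for non-passthrough requests) the statistics above are armed. example_submit_to_hw() is an assumption.

static blk_status_t example_queue_rq(struct blk_mq_hw_ctx *hctx,
				     const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);
	if (example_submit_to_hw(hctx->driver_data, rq, bd->last))
		return BLK_STS_RESOURCE;	/* blk-mq will retry later */
	return BLK_STS_OK;
}
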
+/*
+ * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple
+ * queues. This is important for md arrays to benefit from merging
+ * requests.
+ */
+static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug)
+{
+ if (plug->multiple_queues)
+ return BLK_MAX_REQUEST_COUNT * 2;
+ return BLK_MAX_REQUEST_COUNT;
+}
+
+static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+{
+ struct request *last = rq_list_peek(&plug->mq_list);
+
+ if (!plug->rq_count) {
+ trace_block_plug(rq->q);
+ } else if (plug->rq_count >= blk_plug_max_rq_count(plug) ||
+ (!blk_queue_nomerges(rq->q) &&
+ blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
+ blk_mq_flush_plug_list(plug, false);
+ last = NULL;
+ trace_block_plug(rq->q);
+ }
+
+ if (!plug->multiple_queues && last && last->q != rq->q)
+ plug->multiple_queues = true;
+ /*
+ * Any request allocated from sched tags can't be issued to
+ * ->queue_rqs() directly
+ */
+ if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS))
+ plug->has_elevator = true;
+ rq->rq_next = NULL;
+ rq_list_add(&plug->mq_list, rq);
+ plug->rq_count++;
+}
+
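
The plug list filled by blk_add_rq_to_plug() only exists while a plug is active; a submitter-side sketch of the batching it enables, using the long-standing blk_start_plug()/blk_finish_plug() API:

#include <linux/bio.h>
#include <linux/blkdev.h>

static void example_submit_batch(struct bio **bios, int nr)
{
	struct blk_plug plug;
	int i;

	blk_start_plug(&plug);
	for (i = 0; i < nr; i++)
		submit_bio(bios[i]);	/* collected on current->plug */
	blk_finish_plug(&plug);		/* flushed via blk_mq_flush_plug_list() */
}
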
+/**
+ * blk_execute_rq_nowait - insert a request to I/O scheduler for execution
+ * @rq: request to insert
+ * @at_head: insert request at head or tail of queue
+ *
+ * Description:
+ * Insert a fully prepared request at the back of the I/O scheduler queue
+ * for execution. Don't wait for completion.
+ *
+ * Note:
+ *	This function will invoke the request's ->end_io() callback directly
+ *	if the queue is dead.
+ */
+void blk_execute_rq_nowait(struct request *rq, bool at_head)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+
+ WARN_ON(irqs_disabled());
+ WARN_ON(!blk_rq_is_passthrough(rq));
+
+ blk_account_io_start(rq);
+
+ /*
+ * As plugging can be enabled for passthrough requests on a zoned
+ * device, directly accessing the plug instead of using blk_mq_plug()
+ * should not have any consequences.
+ */
+ if (current->plug && !at_head) {
+ blk_add_rq_to_plug(current->plug, rq);
+ return;
+ }
+
+ blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
+ blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
+}
+EXPORT_SYMBOL_GPL(blk_execute_rq_nowait);
+
+struct blk_rq_wait {
+ struct completion done;
+ blk_status_t ret;
+};
+
+static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret)
+{
+ struct blk_rq_wait *wait = rq->end_io_data;
+
+ wait->ret = ret;
+ complete(&wait->done);
+ return RQ_END_IO_NONE;
+}
+
+bool blk_rq_is_poll(struct request *rq)
+{
+ if (!rq->mq_hctx)
+ return false;
+ if (rq->mq_hctx->type != HCTX_TYPE_POLL)
+ return false;
+ return true;
+}
+EXPORT_SYMBOL_GPL(blk_rq_is_poll);
+
+static void blk_rq_poll_completion(struct request *rq, struct completion *wait)
+{
+ do {
+ blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0);
+ cond_resched();
+ } while (!completion_done(wait));
+}
+
+/**
+ * blk_execute_rq - insert a request into queue for execution
+ * @rq: request to insert
+ * @at_head: insert request at head or tail of queue
+ *
+ * Description:
+ * Insert a fully prepared request at the back of the I/O scheduler queue
+ * for execution and wait for completion.
+ * Return: The blk_status_t result provided to blk_mq_end_request().
+ */
+blk_status_t blk_execute_rq(struct request *rq, bool at_head)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ struct blk_rq_wait wait = {
+ .done = COMPLETION_INITIALIZER_ONSTACK(wait.done),
+ };
+
+ WARN_ON(irqs_disabled());
+ WARN_ON(!blk_rq_is_passthrough(rq));
+
+ rq->end_io_data = &wait;
+ rq->end_io = blk_end_sync_rq;
+
+ blk_account_io_start(rq);
+ blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0);
+ blk_mq_run_hw_queue(hctx, false);
+
+ if (blk_rq_is_poll(rq)) {
+ blk_rq_poll_completion(rq, &wait.done);
+ } else {
+ /*
+ * Prevent hang_check timer from firing at us during very long
+ * I/O
+ */
+ unsigned long hang_check = sysctl_hung_task_timeout_secs;
+
+ if (hang_check)
+ while (!wait_for_completion_io_timeout(&wait.done,
+ hang_check * (HZ/2)))
+ ;
+ else
+ wait_for_completion_io(&wait.done);
+ }
+
+ return wait.ret;
+}
+EXPORT_SYMBOL(blk_execute_rq);
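
Putting the pieces together, a hedged end-to-end sketch of the synchronous passthrough path: allocate with blk_mq_alloc_request(), run with blk_execute_rq() (which waits, polling if the request sits on a poll queue), then free. Error handling is trimmed.

static blk_status_t example_sync_passthrough(struct request_queue *q)
{
	struct request *rq;
	blk_status_t status;

	rq = blk_mq_alloc_request(q, REQ_OP_DRV_IN, 0);
	if (IS_ERR(rq))
		return BLK_STS_RESOURCE;

	/* ... attach data with blk_rq_map_kern()/blk_rq_map_user() ... */
	status = blk_execute_rq(rq, false);	/* tail insert, wait for end_io */
	blk_mq_free_request(rq);
	return status;
}
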
+
static void __blk_mq_requeue_request(struct request *rq)
{
struct request_queue *q = rq->q;
blk_mq_put_driver_tag(rq);
- trace_block_rq_requeue(q, rq);
+ trace_block_rq_requeue(rq);
rq_qos_requeue(q, rq);
if (blk_mq_request_started(rq)) {
@@ -727,13 +1447,20 @@ static void __blk_mq_requeue_request(struct request *rq)
void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
+ struct request_queue *q = rq->q;
+ unsigned long flags;
+
__blk_mq_requeue_request(rq);
/* this request will be re-inserted to io scheduler queue */
blk_mq_sched_requeue_request(rq);
- BUG_ON(!list_empty(&rq->queuelist));
- blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
+ spin_lock_irqsave(&q->requeue_lock, flags);
+ list_add_tail(&rq->queuelist, &q->requeue_list);
+ spin_unlock_irqrestore(&q->requeue_lock, flags);
+
+ if (kick_requeue_list)
+ blk_mq_kick_requeue_list(q);
}
EXPORT_SYMBOL(blk_mq_requeue_request);
@@ -742,63 +1469,40 @@ static void blk_mq_requeue_work(struct work_struct *work)
struct request_queue *q =
container_of(work, struct request_queue, requeue_work.work);
LIST_HEAD(rq_list);
- struct request *rq, *next;
+ LIST_HEAD(flush_list);
+ struct request *rq;
spin_lock_irq(&q->requeue_lock);
list_splice_init(&q->requeue_list, &rq_list);
+ list_splice_init(&q->flush_list, &flush_list);
spin_unlock_irq(&q->requeue_lock);
- list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
- if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP)))
- continue;
-
- rq->rq_flags &= ~RQF_SOFTBARRIER;
- list_del_init(&rq->queuelist);
+ while (!list_empty(&rq_list)) {
+ rq = list_entry(rq_list.next, struct request, queuelist);
/*
- * If RQF_DONTPREP, rq has contained some driver specific
- * data, so insert it to hctx dispatch list to avoid any
- * merge.
+		 * If RQF_DONTPREP is set, the request has been started by the
+ * driver already and might have driver-specific data allocated
+ * already. Insert it into the hctx dispatch list to avoid
+ * block layer merges for the request.
*/
- if (rq->rq_flags & RQF_DONTPREP)
- blk_mq_request_bypass_insert(rq, false, false);
- else
- blk_mq_sched_insert_request(rq, true, false, false);
+ if (rq->rq_flags & RQF_DONTPREP) {
+ list_del_init(&rq->queuelist);
+ blk_mq_request_bypass_insert(rq, 0);
+ } else {
+ list_del_init(&rq->queuelist);
+ blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD);
+ }
}
- while (!list_empty(&rq_list)) {
- rq = list_entry(rq_list.next, struct request, queuelist);
+ while (!list_empty(&flush_list)) {
+ rq = list_entry(flush_list.next, struct request, queuelist);
list_del_init(&rq->queuelist);
- blk_mq_sched_insert_request(rq, false, false, false);
+ blk_mq_insert_request(rq, 0);
}
blk_mq_run_hw_queues(q, false);
}
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
- bool kick_requeue_list)
-{
- struct request_queue *q = rq->q;
- unsigned long flags;
-
- /*
- * We abuse this flag that is otherwise used by the I/O scheduler to
- * request head insertion from the workqueue.
- */
- BUG_ON(rq->rq_flags & RQF_SOFTBARRIER);
-
- spin_lock_irqsave(&q->requeue_lock, flags);
- if (at_head) {
- rq->rq_flags |= RQF_SOFTBARRIER;
- list_add(&rq->queuelist, &q->requeue_list);
- } else {
- list_add_tail(&rq->queuelist, &q->requeue_list);
- }
- spin_unlock_irqrestore(&q->requeue_lock, flags);
-
- if (kick_requeue_list)
- blk_mq_kick_requeue_list(q);
-}
-
void blk_mq_kick_requeue_list(struct request_queue *q)
{
kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0);
@@ -813,25 +1517,26 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q,
}
EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list);
-struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag)
+static bool blk_is_flush_data_rq(struct request *rq)
{
- if (tag < tags->nr_tags) {
- prefetch(tags->rqs[tag]);
- return tags->rqs[tag];
- }
-
- return NULL;
+ return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq);
}
-EXPORT_SYMBOL(blk_mq_tag_to_rq);
-static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq,
- void *priv, bool reserved)
+static bool blk_mq_rq_inflight(struct request *rq, void *priv)
{
/*
- * If we find a request that isn't idle and the queue matches,
- * we know the queue is busy. Return false to stop the iteration.
+ * If we find a request that isn't idle we know the queue is busy
+ * as it's checked in the iter.
+ * Return false to stop the iteration.
+ *
+	 * In case of queue quiesce, if one flush data request is completed,
+	 * don't count it as inflight, given that the flush sequence is
+	 * suspended and the original flush data request is invisible to the
+	 * driver, just like other pending requests affected by the quiesce.
*/
- if (blk_mq_request_started(rq) && rq->q == hctx->queue) {
+ if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) &&
+ blk_is_flush_data_rq(rq) &&
+ blk_mq_request_completed(rq))) {
bool *busy = priv;
*busy = true;
@@ -850,13 +1555,13 @@ bool blk_mq_queue_inflight(struct request_queue *q)
}
EXPORT_SYMBOL_GPL(blk_mq_queue_inflight);
-static void blk_mq_rq_timed_out(struct request *req, bool reserved)
+static void blk_mq_rq_timed_out(struct request *req)
{
req->rq_flags |= RQF_TIMED_OUT;
if (req->q->mq_ops->timeout) {
enum blk_eh_timer_return ret;
- ret = req->q->mq_ops->timeout(req, reserved);
+ ret = req->q->mq_ops->timeout(req);
if (ret == BLK_EH_DONE)
return;
WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER);
@@ -865,7 +1570,13 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved)
blk_add_timer(req);
}
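
A sketch of a driver ->timeout() handler matching the reduced prototype used above (no "reserved" argument); example_abort_cmd() is hypothetical and is assumed to have stopped the hardware from touching the request before it is ended:

static enum blk_eh_timer_return example_timeout(struct request *rq)
{
	if (example_abort_cmd(rq)) {
		blk_mq_end_request(rq, BLK_STS_TIMEOUT);
		return BLK_EH_DONE;	/* request has been dealt with */
	}
	return BLK_EH_RESET_TIMER;	/* give the command more time */
}
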
-static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
+struct blk_expired_data {
+ bool has_timedout_rq;
+ unsigned long next;
+ unsigned long timeout_start;
+};
+
+static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired)
{
unsigned long deadline;
@@ -875,54 +1586,50 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next)
return false;
deadline = READ_ONCE(rq->deadline);
- if (time_after_eq(jiffies, deadline))
+ if (time_after_eq(expired->timeout_start, deadline))
return true;
- if (*next == 0)
- *next = deadline;
- else if (time_after(*next, deadline))
- *next = deadline;
+ if (expired->next == 0)
+ expired->next = deadline;
+ else if (time_after(expired->next, deadline))
+ expired->next = deadline;
return false;
}
-static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx,
- struct request *rq, void *priv, bool reserved)
+void blk_mq_put_rq_ref(struct request *rq)
{
- unsigned long *next = priv;
-
- /*
- * Just do a quick check if it is expired before locking the request in
- * so we're not unnecessarilly synchronizing across CPUs.
- */
- if (!blk_mq_req_expired(rq, next))
- return true;
+ if (is_flush_rq(rq)) {
+ if (rq->end_io(rq, 0) == RQ_END_IO_FREE)
+ blk_mq_free_request(rq);
+ } else if (req_ref_put_and_test(rq)) {
+ __blk_mq_free_request(rq);
+ }
+}
- /*
- * We have reason to believe the request may be expired. Take a
- * reference on the request to lock this request lifetime into its
- * currently allocated context to prevent it from being reallocated in
- * the event the completion by-passes this timeout handler.
- *
- * If the reference was already released, then the driver beat the
- * timeout handler to posting a natural completion.
- */
- if (!refcount_inc_not_zero(&rq->ref))
- return true;
+static bool blk_mq_check_expired(struct request *rq, void *priv)
+{
+ struct blk_expired_data *expired = priv;
/*
- * The request is now locked and cannot be reallocated underneath the
- * timeout handler's processing. Re-verify this exact request is truly
- * expired; if it is not expired, then the request was completed and
- * reallocated as a new request.
+ * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot
+	 * be reallocated underneath the timeout handler's processing, so
+	 * the expire check is reliable. If the request is not expired,
+ * it was completed and reallocated as a new request after returning
+ * from blk_mq_check_expired().
*/
- if (blk_mq_req_expired(rq, next))
- blk_mq_rq_timed_out(rq, reserved);
+ if (blk_mq_req_expired(rq, expired)) {
+ expired->has_timedout_rq = true;
+ return false;
+ }
+ return true;
+}
- if (is_flush_rq(rq, hctx))
- rq->end_io(rq, 0);
- else if (refcount_dec_and_test(&rq->ref))
- __blk_mq_free_request(rq);
+static bool blk_mq_handle_expired(struct request *rq, void *priv)
+{
+ struct blk_expired_data *expired = priv;
+ if (blk_mq_req_expired(rq, expired))
+ blk_mq_rq_timed_out(rq);
return true;
}
@@ -930,9 +1637,11 @@ static void blk_mq_timeout_work(struct work_struct *work)
{
struct request_queue *q =
container_of(work, struct request_queue, timeout_work);
- unsigned long next = 0;
+ struct blk_expired_data expired = {
+ .timeout_start = jiffies,
+ };
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
/* A deadlock might occur if a request is stuck requiring a
* timeout at the same time a queue freeze is waiting
@@ -950,10 +1659,23 @@ static void blk_mq_timeout_work(struct work_struct *work)
if (!percpu_ref_tryget(&q->q_usage_counter))
return;
- blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next);
+ /* check if there is any timed-out request */
+ blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired);
+ if (expired.has_timedout_rq) {
+ /*
+ * Before walking tags, we must ensure any submit started
+ * before the current time has finished. Since the submit
+ * uses srcu or rcu, wait for a synchronization point to
+ * ensure all running submits have finished
+ */
+ blk_mq_wait_quiesce_done(q->tag_set);
+
+ expired.next = 0;
+ blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired);
+ }
- if (next != 0) {
- mod_timer(&q->timeout, next);
+ if (expired.next != 0) {
+ mod_timer(&q->timeout, expired.next);
} else {
/*
* Request timeouts are handled as a forward rolling timer. If
@@ -1044,12 +1766,29 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
return data.rq;
}
-static inline unsigned int queued_to_index(unsigned int queued)
+bool __blk_mq_alloc_driver_tag(struct request *rq)
{
- if (!queued)
- return 0;
+ struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags;
+ unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags;
+ int tag;
+
+ blk_mq_tag_busy(rq->mq_hctx);
+
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) {
+ bt = &rq->mq_hctx->tags->breserved_tags;
+ tag_offset = 0;
+ } else {
+ if (!hctx_may_queue(rq->mq_hctx, bt))
+ return false;
+ }
+
+ tag = __sbitmap_queue_get(bt);
+ if (tag == BLK_MQ_NO_TAG)
+ return false;
- return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1);
+ rq->tag = tag + tag_offset;
+ blk_mq_inc_active_requests(rq->mq_hctx);
+ return true;
}
static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
@@ -1082,12 +1821,13 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode,
static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
- struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags;
+ struct sbitmap_queue *sbq;
struct wait_queue_head *wq;
wait_queue_entry_t *wait;
bool ret;
- if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) {
+ if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) &&
+ !(blk_mq_is_shared_tags(hctx->flags))) {
blk_mq_sched_mark_restart_hctx(hctx);
/*
@@ -1105,6 +1845,10 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
if (!list_empty_careful(&wait->entry))
return false;
+ if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag))
+ sbq = &hctx->tags->breserved_tags;
+ else
+ sbq = &hctx->tags->bitmap_tags;
wq = &bt_wait_ptr(sbq, hctx)->wait;
spin_lock_irq(&wq->lock);
@@ -1120,6 +1864,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx,
__add_wait_queue(wq, wait);
/*
+ * Add one explicit barrier since blk_mq_get_driver_tag() may
+ * not imply barrier in case of failure.
+ *
+ * Order adding us to wait queue and allocating driver tag.
+ *
+ * The pair is the one implied in sbitmap_queue_wake_up() which
+ * orders clearing sbitmap tag bits and waitqueue_active() in
+ * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless
+ *
+ * Otherwise, re-order of adding wait queue and getting driver tag
+ * may cause __sbitmap_queue_wake_up() to wake up nothing because
+ * the waitqueue_active() may not observe us in wait queue.
+ */
+ smp_mb();
+
+ /*
* It's possible that a tag was freed in the window between the
* allocation failure and adding the hardware queue to the wait
* queue.
@@ -1156,9 +1916,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
{
unsigned int ewma;
- if (hctx->queue->elevator)
- return;
-
ewma = hctx->dispatch_busy;
if (!ewma && !busy)
@@ -1177,16 +1934,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy)
static void blk_mq_handle_dev_resource(struct request *rq,
struct list_head *list)
{
- struct request *next =
- list_first_entry_or_null(list, struct request, queuelist);
-
- /*
- * If an I/O scheduler has been configured and we got a driver tag for
- * the next request already, free it.
- */
- if (next)
- blk_mq_put_driver_tag(next);
-
list_add(&rq->queuelist, list);
__blk_mq_requeue_request(rq);
}
@@ -1204,105 +1951,155 @@ static void blk_mq_handle_zone_resource(struct request *rq,
__blk_mq_requeue_request(rq);
}
+enum prep_dispatch {
+ PREP_DISPATCH_OK,
+ PREP_DISPATCH_NO_TAG,
+ PREP_DISPATCH_NO_BUDGET,
+};
+
+static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq,
+ bool need_budget)
+{
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
+ int budget_token = -1;
+
+ if (need_budget) {
+ budget_token = blk_mq_get_dispatch_budget(rq->q);
+ if (budget_token < 0) {
+ blk_mq_put_driver_tag(rq);
+ return PREP_DISPATCH_NO_BUDGET;
+ }
+ blk_mq_set_rq_budget_token(rq, budget_token);
+ }
+
+ if (!blk_mq_get_driver_tag(rq)) {
+ /*
+ * The initial allocation attempt failed, so we need to
+ * rerun the hardware queue when a tag is freed. The
+ * waitqueue takes care of that. If the queue is run
+ * before we add this entry back on the dispatch list,
+ * we'll re-run it below.
+ */
+ if (!blk_mq_mark_tag_wait(hctx, rq)) {
+ /*
+ * All budgets not got from this function will be put
+ * together during handling partial dispatch
+ */
+ if (need_budget)
+ blk_mq_put_dispatch_budget(rq->q, budget_token);
+ return PREP_DISPATCH_NO_TAG;
+ }
+ }
+
+ return PREP_DISPATCH_OK;
+}
+
+/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */
+static void blk_mq_release_budgets(struct request_queue *q,
+ struct list_head *list)
+{
+ struct request *rq;
+
+ list_for_each_entry(rq, list, queuelist) {
+ int budget_token = blk_mq_get_rq_budget_token(rq);
+
+ if (budget_token >= 0)
+ blk_mq_put_dispatch_budget(q, budget_token);
+ }
+}
+
+/*
+ * blk_mq_commit_rqs will notify driver using bd->last that there is no
+ * more requests. (See comment in struct blk_mq_ops for commit_rqs for
+ * details)
+ * Attention, we should explicitly call this in unusual cases:
+ * 1) did not queue everything initially scheduled to queue
+ * 2) the last attempt to queue a request failed
+ */
+static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued,
+ bool from_schedule)
+{
+ if (hctx->queue->mq_ops->commit_rqs && queued) {
+ trace_block_unplug(hctx->queue, queued, !from_schedule);
+ hctx->queue->mq_ops->commit_rqs(hctx);
+ }
+}
+
/*
* Returns true if we did some work AND can potentially do more.
*/
-bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
- bool got_budget)
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+ unsigned int nr_budgets)
{
- struct blk_mq_hw_ctx *hctx;
- struct request *rq, *nxt;
- bool no_tag = false;
- int errors, queued;
+ enum prep_dispatch prep;
+ struct request_queue *q = hctx->queue;
+ struct request *rq;
+ int queued;
blk_status_t ret = BLK_STS_OK;
- bool no_budget_avail = false;
LIST_HEAD(zone_list);
+ bool needs_resource = false;
if (list_empty(list))
return false;
- WARN_ON(!list_is_singular(list) && got_budget);
-
/*
* Now process all the entries, sending them to the driver.
*/
- errors = queued = 0;
+ queued = 0;
do {
struct blk_mq_queue_data bd;
rq = list_first_entry(list, struct request, queuelist);
- hctx = rq->mq_hctx;
- if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) {
- blk_mq_put_driver_tag(rq);
- no_budget_avail = true;
+ WARN_ON_ONCE(hctx != rq->mq_hctx);
+ prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets);
+ if (prep != PREP_DISPATCH_OK)
break;
- }
-
- if (!blk_mq_get_driver_tag(rq)) {
- /*
- * The initial allocation attempt failed, so we need to
- * rerun the hardware queue when a tag is freed. The
- * waitqueue takes care of that. If the queue is run
- * before we add this entry back on the dispatch list,
- * we'll re-run it below.
- */
- if (!blk_mq_mark_tag_wait(hctx, rq)) {
- blk_mq_put_dispatch_budget(hctx);
- /*
- * For non-shared tags, the RESTART check
- * will suffice.
- */
- if (hctx->flags & BLK_MQ_F_TAG_SHARED)
- no_tag = true;
- break;
- }
- }
list_del_init(&rq->queuelist);
bd.rq = rq;
+ bd.last = list_empty(list);
/*
- * Flag last if we have no more requests, or if we have more
- * but can't assign a driver tag to it.
+ * once the request is queued to lld, no need to cover the
+ * budget any more
*/
- if (list_empty(list))
- bd.last = true;
- else {
- nxt = list_first_entry(list, struct request, queuelist);
- bd.last = !blk_mq_get_driver_tag(nxt);
- }
-
+ if (nr_budgets)
+ nr_budgets--;
ret = q->mq_ops->queue_rq(hctx, &bd);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_handle_dev_resource(rq, list);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
break;
- } else if (ret == BLK_STS_ZONE_RESOURCE) {
+ case BLK_STS_RESOURCE:
+ needs_resource = true;
+ fallthrough;
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_handle_dev_resource(rq, list);
+ goto out;
+ case BLK_STS_ZONE_RESOURCE:
/*
* Move the request to zone_list and keep going through
* the dispatch list to find more requests the drive can
* accept.
*/
blk_mq_handle_zone_resource(rq, &zone_list);
- if (list_empty(list))
- break;
- continue;
- }
-
- if (unlikely(ret != BLK_STS_OK)) {
- errors++;
- blk_mq_end_request(rq, BLK_STS_IOERR);
- continue;
+ needs_resource = true;
+ break;
+ default:
+ blk_mq_end_request(rq, ret);
}
-
- queued++;
} while (!list_empty(list));
-
+out:
if (!list_empty(&zone_list))
list_splice_tail_init(&zone_list, list);
- hctx->dispatched[queued_to_index(queued)]++;
+ /* If we didn't flush the entire list, we could have told the driver
+ * there was more coming, but that turned out to be a lie.
+ */
+ if (!list_empty(list) || ret != BLK_STS_OK)
+ blk_mq_commit_rqs(hctx, queued, false);
/*
* Any items that need requeuing? Stuff them into hctx->dispatch,
@@ -1310,20 +2107,28 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
*/
if (!list_empty(list)) {
bool needs_restart;
+ /* For non-shared tags, the RESTART check will suffice */
+ bool no_tag = prep == PREP_DISPATCH_NO_TAG &&
+ ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) ||
+ blk_mq_is_shared_tags(hctx->flags));
- /*
- * If we didn't flush the entire list, we could have told
- * the driver there was more coming, but that turned out to
- * be a lie.
- */
- if (q->mq_ops->commit_rqs && queued)
- q->mq_ops->commit_rqs(hctx);
+ if (nr_budgets)
+ blk_mq_release_budgets(q, list);
spin_lock(&hctx->lock);
list_splice_tail_init(list, &hctx->dispatch);
spin_unlock(&hctx->lock);
/*
+ * Order adding requests to hctx->dispatch and checking
+ * SCHED_RESTART flag. The pair of this smp_mb() is the one
+ * in blk_mq_sched_restart(). Avoid restart code path to
+ * miss the new added requests to hctx->dispatch, meantime
+ * SCHED_RESTART is observed here.
+ */
+ smp_mb();
+
+ /*
* If SCHED_RESTART was set by the caller of this function and
* it is no longer set that means that it was cleared by another
* thread and hence that a queue rerun is needed.
@@ -1346,77 +2151,24 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list,
* If driver returns BLK_STS_RESOURCE and SCHED_RESTART
* bit is set, run queue after a delay to avoid IO stalls
* that could otherwise occur if the queue is idle. We'll do
- * similar if we couldn't get budget and SCHED_RESTART is set.
+ * similar if we couldn't get budget or couldn't lock a zone
+ * and SCHED_RESTART is set.
*/
needs_restart = blk_mq_sched_needs_restart(hctx);
+ if (prep == PREP_DISPATCH_NO_BUDGET)
+ needs_resource = true;
if (!needs_restart ||
(no_tag && list_empty_careful(&hctx->dispatch_wait.entry)))
blk_mq_run_hw_queue(hctx, true);
- else if (needs_restart && (ret == BLK_STS_RESOURCE ||
- no_budget_avail))
+ else if (needs_resource)
blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY);
blk_mq_update_dispatch_busy(hctx, true);
return false;
- } else
- blk_mq_update_dispatch_busy(hctx, false);
-
- /*
- * If the host/device is unable to accept more work, inform the
- * caller of that.
- */
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- return false;
-
- return (queued + errors) != 0;
-}
-
-/**
- * __blk_mq_run_hw_queue - Run a hardware queue.
- * @hctx: Pointer to the hardware queue to run.
- *
- * Send pending requests to the hardware.
- */
-static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
-{
- int srcu_idx;
-
- /*
- * We should be running this queue from one of the CPUs that
- * are mapped to it.
- *
- * There are at least two related races now between setting
- * hctx->next_cpu from blk_mq_hctx_next_cpu() and running
- * __blk_mq_run_hw_queue():
- *
- * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(),
- * but later it becomes online, then this warning is harmless
- * at all
- *
- * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(),
- * but later it becomes offline, then the warning can't be
- * triggered, and we depend on blk-mq timeout handler to
- * handle dispatched requests to this hctx
- */
- if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) &&
- cpu_online(hctx->next_cpu)) {
- printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n",
- raw_smp_processor_id(),
- cpumask_empty(hctx->cpumask) ? "inactive": "active");
- dump_stack();
}
- /*
- * We can't run the queue inline with ints disabled. Ensure that
- * we catch bad users of this early.
- */
- WARN_ON_ONCE(in_interrupt());
-
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
-
- hctx_lock(hctx, &srcu_idx);
- blk_mq_sched_dispatch_requests(hctx);
- hctx_unlock(hctx, srcu_idx);
+ blk_mq_update_dispatch_busy(hctx, false);
+ return true;
}
static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx)
@@ -1475,46 +2227,19 @@ select_cpu:
}
/**
- * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue.
+ * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
* @hctx: Pointer to the hardware queue to run.
- * @async: If we want to run the queue asynchronously.
- * @msecs: Microseconds of delay to wait before running the queue.
+ * @msecs: Milliseconds of delay to wait before running the queue.
*
- * If !@async, try to run the queue now. Else, run the queue asynchronously and
- * with a delay of @msecs.
+ * Run a hardware queue asynchronously with a delay of @msecs.
*/
-static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async,
- unsigned long msecs)
+void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
{
if (unlikely(blk_mq_hctx_stopped(hctx)))
return;
-
- if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) {
- int cpu = get_cpu();
- if (cpumask_test_cpu(cpu, hctx->cpumask)) {
- __blk_mq_run_hw_queue(hctx);
- put_cpu();
- return;
- }
-
- put_cpu();
- }
-
kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work,
msecs_to_jiffies(msecs));
}
-
-/**
- * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously.
- * @hctx: Pointer to the hardware queue to run.
- * @msecs: Microseconds of delay to wait before running the queue.
- *
- * Run a hardware queue asynchronously with a delay of @msecs.
- */
-void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs)
-{
- __blk_mq_delay_run_hw_queue(hctx, true, msecs);
-}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
/**
@@ -1528,10 +2253,16 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue);
*/
void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
- int srcu_idx;
bool need_run;
/*
+ * We can't run the queue inline with interrupts disabled.
+ */
+ WARN_ON_ONCE(!async && in_interrupt());
+
+ might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING);
+
+ /*
* When queue is quiesced, we may be switching io scheduler, or
* updating nr_hw_queues, or other things, and we can't run queue
* any more, even __blk_mq_hctx_has_pending() can't be called safely.
@@ -1539,31 +2270,68 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
* And queue will be rerun in blk_mq_unquiesce_queue() if it is
* quiesced.
*/
- hctx_lock(hctx, &srcu_idx);
- need_run = !blk_queue_quiesced(hctx->queue) &&
- blk_mq_hctx_has_pending(hctx);
- hctx_unlock(hctx, srcu_idx);
+ __blk_mq_run_dispatch_ops(hctx->queue, false,
+ need_run = !blk_queue_quiesced(hctx->queue) &&
+ blk_mq_hctx_has_pending(hctx));
+
+ if (!need_run)
+ return;
+
+ if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) {
+ blk_mq_delay_run_hw_queue(hctx, 0);
+ return;
+ }
- if (need_run)
- __blk_mq_delay_run_hw_queue(hctx, async, 0);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
EXPORT_SYMBOL(blk_mq_run_hw_queue);
+/*
+ * Return the preferred queue to dispatch from (if any) for non-mq aware IO
+ * scheduler.
+ */
+static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q)
+{
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ /*
+ * If the IO scheduler does not respect hardware queues when
+ * dispatching, we just don't bother with multiple HW queues and
+ * dispatch from hctx for the current CPU since running multiple queues
+ * just causes lock contention inside the scheduler and pointless cache
+ * bouncing.
+ */
+ struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT];
+
+ if (!blk_mq_hctx_stopped(hctx))
+ return hctx;
+ return NULL;
+}
+
/**
- * blk_mq_run_hw_queue - Run all hardware queues in a request queue.
+ * blk_mq_run_hw_queues - Run all hardware queues in a request queue.
* @q: Pointer to the request queue to run.
* @async: If we want to run the queue asynchronously.
*/
void blk_mq_run_hw_queues(struct request_queue *q, bool async)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
+ unsigned long i;
+ sq_hctx = NULL;
+ if (blk_queue_sq_sched(q))
+ sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
-
- blk_mq_run_hw_queue(hctx, async);
+ /*
+ * Dispatch from this hctx either if there's no hctx preferred
+ * by IO scheduler or if it has requests that bypass the
+ * scheduler.
+ */
+ if (!sq_hctx || sq_hctx == hctx ||
+ !list_empty_careful(&hctx->dispatch))
+ blk_mq_run_hw_queue(hctx, async);
}
}
EXPORT_SYMBOL(blk_mq_run_hw_queues);
@@ -1571,42 +2339,39 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues);
/**
* blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously.
* @q: Pointer to the request queue to run.
- * @msecs: Microseconds of delay to wait before running the queues.
+ * @msecs: Milliseconds of delay to wait before running the queues.
*/
void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs)
{
- struct blk_mq_hw_ctx *hctx;
- int i;
+ struct blk_mq_hw_ctx *hctx, *sq_hctx;
+ unsigned long i;
+ sq_hctx = NULL;
+ if (blk_queue_sq_sched(q))
+ sq_hctx = blk_mq_get_sq_hctx(q);
queue_for_each_hw_ctx(q, hctx, i) {
if (blk_mq_hctx_stopped(hctx))
continue;
-
- blk_mq_delay_run_hw_queue(hctx, msecs);
+ /*
+ * If there is already a run_work pending, leave the
+ * pending delay untouched. Otherwise, a hctx can stall
+ * if another hctx is re-delaying the other's work
+ * before the work executes.
+ */
+ if (delayed_work_pending(&hctx->run_work))
+ continue;
+ /*
+ * Dispatch from this hctx either if there's no hctx preferred
+ * by IO scheduler or if it has requests that bypass the
+ * scheduler.
+ */
+ if (!sq_hctx || sq_hctx == hctx ||
+ !list_empty_careful(&hctx->dispatch))
+ blk_mq_delay_run_hw_queue(hctx, msecs);
}
}
EXPORT_SYMBOL(blk_mq_delay_run_hw_queues);
-/**
- * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped
- * @q: request queue.
- *
- * The caller is responsible for serializing this function against
- * blk_mq_{start,stop}_hw_queue().
- */
-bool blk_mq_queue_stopped(struct request_queue *q)
-{
- struct blk_mq_hw_ctx *hctx;
- int i;
-
- queue_for_each_hw_ctx(q, hctx, i)
- if (blk_mq_hctx_stopped(hctx))
- return true;
-
- return false;
-}
-EXPORT_SYMBOL(blk_mq_queue_stopped);
-
/*
* This function is often used for pausing .queue_rq() by driver when
* there isn't enough resource or some conditions aren't satisfied, and
@@ -1636,7 +2401,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue);
void blk_mq_stop_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_stop_hw_queue(hctx);
@@ -1647,14 +2412,14 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
- blk_mq_run_hw_queue(hctx, false);
+ blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);
void blk_mq_start_hw_queues(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
blk_mq_start_hw_queue(hctx);
@@ -1674,179 +2439,170 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue);
void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
- blk_mq_start_stopped_hw_queue(hctx, async);
+ blk_mq_start_stopped_hw_queue(hctx, async ||
+ (hctx->flags & BLK_MQ_F_BLOCKING));
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
static void blk_mq_run_work_fn(struct work_struct *work)
{
- struct blk_mq_hw_ctx *hctx;
-
- hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work);
-
- /*
- * If we are stopped, don't run the queue.
- */
- if (test_bit(BLK_MQ_S_STOPPED, &hctx->state))
- return;
-
- __blk_mq_run_hw_queue(hctx);
-}
-
-static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- bool at_head)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
- enum hctx_type type = hctx->type;
-
- lockdep_assert_held(&ctx->lock);
-
- trace_block_rq_insert(hctx->queue, rq);
-
- if (at_head)
- list_add(&rq->queuelist, &ctx->rq_lists[type]);
- else
- list_add_tail(&rq->queuelist, &ctx->rq_lists[type]);
-}
-
-void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head)
-{
- struct blk_mq_ctx *ctx = rq->mq_ctx;
-
- lockdep_assert_held(&ctx->lock);
+ struct blk_mq_hw_ctx *hctx =
+ container_of(work, struct blk_mq_hw_ctx, run_work.work);
- __blk_mq_insert_req_list(hctx, rq, at_head);
- blk_mq_hctx_mark_pending(hctx, ctx);
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_sched_dispatch_requests(hctx));
}
/**
* blk_mq_request_bypass_insert - Insert a request at dispatch list.
* @rq: Pointer to request to be inserted.
- * @run_queue: If we should run the hardware queue after inserting the request.
+ * @flags: BLK_MQ_INSERT_*
*
* Should only be used carefully, when the caller knows we want to
* bypass a potential IO scheduler on the target device.
*/
-void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
- bool run_queue)
+static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags)
{
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
spin_lock(&hctx->lock);
- if (at_head)
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
list_add(&rq->queuelist, &hctx->dispatch);
else
list_add_tail(&rq->queuelist, &hctx->dispatch);
spin_unlock(&hctx->lock);
-
- if (run_queue)
- blk_mq_run_hw_queue(hctx, false);
}
-void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- struct list_head *list)
-
+static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_ctx *ctx, struct list_head *list,
+ bool run_queue_async)
{
struct request *rq;
enum hctx_type type = hctx->type;
/*
+ * Try to issue requests directly if the hw queue isn't busy to save an
+ * extra enqueue & dequeue to the sw queue.
+ */
+ if (!hctx->dispatch_busy && !run_queue_async) {
+ blk_mq_run_dispatch_ops(hctx->queue,
+ blk_mq_try_issue_list_directly(hctx, list));
+ if (list_empty(list))
+ goto out;
+ }
+
+ /*
* preemption doesn't flush plug list, so it's possible ctx->cpu is
* offline now
*/
list_for_each_entry(rq, list, queuelist) {
BUG_ON(rq->mq_ctx != ctx);
- trace_block_rq_insert(hctx->queue, rq);
+ trace_block_rq_insert(rq);
+ if (rq->cmd_flags & REQ_NOWAIT)
+ run_queue_async = true;
}
spin_lock(&ctx->lock);
list_splice_tail_init(list, &ctx->rq_lists[type]);
blk_mq_hctx_mark_pending(hctx, ctx);
spin_unlock(&ctx->lock);
+out:
+ blk_mq_run_hw_queue(hctx, run_queue_async);
}
-static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
-{
- struct request *rqa = container_of(a, struct request, queuelist);
- struct request *rqb = container_of(b, struct request, queuelist);
-
- if (rqa->mq_ctx != rqb->mq_ctx)
- return rqa->mq_ctx > rqb->mq_ctx;
- if (rqa->mq_hctx != rqb->mq_hctx)
- return rqa->mq_hctx > rqb->mq_hctx;
-
- return blk_rq_pos(rqa) > blk_rq_pos(rqb);
-}
-
-void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+static void blk_mq_insert_request(struct request *rq, blk_insert_t flags)
{
- LIST_HEAD(list);
-
- if (list_empty(&plug->mq_list))
- return;
- list_splice_init(&plug->mq_list, &list);
+ struct request_queue *q = rq->q;
+ struct blk_mq_ctx *ctx = rq->mq_ctx;
+ struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- if (plug->rq_count > 2 && plug->multiple_queues)
- list_sort(NULL, &list, plug_rq_cmp);
+ if (blk_rq_is_passthrough(rq)) {
+ /*
+ * Passthrough requests have to be added to hctx->dispatch
+ * directly. The device may be in a state where it can't handle
+ * FS requests and always returns BLK_STS_RESOURCE for them,
+ * which gets them added to hctx->dispatch.
+ *
+ * If a passthrough request is needed to unblock the queue and
+ * it is added to the scheduler queue instead, it never gets a
+ * chance to dispatch, because requests in hctx->dispatch are
+ * always prioritized.
+ */
+ blk_mq_request_bypass_insert(rq, flags);
+ } else if (req_op(rq) == REQ_OP_FLUSH) {
+ /*
+ * Normal IO requests are inserted into the scheduler queue or
+ * the sw queue, while flush requests are added directly to the
+ * dispatch queue (hctx->dispatch). Since there is at most one
+ * in-flight flush request per hw queue, it makes little
+ * difference whether the flush goes to the head or the tail of
+ * the dispatch queue.
+ *
+ * With NCQ, a flush is a non-NCQ command, and queueing it fails
+ * while any normal IO request (NCQ command) is in flight.
+ * Adding the flush to the front of hctx->dispatch may add some
+ * latency to that flush because of S_SCHED_RESTART compared
+ * with adding it to the tail, but it increases the chance of
+ * flush merging, so fewer flush requests are issued to the
+ * controller. About 10% of the run time is saved in blktests
+ * block/004 on a disk attached to an AHCI/NCQ controller when
+ * the flush is queued at the front of hctx->dispatch.
+ *
+ * So simply queue the flush rq at the front of hctx->dispatch;
+ * flush-intensive workloads benefit on NCQ hardware.
+ */
+ blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD);
+ } else if (q->elevator) {
+ LIST_HEAD(list);
- plug->rq_count = 0;
+ WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG);
- do {
- struct list_head rq_list;
- struct request *rq, *head_rq = list_entry_rq(list.next);
- struct list_head *pos = &head_rq->queuelist; /* skip first */
- struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx;
- struct blk_mq_ctx *this_ctx = head_rq->mq_ctx;
- unsigned int depth = 1;
-
- list_for_each_continue(pos, &list) {
- rq = list_entry_rq(pos);
- BUG_ON(!rq->q);
- if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx)
- break;
- depth++;
- }
+ list_add(&rq->queuelist, &list);
+ q->elevator->type->ops.insert_requests(hctx, &list, flags);
+ } else {
+ trace_block_rq_insert(rq);
- list_cut_before(&rq_list, &list, pos);
- trace_block_unplug(head_rq->q, depth, !from_schedule);
- blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list,
- from_schedule);
- } while(!list_empty(&list));
+ spin_lock(&ctx->lock);
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
+ list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]);
+ else
+ list_add_tail(&rq->queuelist,
+ &ctx->rq_lists[hctx->type]);
+ blk_mq_hctx_mark_pending(hctx, ctx);
+ spin_unlock(&ctx->lock);
+ }
}
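blk_mq_insert_request() above picks a destination for each request in a fixed order: passthrough and flush requests bypass any scheduler, everything else goes to the elevator if one is attached, otherwise to the per-CPU software queue. A simplified, user-space decision sketch of that ordering (toy_rq and the enum values are invented; in the real code the passthrough case also honors the at-head flag):

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_rq {                         /* invented, minimal request model */
            bool passthrough;
            bool is_flush;
    };

    enum toy_target {
            TOY_DISPATCH,                   /* hctx->dispatch, bypassing the scheduler */
            TOY_DISPATCH_HEAD,              /* front of hctx->dispatch (flush) */
            TOY_ELEVATOR,                   /* I/O scheduler queue */
            TOY_SW_QUEUE,                   /* per-CPU software queue */
    };

    /* Same ordering as above: passthrough and flush bypass any scheduler. */
    static enum toy_target pick_insert_target(const struct toy_rq *rq, bool has_elevator)
    {
            if (rq->passthrough)
                    return TOY_DISPATCH;
            if (rq->is_flush)
                    return TOY_DISPATCH_HEAD;
            if (has_elevator)
                    return TOY_ELEVATOR;
            return TOY_SW_QUEUE;
    }

    int main(void)
    {
            struct toy_rq flush = { .is_flush = true };

            printf("flush -> %d\n", pick_insert_target(&flush, true));     /* 1 */
            return 0;
    }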
static void blk_mq_bio_to_request(struct request *rq, struct bio *bio,
unsigned int nr_segs)
{
+ int err;
+
if (bio->bi_opf & REQ_RAHEAD)
rq->cmd_flags |= REQ_FAILFAST_MASK;
rq->__sector = bio->bi_iter.bi_sector;
- rq->write_hint = bio->bi_write_hint;
blk_rq_bio_prep(rq, bio, nr_segs);
- blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
+
+ /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */
+ err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO);
+ WARN_ON_ONCE(err);
blk_account_io_start(rq);
}
static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie, bool last)
+ struct request *rq, bool last)
{
struct request_queue *q = rq->q;
struct blk_mq_queue_data bd = {
.rq = rq,
.last = last,
};
- blk_qc_t new_cookie;
blk_status_t ret;
- new_cookie = request_to_qc_t(hctx, rq);
-
/*
* For OK queue, we are done. For error, caller may kill it.
* Any other error (busy), just add it to our list as we
@@ -1856,7 +2612,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
switch (ret) {
case BLK_STS_OK:
blk_mq_update_dispatch_busy(hctx, false);
- *cookie = new_cookie;
break;
case BLK_STS_RESOURCE:
case BLK_STS_DEV_RESOURCE:
@@ -1865,59 +2620,31 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx,
break;
default:
blk_mq_update_dispatch_busy(hctx, false);
- *cookie = BLK_QC_T_NONE;
break;
}
return ret;
}
-static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq,
- blk_qc_t *cookie,
- bool bypass_insert, bool last)
+static bool blk_mq_get_budget_and_tag(struct request *rq)
{
- struct request_queue *q = rq->q;
- bool run_queue = true;
-
- /*
- * RCU or SRCU read lock is needed before checking quiesced flag.
- *
- * When queue is stopped or quiesced, ignore 'bypass_insert' from
- * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller,
- * and avoid driver to try to dispatch again.
- */
- if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) {
- run_queue = false;
- bypass_insert = false;
- goto insert;
- }
-
- if (q->elevator && !bypass_insert)
- goto insert;
-
- if (!blk_mq_get_dispatch_budget(hctx))
- goto insert;
+ int budget_token;
+ budget_token = blk_mq_get_dispatch_budget(rq->q);
+ if (budget_token < 0)
+ return false;
+ blk_mq_set_rq_budget_token(rq, budget_token);
if (!blk_mq_get_driver_tag(rq)) {
- blk_mq_put_dispatch_budget(hctx);
- goto insert;
+ blk_mq_put_dispatch_budget(rq->q, budget_token);
+ return false;
}
-
- return __blk_mq_issue_directly(hctx, rq, cookie, last);
-insert:
- if (bypass_insert)
- return BLK_STS_RESOURCE;
-
- blk_mq_request_bypass_insert(rq, false, run_queue);
- return BLK_STS_OK;
+ return true;
}
/**
* blk_mq_try_issue_directly - Try to send a request directly to device driver.
* @hctx: Pointer of the associated hardware queue.
* @rq: Pointer to request to be sent.
- * @cookie: Request queue cookie.
*
* If the device has enough resources to accept a new request now, send the
* request directly to device driver. Else, insert at hctx->dispatch queue, so
@@ -1925,88 +2652,299 @@ insert:
* queue have higher priority.
*/
static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx,
- struct request *rq, blk_qc_t *cookie)
+ struct request *rq)
{
blk_status_t ret;
- int srcu_idx;
- might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING);
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
+ blk_mq_insert_request(rq, 0);
+ return;
+ }
- hctx_lock(hctx, &srcu_idx);
+ if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) {
+ blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT);
+ return;
+ }
- ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true);
- if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
- blk_mq_request_bypass_insert(rq, false, true);
- else if (ret != BLK_STS_OK)
+ ret = __blk_mq_issue_directly(hctx, rq, true);
+ switch (ret) {
+ case BLK_STS_OK:
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
+ break;
+ default:
blk_mq_end_request(rq, ret);
-
- hctx_unlock(hctx, srcu_idx);
+ break;
+ }
}
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
+static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last)
{
- blk_status_t ret;
- int srcu_idx;
- blk_qc_t unused_cookie;
struct blk_mq_hw_ctx *hctx = rq->mq_hctx;
- hctx_lock(hctx, &srcu_idx);
- ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last);
- hctx_unlock(hctx, srcu_idx);
+ if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) {
+ blk_mq_insert_request(rq, 0);
+ return BLK_STS_OK;
+ }
- return ret;
+ if (!blk_mq_get_budget_and_tag(rq))
+ return BLK_STS_RESOURCE;
+ return __blk_mq_issue_directly(hctx, rq, last);
}
-void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
+static void blk_mq_plug_issue_direct(struct blk_plug *plug)
+{
+ struct blk_mq_hw_ctx *hctx = NULL;
+ struct request *rq;
+ int queued = 0;
+ blk_status_t ret = BLK_STS_OK;
+
+ while ((rq = rq_list_pop(&plug->mq_list))) {
+ bool last = rq_list_empty(plug->mq_list);
+
+ if (hctx != rq->mq_hctx) {
+ if (hctx) {
+ blk_mq_commit_rqs(hctx, queued, false);
+ queued = 0;
+ }
+ hctx = rq->mq_hctx;
+ }
+
+ ret = blk_mq_request_issue_directly(rq, last);
+ switch (ret) {
+ case BLK_STS_OK:
+ queued++;
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, 0);
+ blk_mq_run_hw_queue(hctx, false);
+ goto out;
+ default:
+ blk_mq_end_request(rq, ret);
+ break;
+ }
+ }
+
+out:
+ if (ret != BLK_STS_OK)
+ blk_mq_commit_rqs(hctx, queued, false);
+}
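The loop above issues plugged requests directly and notifies the driver once per hardware queue: a commit closes the previous batch whenever the hctx changes, and the final request carries the "last" hint instead. A hedged sketch of that batching shape with invented toy types (simplified: the kernel only emits the trailing commit when the walk stopped on an error):

    #include <stdio.h>

    struct toy_req { int hctx_id; };

    static void toy_commit(int hctx_id, int queued)
    {
            if (queued)
                    printf("commit %d request(s) to hctx %d\n", queued, hctx_id);
    }

    /* Issue a flat array of requests, committing each per-hctx batch. */
    static void issue_all(const struct toy_req *reqs, int nr)
    {
            int cur = -1, queued = 0;

            for (int i = 0; i < nr; i++) {
                    if (reqs[i].hctx_id != cur) {
                            toy_commit(cur, queued);        /* close the previous batch */
                            cur = reqs[i].hctx_id;
                            queued = 0;
                    }
                    queued++;                               /* "issued" to the driver */
            }
            toy_commit(cur, queued);                        /* final batch */
    }

    int main(void)
    {
            struct toy_req reqs[] = { {0}, {0}, {1}, {1}, {1} };

            issue_all(reqs, 5);             /* commits 2 to hctx 0, then 3 to hctx 1 */
            return 0;
    }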
+
+static void __blk_mq_flush_plug_list(struct request_queue *q,
+ struct blk_plug *plug)
+{
+ if (blk_queue_quiesced(q))
+ return;
+ q->mq_ops->queue_rqs(&plug->mq_list);
+}
+
+static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched)
+{
+ struct blk_mq_hw_ctx *this_hctx = NULL;
+ struct blk_mq_ctx *this_ctx = NULL;
+ struct request *requeue_list = NULL;
+ struct request **requeue_lastp = &requeue_list;
+ unsigned int depth = 0;
+ bool is_passthrough = false;
+ LIST_HEAD(list);
+
+ do {
+ struct request *rq = rq_list_pop(&plug->mq_list);
+
+ if (!this_hctx) {
+ this_hctx = rq->mq_hctx;
+ this_ctx = rq->mq_ctx;
+ is_passthrough = blk_rq_is_passthrough(rq);
+ } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx ||
+ is_passthrough != blk_rq_is_passthrough(rq)) {
+ rq_list_add_tail(&requeue_lastp, rq);
+ continue;
+ }
+ list_add(&rq->queuelist, &list);
+ depth++;
+ } while (!rq_list_empty(plug->mq_list));
+
+ plug->mq_list = requeue_list;
+ trace_block_unplug(this_hctx->queue, depth, !from_sched);
+
+ percpu_ref_get(&this_hctx->queue->q_usage_counter);
+ /* passthrough requests should never be issued to the I/O scheduler */
+ if (is_passthrough) {
+ spin_lock(&this_hctx->lock);
+ list_splice_tail_init(&list, &this_hctx->dispatch);
+ spin_unlock(&this_hctx->lock);
+ blk_mq_run_hw_queue(this_hctx, from_sched);
+ } else if (this_hctx->queue->elevator) {
+ this_hctx->queue->elevator->type->ops.insert_requests(this_hctx,
+ &list, 0);
+ blk_mq_run_hw_queue(this_hctx, from_sched);
+ } else {
+ blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched);
+ }
+ percpu_ref_put(&this_hctx->queue->q_usage_counter);
+}
+
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
+{
+ struct request *rq;
+
+ /*
+ * We may have been called recursively midway through handling
+ * plug->mq_list via a schedule() in the driver's queue_rq() callback.
+ * To avoid mq_list changing under our feet, clear rq_count early and
+ * bail out specifically if rq_count is 0 rather than checking
+ * whether the mq_list is empty.
+ */
+ if (plug->rq_count == 0)
+ return;
+ plug->rq_count = 0;
+
+ if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) {
+ struct request_queue *q;
+
+ rq = rq_list_peek(&plug->mq_list);
+ q = rq->q;
+
+ /*
+ * Peek at the first request and see if we have a ->queue_rqs()
+ * hook. If we do, we can dispatch the whole plug list in one go.
+ * We already know at this point that all requests belong to the
+ * same queue; the caller must ensure that is the case.
+ */
+ if (q->mq_ops->queue_rqs) {
+ blk_mq_run_dispatch_ops(q,
+ __blk_mq_flush_plug_list(q, plug));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
+
+ blk_mq_run_dispatch_ops(q,
+ blk_mq_plug_issue_direct(plug));
+ if (rq_list_empty(plug->mq_list))
+ return;
+ }
+
+ do {
+ blk_mq_dispatch_plug_list(plug, from_schedule);
+ } while (!rq_list_empty(plug->mq_list));
+}
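The rq_count handling at the top of blk_mq_flush_plug_list() is a re-entrancy guard: the count is cleared before any work is done, so a recursive call entered via a schedule() inside ->queue_rq() sees zero and returns immediately. A small, self-contained illustration of the idiom (single-threaded, with an invented toy_plug type):

    #include <stdio.h>

    struct toy_plug { int rq_count; };

    static void flush_plug(struct toy_plug *plug, int depth)
    {
            if (plug->rq_count == 0)
                    return;                 /* recursive call: nothing to do */
            plug->rq_count = 0;             /* clear before doing any work */

            printf("flushing at depth %d\n", depth);
            if (depth == 0)
                    flush_plug(plug, 1);    /* simulated re-entry: returns at once */
    }

    int main(void)
    {
            struct toy_plug plug = { .rq_count = 3 };

            flush_plug(&plug, 0);           /* prints exactly once */
            return 0;
    }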
+
+static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
struct list_head *list)
{
int queued = 0;
+ blk_status_t ret = BLK_STS_OK;
while (!list_empty(list)) {
- blk_status_t ret;
struct request *rq = list_first_entry(list, struct request,
queuelist);
list_del_init(&rq->queuelist);
ret = blk_mq_request_issue_directly(rq, list_empty(list));
- if (ret != BLK_STS_OK) {
- if (ret == BLK_STS_RESOURCE ||
- ret == BLK_STS_DEV_RESOURCE) {
- blk_mq_request_bypass_insert(rq, false,
- list_empty(list));
- break;
- }
- blk_mq_end_request(rq, ret);
- } else
+ switch (ret) {
+ case BLK_STS_OK:
queued++;
+ break;
+ case BLK_STS_RESOURCE:
+ case BLK_STS_DEV_RESOURCE:
+ blk_mq_request_bypass_insert(rq, 0);
+ if (list_empty(list))
+ blk_mq_run_hw_queue(hctx, false);
+ goto out;
+ default:
+ blk_mq_end_request(rq, ret);
+ break;
+ }
}
- /*
- * If we didn't flush the entire list, we could have told
- * the driver there was more coming, but that turned out to
- * be a lie.
- */
- if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued)
- hctx->queue->mq_ops->commit_rqs(hctx);
+out:
+ if (ret != BLK_STS_OK)
+ blk_mq_commit_rqs(hctx, queued, false);
}
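The removed comment a few lines up spells out the contract this loop still honors: all but the final request are issued with last == false, a promise that more is coming, so if the walk stops early the driver must still be told to start via a commit. A sketch of that contract against an invented mini driver interface (drv_queue/drv_commit are stand-ins, not real APIs):

    #include <stdbool.h>
    #include <stdio.h>

    /* Invented mini "driver": it may defer its doorbell until last/commit. */
    static void drv_queue(int id, bool last) { printf("queue %d last=%d\n", id, last); }
    static void drv_commit(void)             { printf("commit (ring doorbell)\n"); }

    static void issue_list(const int *ids, int nr, int fail_at)
    {
            int queued = 0;

            for (int i = 0; i < nr; i++) {
                    if (i == fail_at)
                            break;                          /* e.g. out of driver resources */
                    drv_queue(ids[i], i == nr - 1);         /* only the final one says "last" */
                    queued++;
            }
            /* Stopped early after promising more? Tell the driver to start anyway. */
            if (queued && queued < nr)
                    drv_commit();
    }

    int main(void)
    {
            int ids[] = { 1, 2, 3 };

            issue_list(ids, 3, 2);          /* queues 1 and 2, then commits */
            return 0;
    }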
-static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
+static bool blk_mq_attempt_bio_merge(struct request_queue *q,
+ struct bio *bio, unsigned int nr_segs)
{
- list_add_tail(&rq->queuelist, &plug->mq_list);
- plug->rq_count++;
- if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) {
- struct request *tmp;
+ if (!blk_queue_nomerges(q) && bio_mergeable(bio)) {
+ if (blk_attempt_plug_merge(q, bio, nr_segs))
+ return true;
+ if (blk_mq_sched_bio_merge(q, bio, nr_segs))
+ return true;
+ }
+ return false;
+}
+
+static struct request *blk_mq_get_new_requests(struct request_queue *q,
+ struct blk_plug *plug,
+ struct bio *bio,
+ unsigned int nsegs)
+{
+ struct blk_mq_alloc_data data = {
+ .q = q,
+ .nr_tags = 1,
+ .cmd_flags = bio->bi_opf,
+ };
+ struct request *rq;
+
+ if (blk_mq_attempt_bio_merge(q, bio, nsegs))
+ return NULL;
+
+ rq_qos_throttle(q, bio);
- tmp = list_first_entry(&plug->mq_list, struct request,
- queuelist);
- if (tmp->q != rq->q)
- plug->multiple_queues = true;
+ if (plug) {
+ data.nr_tags = plug->nr_ios;
+ plug->nr_ios = 1;
+ data.cached_rq = &plug->cached_rq;
}
+
+ rq = __blk_mq_alloc_requests(&data);
+ if (rq)
+ return rq;
+ rq_qos_cleanup(q, bio);
+ if (bio->bi_opf & REQ_NOWAIT)
+ bio_wouldblock_error(bio);
+ return NULL;
+}
+
+/*
+ * Check whether the passed-in request can be used to submit the passed-in
+ * bio, and remove it from the cached request list if it can.
+ */
+static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug,
+ struct bio *bio)
+{
+ enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf);
+ enum hctx_type hctx_type = rq->mq_hctx->type;
+
+ WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq);
+
+ if (type != hctx_type &&
+ !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT))
+ return false;
+ if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf))
+ return false;
+
+ /*
+ * If any qos ->throttle() ends up blocking, we will have flushed the
+ * plug and hence killed the cached_rq list as well. Pop this entry
+ * before we throttle.
+ */
+ plug->cached_rq = rq_list_next(rq);
+ rq_qos_throttle(rq->q, bio);
+
+ blk_mq_rq_time_init(rq, 0);
+ rq->cmd_flags = bio->bi_opf;
+ INIT_LIST_HEAD(&rq->queuelist);
+ return true;
}
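blk_mq_use_cached_rq() only reuses a plugged, pre-allocated request when it is compatible with the incoming bio: the hctx type must match (with a default-type request also allowed to serve a plain read), and both must agree on whether they are flushes. A simplified compatibility check with invented enum values:

    #include <assert.h>
    #include <stdbool.h>

    enum toy_hctx_type { TOY_DEFAULT, TOY_READ, TOY_POLL };

    /* Can a cached request allocated for rq_type serve a bio of bio_type? */
    static bool types_compatible(enum toy_hctx_type bio_type, enum toy_hctx_type rq_type)
    {
            if (bio_type == rq_type)
                    return true;
            /* a request from the default queue may also serve a plain read */
            return bio_type == TOY_READ && rq_type == TOY_DEFAULT;
    }

    static bool can_reuse(enum toy_hctx_type bio_type, bool bio_is_flush,
                          enum toy_hctx_type rq_type, bool rq_is_flush)
    {
            return types_compatible(bio_type, rq_type) && bio_is_flush == rq_is_flush;
    }

    int main(void)
    {
            assert(can_reuse(TOY_READ, false, TOY_DEFAULT, false));
            assert(!can_reuse(TOY_POLL, false, TOY_DEFAULT, false));
            return 0;
    }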
/**
- * blk_mq_make_request - Create and send a request to block device.
- * @q: Request queue pointer.
+ * blk_mq_submit_bio - Create and send a request to block device.
* @bio: Bio pointer.
*
 * Builds up a request structure from @q and @bio and sends it to the device. The
@@ -2017,143 +2955,323 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq)
*
* It will not queue the request if there is an error with the bio, or at the
* request creation.
- *
- * Returns: Request queue cookie.
*/
-blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio)
+void blk_mq_submit_bio(struct bio *bio)
{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct blk_plug *plug = blk_mq_plug(bio);
const int is_sync = op_is_sync(bio->bi_opf);
- const int is_flush_fua = op_is_flush(bio->bi_opf);
- struct blk_mq_alloc_data data = {
- .q = q,
- };
- struct request *rq;
- struct blk_plug *plug;
- struct request *same_queue_rq = NULL;
- unsigned int nr_segs;
- blk_qc_t cookie;
+ struct blk_mq_hw_ctx *hctx;
+ struct request *rq = NULL;
+ unsigned int nr_segs = 1;
blk_status_t ret;
- blk_queue_bounce(q, &bio);
- __blk_queue_split(q, &bio, &nr_segs);
-
- if (!bio_integrity_prep(bio))
- goto queue_exit;
+ bio = blk_queue_bounce(bio, q);
- if (!is_flush_fua && !blk_queue_nomerges(q) &&
- blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq))
- goto queue_exit;
-
- if (blk_mq_sched_bio_merge(q, bio, nr_segs))
- goto queue_exit;
-
- rq_qos_throttle(q, bio);
+ if (plug) {
+ rq = rq_list_peek(&plug->cached_rq);
+ if (rq && rq->q != q)
+ rq = NULL;
+ }
+ if (rq) {
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ return;
+ }
+ if (!bio_integrity_prep(bio))
+ return;
+ if (blk_mq_attempt_bio_merge(q, bio, nr_segs))
+ return;
+ if (blk_mq_use_cached_rq(rq, plug, bio))
+ goto done;
+ percpu_ref_get(&q->q_usage_counter);
+ } else {
+ if (unlikely(bio_queue_enter(bio)))
+ return;
+ if (unlikely(bio_may_exceed_limits(bio, &q->limits))) {
+ bio = __bio_split_to_limits(bio, &q->limits, &nr_segs);
+ if (!bio)
+ goto fail;
+ }
+ if (!bio_integrity_prep(bio))
+ goto fail;
+ }
- data.cmd_flags = bio->bi_opf;
- rq = __blk_mq_alloc_request(&data);
+ rq = blk_mq_get_new_requests(q, plug, bio, nr_segs);
if (unlikely(!rq)) {
- rq_qos_cleanup(q, bio);
- if (bio->bi_opf & REQ_NOWAIT)
- bio_wouldblock_error(bio);
- goto queue_exit;
+fail:
+ blk_queue_exit(q);
+ return;
}
- trace_block_getrq(q, bio, bio->bi_opf);
+done:
+ trace_block_getrq(bio);
rq_qos_track(q, rq, bio);
- cookie = request_to_qc_t(data.hctx, rq);
-
blk_mq_bio_to_request(rq, bio, nr_segs);
- ret = blk_crypto_init_request(rq);
+ ret = blk_crypto_rq_get_keyslot(rq);
if (ret != BLK_STS_OK) {
bio->bi_status = ret;
bio_endio(bio);
blk_mq_free_request(rq);
- return BLK_QC_T_NONE;
+ return;
}
- plug = blk_mq_plug(q, bio);
- if (unlikely(is_flush_fua)) {
- /* Bypass scheduler for flush requests */
- blk_insert_flush(rq);
- blk_mq_run_hw_queue(data.hctx, true);
- } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs ||
- !blk_queue_nonrot(q))) {
- /*
- * Use plugging if we have a ->commit_rqs() hook as well, as
- * we know the driver uses bd->last in a smart fashion.
- *
- * Use normal plugging if this disk is slow HDD, as sequential
- * IO may benefit a lot from plug merging.
- */
- unsigned int request_count = plug->rq_count;
- struct request *last = NULL;
+ if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq))
+ return;
- if (!request_count)
- trace_block_plug(q);
- else
- last = list_entry_rq(plug->mq_list.prev);
+ if (plug) {
+ blk_add_rq_to_plug(plug, rq);
+ return;
+ }
- if (request_count >= BLK_MAX_REQUEST_COUNT || (last &&
- blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) {
- blk_flush_plug_list(plug, false);
- trace_block_plug(q);
- }
+ hctx = rq->mq_hctx;
+ if ((rq->rq_flags & RQF_USE_SCHED) ||
+ (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) {
+ blk_mq_insert_request(rq, 0);
+ blk_mq_run_hw_queue(hctx, true);
+ } else {
+ blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq));
+ }
+}
- blk_add_rq_to_plug(plug, rq);
- } else if (q->elevator) {
- /* Insert the request at the IO scheduler queue */
- blk_mq_sched_insert_request(rq, false, true, true);
- } else if (plug && !blk_queue_nomerges(q)) {
+#ifdef CONFIG_BLK_MQ_STACKING
+/**
+ * blk_insert_cloned_request - Helper for stacking drivers to submit a request
+ * @rq: the request being queued
+ */
+blk_status_t blk_insert_cloned_request(struct request *rq)
+{
+ struct request_queue *q = rq->q;
+ unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq));
+ unsigned int max_segments = blk_rq_get_max_segments(rq);
+ blk_status_t ret;
+
+ if (blk_rq_sectors(rq) > max_sectors) {
/*
- * We do limited plugging. If the bio can be merged, do that.
- * Otherwise the existing request in the plug list will be
- * issued. So the plug list will have one request at most
- * The plug list might get flushed before this. If that happens,
- * the plug list is empty, and same_queue_rq is invalid.
+ * A SCSI device has no good way to report whether Write
+ * Same/Zero is actually supported. If a device rejects a
+ * non-read/write command (discard, write same, etc.), the
+ * low-level driver sets the relevant queue limit to 0 to
+ * prevent blk-lib from issuing more of the offending
+ * operations. Commands queued prior to the queue limit being
+ * reset need to be completed with BLK_STS_NOTSUPP to avoid
+ * propagating I/O errors to upper layers.
*/
- if (list_empty(&plug->mq_list))
- same_queue_rq = NULL;
- if (same_queue_rq) {
- list_del_init(&same_queue_rq->queuelist);
- plug->rq_count--;
+ if (max_sectors == 0)
+ return BLK_STS_NOTSUPP;
+
+ printk(KERN_ERR "%s: over max size limit. (%u > %u)\n",
+ __func__, blk_rq_sectors(rq), max_sectors);
+ return BLK_STS_IOERR;
+ }
+
+ /*
+ * The queue settings related to segment counting may differ from the
+ * original queue.
+ */
+ rq->nr_phys_segments = blk_recalc_rq_segments(rq);
+ if (rq->nr_phys_segments > max_segments) {
+ printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n",
+ __func__, rq->nr_phys_segments, max_segments);
+ return BLK_STS_IOERR;
+ }
+
+ if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq)))
+ return BLK_STS_IOERR;
+
+ ret = blk_crypto_rq_get_keyslot(rq);
+ if (ret != BLK_STS_OK)
+ return ret;
+
+ blk_account_io_start(rq);
+
+ /*
+ * Since we have a scheduler attached on the top device,
+ * bypass a potential scheduler on the bottom device for
+ * insert.
+ */
+ blk_mq_run_dispatch_ops(q,
+ ret = blk_mq_request_issue_directly(rq, true));
+ if (ret)
+ blk_account_io_done(rq, ktime_get_ns());
+ return ret;
+}
+EXPORT_SYMBOL_GPL(blk_insert_cloned_request);
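blk_insert_cloned_request() re-validates the clone against the bottom queue's limits, since those may be stricter than the top device's. A user-space sketch of the same two size checks (toy_limits/toy_clone are invented; the real code additionally distinguishes "not supported" from an I/O error when max_sectors is zero):

    #include <stdbool.h>
    #include <stdio.h>

    struct toy_limits { unsigned int max_sectors, max_segments; };
    struct toy_clone  { unsigned int sectors, segments; };

    /* Returns true when the clone fits the lower device's limits. */
    static bool clone_fits(const struct toy_clone *rq, const struct toy_limits *lim)
    {
            if (lim->max_sectors == 0)              /* op disabled after a rejection */
                    return false;
            if (rq->sectors > lim->max_sectors)
                    return false;
            return rq->segments <= lim->max_segments;
    }

    int main(void)
    {
            struct toy_limits lim = { .max_sectors = 256, .max_segments = 32 };
            struct toy_clone rq = { .sectors = 512, .segments = 8 };

            printf("fits: %d\n", clone_fits(&rq, &lim));    /* 0: too many sectors */
            return 0;
    }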
+
+/**
+ * blk_rq_unprep_clone - Helper function to free all bios in a cloned request
+ * @rq: the clone request to be cleaned up
+ *
+ * Description:
+ * Free all bios in @rq for a cloned request.
+ */
+void blk_rq_unprep_clone(struct request *rq)
+{
+ struct bio *bio;
+
+ while ((bio = rq->bio) != NULL) {
+ rq->bio = bio->bi_next;
+
+ bio_put(bio);
+ }
+}
+EXPORT_SYMBOL_GPL(blk_rq_unprep_clone);
+
+/**
+ * blk_rq_prep_clone - Helper function to setup clone request
+ * @rq: the request to be setup
+ * @rq_src: original request to be cloned
+ * @bs: bio_set that bios for clone are allocated from
+ * @gfp_mask: memory allocation mask for bio
+ * @bio_ctr: setup function to be called for each clone bio.
+ * Returns %0 for success, non %0 for failure.
+ * @data: private data to be passed to @bio_ctr
+ *
+ * Description:
+ * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq.
+ * The pages that the original bios point to are not copied; the
+ * cloned bios simply point to the same pages. Cloned bios must
+ * therefore be completed before the original ones, i.e. the
+ * caller must complete @rq before @rq_src.
+ */
+int blk_rq_prep_clone(struct request *rq, struct request *rq_src,
+ struct bio_set *bs, gfp_t gfp_mask,
+ int (*bio_ctr)(struct bio *, struct bio *, void *),
+ void *data)
+{
+ struct bio *bio, *bio_src;
+
+ if (!bs)
+ bs = &fs_bio_set;
+
+ __rq_for_each_bio(bio_src, rq_src) {
+ bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask,
+ bs);
+ if (!bio)
+ goto free_and_out;
+
+ if (bio_ctr && bio_ctr(bio, bio_src, data))
+ goto free_and_out;
+
+ if (rq->bio) {
+ rq->biotail->bi_next = bio;
+ rq->biotail = bio;
+ } else {
+ rq->bio = rq->biotail = bio;
}
- blk_add_rq_to_plug(plug, rq);
- trace_block_plug(q);
+ bio = NULL;
+ }
+
+ /* Copy attributes of the original request to the clone request. */
+ rq->__sector = blk_rq_pos(rq_src);
+ rq->__data_len = blk_rq_bytes(rq_src);
+ if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) {
+ rq->rq_flags |= RQF_SPECIAL_PAYLOAD;
+ rq->special_vec = rq_src->special_vec;
+ }
+ rq->nr_phys_segments = rq_src->nr_phys_segments;
+ rq->ioprio = rq_src->ioprio;
+
+ if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0)
+ goto free_and_out;
+
+ return 0;
+
+free_and_out:
+ if (bio)
+ bio_put(bio);
+ blk_rq_unprep_clone(rq);
- if (same_queue_rq) {
- data.hctx = same_queue_rq->mq_hctx;
- trace_block_unplug(q, 1, true);
- blk_mq_try_issue_directly(data.hctx, same_queue_rq,
- &cookie);
+ return -ENOMEM;
+}
+EXPORT_SYMBOL_GPL(blk_rq_prep_clone);
+#endif /* CONFIG_BLK_MQ_STACKING */
+
+/*
+ * Steal bios from a request and add them to a bio list.
+ * The request must not have been partially completed before.
+ */
+void blk_steal_bios(struct bio_list *list, struct request *rq)
+{
+ if (rq->bio) {
+ if (list->tail)
+ list->tail->bi_next = rq->bio;
+ else
+ list->head = rq->bio;
+ list->tail = rq->biotail;
+
+ rq->bio = NULL;
+ rq->biotail = NULL;
+ }
+
+ rq->__data_len = 0;
+}
+EXPORT_SYMBOL_GPL(blk_steal_bios);
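blk_steal_bios() moves a request's singly linked bio chain onto the tail of a bio list in O(1) by reusing the request's own tail pointer. The same splice on generic, invented types:

    #include <stddef.h>
    #include <stdio.h>

    struct toy_bio  { int id; struct toy_bio *next; };
    struct toy_list { struct toy_bio *head, *tail; };
    struct toy_req  { struct toy_bio *bio, *biotail; };

    /* Splice the whole chain rq->bio .. rq->biotail onto the tail of list in O(1). */
    static void steal_bios(struct toy_list *list, struct toy_req *rq)
    {
            if (!rq->bio)
                    return;
            if (list->tail)
                    list->tail->next = rq->bio;
            else
                    list->head = rq->bio;
            list->tail = rq->biotail;

            rq->bio = rq->biotail = NULL;   /* the request no longer owns the bios */
    }

    int main(void)
    {
            struct toy_bio b1 = { 1, NULL }, b2 = { 2, NULL };
            struct toy_req rq = { &b1, &b2 };
            struct toy_list list = { NULL, NULL };

            b1.next = &b2;
            steal_bios(&list, &rq);
            for (struct toy_bio *b = list.head; b; b = b->next)
                    printf("bio %d\n", b->id);
            return 0;
    }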
+
+static size_t order_to_size(unsigned int order)
+{
+ return (size_t)PAGE_SIZE << order;
+}
+
+/* called before freeing request pool in @tags */
+static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags,
+ struct blk_mq_tags *tags)
+{
+ struct page *page;
+ unsigned long flags;
+
+ /*
+ * There is no need to clear the mapping if the driver tags are not
+ * initialized or the mapping already belongs to the driver tags.
+ */
+ if (!drv_tags || drv_tags == tags)
+ return;
+
+ list_for_each_entry(page, &tags->page_list, lru) {
+ unsigned long start = (unsigned long)page_address(page);
+ unsigned long end = start + order_to_size(page->private);
+ int i;
+
+ for (i = 0; i < drv_tags->nr_tags; i++) {
+ struct request *rq = drv_tags->rqs[i];
+ unsigned long rq_addr = (unsigned long)rq;
+
+ if (rq_addr >= start && rq_addr < end) {
+ WARN_ON_ONCE(req_ref_read(rq) != 0);
+ cmpxchg(&drv_tags->rqs[i], rq, NULL);
+ }
}
- } else if ((q->nr_hw_queues > 1 && is_sync) ||
- !data.hctx->dispatch_busy) {
- /*
- * There is no scheduler and we can try to send directly
- * to the hardware.
- */
- blk_mq_try_issue_directly(data.hctx, rq, &cookie);
- } else {
- /* Default case. */
- blk_mq_sched_insert_request(rq, false, true, true);
}
- return cookie;
-queue_exit:
- blk_queue_exit(q);
- return BLK_QC_T_NONE;
+ /*
+ * Wait until any iteration still in progress has finished.
+ *
+ * The request references have been cleared, and the clearing is
+ * guaranteed to be observed once ->lock has been released.
+ */
+ spin_lock_irqsave(&drv_tags->lock, flags);
+ spin_unlock_irqrestore(&drv_tags->lock, flags);
}
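The seemingly empty lock/unlock pair above is a wait-for-readers barrier: once the writer can take the lock that iterators take while looking up entries in tags->rqs[], any lookup that began before the pointers were cleared has finished, and later lookups will observe the cleared entries. A hedged pthreads model of the idiom; the kernel uses a spinlock with interrupts disabled, and a mutex here is only a stand-in:

    #include <pthread.h>

    /*
     * Writer side of the idiom: after clearing the shared pointers, take and
     * release the same lock the readers hold while dereferencing them. The
     * moment the lock is acquired, every reader that started before the clear
     * has already left its critical section.
     */
    static void wait_for_in_flight_readers(pthread_mutex_t *readers_lock)
    {
            pthread_mutex_lock(readers_lock);
            pthread_mutex_unlock(readers_lock);
    }

    int main(void)
    {
            pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;

            /* ... clear the shared pointers here ... */
            wait_for_in_flight_readers(&lock);
            return 0;
    }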
-EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx)
{
+ struct blk_mq_tags *drv_tags;
struct page *page;
- if (tags->rqs && set->ops->exit_request) {
+ if (list_empty(&tags->page_list))
+ return;
+
+ if (blk_mq_is_shared_tags(set->flags))
+ drv_tags = set->shared_tags;
+ else
+ drv_tags = set->tags[hctx_idx];
+
+ if (tags->static_rqs && set->ops->exit_request) {
int i;
for (i = 0; i < tags->nr_tags; i++) {
@@ -2166,6 +3284,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
}
}
+ blk_mq_clear_rq_mapping(drv_tags, tags);
+
while (!list_empty(&tags->page_list)) {
page = list_first_entry(&tags->page_list, struct page, lru);
list_del_init(&page->lru);
@@ -2188,15 +3308,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags)
blk_mq_free_tags(tags);
}
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx,
- unsigned int nr_tags,
- unsigned int reserved_tags)
+static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
{
+ int i;
+
+ for (i = 0; i < set->nr_maps; i++) {
+ unsigned int start = set->map[i].queue_offset;
+ unsigned int end = start + set->map[i].nr_queues;
+
+ if (hctx_idx >= start && hctx_idx < end)
+ * If the hw queue isn't busy, try to issue requests directly to save
+ * an extra enqueue & dequeue to the sw queue.
+
+ if (i >= set->nr_maps)
+ i = HCTX_TYPE_DEFAULT;
+
+ return i;
+}
+
+static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ enum hctx_type type = hctx_idx_to_type(set, hctx_idx);
+
+ return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx);
+}
+
+static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int nr_tags,
+ unsigned int reserved_tags)
+{
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
struct blk_mq_tags *tags;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2208,26 +3354,22 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->rqs) {
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->rqs)
+ goto err_free_tags;
tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *),
GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY,
node);
- if (!tags->static_rqs) {
- kfree(tags->rqs);
- blk_mq_free_tags(tags);
- return NULL;
- }
+ if (!tags->static_rqs)
+ goto err_free_rqs;
return tags;
-}
-static size_t order_to_size(unsigned int order)
-{
- return (size_t)PAGE_SIZE << order;
+err_free_rqs:
+ kfree(tags->rqs);
+err_free_tags:
+ blk_mq_free_tags(tags);
+ return NULL;
}
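The reworked error handling above is the usual kernel "goto unwind" ladder: each failure label releases exactly what was set up before it, in reverse order, so there is a single allocation path and a single cleanup path. A generic, self-contained example of the pattern:

    #include <stdlib.h>

    struct pair { int *a; int *b; };

    /* Allocate both members or nothing, unwinding in reverse order on failure. */
    static struct pair *pair_alloc(size_t n)
    {
            struct pair *p = malloc(sizeof(*p));
            if (!p)
                    return NULL;

            p->a = calloc(n, sizeof(*p->a));
            if (!p->a)
                    goto err_free_pair;

            p->b = calloc(n, sizeof(*p->b));
            if (!p->b)
                    goto err_free_a;

            return p;

    err_free_a:
            free(p->a);
    err_free_pair:
            free(p);
            return NULL;
    }

    int main(void)
    {
            struct pair *p = pair_alloc(8);

            if (p) {
                    free(p->a);
                    free(p->b);
                    free(p);
            }
            return 0;
    }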
static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
@@ -2245,14 +3387,14 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq,
return 0;
}
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx, unsigned int depth)
+static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx, unsigned int depth)
{
unsigned int i, j, entries_per_page, max_order = 4;
+ int node = blk_mq_get_hctx_node(set, hctx_idx);
size_t rq_size, left;
- int node;
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx);
if (node == NUMA_NO_NODE)
node = set->numa_node;
@@ -2327,7 +3469,7 @@ struct rq_iter_data {
bool has_rq;
};
-static bool blk_mq_has_request(struct request *rq, void *data, bool reserved)
+static bool blk_mq_has_request(struct request *rq, void *data)
{
struct rq_iter_data *iter_data = data;
@@ -2352,7 +3494,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx)
static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu,
struct blk_mq_hw_ctx *hctx)
{
- if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu)
+ if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu)
return false;
if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids)
return false;
@@ -2448,22 +3590,58 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx)
&hctx->cpuhp_dead);
}
+/*
+ * Before freeing the hw queue, clear the flush request reference in
+ * tags->rqs[] to avoid a potential use-after-free.
+ */
+static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags,
+ unsigned int queue_depth, struct request *flush_rq)
+{
+ int i;
+ unsigned long flags;
+
+ /* The hw queue may not be mapped yet */
+ if (!tags)
+ return;
+
+ WARN_ON_ONCE(req_ref_read(flush_rq) != 0);
+
+ for (i = 0; i < queue_depth; i++)
+ cmpxchg(&tags->rqs[i], flush_rq, NULL);
+
+ /*
+ * Wait until any iteration still in progress has finished.
+ *
+ * The request references have been cleared, and the clearing is
+ * guaranteed to be observed once ->lock has been released.
+ */
+ spin_lock_irqsave(&tags->lock, flags);
+ spin_unlock_irqrestore(&tags->lock, flags);
+}
+
/* hctx->ctxs will be freed in queue's release handler */
static void blk_mq_exit_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
+ struct request *flush_rq = hctx->fq->flush_rq;
+
if (blk_mq_hw_queue_mapped(hctx))
blk_mq_tag_idle(hctx);
+ if (blk_queue_init_done(q))
+ blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx],
+ set->queue_depth, flush_rq);
if (set->ops->exit_request)
- set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
+ set->ops->exit_request(set, flush_rq, hctx_idx);
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
blk_mq_remove_cpuhp(hctx);
+ xa_erase(&q->hctx_table, hctx_idx);
+
spin_lock(&q->unused_hctx_lock);
list_add(&hctx->hctx_list, &q->unused_hctx_list);
spin_unlock(&q->unused_hctx_lock);
@@ -2473,30 +3651,15 @@ static void blk_mq_exit_hw_queues(struct request_queue *q,
struct blk_mq_tag_set *set, int nr_queue)
{
struct blk_mq_hw_ctx *hctx;
- unsigned int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
if (i == nr_queue)
break;
- blk_mq_debugfs_unregister_hctx(hctx);
blk_mq_exit_hctx(q, set, hctx, i);
}
}
-static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set)
-{
- int hw_ctx_size = sizeof(struct blk_mq_hw_ctx);
-
- BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu),
- __alignof__(struct blk_mq_hw_ctx)) !=
- sizeof(struct blk_mq_hw_ctx));
-
- if (tag_set->flags & BLK_MQ_F_BLOCKING)
- hw_ctx_size += sizeof(struct srcu_struct);
-
- return hw_ctx_size;
-}
-
static int blk_mq_init_hctx(struct request_queue *q,
struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx, unsigned hctx_idx)
@@ -2517,8 +3680,15 @@ static int blk_mq_init_hctx(struct request_queue *q,
if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx,
hctx->numa_node))
goto exit_hctx;
+
+ if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL))
+ goto exit_flush_rq;
+
return 0;
+ exit_flush_rq:
+ if (set->ops->exit_request)
+ set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx);
exit_hctx:
if (set->ops->exit_hctx)
set->ops->exit_hctx(hctx, hctx_idx);
@@ -2534,7 +3704,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
struct blk_mq_hw_ctx *hctx;
gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY;
- hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node);
+ hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node);
if (!hctx)
goto fail_alloc_hctx;
@@ -2550,7 +3720,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
spin_lock_init(&hctx->lock);
INIT_LIST_HEAD(&hctx->dispatch);
hctx->queue = q;
- hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED;
+ hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED;
INIT_LIST_HEAD(&hctx->hctx_list);
@@ -2564,7 +3734,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
goto free_cpumask;
if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8),
- gfp, node))
+ gfp, node, false, false))
goto free_ctxs;
hctx->nr_ctx = 0;
@@ -2576,8 +3746,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set,
if (!hctx->fq)
goto free_bitmap;
- if (hctx->flags & BLK_MQ_F_BLOCKING)
- init_srcu_struct(hctx->srcu);
blk_mq_hctx_kobj_init(hctx);
return hctx;
@@ -2619,44 +3787,69 @@ static void blk_mq_init_cpu_queues(struct request_queue *q,
for (j = 0; j < set->nr_maps; j++) {
hctx = blk_mq_map_queue_type(q, j, i);
if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
- hctx->numa_node = local_memory_node(cpu_to_node(i));
+ hctx->numa_node = cpu_to_node(i);
}
}
}
-static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set,
- int hctx_idx)
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx,
+ unsigned int depth)
{
- int ret = 0;
+ struct blk_mq_tags *tags;
+ int ret;
- set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx,
- set->queue_depth, set->reserved_tags);
- if (!set->tags[hctx_idx])
- return false;
+ tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags);
+ if (!tags)
+ return NULL;
+
+ ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth);
+ if (ret) {
+ blk_mq_free_rq_map(tags);
+ return NULL;
+ }
+
+ return tags;
+}
+
+static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ int hctx_idx)
+{
+ if (blk_mq_is_shared_tags(set->flags)) {
+ set->tags[hctx_idx] = set->shared_tags;
- ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx,
- set->queue_depth);
- if (!ret)
return true;
+ }
- blk_mq_free_rq_map(set->tags[hctx_idx]);
- set->tags[hctx_idx] = NULL;
- return false;
+ set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx,
+ set->queue_depth);
+
+ return set->tags[hctx_idx];
}
-static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set,
- unsigned int hctx_idx)
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx)
{
- if (set->tags && set->tags[hctx_idx]) {
- blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx);
- blk_mq_free_rq_map(set->tags[hctx_idx]);
- set->tags[hctx_idx] = NULL;
+ if (tags) {
+ blk_mq_free_rqs(set, tags, hctx_idx);
+ blk_mq_free_rq_map(tags);
}
}
+static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx)
+{
+ if (!blk_mq_is_shared_tags(set->flags))
+ blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx);
+
+ set->tags[hctx_idx] = NULL;
+}
+
static void blk_mq_map_swqueue(struct request_queue *q)
{
- unsigned int i, j, hctx_idx;
+ unsigned int j, hctx_idx;
+ unsigned long i;
struct blk_mq_hw_ctx *hctx;
struct blk_mq_ctx *ctx;
struct blk_mq_tag_set *set = q->tag_set;
@@ -2684,7 +3877,7 @@ static void blk_mq_map_swqueue(struct request_queue *q)
hctx_idx = set->map[j].mq_map[i];
/* unmapped hw queue can be remapped after CPU topo changed */
if (!set->tags[hctx_idx] &&
- !__blk_mq_alloc_map_and_request(set, hctx_idx)) {
+ !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) {
/*
 * If tags initialization fails for some hctx,
* that hctx won't be brought online. In this
@@ -2731,8 +3924,8 @@ static void blk_mq_map_swqueue(struct request_queue *q)
* fallback in case of a new remap fails
* allocation
*/
- if (i && set->tags[i])
- blk_mq_free_map_and_requests(set, i);
+ if (i)
+ __blk_mq_free_map_and_rqs(set, i);
hctx->tags = NULL;
continue;
@@ -2763,18 +3956,20 @@ static void blk_mq_map_swqueue(struct request_queue *q)
static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
struct blk_mq_hw_ctx *hctx;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i) {
- if (shared)
- hctx->flags |= BLK_MQ_F_TAG_SHARED;
- else
- hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
+ if (shared) {
+ hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
+ } else {
+ blk_mq_tag_idle(hctx);
+ hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
+ }
}
}
-static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set,
- bool shared)
+static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set,
+ bool shared)
{
struct request_queue *q;
@@ -2792,12 +3987,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q)
struct blk_mq_tag_set *set = q->tag_set;
mutex_lock(&set->tag_list_lock);
- list_del_rcu(&q->tag_set_list);
+ list_del(&q->tag_set_list);
if (list_is_singular(&set->tag_list)) {
/* just transitioned to unshared */
- set->flags &= ~BLK_MQ_F_TAG_SHARED;
+ set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
- blk_mq_update_tag_set_depth(set, false);
+ blk_mq_update_tag_set_shared(set, false);
}
mutex_unlock(&set->tag_list_lock);
INIT_LIST_HEAD(&q->tag_set_list);
@@ -2812,14 +4007,14 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
* Check to see if we're transitioning to shared (from 1 to 2 queues).
*/
if (!list_empty(&set->tag_list) &&
- !(set->flags & BLK_MQ_F_TAG_SHARED)) {
- set->flags |= BLK_MQ_F_TAG_SHARED;
+ !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) {
+ set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED;
/* update existing queue */
- blk_mq_update_tag_set_depth(set, true);
+ blk_mq_update_tag_set_shared(set, true);
}
- if (set->flags & BLK_MQ_F_TAG_SHARED)
+ if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
queue_set_hctx_shared(q, true);
- list_add_tail_rcu(&q->tag_set_list, &set->tag_list);
+ list_add_tail(&q->tag_set_list, &set->tag_list);
mutex_unlock(&set->tag_list_lock);
}
@@ -2861,7 +4056,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q)
void blk_mq_release(struct request_queue *q)
{
struct blk_mq_hw_ctx *hctx, *next;
- int i;
+ unsigned long i;
queue_for_each_hw_ctx(q, hctx, i)
WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list));
@@ -2872,7 +4067,7 @@ void blk_mq_release(struct request_queue *q)
kobject_put(&hctx->kobj);
}
- kfree(q->queue_hw_ctx);
+ xa_destroy(&q->hctx_table);
/*
* release .mq_kobj and sw queue's kobject now because
@@ -2881,27 +4076,23 @@ void blk_mq_release(struct request_queue *q)
blk_mq_sysfs_deinit(q);
}
-struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
+static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set,
void *queuedata)
{
- struct request_queue *uninit_q, *q;
+ struct request_queue *q;
+ int ret;
- uninit_q = __blk_alloc_queue(set->numa_node);
- if (!uninit_q)
+ q = blk_alloc_queue(set->numa_node);
+ if (!q)
return ERR_PTR(-ENOMEM);
- uninit_q->queuedata = queuedata;
-
- /*
- * Initialize the queue without an elevator. device_add_disk() will do
- * the initialization.
- */
- q = blk_mq_init_allocated_queue(set, uninit_q, false);
- if (IS_ERR(q))
- blk_cleanup_queue(uninit_q);
-
+ q->queuedata = queuedata;
+ ret = blk_mq_init_allocated_queue(set, q);
+ if (ret) {
+ blk_put_queue(q);
+ return ERR_PTR(ret);
+ }
return q;
}
-EXPORT_SYMBOL_GPL(blk_mq_init_queue_data);
struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
@@ -2909,39 +4100,67 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
}
EXPORT_SYMBOL(blk_mq_init_queue);
-/*
- * Helper for setting up a queue with mq ops, given queue depth, and
- * the passed in mq ops flags.
+/**
+ * blk_mq_destroy_queue - shutdown a request queue
+ * @q: request queue to shutdown
+ *
+ * This shuts down a request queue allocated by blk_mq_init_queue(). All future
+ * requests will be failed with -ENODEV. The caller is responsible for dropping
+ * the reference from blk_mq_init_queue() by calling blk_put_queue().
+ *
+ * Context: can sleep
*/
-struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set,
- const struct blk_mq_ops *ops,
- unsigned int queue_depth,
- unsigned int set_flags)
+void blk_mq_destroy_queue(struct request_queue *q)
{
- struct request_queue *q;
- int ret;
+ WARN_ON_ONCE(!queue_is_mq(q));
+ WARN_ON_ONCE(blk_queue_registered(q));
- memset(set, 0, sizeof(*set));
- set->ops = ops;
- set->nr_hw_queues = 1;
- set->nr_maps = 1;
- set->queue_depth = queue_depth;
- set->numa_node = NUMA_NO_NODE;
- set->flags = set_flags;
+ might_sleep();
- ret = blk_mq_alloc_tag_set(set);
- if (ret)
- return ERR_PTR(ret);
+ blk_queue_flag_set(QUEUE_FLAG_DYING, q);
+ blk_queue_start_drain(q);
+ blk_mq_freeze_queue_wait(q);
+
+ blk_sync_queue(q);
+ blk_mq_cancel_work_sync(q);
+ blk_mq_exit_queue(q);
+}
+EXPORT_SYMBOL(blk_mq_destroy_queue);
+
+struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata,
+ struct lock_class_key *lkclass)
+{
+ struct request_queue *q;
+ struct gendisk *disk;
+
+ q = blk_mq_init_queue_data(set, queuedata);
+ if (IS_ERR(q))
+ return ERR_CAST(q);
- q = blk_mq_init_queue(set);
- if (IS_ERR(q)) {
- blk_mq_free_tag_set(set);
- return q;
+ disk = __alloc_disk_node(q, set->numa_node, lkclass);
+ if (!disk) {
+ blk_mq_destroy_queue(q);
+ blk_put_queue(q);
+ return ERR_PTR(-ENOMEM);
}
+ set_bit(GD_OWNS_QUEUE, &disk->state);
+ return disk;
+}
+EXPORT_SYMBOL(__blk_mq_alloc_disk);
- return q;
+struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q,
+ struct lock_class_key *lkclass)
+{
+ struct gendisk *disk;
+
+ if (!blk_get_queue(q))
+ return NULL;
+ disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass);
+ if (!disk)
+ blk_put_queue(q);
+ return disk;
}
-EXPORT_SYMBOL(blk_mq_init_sq_queue);
+EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue);
static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
struct blk_mq_tag_set *set, struct request_queue *q,
@@ -2980,52 +4199,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx(
static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
struct request_queue *q)
{
- int i, j, end;
- struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;
-
- if (q->nr_hw_queues < set->nr_hw_queues) {
- struct blk_mq_hw_ctx **new_hctxs;
-
- new_hctxs = kcalloc_node(set->nr_hw_queues,
- sizeof(*new_hctxs), GFP_KERNEL,
- set->numa_node);
- if (!new_hctxs)
- return;
- if (hctxs)
- memcpy(new_hctxs, hctxs, q->nr_hw_queues *
- sizeof(*hctxs));
- q->queue_hw_ctx = new_hctxs;
- kfree(hctxs);
- hctxs = new_hctxs;
- }
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i, j;
/* protect against switching io scheduler */
mutex_lock(&q->sysfs_lock);
for (i = 0; i < set->nr_hw_queues; i++) {
- int node;
- struct blk_mq_hw_ctx *hctx;
+ int old_node;
+ int node = blk_mq_get_hctx_node(set, i);
+ struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i);
- node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i);
- /*
- * If the hw queue has been mapped to another numa node,
- * we need to realloc the hctx. If allocation fails, fallback
- * to use the previous one.
- */
- if (hctxs[i] && (hctxs[i]->numa_node == node))
- continue;
+ if (old_hctx) {
+ old_node = old_hctx->numa_node;
+ blk_mq_exit_hctx(q, set, old_hctx, i);
+ }
- hctx = blk_mq_alloc_and_init_hctx(set, q, i, node);
- if (hctx) {
- if (hctxs[i])
- blk_mq_exit_hctx(q, set, hctxs[i], i);
- hctxs[i] = hctx;
- } else {
- if (hctxs[i])
- pr_warn("Allocate new hctx on node %d fails,\
- fallback to previous one on node %d\n",
- node, hctxs[i]->numa_node);
- else
+ if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) {
+ if (!old_hctx)
break;
+ pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n",
+ node, old_node);
+ hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node);
+ WARN_ON_ONCE(!hctx);
}
}
/*
@@ -3034,41 +4229,35 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
*/
if (i != set->nr_hw_queues) {
j = q->nr_hw_queues;
- end = i;
} else {
j = i;
- end = q->nr_hw_queues;
q->nr_hw_queues = set->nr_hw_queues;
}
- for (; j < end; j++) {
- struct blk_mq_hw_ctx *hctx = hctxs[j];
-
- if (hctx) {
- if (hctx->tags)
- blk_mq_free_map_and_requests(set, j);
- blk_mq_exit_hctx(q, set, hctx, j);
- hctxs[j] = NULL;
- }
- }
+ xa_for_each_start(&q->hctx_table, j, hctx, j)
+ blk_mq_exit_hctx(q, set, hctx, j);
mutex_unlock(&q->sysfs_lock);
}
-struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
- struct request_queue *q,
- bool elevator_init)
+static void blk_mq_update_poll_flag(struct request_queue *q)
+{
+ struct blk_mq_tag_set *set = q->tag_set;
+
+ if (set->nr_maps > HCTX_TYPE_POLL &&
+ set->map[HCTX_TYPE_POLL].nr_queues)
+ blk_queue_flag_set(QUEUE_FLAG_POLL, q);
+ else
+ blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
+}
+
+int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
+ struct request_queue *q)
{
/* mark the queue as mq asap */
q->mq_ops = set->ops;
- q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
- blk_mq_poll_stats_bkt,
- BLK_MQ_POLL_STATS_BKTS, q);
- if (!q->poll_cb)
- goto err_exit;
-
if (blk_mq_alloc_ctxs(q))
- goto err_poll;
+ goto err_exit;
/* init q->mq_kobj and sw queues' kobjects */
blk_mq_sysfs_init(q);
@@ -3076,6 +4265,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
INIT_LIST_HEAD(&q->unused_hctx_list);
spin_lock_init(&q->unused_hctx_lock);
+ xa_init(&q->hctx_table);
+
blk_mq_realloc_hw_ctxs(set, q);
if (!q->nr_hw_queues)
goto err_hctxs;
@@ -3086,67 +4277,67 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
q->tag_set = set;
q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;
- if (set->nr_maps > HCTX_TYPE_POLL &&
- set->map[HCTX_TYPE_POLL].nr_queues)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
-
- q->sg_reserved_size = INT_MAX;
+ blk_mq_update_poll_flag(q);
INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
+ INIT_LIST_HEAD(&q->flush_list);
INIT_LIST_HEAD(&q->requeue_list);
spin_lock_init(&q->requeue_lock);
q->nr_requests = set->queue_depth;
- /*
- * Default to classic polling
- */
- q->poll_nsec = BLK_MQ_POLL_CLASSIC;
-
blk_mq_init_cpu_queues(q, set->nr_hw_queues);
blk_mq_add_queue_tag_set(set, q);
blk_mq_map_swqueue(q);
-
- if (elevator_init)
- elevator_init_mq(q);
-
- return q;
+ return 0;
err_hctxs:
- kfree(q->queue_hw_ctx);
- q->nr_hw_queues = 0;
- blk_mq_sysfs_deinit(q);
-err_poll:
- blk_stat_free_callback(q->poll_cb);
- q->poll_cb = NULL;
+ blk_mq_release(q);
err_exit:
q->mq_ops = NULL;
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);
/* tags can _not_ be used after returning from blk_mq_exit_queue */
void blk_mq_exit_queue(struct request_queue *q)
{
- struct blk_mq_tag_set *set = q->tag_set;
+ struct blk_mq_tag_set *set = q->tag_set;
- blk_mq_del_queue_tag_set(q);
+ /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */
blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
+ /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */
+ blk_mq_del_queue_tag_set(q);
}
static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
int i;
- for (i = 0; i < set->nr_hw_queues; i++)
- if (!__blk_mq_alloc_map_and_request(set, i))
+ if (blk_mq_is_shared_tags(set->flags)) {
+ set->shared_tags = blk_mq_alloc_map_and_rqs(set,
+ BLK_MQ_NO_HCTX_IDX,
+ set->queue_depth);
+ if (!set->shared_tags)
+ return -ENOMEM;
+ }
+
+ for (i = 0; i < set->nr_hw_queues; i++) {
+ if (!__blk_mq_alloc_map_and_rqs(set, i))
goto out_unwind;
+ cond_resched();
+ }
return 0;
out_unwind:
while (--i >= 0)
- blk_mq_free_map_and_requests(set, i);
+ __blk_mq_free_map_and_rqs(set, i);
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ blk_mq_free_map_and_rqs(set, set->shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ }
return -ENOMEM;
}
@@ -3156,7 +4347,7 @@ out_unwind:
* may reduce the depth asked for, if memory is tight. set->queue_depth
* will be updated to reflect the allocated depth.
*/
-static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
+static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set)
{
unsigned int depth;
int err;
@@ -3186,7 +4377,7 @@ static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set)
return 0;
}
-static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
+static void blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
/*
* blk_mq_map_queues() and multiple .map_queues() implementations
@@ -3216,20 +4407,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
for (i = 0; i < set->nr_maps; i++)
blk_mq_clear_mq_map(&set->map[i]);
- return set->ops->map_queues(set);
+ set->ops->map_queues(set);
} else {
BUG_ON(set->nr_maps > 1);
- return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
+ blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
}
}
static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
- int cur_nr_hw_queues, int new_nr_hw_queues)
+ int new_nr_hw_queues)
{
struct blk_mq_tags **new_tags;
+ int i;
- if (cur_nr_hw_queues >= new_nr_hw_queues)
- return 0;
+ if (set->nr_hw_queues >= new_nr_hw_queues)
+ goto done;
new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *),
GFP_KERNEL, set->numa_node);
@@ -3237,12 +4429,22 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set,
return -ENOMEM;
if (set->tags)
- memcpy(new_tags, set->tags, cur_nr_hw_queues *
+ memcpy(new_tags, set->tags, set->nr_hw_queues *
sizeof(*set->tags));
kfree(set->tags);
set->tags = new_tags;
- set->nr_hw_queues = new_nr_hw_queues;
+ for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) {
+ if (!__blk_mq_alloc_map_and_rqs(set, i)) {
+ while (--i >= set->nr_hw_queues)
+ __blk_mq_free_map_and_rqs(set, i);
+ return -ENOMEM;
+ }
+ cond_resched();
+ }
+
+done:
+ set->nr_hw_queues = new_nr_hw_queues;
return 0;
}
@@ -3299,10 +4501,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids)
set->nr_hw_queues = nr_cpu_ids;
- if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0)
- return -ENOMEM;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL);
+ if (!set->srcu)
+ return -ENOMEM;
+ ret = init_srcu_struct(set->srcu);
+ if (ret)
+ goto out_free_srcu;
+ }
ret = -ENOMEM;
+ set->tags = kcalloc_node(set->nr_hw_queues,
+ sizeof(struct blk_mq_tags *), GFP_KERNEL,
+ set->numa_node);
+ if (!set->tags)
+ goto out_cleanup_srcu;
+
for (i = 0; i < set->nr_maps; i++) {
set->map[i].mq_map = kcalloc_node(nr_cpu_ids,
sizeof(set->map[i].mq_map[0]),
@@ -3312,11 +4526,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues;
}
- ret = blk_mq_update_queue_map(set);
- if (ret)
- goto out_free_mq_map;
+ blk_mq_update_queue_map(set);
- ret = blk_mq_alloc_map_and_requests(set);
+ ret = blk_mq_alloc_set_map_and_rqs(set);
if (ret)
goto out_free_mq_map;
@@ -3332,16 +4544,43 @@ out_free_mq_map:
}
kfree(set->tags);
set->tags = NULL;
+out_cleanup_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ cleanup_srcu_struct(set->srcu);
+out_free_srcu:
+ if (set->flags & BLK_MQ_F_BLOCKING)
+ kfree(set->srcu);
return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);
+/* allocate and initialize a tagset for a simple single-queue device */
+int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set,
+ const struct blk_mq_ops *ops, unsigned int queue_depth,
+ unsigned int set_flags)
+{
+ memset(set, 0, sizeof(*set));
+ set->ops = ops;
+ set->nr_hw_queues = 1;
+ set->nr_maps = 1;
+ set->queue_depth = queue_depth;
+ set->numa_node = NUMA_NO_NODE;
+ set->flags = set_flags;
+ return blk_mq_alloc_tag_set(set);
+}
+EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set);
+
void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
int i, j;
for (i = 0; i < set->nr_hw_queues; i++)
- blk_mq_free_map_and_requests(set, i);
+ __blk_mq_free_map_and_rqs(set, i);
+
+ if (blk_mq_is_shared_tags(set->flags)) {
+ blk_mq_free_map_and_rqs(set, set->shared_tags,
+ BLK_MQ_NO_HCTX_IDX);
+ }
for (j = 0; j < set->nr_maps; j++) {
kfree(set->map[j].mq_map);
@@ -3350,6 +4589,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
kfree(set->tags);
set->tags = NULL;
+ if (set->flags & BLK_MQ_F_BLOCKING) {
+ cleanup_srcu_struct(set->srcu);
+ kfree(set->srcu);
+ }
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
@@ -3357,7 +4600,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
struct blk_mq_tag_set *set = q->tag_set;
struct blk_mq_hw_ctx *hctx;
- int i, ret;
+ int ret;
+ unsigned long i;
if (!set)
return -EINVAL;
@@ -3376,21 +4620,27 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
* If we're using an MQ scheduler, just update the scheduler
* queue depth. This is similar to what the old code would do.
*/
- if (!hctx->sched_tags) {
- ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
- false);
- } else {
+ if (hctx->sched_tags) {
ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
- nr, true);
+ nr, true);
+ } else {
+ ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr,
+ false);
}
if (ret)
break;
if (q->elevator && q->elevator->type->ops.depth_updated)
q->elevator->type->ops.depth_updated(hctx);
}
-
- if (!ret)
+ if (!ret) {
q->nr_requests = nr;
+ if (blk_mq_is_shared_tags(set->flags)) {
+ if (q->elevator)
+ blk_mq_tag_update_sched_shared_tags(q);
+ else
+ blk_mq_tag_resize_shared_tags(set, nr);
+ }
+ }
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
@@ -3418,53 +4668,61 @@ static bool blk_mq_elv_switch_none(struct list_head *head,
{
struct blk_mq_qe_pair *qe;
- if (!q->elevator)
- return true;
-
qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY);
if (!qe)
return false;
+	/* q->elevator is protected by ->sysfs_lock */
+ mutex_lock(&q->sysfs_lock);
+
+	/* the check has to be done while holding sysfs_lock */
+ if (!q->elevator) {
+ kfree(qe);
+ goto unlock;
+ }
+
INIT_LIST_HEAD(&qe->node);
qe->q = q;
qe->type = q->elevator->type;
+ /* keep a reference to the elevator module as we'll switch back */
+ __elevator_get(qe->type);
list_add(&qe->node, head);
-
- mutex_lock(&q->sysfs_lock);
- /*
- * After elevator_switch_mq, the previous elevator_queue will be
- * released by elevator_release. The reference of the io scheduler
- * module get by elevator_get will also be put. So we need to get
- * a reference of the io scheduler module here to prevent it to be
- * removed.
- */
- __module_get(qe->type->elevator_owner);
- elevator_switch_mq(q, NULL);
+ elevator_disable(q);
+unlock:
mutex_unlock(&q->sysfs_lock);
return true;
}
-static void blk_mq_elv_switch_back(struct list_head *head,
- struct request_queue *q)
+static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head,
+ struct request_queue *q)
{
struct blk_mq_qe_pair *qe;
- struct elevator_type *t = NULL;
list_for_each_entry(qe, head, node)
- if (qe->q == q) {
- t = qe->type;
- break;
- }
+ if (qe->q == q)
+ return qe;
- if (!t)
- return;
+ return NULL;
+}
+
+static void blk_mq_elv_switch_back(struct list_head *head,
+ struct request_queue *q)
+{
+ struct blk_mq_qe_pair *qe;
+ struct elevator_type *t;
+ qe = blk_lookup_qe_pair(head, q);
+ if (!qe)
+ return;
+ t = qe->type;
list_del(&qe->node);
kfree(qe);
mutex_lock(&q->sysfs_lock);
- elevator_switch_mq(q, t);
+ elevator_switch(q, t);
+ /* drop the reference acquired in blk_mq_elv_switch_none */
+ elevator_put(t);
mutex_unlock(&q->sysfs_lock);
}
@@ -3473,7 +4731,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
{
struct request_queue *q;
LIST_HEAD(head);
- int prev_nr_hw_queues;
+ int prev_nr_hw_queues = set->nr_hw_queues;
+ int i;
lockdep_assert_held(&set->tag_list_lock);
@@ -3497,24 +4756,26 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set,
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_debugfs_unregister_hctxs(q);
- blk_mq_sysfs_unregister(q);
+ blk_mq_sysfs_unregister_hctxs(q);
}
- prev_nr_hw_queues = set->nr_hw_queues;
- if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) <
- 0)
+ if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0)
goto reregister;
- set->nr_hw_queues = nr_hw_queues;
fallback:
blk_mq_update_queue_map(set);
list_for_each_entry(q, &set->tag_list, tag_set_list) {
blk_mq_realloc_hw_ctxs(set, q);
+ blk_mq_update_poll_flag(q);
if (q->nr_hw_queues != set->nr_hw_queues) {
+ int i = prev_nr_hw_queues;
+
pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n",
nr_hw_queues, prev_nr_hw_queues);
+ for (; i < set->nr_hw_queues; i++)
+ __blk_mq_free_map_and_rqs(set, i);
+
set->nr_hw_queues = prev_nr_hw_queues;
- blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]);
goto fallback;
}
blk_mq_map_swqueue(q);
@@ -3522,7 +4783,7 @@ fallback:
reregister:
list_for_each_entry(q, &set->tag_list, tag_set_list) {
- blk_mq_sysfs_register(q);
+ blk_mq_sysfs_register_hctxs(q);
blk_mq_debugfs_register_hctxs(q);
}
@@ -3532,6 +4793,10 @@ switch_back:
list_for_each_entry(q, &set->tag_list, tag_set_list)
blk_mq_unfreeze_queue(q);
+
+	/* Free the excess tags when nr_hw_queues shrinks. */
+ for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++)
+ __blk_mq_free_map_and_rqs(set, i);
}
void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
@@ -3542,215 +4807,58 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);
-/* Enable polling stats and return whether they were already enabled. */
-static bool blk_poll_stats_enable(struct request_queue *q)
-{
- if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q))
- return true;
- blk_stat_add_callback(q, q->poll_cb);
- return false;
-}
-
-static void blk_mq_poll_stats_start(struct request_queue *q)
-{
- /*
- * We don't arm the callback if polling stats are not enabled or the
- * callback is already active.
- */
- if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
- blk_stat_is_active(q->poll_cb))
- return;
-
- blk_stat_activate_msecs(q->poll_cb, 100);
-}
-
-static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
-{
- struct request_queue *q = cb->data;
- int bucket;
-
- for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
- if (cb->stat[bucket].nr_samples)
- q->poll_stat[bucket] = cb->stat[bucket];
- }
-}
-
-static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
- struct request *rq)
-{
- unsigned long ret = 0;
- int bucket;
-
- /*
- * If stats collection isn't on, don't sleep but turn it on for
- * future users
- */
- if (!blk_poll_stats_enable(q))
- return 0;
-
- /*
- * As an optimistic guess, use half of the mean service time
- * for this type of request. We can (and should) make this smarter.
- * For instance, if the completion latencies are tight, we can
- * get closer than just half the mean. This is especially
- * important on devices where the completion latencies are longer
- * than ~10 usec. We do use the stats for the relevant IO size
- * if available which does lead to better estimates.
- */
- bucket = blk_mq_poll_stats_bkt(rq);
- if (bucket < 0)
- return ret;
-
- if (q->poll_stat[bucket].nr_samples)
- ret = (q->poll_stat[bucket].mean + 1) / 2;
-
- return ret;
-}
-
-static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
- struct request *rq)
+static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
+ struct io_comp_batch *iob, unsigned int flags)
{
- struct hrtimer_sleeper hs;
- enum hrtimer_mode mode;
- unsigned int nsecs;
- ktime_t kt;
-
- if (rq->rq_flags & RQF_MQ_POLL_SLEPT)
- return false;
-
- /*
- * If we get here, hybrid polling is enabled. Hence poll_nsec can be:
- *
- * 0: use half of prev avg
- * >0: use this specific value
- */
- if (q->poll_nsec > 0)
- nsecs = q->poll_nsec;
- else
- nsecs = blk_mq_poll_nsecs(q, rq);
-
- if (!nsecs)
- return false;
-
- rq->rq_flags |= RQF_MQ_POLL_SLEPT;
+ long state = get_current_state();
+ int ret;
- /*
- * This will be replaced with the stats tracking code, using
- * 'avg_completion_time / 2' as the pre-sleep target.
- */
- kt = nsecs;
+ do {
+ ret = q->mq_ops->poll(hctx, iob);
+ if (ret > 0) {
+ __set_current_state(TASK_RUNNING);
+ return ret;
+ }
- mode = HRTIMER_MODE_REL;
- hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode);
- hrtimer_set_expires(&hs.timer, kt);
+ if (signal_pending_state(state, current))
+ __set_current_state(TASK_RUNNING);
+ if (task_is_running(current))
+ return 1;
- do {
- if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE)
+ if (ret < 0 || (flags & BLK_POLL_ONESHOT))
break;
- set_current_state(TASK_UNINTERRUPTIBLE);
- hrtimer_sleeper_start_expires(&hs, mode);
- if (hs.task)
- io_schedule();
- hrtimer_cancel(&hs.timer);
- mode = HRTIMER_MODE_ABS;
- } while (hs.task && !signal_pending(current));
+ cpu_relax();
+ } while (!need_resched());
__set_current_state(TASK_RUNNING);
- destroy_hrtimer_on_stack(&hs.timer);
- return true;
+ return 0;
}
-static bool blk_mq_poll_hybrid(struct request_queue *q,
- struct blk_mq_hw_ctx *hctx, blk_qc_t cookie)
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie,
+ struct io_comp_batch *iob, unsigned int flags)
{
- struct request *rq;
-
- if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
- return false;
+ struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie);
- if (!blk_qc_t_is_internal(cookie))
- rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
- else {
- rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
- /*
- * With scheduling, if the request has completed, we'll
- * get a NULL return here, as we clear the sched tag when
- * that happens. The request still remains valid, like always,
- * so we should be safe with just the NULL check.
- */
- if (!rq)
- return false;
- }
-
- return blk_mq_poll_hybrid_sleep(q, rq);
+ return blk_hctx_poll(q, hctx, iob, flags);
}
-/**
- * blk_poll - poll for IO completions
- * @q: the queue
- * @cookie: cookie passed back at IO submission time
- * @spin: whether to spin for completions
- *
- * Description:
- * Poll for completions on the passed in queue. Returns number of
- * completed entries found. If @spin is true, then blk_poll will continue
- * looping until at least one completion is found, unless the task is
- * otherwise marked running (or we need to reschedule).
- */
-int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin)
+int blk_rq_poll(struct request *rq, struct io_comp_batch *iob,
+ unsigned int poll_flags)
{
- struct blk_mq_hw_ctx *hctx;
- long state;
+ struct request_queue *q = rq->q;
+ int ret;
- if (!blk_qc_t_valid(cookie) ||
- !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
+ if (!blk_rq_is_poll(rq))
+ return 0;
+ if (!percpu_ref_tryget(&q->q_usage_counter))
return 0;
- if (current->plug)
- blk_flush_plug_list(current->plug, false);
-
- hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
-
- /*
- * If we sleep, have the caller restart the poll loop to reset
- * the state. Like for the other success return cases, the
- * caller is responsible for checking if the IO completed. If
- * the IO isn't complete, we'll get called again and will go
- * straight to the busy poll loop.
- */
- if (blk_mq_poll_hybrid(q, hctx, cookie))
- return 1;
-
- hctx->poll_considered++;
-
- state = current->state;
- do {
- int ret;
-
- hctx->poll_invoked++;
-
- ret = q->mq_ops->poll(hctx);
- if (ret > 0) {
- hctx->poll_success++;
- __set_current_state(TASK_RUNNING);
- return ret;
- }
-
- if (signal_pending_state(state, current))
- __set_current_state(TASK_RUNNING);
-
- if (current->state == TASK_RUNNING)
- return 1;
- if (ret < 0 || !spin)
- break;
- cpu_relax();
- } while (!need_resched());
+ ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags);
+ blk_queue_exit(q);
- __set_current_state(TASK_RUNNING);
- return 0;
+ return ret;
}
-EXPORT_SYMBOL_GPL(blk_poll);
+EXPORT_SYMBOL_GPL(blk_rq_poll);
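blk_rq_poll() replaces the old cookie-based blk_poll() entry point for passthrough requests: it only does work for requests marked REQ_POLLED (blk_rq_is_poll()) and returns the number of completions reaped in one pass. A rough sketch of a caller spinning a polled passthrough request to completion; the completion object and helper name are assumptions, not part of this patch:

	static void my_wait_polled_rq(struct request *rq, struct completion *done)
	{
		if (!blk_rq_is_poll(rq)) {
			wait_for_completion(done);	/* interrupt-driven path */
			return;
		}
		while (!completion_done(done)) {
			if (blk_rq_poll(rq, NULL, 0) <= 0)
				cond_resched();		/* nothing reaped this pass */
		}
	}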
unsigned int blk_mq_rq_cpu(struct request *rq)
{
@@ -3758,8 +4866,31 @@ unsigned int blk_mq_rq_cpu(struct request *rq)
}
EXPORT_SYMBOL(blk_mq_rq_cpu);
+void blk_mq_cancel_work_sync(struct request_queue *q)
+{
+ struct blk_mq_hw_ctx *hctx;
+ unsigned long i;
+
+ cancel_delayed_work_sync(&q->requeue_work);
+
+ queue_for_each_hw_ctx(q, hctx, i)
+ cancel_delayed_work_sync(&hctx->run_work);
+}
+
static int __init blk_mq_init(void)
{
+ int i;
+
+ for_each_possible_cpu(i)
+ init_llist_head(&per_cpu(blk_cpu_done, i));
+ for_each_possible_cpu(i)
+ INIT_CSD(&per_cpu(blk_cpu_csd, i),
+ __blk_mq_complete_request_remote, NULL);
+ open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
+
+ cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
+ "block/softirq:dead", NULL,
+ blk_softirq_cpu_dead);
cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
blk_mq_hctx_notify_dead);
cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online",
diff --git a/block/blk-mq.h b/block/blk-mq.h
index b3ce0f3a2ad2..f75a9ecfebde 100644
--- a/block/blk-mq.h
+++ b/block/blk-mq.h
@@ -2,8 +2,8 @@
#ifndef INT_BLK_MQ_H
#define INT_BLK_MQ_H
+#include <linux/blk-mq.h>
#include "blk-stat.h"
-#include "blk-mq-tag.h"
struct blk_mq_tag_set;
@@ -25,27 +25,32 @@ struct blk_mq_ctx {
unsigned short index_hw[HCTX_MAX_TYPES];
struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES];
- /* incremented at dispatch time */
- unsigned long rq_dispatched[2];
- unsigned long rq_merged;
-
- /* incremented at completion time */
- unsigned long ____cacheline_aligned_in_smp rq_completed[2];
-
struct request_queue *queue;
struct blk_mq_ctxs *ctxs;
struct kobject kobj;
} ____cacheline_aligned_in_smp;
+enum {
+ BLK_MQ_NO_TAG = -1U,
+ BLK_MQ_TAG_MIN = 1,
+ BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1,
+};
+
+typedef unsigned int __bitwise blk_insert_t;
+#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01)
+
+void blk_mq_submit_bio(struct bio *bio);
+int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob,
+ unsigned int flags);
void blk_mq_exit_queue(struct request_queue *q);
int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr);
void blk_mq_wake_waiters(struct request_queue *q);
-bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool);
-void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
- bool kick_requeue_list);
+bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *,
+ unsigned int);
void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list);
struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
struct blk_mq_ctx *start);
+void blk_mq_put_rq_ref(struct request *rq);
/*
* Internal helpers for allocating/freeing the request map
@@ -53,27 +58,11 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx,
void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
unsigned int hctx_idx);
void blk_mq_free_rq_map(struct blk_mq_tags *tags);
-struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set,
- unsigned int hctx_idx,
- unsigned int nr_tags,
- unsigned int reserved_tags);
-int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags,
- unsigned int hctx_idx, unsigned int depth);
-
-/*
- * Internal helpers for request insertion into sw queues
- */
-void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head);
-void blk_mq_request_bypass_insert(struct request *rq, bool at_head,
- bool run_queue);
-void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx,
- struct list_head *list);
-
-/* Used by blk_insert_cloned_request() to issue request directly */
-blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last);
-void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx,
- struct list_head *list);
+struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set,
+ unsigned int hctx_idx, unsigned int depth);
+void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set,
+ struct blk_mq_tags *tags,
+ unsigned int hctx_idx);
/*
* CPU -> queue mappings
@@ -90,30 +79,34 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue *
enum hctx_type type,
unsigned int cpu)
{
- return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]];
+ return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]);
}
-/*
- * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
- * @q: request queue
- * @flags: request command flags
- * @cpu: cpu ctx
- */
-static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
- unsigned int flags,
- struct blk_mq_ctx *ctx)
+static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf)
{
enum hctx_type type = HCTX_TYPE_DEFAULT;
/*
- * The caller ensure that if REQ_HIPRI, poll must be enabled.
+	 * The caller ensures that polling is enabled if REQ_POLLED is set.
*/
- if (flags & REQ_HIPRI)
+ if (opf & REQ_POLLED)
type = HCTX_TYPE_POLL;
- else if ((flags & REQ_OP_MASK) == REQ_OP_READ)
+ else if ((opf & REQ_OP_MASK) == REQ_OP_READ)
type = HCTX_TYPE_READ;
-
- return ctx->hctxs[type];
+ return type;
+}
+
+/*
+ * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue
+ * @q: request queue
+ * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED).
+ * @ctx: software queue cpu ctx
+ */
+static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
+ blk_opf_t opf,
+ struct blk_mq_ctx *ctx)
+{
+ return ctx->hctxs[blk_mq_get_hctx_type(opf)];
}
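The type selection above depends only on the operation flags, for example:

	blk_mq_get_hctx_type(REQ_OP_READ | REQ_POLLED);	/* HCTX_TYPE_POLL */
	blk_mq_get_hctx_type(REQ_OP_READ);		/* HCTX_TYPE_READ */
	blk_mq_get_hctx_type(REQ_OP_WRITE);		/* HCTX_TYPE_DEFAULT */

Whether that type lands on a dedicated hardware queue still depends on the maps the driver registered; without a poll or read map, the corresponding ctx->hctxs[] entries point back at the default queues.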
/*
@@ -121,10 +114,15 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q,
*/
extern void blk_mq_sysfs_init(struct request_queue *q);
extern void blk_mq_sysfs_deinit(struct request_queue *q);
-extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q);
-extern int blk_mq_sysfs_register(struct request_queue *q);
-extern void blk_mq_sysfs_unregister(struct request_queue *q);
+int blk_mq_sysfs_register(struct gendisk *disk);
+void blk_mq_sysfs_unregister(struct gendisk *disk);
+int blk_mq_sysfs_register_hctxs(struct request_queue *q);
+void blk_mq_sysfs_unregister_hctxs(struct request_queue *q);
extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx);
+void blk_mq_free_plug_rqs(struct blk_plug *plug);
+void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule);
+
+void blk_mq_cancel_work_sync(struct request_queue *q);
void blk_mq_release(struct request_queue *q);
@@ -150,18 +148,81 @@ struct blk_mq_alloc_data {
struct request_queue *q;
blk_mq_req_flags_t flags;
unsigned int shallow_depth;
- unsigned int cmd_flags;
+ blk_opf_t cmd_flags;
+ req_flags_t rq_flags;
+
+ /* allocate multiple requests/tags in one go */
+ unsigned int nr_tags;
+ struct request **cached_rq;
/* input & output parameter */
struct blk_mq_ctx *ctx;
struct blk_mq_hw_ctx *hctx;
};
+struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags,
+ unsigned int reserved_tags, int node, int alloc_policy);
+void blk_mq_free_tags(struct blk_mq_tags *tags);
+int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags,
+ struct sbitmap_queue *breserved_tags, unsigned int queue_depth,
+ unsigned int reserved, int node, int alloc_policy);
+
+unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data);
+unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags,
+ unsigned int *offset);
+void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx,
+ unsigned int tag);
+void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags);
+int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx,
+ struct blk_mq_tags **tags, unsigned int depth, bool can_grow);
+void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set,
+ unsigned int size);
+void blk_mq_tag_update_sched_shared_tags(struct request_queue *q);
+
+void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool);
+void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn,
+ void *priv);
+void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn,
+ void *priv);
+
+static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt,
+ struct blk_mq_hw_ctx *hctx)
+{
+ if (!hctx)
+ return &bt->ws[0];
+ return sbq_wait_ptr(bt, &hctx->wait_index);
+}
+
+void __blk_mq_tag_busy(struct blk_mq_hw_ctx *);
+void __blk_mq_tag_idle(struct blk_mq_hw_ctx *);
+
+static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_tag_busy(hctx);
+}
+
+static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_tag_idle(hctx);
+}
+
+static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags,
+ unsigned int tag)
+{
+ return tag < tags->nr_reserved_tags;
+}
+
+static inline bool blk_mq_is_shared_tags(unsigned int flags)
+{
+ return flags & BLK_MQ_F_TAG_HCTX_SHARED;
+}
+
static inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data)
{
- if (data->flags & BLK_MQ_REQ_INTERNAL)
+ if (data->rq_flags & RQF_SCHED_TAGS)
return data->hctx->sched_tags;
-
return data->hctx->tags;
}
@@ -175,37 +236,107 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx)
return hctx->nr_ctx && hctx->tags;
}
-unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part);
-void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part,
- unsigned int inflight[2]);
+unsigned int blk_mq_in_flight(struct request_queue *q,
+ struct block_device *part);
+void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part,
+ unsigned int inflight[2]);
-static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline void blk_mq_put_dispatch_budget(struct request_queue *q,
+ int budget_token)
{
- struct request_queue *q = hctx->queue;
-
if (q->mq_ops->put_budget)
- q->mq_ops->put_budget(hctx);
+ q->mq_ops->put_budget(q, budget_token);
}
-static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx)
+static inline int blk_mq_get_dispatch_budget(struct request_queue *q)
{
- struct request_queue *q = hctx->queue;
-
if (q->mq_ops->get_budget)
- return q->mq_ops->get_budget(hctx);
- return true;
+ return q->mq_ops->get_budget(q);
+ return 0;
+}
+
+static inline void blk_mq_set_rq_budget_token(struct request *rq, int token)
+{
+ if (token < 0)
+ return;
+
+ if (rq->q->mq_ops->set_rq_budget_token)
+ rq->q->mq_ops->set_rq_budget_token(rq, token);
+}
+
+static inline int blk_mq_get_rq_budget_token(struct request *rq)
+{
+ if (rq->q->mq_ops->get_rq_budget_token)
+ return rq->q->mq_ops->get_rq_budget_token(rq);
+ return -1;
+}
+
+static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (blk_mq_is_shared_tags(hctx->flags))
+ atomic_add(val, &hctx->queue->nr_active_requests_shared_tags);
+ else
+ atomic_add(val, &hctx->nr_active);
}
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ __blk_mq_add_active_requests(hctx, 1);
+}
+
+static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (blk_mq_is_shared_tags(hctx->flags))
+ atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags);
+ else
+ atomic_sub(val, &hctx->nr_active);
+}
+
+static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ __blk_mq_sub_active_requests(hctx, 1);
+}
+
+static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_add_active_requests(hctx, val);
+}
+
+static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_inc_active_requests(hctx);
+}
+
+static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx,
+ int val)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_sub_active_requests(hctx, val);
+}
+
+static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)
+ __blk_mq_dec_active_requests(hctx);
+}
+
+static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx)
+{
+ if (blk_mq_is_shared_tags(hctx->flags))
+ return atomic_read(&hctx->queue->nr_active_requests_shared_tags);
+ return atomic_read(&hctx->nr_active);
+}
static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx,
struct request *rq)
{
+ blk_mq_dec_active_requests(hctx);
blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag);
rq->tag = BLK_MQ_NO_TAG;
-
- if (rq->rq_flags & RQF_MQ_INFLIGHT) {
- rq->rq_flags &= ~RQF_MQ_INFLIGHT;
- atomic_dec(&hctx->nr_active);
- }
}
static inline void blk_mq_put_driver_tag(struct request *rq)
@@ -216,6 +347,16 @@ static inline void blk_mq_put_driver_tag(struct request *rq)
__blk_mq_put_driver_tag(rq->mq_hctx, rq);
}
+bool __blk_mq_alloc_driver_tag(struct request *rq);
+
+static inline bool blk_mq_get_driver_tag(struct request *rq)
+{
+ if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq))
+ return false;
+
+ return true;
+}
+
static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
{
int cpu;
@@ -226,7 +367,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
/*
* blk_mq_plug() - Get caller context plug
- * @q: request queue
* @bio : the bio being submitted by the caller context
*
* Plugging, by design, may delay the insertion of BIOs into the elevator in
@@ -237,23 +377,94 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap)
* order. While this is not a problem with regular block devices, this ordering
* change can cause write BIO failures with zoned block devices as these
* require sequential write patterns to zones. Prevent this from happening by
- * ignoring the plug state of a BIO issuing context if the target request queue
- * is for a zoned block device and the BIO to plug is a write operation.
+ * ignoring the plug state of a BIO issuing context if it is for a zoned block
+ * device and the BIO to plug is a write operation.
*
* Return current->plug if the bio can be plugged and NULL otherwise
*/
-static inline struct blk_plug *blk_mq_plug(struct request_queue *q,
- struct bio *bio)
+static inline struct blk_plug *blk_mq_plug(struct bio *bio)
{
+ /* Zoned block device write operation case: do not plug the BIO */
+ if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) &&
+ bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio)))
+ return NULL;
+
/*
* For regular block devices or read operations, use the context plug
* which may be NULL if blk_start_plug() was not executed.
*/
- if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio)))
- return current->plug;
+ return current->plug;
+}
- /* Zoned block device write operation case: do not plug the BIO */
- return NULL;
+/* Free all requests on the list */
+static inline void blk_mq_free_requests(struct list_head *list)
+{
+ while (!list_empty(list)) {
+ struct request *rq = list_entry_rq(list->next);
+
+ list_del_init(&rq->queuelist);
+ blk_mq_free_request(rq);
+ }
+}
+
+/*
+ * For shared tag users, we track the number of currently active users
+ * and attempt to provide a fair share of the tag depth for each of them.
+ */
+static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx,
+ struct sbitmap_queue *bt)
+{
+ unsigned int depth, users;
+
+ if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED))
+ return true;
+
+ /*
+ * Don't try dividing an ant
+ */
+ if (bt->sb.depth == 1)
+ return true;
+
+ if (blk_mq_is_shared_tags(hctx->flags)) {
+ struct request_queue *q = hctx->queue;
+
+ if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags))
+ return true;
+ } else {
+ if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state))
+ return true;
+ }
+
+ users = READ_ONCE(hctx->tags->active_queues);
+ if (!users)
+ return true;
+
+ /*
+ * Allow at least some tags
+ */
+ depth = max((bt->sb.depth + users - 1) / users, 4U);
+ return __blk_mq_active_requests(hctx) < depth;
}
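The depth computation above is a ceiling division with a floor of four tags. A worked example, assuming a shared bitmap depth of 128 and three active queues:

	depth = max((128 + 3 - 1) / 3, 4U);	/* max(43, 4) = 43 tags per queue */

so hctx_may_queue() starts returning false for a queue once it has 43 requests in flight, leaving room for the other users.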
+/* run the code block in @dispatch_ops with rcu/srcu read lock held */
+#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \
+do { \
+ if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \
+ struct blk_mq_tag_set *__tag_set = (q)->tag_set; \
+ int srcu_idx; \
+ \
+ might_sleep_if(check_sleep); \
+ srcu_idx = srcu_read_lock(__tag_set->srcu); \
+ (dispatch_ops); \
+ srcu_read_unlock(__tag_set->srcu, srcu_idx); \
+ } else { \
+ rcu_read_lock(); \
+ (dispatch_ops); \
+ rcu_read_unlock(); \
+ } \
+} while (0)
+
+#define blk_mq_run_dispatch_ops(q, dispatch_ops) \
+ __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \
+
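These macros hide the RCU vs. SRCU choice from the dispatch paths: tag sets with BLK_MQ_F_BLOCKING use the per-set SRCU allocated in blk_mq_alloc_tag_set() above, everything else takes a plain RCU read lock. A sketch of the calling convention, mirroring how blk-mq's own direct-issue path uses it (the surrounding function is assumed):

	blk_status_t ret;

	blk_mq_run_dispatch_ops(rq->q,
			ret = blk_mq_request_issue_directly(rq, true));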
#endif
diff --git a/block/blk-pm.c b/block/blk-pm.c
index 1adc1cd748b4..42e842074715 100644
--- a/block/blk-pm.c
+++ b/block/blk-pm.c
@@ -1,11 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
-#include <linux/blk-mq.h>
#include <linux/blk-pm.h>
#include <linux/blkdev.h>
#include <linux/pm_runtime.h>
#include "blk-mq.h"
-#include "blk-mq-tag.h"
/**
* blk_pm_runtime_init - Block layer runtime PM initialization routine
@@ -67,6 +65,10 @@ int blk_pre_runtime_suspend(struct request_queue *q)
WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE);
+ spin_lock_irq(&q->queue_lock);
+ q->rpm_status = RPM_SUSPENDING;
+ spin_unlock_irq(&q->queue_lock);
+
/*
* Increase the pm_only counter before checking whether any
* non-PM blk_queue_enter() calls are in progress to avoid that any
@@ -89,15 +91,14 @@ int blk_pre_runtime_suspend(struct request_queue *q)
/* Switch q_usage_counter back to per-cpu mode. */
blk_mq_unfreeze_queue(q);
- spin_lock_irq(&q->queue_lock);
- if (ret < 0)
+ if (ret < 0) {
+ spin_lock_irq(&q->queue_lock);
+ q->rpm_status = RPM_ACTIVE;
pm_runtime_mark_last_busy(q->dev);
- else
- q->rpm_status = RPM_SUSPENDING;
- spin_unlock_irq(&q->queue_lock);
+ spin_unlock_irq(&q->queue_lock);
- if (ret)
blk_clear_pm_only(q);
+ }
return ret;
}
@@ -160,59 +161,31 @@ EXPORT_SYMBOL(blk_pre_runtime_resume);
/**
* blk_post_runtime_resume - Post runtime resume processing
* @q: the queue of the device
- * @err: return value of the device's runtime_resume function
*
* Description:
- * Update the queue's runtime status according to the return value of the
- * device's runtime_resume function. If it is successfully resumed, process
- * the requests that are queued into the device's queue when it is resuming
- * and then mark last busy and initiate autosuspend for it.
+ * Restart the queue of a runtime suspended device. It does this regardless
+ * of whether the device's runtime-resume succeeded; even if it failed the
+ * driver or error handler will need to communicate with the device.
*
* This function should be called near the end of the device's
- * runtime_resume callback.
+ * runtime_resume callback to correct queue runtime PM status and re-enable
+ * peeking requests from the queue.
*/
-void blk_post_runtime_resume(struct request_queue *q, int err)
+void blk_post_runtime_resume(struct request_queue *q)
{
+ int old_status;
+
if (!q->dev)
return;
spin_lock_irq(&q->queue_lock);
- if (!err) {
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- } else {
- q->rpm_status = RPM_SUSPENDED;
- }
+ old_status = q->rpm_status;
+ q->rpm_status = RPM_ACTIVE;
+ pm_runtime_mark_last_busy(q->dev);
+ pm_request_autosuspend(q->dev);
spin_unlock_irq(&q->queue_lock);
- if (!err)
+ if (old_status != RPM_ACTIVE)
blk_clear_pm_only(q);
}
EXPORT_SYMBOL(blk_post_runtime_resume);
-
-/**
- * blk_set_runtime_active - Force runtime status of the queue to be active
- * @q: the queue of the device
- *
- * If the device is left runtime suspended during system suspend the resume
- * hook typically resumes the device and corrects runtime status
- * accordingly. However, that does not affect the queue runtime PM status
- * which is still "suspended". This prevents processing requests from the
- * queue.
- *
- * This function can be used in driver's resume hook to correct queue
- * runtime PM status and re-enable peeking requests from the queue. It
- * should be called before first request is added to the queue.
- */
-void blk_set_runtime_active(struct request_queue *q)
-{
- if (q->dev) {
- spin_lock_irq(&q->queue_lock);
- q->rpm_status = RPM_ACTIVE;
- pm_runtime_mark_last_busy(q->dev);
- pm_request_autosuspend(q->dev);
- spin_unlock_irq(&q->queue_lock);
- }
-}
-EXPORT_SYMBOL(blk_set_runtime_active);
diff --git a/block/blk-pm.h b/block/blk-pm.h
index ea5507d23e75..8a5a0d4b357f 100644
--- a/block/blk-pm.h
+++ b/block/blk-pm.h
@@ -6,11 +6,14 @@
#include <linux/pm_runtime.h>
#ifdef CONFIG_PM
-static inline void blk_pm_request_resume(struct request_queue *q)
+static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
- if (q->dev && (q->rpm_status == RPM_SUSPENDED ||
- q->rpm_status == RPM_SUSPENDING))
- pm_request_resume(q->dev);
+ if (!q->dev || !blk_queue_pm_only(q))
+ return 1; /* Nothing to do */
+ if (pm && q->rpm_status != RPM_SUSPENDED)
+ return 1; /* Request allowed */
+ pm_request_resume(q->dev);
+ return 0;
}
static inline void blk_pm_mark_last_busy(struct request *rq)
@@ -18,52 +21,15 @@ static inline void blk_pm_mark_last_busy(struct request *rq)
if (rq->q->dev && !(rq->rq_flags & RQF_PM))
pm_runtime_mark_last_busy(rq->q->dev);
}
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
- lockdep_assert_held(&rq->q->queue_lock);
-
- if (rq->q->dev && !(rq->rq_flags & RQF_PM))
- rq->q->nr_pending--;
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
- struct request *rq)
-{
- lockdep_assert_held(&q->queue_lock);
-
- if (q->dev && !(rq->rq_flags & RQF_PM))
- q->nr_pending++;
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
- lockdep_assert_held(&rq->q->queue_lock);
-
- if (rq->q->dev && !(rq->rq_flags & RQF_PM))
- --rq->q->nr_pending;
-}
#else
-static inline void blk_pm_request_resume(struct request_queue *q)
+static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q)
{
+ return 1;
}
static inline void blk_pm_mark_last_busy(struct request *rq)
{
}
-
-static inline void blk_pm_requeue_request(struct request *rq)
-{
-}
-
-static inline void blk_pm_add_request(struct request_queue *q,
- struct request *rq)
-{
-}
-
-static inline void blk_pm_put_request(struct request *rq)
-{
-}
#endif
#endif /* _BLOCK_BLK_PM_H_ */
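blk_pm_resume_queue() folds the old blk_pm_request_resume() check into a single predicate: it returns non-zero when a request may proceed, and otherwise kicks off a runtime resume and returns zero. A sketch of how a queue-enter path can use it as a wait condition; this is an approximation of the blk_queue_enter() logic, not a copy of it:

	wait_event(q->mq_freeze_wq,
		   (!q->mq_freeze_depth && blk_pm_resume_queue(pm, q)) ||
		   blk_queue_dying(q));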
diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c
index 656460636ad3..dd7310c94713 100644
--- a/block/blk-rq-qos.c
+++ b/block/blk-rq-qos.c
@@ -10,16 +10,10 @@ static bool atomic_inc_below(atomic_t *v, unsigned int below)
{
unsigned int cur = atomic_read(v);
- for (;;) {
- unsigned int old;
-
+ do {
if (cur >= below)
return false;
- old = atomic_cmpxchg(v, cur, cur + 1);
- if (old == cur)
- break;
- cur = old;
- }
+ } while (!atomic_try_cmpxchg(v, &cur, cur + 1));
return true;
}
@@ -266,8 +260,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
if (!has_sleeper && acquire_inflight_cb(rqw, private_data))
return;
- prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE);
- has_sleeper = !wq_has_single_sleeper(&rqw->wait);
+ has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq,
+ TASK_UNINTERRUPTIBLE);
do {
/* The memory barrier in set_task_state saves us here. */
if (data.got_token)
@@ -276,7 +270,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
finish_wait(&rqw->wait, &data.wq);
/*
- * We raced with wbt_wake_function() getting a token,
+ * We raced with rq_qos_wake_function() getting a token,
* which means we now have two. Put our local token
* and wake anyone else potentially waiting for one.
*/
@@ -294,11 +288,68 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data,
void rq_qos_exit(struct request_queue *q)
{
- blk_mq_debugfs_unregister_queue_rqos(q);
-
+ mutex_lock(&q->rq_qos_mutex);
while (q->rq_qos) {
struct rq_qos *rqos = q->rq_qos;
q->rq_qos = rqos->next;
rqos->ops->exit(rqos);
}
+ mutex_unlock(&q->rq_qos_mutex);
+}
+
+int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
+ const struct rq_qos_ops *ops)
+{
+ struct request_queue *q = disk->queue;
+
+ lockdep_assert_held(&q->rq_qos_mutex);
+
+ rqos->disk = disk;
+ rqos->id = id;
+ rqos->ops = ops;
+
+ /*
+ * No IO can be in-flight when adding rqos, so freeze queue, which
+ * is fine since we only support rq_qos for blk-mq queue.
+ */
+ blk_mq_freeze_queue(q);
+
+ if (rq_qos_id(q, rqos->id))
+ goto ebusy;
+ rqos->next = q->rq_qos;
+ q->rq_qos = rqos;
+
+ blk_mq_unfreeze_queue(q);
+
+ if (rqos->ops->debugfs_attrs) {
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_register_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
+ }
+
+ return 0;
+ebusy:
+ blk_mq_unfreeze_queue(q);
+ return -EBUSY;
+}
+
+void rq_qos_del(struct rq_qos *rqos)
+{
+ struct request_queue *q = rqos->disk->queue;
+ struct rq_qos **cur;
+
+ lockdep_assert_held(&q->rq_qos_mutex);
+
+ blk_mq_freeze_queue(q);
+ for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
+ if (*cur == rqos) {
+ *cur = rqos->next;
+ break;
+ }
+ }
+ blk_mq_unfreeze_queue(q);
+
+ mutex_lock(&q->debugfs_mutex);
+ blk_mq_debugfs_unregister_rqos(rqos);
+ mutex_unlock(&q->debugfs_mutex);
}
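rq_qos_add() and rq_qos_del() are now out-of-line and expect the caller to hold q->rq_qos_mutex and to cope with -EBUSY when a policy with the same id is already attached. A rough sketch of how a QoS policy could attach itself; the ops table and the my_* names are illustrative only:

	static const struct rq_qos_ops my_qos_ops = {
		.exit	= my_qos_exit,		/* assumed policy teardown hook */
	};

	static int my_qos_attach(struct gendisk *disk, struct rq_qos *rqos)
	{
		int ret;

		mutex_lock(&disk->queue->rq_qos_mutex);
		ret = rq_qos_add(rqos, disk, RQ_QOS_LATENCY, &my_qos_ops);
		mutex_unlock(&disk->queue->rq_qos_mutex);
		return ret;
	}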
diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h
index 2bc43e94f4c4..37245c97ee61 100644
--- a/block/blk-rq-qos.h
+++ b/block/blk-rq-qos.h
@@ -7,6 +7,7 @@
#include <linux/blk_types.h>
#include <linux/atomic.h>
#include <linux/wait.h>
+#include <linux/blk-mq.h>
#include "blk-mq-debugfs.h"
@@ -24,8 +25,8 @@ struct rq_wait {
};
struct rq_qos {
- struct rq_qos_ops *ops;
- struct request_queue *q;
+ const struct rq_qos_ops *ops;
+ struct gendisk *disk;
enum rq_qos_id id;
struct rq_qos *next;
#ifdef CONFIG_BLK_DEBUG_FS
@@ -73,52 +74,20 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q)
return rq_qos_id(q, RQ_QOS_WBT);
}
-static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q)
+static inline struct rq_qos *iolat_rq_qos(struct request_queue *q)
{
return rq_qos_id(q, RQ_QOS_LATENCY);
}
-static inline const char *rq_qos_id_to_name(enum rq_qos_id id)
-{
- switch (id) {
- case RQ_QOS_WBT:
- return "wbt";
- case RQ_QOS_LATENCY:
- return "latency";
- case RQ_QOS_COST:
- return "cost";
- }
- return "unknown";
-}
-
static inline void rq_wait_init(struct rq_wait *rq_wait)
{
atomic_set(&rq_wait->inflight, 0);
init_waitqueue_head(&rq_wait->wait);
}
-static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos)
-{
- rqos->next = q->rq_qos;
- q->rq_qos = rqos;
-
- if (rqos->ops->debugfs_attrs)
- blk_mq_debugfs_register_rqos(rqos);
-}
-
-static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos)
-{
- struct rq_qos **cur;
-
- for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) {
- if (*cur == rqos) {
- *cur = rqos->next;
- break;
- }
- }
-
- blk_mq_debugfs_unregister_rqos(rqos);
-}
+int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id,
+ const struct rq_qos_ops *ops);
+void rq_qos_del(struct rq_qos *rqos);
typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data);
typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data);
@@ -149,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio)
static inline void rq_qos_done(struct request_queue *q, struct request *rq)
{
- if (q->rq_qos)
+ if (q->rq_qos && !blk_rq_is_passthrough(rq))
__rq_qos_done(q->rq_qos, rq);
}
@@ -165,21 +134,22 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq)
__rq_qos_requeue(q->rq_qos, rq);
}
-static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio)
+static inline void rq_qos_done_bio(struct bio *bio)
{
- if (q->rq_qos)
- __rq_qos_done_bio(q->rq_qos, bio);
+ if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) ||
+ bio_flagged(bio, BIO_QOS_MERGED))) {
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ if (q->rq_qos)
+ __rq_qos_done_bio(q->rq_qos, bio);
+ }
}
static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio)
{
- /*
- * BIO_TRACKED lets controllers know that a bio went through the
- * normal rq_qos path.
- */
- bio_set_flag(bio, BIO_TRACKED);
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_THROTTLED);
__rq_qos_throttle(q->rq_qos, bio);
+ }
}
static inline void rq_qos_track(struct request_queue *q, struct request *rq,
@@ -192,8 +162,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq,
static inline void rq_qos_merge(struct request_queue *q, struct request *rq,
struct bio *bio)
{
- if (q->rq_qos)
+ if (q->rq_qos) {
+ bio_set_flag(bio, BIO_QOS_MERGED);
__rq_qos_merge(q->rq_qos, rq, bio);
+ }
}
static inline void rq_qos_queue_depth_changed(struct request_queue *q)
diff --git a/block/blk-settings.c b/block/blk-settings.c
index 9a2c23cd9700..06ea91e51b8b 100644
--- a/block/blk-settings.c
+++ b/block/blk-settings.c
@@ -7,7 +7,8 @@
#include <linux/init.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
-#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
+#include <linux/pagemap.h>
+#include <linux/backing-dev-defs.h>
#include <linux/gcd.h>
#include <linux/lcm.h>
#include <linux/jiffies.h>
@@ -15,13 +16,9 @@
#include <linux/dma-mapping.h>
#include "blk.h"
+#include "blk-rq-qos.h"
#include "blk-wbt.h"
-unsigned long blk_max_low_pfn;
-EXPORT_SYMBOL(blk_max_low_pfn);
-
-unsigned long blk_max_pfn;
-
void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout)
{
q->rq_timeout = timeout;
@@ -44,24 +41,25 @@ void blk_set_default_limits(struct queue_limits *lim)
lim->virt_boundary_mask = 0;
lim->max_segment_size = BLK_MAX_SEGMENT_SIZE;
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS;
- lim->max_dev_sectors = 0;
+ lim->max_user_sectors = lim->max_dev_sectors = 0;
lim->chunk_sectors = 0;
- lim->max_write_same_sectors = 0;
lim->max_write_zeroes_sectors = 0;
lim->max_zone_append_sectors = 0;
lim->max_discard_sectors = 0;
lim->max_hw_discard_sectors = 0;
- lim->discard_granularity = 0;
+ lim->max_secure_erase_sectors = 0;
+ lim->discard_granularity = 512;
lim->discard_alignment = 0;
lim->discard_misaligned = 0;
lim->logical_block_size = lim->physical_block_size = lim->io_min = 512;
- lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT);
+ lim->bounce = BLK_BOUNCE_NONE;
lim->alignment_offset = 0;
lim->io_opt = 0;
lim->misaligned = 0;
- lim->zoned = BLK_ZONED_NONE;
+ lim->zoned = false;
+ lim->zone_write_granularity = 0;
+ lim->dma_alignment = 511;
}
-EXPORT_SYMBOL(blk_set_default_limits);
/**
* blk_set_stacking_limits - set default limits for stacking devices
@@ -82,7 +80,6 @@ void blk_set_stacking_limits(struct queue_limits *lim)
lim->max_segment_size = UINT_MAX;
lim->max_sectors = UINT_MAX;
lim->max_dev_sectors = UINT_MAX;
- lim->max_write_same_sectors = UINT_MAX;
lim->max_write_zeroes_sectors = UINT_MAX;
lim->max_zone_append_sectors = UINT_MAX;
}
@@ -91,39 +88,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits);
/**
* blk_queue_bounce_limit - set bounce buffer limit for queue
* @q: the request queue for the device
- * @max_addr: the maximum address the device can handle
+ * @bounce: bounce limit to enforce
*
* Description:
- * Different hardware can have different requirements as to what pages
- * it can do I/O directly to. A low level driver can call
- * blk_queue_bounce_limit to have lower memory pages allocated as bounce
- * buffers for doing I/O to pages residing above @max_addr.
+ * Force bouncing for ISA DMA ranges or highmem.
+ *
+ * DEPRECATED, don't use in new code.
**/
-void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr)
+void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce)
{
- unsigned long b_pfn = max_addr >> PAGE_SHIFT;
- int dma = 0;
-
- q->bounce_gfp = GFP_NOIO;
-#if BITS_PER_LONG == 64
- /*
- * Assume anything <= 4GB can be handled by IOMMU. Actually
- * some IOMMUs can handle everything, but I don't know of a
- * way to test this here.
- */
- if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT))
- dma = 1;
- q->limits.bounce_pfn = max(max_low_pfn, b_pfn);
-#else
- if (b_pfn < blk_max_low_pfn)
- dma = 1;
- q->limits.bounce_pfn = b_pfn;
-#endif
- if (dma) {
- init_emergency_isa_pool();
- q->bounce_gfp = GFP_NOIO | GFP_DMA;
- q->limits.bounce_pfn = b_pfn;
- }
+ q->limits.bounce = bounce;
}
EXPORT_SYMBOL(blk_queue_bounce_limit);
@@ -153,15 +127,27 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto
if ((max_hw_sectors << 9) < PAGE_SIZE) {
max_hw_sectors = 1 << (PAGE_SHIFT - 9);
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_hw_sectors);
+ pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors);
}
+ max_hw_sectors = round_down(max_hw_sectors,
+ limits->logical_block_size >> SECTOR_SHIFT);
limits->max_hw_sectors = max_hw_sectors;
+
max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors);
- max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS);
+
+ if (limits->max_user_sectors)
+ max_sectors = min(max_sectors, limits->max_user_sectors);
+ else
+ max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP);
+
+ max_sectors = round_down(max_sectors,
+ limits->logical_block_size >> SECTOR_SHIFT);
limits->max_sectors = max_sectors;
- q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9);
+
+ if (!q->disk)
+ return;
+ q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9);
}
EXPORT_SYMBOL(blk_queue_max_hw_sectors);
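A worked example of the new clamping: with a 4096-byte logical block size, a controller limit of 65535 sectors and no user override, max_hw_sectors is rounded down to 65528 (a multiple of 8 sectors) and max_sectors becomes min(65528, BLK_DEF_MAX_SECTORS_CAP), which at the time of this change works out to 2560 sectors (1280 KiB), already a multiple of 8.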
@@ -172,15 +158,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors);
*
* Description:
* If a driver doesn't want IOs to cross a given chunk size, it can set
- * this limit and prevent merging across chunks. Note that the chunk size
- * must currently be a power-of-2 in sectors. Also note that the block
- * layer must accept a page worth of data at any offset. So if the
- * crossing of chunks is a hard limitation in the driver, it must still be
- * prepared to split single page bios.
+ * this limit and prevent merging across chunks. Note that the block layer
+ * must accept a page worth of data at any offset. So if the crossing of
+ * chunks is a hard limitation in the driver, it must still be prepared
+ * to split single page bios.
**/
void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors)
{
- BUG_ON(!is_power_of_2(chunk_sectors));
q->limits.chunk_sectors = chunk_sectors;
}
EXPORT_SYMBOL(blk_queue_chunk_sectors);
@@ -199,16 +183,16 @@ void blk_queue_max_discard_sectors(struct request_queue *q,
EXPORT_SYMBOL(blk_queue_max_discard_sectors);
/**
- * blk_queue_max_write_same_sectors - set max sectors for a single write same
+ * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase
* @q: the request queue for the device
- * @max_write_same_sectors: maximum number of sectors to write per command
+ * @max_sectors: maximum number of sectors to secure_erase
**/
-void blk_queue_max_write_same_sectors(struct request_queue *q,
- unsigned int max_write_same_sectors)
+void blk_queue_max_secure_erase_sectors(struct request_queue *q,
+ unsigned int max_sectors)
{
- q->limits.max_write_same_sectors = max_write_same_sectors;
+ q->limits.max_secure_erase_sectors = max_sectors;
}
-EXPORT_SYMBOL(blk_queue_max_write_same_sectors);
+EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors);
/**
* blk_queue_max_write_zeroes_sectors - set max sectors for a single
@@ -263,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments
{
if (!max_segments) {
max_segments = 1;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_segments);
+ pr_info("%s: set to minimum %u\n", __func__, max_segments);
}
q->limits.max_segments = max_segments;
@@ -300,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size)
{
if (max_size < PAGE_SIZE) {
max_size = PAGE_SIZE;
- printk(KERN_INFO "%s: set to minimum %d\n",
- __func__, max_size);
+ pr_info("%s: set to minimum %u\n", __func__, max_size);
}
/* see blk_queue_virt_boundary() for the explanation */
@@ -323,13 +305,23 @@ EXPORT_SYMBOL(blk_queue_max_segment_size);
**/
void blk_queue_logical_block_size(struct request_queue *q, unsigned int size)
{
- q->limits.logical_block_size = size;
+ struct queue_limits *limits = &q->limits;
- if (q->limits.physical_block_size < size)
- q->limits.physical_block_size = size;
+ limits->logical_block_size = size;
- if (q->limits.io_min < q->limits.physical_block_size)
- q->limits.io_min = q->limits.physical_block_size;
+ if (limits->discard_granularity < limits->logical_block_size)
+ limits->discard_granularity = limits->logical_block_size;
+
+ if (limits->physical_block_size < size)
+ limits->physical_block_size = size;
+
+ if (limits->io_min < limits->physical_block_size)
+ limits->io_min = limits->physical_block_size;
+
+ limits->max_hw_sectors =
+ round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT);
+ limits->max_sectors =
+ round_down(limits->max_sectors, size >> SECTOR_SHIFT);
}
EXPORT_SYMBOL(blk_queue_logical_block_size);
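With the rewrite above, raising the logical block size drags the dependent limits along. For instance, on a queue still carrying the 512-byte defaults:

	blk_queue_logical_block_size(q, 4096);
	/* afterwards: discard_granularity, physical_block_size and io_min are
	 * all at least 4096, and max_hw_sectors/max_sectors are rounded down
	 * to a multiple of 8 sectors (4096 >> SECTOR_SHIFT).
	 */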
@@ -350,12 +342,37 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size)
if (q->limits.physical_block_size < q->limits.logical_block_size)
q->limits.physical_block_size = q->limits.logical_block_size;
+ if (q->limits.discard_granularity < q->limits.physical_block_size)
+ q->limits.discard_granularity = q->limits.physical_block_size;
+
if (q->limits.io_min < q->limits.physical_block_size)
q->limits.io_min = q->limits.physical_block_size;
}
EXPORT_SYMBOL(blk_queue_physical_block_size);
/**
+ * blk_queue_zone_write_granularity - set zone write granularity for the queue
+ * @q: the request queue for the zoned device
+ * @size: the zone write granularity size, in bytes
+ *
+ * Description:
+ *    This should be set to the lowest possible size that allows writing to
+ *    sequential zones of a zoned block device.
+ */
+void blk_queue_zone_write_granularity(struct request_queue *q,
+ unsigned int size)
+{
+ if (WARN_ON_ONCE(!blk_queue_is_zoned(q)))
+ return;
+
+ q->limits.zone_write_granularity = size;
+
+ if (q->limits.zone_write_granularity < q->limits.logical_block_size)
+ q->limits.zone_write_granularity = q->limits.logical_block_size;
+}
+EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity);
+
+/**
* blk_queue_alignment_offset - set physical block alignment offset
* @q: the request queue for the device
* @offset: alignment offset in bytes
@@ -374,6 +391,20 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset)
}
EXPORT_SYMBOL(blk_queue_alignment_offset);
+void disk_update_readahead(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+
+ /*
+ * For read-ahead of large files to be effective, we need to read ahead
+ * at least twice the optimal I/O size.
+ */
+ disk->bdi->ra_pages =
+ max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
+ disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9);
+}
+EXPORT_SYMBOL_GPL(disk_update_readahead);
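A worked example of the calculation above, assuming 4 KiB pages (so VM_READAHEAD_PAGES is 32) and a device reporting a 1 MiB optimal I/O size: ra_pages = max(2 * 1 MiB / 4 KiB, 32) = max(512, 32) = 512 pages, i.e. 2 MiB of read-ahead, while io_pages simply mirrors max_sectors expressed in pages.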
+
/**
* blk_limits_io_min - set minimum request size for a device
* @limits: the queue limits
@@ -452,19 +483,54 @@ EXPORT_SYMBOL(blk_limits_io_opt);
void blk_queue_io_opt(struct request_queue *q, unsigned int opt)
{
blk_limits_io_opt(&q->limits, opt);
+ if (!q->disk)
+ return;
+ q->disk->bdi->ra_pages =
+ max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES);
}
EXPORT_SYMBOL(blk_queue_io_opt);
-/**
- * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers
- * @t: the stacking driver (top)
- * @b: the underlying device (bottom)
- **/
-void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b)
+static int queue_limit_alignment_offset(const struct queue_limits *lim,
+ sector_t sector)
+{
+ unsigned int granularity = max(lim->physical_block_size, lim->io_min);
+ unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT)
+ << SECTOR_SHIFT;
+
+ return (granularity + lim->alignment_offset - alignment) % granularity;
+}
+
+static unsigned int queue_limit_discard_alignment(
+ const struct queue_limits *lim, sector_t sector)
+{
+ unsigned int alignment, granularity, offset;
+
+ if (!lim->max_discard_sectors)
+ return 0;
+
+ /* Why are these in bytes, not sectors? */
+ alignment = lim->discard_alignment >> SECTOR_SHIFT;
+ granularity = lim->discard_granularity >> SECTOR_SHIFT;
+ if (!granularity)
+ return 0;
+
+ /* Offset of the partition start in 'granularity' sectors */
+ offset = sector_div(sector, granularity);
+
+ /* And why do we do this modulus *again* in blkdev_issue_discard()? */
+ offset = (granularity + alignment - offset) % granularity;
+
+ /* Turn it back into bytes, gaah */
+ return offset << SECTOR_SHIFT;
+}
+
+static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs)
{
- blk_stack_limits(&t->limits, &b->limits, 0);
+ sectors = round_down(sectors, lbs >> SECTOR_SHIFT);
+ if (sectors < PAGE_SIZE >> SECTOR_SHIFT)
+ sectors = PAGE_SIZE >> SECTOR_SHIFT;
+ return sectors;
}
-EXPORT_SYMBOL(blk_queue_stack_limits);
/**
* blk_stack_limits - adjust queue_limits for stacked devices
@@ -495,13 +561,11 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors);
t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors);
t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors);
- t->max_write_same_sectors = min(t->max_write_same_sectors,
- b->max_write_same_sectors);
t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors,
b->max_write_zeroes_sectors);
t->max_zone_append_sectors = min(t->max_zone_append_sectors,
b->max_zone_append_sectors);
- t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn);
+ t->bounce = max(t->bounce, b->bounce);
t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask,
b->seg_boundary_mask);
@@ -545,6 +609,11 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->io_min = max(t->io_min, b->io_min);
t->io_opt = lcm_not_zero(t->io_opt, b->io_opt);
+ t->dma_alignment = max(t->dma_alignment, b->dma_alignment);
+
+ /* Set non-power-of-2 compatible chunk_sectors boundary */
+ if (b->chunk_sectors)
+ t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors);
/* Physical block size a multiple of the logical block size? */
if (t->physical_block_size & (t->logical_block_size - 1)) {
@@ -567,6 +636,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1;
}
+ /* chunk_sectors a multiple of the physical block size? */
+ if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) {
+ t->chunk_sectors = 0;
+ t->misaligned = 1;
+ ret = -1;
+ }
+
t->raid_partial_stripes_expensive =
max(t->raid_partial_stripes_expensive,
b->raid_partial_stripes_expensive);
@@ -581,6 +657,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
ret = -1;
}
+ t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size);
+ t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size);
+ t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size);
+
/* Discard alignment and granularity */
if (b->discard_granularity) {
alignment = queue_limit_discard_alignment(b, start);
@@ -604,38 +684,16 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b,
t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) %
t->discard_granularity;
}
-
- if (b->chunk_sectors)
- t->chunk_sectors = min_not_zero(t->chunk_sectors,
- b->chunk_sectors);
-
+ t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors,
+ b->max_secure_erase_sectors);
+ t->zone_write_granularity = max(t->zone_write_granularity,
+ b->zone_write_granularity);
+ t->zoned = max(t->zoned, b->zoned);
return ret;
}
EXPORT_SYMBOL(blk_stack_limits);
/**
- * bdev_stack_limits - adjust queue limits for stacked drivers
- * @t: the stacking driver limits (top device)
- * @bdev: the component block_device (bottom)
- * @start: first data sector within component device
- *
- * Description:
- * Merges queue limits for a top device and a block_device. Returns
- * 0 if alignment didn't change. Returns -1 if adding the bottom
- * device caused misalignment.
- */
-int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev,
- sector_t start)
-{
- struct request_queue *bq = bdev_get_queue(bdev);
-
- start += get_start_sect(bdev);
-
- return blk_stack_limits(t, &bq->limits, start);
-}
-EXPORT_SYMBOL(bdev_stack_limits);
-
-/**
* disk_stack_limits - adjust queue limits for stacked drivers
* @disk: MD/DM gendisk (top)
* @bdev: the underlying block device (bottom)
@@ -650,18 +708,12 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev,
{
struct request_queue *t = disk->queue;
- if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) {
- char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE];
-
- disk_name(disk, 0, top);
- bdevname(bdev, bottom);
-
- printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n",
- top, bottom);
- }
+ if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits,
+ get_start_sect(bdev) + (offset >> 9)) < 0)
+ pr_notice("%s: Warning: Device %pg is misaligned\n",
+ disk->disk_name, bdev);
- t->backing_dev_info->io_pages =
- t->limits.max_sectors >> (PAGE_SHIFT - 9);
+ disk_update_readahead(disk);
}
EXPORT_SYMBOL(disk_stack_limits);
@@ -691,8 +743,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask)
{
if (mask < PAGE_SIZE - 1) {
mask = PAGE_SIZE - 1;
- printk(KERN_INFO "%s: set to minimum %lx\n",
- __func__, mask);
+ pr_info("%s: set to minimum %lx\n", __func__, mask);
}
q->limits.seg_boundary_mask = mask;
@@ -731,7 +782,7 @@ EXPORT_SYMBOL(blk_queue_virt_boundary);
**/
void blk_queue_dma_alignment(struct request_queue *q, int mask)
{
- q->dma_alignment = mask;
+ q->limits.dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_dma_alignment);
@@ -753,8 +804,8 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask)
{
BUG_ON(mask > PAGE_SIZE);
- if (mask > q->dma_alignment)
- q->dma_alignment = mask;
+ if (mask > q->limits.dma_alignment)
+ q->limits.dma_alignment = mask;
}
EXPORT_SYMBOL(blk_queue_update_dma_alignment);
@@ -781,16 +832,17 @@ EXPORT_SYMBOL(blk_set_queue_depth);
*/
void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua)
{
- if (wc)
+ if (wc) {
+ blk_queue_flag_set(QUEUE_FLAG_HW_WC, q);
blk_queue_flag_set(QUEUE_FLAG_WC, q);
- else
+ } else {
+ blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q);
blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+ }
if (fua)
blk_queue_flag_set(QUEUE_FLAG_FUA, q);
else
blk_queue_flag_clear(QUEUE_FLAG_FUA, q);
-
- wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
}
EXPORT_SYMBOL_GPL(blk_queue_write_cache);
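
With QUEUE_FLAG_HW_WC, the hardware capability is now recorded separately from the user-visible QUEUE_FLAG_WC. A hypothetical driver still advertises its cache the same way; the mydrv_* name below is made up for the sketch:

#include <linux/blkdev.h>

/* Sketch of a driver probe path: advertise a volatile write cache with
 * FUA support. QUEUE_FLAG_HW_WC remembers the hardware capability even
 * if the user later clears QUEUE_FLAG_WC via the write_cache attribute. */
static void mydrv_configure_queue(struct request_queue *q)
{
        blk_queue_write_cache(q, true, true);
}
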
@@ -832,10 +884,45 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q,
}
EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging);
-static int __init blk_settings_init(void)
+/**
+ * disk_set_zoned - indicate a zoned device
+ * @disk: gendisk to configure
+ */
+void disk_set_zoned(struct gendisk *disk)
{
- blk_max_low_pfn = max_low_pfn - 1;
- blk_max_pfn = max_pfn - 1;
- return 0;
+ struct request_queue *q = disk->queue;
+
+ WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED));
+
+ /*
+ * Set the zone write granularity to the device logical block
+ * size by default. The driver can change this value if needed.
+ */
+ q->limits.zoned = true;
+ blk_queue_zone_write_granularity(q, queue_logical_block_size(q));
+}
+EXPORT_SYMBOL_GPL(disk_set_zoned);
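
For context, a zoned driver would call the new helper before configuring its zone geometry. A hedged sketch; the mydrv_* name and the 256 MiB zone size are examples only:

#include <linux/blkdev.h>

/* Sketch: mark the disk as host-managed zoned; the zone write
 * granularity defaults to the logical block size and may be raised by
 * the driver afterwards. */
static void mydrv_setup_zoned(struct gendisk *disk)
{
        disk_set_zoned(disk);
        blk_queue_chunk_sectors(disk->queue, 1 << 19);  /* 256 MiB zones */
}
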
+
+int bdev_alignment_offset(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (q->limits.misaligned)
+ return -1;
+ if (bdev_is_partition(bdev))
+ return queue_limit_alignment_offset(&q->limits,
+ bdev->bd_start_sect);
+ return q->limits.alignment_offset;
+}
+EXPORT_SYMBOL_GPL(bdev_alignment_offset);
+
+unsigned int bdev_discard_alignment(struct block_device *bdev)
+{
+ struct request_queue *q = bdev_get_queue(bdev);
+
+ if (bdev_is_partition(bdev))
+ return queue_limit_discard_alignment(&q->limits,
+ bdev->bd_start_sect);
+ return q->limits.discard_alignment;
}
-subsys_initcall(blk_settings_init);
+EXPORT_SYMBOL_GPL(bdev_discard_alignment);
diff --git a/block/blk-softirq.c b/block/blk-softirq.c
deleted file mode 100644
index 6e7ec87d49fa..000000000000
--- a/block/blk-softirq.c
+++ /dev/null
@@ -1,156 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Functions related to softirq rq completions
- */
-#include <linux/kernel.h>
-#include <linux/module.h>
-#include <linux/init.h>
-#include <linux/bio.h>
-#include <linux/blkdev.h>
-#include <linux/interrupt.h>
-#include <linux/cpu.h>
-#include <linux/sched.h>
-#include <linux/sched/topology.h>
-
-#include "blk.h"
-
-static DEFINE_PER_CPU(struct list_head, blk_cpu_done);
-
-/*
- * Softirq action handler - move entries to local list and loop over them
- * while passing them to the queue registered handler.
- */
-static __latent_entropy void blk_done_softirq(struct softirq_action *h)
-{
- struct list_head *cpu_list, local_list;
-
- local_irq_disable();
- cpu_list = this_cpu_ptr(&blk_cpu_done);
- list_replace_init(cpu_list, &local_list);
- local_irq_enable();
-
- while (!list_empty(&local_list)) {
- struct request *rq;
-
- rq = list_entry(local_list.next, struct request, ipi_list);
- list_del_init(&rq->ipi_list);
- rq->q->mq_ops->complete(rq);
- }
-}
-
-#ifdef CONFIG_SMP
-static void trigger_softirq(void *data)
-{
- struct request *rq = data;
- struct list_head *list;
-
- list = this_cpu_ptr(&blk_cpu_done);
- list_add_tail(&rq->ipi_list, list);
-
- if (list->next == &rq->ipi_list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
-}
-
-/*
- * Setup and invoke a run of 'trigger_softirq' on the given cpu.
- */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- if (cpu_online(cpu)) {
- call_single_data_t *data = &rq->csd;
-
- data->func = trigger_softirq;
- data->info = rq;
- data->flags = 0;
-
- smp_call_function_single_async(cpu, data);
- return 0;
- }
-
- return 1;
-}
-#else /* CONFIG_SMP */
-static int raise_blk_irq(int cpu, struct request *rq)
-{
- return 1;
-}
-#endif
-
-static int blk_softirq_cpu_dead(unsigned int cpu)
-{
- /*
- * If a CPU goes away, splice its entries to the current CPU
- * and trigger a run of the softirq
- */
- local_irq_disable();
- list_splice_init(&per_cpu(blk_cpu_done, cpu),
- this_cpu_ptr(&blk_cpu_done));
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- local_irq_enable();
-
- return 0;
-}
-
-void __blk_complete_request(struct request *req)
-{
- struct request_queue *q = req->q;
- int cpu, ccpu = req->mq_ctx->cpu;
- unsigned long flags;
- bool shared = false;
-
- BUG_ON(!q->mq_ops->complete);
-
- local_irq_save(flags);
- cpu = smp_processor_id();
-
- /*
- * Select completion CPU
- */
- if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) {
- if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags))
- shared = cpus_share_cache(cpu, ccpu);
- } else
- ccpu = cpu;
-
- /*
- * If current CPU and requested CPU share a cache, run the softirq on
- * the current CPU. One might concern this is just like
- * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is
- * running in interrupt handler, and currently I/O controller doesn't
- * support multiple interrupts, so current CPU is unique actually. This
- * avoids IPI sending from current CPU to the first CPU of a group.
- */
- if (ccpu == cpu || shared) {
- struct list_head *list;
-do_local:
- list = this_cpu_ptr(&blk_cpu_done);
- list_add_tail(&req->ipi_list, list);
-
- /*
- * if the list only contains our just added request,
- * signal a raise of the softirq. If there are already
- * entries there, someone already raised the irq but it
- * hasn't run yet.
- */
- if (list->next == &req->ipi_list)
- raise_softirq_irqoff(BLOCK_SOFTIRQ);
- } else if (raise_blk_irq(ccpu, req))
- goto do_local;
-
- local_irq_restore(flags);
-}
-
-static __init int blk_softirq_init(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i));
-
- open_softirq(BLOCK_SOFTIRQ, blk_done_softirq);
- cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD,
- "block/softirq:dead", NULL,
- blk_softirq_cpu_dead);
- return 0;
-}
-subsys_initcall(blk_softirq_init);
diff --git a/block/blk-stat.c b/block/blk-stat.c
index 7da302ff88d0..7ff76ae6c76a 100644
--- a/block/blk-stat.c
+++ b/block/blk-stat.c
@@ -6,7 +6,6 @@
*/
#include <linux/kernel.h>
#include <linux/rculist.h>
-#include <linux/blk-mq.h>
#include "blk-stat.h"
#include "blk-mq.h"
@@ -15,7 +14,7 @@
struct blk_queue_stats {
struct list_head callbacks;
spinlock_t lock;
- bool enable_accounting;
+ int accounting;
};
void blk_rq_stat_init(struct blk_rq_stat *stat)
@@ -58,7 +57,8 @@ void blk_stat_add(struct request *rq, u64 now)
value = (now >= rq->io_start_time_ns) ? now - rq->io_start_time_ns : 0;
- blk_throtl_stat_add(rq, value);
+ if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE)
+ blk_throtl_stat_add(rq, value);
rcu_read_lock();
cpu = get_cpu();
@@ -137,6 +137,7 @@ void blk_stat_add_callback(struct request_queue *q,
struct blk_stat_callback *cb)
{
unsigned int bucket;
+ unsigned long flags;
int cpu;
for_each_possible_cpu(cpu) {
@@ -147,20 +148,22 @@ void blk_stat_add_callback(struct request_queue *q,
blk_rq_stat_init(&cpu_stat[bucket]);
}
- spin_lock(&q->stats->lock);
+ spin_lock_irqsave(&q->stats->lock, flags);
list_add_tail_rcu(&cb->list, &q->stats->callbacks);
blk_queue_flag_set(QUEUE_FLAG_STATS, q);
- spin_unlock(&q->stats->lock);
+ spin_unlock_irqrestore(&q->stats->lock, flags);
}
void blk_stat_remove_callback(struct request_queue *q,
struct blk_stat_callback *cb)
{
- spin_lock(&q->stats->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->stats->lock, flags);
list_del_rcu(&cb->list);
- if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting)
+ if (list_empty(&q->stats->callbacks) && !q->stats->accounting)
blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
- spin_unlock(&q->stats->lock);
+ spin_unlock_irqrestore(&q->stats->lock, flags);
del_timer_sync(&cb->timer);
}
@@ -181,12 +184,25 @@ void blk_stat_free_callback(struct blk_stat_callback *cb)
call_rcu(&cb->rcu, blk_stat_free_callback_rcu);
}
+void blk_stat_disable_accounting(struct request_queue *q)
+{
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->stats->lock, flags);
+ if (!--q->stats->accounting && list_empty(&q->stats->callbacks))
+ blk_queue_flag_clear(QUEUE_FLAG_STATS, q);
+ spin_unlock_irqrestore(&q->stats->lock, flags);
+}
+EXPORT_SYMBOL_GPL(blk_stat_disable_accounting);
+
void blk_stat_enable_accounting(struct request_queue *q)
{
- spin_lock(&q->stats->lock);
- q->stats->enable_accounting = true;
- blk_queue_flag_set(QUEUE_FLAG_STATS, q);
- spin_unlock(&q->stats->lock);
+ unsigned long flags;
+
+ spin_lock_irqsave(&q->stats->lock, flags);
+ if (!q->stats->accounting++ && list_empty(&q->stats->callbacks))
+ blk_queue_flag_set(QUEUE_FLAG_STATS, q);
+ spin_unlock_irqrestore(&q->stats->lock, flags);
}
EXPORT_SYMBOL_GPL(blk_stat_enable_accounting);
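
Since accounting is now a reference count rather than a boolean, enable and disable calls must be balanced. A minimal sketch of a hypothetical in-kernel user:

#include "blk-stat.h"

/* Sketch: each enable bumps q->stats->accounting and may set
 * QUEUE_FLAG_STATS; the matching disable drops the count and clears the
 * flag once no callbacks or other users remain. */
static void example_stats_on(struct request_queue *q)
{
        blk_stat_enable_accounting(q);
}

static void example_stats_off(struct request_queue *q)
{
        blk_stat_disable_accounting(q);
}
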
@@ -200,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void)
INIT_LIST_HEAD(&stats->callbacks);
spin_lock_init(&stats->lock);
- stats->enable_accounting = false;
+ stats->accounting = 0;
return stats;
}
diff --git a/block/blk-stat.h b/block/blk-stat.h
index 17b47a86eefb..17e1eb4ec7e2 100644
--- a/block/blk-stat.h
+++ b/block/blk-stat.h
@@ -64,11 +64,13 @@ struct blk_stat_callback {
struct blk_queue_stats *blk_alloc_queue_stats(void);
void blk_free_queue_stats(struct blk_queue_stats *);
+bool blk_stats_alloc_enable(struct request_queue *q);
void blk_stat_add(struct request *rq, u64 now);
/* record time/size info in request but not add a callback */
void blk_stat_enable_accounting(struct request_queue *q);
+void blk_stat_disable_accounting(struct request_queue *q);
/**
* blk_stat_alloc_callback() - Allocate a block statistics callback.
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c
index 02643e149d5e..6b2429cad81a 100644
--- a/block/blk-sysfs.c
+++ b/block/blk-sysfs.c
@@ -9,13 +9,16 @@
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/blktrace_api.h>
-#include <linux/blk-mq.h>
-#include <linux/blk-cgroup.h>
+#include <linux/debugfs.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
+#include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
+#include "blk-throttle.h"
struct queue_sysfs_entry {
struct attribute attr;
@@ -44,22 +47,9 @@ queue_var_store(unsigned long *var, const char *page, size_t count)
return count;
}
-static ssize_t queue_var_store64(s64 *var, const char *page)
-{
- int err;
- s64 v;
-
- err = kstrtos64(page, 10, &v);
- if (err < 0)
- return err;
-
- *var = v;
- return 0;
-}
-
static ssize_t queue_requests_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->nr_requests, (page));
+ return queue_var_show(q->nr_requests, page);
}
static ssize_t
@@ -87,23 +77,26 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_ra_show(struct request_queue *q, char *page)
{
- unsigned long ra_kb = q->backing_dev_info->ra_pages <<
- (PAGE_SHIFT - 10);
+ unsigned long ra_kb;
- return queue_var_show(ra_kb, (page));
+ if (!q->disk)
+ return -EINVAL;
+ ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10);
+ return queue_var_show(ra_kb, page);
}
static ssize_t
queue_ra_store(struct request_queue *q, const char *page, size_t count)
{
unsigned long ra_kb;
- ssize_t ret = queue_var_store(&ra_kb, page, count);
+ ssize_t ret;
+ if (!q->disk)
+ return -EINVAL;
+ ret = queue_var_store(&ra_kb, page, count);
if (ret < 0)
return ret;
-
- q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
-
+ q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10);
return ret;
}
@@ -111,28 +104,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page)
{
int max_sectors_kb = queue_max_sectors(q) >> 1;
- return queue_var_show(max_sectors_kb, (page));
+ return queue_var_show(max_sectors_kb, page);
}
static ssize_t queue_max_segments_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_segments(q), (page));
+ return queue_var_show(queue_max_segments(q), page);
}
static ssize_t queue_max_discard_segments_show(struct request_queue *q,
char *page)
{
- return queue_var_show(queue_max_discard_segments(q), (page));
+ return queue_var_show(queue_max_discard_segments(q), page);
}
static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page)
{
- return queue_var_show(q->limits.max_integrity_segments, (page));
+ return queue_var_show(q->limits.max_integrity_segments, page);
}
static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page)
{
- return queue_var_show(queue_max_segment_size(q), (page));
+ return queue_var_show(queue_max_segment_size(q), page);
}
static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page)
@@ -208,8 +201,7 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag
static ssize_t queue_write_same_max_show(struct request_queue *q, char *page)
{
- return sprintf(page, "%llu\n",
- (unsigned long long)q->limits.max_write_same_sectors << 9);
+ return queue_var_show(0, page);
}
static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
@@ -218,6 +210,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page)
(unsigned long long)q->limits.max_write_zeroes_sectors << 9);
}
+static ssize_t queue_zone_write_granularity_show(struct request_queue *q,
+ char *page)
+{
+ return queue_var_show(queue_zone_write_granularity(q), page);
+}
+
static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
{
unsigned long long max_sectors = q->limits.max_zone_append_sectors;
@@ -228,23 +226,33 @@ static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page)
static ssize_t
queue_max_sectors_store(struct request_queue *q, const char *page, size_t count)
{
- unsigned long max_sectors_kb,
+ unsigned long var;
+ unsigned int max_sectors_kb,
max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1,
page_kb = 1 << (PAGE_SHIFT - 10);
- ssize_t ret = queue_var_store(&max_sectors_kb, page, count);
+ ssize_t ret = queue_var_store(&var, page, count);
if (ret < 0)
return ret;
- max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long)
+ max_sectors_kb = (unsigned int)var;
+ max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb,
q->limits.max_dev_sectors >> 1);
-
- if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb)
- return -EINVAL;
+ if (max_sectors_kb == 0) {
+ q->limits.max_user_sectors = 0;
+ max_sectors_kb = min(max_hw_sectors_kb,
+ BLK_DEF_MAX_SECTORS_CAP >> 1);
+ } else {
+ if (max_sectors_kb > max_hw_sectors_kb ||
+ max_sectors_kb < page_kb)
+ return -EINVAL;
+ q->limits.max_user_sectors = max_sectors_kb << 1;
+ }
spin_lock_irq(&q->queue_lock);
q->limits.max_sectors = max_sectors_kb << 1;
- q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
+ if (q->disk)
+ q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10);
spin_unlock_irq(&q->queue_lock);
return ret;
@@ -254,19 +262,29 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page)
{
int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1;
- return queue_var_show(max_hw_sectors_kb, (page));
+ return queue_var_show(max_hw_sectors_kb, page);
+}
+
+static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(q->limits.virt_boundary_mask, page);
+}
+
+static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(queue_dma_alignment(q), page);
}
#define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \
static ssize_t \
-queue_show_##name(struct request_queue *q, char *page) \
+queue_##name##_show(struct request_queue *q, char *page) \
{ \
int bit; \
bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \
return queue_var_show(neg ? !bit : bit, page); \
} \
static ssize_t \
-queue_store_##name(struct request_queue *q, const char *page, size_t count) \
+queue_##name##_store(struct request_queue *q, const char *page, size_t count) \
{ \
unsigned long val; \
ssize_t ret; \
@@ -286,23 +304,29 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \
QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1);
QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0);
QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0);
+QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0);
#undef QUEUE_SYSFS_BIT_FNS
static ssize_t queue_zoned_show(struct request_queue *q, char *page)
{
- switch (blk_queue_zoned_model(q)) {
- case BLK_ZONED_HA:
- return sprintf(page, "host-aware\n");
- case BLK_ZONED_HM:
+ if (blk_queue_is_zoned(q))
return sprintf(page, "host-managed\n");
- default:
- return sprintf(page, "none\n");
- }
+ return sprintf(page, "none\n");
}
static ssize_t queue_nr_zones_show(struct request_queue *q, char *page)
{
- return queue_var_show(blk_queue_nr_zones(q), page);
+ return queue_var_show(disk_nr_zones(q->disk), page);
+}
+
+static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(bdev_max_open_zones(q->disk->part0), page);
+}
+
+static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(bdev_max_active_zones(q->disk->part0), page);
}
static ssize_t queue_nomerges_show(struct request_queue *q, char *page)
@@ -365,35 +389,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count)
static ssize_t queue_poll_delay_show(struct request_queue *q, char *page)
{
- int val;
-
- if (q->poll_nsec == BLK_MQ_POLL_CLASSIC)
- val = BLK_MQ_POLL_CLASSIC;
- else
- val = q->poll_nsec / 1000;
-
- return sprintf(page, "%d\n", val);
+ return sprintf(page, "%d\n", -1);
}
static ssize_t queue_poll_delay_store(struct request_queue *q, const char *page,
size_t count)
{
- int err, val;
-
- if (!q->mq_ops || !q->mq_ops->poll)
- return -EINVAL;
-
- err = kstrtoint(page, 10, &val);
- if (err < 0)
- return err;
-
- if (val == BLK_MQ_POLL_CLASSIC)
- q->poll_nsec = BLK_MQ_POLL_CLASSIC;
- else if (val >= 0)
- q->poll_nsec = val * 1000;
- else
- return -EINVAL;
-
return count;
}
@@ -405,23 +406,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page)
static ssize_t queue_poll_store(struct request_queue *q, const char *page,
size_t count)
{
- unsigned long poll_on;
- ssize_t ret;
-
- if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL ||
- !q->tag_set->map[HCTX_TYPE_POLL].nr_queues)
+ if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
return -EINVAL;
-
- ret = queue_var_store(&poll_on, page, count);
- if (ret < 0)
- return ret;
-
- if (poll_on)
- blk_queue_flag_set(QUEUE_FLAG_POLL, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_POLL, q);
-
- return ret;
+ pr_info_ratelimited("writes to the poll attribute are ignored.\n");
+ pr_info_ratelimited("please use driver specific parameters instead.\n");
+ return count;
}
static ssize_t queue_io_timeout_show(struct request_queue *q, char *page)
@@ -444,11 +433,133 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page,
return count;
}
+static ssize_t queue_wc_show(struct request_queue *q, char *page)
+{
+ if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
+ return sprintf(page, "write back\n");
+
+ return sprintf(page, "write through\n");
+}
+
+static ssize_t queue_wc_store(struct request_queue *q, const char *page,
+ size_t count)
+{
+ if (!strncmp(page, "write back", 10)) {
+ if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags))
+ return -EINVAL;
+ blk_queue_flag_set(QUEUE_FLAG_WC, q);
+ } else if (!strncmp(page, "write through", 13) ||
+ !strncmp(page, "none", 4)) {
+ blk_queue_flag_clear(QUEUE_FLAG_WC, q);
+ } else {
+ return -EINVAL;
+ }
+
+ return count;
+}
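
With this change, enabling write back from userspace only succeeds when the device actually reported a write cache. A userspace sketch; the device path is an example:

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

/* Sketch: writing "write back" to the write_cache attribute now fails
 * with EINVAL when QUEUE_FLAG_HW_WC is not set, i.e. the hardware never
 * advertised a write cache. "sda" is just an example device. */
int main(void)
{
        int fd = open("/sys/block/sda/queue/write_cache", O_WRONLY);

        if (fd < 0)
                return 1;
        if (write(fd, "write back", strlen("write back")) < 0)
                perror("write back rejected");
        close(fd);
        return 0;
}
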
+
+static ssize_t queue_fua_show(struct request_queue *q, char *page)
+{
+ return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
+}
+
+static ssize_t queue_dax_show(struct request_queue *q, char *page)
+{
+ return queue_var_show(blk_queue_dax(q), page);
+}
+
+#define QUEUE_RO_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0444 }, \
+ .show = _prefix##_show, \
+};
+
+#define QUEUE_RW_ENTRY(_prefix, _name) \
+static struct queue_sysfs_entry _prefix##_entry = { \
+ .attr = { .name = _name, .mode = 0644 }, \
+ .show = _prefix##_show, \
+ .store = _prefix##_store, \
+};
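
For readability, each QUEUE_RO_ENTRY/QUEUE_RW_ENTRY invocation below expands to a queue_sysfs_entry definition; for example, QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb") is roughly equivalent to:

static struct queue_sysfs_entry queue_ra_entry = {
        .attr = { .name = "read_ahead_kb", .mode = 0644 },
        .show = queue_ra_show,
        .store = queue_ra_store,
};
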
+
+QUEUE_RW_ENTRY(queue_requests, "nr_requests");
+QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb");
+QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb");
+QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb");
+QUEUE_RO_ENTRY(queue_max_segments, "max_segments");
+QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments");
+QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size");
+QUEUE_RW_ENTRY(elv_iosched, "scheduler");
+
+QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size");
+QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size");
+QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors");
+QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size");
+QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size");
+
+QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments");
+QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity");
+QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes");
+QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes");
+QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data");
+
+QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes");
+QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes");
+QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity");
+
+QUEUE_RO_ENTRY(queue_zoned, "zoned");
+QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones");
+QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones");
+QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones");
+
+QUEUE_RW_ENTRY(queue_nomerges, "nomerges");
+QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity");
+QUEUE_RW_ENTRY(queue_poll, "io_poll");
+QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay");
+QUEUE_RW_ENTRY(queue_wc, "write_cache");
+QUEUE_RO_ENTRY(queue_fua, "fua");
+QUEUE_RO_ENTRY(queue_dax, "dax");
+QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout");
+QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask");
+QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment");
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
+QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time");
+#endif
+
+/* legacy alias for logical_block_size: */
+static struct queue_sysfs_entry queue_hw_sector_size_entry = {
+ .attr = {.name = "hw_sector_size", .mode = 0444 },
+ .show = queue_logical_block_size_show,
+};
+
+QUEUE_RW_ENTRY(queue_nonrot, "rotational");
+QUEUE_RW_ENTRY(queue_iostats, "iostats");
+QUEUE_RW_ENTRY(queue_random, "add_random");
+QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes");
+
+#ifdef CONFIG_BLK_WBT
+static ssize_t queue_var_store64(s64 *var, const char *page)
+{
+ int err;
+ s64 v;
+
+ err = kstrtos64(page, 10, &v);
+ if (err < 0)
+ return err;
+
+ *var = v;
+ return 0;
+}
+
static ssize_t queue_wb_lat_show(struct request_queue *q, char *page)
{
if (!wbt_rq_qos(q))
return -EINVAL;
+ if (wbt_disabled(q))
+ return sprintf(page, "0\n");
+
return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000));
}
@@ -467,7 +578,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
rqos = wbt_rq_qos(q);
if (!rqos) {
- ret = wbt_init(q);
+ ret = wbt_init(q->disk);
if (ret)
return ret;
}
@@ -496,251 +607,11 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page,
return count;
}
-static ssize_t queue_wc_show(struct request_queue *q, char *page)
-{
- if (test_bit(QUEUE_FLAG_WC, &q->queue_flags))
- return sprintf(page, "write back\n");
-
- return sprintf(page, "write through\n");
-}
-
-static ssize_t queue_wc_store(struct request_queue *q, const char *page,
- size_t count)
-{
- int set = -1;
-
- if (!strncmp(page, "write back", 10))
- set = 1;
- else if (!strncmp(page, "write through", 13) ||
- !strncmp(page, "none", 4))
- set = 0;
-
- if (set == -1)
- return -EINVAL;
-
- if (set)
- blk_queue_flag_set(QUEUE_FLAG_WC, q);
- else
- blk_queue_flag_clear(QUEUE_FLAG_WC, q);
-
- return count;
-}
-
-static ssize_t queue_fua_show(struct request_queue *q, char *page)
-{
- return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags));
-}
-
-static ssize_t queue_dax_show(struct request_queue *q, char *page)
-{
- return queue_var_show(blk_queue_dax(q), page);
-}
-
-static struct queue_sysfs_entry queue_requests_entry = {
- .attr = {.name = "nr_requests", .mode = 0644 },
- .show = queue_requests_show,
- .store = queue_requests_store,
-};
-
-static struct queue_sysfs_entry queue_ra_entry = {
- .attr = {.name = "read_ahead_kb", .mode = 0644 },
- .show = queue_ra_show,
- .store = queue_ra_store,
-};
-
-static struct queue_sysfs_entry queue_max_sectors_entry = {
- .attr = {.name = "max_sectors_kb", .mode = 0644 },
- .show = queue_max_sectors_show,
- .store = queue_max_sectors_store,
-};
-
-static struct queue_sysfs_entry queue_max_hw_sectors_entry = {
- .attr = {.name = "max_hw_sectors_kb", .mode = 0444 },
- .show = queue_max_hw_sectors_show,
-};
-
-static struct queue_sysfs_entry queue_max_segments_entry = {
- .attr = {.name = "max_segments", .mode = 0444 },
- .show = queue_max_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_discard_segments_entry = {
- .attr = {.name = "max_discard_segments", .mode = 0444 },
- .show = queue_max_discard_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_integrity_segments_entry = {
- .attr = {.name = "max_integrity_segments", .mode = 0444 },
- .show = queue_max_integrity_segments_show,
-};
-
-static struct queue_sysfs_entry queue_max_segment_size_entry = {
- .attr = {.name = "max_segment_size", .mode = 0444 },
- .show = queue_max_segment_size_show,
-};
-
-static struct queue_sysfs_entry queue_iosched_entry = {
- .attr = {.name = "scheduler", .mode = 0644 },
- .show = elv_iosched_show,
- .store = elv_iosched_store,
-};
-
-static struct queue_sysfs_entry queue_hw_sector_size_entry = {
- .attr = {.name = "hw_sector_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
-};
-
-static struct queue_sysfs_entry queue_logical_block_size_entry = {
- .attr = {.name = "logical_block_size", .mode = 0444 },
- .show = queue_logical_block_size_show,
-};
-
-static struct queue_sysfs_entry queue_physical_block_size_entry = {
- .attr = {.name = "physical_block_size", .mode = 0444 },
- .show = queue_physical_block_size_show,
-};
-
-static struct queue_sysfs_entry queue_chunk_sectors_entry = {
- .attr = {.name = "chunk_sectors", .mode = 0444 },
- .show = queue_chunk_sectors_show,
-};
-
-static struct queue_sysfs_entry queue_io_min_entry = {
- .attr = {.name = "minimum_io_size", .mode = 0444 },
- .show = queue_io_min_show,
-};
-
-static struct queue_sysfs_entry queue_io_opt_entry = {
- .attr = {.name = "optimal_io_size", .mode = 0444 },
- .show = queue_io_opt_show,
-};
-
-static struct queue_sysfs_entry queue_discard_granularity_entry = {
- .attr = {.name = "discard_granularity", .mode = 0444 },
- .show = queue_discard_granularity_show,
-};
-
-static struct queue_sysfs_entry queue_discard_max_hw_entry = {
- .attr = {.name = "discard_max_hw_bytes", .mode = 0444 },
- .show = queue_discard_max_hw_show,
-};
-
-static struct queue_sysfs_entry queue_discard_max_entry = {
- .attr = {.name = "discard_max_bytes", .mode = 0644 },
- .show = queue_discard_max_show,
- .store = queue_discard_max_store,
-};
-
-static struct queue_sysfs_entry queue_discard_zeroes_data_entry = {
- .attr = {.name = "discard_zeroes_data", .mode = 0444 },
- .show = queue_discard_zeroes_data_show,
-};
-
-static struct queue_sysfs_entry queue_write_same_max_entry = {
- .attr = {.name = "write_same_max_bytes", .mode = 0444 },
- .show = queue_write_same_max_show,
-};
-
-static struct queue_sysfs_entry queue_write_zeroes_max_entry = {
- .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 },
- .show = queue_write_zeroes_max_show,
-};
-
-static struct queue_sysfs_entry queue_zone_append_max_entry = {
- .attr = {.name = "zone_append_max_bytes", .mode = 0444 },
- .show = queue_zone_append_max_show,
-};
-
-static struct queue_sysfs_entry queue_nonrot_entry = {
- .attr = {.name = "rotational", .mode = 0644 },
- .show = queue_show_nonrot,
- .store = queue_store_nonrot,
-};
-
-static struct queue_sysfs_entry queue_zoned_entry = {
- .attr = {.name = "zoned", .mode = 0444 },
- .show = queue_zoned_show,
-};
-
-static struct queue_sysfs_entry queue_nr_zones_entry = {
- .attr = {.name = "nr_zones", .mode = 0444 },
- .show = queue_nr_zones_show,
-};
-
-static struct queue_sysfs_entry queue_nomerges_entry = {
- .attr = {.name = "nomerges", .mode = 0644 },
- .show = queue_nomerges_show,
- .store = queue_nomerges_store,
-};
-
-static struct queue_sysfs_entry queue_rq_affinity_entry = {
- .attr = {.name = "rq_affinity", .mode = 0644 },
- .show = queue_rq_affinity_show,
- .store = queue_rq_affinity_store,
-};
-
-static struct queue_sysfs_entry queue_iostats_entry = {
- .attr = {.name = "iostats", .mode = 0644 },
- .show = queue_show_iostats,
- .store = queue_store_iostats,
-};
-
-static struct queue_sysfs_entry queue_random_entry = {
- .attr = {.name = "add_random", .mode = 0644 },
- .show = queue_show_random,
- .store = queue_store_random,
-};
-
-static struct queue_sysfs_entry queue_poll_entry = {
- .attr = {.name = "io_poll", .mode = 0644 },
- .show = queue_poll_show,
- .store = queue_poll_store,
-};
-
-static struct queue_sysfs_entry queue_poll_delay_entry = {
- .attr = {.name = "io_poll_delay", .mode = 0644 },
- .show = queue_poll_delay_show,
- .store = queue_poll_delay_store,
-};
-
-static struct queue_sysfs_entry queue_wc_entry = {
- .attr = {.name = "write_cache", .mode = 0644 },
- .show = queue_wc_show,
- .store = queue_wc_store,
-};
-
-static struct queue_sysfs_entry queue_fua_entry = {
- .attr = {.name = "fua", .mode = 0444 },
- .show = queue_fua_show,
-};
-
-static struct queue_sysfs_entry queue_dax_entry = {
- .attr = {.name = "dax", .mode = 0444 },
- .show = queue_dax_show,
-};
-
-static struct queue_sysfs_entry queue_io_timeout_entry = {
- .attr = {.name = "io_timeout", .mode = 0644 },
- .show = queue_io_timeout_show,
- .store = queue_io_timeout_store,
-};
-
-static struct queue_sysfs_entry queue_wb_lat_entry = {
- .attr = {.name = "wbt_lat_usec", .mode = 0644 },
- .show = queue_wb_lat_show,
- .store = queue_wb_lat_store,
-};
-
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
-static struct queue_sysfs_entry throtl_sample_time_entry = {
- .attr = {.name = "throttle_sample_time", .mode = 0644 },
- .show = blk_throtl_sample_time_show,
- .store = blk_throtl_sample_time_store,
-};
+QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec");
#endif
+/* Common attributes for bio-based and request-based queues. */
static struct attribute *queue_attrs[] = {
- &queue_requests_entry.attr,
&queue_ra_entry.attr,
&queue_max_hw_sectors_entry.attr,
&queue_max_sectors_entry.attr,
@@ -748,7 +619,6 @@ static struct attribute *queue_attrs[] = {
&queue_max_discard_segments_entry.attr,
&queue_max_integrity_segments_entry.attr,
&queue_max_segment_size_entry.attr,
- &queue_iosched_entry.attr,
&queue_hw_sector_size_entry.attr,
&queue_logical_block_size_entry.attr,
&queue_physical_block_size_entry.attr,
@@ -762,22 +632,37 @@ static struct attribute *queue_attrs[] = {
&queue_write_same_max_entry.attr,
&queue_write_zeroes_max_entry.attr,
&queue_zone_append_max_entry.attr,
+ &queue_zone_write_granularity_entry.attr,
&queue_nonrot_entry.attr,
&queue_zoned_entry.attr,
&queue_nr_zones_entry.attr,
+ &queue_max_open_zones_entry.attr,
+ &queue_max_active_zones_entry.attr,
&queue_nomerges_entry.attr,
- &queue_rq_affinity_entry.attr,
&queue_iostats_entry.attr,
+ &queue_stable_writes_entry.attr,
&queue_random_entry.attr,
&queue_poll_entry.attr,
&queue_wc_entry.attr,
&queue_fua_entry.attr,
&queue_dax_entry.attr,
- &queue_wb_lat_entry.attr,
&queue_poll_delay_entry.attr,
- &queue_io_timeout_entry.attr,
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
- &throtl_sample_time_entry.attr,
+ &blk_throtl_sample_time_entry.attr,
+#endif
+ &queue_virt_boundary_mask_entry.attr,
+ &queue_dma_alignment_entry.attr,
+ NULL,
+};
+
+/* Request-based queue attributes that are not relevant for bio-based queues. */
+static struct attribute *blk_mq_queue_attrs[] = {
+ &queue_requests_entry.attr,
+ &elv_iosched_entry.attr,
+ &queue_rq_affinity_entry.attr,
+ &queue_io_timeout_entry.attr,
+#ifdef CONFIG_BLK_WBT
+ &queue_wb_lat_entry.attr,
#endif
NULL,
};
@@ -785,12 +670,28 @@ static struct attribute *queue_attrs[] = {
static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr,
int n)
{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
+
+ if ((attr == &queue_max_open_zones_entry.attr ||
+ attr == &queue_max_active_zones_entry.attr) &&
+ !blk_queue_is_zoned(q))
+ return 0;
+
+ return attr->mode;
+}
+
+static umode_t blk_mq_queue_attr_visible(struct kobject *kobj,
+ struct attribute *attr, int n)
+{
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
- if (attr == &queue_io_timeout_entry.attr &&
- (!q->mq_ops || !q->mq_ops->timeout))
- return 0;
+ if (!queue_is_mq(q))
+ return 0;
+
+ if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout)
+ return 0;
return attr->mode;
}
@@ -800,6 +701,10 @@ static struct attribute_group queue_attr_group = {
.is_visible = queue_attr_visible,
};
+static struct attribute_group blk_mq_queue_attr_group = {
+ .attrs = blk_mq_queue_attrs,
+ .is_visible = blk_mq_queue_attr_visible,
+};
#define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr)
@@ -807,8 +712,8 @@ static ssize_t
queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->show)
@@ -824,135 +729,106 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *page, size_t length)
{
struct queue_sysfs_entry *entry = to_queue(attr);
- struct request_queue *q;
+ struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj);
+ struct request_queue *q = disk->queue;
ssize_t res;
if (!entry->store)
return -EIO;
- q = container_of(kobj, struct request_queue, kobj);
mutex_lock(&q->sysfs_lock);
res = entry->store(q, page, length);
mutex_unlock(&q->sysfs_lock);
return res;
}
-static void blk_free_queue_rcu(struct rcu_head *rcu_head)
-{
- struct request_queue *q = container_of(rcu_head, struct request_queue,
- rcu_head);
- kmem_cache_free(blk_requestq_cachep, q);
-}
-
-/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */
-static void blk_exit_queue(struct request_queue *q)
-{
- /*
- * Since the I/O scheduler exit code may access cgroup information,
- * perform I/O scheduler exit before disassociating from the block
- * cgroup controller.
- */
- if (q->elevator) {
- ioc_clear_queue(q);
- __elevator_exit(q, q->elevator);
- q->elevator = NULL;
- }
+static const struct sysfs_ops queue_sysfs_ops = {
+ .show = queue_attr_show,
+ .store = queue_attr_store,
+};
- /*
- * Remove all references to @q from the block cgroup controller before
- * restoring @q->queue_lock to avoid that restoring this pointer causes
- * e.g. blkcg_print_blkgs() to crash.
- */
- blkcg_exit_queue(q);
+static const struct attribute_group *blk_queue_attr_groups[] = {
+ &queue_attr_group,
+ &blk_mq_queue_attr_group,
+ NULL
+};
- /*
- * Since the cgroup code may dereference the @q->backing_dev_info
- * pointer, only decrease its reference count after having removed the
- * association with the block cgroup controller.
- */
- bdi_put(q->backing_dev_info);
+static void blk_queue_release(struct kobject *kobj)
+{
+ /* nothing to do here, all data is associated with the parent gendisk */
}
+static const struct kobj_type blk_queue_ktype = {
+ .default_groups = blk_queue_attr_groups,
+ .sysfs_ops = &queue_sysfs_ops,
+ .release = blk_queue_release,
+};
-/**
- * __blk_release_queue - release a request queue
- * @work: pointer to the release_work member of the request queue to be released
- *
- * Description:
- * This function is called when a block device is being unregistered. The
- * process of releasing a request queue starts with blk_cleanup_queue, which
- * set the appropriate flags and then calls blk_put_queue, that decrements
- * the reference counter of the request queue. Once the reference counter
- * of the request queue reaches zero, blk_release_queue is called to release
- * all allocated resources of the request queue.
- */
-static void __blk_release_queue(struct work_struct *work)
+static void blk_debugfs_remove(struct gendisk *disk)
{
- struct request_queue *q = container_of(work, typeof(*q), release_work);
-
- if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
- blk_stat_remove_callback(q, q->poll_cb);
- blk_stat_free_callback(q->poll_cb);
-
- blk_free_queue_stats(q->stats);
-
- if (queue_is_mq(q))
- cancel_delayed_work_sync(&q->requeue_work);
-
- blk_exit_queue(q);
-
- blk_queue_free_zone_bitmaps(q);
-
- if (queue_is_mq(q))
- blk_mq_release(q);
+ struct request_queue *q = disk->queue;
+ mutex_lock(&q->debugfs_mutex);
blk_trace_shutdown(q);
-
- if (queue_is_mq(q))
- blk_mq_debugfs_unregister(q);
-
- bioset_exit(&q->bio_split);
-
- ida_simple_remove(&blk_queue_ida, q->id);
- call_rcu(&q->rcu_head, blk_free_queue_rcu);
-}
-
-static void blk_release_queue(struct kobject *kobj)
-{
- struct request_queue *q =
- container_of(kobj, struct request_queue, kobj);
-
- INIT_WORK(&q->release_work, __blk_release_queue);
- schedule_work(&q->release_work);
+ debugfs_remove_recursive(q->debugfs_dir);
+ q->debugfs_dir = NULL;
+ q->sched_debugfs_dir = NULL;
+ q->rqos_debugfs_dir = NULL;
+ mutex_unlock(&q->debugfs_mutex);
}
-static const struct sysfs_ops queue_sysfs_ops = {
- .show = queue_attr_show,
- .store = queue_attr_store,
-};
-
-struct kobj_type blk_queue_ktype = {
- .sysfs_ops = &queue_sysfs_ops,
- .release = blk_release_queue,
-};
-
/**
* blk_register_queue - register a block layer queue with sysfs
* @disk: Disk of which the request queue should be registered with sysfs.
*/
int blk_register_queue(struct gendisk *disk)
{
- int ret;
- struct device *dev = disk_to_dev(disk);
struct request_queue *q = disk->queue;
- bool has_elevator = false;
+ int ret;
- if (WARN_ON(!q))
- return -ENXIO;
+ mutex_lock(&q->sysfs_dir_lock);
+ kobject_init(&disk->queue_kobj, &blk_queue_ktype);
+ ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
+ if (ret < 0)
+ goto out_put_queue_kobj;
- WARN_ONCE(blk_queue_registered(q),
- "%s is registering an already registered queue\n",
- kobject_name(&dev->kobj));
+ if (queue_is_mq(q)) {
+ ret = blk_mq_sysfs_register(disk);
+ if (ret)
+ goto out_put_queue_kobj;
+ }
+ mutex_lock(&q->sysfs_lock);
+
+ mutex_lock(&q->debugfs_mutex);
+ q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root);
+ if (queue_is_mq(q))
+ blk_mq_debugfs_register(q);
+ mutex_unlock(&q->debugfs_mutex);
+
+ ret = disk_register_independent_access_ranges(disk);
+ if (ret)
+ goto out_debugfs_remove;
+
+ if (q->elevator) {
+ ret = elv_register_queue(q, false);
+ if (ret)
+ goto out_unregister_ia_ranges;
+ }
+
+ ret = blk_crypto_sysfs_register(disk);
+ if (ret)
+ goto out_elv_unregister;
+
+ blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
+ wbt_enable_default(disk);
+ blk_throtl_register(disk);
+
+ /* Now everything is ready and send out KOBJ_ADD uevent */
+ kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
+ if (q->elevator)
+ kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
+ mutex_unlock(&q->sysfs_lock);
+ mutex_unlock(&q->sysfs_dir_lock);
/*
* SCSI probing may synchronously create and destroy a lot of
@@ -968,61 +844,20 @@ int blk_register_queue(struct gendisk *disk)
percpu_ref_switch_to_percpu(&q->q_usage_counter);
}
- ret = blk_trace_init_sysfs(dev);
- if (ret)
- return ret;
-
- mutex_lock(&q->sysfs_dir_lock);
-
- ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue");
- if (ret < 0) {
- blk_trace_remove_sysfs(dev);
- goto unlock;
- }
-
- ret = sysfs_create_group(&q->kobj, &queue_attr_group);
- if (ret) {
- blk_trace_remove_sysfs(dev);
- kobject_del(&q->kobj);
- kobject_put(&dev->kobj);
- goto unlock;
- }
-
- if (queue_is_mq(q)) {
- __blk_mq_register_dev(dev, q);
- blk_mq_debugfs_register(q);
- }
-
- mutex_lock(&q->sysfs_lock);
- if (q->elevator) {
- ret = elv_register_queue(q, false);
- if (ret) {
- mutex_unlock(&q->sysfs_lock);
- mutex_unlock(&q->sysfs_dir_lock);
- kobject_del(&q->kobj);
- blk_trace_remove_sysfs(dev);
- kobject_put(&dev->kobj);
- return ret;
- }
- has_elevator = true;
- }
-
- blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
- wbt_enable_default(q);
- blk_throtl_register_queue(q);
+ return ret;
- /* Now everything is ready and send out KOBJ_ADD uevent */
- kobject_uevent(&q->kobj, KOBJ_ADD);
- if (has_elevator)
- kobject_uevent(&q->elevator->kobj, KOBJ_ADD);
+out_elv_unregister:
+ elv_unregister_queue(q);
+out_unregister_ia_ranges:
+ disk_unregister_independent_access_ranges(disk);
+out_debugfs_remove:
+ blk_debugfs_remove(disk);
mutex_unlock(&q->sysfs_lock);
-
- ret = 0;
-unlock:
+out_put_queue_kobj:
+ kobject_put(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
return ret;
}
-EXPORT_SYMBOL_GPL(blk_register_queue);
/**
* blk_unregister_queue - counterpart of blk_register_queue()
@@ -1057,17 +892,18 @@ void blk_unregister_queue(struct gendisk *disk)
* structures that can be modified through sysfs.
*/
if (queue_is_mq(q))
- blk_mq_unregister_dev(disk_to_dev(disk), q);
-
- kobject_uevent(&q->kobj, KOBJ_REMOVE);
- kobject_del(&q->kobj);
- blk_trace_remove_sysfs(disk_to_dev(disk));
+ blk_mq_sysfs_unregister(disk);
+ blk_crypto_sysfs_unregister(disk);
mutex_lock(&q->sysfs_lock);
- if (q->elevator)
- elv_unregister_queue(q);
+ elv_unregister_queue(q);
+ disk_unregister_independent_access_ranges(disk);
mutex_unlock(&q->sysfs_lock);
+
+ /* Now that we've deleted all child objects, we can delete the queue. */
+ kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE);
+ kobject_del(&disk->queue_kobj);
mutex_unlock(&q->sysfs_dir_lock);
- kobject_put(&disk_to_dev(disk)->kobj);
+ blk_debugfs_remove(disk);
}
diff --git a/block/blk-throttle.c b/block/blk-throttle.c
index 209fdd8939fb..16f5766620a4 100644
--- a/block/blk-throttle.c
+++ b/block/blk-throttle.c
@@ -10,15 +10,16 @@
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
-#include <linux/blk-cgroup.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
+#include "blk-stat.h"
+#include "blk-throttle.h"
/* Max dispatch from a group in 1 round */
-static int throtl_grp_quantum = 8;
+#define THROTL_GRP_QUANTUM 8
/* Total max dispatch from all groups in one round */
-static int throtl_quantum = 32;
+#define THROTL_QUANTUM 32
/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
@@ -37,151 +38,11 @@ static int throtl_quantum = 32;
*/
#define LATENCY_FILTERED_HD (1000L) /* 1ms */
-static struct blkcg_policy blkcg_policy_throtl;
-
/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
-/*
- * To implement hierarchical throttling, throtl_grps form a tree and bios
- * are dispatched upwards level by level until they reach the top and get
- * issued. When dispatching bios from the children and local group at each
- * level, if the bios are dispatched into a single bio_list, there's a risk
- * of a local or child group which can queue many bios at once filling up
- * the list starving others.
- *
- * To avoid such starvation, dispatched bios are queued separately
- * according to where they came from. When they are again dispatched to
- * the parent, they're popped in round-robin order so that no single source
- * hogs the dispatch window.
- *
- * throtl_qnode is used to keep the queued bios separated by their sources.
- * Bios are queued to throtl_qnode which in turn is queued to
- * throtl_service_queue and then dispatched in round-robin order.
- *
- * It's also used to track the reference counts on blkg's. A qnode always
- * belongs to a throtl_grp and gets queued on itself or the parent, so
- * incrementing the reference of the associated throtl_grp when a qnode is
- * queued and decrementing when dequeued is enough to keep the whole blkg
- * tree pinned while bios are in flight.
- */
-struct throtl_qnode {
- struct list_head node; /* service_queue->queued[] */
- struct bio_list bios; /* queued bios */
- struct throtl_grp *tg; /* tg this qnode belongs to */
-};
-
-struct throtl_service_queue {
- struct throtl_service_queue *parent_sq; /* the parent service_queue */
-
- /*
- * Bios queued directly to this service_queue or dispatched from
- * children throtl_grp's.
- */
- struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
- unsigned int nr_queued[2]; /* number of queued bios */
-
- /*
- * RB tree of active children throtl_grp's, which are sorted by
- * their ->disptime.
- */
- struct rb_root_cached pending_tree; /* RB tree of active tgs */
- unsigned int nr_pending; /* # queued in the tree */
- unsigned long first_pending_disptime; /* disptime of the first tg */
- struct timer_list pending_timer; /* fires on first_pending_disptime */
-};
-
-enum tg_state_flags {
- THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
- THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
-};
-
#define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node)
-enum {
- LIMIT_LOW,
- LIMIT_MAX,
- LIMIT_CNT,
-};
-
-struct throtl_grp {
- /* must be the first member */
- struct blkg_policy_data pd;
-
- /* active throtl group service_queue member */
- struct rb_node rb_node;
-
- /* throtl_data this group belongs to */
- struct throtl_data *td;
-
- /* this group's service queue */
- struct throtl_service_queue service_queue;
-
- /*
- * qnode_on_self is used when bios are directly queued to this
- * throtl_grp so that local bios compete fairly with bios
- * dispatched from children. qnode_on_parent is used when bios are
- * dispatched from this throtl_grp into its parent and will compete
- * with the sibling qnode_on_parents and the parent's
- * qnode_on_self.
- */
- struct throtl_qnode qnode_on_self[2];
- struct throtl_qnode qnode_on_parent[2];
-
- /*
- * Dispatch time in jiffies. This is the estimated time when group
- * will unthrottle and is ready to dispatch more bio. It is used as
- * key to sort active groups in service tree.
- */
- unsigned long disptime;
-
- unsigned int flags;
-
- /* are there any throtl rules between this group and td? */
- bool has_rules[2];
-
- /* internally used bytes per second rate limits */
- uint64_t bps[2][LIMIT_CNT];
- /* user configured bps limits */
- uint64_t bps_conf[2][LIMIT_CNT];
-
- /* internally used IOPS limits */
- unsigned int iops[2][LIMIT_CNT];
- /* user configured IOPS limits */
- unsigned int iops_conf[2][LIMIT_CNT];
-
- /* Number of bytes disptached in current slice */
- uint64_t bytes_disp[2];
- /* Number of bio's dispatched in current slice */
- unsigned int io_disp[2];
-
- unsigned long last_low_overflow_time[2];
-
- uint64_t last_bytes_disp[2];
- unsigned int last_io_disp[2];
-
- unsigned long last_check_time;
-
- unsigned long latency_target; /* us */
- unsigned long latency_target_conf; /* us */
- /* When did we start a new slice */
- unsigned long slice_start[2];
- unsigned long slice_end[2];
-
- unsigned long last_finish_time; /* ns / 1024 */
- unsigned long checked_last_finish_time; /* ns / 1024 */
- unsigned long avg_idletime; /* ns / 1024 */
- unsigned long idletime_threshold; /* us */
- unsigned long idletime_threshold_conf; /* us */
-
- unsigned int bio_cnt; /* total bios */
- unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
- unsigned long bio_cnt_reset_time;
-
- struct blkg_rwstat stat_bytes;
- struct blkg_rwstat stat_ios;
-};
-
/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9
@@ -228,16 +89,6 @@ struct throtl_data
static void throtl_pending_timer_fn(struct timer_list *t);
-static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
-{
- return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
-}
-
-static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
-{
- return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
-}
-
static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
return pd_to_blkg(&tg->pd);
@@ -278,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
/*
* cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
* make the IO dispatch more smooth.
- * Scale up: linearly scale up according to lapsed time since upgrade. For
+ * Scale up: linearly scale up according to elapsed time since upgrade. For
* every throtl_slice, the limit scales up 1/2 .low limit till the
* limit hits .max limit
* Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
@@ -376,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
break; \
if ((__tg)) { \
blk_add_cgroup_trace_msg(__td->queue, \
- tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\
+ &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
} else { \
blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \
} \
@@ -423,12 +274,13 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
*/
static struct bio *throtl_peek_queued(struct list_head *queued)
{
- struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct throtl_qnode *qn;
struct bio *bio;
if (list_empty(queued))
return NULL;
+ qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_peek(&qn->bios);
WARN_ON_ONCE(!bio);
return bio;
@@ -451,12 +303,13 @@ static struct bio *throtl_peek_queued(struct list_head *queued)
static struct bio *throtl_pop_queued(struct list_head *queued,
struct throtl_grp **tg_to_put)
{
- struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
+ struct throtl_qnode *qn;
struct bio *bio;
if (list_empty(queued))
return NULL;
+ qn = list_first_entry(queued, struct throtl_qnode, node);
bio = bio_list_pop(&qn->bios);
WARN_ON_ONCE(!bio);
@@ -476,20 +329,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued,
/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
- INIT_LIST_HEAD(&sq->queued[0]);
- INIT_LIST_HEAD(&sq->queued[1]);
+ INIT_LIST_HEAD(&sq->queued[READ]);
+ INIT_LIST_HEAD(&sq->queued[WRITE]);
sq->pending_tree = RB_ROOT_CACHED;
timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0);
}
-static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp,
- struct request_queue *q,
- struct blkcg *blkcg)
+static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk,
+ struct blkcg *blkcg, gfp_t gfp)
{
struct throtl_grp *tg;
int rw;
- tg = kzalloc_node(sizeof(*tg), gfp, q->node);
+ tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id);
if (!tg)
return NULL;
@@ -542,8 +394,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd)
* If on the default hierarchy, we switch to properly hierarchical
* behavior where limits on a given throtl_grp are applied to the
* whole subtree rather than just the group itself. e.g. If 16M
- * read_bps limit is set on the root group, the whole system can't
- * exceed 16M for the device.
+ * read_bps limit is set on a parent group, summary bps of
+ * parent group and its subtree groups can't exceed 16M for the
+ * device.
*
* If not on the default hierarchy, the broken flat hierarchy
* behavior is retained where all throtl_grps are treated as if
@@ -568,11 +421,16 @@ static void tg_update_has_rules(struct throtl_grp *tg)
struct throtl_data *td = tg->td;
int rw;
- for (rw = READ; rw <= WRITE; rw++)
- tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
+ for (rw = READ; rw <= WRITE; rw++) {
+ tg->has_rules_iops[rw] =
+ (parent_tg && parent_tg->has_rules_iops[rw]) ||
+ (td->limit_valid[td->limit_index] &&
+ tg_iops_limit(tg, rw) != UINT_MAX);
+ tg->has_rules_bps[rw] =
+ (parent_tg && parent_tg->has_rules_bps[rw]) ||
(td->limit_valid[td->limit_index] &&
- (tg_bps_limit(tg, rw) != U64_MAX ||
- tg_iops_limit(tg, rw) != UINT_MAX));
+ (tg_bps_limit(tg, rw) != U64_MAX));
+ }
}
static void throtl_pd_online(struct blkg_policy_data *pd)
@@ -585,6 +443,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd)
tg_update_has_rules(tg);
}
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void blk_throtl_update_limit_valid(struct throtl_data *td)
{
struct cgroup_subsys_state *pos_css;
@@ -605,6 +464,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td)
td->limit_valid[LIMIT_LOW] = low_valid;
}
+#else
+static inline void blk_throtl_update_limit_valid(struct throtl_data *td)
+{
+}
+#endif
static void throtl_upgrade_state(struct throtl_data *td);
static void throtl_pd_offline(struct blkg_policy_data *pd)
@@ -636,9 +500,6 @@ static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
struct rb_node *n;
- /* Service tree is empty */
- if (!parent_sq->nr_pending)
- return NULL;
n = rb_first_cached(&parent_sq->pending_tree);
WARN_ON_ONCE(!n);
@@ -652,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n,
{
rb_erase_cached(n, &parent_sq->pending_tree);
RB_CLEAR_NODE(n);
- --parent_sq->nr_pending;
}
static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
@@ -692,29 +552,25 @@ static void tg_service_queue_add(struct throtl_grp *tg)
leftmost);
}
-static void __throtl_enqueue_tg(struct throtl_grp *tg)
-{
- tg_service_queue_add(tg);
- tg->flags |= THROTL_TG_PENDING;
- tg->service_queue.parent_sq->nr_pending++;
-}
-
static void throtl_enqueue_tg(struct throtl_grp *tg)
{
- if (!(tg->flags & THROTL_TG_PENDING))
- __throtl_enqueue_tg(tg);
-}
-
-static void __throtl_dequeue_tg(struct throtl_grp *tg)
-{
- throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
- tg->flags &= ~THROTL_TG_PENDING;
+ if (!(tg->flags & THROTL_TG_PENDING)) {
+ tg_service_queue_add(tg);
+ tg->flags |= THROTL_TG_PENDING;
+ tg->service_queue.parent_sq->nr_pending++;
+ }
}
static void throtl_dequeue_tg(struct throtl_grp *tg)
{
- if (tg->flags & THROTL_TG_PENDING)
- __throtl_dequeue_tg(tg);
+ if (tg->flags & THROTL_TG_PENDING) {
+ struct throtl_service_queue *parent_sq =
+ tg->service_queue.parent_sq;
+
+ throtl_rb_erase(&tg->rb_node, parent_sq);
+ --parent_sq->nr_pending;
+ tg->flags &= ~THROTL_TG_PENDING;
+ }
}
/* Call with queue lock held */
@@ -779,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
+ tg->carryover_bytes[rw] = 0;
+ tg->carryover_ios[rw] = 0;
/*
* Previous slice has expired. We must have trimmed it after last
@@ -786,7 +644,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
* that bandwidth. Do try to make use of that bandwidth while giving
* credit.
*/
- if (time_after_eq(start, tg->slice_start[rw]))
+ if (time_after(start, tg->slice_start[rw]))
tg->slice_start[rw] = start;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
@@ -796,12 +654,18 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
tg->slice_end[rw], jiffies);
}
-static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
+static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw,
+ bool clear_carryover)
{
tg->bytes_disp[rw] = 0;
tg->io_disp[rw] = 0;
tg->slice_start[rw] = jiffies;
tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
+ if (clear_carryover) {
+ tg->carryover_bytes[rw] = 0;
+ tg->carryover_ios[rw] = 0;
+ }
+
throtl_log(&tg->service_queue,
"[%c] new slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -817,7 +681,7 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
unsigned long jiffy_end)
{
- tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
+ throtl_set_slice_end(tg, rw, jiffy_end);
throtl_log(&tg->service_queue,
"[%c] extend slice start=%lu end=%lu jiffies=%lu",
rw == READ ? 'R' : 'W', tg->slice_start[rw],
@@ -833,11 +697,47 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
return true;
}
+static unsigned int calculate_io_allowed(u32 iops_limit,
+ unsigned long jiffy_elapsed)
+{
+ unsigned int io_allowed;
+ u64 tmp;
+
+ /*
+ * jiffy_elapsed should not be a large value: since the minimum iops is
+ * 1, jiffy_elapsed can be at most the equivalent of 1 second, because
+ * dispatch is allowed after 1 second and the slice should have been
+ * trimmed by then.
+ */
+
+ tmp = (u64)iops_limit * jiffy_elapsed;
+ do_div(tmp, HZ);
+
+ if (tmp > UINT_MAX)
+ io_allowed = UINT_MAX;
+ else
+ io_allowed = tmp;
+
+ return io_allowed;
+}
+
+static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed)
+{
+ /*
+ * Can result be wider than 64 bits?
+ * We check against 62, not 64, due to ilog2 truncation.
+ */
+ if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62)
+ return U64_MAX;
+ return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ);
+}
+
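As a sanity check on the overflow guard above, here is a minimal user-space sketch of the same arithmetic. HZ = 1000 and the local ilog2_u64() helper are stand-ins, and the 128-bit intermediate (a GCC/Clang extension) mirrors what mul_u64_u64_div_u64() does in the kernel; none of this is part of the patch itself.

#include <stdint.h>
#include <stdio.h>

#define HZ 1000ULL                       /* assumed; the kernel value varies */

static int ilog2_u64(uint64_t v)         /* floor(log2(v)), v must be > 0 */
{
        int l = -1;

        while (v) {
                v >>= 1;
                l++;
        }
        return l;
}

static uint64_t bytes_allowed(uint64_t bps_limit, uint64_t jiffy_elapsed)
{
        /* ilog2 truncates, so compare against 62 rather than 64 */
        if (ilog2_u64(bps_limit) + ilog2_u64(jiffy_elapsed) - ilog2_u64(HZ) > 62)
                return UINT64_MAX;

        /* 128-bit intermediate plays the role of mul_u64_u64_div_u64() */
        return (uint64_t)((__uint128_t)bps_limit * jiffy_elapsed / HZ);
}

int main(void)
{
        printf("%llu\n", (unsigned long long)bytes_allowed(1 << 20, 250));
        return 0;
}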
/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
- unsigned long nr_slices, time_elapsed, io_trim;
- u64 bytes_trim, tmp;
+ unsigned long time_elapsed;
+ long long bytes_trim;
+ int io_trim;
BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));
@@ -852,97 +752,121 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
/*
* A bio has been dispatched. Also adjust slice_end. It might happen
* that initially cgroup limit was very low resulting in high
- * slice_end, but later limit was bumped up and bio was dispached
+ * slice_end, but later limit was bumped up and bio was dispatched
* sooner, then we need to reduce slice_end. A high bogus slice_end
* is bad because it does not allow new slice to start.
*/
throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);
- time_elapsed = jiffies - tg->slice_start[rw];
-
- nr_slices = time_elapsed / tg->td->throtl_slice;
-
- if (!nr_slices)
+ time_elapsed = rounddown(jiffies - tg->slice_start[rw],
+ tg->td->throtl_slice);
+ if (!time_elapsed)
return;
- tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
- do_div(tmp, HZ);
- bytes_trim = tmp;
- io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
- HZ;
-
- if (!bytes_trim && !io_trim)
+ bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw),
+ time_elapsed) +
+ tg->carryover_bytes[rw];
+ io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) +
+ tg->carryover_ios[rw];
+ if (bytes_trim <= 0 && io_trim <= 0)
return;
- if (tg->bytes_disp[rw] >= bytes_trim)
+ tg->carryover_bytes[rw] = 0;
+ if ((long long)tg->bytes_disp[rw] >= bytes_trim)
tg->bytes_disp[rw] -= bytes_trim;
else
tg->bytes_disp[rw] = 0;
- if (tg->io_disp[rw] >= io_trim)
+ tg->carryover_ios[rw] = 0;
+ if ((int)tg->io_disp[rw] >= io_trim)
tg->io_disp[rw] -= io_trim;
else
tg->io_disp[rw] = 0;
- tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;
+ tg->slice_start[rw] += time_elapsed;
throtl_log(&tg->service_queue,
- "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
- rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
- tg->slice_start[rw], tg->slice_end[rw], jiffies);
+ "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu",
+ rw == READ ? 'R' : 'W', time_elapsed / tg->td->throtl_slice,
+ bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw],
+ jiffies);
}
-static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static void __tg_update_carryover(struct throtl_grp *tg, bool rw)
{
- bool rw = bio_data_dir(bio);
- unsigned int io_allowed;
- unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
- u64 tmp;
-
- jiffy_elapsed = jiffies - tg->slice_start[rw];
-
- /* Round up to the next throttle slice, wait time must be nonzero */
- jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
+ unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw];
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ u32 iops_limit = tg_iops_limit(tg, rw);
/*
- * jiffy_elapsed_rnd should not be a big value as minimum iops can be
- * 1 then at max jiffy elapsed should be equivalent of 1 second as we
- * will allow dispatch after 1 second and after that slice should
- * have been trimmed.
+ * If the config is updated while bios are still throttled, calculate
+ * and accumulate how many bytes/ios have already waited across the
+ * changes. carryover_bytes/ios will then be used to calculate the wait
+ * time under the new configuration.
*/
+ if (bps_limit != U64_MAX)
+ tg->carryover_bytes[rw] +=
+ calculate_bytes_allowed(bps_limit, jiffy_elapsed) -
+ tg->bytes_disp[rw];
+ if (iops_limit != UINT_MAX)
+ tg->carryover_ios[rw] +=
+ calculate_io_allowed(iops_limit, jiffy_elapsed) -
+ tg->io_disp[rw];
+}
- tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
- do_div(tmp, HZ);
+static void tg_update_carryover(struct throtl_grp *tg)
+{
+ if (tg->service_queue.nr_queued[READ])
+ __tg_update_carryover(tg, READ);
+ if (tg->service_queue.nr_queued[WRITE])
+ __tg_update_carryover(tg, WRITE);
- if (tmp > UINT_MAX)
- io_allowed = UINT_MAX;
- else
- io_allowed = tmp;
+ /* see comments in struct throtl_grp for meaning of these fields. */
+ throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__,
+ tg->carryover_bytes[READ], tg->carryover_bytes[WRITE],
+ tg->carryover_ios[READ], tg->carryover_ios[WRITE]);
+}
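To make the carryover bookkeeping concrete, a hedged user-space sketch with made-up numbers (the limit, HZ and helper below are illustrative only, not taken from the patch): with a 1 MiB/s limit, 500 jiffies elapsed at HZ = 1000 and 200 KiB already dispatched, 312 KiB of unused budget carries over and is credited under the new configuration.

#include <stdio.h>

#define HZ 1000LL                       /* assumed for the example */

static long long bytes_allowed(long long bps_limit, long long jiffies_elapsed)
{
        return bps_limit * jiffies_elapsed / HZ;   /* overflow ignored here */
}

int main(void)
{
        long long bps_limit = 1 << 20;      /* 1 MiB/s before the config change */
        long long elapsed = 500;            /* jiffies since slice_start */
        long long dispatched = 200 << 10;   /* bytes already sent this slice */

        /* unused budget under the old limit, credited against the new one */
        long long carryover_bytes = bytes_allowed(bps_limit, elapsed) - dispatched;

        printf("carryover_bytes = %lld\n", carryover_bytes);   /* 319488 */
        return 0;
}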
- if (tg->io_disp[rw] + 1 <= io_allowed) {
- if (wait)
- *wait = 0;
- return true;
+static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio,
+ u32 iops_limit)
+{
+ bool rw = bio_data_dir(bio);
+ int io_allowed;
+ unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
+
+ if (iops_limit == UINT_MAX) {
+ return 0;
}
+ jiffy_elapsed = jiffies - tg->slice_start[rw];
+
+ /* Round up to the next throttle slice, wait time must be nonzero */
+ jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice);
+ io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) +
+ tg->carryover_ios[rw];
+ if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed)
+ return 0;
+
/* Calc approx time to dispatch */
jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed;
-
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
-static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
- unsigned long *wait)
+static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio,
+ u64 bps_limit)
{
bool rw = bio_data_dir(bio);
- u64 bytes_allowed, extra_bytes, tmp;
+ long long bytes_allowed;
+ u64 extra_bytes;
unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
unsigned int bio_size = throtl_bio_data_size(bio);
+ /* no need to throttle if this bio's bytes have been accounted */
+ if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ return 0;
+ }
+
jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];
/* Slice has just started. Consider one slice interval */
@@ -950,20 +874,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
jiffy_elapsed_rnd = tg->td->throtl_slice;
jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);
-
- tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
- do_div(tmp, HZ);
- bytes_allowed = tmp;
-
- if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) {
- if (wait)
- *wait = 0;
- return true;
- }
+ bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) +
+ tg->carryover_bytes[rw];
+ if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed)
+ return 0;
/* Calc approx time to dispatch */
extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed;
- jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));
+ jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit);
if (!jiffy_wait)
jiffy_wait = 1;
@@ -973,9 +891,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
* up we did. Add that time also.
*/
jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
- if (wait)
- *wait = jiffy_wait;
- return false;
+ return jiffy_wait;
}
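For the wait-time calculation just above (extra_bytes * HZ / bps_limit), a quick illustrative example with made-up numbers: 64 KiB over the allowance at a 1 MiB/s limit works out to 62 jiffies, i.e. roughly 62 ms at an assumed HZ of 1000.

#include <stdio.h>

int main(void)
{
        unsigned long long hz = 1000;            /* assumed */
        unsigned long long bps_limit = 1 << 20;  /* 1 MiB/s */
        unsigned long long extra_bytes = 64 << 10;

        /* same shape as the calculation in tg_within_bps_limit() */
        unsigned long long jiffy_wait = extra_bytes * hz / bps_limit;
        if (!jiffy_wait)
                jiffy_wait = 1;

        printf("wait %llu jiffies\n", jiffy_wait);   /* prints 62 */
        return 0;
}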
/*
@@ -987,6 +903,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
{
bool rw = bio_data_dir(bio);
unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;
+ u64 bps_limit = tg_bps_limit(tg, rw);
+ u32 iops_limit = tg_iops_limit(tg, rw);
/*
* Currently whole state machine of group depends on first bio
@@ -998,8 +916,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
bio != throtl_peek_queued(&tg->service_queue.queued[rw]));
/* If tg->bps = -1, then BW is unlimited */
- if (tg_bps_limit(tg, rw) == U64_MAX &&
- tg_iops_limit(tg, rw) == UINT_MAX) {
+ if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) ||
+ tg->flags & THROTL_TG_CANCELING) {
if (wait)
*wait = 0;
return true;
@@ -1013,7 +931,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
* slice and it should be extended instead.
*/
if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
- throtl_start_new_slice(tg, rw);
+ throtl_start_new_slice(tg, rw, true);
else {
if (time_before(tg->slice_end[rw],
jiffies + tg->td->throtl_slice))
@@ -1021,8 +939,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
jiffies + tg->td->throtl_slice);
}
- if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
- tg_with_in_iops_limit(tg, bio, &iops_wait)) {
+ bps_wait = tg_within_bps_limit(tg, bio, bps_limit);
+ iops_wait = tg_within_iops_limit(tg, bio, iops_limit);
+ if (bps_wait + iops_wait == 0) {
if (wait)
*wait = 0;
return true;
@@ -1045,19 +964,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
unsigned int bio_size = throtl_bio_data_size(bio);
/* Charge the bio to the group */
- tg->bytes_disp[rw] += bio_size;
+ if (!bio_flagged(bio, BIO_BPS_THROTTLED)) {
+ tg->bytes_disp[rw] += bio_size;
+ tg->last_bytes_disp[rw] += bio_size;
+ }
+
tg->io_disp[rw]++;
- tg->last_bytes_disp[rw] += bio_size;
tg->last_io_disp[rw]++;
-
- /*
- * BIO_THROTTLED is used to prevent the same bio to be throttled
- * more than once as a throttled bio will go through blk-throtl the
- * second time when it eventually gets issued. Set it when a bio
- * is being charged to a tg.
- */
- if (!bio_flagged(bio, BIO_THROTTLED))
- bio_set_flag(bio, BIO_THROTTLED);
}
/**
@@ -1082,7 +995,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
* If @tg doesn't currently have any bios queued in the same
* direction, queueing @bio can change when @tg should be
* dispatched. Mark that @tg was empty. This is automatically
- * cleaered on the next tg_update_disptime().
+ * cleared on the next tg_update_disptime().
*/
if (!sq->nr_queued[rw])
tg->flags |= THROTL_TG_WAS_EMPTY;
@@ -1111,9 +1024,9 @@ static void tg_update_disptime(struct throtl_grp *tg)
disptime = jiffies + min_wait;
/* Update dispatch time */
- throtl_dequeue_tg(tg);
+ throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
tg->disptime = disptime;
- throtl_enqueue_tg(tg);
+ tg_service_queue_add(tg);
/* see throtl_add_bio_tg() */
tg->flags &= ~THROTL_TG_WAS_EMPTY;
@@ -1159,6 +1072,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
start_parent_slice_with_credit(tg, parent_tg, rw);
} else {
+ bio_set_flag(bio, BIO_BPS_THROTTLED);
throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
&parent_sq->queued[rw]);
BUG_ON(tg->td->nr_queued[rw] <= 0);
@@ -1175,8 +1089,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg)
{
struct throtl_service_queue *sq = &tg->service_queue;
unsigned int nr_reads = 0, nr_writes = 0;
- unsigned int max_nr_reads = throtl_grp_quantum*3/4;
- unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
+ unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4;
+ unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads;
struct bio *bio;
/* Try to dispatch 75% READS and 25% WRITES */
@@ -1209,24 +1123,28 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
unsigned int nr_disp = 0;
while (1) {
- struct throtl_grp *tg = throtl_rb_first(parent_sq);
+ struct throtl_grp *tg;
struct throtl_service_queue *sq;
+ if (!parent_sq->nr_pending)
+ break;
+
+ tg = throtl_rb_first(parent_sq);
if (!tg)
break;
if (time_before(jiffies, tg->disptime))
break;
- throtl_dequeue_tg(tg);
-
nr_disp += throtl_dispatch_tg(tg);
sq = &tg->service_queue;
- if (sq->nr_queued[0] || sq->nr_queued[1])
+ if (sq->nr_queued[READ] || sq->nr_queued[WRITE])
tg_update_disptime(tg);
+ else
+ throtl_dequeue_tg(tg);
- if (nr_disp >= throtl_quantum)
+ if (nr_disp >= THROTL_QUANTUM)
break;
}
@@ -1255,12 +1173,22 @@ static void throtl_pending_timer_fn(struct timer_list *t)
struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
struct throtl_grp *tg = sq_to_tg(sq);
struct throtl_data *td = sq_to_td(sq);
- struct request_queue *q = td->queue;
struct throtl_service_queue *parent_sq;
+ struct request_queue *q;
bool dispatched;
int ret;
+ /* throtl_data may be gone, so figure out request queue by blkg */
+ if (tg)
+ q = tg->pd.blkg->q;
+ else
+ q = td->queue;
+
spin_lock_irq(&q->queue_lock);
+
+ if (!q->root_blkg)
+ goto out_unlock;
+
if (throtl_can_upgrade(td, NULL))
throtl_upgrade_state(td);
@@ -1303,7 +1231,7 @@ again:
}
}
} else {
- /* reached the top-level, queue issueing */
+ /* reached the top-level, queue issuing */
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
out_unlock:
@@ -1314,8 +1242,8 @@ out_unlock:
* blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
* @work: work item being executed
*
- * This function is queued for execution when bio's reach the bio_lists[]
- * of throtl_data->service_queue. Those bio's are ready and issued by this
+ * This function is queued for execution when bios reach the bio_lists[]
+ * of throtl_data->service_queue. Those bios are ready and issued by this
* function.
*/
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
@@ -1339,8 +1267,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work)
if (!bio_list_empty(&bio_list_on_stack)) {
blk_start_plug(&plug);
- while((bio = bio_list_pop(&bio_list_on_stack)))
- generic_make_request(bio);
+ while ((bio = bio_list_pop(&bio_list_on_stack)))
+ submit_bio_noacct_nocheck(bio);
blk_finish_plug(&plug);
}
}
@@ -1392,6 +1320,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));
+ rcu_read_lock();
/*
* Update has_rules[] flags for the updated tg's subtree. A tg is
* considered to have rules if either the tg itself or any of its
@@ -1419,6 +1348,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
this_tg->latency_target = max(this_tg->latency_target,
parent_tg->latency_target);
}
+ rcu_read_unlock();
/*
* We're already holding queue_lock and know @tg is valid. Let's
@@ -1428,8 +1358,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global)
* that a group's limit are dropped suddenly and we don't want to
* account recently dispatched IO with new low rate.
*/
- throtl_start_new_slice(tg, 0);
- throtl_start_new_slice(tg, 1);
+ throtl_start_new_slice(tg, READ, false);
+ throtl_start_new_slice(tg, WRITE, false);
if (tg->flags & THROTL_TG_PENDING) {
tg_update_disptime(tg);
@@ -1446,9 +1376,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
int ret;
u64 v;
- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ blkg_conf_init(&ctx, buf);
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- return ret;
+ goto out_finish;
ret = -EINVAL;
if (sscanf(ctx.body, "%llu", &v) != 1)
@@ -1457,6 +1389,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
v = U64_MAX;
tg = blkg_to_tg(ctx.blkg);
+ tg_update_carryover(tg);
if (is_u64)
*(u64 *)((void *)tg + of_cft(of)->private) = v;
@@ -1466,7 +1399,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of,
tg_conf_updated(tg, false);
ret = 0;
out_finish:
- blkg_conf_finish(&ctx);
+ blkg_conf_exit(&ctx);
return ret ?: nbytes;
}
@@ -1638,11 +1571,14 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
int ret;
int index = of_cft(of)->private;
- ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
+ blkg_conf_init(&ctx, buf);
+
+ ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx);
if (ret)
- return ret;
+ goto out_finish;
tg = blkg_to_tg(ctx.blkg);
+ tg_update_carryover(tg);
v[0] = tg->bps_conf[READ][index];
v[1] = tg->bps_conf[WRITE][index];
@@ -1674,13 +1610,13 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
goto out_finish;
ret = -EINVAL;
- if (!strcmp(tok, "rbps"))
+ if (!strcmp(tok, "rbps") && val > 1)
v[0] = val;
- else if (!strcmp(tok, "wbps"))
+ else if (!strcmp(tok, "wbps") && val > 1)
v[1] = val;
- else if (!strcmp(tok, "riops"))
+ else if (!strcmp(tok, "riops") && val > 1)
v[2] = min_t(u64, val, UINT_MAX);
- else if (!strcmp(tok, "wiops"))
+ else if (!strcmp(tok, "wiops") && val > 1)
v[3] = min_t(u64, val, UINT_MAX);
else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
idle_time = val;
@@ -1738,7 +1674,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of,
tg->td->limit_valid[LIMIT_LOW]);
ret = 0;
out_finish:
- blkg_conf_finish(&ctx);
+ blkg_conf_exit(&ctx);
return ret ?: nbytes;
}
@@ -1769,7 +1705,7 @@ static void throtl_shutdown_wq(struct request_queue *q)
cancel_work_sync(&td->dispatch_work);
}
-static struct blkcg_policy blkcg_policy_throtl = {
+struct blkcg_policy blkcg_policy_throtl = {
.dfl_cftypes = throtl_files,
.legacy_cftypes = throtl_legacy_files,
@@ -1780,6 +1716,52 @@ static struct blkcg_policy blkcg_policy_throtl = {
.pd_free_fn = throtl_pd_free,
};
+void blk_throtl_cancel_bios(struct gendisk *disk)
+{
+ struct request_queue *q = disk->queue;
+ struct cgroup_subsys_state *pos_css;
+ struct blkcg_gq *blkg;
+
+ spin_lock_irq(&q->queue_lock);
+ /*
+ * queue_lock is held, so the rcu lock is technically not needed here.
+ * However, the rcu lock is still held to emphasize that the following
+ * path needs RCU protection and to prevent a lockdep warning.
+ */
+ rcu_read_lock();
+ blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
+ struct throtl_grp *tg = blkg_to_tg(blkg);
+ struct throtl_service_queue *sq = &tg->service_queue;
+
+ /*
+ * Set the flag to make sure throtl_pending_timer_fn() won't
+ * stop until all throttled bios are dispatched.
+ */
+ tg->flags |= THROTL_TG_CANCELING;
+
+ /*
+ * Do not dispatch a cgroup without THROTL_TG_PENDING, or the
+ * cgroup will be inserted into the service queue without
+ * THROTL_TG_PENDING set by tg_update_disptime() below. IO
+ * dispatched from a child in tg_dispatch_one_bio() would then
+ * trigger a double insertion and corrupt the tree.
+ */
+ if (!(tg->flags & THROTL_TG_PENDING))
+ continue;
+
+ /*
+ * Update disptime after setting the above flag to make sure
+ * throtl_select_dispatch() won't exit without dispatching.
+ */
+ tg_update_disptime(tg);
+
+ throtl_schedule_pending_timer(sq, jiffies + 1);
+ }
+ rcu_read_unlock();
+ spin_unlock_irq(&q->queue_lock);
+}
+
+#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{
unsigned long rtime = jiffies, wtime = jiffies;
@@ -1791,7 +1773,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
return min(rtime, wtime);
}
-/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
struct throtl_service_queue *parent_sq;
@@ -1845,24 +1826,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg)
return ret;
}
-static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
{
struct throtl_service_queue *sq = &tg->service_queue;
- bool read_limit, write_limit;
+ bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];
/*
- * if cgroup reaches low limit (if low limit is 0, the cgroup always
- * reaches), it's ok to upgrade to next limit
+ * If the low limit is zero, it is always reached.
+ * If the low limit is non-zero, we can check whether any request is
+ * queued to determine whether the low limit is reached, since requests
+ * are throttled according to the limit.
*/
- read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
- write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
- if (!read_limit && !write_limit)
- return true;
- if (read_limit && sq->nr_queued[READ] &&
- (!write_limit || sq->nr_queued[WRITE]))
- return true;
- if (write_limit && sq->nr_queued[WRITE] &&
- (!read_limit || sq->nr_queued[READ]))
+ return !limit || sq->nr_queued[rw];
+}
+
+static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
+{
+ /*
+ * A cgroup reaches its low limit when the low limits of both READ
+ * and WRITE are reached; it is then ok to upgrade to the next
+ * limit.
+ */
+ if (throtl_low_limit_reached(tg, READ) &&
+ throtl_low_limit_reached(tg, WRITE))
return true;
if (time_after_eq(jiffies,
@@ -1957,7 +1943,7 @@ static void throtl_upgrade_state(struct throtl_data *td)
queue_work(kthrotld_workqueue, &td->dispatch_work);
}
-static void throtl_downgrade_state(struct throtl_data *td, int new)
+static void throtl_downgrade_state(struct throtl_data *td)
{
td->scale /= 2;
@@ -1967,7 +1953,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new)
return;
}
- td->limit_index = new;
+ td->limit_index = LIMIT_LOW;
td->low_downgrade_time = jiffies;
}
@@ -1980,8 +1966,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
* If cgroup is below low limit, consider downgrade and throttle other
* cgroups
*/
- if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
- time_after_eq(now, tg_last_low_overflow_time(tg) +
+ if (time_after_eq(now, tg_last_low_overflow_time(tg) +
td->throtl_slice) &&
(!throtl_tg_is_idle(tg) ||
!list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
@@ -1991,6 +1976,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
+ struct throtl_data *td = tg->td;
+
+ if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
+ return false;
+
while (true) {
if (!throtl_tg_can_downgrade(tg))
return false;
@@ -2054,7 +2044,7 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
* cgroups
*/
if (throtl_hierarchy_can_downgrade(tg))
- throtl_downgrade_state(tg->td, LIMIT_LOW);
+ throtl_downgrade_state(tg->td);
tg->last_bytes_disp[READ] = 0;
tg->last_bytes_disp[WRITE] = 0;
@@ -2064,10 +2054,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg)
static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
- unsigned long now = ktime_get_ns() >> 10;
+ unsigned long now;
unsigned long last_finish_time = tg->last_finish_time;
- if (now <= last_finish_time || last_finish_time == 0 ||
+ if (last_finish_time == 0)
+ return;
+
+ now = ktime_get_ns() >> 10;
+ if (now <= last_finish_time ||
last_finish_time == tg->checked_last_finish_time)
return;
@@ -2075,7 +2069,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg)
tg->checked_last_finish_time = last_finish_time;
}
-#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
@@ -2083,7 +2076,7 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
unsigned long last_latency[2] = { 0 };
unsigned long latency[2];
- if (!blk_queue_nonrot(td->queue))
+ if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
return;
if (time_before(jiffies, td->last_calculate_time + HZ))
return;
@@ -2156,32 +2149,42 @@ static void throtl_update_latency_buckets(struct throtl_data *td)
static inline void throtl_update_latency_buckets(struct throtl_data *td)
{
}
+
+static void blk_throtl_update_idletime(struct throtl_grp *tg)
+{
+}
+
+static void throtl_downgrade_check(struct throtl_grp *tg)
+{
+}
+
+static void throtl_upgrade_check(struct throtl_grp *tg)
+{
+}
+
+static bool throtl_can_upgrade(struct throtl_data *td,
+ struct throtl_grp *this_tg)
+{
+ return false;
+}
+
+static void throtl_upgrade_state(struct throtl_data *td)
+{
+}
#endif
-bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
- struct bio *bio)
+bool __blk_throtl_bio(struct bio *bio)
{
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
+ struct blkcg_gq *blkg = bio->bi_blkg;
struct throtl_qnode *qn = NULL;
- struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
+ struct throtl_grp *tg = blkg_to_tg(blkg);
struct throtl_service_queue *sq;
bool rw = bio_data_dir(bio);
bool throttled = false;
struct throtl_data *td = tg->td;
- WARN_ON_ONCE(!rcu_read_lock_held());
-
- /* see throtl_charge_bio() */
- if (bio_flagged(bio, BIO_THROTTLED))
- goto out;
-
- if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
- blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
- bio->bi_iter.bi_size);
- blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
- }
-
- if (!tg->has_rules[rw])
- goto out;
+ rcu_read_lock();
spin_lock_irq(&q->queue_lock);
@@ -2229,14 +2232,16 @@ again:
/*
* @bio passed through this layer without being throttled.
- * Climb up the ladder. If we''re already at the top, it
+ * Climb up the ladder. If we're already at the top, it
* can be executed directly.
*/
qn = &tg->qnode_on_parent[rw];
sq = sq->parent_sq;
tg = sq_to_tg(sq);
- if (!tg)
+ if (!tg) {
+ bio_set_flag(bio, BIO_BPS_THROTTLED);
goto out_unlock;
+ }
}
/* out-of-limit, queue to @tg */
@@ -2265,21 +2270,21 @@ again:
}
out_unlock:
- spin_unlock_irq(&q->queue_lock);
-out:
- bio_set_flag(bio, BIO_THROTTLED);
-
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
if (throttled || !td->track_bio_latency)
bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
+ spin_unlock_irq(&q->queue_lock);
+
+ rcu_read_unlock();
return throttled;
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
- int op, unsigned long time)
+ enum req_op op, unsigned long time)
{
+ const bool rw = op_is_write(op);
struct latency_bucket *latency;
int index;
@@ -2290,10 +2295,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size,
index = request_bucket_index(size);
- latency = get_cpu_ptr(td->latency_buckets[op]);
+ latency = get_cpu_ptr(td->latency_buckets[rw]);
latency[index].total_latency += time;
latency[index].samples++;
- put_cpu_ptr(td->latency_buckets[op]);
+ put_cpu_ptr(td->latency_buckets[rw]);
}
void blk_throtl_stat_add(struct request *rq, u64 time_ns)
@@ -2319,6 +2324,8 @@ void blk_throtl_bio_endio(struct bio *bio)
if (!blkg)
return;
tg = blkg_to_tg(blkg);
+ if (!tg->td->limit_valid[LIMIT_LOW])
+ return;
finish_time_ns = ktime_get_ns();
tg->last_finish_time = finish_time_ns >> 10;
@@ -2358,8 +2365,9 @@ void blk_throtl_bio_endio(struct bio *bio)
}
#endif
-int blk_throtl_init(struct request_queue *q)
+int blk_throtl_init(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct throtl_data *td;
int ret;
@@ -2392,7 +2400,7 @@ int blk_throtl_init(struct request_queue *q)
td->low_downgrade_time = jiffies;
/* activate policy */
- ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
+ ret = blkcg_activate_policy(disk, &blkcg_policy_throtl);
if (ret) {
free_percpu(td->latency_buckets[READ]);
free_percpu(td->latency_buckets[WRITE]);
@@ -2401,18 +2409,22 @@ int blk_throtl_init(struct request_queue *q)
return ret;
}
-void blk_throtl_exit(struct request_queue *q)
+void blk_throtl_exit(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
+
BUG_ON(!q->td);
+ del_timer_sync(&q->td->service_queue.pending_timer);
throtl_shutdown_wq(q);
- blkcg_deactivate_policy(q, &blkcg_policy_throtl);
+ blkcg_deactivate_policy(disk, &blkcg_policy_throtl);
free_percpu(q->td->latency_buckets[READ]);
free_percpu(q->td->latency_buckets[WRITE]);
kfree(q->td);
}
-void blk_throtl_register_queue(struct request_queue *q)
+void blk_throtl_register(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct throtl_data *td;
int i;
@@ -2433,11 +2445,12 @@ void blk_throtl_register_queue(struct request_queue *q)
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
/* if no low limit, use previous default */
td->throtl_slice = DFL_THROTL_SLICE_HD;
-#endif
+#else
td->track_bio_latency = !queue_is_mq(q);
if (!td->track_bio_latency)
blk_stat_enable_accounting(q);
+#endif
}
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
diff --git a/block/blk-throttle.h b/block/blk-throttle.h
new file mode 100644
index 000000000000..bffbc9cfc8ab
--- /dev/null
+++ b/block/blk-throttle.h
@@ -0,0 +1,217 @@
+#ifndef BLK_THROTTLE_H
+#define BLK_THROTTLE_H
+
+#include "blk-cgroup-rwstat.h"
+
+/*
+ * To implement hierarchical throttling, throtl_grps form a tree and bios
+ * are dispatched upwards level by level until they reach the top and get
+ * issued. When dispatching bios from the children and local group at each
+ * level, if the bios are dispatched into a single bio_list, there's a risk
+ * that a local or child group which can queue many bios at once will fill
+ * up the list and starve the others.
+ *
+ * To avoid such starvation, dispatched bios are queued separately
+ * according to where they came from. When they are again dispatched to
+ * the parent, they're popped in round-robin order so that no single source
+ * hogs the dispatch window.
+ *
+ * throtl_qnode is used to keep the queued bios separated by their sources.
+ * Bios are queued to throtl_qnode which in turn is queued to
+ * throtl_service_queue and then dispatched in round-robin order.
+ *
+ * It's also used to track the reference counts on blkg's. A qnode always
+ * belongs to a throtl_grp and gets queued on itself or the parent, so
+ * incrementing the reference of the associated throtl_grp when a qnode is
+ * queued and decrementing when dequeued is enough to keep the whole blkg
+ * tree pinned while bios are in flight.
+ */
+struct throtl_qnode {
+ struct list_head node; /* service_queue->queued[] */
+ struct bio_list bios; /* queued bios */
+ struct throtl_grp *tg; /* tg this qnode belongs to */
+};
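The round-robin dispatch described in the comment above can be pictured with a toy user-space loop (purely illustrative; the two-element array and the queue counts are made up): each source gives up the dispatch slot after one bio, so a source with many queued bios cannot starve the other.

#include <stdio.h>

#define NR_SRC 2

int main(void)
{
        int pending[NR_SRC] = { 5, 2 };   /* bios queued per source */
        int src = 0;

        while (pending[0] || pending[1]) {
                if (pending[src]) {
                        printf("dispatch one bio from source %d\n", src);
                        pending[src]--;
                }
                src = (src + 1) % NR_SRC;  /* rotate to the next source */
        }
        return 0;
}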
+
+struct throtl_service_queue {
+ struct throtl_service_queue *parent_sq; /* the parent service_queue */
+
+ /*
+ * Bios queued directly to this service_queue or dispatched from
+ * children throtl_grp's.
+ */
+ struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */
+ unsigned int nr_queued[2]; /* number of queued bios */
+
+ /*
+ * RB tree of active children throtl_grp's, which are sorted by
+ * their ->disptime.
+ */
+ struct rb_root_cached pending_tree; /* RB tree of active tgs */
+ unsigned int nr_pending; /* # queued in the tree */
+ unsigned long first_pending_disptime; /* disptime of the first tg */
+ struct timer_list pending_timer; /* fires on first_pending_disptime */
+};
+
+enum tg_state_flags {
+ THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */
+ THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */
+ THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */
+};
+
+enum {
+ LIMIT_LOW,
+ LIMIT_MAX,
+ LIMIT_CNT,
+};
+
+struct throtl_grp {
+ /* must be the first member */
+ struct blkg_policy_data pd;
+
+ /* active throtl group service_queue member */
+ struct rb_node rb_node;
+
+ /* throtl_data this group belongs to */
+ struct throtl_data *td;
+
+ /* this group's service queue */
+ struct throtl_service_queue service_queue;
+
+ /*
+ * qnode_on_self is used when bios are directly queued to this
+ * throtl_grp so that local bios compete fairly with bios
+ * dispatched from children. qnode_on_parent is used when bios are
+ * dispatched from this throtl_grp into its parent and will compete
+ * with the sibling qnode_on_parents and the parent's
+ * qnode_on_self.
+ */
+ struct throtl_qnode qnode_on_self[2];
+ struct throtl_qnode qnode_on_parent[2];
+
+ /*
+ * Dispatch time in jiffies. This is the estimated time when the group
+ * will unthrottle and be ready to dispatch more bios. It is used as
+ * the key to sort active groups in the service tree.
+ */
+ unsigned long disptime;
+
+ unsigned int flags;
+
+ /* are there any throtl rules between this group and td? */
+ bool has_rules_bps[2];
+ bool has_rules_iops[2];
+
+ /* internally used bytes per second rate limits */
+ uint64_t bps[2][LIMIT_CNT];
+ /* user configured bps limits */
+ uint64_t bps_conf[2][LIMIT_CNT];
+
+ /* internally used IOPS limits */
+ unsigned int iops[2][LIMIT_CNT];
+ /* user configured IOPS limits */
+ unsigned int iops_conf[2][LIMIT_CNT];
+
+ /* Number of bytes dispatched in current slice */
+ uint64_t bytes_disp[2];
+ /* Number of bio's dispatched in current slice */
+ unsigned int io_disp[2];
+
+ unsigned long last_low_overflow_time[2];
+
+ uint64_t last_bytes_disp[2];
+ unsigned int last_io_disp[2];
+
+ /*
+ * The following two fields are updated when a new configuration is
+ * submitted while some bios are still throttled. They record how many
+ * bytes/ios have already waited under the previous configuration and
+ * are used to calculate the wait time under the new configuration.
+ */
+ long long carryover_bytes[2];
+ int carryover_ios[2];
+
+ unsigned long last_check_time;
+
+ unsigned long latency_target; /* us */
+ unsigned long latency_target_conf; /* us */
+ /* When did we start a new slice */
+ unsigned long slice_start[2];
+ unsigned long slice_end[2];
+
+ unsigned long last_finish_time; /* ns / 1024 */
+ unsigned long checked_last_finish_time; /* ns / 1024 */
+ unsigned long avg_idletime; /* ns / 1024 */
+ unsigned long idletime_threshold; /* us */
+ unsigned long idletime_threshold_conf; /* us */
+
+ unsigned int bio_cnt; /* total bios */
+ unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
+ unsigned long bio_cnt_reset_time;
+
+ struct blkg_rwstat stat_bytes;
+ struct blkg_rwstat stat_ios;
+};
+
+extern struct blkcg_policy blkcg_policy_throtl;
+
+static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
+{
+ return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
+}
+
+static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
+{
+ return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
+}
+
+/*
+ * Internal throttling interface
+ */
+#ifndef CONFIG_BLK_DEV_THROTTLING
+static inline int blk_throtl_init(struct gendisk *disk) { return 0; }
+static inline void blk_throtl_exit(struct gendisk *disk) { }
+static inline void blk_throtl_register(struct gendisk *disk) { }
+static inline bool blk_throtl_bio(struct bio *bio) { return false; }
+static inline void blk_throtl_cancel_bios(struct gendisk *disk) { }
+#else /* CONFIG_BLK_DEV_THROTTLING */
+int blk_throtl_init(struct gendisk *disk);
+void blk_throtl_exit(struct gendisk *disk);
+void blk_throtl_register(struct gendisk *disk);
+bool __blk_throtl_bio(struct bio *bio);
+void blk_throtl_cancel_bios(struct gendisk *disk);
+
+static inline bool blk_should_throtl(struct bio *bio)
+{
+ struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg);
+ int rw = bio_data_dir(bio);
+
+ if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
+ if (!bio_flagged(bio, BIO_CGROUP_ACCT)) {
+ bio_set_flag(bio, BIO_CGROUP_ACCT);
+ blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
+ bio->bi_iter.bi_size);
+ }
+ blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
+ }
+
+ /* iops limit is always counted */
+ if (tg->has_rules_iops[rw])
+ return true;
+
+ if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED))
+ return true;
+
+ return false;
+}
+
+static inline bool blk_throtl_bio(struct bio *bio)
+{
+
+ if (!blk_should_throtl(bio))
+ return false;
+
+ return __blk_throtl_bio(bio);
+}
+#endif /* CONFIG_BLK_DEV_THROTTLING */
+
+#endif
diff --git a/block/blk-timeout.c b/block/blk-timeout.c
index 8aa68fae96ad..1b8de0417fc1 100644
--- a/block/blk-timeout.c
+++ b/block/blk-timeout.c
@@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str)
}
__setup("fail_io_timeout=", setup_fail_io_timeout);
-int blk_should_fake_timeout(struct request_queue *q)
+bool __blk_should_fake_timeout(struct request_queue *q)
{
- if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags))
- return 0;
-
return should_fail(&fail_io_timeout, 1);
}
+EXPORT_SYMBOL_GPL(__blk_should_fake_timeout);
static int __init fail_io_timeout_debugfs(void)
{
@@ -70,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr,
#endif /* CONFIG_FAIL_IO_TIMEOUT */
/**
- * blk_abort_request -- Request request recovery for the specified command
+ * blk_abort_request - Request recovery for the specified command
* @req: pointer to the request of interest
*
* This function requests that the block layer start recovery for the
@@ -90,11 +88,29 @@ void blk_abort_request(struct request *req)
}
EXPORT_SYMBOL_GPL(blk_abort_request);
+static unsigned long blk_timeout_mask __read_mostly;
+
+static int __init blk_timeout_init(void)
+{
+ blk_timeout_mask = roundup_pow_of_two(HZ) - 1;
+ return 0;
+}
+
+late_initcall(blk_timeout_init);
+
+/*
+ * Just a rough estimate; we don't care about specific values for timeouts.
+ */
+static inline unsigned long blk_round_jiffies(unsigned long j)
+{
+ return (j + blk_timeout_mask) + 1;
+}
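A short user-space sketch of the arithmetic above, assuming HZ = 1000 (so blk_timeout_mask becomes 1023); roundup_pow_of_two() here is a local stand-in for the kernel helper. The timeout is simply over-estimated by up to about one second, which is acceptable since only a rough expiry is needed.

#include <stdio.h>

static unsigned long roundup_pow_of_two(unsigned long v)
{
        unsigned long p = 1;

        while (p < v)
                p <<= 1;
        return p;
}

int main(void)
{
        unsigned long hz = 1000;                          /* assumed */
        unsigned long mask = roundup_pow_of_two(hz) - 1;  /* 1023 */

        /* same arithmetic as blk_round_jiffies() */
        printf("%lu\n", 5000UL + mask + 1);               /* prints 6024 */
        return 0;
}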
+
unsigned long blk_rq_timeout(unsigned long timeout)
{
unsigned long maxt;
- maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT);
+ maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT);
if (time_after(timeout, maxt))
timeout = maxt;
@@ -131,7 +147,7 @@ void blk_add_timer(struct request *req)
* than an existing one, modify the timer. Round up to next nearest
* second.
*/
- expiry = blk_rq_timeout(round_jiffies_up(expiry));
+ expiry = blk_rq_timeout(blk_round_jiffies(expiry));
if (!timer_pending(&q->timeout) ||
time_before(expiry, q->timeout.expires)) {
diff --git a/block/blk-wbt.c b/block/blk-wbt.c
index 0fa615eefd52..0c0e270a8265 100644
--- a/block/blk-wbt.c
+++ b/block/blk-wbt.c
@@ -25,12 +25,78 @@
#include <linux/backing-dev.h>
#include <linux/swap.h>
+#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
+#include "elevator.h"
#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>
+enum wbt_flags {
+ WBT_TRACKED = 1, /* write, tracked for throttling */
+ WBT_READ = 2, /* read */
+ WBT_KSWAPD = 4, /* write, from kswapd */
+ WBT_DISCARD = 8, /* discard */
+
+ WBT_NR_BITS = 4, /* number of bits */
+};
+
+enum {
+ WBT_RWQ_BG = 0,
+ WBT_RWQ_KSWAPD,
+ WBT_RWQ_DISCARD,
+ WBT_NUM_RWQ,
+};
+
+/*
+ * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be switched to any
+ * other state; if the current state is WBT_STATE_ON/OFF_MANUAL, it can only be
+ * switched to WBT_STATE_OFF/ON_MANUAL.
+ */
+enum {
+ WBT_STATE_ON_DEFAULT = 1, /* on by default */
+ WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */
+ WBT_STATE_OFF_DEFAULT = 3, /* off by default */
+ WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */
+};
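A hedged user-space sketch of the transition rule the comment above describes (illustrative only, not the kernel implementation): default states may be replaced freely, while manual states can only be replaced by the other manual state via sysfs.

#include <stdio.h>

enum { ON_DEFAULT = 1, ON_MANUAL, OFF_DEFAULT, OFF_MANUAL };

/* encodes the rule from the comment above */
static int can_switch(int cur, int new_state)
{
        if (cur == ON_DEFAULT || cur == OFF_DEFAULT)
                return 1;                                   /* any change allowed */
        return new_state == ON_MANUAL || new_state == OFF_MANUAL;
}

int main(void)
{
        printf("%d\n", can_switch(ON_DEFAULT, OFF_MANUAL));  /* 1 */
        printf("%d\n", can_switch(ON_MANUAL, OFF_DEFAULT));  /* 0 */
        return 0;
}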
+
+struct rq_wb {
+ /*
+ * Settings that govern how we throttle
+ */
+ unsigned int wb_background; /* background writeback */
+ unsigned int wb_normal; /* normal writeback */
+
+ short enable_state; /* WBT_STATE_* */
+
+ /*
+ * Number of consecutive periods where we don't have enough
+ * information to make a firm scale up/down decision.
+ */
+ unsigned int unknown_cnt;
+
+ u64 win_nsec; /* default window size */
+ u64 cur_win_nsec; /* current window size */
+
+ struct blk_stat_callback *cb;
+
+ u64 sync_issue;
+ void *sync_cookie;
+
+ unsigned long last_issue; /* last non-throttled issue */
+ unsigned long last_comp; /* last non-throttled comp */
+ unsigned long min_lat_nsec;
+ struct rq_qos rqos;
+ struct rq_wait rq_wait[WBT_NUM_RWQ];
+ struct rq_depth rq_depth;
+};
+
+static inline struct rq_wb *RQWB(struct rq_qos *rqos)
+{
+ return container_of(rqos, struct rq_wb, rqos);
+}
+
static inline void wbt_clear_state(struct request *rq)
{
rq->wbt_flags = 0;
@@ -77,7 +143,8 @@ enum {
static inline bool rwb_enabled(struct rq_wb *rwb)
{
- return rwb && rwb->wb_normal != 0;
+ return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
+ rwb->enable_state != WBT_STATE_OFF_MANUAL;
}
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
@@ -96,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
*/
static bool wb_recent_wait(struct rq_wb *rwb)
{
- struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
- return time_before(jiffies, wb->dirty_sleep + HZ);
+ return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
@@ -132,22 +199,14 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
inflight = atomic_dec_return(&rqw->inflight);
/*
- * wbt got disabled with IO in flight. Wake up any potential
- * waiters, we don't have to do more than that.
- */
- if (unlikely(!rwb_enabled(rwb))) {
- rwb_wake_all(rwb);
- return;
- }
-
- /*
* For discards, our limit is always the background. For writes, if
* the device does write back caching, drop further down before we
* wake people up.
*/
if (wb_acct & WBT_DISCARD)
limit = rwb->wb_background;
- else if (rwb->wc && !wb_recent_wait(rwb))
+ else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) &&
+ !wb_recent_wait(rwb))
limit = 0;
else
limit = rwb->wb_normal;
@@ -224,6 +283,16 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
return now - issue;
}
+static inline unsigned int wbt_inflight(struct rq_wb *rwb)
+{
+ unsigned int i, ret = 0;
+
+ for (i = 0; i < WBT_NUM_RWQ; i++)
+ ret += atomic_read(&rwb->rq_wait[i].inflight);
+
+ return ret;
+}
+
enum {
LAT_OK = 1,
LAT_UNKNOWN,
@@ -233,7 +302,7 @@ enum {
static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
- struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
u64 thislat;
@@ -286,7 +355,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
- struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info;
+ struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
struct rq_depth *rqd = &rwb->rq_depth;
trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
@@ -356,10 +425,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb)
unsigned int inflight = wbt_inflight(rwb);
int status;
+ if (!rwb->rqos.disk)
+ return;
+
status = latency_exceeded(rwb, cb->stat);
- trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step,
- inflight);
+ trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);
/*
* If we exceeded the latency target, step down. If we did not,
@@ -418,6 +489,13 @@ static void wbt_update_limits(struct rq_wb *rwb)
rwb_wake_all(rwb);
}
+bool wbt_disabled(struct request_queue *q)
+{
+ struct rq_qos *rqos = wbt_rq_qos(q);
+
+ return !rqos || !rwb_enabled(RQWB(rqos));
+}
+
u64 wbt_get_min_lat(struct request_queue *q)
{
struct rq_qos *rqos = wbt_rq_qos(q);
@@ -431,8 +509,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val)
struct rq_qos *rqos = wbt_rq_qos(q);
if (!rqos)
return;
+
RQWB(rqos)->min_lat_nsec = val;
- RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ if (val)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
+ else
+ RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;
+
wbt_update_limits(RQWB(rqos));
}
@@ -447,18 +530,11 @@ static bool close_io(struct rq_wb *rwb)
#define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO)
-static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
+static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
unsigned int limit;
- /*
- * If we got disabled, just return UINT_MAX. This ensures that
- * we'll properly inc a new IO, and dec+wakeup at the end.
- */
- if (!rwb_enabled(rwb))
- return UINT_MAX;
-
- if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
+ if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
return rwb->wb_background;
/*
@@ -469,9 +545,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
* the idle limit, or go to normal if we haven't had competing
* IO for a bit.
*/
- if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
+ if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
limit = rwb->rq_depth.max_depth;
- else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
+ else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
/*
* If less than 100ms since we completed unrelated IO,
* limit us to half the depth for background writeback.
@@ -486,13 +562,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
struct wbt_wait_data {
struct rq_wb *rwb;
enum wbt_flags wb_acct;
- unsigned long rw;
+ blk_opf_t opf;
};
static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
struct wbt_wait_data *data = private_data;
- return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw));
+ return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}
static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
@@ -506,19 +582,19 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
* the timer to kick off queuing again.
*/
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
- unsigned long rw)
+ blk_opf_t opf)
{
struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
struct wbt_wait_data data = {
.rwb = rwb,
.wb_acct = wb_acct,
- .rw = rw,
+ .opf = opf,
};
rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}
-static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
+static inline bool wbt_should_throttle(struct bio *bio)
{
switch (bio_op(bio)) {
case REQ_OP_WRITE:
@@ -528,7 +604,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
(REQ_SYNC | REQ_IDLE))
return false;
- /* fallthrough */
+ fallthrough;
case REQ_OP_DISCARD:
return true;
default:
@@ -545,7 +621,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
if (bio_op(bio) == REQ_OP_READ) {
flags = WBT_READ;
- } else if (wbt_should_throttle(rwb, bio)) {
+ } else if (wbt_should_throttle(bio)) {
if (current_is_kswapd())
flags |= WBT_KSWAPD;
if (bio_op(bio) == REQ_OP_DISCARD)
@@ -563,7 +639,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
}
/*
- * Returns true if the IO request should be accounted, false if not.
* May sleep, if we have exceeded the writeback limits. Caller can pass
* in an irq held spinlock, if it holds one when calling this function.
* If we do sleep, we'll release and re-grab it.
@@ -623,29 +698,33 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
}
}
-void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
-{
- struct rq_qos *rqos = wbt_rq_qos(q);
- if (rqos)
- RQWB(rqos)->wc = write_cache_on;
-}
-
/*
* Enable wbt if defaults are configured that way
*/
-void wbt_enable_default(struct request_queue *q)
+void wbt_enable_default(struct gendisk *disk)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct request_queue *q = disk->queue;
+ struct rq_qos *rqos;
+ bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);
+
+ if (q->elevator &&
+ test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
+ enable = false;
+
/* Throttling already enabled? */
- if (rqos)
+ rqos = wbt_rq_qos(q);
+ if (rqos) {
+ if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
+ RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
return;
+ }
/* Queue not registered? Maybe shutting down... */
if (!blk_queue_registered(q))
return;
- if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ))
- wbt_init(q);
+ if (queue_is_mq(q) && enable)
+ wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);
@@ -663,7 +742,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q)
static int wbt_data_dir(const struct request *rq)
{
- const int op = req_op(rq);
+ const enum req_op op = req_op(rq);
if (op == REQ_OP_READ)
return READ;
@@ -676,16 +755,15 @@ static int wbt_data_dir(const struct request *rq)
static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
- RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q);
+ RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
wbt_update_limits(RQWB(rqos));
}
static void wbt_exit(struct rq_qos *rqos)
{
struct rq_wb *rwb = RQWB(rqos);
- struct request_queue *q = rqos->q;
- blk_stat_remove_callback(q, rwb->cb);
+ blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
blk_stat_free_callback(rwb->cb);
kfree(rwb);
}
@@ -693,16 +771,16 @@ static void wbt_exit(struct rq_qos *rqos)
/*
* Disable wbt, if enabled by default.
*/
-void wbt_disable_default(struct request_queue *q)
+void wbt_disable_default(struct gendisk *disk)
{
- struct rq_qos *rqos = wbt_rq_qos(q);
+ struct rq_qos *rqos = wbt_rq_qos(disk->queue);
struct rq_wb *rwb;
if (!rqos)
return;
rwb = RQWB(rqos);
if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
blk_stat_deactivate(rwb->cb);
- rwb->wb_normal = 0;
+ rwb->enable_state = WBT_STATE_OFF_DEFAULT;
}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);
@@ -795,7 +873,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
};
#endif
-static struct rq_qos_ops wbt_rqos_ops = {
+static const struct rq_qos_ops wbt_rqos_ops = {
.throttle = wbt_wait,
.issue = wbt_issue,
.track = wbt_track,
@@ -809,10 +887,12 @@ static struct rq_qos_ops wbt_rqos_ops = {
#endif
};
-int wbt_init(struct request_queue *q)
+int wbt_init(struct gendisk *disk)
{
+ struct request_queue *q = disk->queue;
struct rq_wb *rwb;
int i;
+ int ret;
rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
if (!rwb)
@@ -827,26 +907,30 @@ int wbt_init(struct request_queue *q)
for (i = 0; i < WBT_NUM_RWQ; i++)
rq_wait_init(&rwb->rq_wait[i]);
- rwb->rqos.id = RQ_QOS_WBT;
- rwb->rqos.ops = &wbt_rqos_ops;
- rwb->rqos.q = q;
rwb->last_comp = rwb->last_issue = jiffies;
rwb->win_nsec = RWB_WINDOW_NSEC;
rwb->enable_state = WBT_STATE_ON_DEFAULT;
- rwb->wc = 1;
rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
+ rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ rwb->rq_depth.queue_depth = blk_queue_depth(q);
wbt_update_limits(rwb);
/*
* Assign rwb and add the stats callback.
*/
- rq_qos_add(q, &rwb->rqos);
+ mutex_lock(&q->rq_qos_mutex);
+ ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
+ mutex_unlock(&q->rq_qos_mutex);
+ if (ret)
+ goto err_free;
+
blk_stat_add_callback(q, rwb->cb);
- rwb->min_lat_nsec = wbt_default_latency_nsec(q);
+ return 0;
- wbt_queue_depth_changed(&rwb->rqos);
- wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags));
+err_free:
+ blk_stat_free_callback(rwb->cb);
+ kfree(rwb);
+ return ret;
- return 0;
}
diff --git a/block/blk-wbt.h b/block/blk-wbt.h
index 16bdc85b8df9..e5fc653b9b76 100644
--- a/block/blk-wbt.h
+++ b/block/blk-wbt.h
@@ -2,130 +2,25 @@
#ifndef WB_THROTTLE_H
#define WB_THROTTLE_H
-#include <linux/kernel.h>
-#include <linux/atomic.h>
-#include <linux/wait.h>
-#include <linux/timer.h>
-#include <linux/ktime.h>
-
-#include "blk-stat.h"
-#include "blk-rq-qos.h"
-
-enum wbt_flags {
- WBT_TRACKED = 1, /* write, tracked for throttling */
- WBT_READ = 2, /* read */
- WBT_KSWAPD = 4, /* write, from kswapd */
- WBT_DISCARD = 8, /* discard */
-
- WBT_NR_BITS = 4, /* number of bits */
-};
-
-enum {
- WBT_RWQ_BG = 0,
- WBT_RWQ_KSWAPD,
- WBT_RWQ_DISCARD,
- WBT_NUM_RWQ,
-};
-
-/*
- * Enable states. Either off, or on by default (done at init time),
- * or on through manual setup in sysfs.
- */
-enum {
- WBT_STATE_ON_DEFAULT = 1,
- WBT_STATE_ON_MANUAL = 2,
-};
-
-struct rq_wb {
- /*
- * Settings that govern how we throttle
- */
- unsigned int wb_background; /* background writeback */
- unsigned int wb_normal; /* normal writeback */
-
- short enable_state; /* WBT_STATE_* */
-
- /*
- * Number of consecutive periods where we don't have enough
- * information to make a firm scale up/down decision.
- */
- unsigned int unknown_cnt;
-
- u64 win_nsec; /* default window size */
- u64 cur_win_nsec; /* current window size */
-
- struct blk_stat_callback *cb;
-
- u64 sync_issue;
- void *sync_cookie;
-
- unsigned int wc;
-
- unsigned long last_issue; /* last non-throttled issue */
- unsigned long last_comp; /* last non-throttled comp */
- unsigned long min_lat_nsec;
- struct rq_qos rqos;
- struct rq_wait rq_wait[WBT_NUM_RWQ];
- struct rq_depth rq_depth;
-};
-
-static inline struct rq_wb *RQWB(struct rq_qos *rqos)
-{
- return container_of(rqos, struct rq_wb, rqos);
-}
-
-static inline unsigned int wbt_inflight(struct rq_wb *rwb)
-{
- unsigned int i, ret = 0;
-
- for (i = 0; i < WBT_NUM_RWQ; i++)
- ret += atomic_read(&rwb->rq_wait[i].inflight);
-
- return ret;
-}
-
-
#ifdef CONFIG_BLK_WBT
-int wbt_init(struct request_queue *);
-void wbt_disable_default(struct request_queue *);
-void wbt_enable_default(struct request_queue *);
+int wbt_init(struct gendisk *disk);
+void wbt_disable_default(struct gendisk *disk);
+void wbt_enable_default(struct gendisk *disk);
u64 wbt_get_min_lat(struct request_queue *q);
void wbt_set_min_lat(struct request_queue *q, u64 val);
-
-void wbt_set_write_cache(struct request_queue *, bool);
+bool wbt_disabled(struct request_queue *);
u64 wbt_default_latency_nsec(struct request_queue *);
#else
-static inline void wbt_track(struct request *rq, enum wbt_flags flags)
-{
-}
-static inline int wbt_init(struct request_queue *q)
-{
- return -EINVAL;
-}
-static inline void wbt_disable_default(struct request_queue *q)
-{
-}
-static inline void wbt_enable_default(struct request_queue *q)
-{
-}
-static inline void wbt_set_write_cache(struct request_queue *q, bool wc)
-{
-}
-static inline u64 wbt_get_min_lat(struct request_queue *q)
-{
- return 0;
-}
-static inline void wbt_set_min_lat(struct request_queue *q, u64 val)
+static inline void wbt_disable_default(struct gendisk *disk)
{
}
-static inline u64 wbt_default_latency_nsec(struct request_queue *q)
+static inline void wbt_enable_default(struct gendisk *disk)
{
- return 0;
}
#endif /* CONFIG_BLK_WBT */
diff --git a/block/blk-zoned.c b/block/blk-zoned.c
index 23831fa8701d..d343e5756a9c 100644
--- a/block/blk-zoned.c
+++ b/block/blk-zoned.c
@@ -52,33 +52,15 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond)
}
EXPORT_SYMBOL_GPL(blk_zone_cond_str);
-static inline sector_t blk_zone_start(struct request_queue *q,
- sector_t sector)
-{
- sector_t zone_mask = blk_queue_zone_sectors(q) - 1;
-
- return sector & ~zone_mask;
-}
-
/*
* Return true if a request is a write requests that needs zone write locking.
*/
bool blk_req_needs_zone_write_lock(struct request *rq)
{
- if (!rq->q->seq_zones_wlock)
+ if (!rq->q->disk->seq_zones_wlock)
return false;
- if (blk_rq_is_passthrough(rq))
- return false;
-
- switch (req_op(rq)) {
- case REQ_OP_WRITE_ZEROES:
- case REQ_OP_WRITE_SAME:
- case REQ_OP_WRITE:
- return blk_rq_zone_is_seq(rq);
- default:
- return false;
- }
+ return blk_rq_is_seq_zoned_write(rq);
}
EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock);
@@ -86,7 +68,7 @@ bool blk_req_zone_write_trylock(struct request *rq)
{
unsigned int zno = blk_rq_zone_no(rq);
- if (test_and_set_bit(zno, rq->q->seq_zones_wlock))
+ if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock))
return false;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -99,7 +81,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock);
void __blk_req_zone_write_lock(struct request *rq)
{
if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock)))
+ rq->q->disk->seq_zones_wlock)))
return;
WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED);
@@ -110,28 +92,29 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock);
void __blk_req_zone_write_unlock(struct request *rq)
{
rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED;
- if (rq->q->seq_zones_wlock)
+ if (rq->q->disk->seq_zones_wlock)
WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq),
- rq->q->seq_zones_wlock));
+ rq->q->disk->seq_zones_wlock));
}
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock);
/**
- * blkdev_nr_zones - Get number of zones
- * @disk: Target gendisk
+ * bdev_nr_zones - Get number of zones
+ * @bdev: Target device
*
* Return the total number of zones of a zoned block device. For a block
* device without zone capabilities, the number of zones is always 0.
*/
-unsigned int blkdev_nr_zones(struct gendisk *disk)
+unsigned int bdev_nr_zones(struct block_device *bdev)
{
- sector_t zone_sectors = blk_queue_zone_sectors(disk->queue);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
- if (!blk_queue_is_zoned(disk->queue))
+ if (!bdev_is_zoned(bdev))
return 0;
- return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors);
+ return (bdev_nr_sectors(bdev) + zone_sectors - 1) >>
+ ilog2(zone_sectors);
}
-EXPORT_SYMBOL_GPL(blkdev_nr_zones);
+EXPORT_SYMBOL_GPL(bdev_nr_zones);
/**
* blkdev_report_zones - Get zones information
@@ -158,8 +141,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
struct gendisk *disk = bdev->bd_disk;
sector_t capacity = get_capacity(disk);
- if (!blk_queue_is_zoned(bdev_get_queue(bdev)) ||
- WARN_ON_ONCE(!disk->fops->report_zones))
+ if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones))
return -EOPNOTSUPP;
if (!nr_zones || sector >= capacity)
@@ -169,18 +151,84 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector,
}
EXPORT_SYMBOL_GPL(blkdev_report_zones);
-static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
- sector_t sector,
- sector_t nr_sectors)
+static inline unsigned long *blk_alloc_zone_bitmap(int node,
+ unsigned int nr_zones)
{
- if (!blk_queue_zone_resetall(bdev_get_queue(bdev)))
- return false;
+ return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
+ GFP_NOIO, node);
+}
+static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx,
+ void *data)
+{
/*
- * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors
- * of the applicable zone range is the entire disk.
+ * For an all-zones reset, ignore conventional, empty, read-only
+ * and offline zones.
*/
- return !sector && nr_sectors == get_capacity(bdev->bd_disk);
+ switch (zone->cond) {
+ case BLK_ZONE_COND_NOT_WP:
+ case BLK_ZONE_COND_EMPTY:
+ case BLK_ZONE_COND_READONLY:
+ case BLK_ZONE_COND_OFFLINE:
+ return 0;
+ default:
+ set_bit(idx, (unsigned long *)data);
+ return 0;
+ }
+}
+
+static int blkdev_zone_reset_all_emulated(struct block_device *bdev,
+ gfp_t gfp_mask)
+{
+ struct gendisk *disk = bdev->bd_disk;
+ sector_t capacity = bdev_nr_sectors(bdev);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
+ unsigned long *need_reset;
+ struct bio *bio = NULL;
+ sector_t sector = 0;
+ int ret;
+
+ need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones);
+ if (!need_reset)
+ return -ENOMEM;
+
+ ret = disk->fops->report_zones(disk, 0, disk->nr_zones,
+ blk_zone_need_reset_cb, need_reset);
+ if (ret < 0)
+ goto out_free_need_reset;
+
+ ret = 0;
+ while (sector < capacity) {
+ if (!test_bit(disk_zone_no(disk, sector), need_reset)) {
+ sector += zone_sectors;
+ continue;
+ }
+
+ bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC,
+ gfp_mask);
+ bio->bi_iter.bi_sector = sector;
+ sector += zone_sectors;
+
+ /* This may take a while, so be nice to others */
+ cond_resched();
+ }
+
+ if (bio) {
+ ret = submit_bio_wait(bio);
+ bio_put(bio);
+ }
+
+out_free_need_reset:
+ kfree(need_reset);
+ return ret;
+}
+
+static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask)
+{
+ struct bio bio;
+
+ bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC);
+ return submit_bio_wait(&bio);
}
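blkdev_zone_reset_all_emulated() above builds a need_reset bitmap from a zone report and then issues one REQ_OP_ZONE_RESET bio per marked zone, skipping conventional, empty, read-only and offline zones. A user-space sketch of the same bookkeeping over an imaginary zone-condition array (the cond values and the issue_reset() stub are illustrative only):

#include <stdbool.h>
#include <stdio.h>

enum cond { EMPTY, IMP_OPEN, EXP_OPEN, CLOSED, FULL, NOT_WP, READONLY, OFFLINE };

/* Only zones that hold data and can legally be reset need a reset command. */
static bool zone_needs_reset(enum cond c)
{
	switch (c) {
	case NOT_WP:	/* conventional zone: nothing to reset */
	case EMPTY:	/* already reset */
	case READONLY:
	case OFFLINE:
		return false;
	default:
		return true;
	}
}

static void issue_reset(unsigned int zno)
{
	printf("reset zone %u\n", zno);	/* stands in for a REQ_OP_ZONE_RESET bio */
}

int main(void)
{
	enum cond report[] = { NOT_WP, EMPTY, IMP_OPEN, FULL, CLOSED, OFFLINE };
	unsigned int i;

	for (i = 0; i < sizeof(report) / sizeof(report[0]); i++)
		if (zone_needs_reset(report[i]))
			issue_reset(i);
	return 0;
}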
/**
@@ -199,18 +247,17 @@ static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev,
* The operation to execute on each zone can be a zone reset, open, close
* or finish request.
*/
-int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
- sector_t sector, sector_t nr_sectors,
- gfp_t gfp_mask)
+int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op,
+ sector_t sector, sector_t nr_sectors, gfp_t gfp_mask)
{
struct request_queue *q = bdev_get_queue(bdev);
- sector_t zone_sectors = blk_queue_zone_sectors(q);
- sector_t capacity = get_capacity(bdev->bd_disk);
+ sector_t zone_sectors = bdev_zone_sectors(bdev);
+ sector_t capacity = bdev_nr_sectors(bdev);
sector_t end_sector = sector + nr_sectors;
struct bio *bio = NULL;
- int ret;
+ int ret = 0;
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -EOPNOTSUPP;
if (bdev_read_only(bdev))
@@ -224,27 +271,26 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op,
return -EINVAL;
/* Check alignment (handle eventual smaller last zone) */
- if (sector & (zone_sectors - 1))
+ if (!bdev_is_zone_start(bdev, sector))
return -EINVAL;
- if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity)
+ if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity)
return -EINVAL;
- while (sector < end_sector) {
- bio = blk_next_bio(bio, 0, gfp_mask);
- bio_set_dev(bio, bdev);
-
- /*
- * Special case for the zone reset operation that reset all
- * zones, this is useful for applications like mkfs.
- */
- if (op == REQ_OP_ZONE_RESET &&
- blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) {
- bio->bi_opf = REQ_OP_ZONE_RESET_ALL;
- break;
- }
+ /*
+ * In the case of a zone reset operation over all zones,
+ * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this
+ * command. For other devices, we emulate this command behavior by
+ * identifying the zones needing a reset.
+ */
+ if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) {
+ if (!blk_queue_zone_resetall(q))
+ return blkdev_zone_reset_all_emulated(bdev, gfp_mask);
+ return blkdev_zone_reset_all(bdev, gfp_mask);
+ }
- bio->bi_opf = op | REQ_SYNC;
+ while (sector < end_sector) {
+ bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask);
bio->bi_iter.bi_sector = sector;
sector += zone_sectors;
@@ -277,28 +323,20 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx,
* BLKREPORTZONE ioctl processing.
* Called from blkdev_ioctl.
*/
-int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned int cmd, unsigned long arg)
+int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
+ unsigned long arg)
{
void __user *argp = (void __user *)arg;
struct zone_report_args args;
- struct request_queue *q;
struct blk_zone_report rep;
int ret;
if (!argp)
return -EINVAL;
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report)))
return -EFAULT;
@@ -312,37 +350,47 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode,
return ret;
rep.nr_zones = ret;
+ rep.flags = BLK_ZONE_REP_CAPACITY;
if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report)))
return -EFAULT;
return 0;
}
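For reference, this is roughly how user space drives the BLKREPORTZONE path shown above; the report buffer is a struct blk_zone_report followed by the requested number of struct blk_zone entries (the device path and zone count are placeholders):

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(void)
{
	unsigned int nr = 16;	/* how many zones to ask for, arbitrary */
	struct blk_zone_report *rep;
	int fd, i;

	fd = open("/dev/nvme0n1", O_RDONLY);	/* placeholder zoned device */
	if (fd < 0)
		return 1;

	rep = calloc(1, sizeof(*rep) + nr * sizeof(struct blk_zone));
	if (!rep)
		return 1;
	rep->sector = 0;	/* start reporting from the first zone */
	rep->nr_zones = nr;

	if (ioctl(fd, BLKREPORTZONE, rep) < 0) {
		perror("BLKREPORTZONE");
		return 1;
	}

	for (i = 0; i < (int)rep->nr_zones; i++)
		printf("zone %d: start %llu len %llu wp %llu cond %u\n", i,
		       (unsigned long long)rep->zones[i].start,
		       (unsigned long long)rep->zones[i].len,
		       (unsigned long long)rep->zones[i].wp,
		       rep->zones[i].cond);
	close(fd);
	free(rep);
	return 0;
}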
+static int blkdev_truncate_zone_range(struct block_device *bdev,
+ blk_mode_t mode, const struct blk_zone_range *zrange)
+{
+ loff_t start, end;
+
+ if (zrange->sector + zrange->nr_sectors <= zrange->sector ||
+ zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk))
+ /* Out of range */
+ return -EINVAL;
+
+ start = zrange->sector << SECTOR_SHIFT;
+ end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1;
+
+ return truncate_bdev_range(bdev, mode, start, end);
+}
+
/*
* BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and BLKFINISHZONE ioctl processing.
* Called from blkdev_ioctl.
*/
-int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
+int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned int cmd, unsigned long arg)
{
void __user *argp = (void __user *)arg;
- struct request_queue *q;
struct blk_zone_range zrange;
- enum req_opf op;
+ enum req_op op;
+ int ret;
if (!argp)
return -EINVAL;
- q = bdev_get_queue(bdev);
- if (!q)
- return -ENXIO;
-
- if (!blk_queue_is_zoned(q))
+ if (!bdev_is_zoned(bdev))
return -ENOTTY;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- if (!(mode & FMODE_WRITE))
+ if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range)))
@@ -351,6 +399,12 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
switch (cmd) {
case BLKRESETZONE:
op = REQ_OP_ZONE_RESET;
+
+ /* Invalidate the page cache, including dirty pages. */
+ filemap_invalidate_lock(bdev->bd_inode->i_mapping);
+ ret = blkdev_truncate_zone_range(bdev, mode, &zrange);
+ if (ret)
+ goto fail;
break;
case BLKOPENZONE:
op = REQ_OP_ZONE_OPEN;
@@ -365,23 +419,22 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode,
return -ENOTTY;
}
- return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
- GFP_KERNEL);
-}
+ ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors,
+ GFP_KERNEL);
-static inline unsigned long *blk_alloc_zone_bitmap(int node,
- unsigned int nr_zones)
-{
- return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long),
- GFP_NOIO, node);
+fail:
+ if (cmd == BLKRESETZONE)
+ filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+
+ return ret;
}
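The ioctl side above takes a struct blk_zone_range and, for BLKRESETZONE, now invalidates the page cache over that range before resetting. From user space the call looks roughly like this (device path and the single-zone range are placeholders; passing sector 0 and the full capacity instead lets the kernel take the all-zones path shown earlier):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/blkzoned.h>

int main(void)
{
	struct blk_zone_range range = {
		.sector = 0,		/* first zone, placeholder */
		.nr_sectors = 524288,	/* one 256 MiB zone, placeholder */
	};
	int fd;

	/* BLKRESETZONE requires the device to be open for writing */
	fd = open("/dev/nvme0n1", O_RDWR);
	if (fd < 0)
		return 1;

	if (ioctl(fd, BLKRESETZONE, &range) < 0)
		perror("BLKRESETZONE");

	close(fd);
	return 0;
}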
-void blk_queue_free_zone_bitmaps(struct request_queue *q)
+void disk_free_zone_bitmaps(struct gendisk *disk)
{
- kfree(q->conv_zones_bitmap);
- q->conv_zones_bitmap = NULL;
- kfree(q->seq_zones_wlock);
- q->seq_zones_wlock = NULL;
+ kfree(disk->conv_zones_bitmap);
+ disk->conv_zones_bitmap = NULL;
+ kfree(disk->seq_zones_wlock);
+ disk->seq_zones_wlock = NULL;
}
struct blk_revalidate_zone_args {
@@ -389,7 +442,6 @@ struct blk_revalidate_zone_args {
unsigned long *conv_zones_bitmap;
unsigned long *seq_zones_wlock;
unsigned int nr_zones;
- sector_t zone_sectors;
sector_t sector;
};
@@ -403,38 +455,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
struct gendisk *disk = args->disk;
struct request_queue *q = disk->queue;
sector_t capacity = get_capacity(disk);
+ sector_t zone_sectors = q->limits.chunk_sectors;
+
+ /* Check for bad zones and holes in the zone report */
+ if (zone->start != args->sector) {
+ pr_warn("%s: Zone gap at sectors %llu..%llu\n",
+ disk->disk_name, args->sector, zone->start);
+ return -ENODEV;
+ }
+
+ if (zone->start >= capacity || !zone->len) {
+ pr_warn("%s: Invalid zone start %llu, length %llu\n",
+ disk->disk_name, zone->start, zone->len);
+ return -ENODEV;
+ }
/*
* All zones must have the same size, with the exception of an eventual
* smaller last zone.
*/
- if (zone->start == 0) {
- if (zone->len == 0 || !is_power_of_2(zone->len)) {
- pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n",
- disk->disk_name, zone->len);
- return -ENODEV;
- }
-
- args->zone_sectors = zone->len;
- args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len);
- } else if (zone->start + args->zone_sectors < capacity) {
- if (zone->len != args->zone_sectors) {
+ if (zone->start + zone->len < capacity) {
+ if (zone->len != zone_sectors) {
pr_warn("%s: Invalid zoned device with non constant zone size\n",
disk->disk_name);
return -ENODEV;
}
- } else {
- if (zone->len > args->zone_sectors) {
- pr_warn("%s: Invalid zoned device with larger last zone size\n",
- disk->disk_name);
- return -ENODEV;
- }
- }
-
- /* Check for holes in the zone report */
- if (zone->start != args->sector) {
- pr_warn("%s: Zone gap at sectors %llu..%llu\n",
- disk->disk_name, args->sector, zone->start);
+ } else if (zone->len > zone_sectors) {
+ pr_warn("%s: Invalid zoned device with larger last zone size\n",
+ disk->disk_name);
return -ENODEV;
}
@@ -450,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
set_bit(idx, args->conv_zones_bitmap);
break;
case BLK_ZONE_TYPE_SEQWRITE_REQ:
- case BLK_ZONE_TYPE_SEQWRITE_PREF:
if (!args->seq_zones_wlock) {
args->seq_zones_wlock =
blk_alloc_zone_bitmap(q->node, args->nr_zones);
@@ -458,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
return -ENOMEM;
}
break;
+ case BLK_ZONE_TYPE_SEQWRITE_PREF:
default:
pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n",
disk->disk_name, (int)zone->type, zone->start);
@@ -473,11 +521,13 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx,
* @disk: Target disk
* @update_driver_data: Callback to update driver data on the frozen disk
*
- * Helper function for low-level device drivers to (re) allocate and initialize
- * a disk request queue zone bitmaps. This functions should normally be called
- * within the disk ->revalidate method for blk-mq based drivers. For BIO based
- * drivers only q->nr_zones needs to be updated so that the sysfs exposed value
- * is correct.
+ * Helper function for low-level device drivers to check, (re)allocate and
+ * initialize a disk's zone bitmaps. This function should normally
+ * be called within the disk ->revalidate method for blk-mq based drivers.
+ * Before calling this function, the device driver must already have set the
+ * device zone size (chunk_sector limit) and the max zone append limit.
+ * For BIO based drivers, this function cannot be used. BIO based device drivers
+ * only need to set disk->nr_zones so that the sysfs exposed value is correct.
* If the @update_driver_data callback function is not NULL, the callback is
* executed with the device request queue frozen after all zones have been
* checked.
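The reworked kernel-doc above puts two obligations on blk-mq drivers before they call blk_revalidate_disk_zones(): the zone size (chunk_sectors) and the maximum zone-append size must already be set on the queue. A minimal sketch of that ordering in a driver's revalidate path, assuming the driver already knows its zone geometry (everything except the block-layer helpers is a made-up name):

#include <linux/blkdev.h>

/* Sketch only: a zoned blk-mq driver re-reading its zone layout. */
static int mydrv_revalidate_zones(struct gendisk *disk,
				  unsigned int zone_sectors,
				  unsigned int max_append_sectors)
{
	struct request_queue *q = disk->queue;

	/* 1) advertise the zone size and the zone-append limit first ... */
	blk_queue_chunk_sectors(q, zone_sectors);
	blk_queue_max_zone_append_sectors(q, max_append_sectors);

	/* 2) ... then let the block layer rebuild the zone bitmaps */
	return blk_revalidate_disk_zones(disk, NULL);
}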
@@ -486,9 +536,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
void (*update_driver_data)(struct gendisk *disk))
{
struct request_queue *q = disk->queue;
- struct blk_revalidate_zone_args args = {
- .disk = disk,
- };
+ sector_t zone_sectors = q->limits.chunk_sectors;
+ sector_t capacity = get_capacity(disk);
+ struct blk_revalidate_zone_args args = { };
unsigned int noio_flag;
int ret;
@@ -497,32 +547,66 @@ int blk_revalidate_disk_zones(struct gendisk *disk,
if (WARN_ON_ONCE(!queue_is_mq(q)))
return -EIO;
+ if (!capacity)
+ return -ENODEV;
+
+ /*
+ * Checks that the device driver indicated a valid zone size and that
+ * the max zone append limit is set.
+ */
+ if (!zone_sectors || !is_power_of_2(zone_sectors)) {
+ pr_warn("%s: Invalid non power of two zone size (%llu)\n",
+ disk->disk_name, zone_sectors);
+ return -ENODEV;
+ }
+
+ if (!q->limits.max_zone_append_sectors) {
+ pr_warn("%s: Invalid 0 maximum zone append limit\n",
+ disk->disk_name);
+ return -ENODEV;
+ }
+
/*
* Ensure that all memory allocations in this context are done as if
* GFP_NOIO was specified.
*/
+ args.disk = disk;
+ args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors);
noio_flag = memalloc_noio_save();
ret = disk->fops->report_zones(disk, 0, UINT_MAX,
blk_revalidate_zone_cb, &args);
+ if (!ret) {
+ pr_warn("%s: No zones reported\n", disk->disk_name);
+ ret = -ENODEV;
+ }
memalloc_noio_restore(noio_flag);
/*
+ * If zones were reported, make sure that the entire disk capacity
+ * has been checked.
+ */
+ if (ret > 0 && args.sector != capacity) {
+ pr_warn("%s: Missing zones from sector %llu\n",
+ disk->disk_name, args.sector);
+ ret = -ENODEV;
+ }
+
+ /*
* Install the new bitmaps and update nr_zones only once the queue is
* stopped and all I/Os are completed (i.e. a scheduler is not
* referencing the bitmaps).
*/
blk_mq_freeze_queue(q);
- if (ret >= 0) {
- blk_queue_chunk_sectors(q, args.zone_sectors);
- q->nr_zones = args.nr_zones;
- swap(q->seq_zones_wlock, args.seq_zones_wlock);
- swap(q->conv_zones_bitmap, args.conv_zones_bitmap);
+ if (ret > 0) {
+ disk->nr_zones = args.nr_zones;
+ swap(disk->seq_zones_wlock, args.seq_zones_wlock);
+ swap(disk->conv_zones_bitmap, args.conv_zones_bitmap);
if (update_driver_data)
update_driver_data(disk);
ret = 0;
} else {
pr_warn("%s: failed to revalidate zones\n", disk->disk_name);
- blk_queue_free_zone_bitmaps(q);
+ disk_free_zone_bitmaps(disk);
}
blk_mq_unfreeze_queue(q);
diff --git a/block/blk.h b/block/blk.h
index b5d1f0fc6547..1ef920f72e0f 100644
--- a/block/blk.h
+++ b/block/blk.h
@@ -2,66 +2,82 @@
#ifndef BLK_INTERNAL_H
#define BLK_INTERNAL_H
-#include <linux/idr.h>
-#include <linux/blk-mq.h>
-#include <linux/part_stat.h>
#include <linux/blk-crypto.h>
+#include <linux/memblock.h> /* for max_pfn/max_low_pfn */
#include <xen/xen.h>
#include "blk-crypto-internal.h"
-#include "blk-mq.h"
-#include "blk-mq-sched.h"
+
+struct elevator_type;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
-#ifdef CONFIG_DEBUG_FS
extern struct dentry *blk_debugfs_root;
-#endif
struct blk_flush_queue {
+ spinlock_t mq_flush_lock;
unsigned int flush_pending_idx:1;
unsigned int flush_running_idx:1;
blk_status_t rq_status;
unsigned long flush_pending_since;
struct list_head flush_queue[2];
- struct list_head flush_data_in_flight;
+ unsigned long flush_data_in_flight;
struct request *flush_rq;
+};
+
+bool is_flush_rq(struct request *req);
+
+struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
+ gfp_t flags);
+void blk_free_flush_queue(struct blk_flush_queue *q);
+
+void blk_freeze_queue(struct request_queue *q);
+void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic);
+void blk_queue_start_drain(struct request_queue *q);
+int __bio_queue_enter(struct request_queue *q, struct bio *bio);
+void submit_bio_noacct_nocheck(struct bio *bio);
+
+static inline bool blk_try_enter_queue(struct request_queue *q, bool pm)
+{
+ rcu_read_lock();
+ if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter))
+ goto fail;
/*
- * flush_rq shares tag with this rq, both can't be active
- * at the same time
+ * The code that increments the pm_only counter must ensure that the
+ * counter is globally visible before the queue is unfrozen.
*/
- struct request *orig_rq;
- struct lock_class_key key;
- spinlock_t mq_flush_lock;
-};
+ if (blk_queue_pm_only(q) &&
+ (!pm || queue_rpm_status(q) == RPM_SUSPENDED))
+ goto fail_put;
-extern struct kmem_cache *blk_requestq_cachep;
-extern struct kobj_type blk_queue_ktype;
-extern struct ida blk_queue_ida;
+ rcu_read_unlock();
+ return true;
-static inline struct blk_flush_queue *
-blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx)
-{
- return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq;
+fail_put:
+ blk_queue_exit(q);
+fail:
+ rcu_read_unlock();
+ return false;
}
-static inline void __blk_get_queue(struct request_queue *q)
+static inline int bio_queue_enter(struct bio *bio)
{
- kobject_get(&q->kobj);
-}
+ struct request_queue *q = bdev_get_queue(bio->bi_bdev);
-static inline bool
-is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx)
-{
- return hctx->fq->flush_rq == req;
+ if (blk_try_enter_queue(q, false))
+ return 0;
+ return __bio_queue_enter(q, bio);
}
-struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size,
- gfp_t flags);
-void blk_free_flush_queue(struct blk_flush_queue *q);
+#define BIO_INLINE_VECS 4
+struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs,
+ gfp_t gfp_mask);
+void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs);
-void blk_freeze_queue(struct request_queue *q);
+bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv,
+ struct page *page, unsigned len, unsigned offset,
+ bool *same_page);
static inline bool biovec_phys_mergeable(struct request_queue *q,
struct bio_vec *vec1, struct bio_vec *vec2)
@@ -70,6 +86,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset;
phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset;
+ /*
+ * Merging adjacent physical pages may not work correctly under KMSAN
+ * if their metadata pages aren't adjacent. Just disable merging.
+ */
+ if (IS_ENABLED(CONFIG_KMSAN))
+ return false;
+
if (addr1 + vec1->bv_len != addr2)
return false;
if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page))
@@ -79,35 +102,81 @@ static inline bool biovec_phys_mergeable(struct request_queue *q,
return true;
}
-static inline bool __bvec_gap_to_prev(struct request_queue *q,
+static inline bool __bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- return (offset & queue_virt_boundary(q)) ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return (offset & lim->virt_boundary_mask) ||
+ ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask);
}
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
*/
-static inline bool bvec_gap_to_prev(struct request_queue *q,
+static inline bool bvec_gap_to_prev(const struct queue_limits *lim,
struct bio_vec *bprv, unsigned int offset)
{
- if (!queue_virt_boundary(q))
+ if (!lim->virt_boundary_mask)
+ return false;
+ return __bvec_gap_to_prev(lim, bprv, offset);
+}
+
+static inline bool rq_mergeable(struct request *rq)
+{
+ if (blk_rq_is_passthrough(rq))
+ return false;
+
+ if (req_op(rq) == REQ_OP_FLUSH)
+ return false;
+
+ if (req_op(rq) == REQ_OP_WRITE_ZEROES)
+ return false;
+
+ if (req_op(rq) == REQ_OP_ZONE_APPEND)
+ return false;
+
+ if (rq->cmd_flags & REQ_NOMERGE_FLAGS)
return false;
- return __bvec_gap_to_prev(q, bprv, offset);
+ if (rq->rq_flags & RQF_NOMERGE_FLAGS)
+ return false;
+
+ return true;
+}
+
+/*
+ * There are two different ways to handle DISCARD merges:
+ * 1) If max_discard_segments > 1, the driver treats every bio as a range and
+ *    sends the bios to the controller together. The ranges don't need to be
+ * contiguous.
+ * 2) Otherwise, the request will be normal read/write requests. The ranges
+ * need to be contiguous.
+ */
+static inline bool blk_discard_mergable(struct request *req)
+{
+ if (req_op(req) == REQ_OP_DISCARD &&
+ queue_max_discard_segments(req->q) > 1)
+ return true;
+ return false;
}
-static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio,
- unsigned int nr_segs)
+static inline unsigned int blk_rq_get_max_segments(struct request *rq)
{
- rq->nr_phys_segments = nr_segs;
- rq->__data_len = bio->bi_iter.bi_size;
- rq->bio = rq->biotail = bio;
- rq->ioprio = bio_prio(bio);
+ if (req_op(rq) == REQ_OP_DISCARD)
+ return queue_max_discard_segments(rq->q);
+ return queue_max_segments(rq->q);
+}
+
+static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q,
+ enum req_op op)
+{
+ if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE))
+ return min(q->limits.max_discard_sectors,
+ UINT_MAX >> SECTOR_SHIFT);
+
+ if (unlikely(op == REQ_OP_WRITE_ZEROES))
+ return q->limits.max_write_zeroes_sectors;
- if (bio->bi_disk)
- rq->rq_disk = bio->bi_disk;
+ return q->limits.max_sectors;
}
#ifdef CONFIG_BLK_DEV_INTEGRITY
@@ -121,13 +190,19 @@ static inline bool bio_integrity_endio(struct bio *bio)
return true;
}
+bool blk_integrity_merge_rq(struct request_queue *, struct request *,
+ struct request *);
+bool blk_integrity_merge_bio(struct request_queue *, struct request *,
+ struct bio *);
+
static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio *next)
{
struct bio_integrity_payload *bip = bio_integrity(req->bio);
struct bio_integrity_payload *bip_next = bio_integrity(next);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
@@ -137,13 +212,23 @@ static inline bool integrity_req_gap_front_merge(struct request *req,
struct bio_integrity_payload *bip = bio_integrity(bio);
struct bio_integrity_payload *bip_next = bio_integrity(req->bio);
- return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1],
+ return bvec_gap_to_prev(&req->q->limits,
+ &bip->bip_vec[bip->bip_vcnt - 1],
bip_next->bip_vec[0].bv_offset);
}
-void blk_integrity_add(struct gendisk *);
-void blk_integrity_del(struct gendisk *);
+extern const struct attribute_group blk_integrity_attr_group;
#else /* CONFIG_BLK_DEV_INTEGRITY */
+static inline bool blk_integrity_merge_rq(struct request_queue *rq,
+ struct request *r1, struct request *r2)
+{
+ return true;
+}
+static inline bool blk_integrity_merge_bio(struct request_queue *rq,
+ struct request *r, struct bio *b)
+{
+ return true;
+}
static inline bool integrity_req_gap_back_merge(struct request *req,
struct bio *next)
{
@@ -165,54 +250,35 @@ static inline bool bio_integrity_endio(struct bio *bio)
static inline void bio_integrity_free(struct bio *bio)
{
}
-static inline void blk_integrity_add(struct gendisk *disk)
-{
-}
-static inline void blk_integrity_del(struct gendisk *disk)
-{
-}
#endif /* CONFIG_BLK_DEV_INTEGRITY */
unsigned long blk_rq_timeout(unsigned long timeout);
void blk_add_timer(struct request *req);
-bool bio_attempt_front_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs);
-bool bio_attempt_back_merge(struct request *req, struct bio *bio,
- unsigned int nr_segs);
-bool bio_attempt_discard_merge(struct request_queue *q, struct request *req,
- struct bio *bio);
bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio,
- unsigned int nr_segs, struct request **same_queue_rq);
+ unsigned int nr_segs);
+bool blk_bio_list_merge(struct request_queue *q, struct list_head *list,
+ struct bio *bio, unsigned int nr_segs);
-void blk_account_io_start(struct request *req);
-void blk_account_io_done(struct request *req, u64 now);
+/*
+ * Plug flush limits
+ */
+#define BLK_MAX_REQUEST_COUNT 32
+#define BLK_PLUG_FLUSH_SIZE (128 * 1024)
/*
* Internal elevator interface
*/
#define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED)
-void blk_insert_flush(struct request *rq);
+bool blk_insert_flush(struct request *rq);
-void elevator_init_mq(struct request_queue *q);
-int elevator_switch_mq(struct request_queue *q,
- struct elevator_type *new_e);
-void __elevator_exit(struct request_queue *, struct elevator_queue *);
+int elevator_switch(struct request_queue *q, struct elevator_type *new_e);
+void elevator_disable(struct request_queue *q);
+void elevator_exit(struct request_queue *q);
int elv_register_queue(struct request_queue *q, bool uevent);
void elv_unregister_queue(struct request_queue *q);
-static inline void elevator_exit(struct request_queue *q,
- struct elevator_queue *e)
-{
- lockdep_assert_held(&q->sysfs_lock);
-
- blk_mq_sched_free_requests(q);
- __elevator_exit(q, e);
-}
-
-struct hd_struct *__disk_get_part(struct gendisk *disk, int partno);
-
ssize_t part_size_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_stat_show(struct device *dev, struct device_attribute *attr,
@@ -223,34 +289,47 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr,
char *buf);
ssize_t part_fail_store(struct device *dev, struct device_attribute *attr,
const char *buf, size_t count);
-
-#ifdef CONFIG_FAIL_IO_TIMEOUT
-int blk_should_fake_timeout(struct request_queue *);
ssize_t part_timeout_show(struct device *, struct device_attribute *, char *);
ssize_t part_timeout_store(struct device *, struct device_attribute *,
const char *, size_t);
-#else
-static inline int blk_should_fake_timeout(struct request_queue *q)
+
+static inline bool bio_may_exceed_limits(struct bio *bio,
+ const struct queue_limits *lim)
{
- return 0;
+ switch (bio_op(bio)) {
+ case REQ_OP_DISCARD:
+ case REQ_OP_SECURE_ERASE:
+ case REQ_OP_WRITE_ZEROES:
+ return true; /* non-trivial splitting decisions */
+ default:
+ break;
+ }
+
+ /*
+ * All drivers must accept single-segment bios that are <= PAGE_SIZE.
+ * This is a quick and dirty check that relies on the fact that
+ * bi_io_vec[0] is always valid if a bio has data. The check might
+ * lead to occasional false negatives when bios are cloned, but compared
+ * to the performance impact of cloned bios themselves the loop below
+ * doesn't matter anyway.
+ */
+ return lim->chunk_sectors || bio->bi_vcnt != 1 ||
+ bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE;
}
-#endif
-void __blk_queue_split(struct request_queue *q, struct bio **bio,
- unsigned int *nr_segs);
+struct bio *__bio_split_to_limits(struct bio *bio,
+ const struct queue_limits *lim,
+ unsigned int *nr_segs);
int ll_back_merge_fn(struct request *req, struct bio *bio,
unsigned int nr_segs);
-int ll_front_merge_fn(struct request *req, struct bio *bio,
- unsigned int nr_segs);
-struct request *attempt_back_merge(struct request_queue *q, struct request *rq);
-struct request *attempt_front_merge(struct request_queue *q, struct request *rq);
-int blk_attempt_req_merge(struct request_queue *q, struct request *rq,
+bool blk_attempt_req_merge(struct request_queue *q, struct request *rq,
struct request *next);
unsigned int blk_recalc_rq_segments(struct request *rq);
void blk_rq_set_mixed_merge(struct request *rq);
bool blk_rq_merge_ok(struct request *rq, struct bio *bio);
enum elv_merge blk_try_merge(struct request *rq, struct bio *bio);
+void blk_set_default_limits(struct queue_limits *lim);
int blk_dev_init(void);
/*
@@ -261,9 +340,11 @@ int blk_dev_init(void);
*/
static inline bool blk_do_io_stat(struct request *rq)
{
- return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT);
+ return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq);
}
+void update_io_ticks(struct block_device *part, unsigned long now, bool end);
+
static inline void req_set_nomerge(struct request_queue *q, struct request *req)
{
req->cmd_flags |= REQ_NOMERGE;
@@ -272,38 +353,18 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req)
}
/*
- * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size
- * is defined as 'unsigned int', meantime it has to aligned to with logical
- * block size which is the minimum accepted unit by hardware.
- */
-static inline unsigned int bio_allowed_max_sectors(struct request_queue *q)
-{
- return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9;
-}
-
-/*
* Internal io_context interface
*/
-void get_io_context(struct io_context *ioc);
-struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q);
-struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q,
- gfp_t gfp_mask);
+struct io_cq *ioc_find_get_icq(struct request_queue *q);
+struct io_cq *ioc_lookup_icq(struct request_queue *q);
+#ifdef CONFIG_BLK_ICQ
void ioc_clear_queue(struct request_queue *q);
+#else
+static inline void ioc_clear_queue(struct request_queue *q)
+{
+}
+#endif /* CONFIG_BLK_ICQ */
-int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node);
-
-/*
- * Internal throttling interface
- */
-#ifdef CONFIG_BLK_DEV_THROTTLING
-extern int blk_throtl_init(struct request_queue *q);
-extern void blk_throtl_exit(struct request_queue *q);
-extern void blk_throtl_register_queue(struct request_queue *q);
-#else /* CONFIG_BLK_DEV_THROTTLING */
-static inline int blk_throtl_init(struct request_queue *q) { return 0; }
-static inline void blk_throtl_exit(struct request_queue *q) { }
-static inline void blk_throtl_register_queue(struct request_queue *q) { }
-#endif /* CONFIG_BLK_DEV_THROTTLING */
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page);
extern ssize_t blk_throtl_sample_time_store(struct request_queue *q,
@@ -315,129 +376,144 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { }
static inline void blk_throtl_stat_add(struct request *rq, u64 time) { }
#endif
-#ifdef CONFIG_BOUNCE
-extern int init_emergency_isa_pool(void);
-extern void blk_queue_bounce(struct request_queue *q, struct bio **bio);
-#else
-static inline int init_emergency_isa_pool(void)
+struct bio *__blk_queue_bounce(struct bio *bio, struct request_queue *q);
+
+static inline bool blk_queue_may_bounce(struct request_queue *q)
{
- return 0;
+ return IS_ENABLED(CONFIG_BOUNCE) &&
+ q->limits.bounce == BLK_BOUNCE_HIGH &&
+ max_low_pfn >= max_pfn;
}
-static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio)
+
+static inline struct bio *blk_queue_bounce(struct bio *bio,
+ struct request_queue *q)
{
+ if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio)))
+ return __blk_queue_bounce(bio, q);
+ return bio;
}
-#endif /* CONFIG_BOUNCE */
-
-#ifdef CONFIG_BLK_CGROUP_IOLATENCY
-extern int blk_iolatency_init(struct request_queue *q);
-#else
-static inline int blk_iolatency_init(struct request_queue *q) { return 0; }
-#endif
-
-struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp);
#ifdef CONFIG_BLK_DEV_ZONED
-void blk_queue_free_zone_bitmaps(struct request_queue *q);
-#else
-static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {}
-#endif
+void disk_free_zone_bitmaps(struct gendisk *disk);
+int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd,
+ unsigned long arg);
+int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode,
+ unsigned int cmd, unsigned long arg);
+#else /* CONFIG_BLK_DEV_ZONED */
+static inline void disk_free_zone_bitmaps(struct gendisk *disk) {}
+static inline int blkdev_report_zones_ioctl(struct block_device *bdev,
+ unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev,
+ blk_mode_t mode, unsigned int cmd, unsigned long arg)
+{
+ return -ENOTTY;
+}
+#endif /* CONFIG_BLK_DEV_ZONED */
-struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector);
+struct block_device *bdev_alloc(struct gendisk *disk, u8 partno);
+void bdev_add(struct block_device *bdev, dev_t dev);
-int blk_alloc_devt(struct hd_struct *part, dev_t *devt);
-void blk_free_devt(dev_t devt);
-void blk_invalidate_devt(dev_t devt);
-char *disk_name(struct gendisk *hd, int partno, char *buf);
+int blk_alloc_ext_minor(void);
+void blk_free_ext_minor(unsigned int minor);
#define ADDPART_FLAG_NONE 0
#define ADDPART_FLAG_RAID 1
#define ADDPART_FLAG_WHOLEDISK 2
-void delete_partition(struct gendisk *disk, struct hd_struct *part);
-int bdev_add_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
-int bdev_del_partition(struct block_device *bdev, int partno);
-int bdev_resize_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length);
-int disk_expand_part_tbl(struct gendisk *disk, int target);
-int hd_ref_init(struct hd_struct *part);
+int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
+int bdev_del_partition(struct gendisk *disk, int partno);
+int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length);
+void drop_partition(struct block_device *part);
-/* no need to get/put refcount of part0 */
-static inline int hd_struct_try_get(struct hd_struct *part)
-{
- if (part->partno)
- return percpu_ref_tryget_live(&part->ref);
- return 1;
-}
+void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors);
+
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass);
+
+int bio_add_hw_page(struct request_queue *q, struct bio *bio,
+ struct page *page, unsigned int len, unsigned int offset,
+ unsigned int max_sectors, bool *same_page);
-static inline void hd_struct_put(struct hd_struct *part)
+/*
+ * Clean up a page appropriately, where the page may be pinned, may have a
+ * ref taken on it or neither.
+ */
+static inline void bio_release_page(struct bio *bio, struct page *page)
{
- if (part->partno)
- percpu_ref_put(&part->ref);
+ if (bio_flagged(bio, BIO_PAGE_PINNED))
+ unpin_user_page(page);
}
-static inline void hd_free_part(struct hd_struct *part)
+struct request_queue *blk_alloc_queue(int node_id);
+
+int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode);
+
+int disk_alloc_events(struct gendisk *disk);
+void disk_add_events(struct gendisk *disk);
+void disk_del_events(struct gendisk *disk);
+void disk_release_events(struct gendisk *disk);
+void disk_block_events(struct gendisk *disk);
+void disk_unblock_events(struct gendisk *disk);
+void disk_flush_events(struct gendisk *disk, unsigned int mask);
+extern struct device_attribute dev_attr_events;
+extern struct device_attribute dev_attr_events_async;
+extern struct device_attribute dev_attr_events_poll_msecs;
+
+extern struct attribute_group blk_trace_attr_group;
+
+blk_mode_t file_to_blk_mode(struct file *file);
+int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode,
+ loff_t lstart, loff_t lend);
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg);
+
+extern const struct address_space_operations def_blk_aops;
+
+int disk_register_independent_access_ranges(struct gendisk *disk);
+void disk_unregister_independent_access_ranges(struct gendisk *disk);
+
+#ifdef CONFIG_FAIL_MAKE_REQUEST
+bool should_fail_request(struct block_device *part, unsigned int bytes);
+#else /* CONFIG_FAIL_MAKE_REQUEST */
+static inline bool should_fail_request(struct block_device *part,
+ unsigned int bytes)
{
- free_percpu(part->dkstats);
- kfree(part->info);
- percpu_ref_exit(&part->ref);
+ return false;
}
+#endif /* CONFIG_FAIL_MAKE_REQUEST */
/*
- * Any access of part->nr_sects which is not protected by partition
- * bd_mutex or gendisk bdev bd_mutex, should be done using this
- * accessor function.
+ * Optimized request reference counting. Ideally we'd make timeouts be more
+ * clever, as that's the only reason we need references at all... But until
+ * this happens, this is faster than using refcount_t. Also see:
*
- * Code written along the lines of i_size_read() and i_size_write().
- * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption
- * on.
+ * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count")
*/
-static inline sector_t part_nr_sects_read(struct hd_struct *part)
-{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
- sector_t nr_sects;
- unsigned seq;
- do {
- seq = read_seqcount_begin(&part->nr_sects_seq);
- nr_sects = part->nr_sects;
- } while (read_seqcount_retry(&part->nr_sects_seq, seq));
- return nr_sects;
-#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
- sector_t nr_sects;
-
- preempt_disable();
- nr_sects = part->nr_sects;
- preempt_enable();
- return nr_sects;
-#else
- return part->nr_sects;
-#endif
+#define req_ref_zero_or_close_to_overflow(req) \
+ ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u)
+
+static inline bool req_ref_inc_not_zero(struct request *req)
+{
+ return atomic_inc_not_zero(&req->ref);
}
-/*
- * Should be called with mutex lock held (typically bd_mutex) of partition
- * to provide mutual exlusion among writers otherwise seqcount might be
- * left in wrong state leaving the readers spinning infinitely.
- */
-static inline void part_nr_sects_write(struct hd_struct *part, sector_t size)
-{
-#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
- preempt_disable();
- write_seqcount_begin(&part->nr_sects_seq);
- part->nr_sects = size;
- write_seqcount_end(&part->nr_sects_seq);
- preempt_enable();
-#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION)
- preempt_disable();
- part->nr_sects = size;
- preempt_enable();
-#else
- part->nr_sects = size;
-#endif
+static inline bool req_ref_put_and_test(struct request *req)
+{
+ WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req));
+ return atomic_dec_and_test(&req->ref);
}
-struct request_queue *__blk_alloc_queue(int node_id);
+static inline void req_ref_set(struct request *req, int value)
+{
+ atomic_set(&req->ref, value);
+}
-int bio_add_hw_page(struct request_queue *q, struct bio *bio,
- struct page *page, unsigned int len, unsigned int offset,
- unsigned int max_sectors, bool *same_page);
+static inline int req_ref_read(struct request *req)
+{
+ return atomic_read(&req->ref);
+}
#endif /* BLK_INTERNAL_H */
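The req_ref_* helpers above trade refcount_t's saturation semantics for a raw atomic_t plus a WARN when the count looks corrupted (zero, or just wrapped past zero). The same pattern in stand-alone C using C11 atomics in place of atomic_t (the struct and helper names here are illustrative):

#include <assert.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

struct req { atomic_uint ref; };

/* Mirrors req_ref_zero_or_close_to_overflow(): 0 or "just wrapped" values. */
static bool ref_looks_corrupted(struct req *r)
{
	return atomic_load(&r->ref) + 127u <= 127u;
}

static bool req_put_and_test(struct req *r)
{
	assert(!ref_looks_corrupted(r));	/* WARN_ON_ONCE() in the kernel */
	return atomic_fetch_sub(&r->ref, 1) == 1;	/* true when it hits zero */
}

int main(void)
{
	struct req r;

	atomic_store(&r.ref, 2);			/* req_ref_set() */
	printf("%d\n", req_put_and_test(&r));		/* 0: still referenced */
	printf("%d\n", req_put_and_test(&r));		/* 1: last reference dropped */
	return 0;
}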
diff --git a/block/bounce.c b/block/bounce.c
index c3aaed070124..7cfcb242f9a1 100644
--- a/block/bounce.c
+++ b/block/bounce.c
@@ -18,18 +18,18 @@
#include <linux/init.h>
#include <linux/hash.h>
#include <linux/highmem.h>
-#include <linux/memblock.h>
#include <linux/printk.h>
#include <asm/tlbflush.h>
#include <trace/events/block.h>
#include "blk.h"
+#include "blk-cgroup.h"
#define POOL_SIZE 64
#define ISA_POOL_SIZE 16
static struct bio_set bounce_bio_set, bounce_bio_split;
-static mempool_t page_pool, isa_page_pool;
+static mempool_t page_pool;
static void init_bounce_bioset(void)
{
@@ -49,11 +49,11 @@ static void init_bounce_bioset(void)
bounce_bs_setup = true;
}
-#if defined(CONFIG_HIGHMEM)
static __init int init_emergency_pool(void)
{
int ret;
-#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG)
+
+#ifndef CONFIG_MEMORY_HOTPLUG
if (max_pfn <= max_low_pfn)
return 0;
#endif
@@ -67,62 +67,6 @@ static __init int init_emergency_pool(void)
}
__initcall(init_emergency_pool);
-#endif
-
-#ifdef CONFIG_HIGHMEM
-/*
- * highmem version, map in to vec
- */
-static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom)
-{
- unsigned char *vto;
-
- vto = kmap_atomic(to->bv_page);
- memcpy(vto + to->bv_offset, vfrom, to->bv_len);
- kunmap_atomic(vto);
-}
-
-#else /* CONFIG_HIGHMEM */
-
-#define bounce_copy_vec(to, vfrom) \
- memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len)
-
-#endif /* CONFIG_HIGHMEM */
-
-/*
- * allocate pages in the DMA region for the ISA pool
- */
-static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data)
-{
- return mempool_alloc_pages(gfp_mask | GFP_DMA, data);
-}
-
-static DEFINE_MUTEX(isa_mutex);
-
-/*
- * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA
- * as the max address, so check if the pool has already been created.
- */
-int init_emergency_isa_pool(void)
-{
- int ret;
-
- mutex_lock(&isa_mutex);
-
- if (mempool_initialized(&isa_page_pool)) {
- mutex_unlock(&isa_mutex);
- return 0;
- }
-
- ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa,
- mempool_free_pages, (void *) 0);
- BUG_ON(ret);
-
- pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE);
- init_bounce_bioset();
- mutex_unlock(&isa_mutex);
- return 0;
-}
/*
* Simple bounce buffer support for highmem pages. Depending on the
@@ -131,7 +75,6 @@ int init_emergency_isa_pool(void)
*/
static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
{
- unsigned char *vfrom;
struct bio_vec tovec, fromvec;
struct bvec_iter iter;
/*
@@ -149,17 +92,14 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from)
* been modified by the block layer, so use the original
* copy, bounce_copy_vec already uses tovec->bv_len
*/
- vfrom = page_address(fromvec.bv_page) +
- tovec.bv_offset;
-
- bounce_copy_vec(&tovec, vfrom);
- flush_dcache_page(tovec.bv_page);
+ memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) +
+ tovec.bv_offset);
}
bio_advance_iter(from, &from_iter, tovec.bv_len);
}
}
-static void bounce_end_io(struct bio *bio, mempool_t *pool)
+static void bounce_end_io(struct bio *bio)
{
struct bio *bio_orig = bio->bi_private;
struct bio_vec *bvec, orig_vec;
@@ -173,7 +113,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
orig_vec = bio_iter_iovec(bio_orig, orig_iter);
if (bvec->bv_page != orig_vec.bv_page) {
dec_zone_page_state(bvec->bv_page, NR_BOUNCE);
- mempool_free(bvec->bv_page, pool);
+ mempool_free(bvec->bv_page, &page_pool);
}
bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len);
}
@@ -185,37 +125,20 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool)
static void bounce_end_io_write(struct bio *bio)
{
- bounce_end_io(bio, &page_pool);
-}
-
-static void bounce_end_io_write_isa(struct bio *bio)
-{
-
- bounce_end_io(bio, &isa_page_pool);
+ bounce_end_io(bio);
}
-static void __bounce_end_io_read(struct bio *bio, mempool_t *pool)
+static void bounce_end_io_read(struct bio *bio)
{
struct bio *bio_orig = bio->bi_private;
if (!bio->bi_status)
copy_to_high_bio_irq(bio_orig, bio);
- bounce_end_io(bio, pool);
-}
-
-static void bounce_end_io_read(struct bio *bio)
-{
- __bounce_end_io_read(bio, &page_pool);
+ bounce_end_io(bio);
}
-static void bounce_end_io_read_isa(struct bio *bio)
-{
- __bounce_end_io_read(bio, &isa_page_pool);
-}
-
-static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
- struct bio_set *bs)
+static struct bio *bounce_clone_bio(struct bio *bio_src)
{
struct bvec_iter iter;
struct bio_vec bv;
@@ -230,26 +153,22 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
* - The point of cloning the biovec is to produce a bio with a biovec
* the caller can modify: bi_idx and bi_bvec_done should be 0.
*
- * - The original bio could've had more than BIO_MAX_PAGES biovecs; if
+ * - The original bio could've had more than BIO_MAX_VECS biovecs; if
* we tried to clone the whole thing bio_alloc_bioset() would fail.
* But the clone should succeed as long as the number of biovecs we
- * actually need to allocate is fewer than BIO_MAX_PAGES.
+ * actually need to allocate is fewer than BIO_MAX_VECS.
*
* - Lastly, bi_vcnt should not be looked at or relied upon by code
* that does not own the bio - reason being drivers don't use it for
* iterating over the biovec anymore, so expecting it to be kept up
* to date (i.e. for clones that share the parent biovec) is just
- * asking for trouble and would force extra work on
- * __bio_clone_fast() anyways.
+ * asking for trouble and would force extra work.
*/
-
- bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs);
- if (!bio)
- return NULL;
- bio->bi_disk = bio_src->bi_disk;
- bio->bi_opf = bio_src->bi_opf;
+ bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src),
+ bio_src->bi_opf, GFP_NOIO, &bounce_bio_set);
+ if (bio_flagged(bio_src, BIO_REMAPPED))
+ bio_set_flag(bio, BIO_REMAPPED);
bio->bi_ioprio = bio_src->bi_ioprio;
- bio->bi_write_hint = bio_src->bi_write_hint;
bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector;
bio->bi_iter.bi_size = bio_src->bi_iter.bi_size;
@@ -258,62 +177,61 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask,
case REQ_OP_SECURE_ERASE:
case REQ_OP_WRITE_ZEROES:
break;
- case REQ_OP_WRITE_SAME:
- bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0];
- break;
default:
bio_for_each_segment(bv, bio_src, iter)
bio->bi_io_vec[bio->bi_vcnt++] = bv;
break;
}
- bio_crypt_clone(bio, bio_src, gfp_mask);
+ if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0)
+ goto err_put;
- if (bio_integrity(bio_src)) {
- int ret;
-
- ret = bio_integrity_clone(bio, bio_src, gfp_mask);
- if (ret < 0) {
- bio_put(bio);
- return NULL;
- }
- }
+ if (bio_integrity(bio_src) &&
+ bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0)
+ goto err_put;
bio_clone_blkg_association(bio, bio_src);
- blkcg_bio_issue_init(bio);
return bio;
+
+err_put:
+ bio_put(bio);
+ return NULL;
}
-static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
- mempool_t *pool)
+struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q)
{
struct bio *bio;
- int rw = bio_data_dir(*bio_orig);
+ int rw = bio_data_dir(bio_orig);
struct bio_vec *to, from;
struct bvec_iter iter;
- unsigned i = 0;
+ unsigned i = 0, bytes = 0;
bool bounce = false;
- int sectors = 0;
- bool passthrough = bio_is_passthrough(*bio_orig);
+ int sectors;
- bio_for_each_segment(from, *bio_orig, iter) {
- if (i++ < BIO_MAX_PAGES)
- sectors += from.bv_len >> 9;
- if (page_to_pfn(from.bv_page) > q->limits.bounce_pfn)
+ bio_for_each_segment(from, bio_orig, iter) {
+ if (i++ < BIO_MAX_VECS)
+ bytes += from.bv_len;
+ if (PageHighMem(from.bv_page))
bounce = true;
}
if (!bounce)
- return;
+ return bio_orig;
- if (!passthrough && sectors < bio_sectors(*bio_orig)) {
- bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
- bio_chain(bio, *bio_orig);
- generic_make_request(*bio_orig);
- *bio_orig = bio;
+ /*
+ * Individual bvecs might not be logical block aligned. Round down
+ * the split size so that each bio is properly block size aligned,
+ * even if we do not use the full hardware limits.
+ */
+ sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >>
+ SECTOR_SHIFT;
+ if (sectors < bio_sectors(bio_orig)) {
+ bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split);
+ bio_chain(bio, bio_orig);
+ submit_bio_noacct(bio_orig);
+ bio_orig = bio;
}
- bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL :
- &bounce_bio_set);
+ bio = bounce_clone_bio(bio_orig);
/*
* Bvec table can't be updated by bio_for_each_segment_all(),
@@ -321,70 +239,30 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig,
* because the 'bio' is single-page bvec.
*/
for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) {
- struct page *page = to->bv_page;
+ struct page *bounce_page;
- if (page_to_pfn(page) <= q->limits.bounce_pfn)
+ if (!PageHighMem(to->bv_page))
continue;
- to->bv_page = mempool_alloc(pool, q->bounce_gfp);
- inc_zone_page_state(to->bv_page, NR_BOUNCE);
+ bounce_page = mempool_alloc(&page_pool, GFP_NOIO);
+ inc_zone_page_state(bounce_page, NR_BOUNCE);
if (rw == WRITE) {
- char *vto, *vfrom;
-
- flush_dcache_page(page);
-
- vto = page_address(to->bv_page) + to->bv_offset;
- vfrom = kmap_atomic(page) + to->bv_offset;
- memcpy(vto, vfrom, to->bv_len);
- kunmap_atomic(vfrom);
+ flush_dcache_page(to->bv_page);
+ memcpy_from_bvec(page_address(bounce_page), to);
}
+ to->bv_page = bounce_page;
}
- trace_block_bio_bounce(q, *bio_orig);
+ trace_block_bio_bounce(bio_orig);
bio->bi_flags |= (1 << BIO_BOUNCED);
- if (pool == &page_pool) {
+ if (rw == READ)
+ bio->bi_end_io = bounce_end_io_read;
+ else
bio->bi_end_io = bounce_end_io_write;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read;
- } else {
- bio->bi_end_io = bounce_end_io_write_isa;
- if (rw == READ)
- bio->bi_end_io = bounce_end_io_read_isa;
- }
-
- bio->bi_private = *bio_orig;
- *bio_orig = bio;
-}
-
-void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig)
-{
- mempool_t *pool;
-
- /*
- * Data-less bio, nothing to bounce
- */
- if (!bio_has_data(*bio_orig))
- return;
- /*
- * for non-isa bounce case, just check if the bounce pfn is equal
- * to or bigger than the highest pfn in the system -- in that case,
- * don't waste time iterating over bio segments
- */
- if (!(q->bounce_gfp & GFP_DMA)) {
- if (q->limits.bounce_pfn >= blk_max_pfn)
- return;
- pool = &page_pool;
- } else {
- BUG_ON(!mempool_initialized(&isa_page_pool));
- pool = &isa_page_pool;
- }
-
- /*
- * slow path
- */
- __blk_queue_bounce(q, bio_orig, pool);
+ bio->bi_private = bio_orig;
+ return bio;
}
diff --git a/block/bsg-lib.c b/block/bsg-lib.c
index 6cbb7926534c..b3acdbdb6e7e 100644
--- a/block/bsg-lib.c
+++ b/block/bsg-lib.c
@@ -6,6 +6,7 @@
* Copyright (C) 2011 Red Hat, Inc. All rights reserved.
* Copyright (C) 2011 Mike Christie
*/
+#include <linux/bsg.h>
#include <linux/slab.h>
#include <linux/blk-mq.h>
#include <linux/delay.h>
@@ -19,36 +20,51 @@
struct bsg_set {
struct blk_mq_tag_set tag_set;
+ struct bsg_device *bd;
bsg_job_fn *job_fn;
bsg_timeout_fn *timeout_fn;
};
-static int bsg_transport_check_proto(struct sg_io_v4 *hdr)
+static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr,
+ bool open_for_write, unsigned int timeout)
{
+ struct bsg_job *job;
+ struct request *rq;
+ struct bio *bio;
+ void *reply;
+ int ret;
+
if (hdr->protocol != BSG_PROTOCOL_SCSI ||
hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT)
return -EINVAL;
if (!capable(CAP_SYS_RAWIO))
return -EPERM;
- return 0;
-}
-static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
- fmode_t mode)
-{
- struct bsg_job *job = blk_mq_rq_to_pdu(rq);
- int ret;
+ rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ?
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0);
+ if (IS_ERR(rq))
+ return PTR_ERR(rq);
+ rq->timeout = timeout;
+
+ job = blk_mq_rq_to_pdu(rq);
+ reply = job->reply;
+ memset(job, 0, sizeof(*job));
+ job->reply = reply;
+ job->reply_len = SCSI_SENSE_BUFFERSIZE;
+ job->dd_data = job + 1;
job->request_len = hdr->request_len;
job->request = memdup_user(uptr64(hdr->request), hdr->request_len);
- if (IS_ERR(job->request))
- return PTR_ERR(job->request);
+ if (IS_ERR(job->request)) {
+ ret = PTR_ERR(job->request);
+ goto out_free_rq;
+ }
if (hdr->dout_xfer_len && hdr->din_xfer_len) {
- job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0);
+ job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0);
if (IS_ERR(job->bidi_rq)) {
ret = PTR_ERR(job->bidi_rq);
- goto out;
+ goto out_free_job_request;
}
ret = blk_rq_map_user(rq->q, job->bidi_rq, NULL,
@@ -63,20 +79,20 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
job->bidi_bio = NULL;
}
- return 0;
+ ret = 0;
+ if (hdr->dout_xfer_len) {
+ ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->dout_xferp),
+ hdr->dout_xfer_len, GFP_KERNEL);
+ } else if (hdr->din_xfer_len) {
+ ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->din_xferp),
+ hdr->din_xfer_len, GFP_KERNEL);
+ }
-out_free_bidi_rq:
- if (job->bidi_rq)
- blk_put_request(job->bidi_rq);
-out:
- kfree(job->request);
- return ret;
-}
+ if (ret)
+ goto out_unmap_bidi_rq;
-static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
-{
- struct bsg_job *job = blk_mq_rq_to_pdu(rq);
- int ret = 0;
+ bio = rq->bio;
+ blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL));
/*
* The assignments below don't make much sense, but are kept for
@@ -84,7 +100,7 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
*/
hdr->device_status = job->result & 0xff;
hdr->transport_status = host_byte(job->result);
- hdr->driver_status = driver_byte(job->result);
+ hdr->driver_status = 0;
hdr->info = 0;
if (hdr->device_status || hdr->transport_status || hdr->driver_status)
hdr->info |= SG_INFO_CHECK;
@@ -119,28 +135,20 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
hdr->din_resid = 0;
}
- return ret;
-}
-
-static void bsg_transport_free_rq(struct request *rq)
-{
- struct bsg_job *job = blk_mq_rq_to_pdu(rq);
-
- if (job->bidi_rq) {
+ blk_rq_unmap_user(bio);
+out_unmap_bidi_rq:
+ if (job->bidi_rq)
blk_rq_unmap_user(job->bidi_bio);
- blk_put_request(job->bidi_rq);
- }
-
+out_free_bidi_rq:
+ if (job->bidi_rq)
+ blk_mq_free_request(job->bidi_rq);
+out_free_job_request:
kfree(job->request);
+out_free_rq:
+ blk_mq_free_request(rq);
+ return ret;
}
-static const struct bsg_ops bsg_transport_ops = {
- .check_proto = bsg_transport_check_proto,
- .fill_hdr = bsg_transport_fill_hdr,
- .complete_rq = bsg_transport_complete_rq,
- .free_rq = bsg_transport_free_rq,
-};
-
/**
* bsg_teardown_job - routine to teardown a bsg job
* @kref: kref inside bsg_job that is to be torn down
@@ -181,9 +189,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get);
void bsg_job_done(struct bsg_job *job, int result,
unsigned int reply_payload_rcv_len)
{
+ struct request *rq = blk_mq_rq_from_pdu(job);
+
job->result = result;
job->reply_payload_rcv_len = reply_payload_rcv_len;
- blk_mq_complete_request(blk_mq_rq_from_pdu(job));
+ if (likely(!blk_should_fake_timeout(rq->q)))
+ blk_mq_complete_request(rq);
}
EXPORT_SYMBOL_GPL(bsg_job_done);
@@ -204,7 +215,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req)
BUG_ON(!req->nr_phys_segments);
- buf->sg_list = kzalloc(sz, GFP_KERNEL);
+ buf->sg_list = kmalloc(sz, GFP_KERNEL);
if (!buf->sg_list)
return -ENOMEM;
sg_init_table(buf->sg_list, req->nr_phys_segments);
@@ -298,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req,
return 0;
}
-/* called right before the request is given to the request_queue user */
-static void bsg_initialize_rq(struct request *req)
-{
- struct bsg_job *job = blk_mq_rq_to_pdu(req);
- void *reply = job->reply;
-
- memset(job, 0, sizeof(*job));
- job->reply = reply;
- job->reply_len = SCSI_SENSE_BUFFERSIZE;
- job->dd_data = job + 1;
-}
-
static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req,
unsigned int hctx_idx)
{
@@ -324,15 +323,16 @@ void bsg_remove_queue(struct request_queue *q)
struct bsg_set *bset =
container_of(q->tag_set, struct bsg_set, tag_set);
- bsg_unregister_queue(q);
- blk_cleanup_queue(q);
+ bsg_unregister_queue(bset->bd);
+ blk_mq_destroy_queue(q);
+ blk_put_queue(q);
blk_mq_free_tag_set(&bset->tag_set);
kfree(bset);
}
}
EXPORT_SYMBOL_GPL(bsg_remove_queue);
-static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved)
+static enum blk_eh_timer_return bsg_timeout(struct request *rq)
{
struct bsg_set *bset =
container_of(rq->q->tag_set, struct bsg_set, tag_set);
@@ -346,7 +346,6 @@ static const struct blk_mq_ops bsg_mq_ops = {
.queue_rq = bsg_queue_rq,
.init_request = bsg_init_rq,
.exit_request = bsg_exit_rq,
- .initialize_rq_fn = bsg_initialize_rq,
.complete = bsg_complete,
.timeout = bsg_timeout,
};
@@ -375,7 +374,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
bset->timeout_fn = timeout;
set = &bset->tag_set;
- set->ops = &bsg_mq_ops,
+ set->ops = &bsg_mq_ops;
set->nr_hw_queues = 1;
set->queue_depth = 128;
set->numa_node = NUMA_NO_NODE;
@@ -393,16 +392,16 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name,
q->queuedata = dev;
blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT);
- ret = bsg_register_queue(q, dev, name, &bsg_transport_ops);
- if (ret) {
- printk(KERN_ERR "%s: bsg interface failed to "
- "initialize - register queue\n", dev->kobj.name);
+ bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn);
+ if (IS_ERR(bset->bd)) {
+ ret = PTR_ERR(bset->bd);
goto out_cleanup_queue;
}
return q;
out_cleanup_queue:
- blk_cleanup_queue(q);
+ blk_mq_destroy_queue(q);
+ blk_put_queue(q);
out_queue:
blk_mq_free_tag_set(set);
out_tag_set:
diff --git a/block/bsg.c b/block/bsg.c
index d7bae94b64d9..72157a59b788 100644
--- a/block/bsg.c
+++ b/block/bsg.c
@@ -15,339 +15,99 @@
#include <scsi/scsi.h>
#include <scsi/scsi_ioctl.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/scsi_device.h>
-#include <scsi/scsi_driver.h>
#include <scsi/sg.h>
#define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver"
#define BSG_VERSION "0.4"
-#define bsg_dbg(bd, fmt, ...) \
- pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__)
-
struct bsg_device {
struct request_queue *queue;
- spinlock_t lock;
- struct hlist_node dev_list;
- refcount_t ref_count;
- char name[20];
+ struct device device;
+ struct cdev cdev;
int max_queue;
+ unsigned int timeout;
+ unsigned int reserved_size;
+ bsg_sg_io_fn *sg_io_fn;
};
-#define BSG_DEFAULT_CMDS 64
-#define BSG_MAX_DEVS 32768
-
-static DEFINE_MUTEX(bsg_mutex);
-static DEFINE_IDR(bsg_minor_idr);
-
-#define BSG_LIST_ARRAY_SIZE 8
-static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE];
-
-static struct class *bsg_class;
-static int bsg_major;
-
-static inline struct hlist_head *bsg_dev_idx_hash(int index)
-{
- return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)];
-}
-
-#define uptr64(val) ((void __user *)(uintptr_t)(val))
-
-static int bsg_scsi_check_proto(struct sg_io_v4 *hdr)
+static inline struct bsg_device *to_bsg_device(struct inode *inode)
{
- if (hdr->protocol != BSG_PROTOCOL_SCSI ||
- hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD)
- return -EINVAL;
- return 0;
+ return container_of(inode->i_cdev, struct bsg_device, cdev);
}
-static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr,
- fmode_t mode)
-{
- struct scsi_request *sreq = scsi_req(rq);
-
- if (hdr->dout_xfer_len && hdr->din_xfer_len) {
- pr_warn_once("BIDI support in bsg has been removed.\n");
- return -EOPNOTSUPP;
- }
-
- sreq->cmd_len = hdr->request_len;
- if (sreq->cmd_len > BLK_MAX_CDB) {
- sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL);
- if (!sreq->cmd)
- return -ENOMEM;
- }
+#define BSG_DEFAULT_CMDS 64
+#define BSG_MAX_DEVS (1 << MINORBITS)
- if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len))
- return -EFAULT;
- if (blk_verify_command(sreq->cmd, mode))
- return -EPERM;
- return 0;
-}
+static DEFINE_IDA(bsg_minor_ida);
+static const struct class bsg_class;
+static int bsg_major;
-static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr)
+static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr)
{
- struct scsi_request *sreq = scsi_req(rq);
- int ret = 0;
-
- /*
- * fill in all the output members
- */
- hdr->device_status = sreq->result & 0xff;
- hdr->transport_status = host_byte(sreq->result);
- hdr->driver_status = driver_byte(sreq->result);
- hdr->info = 0;
- if (hdr->device_status || hdr->transport_status || hdr->driver_status)
- hdr->info |= SG_INFO_CHECK;
- hdr->response_len = 0;
-
- if (sreq->sense_len && hdr->response) {
- int len = min_t(unsigned int, hdr->max_response_len,
- sreq->sense_len);
-
- if (copy_to_user(uptr64(hdr->response), sreq->sense, len))
- ret = -EFAULT;
- else
- hdr->response_len = len;
- }
-
- if (rq_data_dir(rq) == READ)
- hdr->din_resid = sreq->resid_len;
- else
- hdr->dout_resid = sreq->resid_len;
+ unsigned int timeout = BLK_DEFAULT_SG_TIMEOUT;
- return ret;
-}
+ if (hdr->timeout)
+ timeout = msecs_to_jiffies(hdr->timeout);
+ else if (bd->timeout)
+ timeout = bd->timeout;
-static void bsg_scsi_free_rq(struct request *rq)
-{
- scsi_req_free_cmd(scsi_req(rq));
+ return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT);
}
-static const struct bsg_ops bsg_scsi_ops = {
- .check_proto = bsg_scsi_check_proto,
- .fill_hdr = bsg_scsi_fill_hdr,
- .complete_rq = bsg_scsi_complete_rq,
- .free_rq = bsg_scsi_free_rq,
-};
-
-static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg)
+static int bsg_sg_io(struct bsg_device *bd, bool open_for_write,
+ void __user *uarg)
{
- struct request *rq;
- struct bio *bio;
struct sg_io_v4 hdr;
int ret;
if (copy_from_user(&hdr, uarg, sizeof(hdr)))
return -EFAULT;
-
- if (!q->bsg_dev.class_dev)
- return -ENXIO;
-
if (hdr.guard != 'Q')
return -EINVAL;
- ret = q->bsg_dev.ops->check_proto(&hdr);
- if (ret)
- return ret;
-
- rq = blk_get_request(q, hdr.dout_xfer_len ?
- REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
-
- ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode);
- if (ret)
- return ret;
-
- rq->timeout = msecs_to_jiffies(hdr.timeout);
- if (!rq->timeout)
- rq->timeout = q->sg_timeout;
- if (!rq->timeout)
- rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
- if (rq->timeout < BLK_MIN_SG_TIMEOUT)
- rq->timeout = BLK_MIN_SG_TIMEOUT;
-
- if (hdr.dout_xfer_len) {
- ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.dout_xferp),
- hdr.dout_xfer_len, GFP_KERNEL);
- } else if (hdr.din_xfer_len) {
- ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.din_xferp),
- hdr.din_xfer_len, GFP_KERNEL);
- }
-
- if (ret)
- goto out_free_rq;
-
- bio = rq->bio;
-
- blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL));
- ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr);
- blk_rq_unmap_user(bio);
-
-out_free_rq:
- rq->q->bsg_dev.ops->free_rq(rq);
- blk_put_request(rq);
+ ret = bd->sg_io_fn(bd->queue, &hdr, open_for_write,
+ bsg_timeout(bd, &hdr));
if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr)))
return -EFAULT;
return ret;
}
-static struct bsg_device *bsg_alloc_device(void)
-{
- struct bsg_device *bd;
-
- bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL);
- if (unlikely(!bd))
- return NULL;
-
- spin_lock_init(&bd->lock);
- bd->max_queue = BSG_DEFAULT_CMDS;
- INIT_HLIST_NODE(&bd->dev_list);
- return bd;
-}
-
-static int bsg_put_device(struct bsg_device *bd)
-{
- struct request_queue *q = bd->queue;
-
- mutex_lock(&bsg_mutex);
-
- if (!refcount_dec_and_test(&bd->ref_count)) {
- mutex_unlock(&bsg_mutex);
- return 0;
- }
-
- hlist_del(&bd->dev_list);
- mutex_unlock(&bsg_mutex);
-
- bsg_dbg(bd, "tearing down\n");
-
- /*
- * close can always block
- */
- kfree(bd);
- blk_put_queue(q);
- return 0;
-}
-
-static struct bsg_device *bsg_add_device(struct inode *inode,
- struct request_queue *rq,
- struct file *file)
-{
- struct bsg_device *bd;
- unsigned char buf[32];
-
- lockdep_assert_held(&bsg_mutex);
-
- if (!blk_get_queue(rq))
- return ERR_PTR(-ENXIO);
-
- bd = bsg_alloc_device();
- if (!bd) {
- blk_put_queue(rq);
- return ERR_PTR(-ENOMEM);
- }
-
- bd->queue = rq;
-
- refcount_set(&bd->ref_count, 1);
- hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode)));
-
- strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1);
- bsg_dbg(bd, "bound to <%s>, max queue %d\n",
- format_dev_t(buf, inode->i_rdev), bd->max_queue);
-
- return bd;
-}
-
-static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q)
-{
- struct bsg_device *bd;
-
- lockdep_assert_held(&bsg_mutex);
-
- hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) {
- if (bd->queue == q) {
- refcount_inc(&bd->ref_count);
- goto found;
- }
- }
- bd = NULL;
-found:
- return bd;
-}
-
-static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file)
-{
- struct bsg_device *bd;
- struct bsg_class_device *bcd;
-
- /*
- * find the class device
- */
- mutex_lock(&bsg_mutex);
- bcd = idr_find(&bsg_minor_idr, iminor(inode));
-
- if (!bcd) {
- bd = ERR_PTR(-ENODEV);
- goto out_unlock;
- }
-
- bd = __bsg_get_device(iminor(inode), bcd->queue);
- if (!bd)
- bd = bsg_add_device(inode, bcd->queue, file);
-
-out_unlock:
- mutex_unlock(&bsg_mutex);
- return bd;
-}
-
static int bsg_open(struct inode *inode, struct file *file)
{
- struct bsg_device *bd;
-
- bd = bsg_get_device(inode, file);
-
- if (IS_ERR(bd))
- return PTR_ERR(bd);
-
- file->private_data = bd;
+ if (!blk_get_queue(to_bsg_device(inode)->queue))
+ return -ENXIO;
return 0;
}
static int bsg_release(struct inode *inode, struct file *file)
{
- struct bsg_device *bd = file->private_data;
-
- file->private_data = NULL;
- return bsg_put_device(bd);
+ blk_put_queue(to_bsg_device(inode)->queue);
+ return 0;
}
static int bsg_get_command_q(struct bsg_device *bd, int __user *uarg)
{
- return put_user(bd->max_queue, uarg);
+ return put_user(READ_ONCE(bd->max_queue), uarg);
}
static int bsg_set_command_q(struct bsg_device *bd, int __user *uarg)
{
- int queue;
+ int max_queue;
- if (get_user(queue, uarg))
+ if (get_user(max_queue, uarg))
return -EFAULT;
- if (queue < 1)
+ if (max_queue < 1)
return -EINVAL;
-
- spin_lock_irq(&bd->lock);
- bd->max_queue = queue;
- spin_unlock_irq(&bd->lock);
+ WRITE_ONCE(bd->max_queue, max_queue);
return 0;
}
static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
- struct bsg_device *bd = file->private_data;
+ struct bsg_device *bd = to_bsg_device(file_inode(file));
+ struct request_queue *q = bd->queue;
void __user *uarg = (void __user *) arg;
+ int __user *intp = uarg;
+ int val;
switch (cmd) {
/*
@@ -362,17 +122,37 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
* SCSI/sg ioctls
*/
case SG_GET_VERSION_NUM:
+ return put_user(30527, intp);
case SCSI_IOCTL_GET_IDLUN:
+ return put_user(0, intp);
case SCSI_IOCTL_GET_BUS_NUMBER:
+ return put_user(0, intp);
case SG_SET_TIMEOUT:
+ if (get_user(val, intp))
+ return -EFAULT;
+ bd->timeout = clock_t_to_jiffies(val);
+ return 0;
case SG_GET_TIMEOUT:
+ return jiffies_to_clock_t(bd->timeout);
case SG_GET_RESERVED_SIZE:
+ return put_user(min(bd->reserved_size, queue_max_bytes(q)),
+ intp);
case SG_SET_RESERVED_SIZE:
+ if (get_user(val, intp))
+ return -EFAULT;
+ if (val < 0)
+ return -EINVAL;
+ bd->reserved_size =
+ min_t(unsigned int, val, queue_max_bytes(q));
+ return 0;
case SG_EMULATED_HOST:
- case SCSI_IOCTL_SEND_COMMAND:
- return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg);
+ return put_user(1, intp);
case SG_IO:
- return bsg_sg_io(bd->queue, file->f_mode, uarg);
+ return bsg_sg_io(bd, file->f_mode & FMODE_WRITE, uarg);
+ case SCSI_IOCTL_SEND_COMMAND:
+ pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n",
+ current->comm);
+ return -EINVAL;
default:
return -ENOTTY;
}
@@ -387,129 +167,106 @@ static const struct file_operations bsg_fops = {
.llseek = default_llseek,
};
-void bsg_unregister_queue(struct request_queue *q)
+static void bsg_device_release(struct device *dev)
{
- struct bsg_class_device *bcd = &q->bsg_dev;
-
- if (!bcd->class_dev)
- return;
-
- mutex_lock(&bsg_mutex);
- idr_remove(&bsg_minor_idr, bcd->minor);
- if (q->kobj.sd)
- sysfs_remove_link(&q->kobj, "bsg");
- device_unregister(bcd->class_dev);
- bcd->class_dev = NULL;
- mutex_unlock(&bsg_mutex);
+ struct bsg_device *bd = container_of(dev, struct bsg_device, device);
+
+ ida_free(&bsg_minor_ida, MINOR(bd->device.devt));
+ kfree(bd);
}
-EXPORT_SYMBOL_GPL(bsg_unregister_queue);
-int bsg_register_queue(struct request_queue *q, struct device *parent,
- const char *name, const struct bsg_ops *ops)
+void bsg_unregister_queue(struct bsg_device *bd)
{
- struct bsg_class_device *bcd;
- dev_t dev;
- int ret;
- struct device *class_dev = NULL;
+ struct gendisk *disk = bd->queue->disk;
- /*
- * we need a proper transport to send commands, not a stacked device
- */
- if (!queue_is_mq(q))
- return 0;
+ if (disk && disk->queue_kobj.sd)
+ sysfs_remove_link(&disk->queue_kobj, "bsg");
+ cdev_device_del(&bd->cdev, &bd->device);
+ put_device(&bd->device);
+}
+EXPORT_SYMBOL_GPL(bsg_unregister_queue);
- bcd = &q->bsg_dev;
- memset(bcd, 0, sizeof(*bcd));
+struct bsg_device *bsg_register_queue(struct request_queue *q,
+ struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn)
+{
+ struct bsg_device *bd;
+ int ret;
- mutex_lock(&bsg_mutex);
+ bd = kzalloc(sizeof(*bd), GFP_KERNEL);
+ if (!bd)
+ return ERR_PTR(-ENOMEM);
+ bd->max_queue = BSG_DEFAULT_CMDS;
+ bd->reserved_size = INT_MAX;
+ bd->queue = q;
+ bd->sg_io_fn = sg_io_fn;
- ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL);
+ ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS - 1, GFP_KERNEL);
if (ret < 0) {
- if (ret == -ENOSPC) {
- printk(KERN_ERR "bsg: too many bsg devices\n");
- ret = -EINVAL;
- }
- goto unlock;
+ if (ret == -ENOSPC)
+ dev_err(parent, "bsg: too many bsg devices\n");
+ kfree(bd);
+ return ERR_PTR(ret);
}
+ bd->device.devt = MKDEV(bsg_major, ret);
+ bd->device.class = &bsg_class;
+ bd->device.parent = parent;
+ bd->device.release = bsg_device_release;
+ dev_set_name(&bd->device, "%s", name);
+ device_initialize(&bd->device);
+
+ cdev_init(&bd->cdev, &bsg_fops);
+ bd->cdev.owner = THIS_MODULE;
+ ret = cdev_device_add(&bd->cdev, &bd->device);
+ if (ret)
+ goto out_put_device;
- bcd->minor = ret;
- bcd->queue = q;
- bcd->ops = ops;
- dev = MKDEV(bsg_major, bcd->minor);
- class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name);
- if (IS_ERR(class_dev)) {
- ret = PTR_ERR(class_dev);
- goto idr_remove;
- }
- bcd->class_dev = class_dev;
-
- if (q->kobj.sd) {
- ret = sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg");
+ if (q->disk && q->disk->queue_kobj.sd) {
+ ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj,
+ "bsg");
if (ret)
- goto unregister_class_dev;
+ goto out_device_del;
}
- mutex_unlock(&bsg_mutex);
- return 0;
-
-unregister_class_dev:
- device_unregister(class_dev);
-idr_remove:
- idr_remove(&bsg_minor_idr, bcd->minor);
-unlock:
- mutex_unlock(&bsg_mutex);
- return ret;
-}
-
-int bsg_scsi_register_queue(struct request_queue *q, struct device *parent)
-{
- if (!blk_queue_scsi_passthrough(q)) {
- WARN_ONCE(true, "Attempt to register a non-SCSI queue\n");
- return -EINVAL;
- }
+ return bd;
- return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops);
+out_device_del:
+ cdev_device_del(&bd->cdev, &bd->device);
+out_put_device:
+ put_device(&bd->device);
+ return ERR_PTR(ret);
}
-EXPORT_SYMBOL_GPL(bsg_scsi_register_queue);
-
-static struct cdev bsg_cdev;
+EXPORT_SYMBOL_GPL(bsg_register_queue);
-static char *bsg_devnode(struct device *dev, umode_t *mode)
+static char *bsg_devnode(const struct device *dev, umode_t *mode)
{
return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev));
}
+static const struct class bsg_class = {
+ .name = "bsg",
+ .devnode = bsg_devnode,
+};
+
static int __init bsg_init(void)
{
- int ret, i;
dev_t devid;
+ int ret;
- for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++)
- INIT_HLIST_HEAD(&bsg_device_list[i]);
-
- bsg_class = class_create(THIS_MODULE, "bsg");
- if (IS_ERR(bsg_class))
- return PTR_ERR(bsg_class);
- bsg_class->devnode = bsg_devnode;
+ ret = class_register(&bsg_class);
+ if (ret)
+ return ret;
ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg");
if (ret)
goto destroy_bsg_class;
-
bsg_major = MAJOR(devid);
- cdev_init(&bsg_cdev, &bsg_fops);
- ret = cdev_add(&bsg_cdev, MKDEV(bsg_major, 0), BSG_MAX_DEVS);
- if (ret)
- goto unregister_chrdev;
-
printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION
" loaded (major %d)\n", bsg_major);
return 0;
-unregister_chrdev:
- unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS);
+
destroy_bsg_class:
- class_destroy(bsg_class);
+ class_unregister(&bsg_class);
return ret;
}
diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c
deleted file mode 100644
index f2a14571882b..000000000000
--- a/block/cmdline-parser.c
+++ /dev/null
@@ -1,255 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Parse command line, get partition information
- *
- * Written by Cai Zhiyong <caizhiyong@huawei.com>
- *
- */
-#include <linux/export.h>
-#include <linux/cmdline-parser.h>
-
-static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
-{
- int ret = 0;
- struct cmdline_subpart *new_subpart;
-
- *subpart = NULL;
-
- new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
- if (!new_subpart)
- return -ENOMEM;
-
- if (*partdef == '-') {
- new_subpart->size = (sector_t)(~0ULL);
- partdef++;
- } else {
- new_subpart->size = (sector_t)memparse(partdef, &partdef);
- if (new_subpart->size < (sector_t)PAGE_SIZE) {
- pr_warn("cmdline partition size is invalid.");
- ret = -EINVAL;
- goto fail;
- }
- }
-
- if (*partdef == '@') {
- partdef++;
- new_subpart->from = (sector_t)memparse(partdef, &partdef);
- } else {
- new_subpart->from = (sector_t)(~0ULL);
- }
-
- if (*partdef == '(') {
- int length;
- char *next = strchr(++partdef, ')');
-
- if (!next) {
- pr_warn("cmdline partition format is invalid.");
- ret = -EINVAL;
- goto fail;
- }
-
- length = min_t(int, next - partdef,
- sizeof(new_subpart->name) - 1);
- strncpy(new_subpart->name, partdef, length);
- new_subpart->name[length] = '\0';
-
- partdef = ++next;
- } else
- new_subpart->name[0] = '\0';
-
- new_subpart->flags = 0;
-
- if (!strncmp(partdef, "ro", 2)) {
- new_subpart->flags |= PF_RDONLY;
- partdef += 2;
- }
-
- if (!strncmp(partdef, "lk", 2)) {
- new_subpart->flags |= PF_POWERUP_LOCK;
- partdef += 2;
- }
-
- *subpart = new_subpart;
- return 0;
-fail:
- kfree(new_subpart);
- return ret;
-}
-
-static void free_subpart(struct cmdline_parts *parts)
-{
- struct cmdline_subpart *subpart;
-
- while (parts->subpart) {
- subpart = parts->subpart;
- parts->subpart = subpart->next_subpart;
- kfree(subpart);
- }
-}
-
-static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
-{
- int ret = -EINVAL;
- char *next;
- int length;
- struct cmdline_subpart **next_subpart;
- struct cmdline_parts *newparts;
- char buf[BDEVNAME_SIZE + 32 + 4];
-
- *parts = NULL;
-
- newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
- if (!newparts)
- return -ENOMEM;
-
- next = strchr(bdevdef, ':');
- if (!next) {
- pr_warn("cmdline partition has no block device.");
- goto fail;
- }
-
- length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
- strncpy(newparts->name, bdevdef, length);
- newparts->name[length] = '\0';
- newparts->nr_subparts = 0;
-
- next_subpart = &newparts->subpart;
-
- while (next && *(++next)) {
- bdevdef = next;
- next = strchr(bdevdef, ',');
-
- length = (!next) ? (sizeof(buf) - 1) :
- min_t(int, next - bdevdef, sizeof(buf) - 1);
-
- strncpy(buf, bdevdef, length);
- buf[length] = '\0';
-
- ret = parse_subpart(next_subpart, buf);
- if (ret)
- goto fail;
-
- newparts->nr_subparts++;
- next_subpart = &(*next_subpart)->next_subpart;
- }
-
- if (!newparts->subpart) {
- pr_warn("cmdline partition has no valid partition.");
- ret = -EINVAL;
- goto fail;
- }
-
- *parts = newparts;
-
- return 0;
-fail:
- free_subpart(newparts);
- kfree(newparts);
- return ret;
-}
-
-void cmdline_parts_free(struct cmdline_parts **parts)
-{
- struct cmdline_parts *next_parts;
-
- while (*parts) {
- next_parts = (*parts)->next_parts;
- free_subpart(*parts);
- kfree(*parts);
- *parts = next_parts;
- }
-}
-EXPORT_SYMBOL(cmdline_parts_free);
-
-int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline)
-{
- int ret;
- char *buf;
- char *pbuf;
- char *next;
- struct cmdline_parts **next_parts;
-
- *parts = NULL;
-
- next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
- if (!buf)
- return -ENOMEM;
-
- next_parts = parts;
-
- while (next && *pbuf) {
- next = strchr(pbuf, ';');
- if (next)
- *next = '\0';
-
- ret = parse_parts(next_parts, pbuf);
- if (ret)
- goto fail;
-
- if (next)
- pbuf = ++next;
-
- next_parts = &(*next_parts)->next_parts;
- }
-
- if (!*parts) {
- pr_warn("cmdline partition has no valid partition.");
- ret = -EINVAL;
- goto fail;
- }
-
- ret = 0;
-done:
- kfree(buf);
- return ret;
-
-fail:
- cmdline_parts_free(parts);
- goto done;
-}
-EXPORT_SYMBOL(cmdline_parts_parse);
-
-struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
- const char *bdev)
-{
- while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
- parts = parts->next_parts;
- return parts;
-}
-EXPORT_SYMBOL(cmdline_parts_find);
-
-/*
- * add_part()
- * 0 success.
- * 1 can not add so many partitions.
- */
-int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
- int slot,
- int (*add_part)(int, struct cmdline_subpart *, void *),
- void *param)
-{
- sector_t from = 0;
- struct cmdline_subpart *subpart;
-
- for (subpart = parts->subpart; subpart;
- subpart = subpart->next_subpart, slot++) {
- if (subpart->from == (sector_t)(~0ULL))
- subpart->from = from;
- else
- from = subpart->from;
-
- if (from >= disk_size)
- break;
-
- if (subpart->size > (disk_size - from))
- subpart->size = disk_size - from;
-
- from += subpart->size;
-
- if (add_part(slot, subpart, param))
- break;
- }
-
- return slot;
-}
-EXPORT_SYMBOL(cmdline_parts_set);
diff --git a/block/disk-events.c b/block/disk-events.c
new file mode 100644
index 000000000000..2f697224386a
--- /dev/null
+++ b/block/disk-events.c
@@ -0,0 +1,489 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Disk events - monitor disk events like media change and eject request.
+ */
+#include <linux/export.h>
+#include <linux/moduleparam.h>
+#include <linux/blkdev.h>
+#include "blk.h"
+
+struct disk_events {
+ struct list_head node; /* all disk_event's */
+ struct gendisk *disk; /* the associated disk */
+ spinlock_t lock;
+
+ struct mutex block_mutex; /* protects blocking */
+ int block; /* event blocking depth */
+ unsigned int pending; /* events already sent out */
+ unsigned int clearing; /* events being cleared */
+
+ long poll_msecs; /* interval, -1 for default */
+ struct delayed_work dwork;
+};
+
+static const char *disk_events_strs[] = {
+ [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
+ [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
+};
+
+static char *disk_uevents[] = {
+ [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
+ [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
+};
+
+/* list of all disk_events */
+static DEFINE_MUTEX(disk_events_mutex);
+static LIST_HEAD(disk_events);
+
+/* disable in-kernel polling by default */
+static unsigned long disk_events_dfl_poll_msecs;
+
+static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
+{
+ struct disk_events *ev = disk->ev;
+ long intv_msecs = 0;
+
+ /*
+ * If device-specific poll interval is set, always use it. If
+ * the default is being used, poll if the POLL flag is set.
+ */
+ if (ev->poll_msecs >= 0)
+ intv_msecs = ev->poll_msecs;
+ else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
+ intv_msecs = disk_events_dfl_poll_msecs;
+
+ return msecs_to_jiffies(intv_msecs);
+}
+
+/**
+ * disk_block_events - block and flush disk event checking
+ * @disk: disk to block events for
+ *
+ * On return from this function, it is guaranteed that event checking
+ * isn't in progress and won't happen until unblocked by
+ * disk_unblock_events(). Events blocking is counted and the actual
+ * unblocking happens after the matching number of unblocks are done.
+ *
+ * Note that this intentionally does not block event checking from
+ * disk_clear_events().
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void disk_block_events(struct gendisk *disk)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned long flags;
+ bool cancel;
+
+ if (!ev)
+ return;
+
+ /*
+ * Outer mutex ensures that the first blocker completes canceling
+ * the event work before further blockers are allowed to finish.
+ */
+ mutex_lock(&ev->block_mutex);
+
+ spin_lock_irqsave(&ev->lock, flags);
+ cancel = !ev->block++;
+ spin_unlock_irqrestore(&ev->lock, flags);
+
+ if (cancel)
+ cancel_delayed_work_sync(&disk->ev->dwork);
+
+ mutex_unlock(&ev->block_mutex);
+}
+
+static void __disk_unblock_events(struct gendisk *disk, bool check_now)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned long intv;
+ unsigned long flags;
+
+ spin_lock_irqsave(&ev->lock, flags);
+
+ if (WARN_ON_ONCE(ev->block <= 0))
+ goto out_unlock;
+
+ if (--ev->block)
+ goto out_unlock;
+
+ intv = disk_events_poll_jiffies(disk);
+ if (check_now)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
+ else if (intv)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
+out_unlock:
+ spin_unlock_irqrestore(&ev->lock, flags);
+}
+
+/**
+ * disk_unblock_events - unblock disk event checking
+ * @disk: disk to unblock events for
+ *
+ * Undo disk_block_events(). When the block count reaches zero, it
+ * starts events polling if configured.
+ *
+ * CONTEXT:
+ * Don't care. Safe to call from irq context.
+ */
+void disk_unblock_events(struct gendisk *disk)
+{
+ if (disk->ev)
+ __disk_unblock_events(disk, false);
+}
+
+/**
+ * disk_flush_events - schedule immediate event checking and flushing
+ * @disk: disk to check and flush events for
+ * @mask: events to flush
+ *
+ * Schedule immediate event checking on @disk if not blocked. Events in
+ * @mask are scheduled to be cleared from the driver. Note that this
+ * doesn't clear the events from @disk->ev.
+ *
+ * CONTEXT:
+ * If @mask is non-zero, this must be called with disk->open_mutex held.
+ */
+void disk_flush_events(struct gendisk *disk, unsigned int mask)
+{
+ struct disk_events *ev = disk->ev;
+
+ if (!ev)
+ return;
+
+ spin_lock_irq(&ev->lock);
+ ev->clearing |= mask;
+ if (!ev->block)
+ mod_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, 0);
+ spin_unlock_irq(&ev->lock);
+}
+
+/*
+ * Tell userland about new events. Only the events listed in @disk->events are
+ * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are
+ * processed internally but never get reported to userland.
+ */
+static void disk_event_uevent(struct gendisk *disk, unsigned int events)
+{
+ char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
+ int nr_events = 0, i;
+
+ for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
+ if (events & disk->events & (1 << i))
+ envp[nr_events++] = disk_uevents[i];
+
+ if (nr_events)
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+}
+
+static void disk_check_events(struct disk_events *ev,
+ unsigned int *clearing_ptr)
+{
+ struct gendisk *disk = ev->disk;
+ unsigned int clearing = *clearing_ptr;
+ unsigned int events;
+ unsigned long intv;
+
+ /* check events */
+ events = disk->fops->check_events(disk, clearing);
+
+ /* accumulate pending events and schedule next poll if necessary */
+ spin_lock_irq(&ev->lock);
+
+ events &= ~ev->pending;
+ ev->pending |= events;
+ *clearing_ptr &= ~clearing;
+
+ intv = disk_events_poll_jiffies(disk);
+ if (!ev->block && intv)
+ queue_delayed_work(system_freezable_power_efficient_wq,
+ &ev->dwork, intv);
+
+ spin_unlock_irq(&ev->lock);
+
+ if (events & DISK_EVENT_MEDIA_CHANGE)
+ inc_diskseq(disk);
+
+ if (disk->event_flags & DISK_EVENT_FLAG_UEVENT)
+ disk_event_uevent(disk, events);
+}
+
+/**
+ * disk_clear_events - synchronously check, clear and return pending events
+ * @disk: disk to fetch and clear events from
+ * @mask: mask of events to be fetched and cleared
+ *
+ * Disk events are synchronously checked and pending events in @mask
+ * are cleared and returned. This ignores the block count.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
+{
+ struct disk_events *ev = disk->ev;
+ unsigned int pending;
+ unsigned int clearing = mask;
+
+ if (!ev)
+ return 0;
+
+ disk_block_events(disk);
+
+ /*
+ * store the union of mask and ev->clearing on the stack so that the
+ * race with disk_flush_events does not cause ambiguity (ev->clearing
+ * can still be modified even if events are blocked).
+ */
+ spin_lock_irq(&ev->lock);
+ clearing |= ev->clearing;
+ ev->clearing = 0;
+ spin_unlock_irq(&ev->lock);
+
+ disk_check_events(ev, &clearing);
+ /*
+ * If ev->clearing is not 0, disk_flush_events() got called in the
+ * middle of this function, so we want to run the workfn without delay.
+ */
+ __disk_unblock_events(disk, ev->clearing ? true : false);
+
+ /* then, fetch and clear pending events */
+ spin_lock_irq(&ev->lock);
+ pending = ev->pending & mask;
+ ev->pending &= ~mask;
+ spin_unlock_irq(&ev->lock);
+ WARN_ON_ONCE(clearing & mask);
+
+ return pending;
+}
+
+/**
+ * disk_check_media_change - check if a removable media has been changed
+ * @disk: gendisk to check
+ *
+ * Returns %true and marks the disk for a partition rescan if a removable
+ * media has been changed, and %false if the media did not change.
+ */
+bool disk_check_media_change(struct gendisk *disk)
+{
+ unsigned int events;
+
+ events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE |
+ DISK_EVENT_EJECT_REQUEST);
+ if (events & DISK_EVENT_MEDIA_CHANGE) {
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ return true;
+ }
+ return false;
+}
+EXPORT_SYMBOL(disk_check_media_change);
+
+/**
+ * disk_force_media_change - force a media change event
+ * @disk: the disk which will raise the event
+ *
+ * Should be called when the media changes for @disk. Generates a uevent
+ * and attempts to free all dentries and inodes and invalidates all block
+ * device page cache entries in that case.
+ */
+void disk_force_media_change(struct gendisk *disk)
+{
+ disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE);
+ inc_diskseq(disk);
+ bdev_mark_dead(disk->part0, true);
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+}
+EXPORT_SYMBOL_GPL(disk_force_media_change);
+
+/*
+ * Separate this part out so that a different pointer for clearing_ptr can be
+ * passed in for disk_clear_events.
+ */
+static void disk_events_workfn(struct work_struct *work)
+{
+ struct delayed_work *dwork = to_delayed_work(work);
+ struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
+
+ disk_check_events(ev, &ev->clearing);
+}
+
+/*
+ * A disk events enabled device has the following sysfs nodes under
+ * its /sys/block/X/ directory.
+ *
+ * events : list of all supported events
+ * events_async : list of events which can be detected w/o polling
+ * (always empty, only for backwards compatibility)
+ * events_poll_msecs : polling interval, 0: disable, -1: system default
+ */
+static ssize_t __disk_events_show(unsigned int events, char *buf)
+{
+ const char *delim = "";
+ ssize_t pos = 0;
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
+ if (events & (1 << i)) {
+ pos += sprintf(buf + pos, "%s%s",
+ delim, disk_events_strs[i]);
+ delim = " ";
+ }
+ if (pos)
+ pos += sprintf(buf + pos, "\n");
+ return pos;
+}
+
+static ssize_t disk_events_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
+ return 0;
+ return __disk_events_show(disk->events, buf);
+}
+
+static ssize_t disk_events_async_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ return 0;
+}
+
+static ssize_t disk_events_poll_msecs_show(struct device *dev,
+ struct device_attribute *attr,
+ char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (!disk->ev)
+ return sprintf(buf, "-1\n");
+ return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
+}
+
+static ssize_t disk_events_poll_msecs_store(struct device *dev,
+ struct device_attribute *attr,
+ const char *buf, size_t count)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+ long intv;
+
+ if (!count || !sscanf(buf, "%ld", &intv))
+ return -EINVAL;
+
+ if (intv < 0 && intv != -1)
+ return -EINVAL;
+
+ if (!disk->ev)
+ return -ENODEV;
+
+ disk_block_events(disk);
+ disk->ev->poll_msecs = intv;
+ __disk_unblock_events(disk, true);
+ return count;
+}
+
+DEVICE_ATTR(events, 0444, disk_events_show, NULL);
+DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
+DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show,
+ disk_events_poll_msecs_store);
+
+/*
+ * The default polling interval can be specified by the kernel
+ * parameter block.events_dfl_poll_msecs which defaults to 0
+ * (disable). This can also be modified at runtime by writing to
+ * /sys/module/block/parameters/events_dfl_poll_msecs.
+ */
+static int disk_events_set_dfl_poll_msecs(const char *val,
+ const struct kernel_param *kp)
+{
+ struct disk_events *ev;
+ int ret;
+
+ ret = param_set_ulong(val, kp);
+ if (ret < 0)
+ return ret;
+
+ mutex_lock(&disk_events_mutex);
+ list_for_each_entry(ev, &disk_events, node)
+ disk_flush_events(ev->disk, 0);
+ mutex_unlock(&disk_events_mutex);
+ return 0;
+}
+
+static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
+ .set = disk_events_set_dfl_poll_msecs,
+ .get = param_get_ulong,
+};
+
+#undef MODULE_PARAM_PREFIX
+#define MODULE_PARAM_PREFIX "block."
+
+module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
+ &disk_events_dfl_poll_msecs, 0644);
+
+/*
+ * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
+ */
+int disk_alloc_events(struct gendisk *disk)
+{
+ struct disk_events *ev;
+
+ if (!disk->fops->check_events || !disk->events)
+ return 0;
+
+ ev = kzalloc(sizeof(*ev), GFP_KERNEL);
+ if (!ev) {
+ pr_warn("%s: failed to initialize events\n", disk->disk_name);
+ return -ENOMEM;
+ }
+
+ INIT_LIST_HEAD(&ev->node);
+ ev->disk = disk;
+ spin_lock_init(&ev->lock);
+ mutex_init(&ev->block_mutex);
+ ev->block = 1;
+ ev->poll_msecs = -1;
+ INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
+
+ disk->ev = ev;
+ return 0;
+}
+
+void disk_add_events(struct gendisk *disk)
+{
+ if (!disk->ev)
+ return;
+
+ mutex_lock(&disk_events_mutex);
+ list_add_tail(&disk->ev->node, &disk_events);
+ mutex_unlock(&disk_events_mutex);
+
+ /*
+ * Block count is initialized to 1 and the following initial
+ * unblock kicks it into action.
+ */
+ __disk_unblock_events(disk, true);
+}
+
+void disk_del_events(struct gendisk *disk)
+{
+ if (disk->ev) {
+ disk_block_events(disk);
+
+ mutex_lock(&disk_events_mutex);
+ list_del_init(&disk->ev->node);
+ mutex_unlock(&disk_events_mutex);
+ }
+}
+
+void disk_release_events(struct gendisk *disk)
+{
+ /* the block count should be 1 from disk_del_events() */
+ WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
+ kfree(disk->ev);
+}
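
The polling and uevent machinery above only runs for drivers that opt in.
A minimal sketch of the driver side, with hypothetical my_* names: the
driver advertises the events it can report, chooses how they are delivered,
and implements ->check_events() to query the hardware.

#include <linux/module.h>
#include <linux/blkdev.h>

/* Hypothetical helper that asks the hardware about a media change. */
static bool my_media_changed(struct gendisk *disk);

static unsigned int my_check_events(struct gendisk *disk,
				    unsigned int clearing)
{
	/* report (and thereby clear) a pending media change, if any */
	return my_media_changed(disk) ? DISK_EVENT_MEDIA_CHANGE : 0;
}

static const struct block_device_operations my_fops = {
	.owner		= THIS_MODULE,
	.check_events	= my_check_events,
	/* open/release/... as usual */
};

static void my_setup_disk(struct gendisk *disk)
{
	disk->fops = &my_fops;
	/* events this driver can report ... */
	disk->events = DISK_EVENT_MEDIA_CHANGE | DISK_EVENT_EJECT_REQUEST;
	/* ... delivered as uevents, with in-kernel polling allowed */
	disk->event_flags = DISK_EVENT_FLAG_UEVENT | DISK_EVENT_FLAG_POLL;
	/* add_disk() etc. not shown */
}
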
diff --git a/block/early-lookup.c b/block/early-lookup.c
new file mode 100644
index 000000000000..3effbd0d35e9
--- /dev/null
+++ b/block/early-lookup.c
@@ -0,0 +1,316 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Code for looking up block devices in the early boot code before mounting the
+ * root file system.
+ */
+#include <linux/blkdev.h>
+#include <linux/ctype.h>
+
+struct uuidcmp {
+ const char *uuid;
+ int len;
+};
+
+/**
+ * match_dev_by_uuid - callback for finding a partition using its uuid
+ * @dev: device passed in by the caller
+ * @data: opaque pointer to the desired struct uuidcmp to match
+ *
+ * Returns 1 if the device matches, and 0 otherwise.
+ */
+static int __init match_dev_by_uuid(struct device *dev, const void *data)
+{
+ struct block_device *bdev = dev_to_bdev(dev);
+ const struct uuidcmp *cmp = data;
+
+ if (!bdev->bd_meta_info ||
+ strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len))
+ return 0;
+ return 1;
+}
+
+/**
+ * devt_from_partuuid - looks up the dev_t of a partition by its UUID
+ * @uuid_str: char array containing ascii UUID
+ * @devt: dev_t result
+ *
+ * The function will return the first partition which contains a matching
+ * UUID value in its partition_meta_info struct. This does not search
+ * by filesystem UUIDs.
+ *
+ * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be
+ * extracted and used as an offset from the partition identified by the UUID.
+ *
+ * Returns 0 on success or a negative error code on failure.
+ */
+static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt)
+{
+ struct uuidcmp cmp;
+ struct device *dev = NULL;
+ int offset = 0;
+ char *slash;
+
+ cmp.uuid = uuid_str;
+
+ slash = strchr(uuid_str, '/');
+ /* Check for optional partition number offset attributes. */
+ if (slash) {
+ char c = 0;
+
+ /* Explicitly fail on poor PARTUUID syntax. */
+ if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1)
+ goto out_invalid;
+ cmp.len = slash - uuid_str;
+ } else {
+ cmp.len = strlen(uuid_str);
+ }
+
+ if (!cmp.len)
+ goto out_invalid;
+
+ dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid);
+ if (!dev)
+ return -ENODEV;
+
+ if (offset) {
+ /*
+ * Attempt to find the requested partition by adding an offset
+ * to the partition number found by UUID.
+ */
+ *devt = part_devt(dev_to_disk(dev),
+ dev_to_bdev(dev)->bd_partno + offset);
+ } else {
+ *devt = dev->devt;
+ }
+
+ put_device(dev);
+ return 0;
+
+out_invalid:
+ pr_err("VFS: PARTUUID= is invalid.\n"
+ "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n");
+ return -EINVAL;
+}
+
+/**
+ * match_dev_by_label - callback for finding a partition using its label
+ * @dev: device passed in by the caller
+ * @data: opaque pointer to the label to match
+ *
+ * Returns 1 if the device matches, and 0 otherwise.
+ */
+static int __init match_dev_by_label(struct device *dev, const void *data)
+{
+ struct block_device *bdev = dev_to_bdev(dev);
+ const char *label = data;
+
+ if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname))
+ return 0;
+ return 1;
+}
+
+static int __init devt_from_partlabel(const char *label, dev_t *devt)
+{
+ struct device *dev;
+
+ dev = class_find_device(&block_class, NULL, label, &match_dev_by_label);
+ if (!dev)
+ return -ENODEV;
+ *devt = dev->devt;
+ put_device(dev);
+ return 0;
+}
+
+static dev_t __init blk_lookup_devt(const char *name, int partno)
+{
+ dev_t devt = MKDEV(0, 0);
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+
+ if (strcmp(dev_name(dev), name))
+ continue;
+
+ if (partno < disk->minors) {
+ /* We need to return the right devno, even
+ * if the partition doesn't exist yet.
+ */
+ devt = MKDEV(MAJOR(dev->devt),
+ MINOR(dev->devt) + partno);
+ } else {
+ devt = part_devt(disk, partno);
+ if (devt)
+ break;
+ }
+ }
+ class_dev_iter_exit(&iter);
+ return devt;
+}
+
+static int __init devt_from_devname(const char *name, dev_t *devt)
+{
+ int part;
+ char s[32];
+ char *p;
+
+ if (strlen(name) > 31)
+ return -EINVAL;
+ strcpy(s, name);
+ for (p = s; *p; p++) {
+ if (*p == '/')
+ *p = '!';
+ }
+
+ *devt = blk_lookup_devt(s, 0);
+ if (*devt)
+ return 0;
+
+ /*
+ * Try a non-existent but valid partition, which may only exist after
+ * opening the device, like partitioned md devices.
+ */
+ while (p > s && isdigit(p[-1]))
+ p--;
+ if (p == s || !*p || *p == '0')
+ return -ENODEV;
+
+ /* try disk name without <part number> */
+ part = simple_strtoul(p, NULL, 10);
+ *p = '\0';
+ *devt = blk_lookup_devt(s, part);
+ if (*devt)
+ return 0;
+
+ /* try disk name without p<part number> */
+ if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p')
+ return -ENODEV;
+ p[-1] = '\0';
+ *devt = blk_lookup_devt(s, part);
+ if (*devt)
+ return 0;
+ return -ENODEV;
+}
+
+static int __init devt_from_devnum(const char *name, dev_t *devt)
+{
+ unsigned maj, min, offset;
+ char *p, dummy;
+
+ if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 ||
+ sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) {
+ *devt = MKDEV(maj, min);
+ if (maj != MAJOR(*devt) || min != MINOR(*devt))
+ return -EINVAL;
+ } else {
+ *devt = new_decode_dev(simple_strtoul(name, &p, 16));
+ if (*p)
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
+/*
+ * Convert a name into device number. We accept the following variants:
+ *
+ * 1) <hex_major><hex_minor> device number in hexadecimal represents itself
+ * no leading 0x, for example b302.
+ * 2) /dev/<disk_name> represents the device number of disk
+ * 3) /dev/<disk_name><decimal> represents the device number
+ * of partition - device number of disk plus the partition number
+ * 4) /dev/<disk_name>p<decimal> - same as the above, that form is
+ * used when disk name of partitioned disk ends on a digit.
+ * 5) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the
+ * unique id of a partition if the partition table provides it.
+ * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS
+ * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero-
+ * filled hex representation of the 32-bit "NT disk signature", and PP
+ * is a zero-filled hex representation of the 1-based partition number.
+ * 6) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to
+ * a partition with a known unique id.
+ * 7) <major>:<minor> major and minor number of the device separated by
+ * a colon.
+ * 8) PARTLABEL=<name> with name being the GPT partition label.
+ * MSDOS partitions do not support labels!
+ *
+ * If the name doesn't fall into any of the categories above, we return (0,0).
+ * block_class is used to check if something is a disk name. If the disk
+ * name contains slashes, the device name has them replaced with
+ * bangs.
+ */
+int __init early_lookup_bdev(const char *name, dev_t *devt)
+{
+ if (strncmp(name, "PARTUUID=", 9) == 0)
+ return devt_from_partuuid(name + 9, devt);
+ if (strncmp(name, "PARTLABEL=", 10) == 0)
+ return devt_from_partlabel(name + 10, devt);
+ if (strncmp(name, "/dev/", 5) == 0)
+ return devt_from_devname(name + 5, devt);
+ return devt_from_devnum(name, devt);
+}
+
+static char __init *bdevt_str(dev_t devt, char *buf)
+{
+ if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
+ char tbuf[BDEVT_SIZE];
+ snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
+ snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
+ } else
+ snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
+
+ return buf;
+}
+
+/*
+ * print a full list of all partitions - intended for places where the root
+ * filesystem can't be mounted and thus to give the victim some idea of what
+ * went wrong
+ */
+void __init printk_all_partitions(void)
+{
+ struct class_dev_iter iter;
+ struct device *dev;
+
+ class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
+ while ((dev = class_dev_iter_next(&iter))) {
+ struct gendisk *disk = dev_to_disk(dev);
+ struct block_device *part;
+ char devt_buf[BDEVT_SIZE];
+ unsigned long idx;
+
+ /*
+ * Don't show empty devices or things that have been
+ * suppressed
+ */
+ if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN))
+ continue;
+
+ /*
+ * Note, unlike /proc/partitions, I am showing the numbers in
+ * hex - the same format as the root= option takes.
+ */
+ rcu_read_lock();
+ xa_for_each(&disk->part_tbl, idx, part) {
+ if (!bdev_nr_sectors(part))
+ continue;
+ printk("%s%s %10llu %pg %s",
+ bdev_is_partition(part) ? " " : "",
+ bdevt_str(part->bd_dev, devt_buf),
+ bdev_nr_sectors(part) >> 1, part,
+ part->bd_meta_info ?
+ part->bd_meta_info->uuid : "");
+ if (bdev_is_partition(part))
+ printk("\n");
+ else if (dev->parent && dev->parent->driver)
+ printk(" driver: %s\n",
+ dev->parent->driver->name);
+ else
+ printk(" (driver?)\n");
+ }
+ rcu_read_unlock();
+ }
+ class_dev_iter_exit(&iter);
+}
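
A short sketch of how early boot code consumes this helper: the caller
passes one of the name forms documented above and gets a dev_t back. The
wrapper my_resolve_root() is hypothetical; early_lookup_bdev() itself is
__init-only.

#include <linux/blkdev.h>

static int __init my_resolve_root(const char *name)
{
	dev_t devt;
	int ret;

	/* accepts PARTUUID=, PARTLABEL=, /dev/<name>, <maj>:<min> or hex */
	ret = early_lookup_bdev(name, &devt);
	if (ret)
		return ret;

	pr_info("'%s' resolved to %u:%u\n", name, MAJOR(devt), MINOR(devt));
	return 0;
}

/* e.g. my_resolve_root("PARTUUID=00112233-4455-6677-8899-aabbccddeeff"); */
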
diff --git a/block/elevator.c b/block/elevator.c
index 4eab3d70e880..5ff093cb3cf8 100644
--- a/block/elevator.c
+++ b/block/elevator.c
@@ -26,7 +26,6 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -36,14 +35,15 @@
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
-#include <linux/blk-cgroup.h>
#include <trace/events/block.h>
+#include "elevator.h"
#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"
+#include "blk-cgroup.h"
static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);
@@ -57,7 +57,7 @@ static LIST_HEAD(elv_list);
* Query io scheduler to see if the current process issuing bio may be
* merged with rq.
*/
-static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
+static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
struct request_queue *q = rq->q;
struct elevator_queue *e = q->elevator;
@@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
if (e->type->ops.allow_merge)
return e->type->ops.allow_merge(q, rq, bio);
- return 1;
+ return true;
}
/*
@@ -83,83 +83,50 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
}
EXPORT_SYMBOL(elv_bio_merge_ok);
-static inline bool elv_support_features(unsigned int elv_features,
- unsigned int required_features)
+static inline bool elv_support_features(struct request_queue *q,
+ const struct elevator_type *e)
{
- return (required_features & elv_features) == required_features;
+ return (q->required_elevator_features & e->elevator_features) ==
+ q->required_elevator_features;
}
/**
- * elevator_match - Test an elevator name and features
+ * elevator_match - Check whether @e's name or alias matches @name
* @e: Scheduler to test
* @name: Elevator name to test
- * @required_features: Features that the elevator must provide
*
- * Return true is the elevator @e name matches @name and if @e provides all the
- * the feratures spcified by @required_features.
+ * Return true if the elevator @e's name or alias matches @name.
*/
-static bool elevator_match(const struct elevator_type *e, const char *name,
- unsigned int required_features)
+static bool elevator_match(const struct elevator_type *e, const char *name)
{
- if (!elv_support_features(e->elevator_features, required_features))
- return false;
- if (!strcmp(e->elevator_name, name))
- return true;
- if (e->elevator_alias && !strcmp(e->elevator_alias, name))
- return true;
-
- return false;
+ return !strcmp(e->elevator_name, name) ||
+ (e->elevator_alias && !strcmp(e->elevator_alias, name));
}
-/**
- * elevator_find - Find an elevator
- * @name: Name of the elevator to find
- * @required_features: Features that the elevator must provide
- *
- * Return the first registered scheduler with name @name and supporting the
- * features @required_features and NULL otherwise.
- */
-static struct elevator_type *elevator_find(const char *name,
- unsigned int required_features)
+static struct elevator_type *__elevator_find(const char *name)
{
struct elevator_type *e;
- list_for_each_entry(e, &elv_list, list) {
- if (elevator_match(e, name, required_features))
+ list_for_each_entry(e, &elv_list, list)
+ if (elevator_match(e, name))
return e;
- }
-
return NULL;
}
-static void elevator_put(struct elevator_type *e)
-{
- module_put(e->elevator_owner);
-}
-
-static struct elevator_type *elevator_get(struct request_queue *q,
- const char *name, bool try_loading)
+static struct elevator_type *elevator_find_get(struct request_queue *q,
+ const char *name)
{
struct elevator_type *e;
spin_lock(&elv_list_lock);
-
- e = elevator_find(name, q->required_elevator_features);
- if (!e && try_loading) {
- spin_unlock(&elv_list_lock);
- request_module("%s-iosched", name);
- spin_lock(&elv_list_lock);
- e = elevator_find(name, q->required_elevator_features);
- }
-
- if (e && !try_module_get(e->elevator_owner))
+ e = __elevator_find(name);
+ if (e && (!elv_support_features(q, e) || !elevator_tryget(e)))
e = NULL;
-
spin_unlock(&elv_list_lock);
return e;
}
-static struct kobj_type elv_ktype;
+static const struct kobj_type elv_ktype;
struct elevator_queue *elevator_alloc(struct request_queue *q,
struct elevator_type *e)
@@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q,
if (unlikely(!eq))
return NULL;
+ __elevator_get(e);
eq->type = e;
kobject_init(&eq->kobj, &elv_ktype);
mutex_init(&eq->sysfs_lock);
@@ -188,11 +156,15 @@ static void elevator_release(struct kobject *kobj)
kfree(e);
}
-void __elevator_exit(struct request_queue *q, struct elevator_queue *e)
+void elevator_exit(struct request_queue *q)
{
+ struct elevator_queue *e = q->elevator;
+
+ ioc_clear_queue(q);
+ blk_mq_sched_free_rqs(q);
+
mutex_lock(&e->sysfs_lock);
- if (e->type->ops.exit_sched)
- blk_mq_exit_sched(q, e);
+ blk_mq_exit_sched(q, e);
mutex_unlock(&e->sysfs_lock);
kobject_put(&e->kobj);
@@ -337,6 +309,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
if (__rq && elv_bio_merge_ok(__rq, bio)) {
*req = __rq;
+
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_BACK_MERGE;
}
@@ -351,9 +326,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req,
* we can append 'rq' to an existing request, so we can throw 'rq' away
* afterwards.
*
- * Returns true if we merged, false otherwise
+ * Returns true if we merged, false otherwise. 'free' will contain all
+ * requests that need to be freed.
*/
-bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
+bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq,
+ struct list_head *free)
{
struct request *__rq;
bool ret;
@@ -364,8 +341,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
/*
* First try one-hit cache.
*/
- if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
+ if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) {
+ list_add(&rq->queuelist, free);
return true;
+ }
if (blk_queue_noxmerges(q))
return false;
@@ -379,6 +358,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
break;
+ list_add(&rq->queuelist, free);
/* The merged request could be merged with others, try again */
ret = true;
rq = __rq;
@@ -475,22 +455,19 @@ static const struct sysfs_ops elv_sysfs_ops = {
.store = elv_attr_store,
};
-static struct kobj_type elv_ktype = {
+static const struct kobj_type elv_ktype = {
.sysfs_ops = &elv_sysfs_ops,
.release = elevator_release,
};
-/*
- * elv_register_queue is called from either blk_register_queue or
- * elevator_switch, elevator switch is prevented from being happen
- * in the two paths, so it is safe to not hold q->sysfs_lock.
- */
int elv_register_queue(struct request_queue *q, bool uevent)
{
struct elevator_queue *e = q->elevator;
int error;
- error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
+ lockdep_assert_held(&q->sysfs_lock);
+
+ error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched");
if (!error) {
struct elv_fs_entry *attr = e->type->elevator_attrs;
if (attr) {
@@ -503,32 +480,32 @@ int elv_register_queue(struct request_queue *q, bool uevent)
if (uevent)
kobject_uevent(&e->kobj, KOBJ_ADD);
- e->registered = 1;
+ set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags);
}
return error;
}
-/*
- * elv_unregister_queue is called from either blk_unregister_queue or
- * elevator_switch, elevator switch is prevented from being happen
- * in the two paths, so it is safe to not hold q->sysfs_lock.
- */
void elv_unregister_queue(struct request_queue *q)
{
- if (q) {
- struct elevator_queue *e = q->elevator;
+ struct elevator_queue *e = q->elevator;
+ lockdep_assert_held(&q->sysfs_lock);
+
+ if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) {
kobject_uevent(&e->kobj, KOBJ_REMOVE);
kobject_del(&e->kobj);
-
- e->registered = 0;
- /* Re-enable throttling in case elevator disabled it */
- wbt_enable_default(q);
}
}
int elv_register(struct elevator_type *e)
{
+ /* finish request is mandatory */
+ if (WARN_ON_ONCE(!e->ops.finish_request))
+ return -EINVAL;
+ /* insert_requests and dispatch_request are mandatory */
+ if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request))
+ return -EINVAL;
+
/* create icq_cache if requested */
if (e->icq_size) {
if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
@@ -545,7 +522,7 @@ int elv_register(struct elevator_type *e)
/* register, don't allow duplicate names */
spin_lock(&elv_list_lock);
- if (elevator_find(e->elevator_name, 0)) {
+ if (__elevator_find(e->elevator_name)) {
spin_unlock(&elv_list_lock);
kmem_cache_destroy(e->icq_cache);
return -EBUSY;
@@ -578,45 +555,9 @@ void elv_unregister(struct elevator_type *e)
}
EXPORT_SYMBOL_GPL(elv_unregister);
-int elevator_switch_mq(struct request_queue *q,
- struct elevator_type *new_e)
-{
- int ret;
-
- lockdep_assert_held(&q->sysfs_lock);
-
- if (q->elevator) {
- if (q->elevator->registered)
- elv_unregister_queue(q);
-
- ioc_clear_queue(q);
- elevator_exit(q, q->elevator);
- }
-
- ret = blk_mq_init_sched(q, new_e);
- if (ret)
- goto out;
-
- if (new_e) {
- ret = elv_register_queue(q, true);
- if (ret) {
- elevator_exit(q, q->elevator);
- goto out;
- }
- }
-
- if (new_e)
- blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
- else
- blk_add_trace_msg(q, "elv switch: none");
-
-out:
- return ret;
-}
-
static inline bool elv_support_iosched(struct request_queue *q)
{
- if (!q->mq_ops ||
+ if (!queue_is_mq(q) ||
(q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED)))
return false;
return true;
@@ -628,10 +569,14 @@ static inline bool elv_support_iosched(struct request_queue *q)
*/
static struct elevator_type *elevator_get_default(struct request_queue *q)
{
- if (q->nr_hw_queues != 1)
+ if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT)
+ return NULL;
+
+ if (q->nr_hw_queues != 1 &&
+ !blk_mq_is_shared_tags(q->tag_set->flags))
return NULL;
- return elevator_get(q, "mq-deadline", false);
+ return elevator_find_get(q, "mq-deadline");
}
/*
@@ -645,14 +590,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q)
spin_lock(&elv_list_lock);
list_for_each_entry(e, &elv_list, list) {
- if (elv_support_features(e->elevator_features,
- q->required_elevator_features)) {
+ if (elv_support_features(q, e)) {
found = e;
break;
}
}
- if (found && !try_module_get(found->elevator_owner))
+ if (found && !elevator_tryget(found))
found = NULL;
spin_unlock(&elv_list_lock);
@@ -673,7 +617,7 @@ void elevator_init_mq(struct request_queue *q)
if (!elv_support_iosched(q))
return;
- WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags));
+ WARN_ON_ONCE(blk_queue_registered(q));
if (unlikely(q->elevator))
return;
@@ -685,127 +629,164 @@ void elevator_init_mq(struct request_queue *q)
if (!e)
return;
+ /*
+ * We are called before the disk is added, when there is no FS I/O,
+ * so freezing the queue plus canceling the dispatch work is enough to
+ * drain any dispatch activity originating from passthrough requests;
+ * there is no need to quiesce the queue, which could add a long boot
+ * latency, especially when lots of disks are involved.
+ */
blk_mq_freeze_queue(q);
- blk_mq_quiesce_queue(q);
+ blk_mq_cancel_work_sync(q);
err = blk_mq_init_sched(q, e);
- blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
if (err) {
pr_warn("\"%s\" elevator initialization failed, "
"falling back to \"none\"\n", e->elevator_name);
- elevator_put(e);
}
-}
+ elevator_put(e);
+}
/*
- * switch to new_e io scheduler. be careful not to introduce deadlocks -
- * we don't free the old io scheduler, before we have allocated what we
- * need for the new one. this way we have a chance of going back to the old
- * one, if the new one fails init for some reason.
+ * Switch to new_e io scheduler.
+ *
+ * If switching fails, we are most likely out of memory and not able to
+ * restore the old io scheduler, so the io scheduler is left set to none.
*/
-static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
+int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
- int err;
+ int ret;
lockdep_assert_held(&q->sysfs_lock);
blk_mq_freeze_queue(q);
blk_mq_quiesce_queue(q);
- err = elevator_switch_mq(q, new_e);
+ if (q->elevator) {
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ }
+
+ ret = blk_mq_init_sched(q, new_e);
+ if (ret)
+ goto out_unfreeze;
+ ret = elv_register_queue(q, true);
+ if (ret) {
+ elevator_exit(q);
+ goto out_unfreeze;
+ }
+ blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
+
+out_unfreeze:
blk_mq_unquiesce_queue(q);
blk_mq_unfreeze_queue(q);
- return err;
+ if (ret) {
+ pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n",
+ new_e->elevator_name);
+ }
+
+ return ret;
+}
+
+void elevator_disable(struct request_queue *q)
+{
+ lockdep_assert_held(&q->sysfs_lock);
+
+ blk_mq_freeze_queue(q);
+ blk_mq_quiesce_queue(q);
+
+ elv_unregister_queue(q);
+ elevator_exit(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+ q->elevator = NULL;
+ q->nr_requests = q->tag_set->queue_depth;
+ blk_add_trace_msg(q, "elv switch: none");
+
+ blk_mq_unquiesce_queue(q);
+ blk_mq_unfreeze_queue(q);
}
/*
* Switch this queue to the given IO scheduler.
*/
-static int __elevator_change(struct request_queue *q, const char *name)
+static int elevator_change(struct request_queue *q, const char *elevator_name)
{
- char elevator_name[ELV_NAME_MAX];
struct elevator_type *e;
+ int ret;
/* Make sure queue is not in the middle of being removed */
if (!blk_queue_registered(q))
return -ENOENT;
- /*
- * Special case for mq, turn off scheduling
- */
- if (!strncmp(name, "none", 4)) {
- if (!q->elevator)
- return 0;
- return elevator_switch(q, NULL);
+ if (!strncmp(elevator_name, "none", 4)) {
+ if (q->elevator)
+ elevator_disable(q);
+ return 0;
}
- strlcpy(elevator_name, name, sizeof(elevator_name));
- e = elevator_get(q, strstrip(elevator_name), true);
- if (!e)
- return -EINVAL;
-
- if (q->elevator &&
- elevator_match(q->elevator->type, elevator_name, 0)) {
- elevator_put(e);
+ if (q->elevator && elevator_match(q->elevator->type, elevator_name))
return 0;
- }
- return elevator_switch(q, e);
+ e = elevator_find_get(q, elevator_name);
+ if (!e) {
+ request_module("%s-iosched", elevator_name);
+ e = elevator_find_get(q, elevator_name);
+ if (!e)
+ return -EINVAL;
+ }
+ ret = elevator_switch(q, e);
+ elevator_put(e);
+ return ret;
}
-ssize_t elv_iosched_store(struct request_queue *q, const char *name,
+ssize_t elv_iosched_store(struct request_queue *q, const char *buf,
size_t count)
{
+ char elevator_name[ELV_NAME_MAX];
int ret;
- if (!queue_is_mq(q) || !elv_support_iosched(q))
+ if (!elv_support_iosched(q))
return count;
- ret = __elevator_change(q, name);
+ strscpy(elevator_name, buf, sizeof(elevator_name));
+ ret = elevator_change(q, strstrip(elevator_name));
if (!ret)
return count;
-
return ret;
}
ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
- struct elevator_queue *e = q->elevator;
- struct elevator_type *elv = NULL;
- struct elevator_type *__e;
+ struct elevator_queue *eq = q->elevator;
+ struct elevator_type *cur = NULL, *e;
int len = 0;
- if (!queue_is_mq(q))
+ if (!elv_support_iosched(q))
return sprintf(name, "none\n");
- if (!q->elevator)
+ if (!q->elevator) {
len += sprintf(name+len, "[none] ");
- else
- elv = e->type;
+ } else {
+ len += sprintf(name+len, "none ");
+ cur = eq->type;
+ }
spin_lock(&elv_list_lock);
- list_for_each_entry(__e, &elv_list, list) {
- if (elv && elevator_match(elv, __e->elevator_name, 0)) {
- len += sprintf(name+len, "[%s] ", elv->elevator_name);
- continue;
- }
- if (elv_support_iosched(q) &&
- elevator_match(__e, __e->elevator_name,
- q->required_elevator_features))
- len += sprintf(name+len, "%s ", __e->elevator_name);
+ list_for_each_entry(e, &elv_list, list) {
+ if (e == cur)
+ len += sprintf(name+len, "[%s] ", e->elevator_name);
+ else if (elv_support_features(q, e))
+ len += sprintf(name+len, "%s ", e->elevator_name);
}
spin_unlock(&elv_list_lock);
- if (q->elevator)
- len += sprintf(name+len, "none");
-
- len += sprintf(len+name, "\n");
+ len += sprintf(name+len, "\n");
return len;
}
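For illustration, the store/show handlers above back the /sys/block/<dev>/queue/scheduler attribute. A minimal userspace sketch of driving it, assuming a disk named "sda" exists and "mq-deadline" is built in; both names are placeholders and error handling is trimmed:

/* Sketch only: switch the I/O scheduler through sysfs. */
#include <stdio.h>

int main(void)
{
	char line[256];
	FILE *f = fopen("/sys/block/sda/queue/scheduler", "r");

	if (!f || !fgets(line, sizeof(line), f))
		return 1;
	/* elv_iosched_show() brackets the active entry, e.g. "[none] mq-deadline" */
	printf("available: %s", line);
	fclose(f);

	f = fopen("/sys/block/sda/queue/scheduler", "w");
	if (!f)
		return 1;
	/* elv_iosched_store() strips the string; writing "none" ends up in elevator_disable() */
	fputs("mq-deadline\n", f);
	fclose(f);
	return 0;
}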
diff --git a/block/elevator.h b/block/elevator.h
new file mode 100644
index 000000000000..7ca3d7b6ed82
--- /dev/null
+++ b/block/elevator.h
@@ -0,0 +1,186 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ELEVATOR_H
+#define _ELEVATOR_H
+
+#include <linux/percpu.h>
+#include <linux/hashtable.h>
+#include "blk-mq.h"
+
+struct io_cq;
+struct elevator_type;
+struct blk_mq_debugfs_attr;
+
+/*
+ * Return values from elevator merger
+ */
+enum elv_merge {
+ ELEVATOR_NO_MERGE = 0,
+ ELEVATOR_FRONT_MERGE = 1,
+ ELEVATOR_BACK_MERGE = 2,
+ ELEVATOR_DISCARD_MERGE = 3,
+};
+
+struct blk_mq_alloc_data;
+struct blk_mq_hw_ctx;
+
+struct elevator_mq_ops {
+ int (*init_sched)(struct request_queue *, struct elevator_type *);
+ void (*exit_sched)(struct elevator_queue *);
+ int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+ void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int);
+ void (*depth_updated)(struct blk_mq_hw_ctx *);
+
+ bool (*allow_merge)(struct request_queue *, struct request *, struct bio *);
+ bool (*bio_merge)(struct request_queue *, struct bio *, unsigned int);
+ int (*request_merge)(struct request_queue *q, struct request **, struct bio *);
+ void (*request_merged)(struct request_queue *, struct request *, enum elv_merge);
+ void (*requests_merged)(struct request_queue *, struct request *, struct request *);
+ void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *);
+ void (*prepare_request)(struct request *);
+ void (*finish_request)(struct request *);
+ void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list,
+ blk_insert_t flags);
+ struct request *(*dispatch_request)(struct blk_mq_hw_ctx *);
+ bool (*has_work)(struct blk_mq_hw_ctx *);
+ void (*completed_request)(struct request *, u64);
+ void (*requeue_request)(struct request *);
+ struct request *(*former_request)(struct request_queue *, struct request *);
+ struct request *(*next_request)(struct request_queue *, struct request *);
+ void (*init_icq)(struct io_cq *);
+ void (*exit_icq)(struct io_cq *);
+};
+
+#define ELV_NAME_MAX (16)
+
+struct elv_fs_entry {
+ struct attribute attr;
+ ssize_t (*show)(struct elevator_queue *, char *);
+ ssize_t (*store)(struct elevator_queue *, const char *, size_t);
+};
+
+/*
+ * identifies an elevator type, such as AS or deadline
+ */
+struct elevator_type
+{
+ /* managed by elevator core */
+ struct kmem_cache *icq_cache;
+
+ /* fields provided by elevator implementation */
+ struct elevator_mq_ops ops;
+
+ size_t icq_size; /* see iocontext.h */
+ size_t icq_align; /* ditto */
+ struct elv_fs_entry *elevator_attrs;
+ const char *elevator_name;
+ const char *elevator_alias;
+ const unsigned int elevator_features;
+ struct module *elevator_owner;
+#ifdef CONFIG_BLK_DEBUG_FS
+ const struct blk_mq_debugfs_attr *queue_debugfs_attrs;
+ const struct blk_mq_debugfs_attr *hctx_debugfs_attrs;
+#endif
+
+ /* managed by elevator core */
+ char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */
+ struct list_head list;
+};
+
+static inline bool elevator_tryget(struct elevator_type *e)
+{
+ return try_module_get(e->elevator_owner);
+}
+
+static inline void __elevator_get(struct elevator_type *e)
+{
+ __module_get(e->elevator_owner);
+}
+
+static inline void elevator_put(struct elevator_type *e)
+{
+ module_put(e->elevator_owner);
+}
+
+#define ELV_HASH_BITS 6
+
+void elv_rqhash_del(struct request_queue *q, struct request *rq);
+void elv_rqhash_add(struct request_queue *q, struct request *rq);
+void elv_rqhash_reposition(struct request_queue *q, struct request *rq);
+struct request *elv_rqhash_find(struct request_queue *q, sector_t offset);
+
+/*
+ * each queue has an elevator_queue associated with it
+ */
+struct elevator_queue
+{
+ struct elevator_type *type;
+ void *elevator_data;
+ struct kobject kobj;
+ struct mutex sysfs_lock;
+ unsigned long flags;
+ DECLARE_HASHTABLE(hash, ELV_HASH_BITS);
+};
+
+#define ELEVATOR_FLAG_REGISTERED 0
+#define ELEVATOR_FLAG_DISABLE_WBT 1
+
+/*
+ * block elevator interface
+ */
+extern enum elv_merge elv_merge(struct request_queue *, struct request **,
+ struct bio *);
+extern void elv_merge_requests(struct request_queue *, struct request *,
+ struct request *);
+extern void elv_merged_request(struct request_queue *, struct request *,
+ enum elv_merge);
+extern bool elv_attempt_insert_merge(struct request_queue *, struct request *,
+ struct list_head *);
+extern struct request *elv_former_request(struct request_queue *, struct request *);
+extern struct request *elv_latter_request(struct request_queue *, struct request *);
+void elevator_init_mq(struct request_queue *q);
+
+/*
+ * io scheduler registration
+ */
+extern int elv_register(struct elevator_type *);
+extern void elv_unregister(struct elevator_type *);
+
+/*
+ * io scheduler sysfs switching
+ */
+extern ssize_t elv_iosched_show(struct request_queue *, char *);
+extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t);
+
+extern bool elv_bio_merge_ok(struct request *, struct bio *);
+extern struct elevator_queue *elevator_alloc(struct request_queue *,
+ struct elevator_type *);
+
+/*
+ * Helper functions.
+ */
+extern struct request *elv_rb_former_request(struct request_queue *, struct request *);
+extern struct request *elv_rb_latter_request(struct request_queue *, struct request *);
+
+/*
+ * rb support functions.
+ */
+extern void elv_rb_add(struct rb_root *, struct request *);
+extern void elv_rb_del(struct rb_root *, struct request *);
+extern struct request *elv_rb_find(struct rb_root *, sector_t);
+
+/*
+ * Insertion selection
+ */
+#define ELEVATOR_INSERT_FRONT 1
+#define ELEVATOR_INSERT_BACK 2
+#define ELEVATOR_INSERT_SORT 3
+#define ELEVATOR_INSERT_REQUEUE 4
+#define ELEVATOR_INSERT_FLUSH 5
+#define ELEVATOR_INSERT_SORT_MERGE 6
+
+#define rb_entry_rq(node) rb_entry((node), struct request, rb_node)
+
+#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist)
+#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist)
+
+#endif /* _ELEVATOR_H */
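For illustration, a hypothetical, non-functional skeleton showing how an I/O scheduler plugs into the elevator_type/elevator_mq_ops interface declared above. All names are placeholders and most ops are omitted; mq-deadline, Kyber and BFQ provide the real implementations:

/* Sketch only: elevator registration boilerplate, not a working scheduler. */
#include <linux/module.h>
#include <linux/blkdev.h>
#include "elevator.h"

static int sketch_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct elevator_queue *eq = elevator_alloc(q, e);

	if (!eq)
		return -ENOMEM;
	/* per-queue scheduler state would hang off eq->elevator_data */
	q->elevator = eq;
	return 0;
}

static void sketch_exit_sched(struct elevator_queue *eq)
{
	/* free per-queue scheduler state here */
}

static struct elevator_type sketch_sched = {
	.ops = {
		.init_sched	= sketch_init_sched,
		.exit_sched	= sketch_exit_sched,
		/* insert_requests, dispatch_request, has_work, ... */
	},
	.elevator_name	= "sketch",
	.elevator_owner	= THIS_MODULE,
};

static int __init sketch_init(void)
{
	return elv_register(&sketch_sched);
}

static void __exit sketch_exit(void)
{
	elv_unregister(&sketch_sched);
}

module_init(sketch_init);
module_exit(sketch_exit);
MODULE_LICENSE("GPL");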
diff --git a/block/fops.c b/block/fops.c
new file mode 100644
index 000000000000..0cf8cf72cdfa
--- /dev/null
+++ b/block/fops.c
@@ -0,0 +1,877 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * Copyright (C) 1991, 1992 Linus Torvalds
+ * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE
+ * Copyright (C) 2016 - 2020 Christoph Hellwig
+ */
+#include <linux/init.h>
+#include <linux/mm.h>
+#include <linux/blkdev.h>
+#include <linux/buffer_head.h>
+#include <linux/mpage.h>
+#include <linux/uio.h>
+#include <linux/namei.h>
+#include <linux/task_io_accounting_ops.h>
+#include <linux/falloc.h>
+#include <linux/suspend.h>
+#include <linux/fs.h>
+#include <linux/iomap.h>
+#include <linux/module.h>
+#include "blk.h"
+
+static inline struct inode *bdev_file_inode(struct file *file)
+{
+ return file->f_mapping->host;
+}
+
+static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
+{
+ blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
+
+ /* avoid the need for an I/O completion work item */
+ if (iocb_is_dsync(iocb))
+ opf |= REQ_FUA;
+ return opf;
+}
+
+static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos,
+ struct iov_iter *iter)
+{
+ return pos & (bdev_logical_block_size(bdev) - 1) ||
+ !bdev_iter_is_aligned(bdev, iter);
+}
+
+#define DIO_INLINE_BIO_VECS 4
+
+static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb,
+ struct iov_iter *iter, unsigned int nr_pages)
+{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs;
+ loff_t pos = iocb->ki_pos;
+ bool should_dirty = false;
+ struct bio bio;
+ ssize_t ret;
+
+ if (blkdev_dio_unaligned(bdev, pos, iter))
+ return -EINVAL;
+
+ if (nr_pages <= DIO_INLINE_BIO_VECS)
+ vecs = inline_vecs;
+ else {
+ vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec),
+ GFP_KERNEL);
+ if (!vecs)
+ return -ENOMEM;
+ }
+
+ if (iov_iter_rw(iter) == READ) {
+ bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ);
+ if (user_backed_iter(iter))
+ should_dirty = true;
+ } else {
+ bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb));
+ }
+ bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio.bi_ioprio = iocb->ki_ioprio;
+
+ ret = bio_iov_iter_get_pages(&bio, iter);
+ if (unlikely(ret))
+ goto out;
+ ret = bio.bi_iter.bi_size;
+
+ if (iov_iter_rw(iter) == WRITE)
+ task_io_account_write(ret);
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio.bi_opf |= REQ_NOWAIT;
+
+ submit_bio_wait(&bio);
+
+ bio_release_pages(&bio, should_dirty);
+ if (unlikely(bio.bi_status))
+ ret = blk_status_to_errno(bio.bi_status);
+
+out:
+ if (vecs != inline_vecs)
+ kfree(vecs);
+
+ bio_uninit(&bio);
+
+ return ret;
+}
+
+enum {
+ DIO_SHOULD_DIRTY = 1,
+ DIO_IS_SYNC = 2,
+};
+
+struct blkdev_dio {
+ union {
+ struct kiocb *iocb;
+ struct task_struct *waiter;
+ };
+ size_t size;
+ atomic_t ref;
+ unsigned int flags;
+ struct bio bio ____cacheline_aligned_in_smp;
+};
+
+static struct bio_set blkdev_dio_pool;
+
+static void blkdev_bio_end_io(struct bio *bio)
+{
+ struct blkdev_dio *dio = bio->bi_private;
+ bool should_dirty = dio->flags & DIO_SHOULD_DIRTY;
+
+ if (bio->bi_status && !dio->bio.bi_status)
+ dio->bio.bi_status = bio->bi_status;
+
+ if (atomic_dec_and_test(&dio->ref)) {
+ if (!(dio->flags & DIO_IS_SYNC)) {
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ WRITE_ONCE(iocb->private, NULL);
+
+ if (likely(!dio->bio.bi_status)) {
+ ret = dio->size;
+ iocb->ki_pos += ret;
+ } else {
+ ret = blk_status_to_errno(dio->bio.bi_status);
+ }
+
+ dio->iocb->ki_complete(iocb, ret);
+ bio_put(&dio->bio);
+ } else {
+ struct task_struct *waiter = dio->waiter;
+
+ WRITE_ONCE(dio->waiter, NULL);
+ blk_wake_io_task(waiter);
+ }
+ }
+
+ if (should_dirty) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
+ unsigned int nr_pages)
+{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ struct blk_plug plug;
+ struct blkdev_dio *dio;
+ struct bio *bio;
+ bool is_read = (iov_iter_rw(iter) == READ), is_sync;
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ loff_t pos = iocb->ki_pos;
+ int ret = 0;
+
+ if (blkdev_dio_unaligned(bdev, pos, iter))
+ return -EINVAL;
+
+ if (iocb->ki_flags & IOCB_ALLOC_CACHE)
+ opf |= REQ_ALLOC_CACHE;
+ bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
+ &blkdev_dio_pool);
+ dio = container_of(bio, struct blkdev_dio, bio);
+ atomic_set(&dio->ref, 1);
+ /*
+ * Grab an extra reference to ensure the dio structure which is embedded
+ * into the first bio stays around.
+ */
+ bio_get(bio);
+
+ is_sync = is_sync_kiocb(iocb);
+ if (is_sync) {
+ dio->flags = DIO_IS_SYNC;
+ dio->waiter = current;
+ } else {
+ dio->flags = 0;
+ dio->iocb = iocb;
+ }
+
+ dio->size = 0;
+ if (is_read && user_backed_iter(iter))
+ dio->flags |= DIO_SHOULD_DIRTY;
+
+ blk_start_plug(&plug);
+
+ for (;;) {
+ bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_private = dio;
+ bio->bi_end_io = blkdev_bio_end_io;
+ bio->bi_ioprio = iocb->ki_ioprio;
+
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(ret)) {
+ bio->bi_status = BLK_STS_IOERR;
+ bio_endio(bio);
+ break;
+ }
+ if (iocb->ki_flags & IOCB_NOWAIT) {
+ /*
+ * This is nonblocking IO, and we need to allocate
+ * another bio if we have data left to map. As we
+ * cannot guarantee that one of the sub bios will not
+ * fail getting issued FOR NOWAIT and as error results
+ * are coalesced across all of them, be safe and ask for
+ * a retry of this from blocking context.
+ */
+ if (unlikely(iov_iter_count(iter))) {
+ bio_release_pages(bio, false);
+ bio_clear_flag(bio, BIO_REFFED);
+ bio_put(bio);
+ blk_finish_plug(&plug);
+ return -EAGAIN;
+ }
+ bio->bi_opf |= REQ_NOWAIT;
+ }
+
+ if (is_read) {
+ if (dio->flags & DIO_SHOULD_DIRTY)
+ bio_set_pages_dirty(bio);
+ } else {
+ task_io_account_write(bio->bi_iter.bi_size);
+ }
+ dio->size += bio->bi_iter.bi_size;
+ pos += bio->bi_iter.bi_size;
+
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS);
+ if (!nr_pages) {
+ submit_bio(bio);
+ break;
+ }
+ atomic_inc(&dio->ref);
+ submit_bio(bio);
+ bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL);
+ }
+
+ blk_finish_plug(&plug);
+
+ if (!is_sync)
+ return -EIOCBQUEUED;
+
+ for (;;) {
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ if (!READ_ONCE(dio->waiter))
+ break;
+ blk_io_schedule();
+ }
+ __set_current_state(TASK_RUNNING);
+
+ if (!ret)
+ ret = blk_status_to_errno(dio->bio.bi_status);
+ if (likely(!ret))
+ ret = dio->size;
+
+ bio_put(&dio->bio);
+ return ret;
+}
+
+static void blkdev_bio_end_io_async(struct bio *bio)
+{
+ struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio);
+ struct kiocb *iocb = dio->iocb;
+ ssize_t ret;
+
+ WRITE_ONCE(iocb->private, NULL);
+
+ if (likely(!bio->bi_status)) {
+ ret = dio->size;
+ iocb->ki_pos += ret;
+ } else {
+ ret = blk_status_to_errno(bio->bi_status);
+ }
+
+ iocb->ki_complete(iocb, ret);
+
+ if (dio->flags & DIO_SHOULD_DIRTY) {
+ bio_check_pages_dirty(bio);
+ } else {
+ bio_release_pages(bio, false);
+ bio_put(bio);
+ }
+}
+
+static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb,
+ struct iov_iter *iter,
+ unsigned int nr_pages)
+{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ bool is_read = iov_iter_rw(iter) == READ;
+ blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb);
+ struct blkdev_dio *dio;
+ struct bio *bio;
+ loff_t pos = iocb->ki_pos;
+ int ret = 0;
+
+ if (blkdev_dio_unaligned(bdev, pos, iter))
+ return -EINVAL;
+
+ if (iocb->ki_flags & IOCB_ALLOC_CACHE)
+ opf |= REQ_ALLOC_CACHE;
+ bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL,
+ &blkdev_dio_pool);
+ dio = container_of(bio, struct blkdev_dio, bio);
+ dio->flags = 0;
+ dio->iocb = iocb;
+ bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT;
+ bio->bi_end_io = blkdev_bio_end_io_async;
+ bio->bi_ioprio = iocb->ki_ioprio;
+
+ if (iov_iter_is_bvec(iter)) {
+ /*
+ * Users don't rely on the iterator being in any particular
+ * state for async I/O returning -EIOCBQUEUED, hence we can
+ * avoid expensive iov_iter_advance(). Bypass
+ * bio_iov_iter_get_pages() and set the bvec directly.
+ */
+ bio_iov_bvec_set(bio, iter);
+ } else {
+ ret = bio_iov_iter_get_pages(bio, iter);
+ if (unlikely(ret)) {
+ bio_put(bio);
+ return ret;
+ }
+ }
+ dio->size = bio->bi_iter.bi_size;
+
+ if (is_read) {
+ if (user_backed_iter(iter)) {
+ dio->flags |= DIO_SHOULD_DIRTY;
+ bio_set_pages_dirty(bio);
+ }
+ } else {
+ task_io_account_write(bio->bi_iter.bi_size);
+ }
+
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
+
+ if (iocb->ki_flags & IOCB_HIPRI) {
+ bio->bi_opf |= REQ_POLLED;
+ submit_bio(bio);
+ WRITE_ONCE(iocb->private, bio);
+ } else {
+ submit_bio(bio);
+ }
+ return -EIOCBQUEUED;
+}
+
+static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+{
+ unsigned int nr_pages;
+
+ if (!iov_iter_count(iter))
+ return 0;
+
+ nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1);
+ if (likely(nr_pages <= BIO_MAX_VECS)) {
+ if (is_sync_kiocb(iocb))
+ return __blkdev_direct_IO_simple(iocb, iter, nr_pages);
+ return __blkdev_direct_IO_async(iocb, iter, nr_pages);
+ }
+ return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages));
+}
+
+static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
+ unsigned int flags, struct iomap *iomap, struct iomap *srcmap)
+{
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t isize = i_size_read(inode);
+
+ iomap->bdev = bdev;
+ iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev));
+ if (iomap->offset >= isize)
+ return -EIO;
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = iomap->offset;
+ iomap->length = isize - iomap->offset;
+ iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */
+ return 0;
+}
+
+static const struct iomap_ops blkdev_iomap_ops = {
+ .iomap_begin = blkdev_iomap_begin,
+};
+
+#ifdef CONFIG_BUFFER_HEAD
+static int blkdev_get_block(struct inode *inode, sector_t iblock,
+ struct buffer_head *bh, int create)
+{
+ bh->b_bdev = I_BDEV(inode);
+ bh->b_blocknr = iblock;
+ set_buffer_mapped(bh);
+ return 0;
+}
+
+/*
+ * We cannot call mpage_writepages() as it does not take the buffer lock.
+ * We must use block_write_full_folio() directly which holds the buffer
+ * lock. The buffer lock provides the synchronisation with writeback
+ * that filesystems rely on when they use the blockdev's mapping.
+ */
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct blk_plug plug;
+ int err;
+
+ blk_start_plug(&plug);
+ err = write_cache_pages(mapping, wbc, block_write_full_folio,
+ blkdev_get_block);
+ blk_finish_plug(&plug);
+
+ return err;
+}
+
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+ return block_read_full_folio(folio, blkdev_get_block);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+ mpage_readahead(rac, blkdev_get_block);
+}
+
+static int blkdev_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, struct page **pagep, void **fsdata)
+{
+ return block_write_begin(mapping, pos, len, pagep, blkdev_get_block);
+}
+
+static int blkdev_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied, struct page *page,
+ void *fsdata)
+{
+ int ret;
+ ret = block_write_end(file, mapping, pos, len, copied, page, fsdata);
+
+ unlock_page(page);
+ put_page(page);
+
+ return ret;
+}
+
+const struct address_space_operations def_blk_aops = {
+ .dirty_folio = block_dirty_folio,
+ .invalidate_folio = block_invalidate_folio,
+ .read_folio = blkdev_read_folio,
+ .readahead = blkdev_readahead,
+ .writepages = blkdev_writepages,
+ .write_begin = blkdev_write_begin,
+ .write_end = blkdev_write_end,
+ .migrate_folio = buffer_migrate_folio_norefs,
+ .is_dirty_writeback = buffer_check_dirty_writeback,
+};
+#else /* CONFIG_BUFFER_HEAD */
+static int blkdev_read_folio(struct file *file, struct folio *folio)
+{
+ return iomap_read_folio(folio, &blkdev_iomap_ops);
+}
+
+static void blkdev_readahead(struct readahead_control *rac)
+{
+ iomap_readahead(rac, &blkdev_iomap_ops);
+}
+
+static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
+{
+ loff_t isize = i_size_read(inode);
+
+ if (WARN_ON_ONCE(offset >= isize))
+ return -EIO;
+ if (offset >= wpc->iomap.offset &&
+ offset < wpc->iomap.offset + wpc->iomap.length)
+ return 0;
+ return blkdev_iomap_begin(inode, offset, isize - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
+}
+
+static const struct iomap_writeback_ops blkdev_writeback_ops = {
+ .map_blocks = blkdev_map_blocks,
+};
+
+static int blkdev_writepages(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct iomap_writepage_ctx wpc = { };
+
+ return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops);
+}
+
+const struct address_space_operations def_blk_aops = {
+ .dirty_folio = filemap_dirty_folio,
+ .release_folio = iomap_release_folio,
+ .invalidate_folio = iomap_invalidate_folio,
+ .read_folio = blkdev_read_folio,
+ .readahead = blkdev_readahead,
+ .writepages = blkdev_writepages,
+ .is_partially_uptodate = iomap_is_partially_uptodate,
+ .error_remove_folio = generic_error_remove_folio,
+ .migrate_folio = filemap_migrate_folio,
+};
+#endif /* CONFIG_BUFFER_HEAD */
+
+/*
+ * For a block special file, file_inode(file)->i_size is zero, so we compute
+ * the size by hand (just as in block_read/write above).
+ */
+static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+ loff_t retval;
+
+ inode_lock(bd_inode);
+ retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode));
+ inode_unlock(bd_inode);
+ return retval;
+}
+
+static int blkdev_fsync(struct file *filp, loff_t start, loff_t end,
+ int datasync)
+{
+ struct block_device *bdev = I_BDEV(filp->f_mapping->host);
+ int error;
+
+ error = file_write_and_wait_range(filp, start, end);
+ if (error)
+ return error;
+
+ /*
+ * There is no need to serialise calls to blkdev_issue_flush with
+ * i_mutex and doing so causes performance issues with concurrent
+ * O_SYNC writers to a block device.
+ */
+ error = blkdev_issue_flush(bdev);
+ if (error == -EOPNOTSUPP)
+ error = 0;
+
+ return error;
+}
+
+/**
+ * file_to_blk_mode - get block open flags from file flags
+ * @file: file whose open flags should be converted
+ *
+ * Look at file open flags and generate corresponding block open flags from
+ * them. The function works both for file just being open (e.g. during ->open
+ * callback) and for file that is already open. This is actually non-trivial
+ * (see comment in the function).
+ */
+blk_mode_t file_to_blk_mode(struct file *file)
+{
+ blk_mode_t mode = 0;
+ struct bdev_handle *handle = file->private_data;
+
+ if (file->f_mode & FMODE_READ)
+ mode |= BLK_OPEN_READ;
+ if (file->f_mode & FMODE_WRITE)
+ mode |= BLK_OPEN_WRITE;
+ /*
+ * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to
+ * determine whether the open was exclusive for already open files.
+ */
+ if (handle)
+ mode |= handle->mode & BLK_OPEN_EXCL;
+ else if (file->f_flags & O_EXCL)
+ mode |= BLK_OPEN_EXCL;
+ if (file->f_flags & O_NDELAY)
+ mode |= BLK_OPEN_NDELAY;
+
+ /*
+ * If all bits in O_ACCMODE are set (aka O_RDWR | O_WRONLY), the floppy
+ * driver has historically allowed ioctls as if the file was opened for
+ * writing, but does not allow any actual reads or writes.
+ */
+ if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY))
+ mode |= BLK_OPEN_WRITE_IOCTL;
+
+ return mode;
+}
+
+static int blkdev_open(struct inode *inode, struct file *filp)
+{
+ struct bdev_handle *handle;
+ blk_mode_t mode;
+
+ /*
+ * Preserve backwards compatibility and allow large file access
+ * even if userspace doesn't ask for it explicitly. Some mkfs
+ * binaries need it. We might want to drop this workaround
+ * during an unstable branch.
+ */
+ filp->f_flags |= O_LARGEFILE;
+ filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT;
+
+ mode = file_to_blk_mode(filp);
+ handle = bdev_open_by_dev(inode->i_rdev, mode,
+ mode & BLK_OPEN_EXCL ? filp : NULL, NULL);
+ if (IS_ERR(handle))
+ return PTR_ERR(handle);
+
+ if (bdev_nowait(handle->bdev))
+ filp->f_mode |= FMODE_NOWAIT;
+
+ filp->f_mapping = handle->bdev->bd_inode->i_mapping;
+ filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping);
+ filp->private_data = handle;
+ return 0;
+}
+
+static int blkdev_release(struct inode *inode, struct file *filp)
+{
+ bdev_release(filp->private_data);
+ return 0;
+}
+
+static ssize_t
+blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ size_t count = iov_iter_count(from);
+ ssize_t written;
+
+ written = kiocb_invalidate_pages(iocb, count);
+ if (written) {
+ if (written == -EBUSY)
+ return 0;
+ return written;
+ }
+
+ written = blkdev_direct_IO(iocb, from);
+ if (written > 0) {
+ kiocb_invalidate_post_direct_write(iocb, count);
+ iocb->ki_pos += written;
+ count -= written;
+ }
+ if (written != -EIOCBQUEUED)
+ iov_iter_revert(from, count - iov_iter_count(from));
+ return written;
+}
+
+static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from)
+{
+ return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops);
+}
+
+/*
+ * Write data to the block device. Only intended for the block device itself
+ * and the raw driver which basically is a fake block device.
+ *
+ * Does not take i_mutex for the write and thus is not for general purpose
+ * use.
+ */
+static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
+{
+ struct file *file = iocb->ki_filp;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
+ struct inode *bd_inode = bdev->bd_inode;
+ loff_t size = bdev_nr_bytes(bdev);
+ size_t shorted = 0;
+ ssize_t ret;
+
+ if (bdev_read_only(bdev))
+ return -EPERM;
+
+ if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev))
+ return -ETXTBSY;
+
+ if (!iov_iter_count(from))
+ return 0;
+
+ if (iocb->ki_pos >= size)
+ return -ENOSPC;
+
+ if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
+ return -EOPNOTSUPP;
+
+ size -= iocb->ki_pos;
+ if (iov_iter_count(from) > size) {
+ shorted = iov_iter_count(from) - size;
+ iov_iter_truncate(from, size);
+ }
+
+ ret = file_update_time(file);
+ if (ret)
+ return ret;
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = blkdev_direct_write(iocb, from);
+ if (ret >= 0 && iov_iter_count(from))
+ ret = direct_write_fallback(iocb, from, ret,
+ blkdev_buffered_write(iocb, from));
+ } else {
+ ret = blkdev_buffered_write(iocb, from);
+ }
+
+ if (ret > 0)
+ ret = generic_write_sync(iocb, ret);
+ iov_iter_reexpand(from, iov_iter_count(from) + shorted);
+ return ret;
+}
+
+static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
+{
+ struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host);
+ loff_t size = bdev_nr_bytes(bdev);
+ loff_t pos = iocb->ki_pos;
+ size_t shorted = 0;
+ ssize_t ret = 0;
+ size_t count;
+
+ if (unlikely(pos + iov_iter_count(to) > size)) {
+ if (pos >= size)
+ return 0;
+ size -= pos;
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
+ }
+
+ count = iov_iter_count(to);
+ if (!count)
+ goto reexpand; /* skip atime */
+
+ if (iocb->ki_flags & IOCB_DIRECT) {
+ ret = kiocb_write_and_wait(iocb, count);
+ if (ret < 0)
+ goto reexpand;
+ file_accessed(iocb->ki_filp);
+
+ ret = blkdev_direct_IO(iocb, to);
+ if (ret >= 0) {
+ iocb->ki_pos += ret;
+ count -= ret;
+ }
+ iov_iter_revert(to, count - iov_iter_count(to));
+ if (ret < 0 || !count)
+ goto reexpand;
+ }
+
+ ret = filemap_read(iocb, to, ret);
+
+reexpand:
+ if (unlikely(shorted))
+ iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+ return ret;
+}
+
+#define BLKDEV_FALLOC_FL_SUPPORTED \
+ (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \
+ FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE)
+
+static long blkdev_fallocate(struct file *file, int mode, loff_t start,
+ loff_t len)
+{
+ struct inode *inode = bdev_file_inode(file);
+ struct block_device *bdev = I_BDEV(inode);
+ loff_t end = start + len - 1;
+ loff_t isize;
+ int error;
+
+ /* Fail if we don't recognize the flags. */
+ if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED)
+ return -EOPNOTSUPP;
+
+ /* Don't go off the end of the device. */
+ isize = bdev_nr_bytes(bdev);
+ if (start >= isize)
+ return -EINVAL;
+ if (end >= isize) {
+ if (mode & FALLOC_FL_KEEP_SIZE) {
+ len = isize - start;
+ end = start + len - 1;
+ } else
+ return -EINVAL;
+ }
+
+ /*
+ * Don't allow IO that isn't aligned to logical block size.
+ */
+ if ((start | len) & (bdev_logical_block_size(bdev) - 1))
+ return -EINVAL;
+
+ filemap_invalidate_lock(inode->i_mapping);
+
+ /*
+ * Invalidate the page cache, including dirty pages, for valid
+ * de-allocate mode calls to fallocate().
+ */
+ switch (mode) {
+ case FALLOC_FL_ZERO_RANGE:
+ case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE:
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL,
+ BLKDEV_ZERO_NOUNMAP);
+ break;
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE:
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL,
+ BLKDEV_ZERO_NOFALLBACK);
+ break;
+ case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE:
+ error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end);
+ if (error)
+ goto fail;
+
+ error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT,
+ len >> SECTOR_SHIFT, GFP_KERNEL);
+ break;
+ default:
+ error = -EOPNOTSUPP;
+ }
+
+ fail:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return error;
+}
+
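The fallocate() cases above are reachable from userspace on a block-device file descriptor. A hedged sketch, assuming a scratch device at the placeholder path /dev/sdX; offset and length must be multiples of the logical block size, as enforced above:

/* Sketch only: punch a hole on a block device (zeroing without fallback). */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDWR);	/* placeholder scratch device */

	if (fd < 0)
		return 1;
	/* Lands in the BLKDEV_ZERO_NOFALLBACK branch of blkdev_fallocate(). */
	if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
		      0, 1 << 20) < 0)
		perror("fallocate");
	close(fd);
	return 0;
}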
+static int blkdev_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct inode *bd_inode = bdev_file_inode(file);
+
+ if (bdev_read_only(I_BDEV(bd_inode)))
+ return generic_file_readonly_mmap(file, vma);
+
+ return generic_file_mmap(file, vma);
+}
+
+const struct file_operations def_blk_fops = {
+ .open = blkdev_open,
+ .release = blkdev_release,
+ .llseek = blkdev_llseek,
+ .read_iter = blkdev_read_iter,
+ .write_iter = blkdev_write_iter,
+ .iopoll = iocb_bio_iopoll,
+ .mmap = blkdev_mmap,
+ .fsync = blkdev_fsync,
+ .unlocked_ioctl = blkdev_ioctl,
+#ifdef CONFIG_COMPAT
+ .compat_ioctl = compat_blkdev_ioctl,
+#endif
+ .splice_read = filemap_splice_read,
+ .splice_write = iter_file_splice_write,
+ .fallocate = blkdev_fallocate,
+};
+
+static __init int blkdev_init(void)
+{
+ return bioset_init(&blkdev_dio_pool, 4,
+ offsetof(struct blkdev_dio, bio),
+ BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE);
+}
+module_init(blkdev_init);
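The direct I/O paths above reject positions and buffers that are not aligned to the logical block size (blkdev_dio_unaligned()). A userspace sketch of a correctly aligned O_DIRECT read; the device path is a placeholder:

/* Sketch only: aligned O_DIRECT read from a block device. */
#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fs.h>		/* BLKSSZGET */
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDONLY | O_DIRECT);	/* placeholder device */
	int lbs;
	void *buf;

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKSSZGET, &lbs) < 0)	/* logical block size */
		return 1;
	/* Buffer, offset and length must all be lbs-aligned, otherwise the
	 * kernel fails the request with -EINVAL. */
	if (posix_memalign(&buf, lbs, 64 * 1024))
		return 1;
	if (pread(fd, buf, 64 * 1024, 0) < 0)
		perror("pread");
	free(buf);
	close(fd);
	return 0;
}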
diff --git a/block/genhd.c b/block/genhd.c
index 1a7659327664..d74fb5b4ae68 100644
--- a/block/genhd.c
+++ b/block/genhd.c
@@ -1,12 +1,13 @@
// SPDX-License-Identifier: GPL-2.0
/*
* gendisk handling
+ *
+ * Portions Copyright (C) 2020 Christoph Hellwig
*/
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kdev_t.h>
#include <linux/kernel.h>
#include <linux/blkdev.h>
@@ -17,88 +18,93 @@
#include <linux/seq_file.h>
#include <linux/slab.h>
#include <linux/kmod.h>
-#include <linux/kobj_map.h>
+#include <linux/major.h>
#include <linux/mutex.h>
#include <linux/idr.h>
#include <linux/log2.h>
#include <linux/pm_runtime.h>
#include <linux/badblocks.h>
+#include <linux/part_stat.h>
+#include <linux/blktrace_api.h>
+#include "blk-throttle.h"
#include "blk.h"
+#include "blk-mq-sched.h"
+#include "blk-rq-qos.h"
+#include "blk-cgroup.h"
-static DEFINE_MUTEX(block_class_lock);
static struct kobject *block_depr;
-/* for extended dynamic devt allocation, currently only one major is used */
-#define NR_EXT_DEVT (1 << MINORBITS)
-
-/* For extended devt allocation. ext_devt_lock prevents look up
- * results from going away underneath its user.
+/*
+ * Unique, monotonically increasing sequential number associated with block
+ * devices instances (i.e. incremented each time a device is attached).
+ * Associating uevents with block devices in userspace is difficult and racy:
+ * the uevent netlink socket is lossy, and on slow and overloaded systems has
+ * a very high latency.
+ * Block devices do not have exclusive owners in userspace, any process can set
+ * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0
+ * can be reused again and again).
+ * A userspace process setting up a block device and watching for its events
+ * thus cannot reliably tell whether an event relates to the device it just set
+ * up or to another, earlier instance with the same name.
+ * This sequential number allows userspace processes to solve this problem, and
+ * uniquely associate an uevent with the lifetime of a device.
*/
-static DEFINE_SPINLOCK(ext_devt_lock);
-static DEFINE_IDR(ext_devt_idr);
+static atomic64_t diskseq;
-static const struct device_type disk_type;
+/* for extended dynamic devt allocation, currently only one major is used */
+#define NR_EXT_DEVT (1 << MINORBITS)
+static DEFINE_IDA(ext_devt_ida);
-static void disk_check_events(struct disk_events *ev,
- unsigned int *clearing_ptr);
-static void disk_alloc_events(struct gendisk *disk);
-static void disk_add_events(struct gendisk *disk);
-static void disk_del_events(struct gendisk *disk);
-static void disk_release_events(struct gendisk *disk);
+void set_capacity(struct gendisk *disk, sector_t sectors)
+{
+ bdev_set_nr_sectors(disk->part0, sectors);
+}
+EXPORT_SYMBOL(set_capacity);
/*
- * Set disk capacity and notify if the size is not currently
- * zero and will not be set to zero
+ * Set disk capacity and notify if the size is not currently zero and will not
+ * be set to zero. Returns true if a uevent was sent, otherwise false.
*/
-void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size,
- bool revalidate)
+bool set_capacity_and_notify(struct gendisk *disk, sector_t size)
{
sector_t capacity = get_capacity(disk);
+ char *envp[] = { "RESIZE=1", NULL };
set_capacity(disk, size);
- if (revalidate)
- revalidate_disk(disk);
-
- if (capacity != size && capacity != 0 && size != 0) {
- char *envp[] = { "RESIZE=1", NULL };
-
- kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
- }
-}
-
-EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify);
-
-/*
- * Format the device name of the indicated disk into the supplied buffer and
- * return a pointer to that same buffer for convenience.
- */
-char *disk_name(struct gendisk *hd, int partno, char *buf)
-{
- if (!partno)
- snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name);
- else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1]))
- snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno);
- else
- snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno);
+ /*
+ * Only print a message and send a uevent if the gendisk is user visible
+ * and alive. This avoids spamming the log and udev when setting the
+ * initial capacity during probing.
+ */
+ if (size == capacity ||
+ !disk_live(disk) ||
+ (disk->flags & GENHD_FL_HIDDEN))
+ return false;
- return buf;
-}
+ pr_info("%s: detected capacity change from %lld to %lld\n",
+ disk->disk_name, capacity, size);
-const char *bdevname(struct block_device *bdev, char *buf)
-{
- return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf);
+ /*
+ * Historically we did not send a uevent for changes to/from an empty
+ * device.
+ */
+ if (!capacity || !size)
+ return false;
+ kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
+ return true;
}
-EXPORT_SYMBOL(bdevname);
+EXPORT_SYMBOL_GPL(set_capacity_and_notify);
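For illustration, set_capacity_and_notify() is what a driver calls when its backing device is resized at runtime. The helper below is hypothetical, but virtio-blk and similar drivers follow this pattern:

/* Hypothetical resize handler in a block driver. */
static void sketch_handle_resize(struct gendisk *disk, u64 new_bytes)
{
	sector_t sectors = new_bytes >> SECTOR_SHIFT;

	/* Emits a RESIZE=1 uevent only for a live, visible disk whose size
	 * changes between two non-zero values; otherwise it just updates
	 * the capacity. */
	set_capacity_and_notify(disk, sectors);
}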
-static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
+static void part_stat_read_all(struct block_device *part,
+ struct disk_stats *stat)
{
int cpu;
memset(stat, 0, sizeof(struct disk_stats));
for_each_possible_cpu(cpu) {
- struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu);
+ struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu);
int group;
for (group = 0; group < NR_STAT_GROUPS; group++) {
@@ -112,8 +118,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat)
}
}
-static unsigned int part_in_flight(struct request_queue *q,
- struct hd_struct *part)
+static unsigned int part_in_flight(struct block_device *part)
{
unsigned int inflight = 0;
int cpu;
@@ -128,7 +133,7 @@ static unsigned int part_in_flight(struct request_queue *q,
return inflight;
}
-static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
+static void part_in_flight_rw(struct block_device *part,
unsigned int inflight[2])
{
int cpu;
@@ -145,247 +150,6 @@ static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part,
inflight[1] = 0;
}
-struct hd_struct *__disk_get_part(struct gendisk *disk, int partno)
-{
- struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl);
-
- if (unlikely(partno < 0 || partno >= ptbl->len))
- return NULL;
- return rcu_dereference(ptbl->part[partno]);
-}
-
-/**
- * disk_get_part - get partition
- * @disk: disk to look partition from
- * @partno: partition number
- *
- * Look for partition @partno from @disk. If found, increment
- * reference count and return it.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * Pointer to the found partition on success, NULL if not found.
- */
-struct hd_struct *disk_get_part(struct gendisk *disk, int partno)
-{
- struct hd_struct *part;
-
- rcu_read_lock();
- part = __disk_get_part(disk, partno);
- if (part)
- get_device(part_to_dev(part));
- rcu_read_unlock();
-
- return part;
-}
-
-/**
- * disk_part_iter_init - initialize partition iterator
- * @piter: iterator to initialize
- * @disk: disk to iterate over
- * @flags: DISK_PITER_* flags
- *
- * Initialize @piter so that it iterates over partitions of @disk.
- *
- * CONTEXT:
- * Don't care.
- */
-void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk,
- unsigned int flags)
-{
- struct disk_part_tbl *ptbl;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
- piter->disk = disk;
- piter->part = NULL;
-
- if (flags & DISK_PITER_REVERSE)
- piter->idx = ptbl->len - 1;
- else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0))
- piter->idx = 0;
- else
- piter->idx = 1;
-
- piter->flags = flags;
-
- rcu_read_unlock();
-}
-EXPORT_SYMBOL_GPL(disk_part_iter_init);
-
-/**
- * disk_part_iter_next - proceed iterator to the next partition and return it
- * @piter: iterator of interest
- *
- * Proceed @piter to the next partition and return it.
- *
- * CONTEXT:
- * Don't care.
- */
-struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter)
-{
- struct disk_part_tbl *ptbl;
- int inc, end;
-
- /* put the last partition */
- disk_put_part(piter->part);
- piter->part = NULL;
-
- /* get part_tbl */
- rcu_read_lock();
- ptbl = rcu_dereference(piter->disk->part_tbl);
-
- /* determine iteration parameters */
- if (piter->flags & DISK_PITER_REVERSE) {
- inc = -1;
- if (piter->flags & (DISK_PITER_INCL_PART0 |
- DISK_PITER_INCL_EMPTY_PART0))
- end = -1;
- else
- end = 0;
- } else {
- inc = 1;
- end = ptbl->len;
- }
-
- /* iterate to the next partition */
- for (; piter->idx != end; piter->idx += inc) {
- struct hd_struct *part;
-
- part = rcu_dereference(ptbl->part[piter->idx]);
- if (!part)
- continue;
- if (!part_nr_sects_read(part) &&
- !(piter->flags & DISK_PITER_INCL_EMPTY) &&
- !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 &&
- piter->idx == 0))
- continue;
-
- get_device(part_to_dev(part));
- piter->part = part;
- piter->idx += inc;
- break;
- }
-
- rcu_read_unlock();
-
- return piter->part;
-}
-EXPORT_SYMBOL_GPL(disk_part_iter_next);
-
-/**
- * disk_part_iter_exit - finish up partition iteration
- * @piter: iter of interest
- *
- * Called when iteration is over. Cleans up @piter.
- *
- * CONTEXT:
- * Don't care.
- */
-void disk_part_iter_exit(struct disk_part_iter *piter)
-{
- disk_put_part(piter->part);
- piter->part = NULL;
-}
-EXPORT_SYMBOL_GPL(disk_part_iter_exit);
-
-static inline int sector_in_part(struct hd_struct *part, sector_t sector)
-{
- return part->start_sect <= sector &&
- sector < part->start_sect + part_nr_sects_read(part);
-}
-
-/**
- * disk_map_sector_rcu - map sector to partition
- * @disk: gendisk of interest
- * @sector: sector to map
- *
- * Find out which partition @sector maps to on @disk. This is
- * primarily used for stats accounting.
- *
- * CONTEXT:
- * RCU read locked. The returned partition pointer is always valid
- * because its refcount is grabbed except for part0, which lifetime
- * is same with the disk.
- *
- * RETURNS:
- * Found partition on success, part0 is returned if no partition matches
- * or the matched partition is being deleted.
- */
-struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector)
-{
- struct disk_part_tbl *ptbl;
- struct hd_struct *part;
- int i;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
- part = rcu_dereference(ptbl->last_lookup);
- if (part && sector_in_part(part, sector) && hd_struct_try_get(part))
- goto out_unlock;
-
- for (i = 1; i < ptbl->len; i++) {
- part = rcu_dereference(ptbl->part[i]);
-
- if (part && sector_in_part(part, sector)) {
- /*
- * only live partition can be cached for lookup,
- * so use-after-free on cached & deleting partition
- * can be avoided
- */
- if (!hd_struct_try_get(part))
- break;
- rcu_assign_pointer(ptbl->last_lookup, part);
- goto out_unlock;
- }
- }
-
- part = &disk->part0;
-out_unlock:
- rcu_read_unlock();
- return part;
-}
-
-/**
- * disk_has_partitions
- * @disk: gendisk of interest
- *
- * Walk through the partition table and check if valid partition exists.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * True if the gendisk has at least one valid non-zero size partition.
- * Otherwise false.
- */
-bool disk_has_partitions(struct gendisk *disk)
-{
- struct disk_part_tbl *ptbl;
- int i;
- bool ret = false;
-
- rcu_read_lock();
- ptbl = rcu_dereference(disk->part_tbl);
-
- /* Iterate partitions skipping the whole device at index 0 */
- for (i = 1; i < ptbl->len; i++) {
- if (rcu_dereference(ptbl->part[i])) {
- ret = true;
- break;
- }
- }
-
- rcu_read_unlock();
-
- return ret;
-}
-EXPORT_SYMBOL_GPL(disk_has_partitions);
-
/*
* Can be deleted altogether. Later.
*
@@ -395,7 +159,12 @@ static struct blk_major_name {
struct blk_major_name *next;
int major;
char name[16];
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+ void (*probe)(dev_t devt);
+#endif
} *major_names[BLKDEV_MAJOR_HASH_SIZE];
+static DEFINE_MUTEX(major_names_lock);
+static DEFINE_SPINLOCK(major_names_spinlock);
/* index in the above - for now: assume no multimajor ranges */
static inline int major_to_index(unsigned major)
@@ -408,20 +177,24 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
{
struct blk_major_name *dp;
- mutex_lock(&block_class_lock);
+ spin_lock(&major_names_spinlock);
for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next)
if (dp->major == offset)
seq_printf(seqf, "%3d %s\n", dp->major, dp->name);
- mutex_unlock(&block_class_lock);
+ spin_unlock(&major_names_spinlock);
}
#endif /* CONFIG_PROC_FS */
/**
- * register_blkdev - register a new block device
+ * __register_blkdev - register a new block device
*
* @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If
* @major = 0, try to allocate any unused major number.
* @name: the name of the new block device as a zero terminated string
+ * @probe: pre-devtmpfs / pre-udev callback used to create disks when their
+ * pre-created device node is accessed. When a probe call uses
+ * add_disk() and it fails, the driver must clean up resources. This
+ * interface may soon be removed.
*
* The @name must be unique within the system.
*
@@ -435,13 +208,16 @@ void blkdev_show(struct seq_file *seqf, off_t offset)
*
* See Documentation/admin-guide/devices.txt for the list of allocated
* major numbers.
+ *
+ * Use register_blkdev instead for any new code.
*/
-int register_blkdev(unsigned int major, const char *name)
+int __register_blkdev(unsigned int major, const char *name,
+ void (*probe)(dev_t devt))
{
struct blk_major_name **n, *p;
int index, ret = 0;
- mutex_lock(&block_class_lock);
+ mutex_lock(&major_names_lock);
/* temporary */
if (major == 0) {
@@ -475,10 +251,14 @@ int register_blkdev(unsigned int major, const char *name)
}
p->major = major;
- strlcpy(p->name, name, sizeof(p->name));
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+ p->probe = probe;
+#endif
+ strscpy(p->name, name, sizeof(p->name));
p->next = NULL;
index = major_to_index(major);
+ spin_lock(&major_names_spinlock);
for (n = &major_names[index]; *n; n = &(*n)->next) {
if ((*n)->major == major)
break;
@@ -487,6 +267,7 @@ int register_blkdev(unsigned int major, const char *name)
*n = p;
else
ret = -EBUSY;
+ spin_unlock(&major_names_spinlock);
if (ret < 0) {
printk("register_blkdev: cannot get major %u for %s\n",
@@ -494,11 +275,10 @@ int register_blkdev(unsigned int major, const char *name)
kfree(p);
}
out:
- mutex_unlock(&block_class_lock);
+ mutex_unlock(&major_names_lock);
return ret;
}
-
-EXPORT_SYMBOL(register_blkdev);
+EXPORT_SYMBOL(__register_blkdev);
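For illustration, a hypothetical driver wiring up the legacy probe hook; the major number and names are placeholders (loop and brd are in-tree users of this pattern):

/* Hypothetical module init using the legacy autoload probe. */
#define SKETCH_MAJOR	240	/* placeholder from the local/experimental range */

static void sketch_probe(dev_t devt)
{
	/* Allocate a gendisk for MINOR(devt) and add_disk() it here. */
}

static int __init sketch_blk_init(void)
{
	/* Registers the major and, with CONFIG_BLOCK_LEGACY_AUTOLOAD, the
	 * probe callback invoked when a matching device node is first opened. */
	return __register_blkdev(SKETCH_MAJOR, "sketchblk", sketch_probe);
}
module_init(sketch_blk_init);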
void unregister_blkdev(unsigned int major, const char *name)
{
@@ -506,7 +286,8 @@ void unregister_blkdev(unsigned int major, const char *name)
struct blk_major_name *p = NULL;
int index = major_to_index(major);
- mutex_lock(&block_class_lock);
+ mutex_lock(&major_names_lock);
+ spin_lock(&major_names_spinlock);
for (n = &major_names[index]; *n; n = &(*n)->next)
if ((*n)->major == major)
break;
@@ -516,198 +297,176 @@ void unregister_blkdev(unsigned int major, const char *name)
p = *n;
*n = p->next;
}
- mutex_unlock(&block_class_lock);
+ spin_unlock(&major_names_spinlock);
+ mutex_unlock(&major_names_lock);
kfree(p);
}
EXPORT_SYMBOL(unregister_blkdev);
-static struct kobj_map *bdev_map;
-
-/**
- * blk_mangle_minor - scatter minor numbers apart
- * @minor: minor number to mangle
- *
- * Scatter consecutively allocated @minor number apart if MANGLE_DEVT
- * is enabled. Mangling twice gives the original value.
- *
- * RETURNS:
- * Mangled value.
- *
- * CONTEXT:
- * Don't care.
- */
-static int blk_mangle_minor(int minor)
+int blk_alloc_ext_minor(void)
{
-#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT
- int i;
-
- for (i = 0; i < MINORBITS / 2; i++) {
- int low = minor & (1 << i);
- int high = minor & (1 << (MINORBITS - 1 - i));
- int distance = MINORBITS - 1 - 2 * i;
+ int idx;
- minor ^= low | high; /* clear both bits */
- low <<= distance; /* swap the positions */
- high >>= distance;
- minor |= low | high; /* and set */
- }
-#endif
- return minor;
+ idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL);
+ if (idx == -ENOSPC)
+ return -EBUSY;
+ return idx;
}
-/**
- * blk_alloc_devt - allocate a dev_t for a partition
- * @part: partition to allocate dev_t for
- * @devt: out parameter for resulting dev_t
- *
- * Allocate a dev_t for block device.
- *
- * RETURNS:
- * 0 on success, allocated dev_t is returned in *@devt. -errno on
- * failure.
- *
- * CONTEXT:
- * Might sleep.
- */
-int blk_alloc_devt(struct hd_struct *part, dev_t *devt)
+void blk_free_ext_minor(unsigned int minor)
{
- struct gendisk *disk = part_to_disk(part);
- int idx;
-
- /* in consecutive minor range? */
- if (part->partno < disk->minors) {
- *devt = MKDEV(disk->major, disk->first_minor + part->partno);
- return 0;
- }
-
- /* allocate ext devt */
- idr_preload(GFP_KERNEL);
-
- spin_lock_bh(&ext_devt_lock);
- idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT);
- spin_unlock_bh(&ext_devt_lock);
-
- idr_preload_end();
- if (idx < 0)
- return idx == -ENOSPC ? -EBUSY : idx;
-
- *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx));
- return 0;
+ ida_free(&ext_devt_ida, minor);
}
-/**
- * blk_free_devt - free a dev_t
- * @devt: dev_t to free
- *
- * Free @devt which was allocated using blk_alloc_devt().
- *
- * CONTEXT:
- * Might sleep.
- */
-void blk_free_devt(dev_t devt)
+void disk_uevent(struct gendisk *disk, enum kobject_action action)
{
- if (devt == MKDEV(0, 0))
- return;
+ struct block_device *part;
+ unsigned long idx;
- if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
- spin_lock_bh(&ext_devt_lock);
- idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
- spin_unlock_bh(&ext_devt_lock);
- }
-}
+ rcu_read_lock();
+ xa_for_each(&disk->part_tbl, idx, part) {
+ if (bdev_is_partition(part) && !bdev_nr_sectors(part))
+ continue;
+ if (!kobject_get_unless_zero(&part->bd_device.kobj))
+ continue;
-/*
- * We invalidate devt by assigning NULL pointer for devt in idr.
- */
-void blk_invalidate_devt(dev_t devt)
-{
- if (MAJOR(devt) == BLOCK_EXT_MAJOR) {
- spin_lock_bh(&ext_devt_lock);
- idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt)));
- spin_unlock_bh(&ext_devt_lock);
+ rcu_read_unlock();
+ kobject_uevent(bdev_kobj(part), action);
+ put_device(&part->bd_device);
+ rcu_read_lock();
}
+ rcu_read_unlock();
}
+EXPORT_SYMBOL_GPL(disk_uevent);
-static char *bdevt_str(dev_t devt, char *buf)
+int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode)
{
- if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) {
- char tbuf[BDEVT_SIZE];
- snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt));
- snprintf(buf, BDEVT_SIZE, "%-9s", tbuf);
- } else
- snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt));
-
- return buf;
-}
+ struct bdev_handle *handle;
+ int ret = 0;
-/*
- * Register device numbers dev..(dev+range-1)
- * range must be nonzero
- * The hash chain is sorted on range, so that subranges can override.
- */
-void blk_register_region(dev_t devt, unsigned long range, struct module *module,
- struct kobject *(*probe)(dev_t, int *, void *),
- int (*lock)(dev_t, void *), void *data)
-{
- kobj_map(bdev_map, devt, range, module, probe, lock, data);
-}
+ if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN))
+ return -EINVAL;
+ if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ return -EINVAL;
+ if (disk->open_partitions)
+ return -EBUSY;
-EXPORT_SYMBOL(blk_register_region);
+ /*
+ * If the device is already opened exclusively by the current thread, it's
+ * safe to scan partitions; otherwise, use bd_prepare_to_claim() to
+ * synchronize with other exclusive openers and other partition
+ * scanners.
+ */
+ if (!(mode & BLK_OPEN_EXCL)) {
+ ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions,
+ NULL);
+ if (ret)
+ return ret;
+ }
-void blk_unregister_region(dev_t devt, unsigned long range)
-{
- kobj_unmap(bdev_map, devt, range);
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+ handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL,
+ NULL);
+ if (IS_ERR(handle))
+ ret = PTR_ERR(handle);
+ else
+ bdev_release(handle);
+
+ /*
+ * If the open above failed early, GD_NEED_PART_SCAN is still set, and
+ * re-assembling a partitioned raid device would then wrongly create
+ * partitions for the underlying disk.
+ */
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
+ if (!(mode & BLK_OPEN_EXCL))
+ bd_abort_claiming(disk->part0, disk_scan_partitions);
+ return ret;
}
-EXPORT_SYMBOL(blk_unregister_region);
+/**
+ * device_add_disk - add disk information to kernel list
+ * @parent: parent device for the disk
+ * @disk: per-device partitioning information
+ * @groups: Additional per-device sysfs groups
+ *
+ * This function registers the partitioning information in @disk
+ * with the kernel.
+ */
+int __must_check device_add_disk(struct device *parent, struct gendisk *disk,
+ const struct attribute_group **groups)
-static struct kobject *exact_match(dev_t devt, int *partno, void *data)
{
- struct gendisk *p = data;
+ struct device *ddev = disk_to_dev(disk);
+ int ret;
- return &disk_to_dev(p)->kobj;
-}
+ /* Only makes sense for bio-based to set ->poll_bio */
+ if (queue_is_mq(disk->queue) && disk->fops->poll_bio)
+ return -EINVAL;
-static int exact_lock(dev_t devt, void *data)
-{
- struct gendisk *p = data;
+ /*
+ * The disk queue should now be all set with enough information about
+ * the device for the elevator code to pick an adequate default
+ * elevator if one is needed, that is, for devices requesting queue
+ * registration.
+ */
+ elevator_init_mq(disk->queue);
- if (!get_disk_and_module(p))
- return -1;
- return 0;
-}
+ /* Mark bdev as having a submit_bio, if needed */
+ disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL;
-static void register_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
-{
- struct device *ddev = disk_to_dev(disk);
- struct block_device *bdev;
- struct disk_part_iter piter;
- struct hd_struct *part;
- int err;
+ /*
+ * If the driver provides an explicit major number, it must also provide
+ * the number of minor numbers supported, and those will be used to
+ * set up the gendisk.
+ * Otherwise just allocate the device numbers for both the whole device
+ * and all partitions from the extended dev_t space.
+ */
+ ret = -EINVAL;
+ if (disk->major) {
+ if (WARN_ON(!disk->minors))
+ goto out_exit_elevator;
+
+ if (disk->minors > DISK_MAX_PARTS) {
+ pr_err("block: can't allocate more than %d partitions\n",
+ DISK_MAX_PARTS);
+ disk->minors = DISK_MAX_PARTS;
+ }
+ if (disk->first_minor > MINORMASK ||
+ disk->minors > MINORMASK + 1 ||
+ disk->first_minor + disk->minors > MINORMASK + 1)
+ goto out_exit_elevator;
+ } else {
+ if (WARN_ON(disk->minors))
+ goto out_exit_elevator;
+
+ ret = blk_alloc_ext_minor();
+ if (ret < 0)
+ goto out_exit_elevator;
+ disk->major = BLOCK_EXT_MAJOR;
+ disk->first_minor = ret;
+ }
- ddev->parent = parent;
+ /* delay uevents, until we scanned partition table */
+ dev_set_uevent_suppress(ddev, 1);
+ ddev->parent = parent;
+ ddev->groups = groups;
dev_set_name(ddev, "%s", disk->disk_name);
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ ddev->devt = MKDEV(disk->major, disk->first_minor);
+ ret = device_add(ddev);
+ if (ret)
+ goto out_free_ext_minor;
- /* delay uevents, until we scanned partition table */
- dev_set_uevent_suppress(ddev, 1);
+ ret = disk_alloc_events(disk);
+ if (ret)
+ goto out_device_del;
- if (groups) {
- WARN_ON(ddev->groups);
- ddev->groups = groups;
- }
- if (device_add(ddev))
- return;
- if (!sysfs_deprecated) {
- err = sysfs_create_link(block_depr, &ddev->kobj,
- kobject_name(&ddev->kobj));
- if (err) {
- device_del(ddev);
- return;
- }
- }
+ ret = sysfs_create_link(block_depr, &ddev->kobj,
+ kobject_name(&ddev->kobj));
+ if (ret)
+ goto out_device_del;
/*
* avoid probable deadlock caused by allocating memory with
@@ -716,229 +475,281 @@ static void register_disk(struct device *parent, struct gendisk *disk,
*/
pm_runtime_set_memalloc_noio(ddev, true);
- disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj);
+ disk->part0->bd_holder_dir =
+ kobject_create_and_add("holders", &ddev->kobj);
+ if (!disk->part0->bd_holder_dir) {
+ ret = -ENOMEM;
+ goto out_del_block_link;
+ }
disk->slave_dir = kobject_create_and_add("slaves", &ddev->kobj);
+ if (!disk->slave_dir) {
+ ret = -ENOMEM;
+ goto out_put_holder_dir;
+ }
+
+ ret = blk_register_queue(disk);
+ if (ret)
+ goto out_put_slave_dir;
+
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
+ ret = bdi_register(disk->bdi, "%u:%u",
+ disk->major, disk->first_minor);
+ if (ret)
+ goto out_unregister_queue;
+ bdi_set_owner(disk->bdi, ddev);
+ ret = sysfs_create_link(&ddev->kobj,
+ &disk->bdi->dev->kobj, "bdi");
+ if (ret)
+ goto out_unregister_bdi;
+
+ /* Make sure the first partition scan will proceed */
+ if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) &&
+ !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
+ set_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ bdev_add(disk->part0, ddev->devt);
+ if (get_capacity(disk))
+ disk_scan_partitions(disk, BLK_OPEN_READ);
- if (disk->flags & GENHD_FL_HIDDEN) {
+ /*
+ * Announce the disk and partitions after all partitions are
+ * created. (for hidden disks uevents remain suppressed forever)
+ */
dev_set_uevent_suppress(ddev, 0);
- return;
+ disk_uevent(disk, KOBJ_ADD);
+ } else {
+ /*
+ * Even if the block_device for a hidden gendisk is not
+ * registered, it needs to have a valid bd_dev so that the
+ * freeing of the dynamic major works.
+ */
+ disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor);
}
- /* No minors to use for partitions */
- if (!disk_part_scan_enabled(disk))
- goto exit;
-
- /* No such device (e.g., media were just removed) */
- if (!get_capacity(disk))
- goto exit;
-
- bdev = bdget_disk(disk, 0);
- if (!bdev)
- goto exit;
-
- bdev->bd_invalidated = 1;
- err = blkdev_get(bdev, FMODE_READ, NULL);
- if (err < 0)
- goto exit;
- blkdev_put(bdev, FMODE_READ);
-
-exit:
- /* announce disk after possible partitions are created */
- dev_set_uevent_suppress(ddev, 0);
- kobject_uevent(&ddev->kobj, KOBJ_ADD);
-
- /* announce possible partitions */
- disk_part_iter_init(&piter, disk, 0);
- while ((part = disk_part_iter_next(&piter)))
- kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD);
- disk_part_iter_exit(&piter);
-
- if (disk->queue->backing_dev_info->dev) {
- err = sysfs_create_link(&ddev->kobj,
- &disk->queue->backing_dev_info->dev->kobj,
- "bdi");
- WARN_ON(err);
- }
+ disk_update_readahead(disk);
+ disk_add_events(disk);
+ set_bit(GD_ADDED, &disk->state);
+ return 0;
+
+out_unregister_bdi:
+ if (!(disk->flags & GENHD_FL_HIDDEN))
+ bdi_unregister(disk->bdi);
+out_unregister_queue:
+ blk_unregister_queue(disk);
+ rq_qos_exit(disk->queue);
+out_put_slave_dir:
+ kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
+out_put_holder_dir:
+ kobject_put(disk->part0->bd_holder_dir);
+out_del_block_link:
+ sysfs_remove_link(block_depr, dev_name(ddev));
+ pm_runtime_set_memalloc_noio(ddev, false);
+out_device_del:
+ device_del(ddev);
+out_free_ext_minor:
+ if (disk->major == BLOCK_EXT_MAJOR)
+ blk_free_ext_minor(disk->first_minor);
+out_exit_elevator:
+ if (disk->queue->elevator)
+ elevator_exit(disk->queue);
+ return ret;
}
+EXPORT_SYMBOL(device_add_disk);
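Illustrative sketch, not part of the patch: the reworked device_add_disk() now returns an error that callers must handle. A minimal bio-based driver built on this API could look roughly like the following; blk_alloc_disk() is shown in the single-node-argument form used by the kernel generation this diff targets, and my_fops, my_probe and "myblk0" are placeholder names (a real driver would also provide a submit_bio handler).

	#include <linux/blkdev.h>
	#include <linux/module.h>

	static const struct block_device_operations my_fops = {
		.owner		= THIS_MODULE,
		/* .submit_bio	= my_submit_bio,  real drivers handle I/O here */
	};

	static int my_probe(struct device *parent)
	{
		struct gendisk *disk;
		int ret;

		disk = blk_alloc_disk(NUMA_NO_NODE);	/* disk owns its queue */
		if (!disk)
			return -ENOMEM;

		/* major/first_minor/minors left at 0: the core hands out an
		 * extended dev_t (see the out_free_ext_minor unwinding above).
		 */
		disk->fops = &my_fops;
		strscpy(disk->disk_name, "myblk0", sizeof(disk->disk_name));
		set_capacity(disk, 2048);	/* capacity in 512-byte sectors */

		ret = device_add_disk(parent, disk, NULL);
		if (ret)
			put_disk(disk);		/* drops the last ref, frees disk */
		return ret;
	}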
-/**
- * __device_add_disk - add disk information to kernel list
- * @parent: parent device for the disk
- * @disk: per-device partitioning information
- * @groups: Additional per-device sysfs groups
- * @register_queue: register the queue if set to true
- *
- * This function registers the partitioning information in @disk
- * with the kernel.
- *
- * FIXME: error handling
- */
-static void __device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups,
- bool register_queue)
+static void blk_report_disk_dead(struct gendisk *disk, bool surprise)
{
- dev_t devt;
- int retval;
+ struct block_device *bdev;
+ unsigned long idx;
/*
- * The disk queue should now be all set with enough information about
- * the device for the elevator code to pick an adequate default
- * elevator if one is needed, that is, for devices requesting queue
- * registration.
+ * On surprise disk removal, bdev_mark_dead() may call into file
+ * systems below. Make it clear that we're expecting to not hold
+ * disk->open_mutex.
*/
- if (register_queue)
- elevator_init_mq(disk->queue);
+ lockdep_assert_not_held(&disk->open_mutex);
- /* minors == 0 indicates to use ext devt from part0 and should
- * be accompanied with EXT_DEVT flag. Make sure all
- * parameters make sense.
- */
- WARN_ON(disk->minors && !(disk->major || disk->first_minor));
- WARN_ON(!disk->minors &&
- !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN)));
+ rcu_read_lock();
+ xa_for_each(&disk->part_tbl, idx, bdev) {
+ if (!kobject_get_unless_zero(&bdev->bd_device.kobj))
+ continue;
+ rcu_read_unlock();
- disk->flags |= GENHD_FL_UP;
+ bdev_mark_dead(bdev, surprise);
- retval = blk_alloc_devt(&disk->part0, &devt);
- if (retval) {
- WARN_ON(1);
- return;
+ put_device(&bdev->bd_device);
+ rcu_read_lock();
}
- disk->major = MAJOR(devt);
- disk->first_minor = MINOR(devt);
+ rcu_read_unlock();
+}
- disk_alloc_events(disk);
+static void __blk_mark_disk_dead(struct gendisk *disk)
+{
+ /*
+ * Fail any new I/O.
+ */
+ if (test_and_set_bit(GD_DEAD, &disk->state))
+ return;
- if (disk->flags & GENHD_FL_HIDDEN) {
- /*
- * Don't let hidden disks show up in /proc/partitions,
- * and don't bother scanning for partitions either.
- */
- disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO;
- disk->flags |= GENHD_FL_NO_PART_SCAN;
- } else {
- struct backing_dev_info *bdi = disk->queue->backing_dev_info;
- struct device *dev = disk_to_dev(disk);
- int ret;
-
- /* Register BDI before referencing it from bdev */
- dev->devt = devt;
- ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt));
- WARN_ON(ret);
- bdi_set_owner(bdi, dev);
- blk_register_region(disk_devt(disk), disk->minors, NULL,
- exact_match, exact_lock, disk);
- }
- register_disk(parent, disk, groups);
- if (register_queue)
- blk_register_queue(disk);
+ if (test_bit(GD_OWNS_QUEUE, &disk->state))
+ blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue);
/*
- * Take an extra ref on queue which will be put on disk_release()
- * so that it sticks around as long as @disk is there.
+ * Stop buffered writers from dirtying pages that can't be written out.
*/
- WARN_ON_ONCE(!blk_get_queue(disk->queue));
+ set_capacity(disk, 0);
- disk_add_events(disk);
- blk_integrity_add(disk);
+ /*
+ * Prevent new I/O from crossing bio_queue_enter().
+ */
+ blk_queue_start_drain(disk->queue);
}
-void device_add_disk(struct device *parent, struct gendisk *disk,
- const struct attribute_group **groups)
-
+/**
+ * blk_mark_disk_dead - mark a disk as dead
+ * @disk: disk to mark as dead
+ *
+ * Mark the disk as dead (e.g. surprise removed) and don't accept any new I/O
+ * to this disk.
+ */
+void blk_mark_disk_dead(struct gendisk *disk)
{
- __device_add_disk(parent, disk, groups, true);
+ __blk_mark_disk_dead(disk);
+ blk_report_disk_dead(disk, true);
}
-EXPORT_SYMBOL(device_add_disk);
+EXPORT_SYMBOL_GPL(blk_mark_disk_dead);
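Illustrative sketch, not part of the patch, of the intended caller: a driver that learns its device has vanished marks the disk dead first so pending and future I/O fails fast, then runs its ordinary removal path. struct my_dev and my_surprise_remove() are hypothetical.

	struct my_dev {
		struct gendisk *disk;
	};

	static void my_surprise_remove(struct my_dev *mydev)
	{
		/* Fail new I/O, zero the capacity and start draining the queue;
		 * blk_report_disk_dead() also tells the file systems on top.
		 */
		blk_mark_disk_dead(mydev->disk);

		/* The regular teardown still runs afterwards. */
		del_gendisk(mydev->disk);
		put_disk(mydev->disk);
	}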
-void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk)
+/**
+ * del_gendisk - remove the gendisk
+ * @disk: the struct gendisk to remove
+ *
+ * Removes the gendisk and all its associated resources. This deletes the
+ * partitions associated with the gendisk, and unregisters the associated
+ * request_queue.
+ *
+ * This is the counterpart to the respective __device_add_disk() call.
+ *
+ * The final removal of the struct gendisk happens when its refcount reaches 0
+ * with put_disk(), which should be called after del_gendisk(), if
+ * __device_add_disk() was used.
+ *
+ * Drivers exist which depend on the release of the gendisk being synchronous;
+ * it should not be deferred.
+ *
+ * Context: can sleep
+ */
+void del_gendisk(struct gendisk *disk)
{
- __device_add_disk(parent, disk, NULL, false);
-}
-EXPORT_SYMBOL(device_add_disk_no_queue_reg);
+ struct request_queue *q = disk->queue;
+ struct block_device *part;
+ unsigned long idx;
-static void invalidate_partition(struct gendisk *disk, int partno)
-{
- struct block_device *bdev;
+ might_sleep();
- bdev = bdget_disk(disk, partno);
- if (!bdev)
+ if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN)))
return;
- fsync_bdev(bdev);
- __invalidate_device(bdev, true);
+ disk_del_events(disk);
/*
- * Unhash the bdev inode for this device so that it gets evicted as soon
- * as last inode reference is dropped.
+	 * Prevent new openers by unhashing the bdev inode.
*/
- remove_inode_hash(bdev->bd_inode);
- bdput(bdev);
-}
-
-void del_gendisk(struct gendisk *disk)
-{
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- blk_integrity_del(disk);
- disk_del_events(disk);
+ mutex_lock(&disk->open_mutex);
+ xa_for_each(&disk->part_tbl, idx, part)
+ remove_inode_hash(part->bd_inode);
+ mutex_unlock(&disk->open_mutex);
/*
- * Block lookups of the disk until all bdevs are unhashed and the
- * disk is marked as dead (GENHD_FL_UP cleared).
+ * Tell the file system to write back all dirty data and shut down if
+ * it hasn't been notified earlier.
*/
- down_write(&disk->lookup_sem);
- /* invalidate stuff */
- disk_part_iter_init(&piter, disk,
- DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE);
- while ((part = disk_part_iter_next(&piter))) {
- invalidate_partition(disk, part->partno);
- delete_partition(disk, part);
- }
- disk_part_iter_exit(&piter);
+ if (!test_bit(GD_DEAD, &disk->state))
+ blk_report_disk_dead(disk, false);
+ __blk_mark_disk_dead(disk);
- invalidate_partition(disk, 0);
- set_capacity(disk, 0);
- disk->flags &= ~GENHD_FL_UP;
- up_write(&disk->lookup_sem);
+ /*
+ * Drop all partitions now that the disk is marked dead.
+ */
+ mutex_lock(&disk->open_mutex);
+ xa_for_each_start(&disk->part_tbl, idx, part, 1)
+ drop_partition(part);
+ mutex_unlock(&disk->open_mutex);
- if (!(disk->flags & GENHD_FL_HIDDEN))
+ if (!(disk->flags & GENHD_FL_HIDDEN)) {
sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi");
- if (disk->queue) {
+
/*
* Unregister bdi before releasing device numbers (as they can
* get reused and we'd get clashes in sysfs).
*/
- if (!(disk->flags & GENHD_FL_HIDDEN))
- bdi_unregister(disk->queue->backing_dev_info);
- blk_unregister_queue(disk);
- } else {
- WARN_ON(1);
+ bdi_unregister(disk->bdi);
}
- if (!(disk->flags & GENHD_FL_HIDDEN))
- blk_unregister_region(disk_devt(disk), disk->minors);
- /*
- * Remove gendisk pointer from idr so that it cannot be looked up
- * while RCU period before freeing gendisk is running to prevent
- * use-after-free issues. Note that the device number stays
- * "in-use" until we really free the gendisk.
- */
- blk_invalidate_devt(disk_devt(disk));
+ blk_unregister_queue(disk);
- kobject_put(disk->part0.holder_dir);
+ kobject_put(disk->part0->bd_holder_dir);
kobject_put(disk->slave_dir);
+ disk->slave_dir = NULL;
- part_stat_set_all(&disk->part0, 0);
- disk->part0.stamp = 0;
- if (!sysfs_deprecated)
- sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
+ part_stat_set_all(disk->part0, 0);
+ disk->part0->bd_stamp = 0;
+ sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk)));
pm_runtime_set_memalloc_noio(disk_to_dev(disk), false);
device_del(disk_to_dev(disk));
+
+ blk_mq_freeze_queue_wait(q);
+
+ blk_throtl_cancel_bios(disk);
+
+ blk_sync_queue(q);
+ blk_flush_integrity();
+
+ if (queue_is_mq(q))
+ blk_mq_cancel_work_sync(q);
+
+ blk_mq_quiesce_queue(q);
+ if (q->elevator) {
+ mutex_lock(&q->sysfs_lock);
+ elevator_exit(q);
+ mutex_unlock(&q->sysfs_lock);
+ }
+ rq_qos_exit(q);
+ blk_mq_unquiesce_queue(q);
+
+ /*
+ * If the disk does not own the queue, allow using passthrough requests
+ * again. Else leave the queue frozen to fail all I/O.
+ */
+ if (!test_bit(GD_OWNS_QUEUE, &disk->state)) {
+ blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q);
+ __blk_mq_unfreeze_queue(q, true);
+ } else {
+ if (queue_is_mq(q))
+ blk_mq_exit_queue(q);
+ }
}
EXPORT_SYMBOL(del_gendisk);
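Illustrative sketch, not part of the patch, of the teardown ordering the kernel-doc describes for a blk-mq driver: del_gendisk() before the final put_disk(), with the tag set allowed to go away in between once the disk had been added (compare the disk_release() comment further down). The structure and function names are placeholders.

	#include <linux/blk-mq.h>
	#include <linux/blkdev.h>

	struct my_mq_dev {
		struct blk_mq_tag_set	tag_set;
		struct gendisk		*disk;
	};

	static void my_remove(struct my_mq_dev *mydev)
	{
		del_gendisk(mydev->disk);		/* synchronous, may sleep */
		blk_mq_free_tag_set(&mydev->tag_set);	/* ok once add_disk succeeded */
		put_disk(mydev->disk);			/* final ref: frees disk and queue */
	}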
+/**
+ * invalidate_disk - invalidate the disk
+ * @disk: the struct gendisk to invalidate
+ *
+ * A helper to invalidate the disk. It cleans the disk's associated
+ * buffer/page caches and resets its internal state so that the disk
+ * can be reused by drivers.
+ *
+ * Context: can sleep
+ */
+void invalidate_disk(struct gendisk *disk)
+{
+ struct block_device *bdev = disk->part0;
+
+ invalidate_bdev(bdev);
+ bdev->bd_inode->i_mapping->wb_err = 0;
+ set_capacity(disk, 0);
+}
+EXPORT_SYMBOL(invalidate_disk);
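Illustrative sketch, not part of the patch: a loop-style driver detaching its backing file would use the new helper to flush caches, clear the stored writeback error and reset the capacity before the gendisk is reused. The surrounding structure and the fput() of a backing file are assumptions about such a driver, not something this patch introduces.

	#include <linux/blkdev.h>
	#include <linux/file.h>

	struct my_loopish_dev {
		struct gendisk	*disk;
		struct file	*backing;
	};

	static void my_detach_backing(struct my_loopish_dev *mydev)
	{
		invalidate_disk(mydev->disk);	/* drop page cache, wb_err, capacity */
		fput(mydev->backing);
		mydev->backing = NULL;
	}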
+
/* sysfs access to bad-blocks list. */
static ssize_t disk_badblocks_show(struct device *dev,
struct device_attribute *attr,
@@ -964,135 +775,27 @@ static ssize_t disk_badblocks_store(struct device *dev,
return badblocks_store(disk->bb, page, len, 0);
}
-/**
- * get_gendisk - get partitioning information for a given device
- * @devt: device to get partitioning information for
- * @partno: returned partition index
- *
- * This function gets the structure containing partitioning
- * information for the given device @devt.
- */
-struct gendisk *get_gendisk(dev_t devt, int *partno)
+#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD
+void blk_request_module(dev_t devt)
{
- struct gendisk *disk = NULL;
-
- if (MAJOR(devt) != BLOCK_EXT_MAJOR) {
- struct kobject *kobj;
-
- kobj = kobj_lookup(bdev_map, devt, partno);
- if (kobj)
- disk = dev_to_disk(kobj_to_dev(kobj));
- } else {
- struct hd_struct *part;
+ unsigned int major = MAJOR(devt);
+ struct blk_major_name **n;
- spin_lock_bh(&ext_devt_lock);
- part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt)));
- if (part && get_disk_and_module(part_to_disk(part))) {
- *partno = part->partno;
- disk = part_to_disk(part);
+ mutex_lock(&major_names_lock);
+ for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) {
+ if ((*n)->major == major && (*n)->probe) {
+ (*n)->probe(devt);
+ mutex_unlock(&major_names_lock);
+ return;
}
- spin_unlock_bh(&ext_devt_lock);
}
+ mutex_unlock(&major_names_lock);
- if (!disk)
- return NULL;
-
- /*
- * Synchronize with del_gendisk() to not return disk that is being
- * destroyed.
- */
- down_read(&disk->lookup_sem);
- if (unlikely((disk->flags & GENHD_FL_HIDDEN) ||
- !(disk->flags & GENHD_FL_UP))) {
- up_read(&disk->lookup_sem);
- put_disk_and_module(disk);
- disk = NULL;
- } else {
- up_read(&disk->lookup_sem);
- }
- return disk;
-}
-
-/**
- * bdget_disk - do bdget() by gendisk and partition number
- * @disk: gendisk of interest
- * @partno: partition number
- *
- * Find partition @partno from @disk, do bdget() on it.
- *
- * CONTEXT:
- * Don't care.
- *
- * RETURNS:
- * Resulting block_device on success, NULL on failure.
- */
-struct block_device *bdget_disk(struct gendisk *disk, int partno)
-{
- struct hd_struct *part;
- struct block_device *bdev = NULL;
-
- part = disk_get_part(disk, partno);
- if (part)
- bdev = bdget(part_devt(part));
- disk_put_part(part);
-
- return bdev;
-}
-EXPORT_SYMBOL(bdget_disk);
-
-/*
- * print a full list of all partitions - intended for places where the root
- * filesystem can't be mounted and thus to give the victim some idea of what
- * went wrong
- */
-void __init printk_all_partitions(void)
-{
- struct class_dev_iter iter;
- struct device *dev;
-
- class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
- while ((dev = class_dev_iter_next(&iter))) {
- struct gendisk *disk = dev_to_disk(dev);
- struct disk_part_iter piter;
- struct hd_struct *part;
- char name_buf[BDEVNAME_SIZE];
- char devt_buf[BDEVT_SIZE];
-
- /*
- * Don't show empty devices or things that have been
- * suppressed
- */
- if (get_capacity(disk) == 0 ||
- (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO))
- continue;
-
- /*
- * Note, unlike /proc/partitions, I am showing the
- * numbers in hex - the same format as the root=
- * option takes.
- */
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0);
- while ((part = disk_part_iter_next(&piter))) {
- bool is_part0 = part == &disk->part0;
-
- printk("%s%s %10llu %s %s", is_part0 ? "" : " ",
- bdevt_str(part_devt(part), devt_buf),
- (unsigned long long)part_nr_sects_read(part) >> 1
- , disk_name(disk, part->partno, name_buf),
- part->info ? part->info->uuid : "");
- if (is_part0) {
- if (dev->parent && dev->parent->driver)
- printk(" driver: %s\n",
- dev->parent->driver->name);
- else
- printk(" (driver?)\n");
- } else
- printk("\n");
- }
- disk_part_iter_exit(&piter);
- }
- class_dev_iter_exit(&iter);
+ if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
+ /* Make old-style 2.4 aliases work */
+ request_module("block-major-%d", MAJOR(devt));
}
+#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */
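For illustration, not part of the patch: blk_request_module() resolves the module through "block-major-M-m" aliases, so a legacy driver with a fixed major opts into this autoloading with the long-standing alias macros. FLOPPY_MAJOR is only an example of such a fixed major.

	#include <linux/kdev_t.h>
	#include <linux/major.h>
	#include <linux/module.h>

	/* Expands to MODULE_ALIAS("block-major-2-*"), matching the
	 * request_module("block-major-%d-%d", ...) call issued above.
	 */
	MODULE_ALIAS_BLOCKDEV_MAJOR(FLOPPY_MAJOR);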
#ifdef CONFIG_PROC_FS
/* iterator */
@@ -1154,26 +857,21 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos)
static int show_partition(struct seq_file *seqf, void *v)
{
struct gendisk *sgp = v;
- struct disk_part_iter piter;
- struct hd_struct *part;
- char buf[BDEVNAME_SIZE];
+ struct block_device *part;
+ unsigned long idx;
- /* Don't show non-partitionable removeable devices or empty devices */
- if (!get_capacity(sgp) || (!disk_max_parts(sgp) &&
- (sgp->flags & GENHD_FL_REMOVABLE)))
- return 0;
- if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)
+ if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN))
return 0;
- /* show the full disk and all non-0 size partitions of it */
- disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0);
- while ((part = disk_part_iter_next(&piter)))
- seq_printf(seqf, "%4d %7d %10llu %s\n",
- MAJOR(part_devt(part)), MINOR(part_devt(part)),
- (unsigned long long)part_nr_sects_read(part) >> 1,
- disk_name(sgp, part->partno, buf));
- disk_part_iter_exit(&piter);
-
+ rcu_read_lock();
+ xa_for_each(&sgp->part_tbl, idx, part) {
+ if (!bdev_nr_sectors(part))
+ continue;
+ seq_printf(seqf, "%4d %7d %10llu %pg\n",
+ MAJOR(part->bd_dev), MINOR(part->bd_dev),
+ bdev_nr_sectors(part) >> 1, part);
+ }
+ rcu_read_unlock();
return 0;
}
@@ -1185,31 +883,19 @@ static const struct seq_operations partitions_op = {
};
#endif
-
-static struct kobject *base_probe(dev_t devt, int *partno, void *data)
-{
- if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0)
- /* Make old-style 2.4 aliases work */
- request_module("block-major-%d", MAJOR(devt));
- return NULL;
-}
-
static int __init genhd_device_init(void)
{
int error;
- block_class.dev_kobj = sysfs_dev_block_kobj;
error = class_register(&block_class);
if (unlikely(error))
return error;
- bdev_map = kobj_map_init(base_probe, &block_class_lock);
blk_dev_init();
register_blkdev(BLOCK_EXT_MAJOR, "blkext");
/* create top-level block dir */
- if (!sysfs_deprecated)
- block_depr = kobject_create_and_add("block", NULL);
+ block_depr = kobject_create_and_add("block", NULL);
return 0;
}
@@ -1228,7 +914,8 @@ static ssize_t disk_ext_range_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", disk_max_parts(disk));
+ return sprintf(buf, "%d\n",
+ (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS);
}
static ssize_t disk_removable_show(struct device *dev,
@@ -1260,26 +947,28 @@ static ssize_t disk_ro_show(struct device *dev,
ssize_t part_size_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%llu\n",
- (unsigned long long)part_nr_sects_read(p));
+ return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev)));
}
ssize_t part_stat_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- struct request_queue *q = part_to_disk(p)->queue;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
struct disk_stats stat;
unsigned int inflight;
- part_stat_read_all(p, &stat);
if (queue_is_mq(q))
- inflight = blk_mq_in_flight(q, p);
+ inflight = blk_mq_in_flight(q, bdev);
else
- inflight = part_in_flight(q, p);
+ inflight = part_in_flight(bdev);
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(bdev, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(bdev, &stat);
return sprintf(buf,
"%8lu %8lu %8llu %8u "
"%8lu %8lu %8llu %8u "
@@ -1313,14 +1002,14 @@ ssize_t part_stat_show(struct device *dev,
ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- struct request_queue *q = part_to_disk(p)->queue;
+ struct block_device *bdev = dev_to_bdev(dev);
+ struct request_queue *q = bdev_get_queue(bdev);
unsigned int inflight[2];
if (queue_is_mq(q))
- blk_mq_in_flight_rw(q, p, inflight);
+ blk_mq_in_flight_rw(q, bdev, inflight);
else
- part_in_flight_rw(q, p, inflight);
+ part_in_flight_rw(bdev, inflight);
return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]);
}
@@ -1328,9 +1017,8 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr,
static ssize_t disk_capability_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct gendisk *disk = dev_to_disk(dev);
-
- return sprintf(buf, "%x\n", disk->flags);
+ dev_warn_once(dev, "the capability attribute has been deprecated.\n");
+ return sprintf(buf, "0\n");
}
static ssize_t disk_alignment_offset_show(struct device *dev,
@@ -1339,7 +1027,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue));
+ return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
}
static ssize_t disk_discard_alignment_show(struct device *dev,
@@ -1348,7 +1036,15 @@ static ssize_t disk_discard_alignment_show(struct device *dev,
{
struct gendisk *disk = dev_to_disk(dev);
- return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue));
+ return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0));
+}
+
+static ssize_t diskseq_show(struct device *dev,
+ struct device_attribute *attr, char *buf)
+{
+ struct gendisk *disk = dev_to_disk(dev);
+
+ return sprintf(buf, "%llu\n", disk->diskseq);
}
static DEVICE_ATTR(range, 0444, disk_range_show, NULL);
@@ -1363,25 +1059,23 @@ static DEVICE_ATTR(capability, 0444, disk_capability_show, NULL);
static DEVICE_ATTR(stat, 0444, part_stat_show, NULL);
static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL);
static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store);
+static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL);
#ifdef CONFIG_FAIL_MAKE_REQUEST
ssize_t part_fail_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%d\n", p->make_it_fail);
+ return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail);
}
ssize_t part_fail_store(struct device *dev,
struct device_attribute *attr,
const char *buf, size_t count)
{
- struct hd_struct *p = dev_to_part(dev);
int i;
if (count > 0 && sscanf(buf, "%d", &i) > 0)
- p->make_it_fail = (i == 0) ? 0 : 1;
+ dev_to_bdev(dev)->bd_make_it_fail = i;
return count;
}
@@ -1408,6 +1102,10 @@ static struct attribute *disk_attrs[] = {
&dev_attr_stat.attr,
&dev_attr_inflight.attr,
&dev_attr_badblocks.attr,
+ &dev_attr_events.attr,
+ &dev_attr_events_async.attr,
+ &dev_attr_events_poll_msecs.attr,
+ &dev_attr_diskseq.attr,
#ifdef CONFIG_FAIL_MAKE_REQUEST
&dev_attr_fail.attr,
#endif
@@ -1434,104 +1132,81 @@ static struct attribute_group disk_attr_group = {
static const struct attribute_group *disk_attr_groups[] = {
&disk_attr_group,
+#ifdef CONFIG_BLK_DEV_IO_TRACE
+ &blk_trace_attr_group,
+#endif
+#ifdef CONFIG_BLK_DEV_INTEGRITY
+ &blk_integrity_attr_group,
+#endif
NULL
};
/**
- * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way
- * @disk: disk to replace part_tbl for
- * @new_ptbl: new part_tbl to install
+ * disk_release - releases all allocated resources of the gendisk
+ * @dev: the device representing this disk
*
- * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The
- * original ptbl is freed using RCU callback.
+ * This function releases all allocated resources of the gendisk.
*
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
+ * Drivers which used __device_add_disk() have a gendisk with a request_queue
+ * assigned. Since the request_queue sits on top of the gendisk for these
+ * drivers, we also call blk_put_queue() for them; we expect the
+ * request_queue refcount to reach 0 at this point, so the request_queue
+ * will also be freed prior to the disk.
+ *
+ * Context: can sleep
*/
-static void disk_replace_part_tbl(struct gendisk *disk,
- struct disk_part_tbl *new_ptbl)
+static void disk_release(struct device *dev)
{
- struct disk_part_tbl *old_ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
+ struct gendisk *disk = dev_to_disk(dev);
- rcu_assign_pointer(disk->part_tbl, new_ptbl);
+ might_sleep();
+ WARN_ON_ONCE(disk_live(disk));
- if (old_ptbl) {
- rcu_assign_pointer(old_ptbl->last_lookup, NULL);
- kfree_rcu(old_ptbl, rcu_head);
- }
-}
-
-/**
- * disk_expand_part_tbl - expand disk->part_tbl
- * @disk: disk to expand part_tbl for
- * @partno: expand such that this partno can fit in
- *
- * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl
- * uses RCU to allow unlocked dereferencing for stats and other stuff.
- *
- * LOCKING:
- * Matching bd_mutex locked or the caller is the only user of @disk.
- * Might sleep.
- *
- * RETURNS:
- * 0 on success, -errno on failure.
- */
-int disk_expand_part_tbl(struct gendisk *disk, int partno)
-{
- struct disk_part_tbl *old_ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
- struct disk_part_tbl *new_ptbl;
- int len = old_ptbl ? old_ptbl->len : 0;
- int i, target;
+ blk_trace_remove(disk->queue);
/*
- * check for int overflow, since we can get here from blkpg_ioctl()
- * with a user passed 'partno'.
+	 * To undo all the initialization from blk_mq_init_allocated_queue in
+	 * case of a probe failure where add_disk is never called, we have to
+ * call blk_mq_exit_queue here. We can't do this for the more common
+ * teardown case (yet) as the tagset can be gone by the time the disk
+ * is released once it was added.
*/
- target = partno + 1;
- if (target < 0)
- return -EINVAL;
+ if (queue_is_mq(disk->queue) &&
+ test_bit(GD_OWNS_QUEUE, &disk->state) &&
+ !test_bit(GD_ADDED, &disk->state))
+ blk_mq_exit_queue(disk->queue);
- /* disk_max_parts() is zero during initialization, ignore if so */
- if (disk_max_parts(disk) && target > disk_max_parts(disk))
- return -EINVAL;
+ blkcg_exit_disk(disk);
- if (target <= len)
- return 0;
+ bioset_exit(&disk->bio_split);
- new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL,
- disk->node_id);
- if (!new_ptbl)
- return -ENOMEM;
+ disk_release_events(disk);
+ kfree(disk->random);
+ disk_free_zone_bitmaps(disk);
+ xa_destroy(&disk->part_tbl);
- new_ptbl->len = target;
+ disk->queue->disk = NULL;
+ blk_put_queue(disk->queue);
- for (i = 0; i < len; i++)
- rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]);
+ if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk)
+ disk->fops->free_disk(disk);
- disk_replace_part_tbl(disk, new_ptbl);
- return 0;
+ iput(disk->part0->bd_inode); /* frees the disk */
}
-static void disk_release(struct device *dev)
+static int block_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
- struct gendisk *disk = dev_to_disk(dev);
+ const struct gendisk *disk = dev_to_disk(dev);
- blk_free_devt(dev->devt);
- disk_release_events(disk);
- kfree(disk->random);
- disk_replace_part_tbl(disk, NULL);
- hd_free_part(&disk->part0);
- if (disk->queue)
- blk_put_queue(disk->queue);
- kfree(disk);
+ return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq);
}
+
struct class block_class = {
.name = "block",
+ .dev_uevent = block_uevent,
};
-static char *block_devnode(struct device *dev, umode_t *mode,
+static char *block_devnode(const struct device *dev, umode_t *mode,
kuid_t *uid, kgid_t *gid)
{
struct gendisk *disk = dev_to_disk(dev);
@@ -1541,7 +1216,7 @@ static char *block_devnode(struct device *dev, umode_t *mode,
return NULL;
}
-static const struct device_type disk_type = {
+const struct device_type disk_type = {
.name = "disk",
.groups = disk_attr_groups,
.release = disk_release,
@@ -1559,11 +1234,10 @@ static const struct device_type disk_type = {
static int diskstats_show(struct seq_file *seqf, void *v)
{
struct gendisk *gp = v;
- struct disk_part_iter piter;
- struct hd_struct *hd;
- char buf[BDEVNAME_SIZE];
+ struct block_device *hd;
unsigned int inflight;
struct disk_stats stat;
+ unsigned long idx;
/*
if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next)
@@ -1573,23 +1247,29 @@ static int diskstats_show(struct seq_file *seqf, void *v)
"\n\n");
*/
- disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0);
- while ((hd = disk_part_iter_next(&piter))) {
- part_stat_read_all(hd, &stat);
+ rcu_read_lock();
+ xa_for_each(&gp->part_tbl, idx, hd) {
+ if (bdev_is_partition(hd) && !bdev_nr_sectors(hd))
+ continue;
if (queue_is_mq(gp->queue))
inflight = blk_mq_in_flight(gp->queue, hd);
else
- inflight = part_in_flight(gp->queue, hd);
+ inflight = part_in_flight(hd);
- seq_printf(seqf, "%4d %7d %s "
+ if (inflight) {
+ part_stat_lock();
+ update_io_ticks(hd, jiffies, true);
+ part_stat_unlock();
+ }
+ part_stat_read_all(hd, &stat);
+ seq_printf(seqf, "%4d %7d %pg "
"%lu %lu %lu %u "
"%lu %lu %lu %u "
"%u %u %u "
"%lu %lu %lu %u "
"%lu %u"
"\n",
- MAJOR(part_devt(hd)), MINOR(part_devt(hd)),
- disk_name(gp, hd->partno, buf),
+ MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd,
stat.ios[STAT_READ],
stat.merges[STAT_READ],
stat.sectors[STAT_READ],
@@ -1617,7 +1297,7 @@ static int diskstats_show(struct seq_file *seqf, void *v)
NSEC_PER_MSEC)
);
}
- disk_part_iter_exit(&piter);
+ rcu_read_unlock();
return 0;
}
@@ -1638,137 +1318,118 @@ static int __init proc_genhd_init(void)
module_init(proc_genhd_init);
#endif /* CONFIG_PROC_FS */
-dev_t blk_lookup_devt(const char *name, int partno)
+dev_t part_devt(struct gendisk *disk, u8 partno)
{
- dev_t devt = MKDEV(0, 0);
- struct class_dev_iter iter;
- struct device *dev;
+ struct block_device *part;
+ dev_t devt = 0;
- class_dev_iter_init(&iter, &block_class, NULL, &disk_type);
- while ((dev = class_dev_iter_next(&iter))) {
- struct gendisk *disk = dev_to_disk(dev);
- struct hd_struct *part;
-
- if (strcmp(dev_name(dev), name))
- continue;
+ rcu_read_lock();
+ part = xa_load(&disk->part_tbl, partno);
+ if (part)
+ devt = part->bd_dev;
+ rcu_read_unlock();
- if (partno < disk->minors) {
- /* We need to return the right devno, even
- * if the partition doesn't exist yet.
- */
- devt = MKDEV(MAJOR(dev->devt),
- MINOR(dev->devt) + partno);
- break;
- }
- part = disk_get_part(disk, partno);
- if (part) {
- devt = part_devt(part);
- disk_put_part(part);
- break;
- }
- disk_put_part(part);
- }
- class_dev_iter_exit(&iter);
return devt;
}
-struct gendisk *__alloc_disk_node(int minors, int node_id)
+struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
+ struct lock_class_key *lkclass)
{
struct gendisk *disk;
- struct disk_part_tbl *ptbl;
-
- if (minors > DISK_MAX_PARTS) {
- printk(KERN_ERR
- "block: can't allocate more than %d partitions\n",
- DISK_MAX_PARTS);
- minors = DISK_MAX_PARTS;
- }
disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id);
- if (disk) {
- disk->part0.dkstats = alloc_percpu(struct disk_stats);
- if (!disk->part0.dkstats) {
- kfree(disk);
- return NULL;
- }
- init_rwsem(&disk->lookup_sem);
- disk->node_id = node_id;
- if (disk_expand_part_tbl(disk, 0)) {
- free_percpu(disk->part0.dkstats);
- kfree(disk);
- return NULL;
- }
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
- rcu_assign_pointer(ptbl->part[0], &disk->part0);
-
- /*
- * set_capacity() and get_capacity() currently don't use
- * seqcounter to read/update the part0->nr_sects. Still init
- * the counter as we can read the sectors in IO submission
- * patch using seqence counters.
- *
- * TODO: Ideally set_capacity() and get_capacity() should be
- * converted to make use of bd_mutex and sequence counters.
- */
- hd_sects_seq_init(&disk->part0);
- if (hd_ref_init(&disk->part0)) {
- hd_free_part(&disk->part0);
- kfree(disk);
- return NULL;
- }
+ if (!disk)
+ return NULL;
- disk->minors = minors;
- rand_initialize_disk(disk);
- disk_to_dev(disk)->class = &block_class;
- disk_to_dev(disk)->type = &disk_type;
- device_initialize(disk_to_dev(disk));
- }
+ if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0))
+ goto out_free_disk;
+
+ disk->bdi = bdi_alloc(node_id);
+ if (!disk->bdi)
+ goto out_free_bioset;
+
+ /* bdev_alloc() might need the queue, set before the first call */
+ disk->queue = q;
+
+ disk->part0 = bdev_alloc(disk, 0);
+ if (!disk->part0)
+ goto out_free_bdi;
+
+ disk->node_id = node_id;
+ mutex_init(&disk->open_mutex);
+ xa_init(&disk->part_tbl);
+ if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL))
+ goto out_destroy_part_tbl;
+
+ if (blkcg_init_disk(disk))
+ goto out_erase_part0;
+
+ rand_initialize_disk(disk);
+ disk_to_dev(disk)->class = &block_class;
+ disk_to_dev(disk)->type = &disk_type;
+ device_initialize(disk_to_dev(disk));
+ inc_diskseq(disk);
+ q->disk = disk;
+ lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0);
+#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED
+ INIT_LIST_HEAD(&disk->slave_bdevs);
+#endif
return disk;
+
+out_erase_part0:
+ xa_erase(&disk->part_tbl, 0);
+out_destroy_part_tbl:
+ xa_destroy(&disk->part_tbl);
+ disk->part0->bd_disk = NULL;
+ iput(disk->part0->bd_inode);
+out_free_bdi:
+ bdi_put(disk->bdi);
+out_free_bioset:
+ bioset_exit(&disk->bio_split);
+out_free_disk:
+ kfree(disk);
+ return NULL;
}
-EXPORT_SYMBOL(__alloc_disk_node);
-struct kobject *get_disk_and_module(struct gendisk *disk)
+struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass)
{
- struct module *owner;
- struct kobject *kobj;
+ struct request_queue *q;
+ struct gendisk *disk;
- if (!disk->fops)
+ q = blk_alloc_queue(node);
+ if (!q)
return NULL;
- owner = disk->fops->owner;
- if (owner && !try_module_get(owner))
- return NULL;
- kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj);
- if (kobj == NULL) {
- module_put(owner);
+
+ disk = __alloc_disk_node(q, node, lkclass);
+ if (!disk) {
+ blk_put_queue(q);
return NULL;
}
- return kobj;
-
+ set_bit(GD_OWNS_QUEUE, &disk->state);
+ return disk;
}
-EXPORT_SYMBOL(get_disk_and_module);
+EXPORT_SYMBOL(__blk_alloc_disk);
+/**
+ * put_disk - decrements the gendisk refcount
+ * @disk: the struct gendisk to decrement the refcount for
+ *
+ * This decrements the refcount for the struct gendisk. When this reaches 0,
+ * disk_release() is called.
+ *
+ * Note: for blk-mq disks, put_disk() must be called before freeing the tag_set
+ * when handling probe errors (that is, before add_disk() is called).
+ *
+ * Context: Any context, but the last reference must not be dropped from
+ * atomic context.
+ */
void put_disk(struct gendisk *disk)
{
if (disk)
- kobject_put(&disk_to_dev(disk)->kobj);
+ put_device(disk_to_dev(disk));
}
EXPORT_SYMBOL(put_disk);
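Illustrative sketch, not part of the patch, of the probe-error ordering the note above refers to: when device_add_disk() was never called or failed, put_disk() must come before blk_mq_free_tag_set(). blk_mq_alloc_disk() is shown in the two-argument form of this kernel generation; the tag-set fields (.ops, .queue_depth, .nr_hw_queues, ...) and the disk naming are assumed to be set up elsewhere, and all names are placeholders.

	#include <linux/blk-mq.h>
	#include <linux/err.h>

	struct my_mq_dev {
		struct blk_mq_tag_set	tag_set;
		struct gendisk		*disk;
	};

	static int my_mq_probe(struct my_mq_dev *mydev)
	{
		int ret;

		ret = blk_mq_alloc_tag_set(&mydev->tag_set);	/* fields pre-filled */
		if (ret)
			return ret;

		mydev->disk = blk_mq_alloc_disk(&mydev->tag_set, mydev);
		if (IS_ERR(mydev->disk)) {
			ret = PTR_ERR(mydev->disk);
			goto out_free_tag_set;
		}

		/* ... set disk_name, fops, capacity, then: */
		ret = device_add_disk(NULL, mydev->disk, NULL);
		if (ret)
			goto out_put_disk;
		return 0;

	out_put_disk:
		put_disk(mydev->disk);			/* before the tag set vanishes */
	out_free_tag_set:
		blk_mq_free_tag_set(&mydev->tag_set);
		return ret;
	}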
-/*
- * This is a counterpart of get_disk_and_module() and thus also of
- * get_gendisk().
- */
-void put_disk_and_module(struct gendisk *disk)
-{
- if (disk) {
- struct module *owner = disk->fops->owner;
-
- put_disk(disk);
- module_put(owner);
- }
-}
-EXPORT_SYMBOL(put_disk_and_module);
-
static void set_disk_ro_uevent(struct gendisk *gd, int ro)
{
char event[] = "DISK_RO=1";
@@ -1779,500 +1440,29 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro)
kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp);
}
-void set_device_ro(struct block_device *bdev, int flag)
-{
- bdev->bd_part->policy = flag;
-}
-
-EXPORT_SYMBOL(set_device_ro);
-
-void set_disk_ro(struct gendisk *disk, int flag)
-{
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- if (disk->part0.policy != flag) {
- set_disk_ro_uevent(disk, flag);
- disk->part0.policy = flag;
- }
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter)))
- part->policy = flag;
- disk_part_iter_exit(&piter);
-}
-
-EXPORT_SYMBOL(set_disk_ro);
-
-int bdev_read_only(struct block_device *bdev)
-{
- if (!bdev)
- return 0;
- return bdev->bd_part->policy;
-}
-
-EXPORT_SYMBOL(bdev_read_only);
-
-/*
- * Disk events - monitor disk events like media change and eject request.
- */
-struct disk_events {
- struct list_head node; /* all disk_event's */
- struct gendisk *disk; /* the associated disk */
- spinlock_t lock;
-
- struct mutex block_mutex; /* protects blocking */
- int block; /* event blocking depth */
- unsigned int pending; /* events already sent out */
- unsigned int clearing; /* events being cleared */
-
- long poll_msecs; /* interval, -1 for default */
- struct delayed_work dwork;
-};
-
-static const char *disk_events_strs[] = {
- [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change",
- [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request",
-};
-
-static char *disk_uevents[] = {
- [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1",
- [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1",
-};
-
-/* list of all disk_events */
-static DEFINE_MUTEX(disk_events_mutex);
-static LIST_HEAD(disk_events);
-
-/* disable in-kernel polling by default */
-static unsigned long disk_events_dfl_poll_msecs;
-
-static unsigned long disk_events_poll_jiffies(struct gendisk *disk)
-{
- struct disk_events *ev = disk->ev;
- long intv_msecs = 0;
-
- /*
- * If device-specific poll interval is set, always use it. If
- * the default is being used, poll if the POLL flag is set.
- */
- if (ev->poll_msecs >= 0)
- intv_msecs = ev->poll_msecs;
- else if (disk->event_flags & DISK_EVENT_FLAG_POLL)
- intv_msecs = disk_events_dfl_poll_msecs;
-
- return msecs_to_jiffies(intv_msecs);
-}
-
/**
- * disk_block_events - block and flush disk event checking
- * @disk: disk to block events for
+ * set_disk_ro - set a gendisk read-only
+ * @disk: gendisk to operate on
+ * @read_only: %true to set the disk read-only, %false to set the disk read/write
*
- * On return from this function, it is guaranteed that event checking
- * isn't in progress and won't happen until unblocked by
- * disk_unblock_events(). Events blocking is counted and the actual
- * unblocking happens after the matching number of unblocks are done.
- *
- * Note that this intentionally does not block event checking from
- * disk_clear_events().
- *
- * CONTEXT:
- * Might sleep.
+ * This function is used to indicate whether a given disk device should have its
+ * read-only flag set. set_disk_ro() is typically used by device drivers to
+ * indicate whether the underlying physical device is write-protected.
*/
-void disk_block_events(struct gendisk *disk)
+void set_disk_ro(struct gendisk *disk, bool read_only)
{
- struct disk_events *ev = disk->ev;
- unsigned long flags;
- bool cancel;
-
- if (!ev)
- return;
-
- /*
- * Outer mutex ensures that the first blocker completes canceling
- * the event work before further blockers are allowed to finish.
- */
- mutex_lock(&ev->block_mutex);
-
- spin_lock_irqsave(&ev->lock, flags);
- cancel = !ev->block++;
- spin_unlock_irqrestore(&ev->lock, flags);
-
- if (cancel)
- cancel_delayed_work_sync(&disk->ev->dwork);
-
- mutex_unlock(&ev->block_mutex);
-}
-
-static void __disk_unblock_events(struct gendisk *disk, bool check_now)
-{
- struct disk_events *ev = disk->ev;
- unsigned long intv;
- unsigned long flags;
-
- spin_lock_irqsave(&ev->lock, flags);
-
- if (WARN_ON_ONCE(ev->block <= 0))
- goto out_unlock;
-
- if (--ev->block)
- goto out_unlock;
-
- intv = disk_events_poll_jiffies(disk);
- if (check_now)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, 0);
- else if (intv)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, intv);
-out_unlock:
- spin_unlock_irqrestore(&ev->lock, flags);
-}
-
-/**
- * disk_unblock_events - unblock disk event checking
- * @disk: disk to unblock events for
- *
- * Undo disk_block_events(). When the block count reaches zero, it
- * starts events polling if configured.
- *
- * CONTEXT:
- * Don't care. Safe to call from irq context.
- */
-void disk_unblock_events(struct gendisk *disk)
-{
- if (disk->ev)
- __disk_unblock_events(disk, false);
-}
-
-/**
- * disk_flush_events - schedule immediate event checking and flushing
- * @disk: disk to check and flush events for
- * @mask: events to flush
- *
- * Schedule immediate event checking on @disk if not blocked. Events in
- * @mask are scheduled to be cleared from the driver. Note that this
- * doesn't clear the events from @disk->ev.
- *
- * CONTEXT:
- * If @mask is non-zero must be called with bdev->bd_mutex held.
- */
-void disk_flush_events(struct gendisk *disk, unsigned int mask)
-{
- struct disk_events *ev = disk->ev;
-
- if (!ev)
- return;
-
- spin_lock_irq(&ev->lock);
- ev->clearing |= mask;
- if (!ev->block)
- mod_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, 0);
- spin_unlock_irq(&ev->lock);
-}
-
-/**
- * disk_clear_events - synchronously check, clear and return pending events
- * @disk: disk to fetch and clear events from
- * @mask: mask of events to be fetched and cleared
- *
- * Disk events are synchronously checked and pending events in @mask
- * are cleared and returned. This ignores the block count.
- *
- * CONTEXT:
- * Might sleep.
- */
-unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask)
-{
- const struct block_device_operations *bdops = disk->fops;
- struct disk_events *ev = disk->ev;
- unsigned int pending;
- unsigned int clearing = mask;
-
- if (!ev) {
- /* for drivers still using the old ->media_changed method */
- if ((mask & DISK_EVENT_MEDIA_CHANGE) &&
- bdops->media_changed && bdops->media_changed(disk))
- return DISK_EVENT_MEDIA_CHANGE;
- return 0;
- }
-
- disk_block_events(disk);
-
- /*
- * store the union of mask and ev->clearing on the stack so that the
- * race with disk_flush_events does not cause ambiguity (ev->clearing
- * can still be modified even if events are blocked).
- */
- spin_lock_irq(&ev->lock);
- clearing |= ev->clearing;
- ev->clearing = 0;
- spin_unlock_irq(&ev->lock);
-
- disk_check_events(ev, &clearing);
- /*
- * if ev->clearing is not 0, the disk_flush_events got called in the
- * middle of this function, so we want to run the workfn without delay.
- */
- __disk_unblock_events(disk, ev->clearing ? true : false);
-
- /* then, fetch and clear pending events */
- spin_lock_irq(&ev->lock);
- pending = ev->pending & mask;
- ev->pending &= ~mask;
- spin_unlock_irq(&ev->lock);
- WARN_ON_ONCE(clearing & mask);
-
- return pending;
-}
-
-/*
- * Separate this part out so that a different pointer for clearing_ptr can be
- * passed in for disk_clear_events.
- */
-static void disk_events_workfn(struct work_struct *work)
-{
- struct delayed_work *dwork = to_delayed_work(work);
- struct disk_events *ev = container_of(dwork, struct disk_events, dwork);
-
- disk_check_events(ev, &ev->clearing);
-}
-
-static void disk_check_events(struct disk_events *ev,
- unsigned int *clearing_ptr)
-{
- struct gendisk *disk = ev->disk;
- char *envp[ARRAY_SIZE(disk_uevents) + 1] = { };
- unsigned int clearing = *clearing_ptr;
- unsigned int events;
- unsigned long intv;
- int nr_events = 0, i;
-
- /* check events */
- events = disk->fops->check_events(disk, clearing);
-
- /* accumulate pending events and schedule next poll if necessary */
- spin_lock_irq(&ev->lock);
-
- events &= ~ev->pending;
- ev->pending |= events;
- *clearing_ptr &= ~clearing;
-
- intv = disk_events_poll_jiffies(disk);
- if (!ev->block && intv)
- queue_delayed_work(system_freezable_power_efficient_wq,
- &ev->dwork, intv);
-
- spin_unlock_irq(&ev->lock);
-
- /*
- * Tell userland about new events. Only the events listed in
- * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT
- * is set. Otherwise, events are processed internally but never
- * get reported to userland.
- */
- for (i = 0; i < ARRAY_SIZE(disk_uevents); i++)
- if ((events & disk->events & (1 << i)) &&
- (disk->event_flags & DISK_EVENT_FLAG_UEVENT))
- envp[nr_events++] = disk_uevents[i];
-
- if (nr_events)
- kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp);
-}
-
-/*
- * A disk events enabled device has the following sysfs nodes under
- * its /sys/block/X/ directory.
- *
- * events : list of all supported events
- * events_async : list of events which can be detected w/o polling
- * (always empty, only for backwards compatibility)
- * events_poll_msecs : polling interval, 0: disable, -1: system default
- */
-static ssize_t __disk_events_show(unsigned int events, char *buf)
-{
- const char *delim = "";
- ssize_t pos = 0;
- int i;
-
- for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++)
- if (events & (1 << i)) {
- pos += sprintf(buf + pos, "%s%s",
- delim, disk_events_strs[i]);
- delim = " ";
- }
- if (pos)
- pos += sprintf(buf + pos, "\n");
- return pos;
-}
-
-static ssize_t disk_events_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT))
- return 0;
-
- return __disk_events_show(disk->events, buf);
-}
-
-static ssize_t disk_events_async_show(struct device *dev,
- struct device_attribute *attr, char *buf)
-{
- return 0;
-}
-
-static ssize_t disk_events_poll_msecs_show(struct device *dev,
- struct device_attribute *attr,
- char *buf)
-{
- struct gendisk *disk = dev_to_disk(dev);
-
- if (!disk->ev)
- return sprintf(buf, "-1\n");
-
- return sprintf(buf, "%ld\n", disk->ev->poll_msecs);
-}
-
-static ssize_t disk_events_poll_msecs_store(struct device *dev,
- struct device_attribute *attr,
- const char *buf, size_t count)
-{
- struct gendisk *disk = dev_to_disk(dev);
- long intv;
-
- if (!count || !sscanf(buf, "%ld", &intv))
- return -EINVAL;
-
- if (intv < 0 && intv != -1)
- return -EINVAL;
-
- if (!disk->ev)
- return -ENODEV;
-
- disk_block_events(disk);
- disk->ev->poll_msecs = intv;
- __disk_unblock_events(disk, true);
-
- return count;
-}
-
-static const DEVICE_ATTR(events, 0444, disk_events_show, NULL);
-static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL);
-static const DEVICE_ATTR(events_poll_msecs, 0644,
- disk_events_poll_msecs_show,
- disk_events_poll_msecs_store);
-
-static const struct attribute *disk_events_attrs[] = {
- &dev_attr_events.attr,
- &dev_attr_events_async.attr,
- &dev_attr_events_poll_msecs.attr,
- NULL,
-};
-
-/*
- * The default polling interval can be specified by the kernel
- * parameter block.events_dfl_poll_msecs which defaults to 0
- * (disable). This can also be modified runtime by writing to
- * /sys/module/block/parameters/events_dfl_poll_msecs.
- */
-static int disk_events_set_dfl_poll_msecs(const char *val,
- const struct kernel_param *kp)
-{
- struct disk_events *ev;
- int ret;
-
- ret = param_set_ulong(val, kp);
- if (ret < 0)
- return ret;
-
- mutex_lock(&disk_events_mutex);
-
- list_for_each_entry(ev, &disk_events, node)
- disk_flush_events(ev->disk, 0);
-
- mutex_unlock(&disk_events_mutex);
-
- return 0;
-}
-
-static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = {
- .set = disk_events_set_dfl_poll_msecs,
- .get = param_get_ulong,
-};
-
-#undef MODULE_PARAM_PREFIX
-#define MODULE_PARAM_PREFIX "block."
-
-module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops,
- &disk_events_dfl_poll_msecs, 0644);
-
-/*
- * disk_{alloc|add|del|release}_events - initialize and destroy disk_events.
- */
-static void disk_alloc_events(struct gendisk *disk)
-{
- struct disk_events *ev;
-
- if (!disk->fops->check_events || !disk->events)
- return;
-
- ev = kzalloc(sizeof(*ev), GFP_KERNEL);
- if (!ev) {
- pr_warn("%s: failed to initialize events\n", disk->disk_name);
- return;
- }
-
- INIT_LIST_HEAD(&ev->node);
- ev->disk = disk;
- spin_lock_init(&ev->lock);
- mutex_init(&ev->block_mutex);
- ev->block = 1;
- ev->poll_msecs = -1;
- INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn);
-
- disk->ev = ev;
-}
-
-static void disk_add_events(struct gendisk *disk)
-{
- /* FIXME: error handling */
- if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0)
- pr_warn("%s: failed to create sysfs files for events\n",
- disk->disk_name);
-
- if (!disk->ev)
- return;
-
- mutex_lock(&disk_events_mutex);
- list_add_tail(&disk->ev->node, &disk_events);
- mutex_unlock(&disk_events_mutex);
-
- /*
- * Block count is initialized to 1 and the following initial
- * unblock kicks it into action.
- */
- __disk_unblock_events(disk, true);
-}
-
-static void disk_del_events(struct gendisk *disk)
-{
- if (disk->ev) {
- disk_block_events(disk);
-
- mutex_lock(&disk_events_mutex);
- list_del_init(&disk->ev->node);
- mutex_unlock(&disk_events_mutex);
+ if (read_only) {
+ if (test_and_set_bit(GD_READ_ONLY, &disk->state))
+ return;
+ } else {
+ if (!test_and_clear_bit(GD_READ_ONLY, &disk->state))
+ return;
}
-
- sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs);
+ set_disk_ro_uevent(disk, read_only);
}
+EXPORT_SYMBOL(set_disk_ro);
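Illustrative sketch, not part of the patch: a driver typically feeds a hardware write-protect indication straight into set_disk_ro(); struct my_dev and the handler name are hypothetical. The call is a no-op if the state is unchanged, and the user-visible read-only state also takes any per-bdev BLKROSET setting into account.

	struct my_dev {
		struct gendisk	*disk;
	};

	/* Hypothetical config-change handler of a driver with a write-protect tab. */
	static void my_update_write_protect(struct my_dev *mydev, bool write_protected)
	{
		set_disk_ro(mydev->disk, write_protected);	/* uevent only on change */
	}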
-static void disk_release_events(struct gendisk *disk)
+void inc_diskseq(struct gendisk *disk)
{
- /* the block count should be 1 from disk_del_events() */
- WARN_ON_ONCE(disk->ev && disk->ev->block != 1);
- kfree(disk->ev);
+ disk->diskseq = atomic64_inc_return(&diskseq);
}
diff --git a/block/holder.c b/block/holder.c
new file mode 100644
index 000000000000..37d18c13d958
--- /dev/null
+++ b/block/holder.c
@@ -0,0 +1,154 @@
+// SPDX-License-Identifier: GPL-2.0-only
+#include <linux/blkdev.h>
+#include <linux/slab.h>
+
+struct bd_holder_disk {
+ struct list_head list;
+ struct kobject *holder_dir;
+ int refcnt;
+};
+
+static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev,
+ struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+
+ list_for_each_entry(holder, &disk->slave_bdevs, list)
+ if (holder->holder_dir == bdev->bd_holder_dir)
+ return holder;
+ return NULL;
+}
+
+static int add_symlink(struct kobject *from, struct kobject *to)
+{
+ return sysfs_create_link(from, to, kobject_name(to));
+}
+
+static void del_symlink(struct kobject *from, struct kobject *to)
+{
+ sysfs_remove_link(from, kobject_name(to));
+}
+
+/**
+ * bd_link_disk_holder - create symlinks between holding disk and slave bdev
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * This function creates the following sysfs symlinks.
+ *
+ * - from "slaves" directory of the holder @disk to the claimed @bdev
+ * - from "holders" directory of the @bdev to the holder @disk
+ *
+ * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is
+ * passed to bd_link_disk_holder(), then:
+ *
+ * /sys/block/dm-0/slaves/sda --> /sys/block/sda
+ * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0
+ *
+ * The caller must have claimed @bdev before calling this function and
+ * ensure that both @bdev and @disk are valid during the creation and
+ * lifetime of these symlinks.
+ *
+ * CONTEXT:
+ * Might sleep.
+ *
+ * RETURNS:
+ * 0 on success, -errno on failure.
+ */
+int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+ int ret = 0;
+
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return -EINVAL;
+
+ if (bdev->bd_disk == disk)
+ return -EINVAL;
+
+ /*
+ * del_gendisk drops the initial reference to bd_holder_dir, so we
+ * need to keep our own here to allow for cleanup past that point.
+ */
+ mutex_lock(&bdev->bd_disk->open_mutex);
+ if (!disk_live(bdev->bd_disk)) {
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+ return -ENODEV;
+ }
+ kobject_get(bdev->bd_holder_dir);
+ mutex_unlock(&bdev->bd_disk->open_mutex);
+
+ mutex_lock(&disk->open_mutex);
+ WARN_ON_ONCE(!bdev->bd_holder);
+
+ holder = bd_find_holder_disk(bdev, disk);
+ if (holder) {
+ kobject_put(bdev->bd_holder_dir);
+ holder->refcnt++;
+ goto out_unlock;
+ }
+
+ holder = kzalloc(sizeof(*holder), GFP_KERNEL);
+ if (!holder) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+
+ INIT_LIST_HEAD(&holder->list);
+ holder->refcnt = 1;
+ holder->holder_dir = bdev->bd_holder_dir;
+
+ ret = add_symlink(disk->slave_dir, bdev_kobj(bdev));
+ if (ret)
+ goto out_free_holder;
+ ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj);
+ if (ret)
+ goto out_del_symlink;
+ list_add(&holder->list, &disk->slave_bdevs);
+
+ mutex_unlock(&disk->open_mutex);
+ return 0;
+
+out_del_symlink:
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+out_free_holder:
+ kfree(holder);
+out_unlock:
+ mutex_unlock(&disk->open_mutex);
+ if (ret)
+ kobject_put(bdev->bd_holder_dir);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(bd_link_disk_holder);
+
+/**
+ * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder()
+ * @bdev: the claimed slave bdev
+ * @disk: the holding disk
+ *
+ * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT.
+ *
+ * CONTEXT:
+ * Might sleep.
+ */
+void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk)
+{
+ struct bd_holder_disk *holder;
+
+ if (WARN_ON_ONCE(!disk->slave_dir))
+ return;
+
+ mutex_lock(&disk->open_mutex);
+ holder = bd_find_holder_disk(bdev, disk);
+ if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) {
+ del_symlink(disk->slave_dir, bdev_kobj(bdev));
+ del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj);
+ kobject_put(holder->holder_dir);
+ list_del_init(&holder->list);
+ kfree(holder);
+ }
+ mutex_unlock(&disk->open_mutex);
+}
+EXPORT_SYMBOL_GPL(bd_unlink_disk_holder);
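Illustrative sketch, not part of the patch: a stacking driver in the dm/md mold, built with CONFIG_BLOCK_HOLDER_DEPRECATED, pairs the two calls around a lower device it has already claimed (e.g. via blkdev_get_by_dev() with itself as holder); the unlink must happen before that claim is released. The wrapper names are placeholders.

	#include <linux/blkdev.h>

	/* @top already owns @lower via an earlier claim; publish the relationship. */
	static int my_stack_add_lower(struct gendisk *top, struct block_device *lower)
	{
		return bd_link_disk_holder(lower, top);
	}

	/* Undo this before the claim on @lower is dropped. */
	static void my_stack_del_lower(struct gendisk *top, struct block_device *lower)
	{
		bd_unlink_disk_holder(lower, top);
	}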
diff --git a/block/ioctl.c b/block/ioctl.c
index bdb3bbb253d9..438f79c564cf 100644
--- a/block/ioctl.c
+++ b/block/ioctl.c
@@ -16,42 +16,37 @@
static int blkpg_do_ioctl(struct block_device *bdev,
struct blkpg_partition __user *upart, int op)
{
+ struct gendisk *disk = bdev->bd_disk;
struct blkpg_partition p;
- long long start, length;
+ sector_t start, length;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (copy_from_user(&p, upart, sizeof(struct blkpg_partition)))
return -EFAULT;
- if (bdev != bdev->bd_contains)
+ if (bdev_is_partition(bdev))
return -EINVAL;
if (p.pno <= 0)
return -EINVAL;
if (op == BLKPG_DEL_PARTITION)
- return bdev_del_partition(bdev, p.pno);
+ return bdev_del_partition(disk, p.pno);
+
+ if (p.start < 0 || p.length <= 0 || p.start + p.length < 0)
+ return -EINVAL;
+ /* Check that the partition is aligned to the block size */
+ if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev)))
+ return -EINVAL;
start = p.start >> SECTOR_SHIFT;
length = p.length >> SECTOR_SHIFT;
- /* check for fit in a hd_struct */
- if (sizeof(sector_t) < sizeof(long long)) {
- long pstart = start, plength = length;
-
- if (pstart != start || plength != length || pstart < 0 ||
- plength < 0 || p.pno > 65535)
- return -EINVAL;
- }
-
switch (op) {
case BLKPG_ADD_PARTITION:
- /* check if partition is aligned to blocksize */
- if (p.start & (bdev_logical_block_size(bdev) - 1))
- return -EINVAL;
- return bdev_add_partition(bdev, p.pno, start, length);
+ return bdev_add_partition(disk, p.pno, start, length);
case BLKPG_RESIZE_PARTITION:
- return bdev_resize_partition(bdev, p.pno, start, length);
+ return bdev_resize_partition(disk, p.pno, start, length);
default:
return -EINVAL;
}
@@ -90,35 +85,18 @@ static int compat_blkpg_ioctl(struct block_device *bdev,
}
#endif
-static int blkdev_reread_part(struct block_device *bdev)
-{
- int ret;
-
- if (!disk_part_scan_enabled(bdev->bd_disk) || bdev != bdev->bd_contains)
- return -EINVAL;
- if (!capable(CAP_SYS_ADMIN))
- return -EACCES;
-
- mutex_lock(&bdev->bd_mutex);
- ret = bdev_disk_changed(bdev, false);
- mutex_unlock(&bdev->bd_mutex);
-
- return ret;
-}
-
-static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
- unsigned long arg, unsigned long flags)
+static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode,
+ unsigned long arg)
{
uint64_t range[2];
uint64_t start, len;
- struct request_queue *q = bdev_get_queue(bdev);
- struct address_space *mapping = bdev->bd_inode->i_mapping;
-
+ struct inode *inode = bdev->bd_inode;
+ int err;
- if (!(mode & FMODE_WRITE))
+ if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
- if (!blk_queue_discard(q))
+ if (!bdev_max_discard_sectors(bdev))
return -EOPNOTSUPP;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
@@ -132,21 +110,59 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode,
if (len & 511)
return -EINVAL;
- if (start + len > i_size_read(bdev->bd_inode))
+ if (start + len > bdev_nr_bytes(bdev))
return -EINVAL;
- truncate_inode_pages_range(mapping, start, start + len - 1);
- return blkdev_issue_discard(bdev, start >> 9, len >> 9,
- GFP_KERNEL, flags);
+
+ filemap_invalidate_lock(inode->i_mapping);
+ err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ if (err)
+ goto fail;
+ err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL);
+fail:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
+}
+
+static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode,
+ void __user *argp)
+{
+ uint64_t start, len;
+ uint64_t range[2];
+ int err;
+
+ if (!(mode & BLK_OPEN_WRITE))
+ return -EBADF;
+ if (!bdev_max_secure_erase_sectors(bdev))
+ return -EOPNOTSUPP;
+ if (copy_from_user(range, argp, sizeof(range)))
+ return -EFAULT;
+
+ start = range[0];
+ len = range[1];
+ if ((start & 511) || (len & 511))
+ return -EINVAL;
+ if (start + len > bdev_nr_bytes(bdev))
+ return -EINVAL;
+
+ filemap_invalidate_lock(bdev->bd_inode->i_mapping);
+ err = truncate_bdev_range(bdev, mode, start, start + len - 1);
+ if (!err)
+ err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9,
+ GFP_KERNEL);
+ filemap_invalidate_unlock(bdev->bd_inode->i_mapping);
+ return err;
}
-static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
+
+static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode,
unsigned long arg)
{
uint64_t range[2];
- struct address_space *mapping;
uint64_t start, end, len;
+ struct inode *inode = bdev->bd_inode;
+ int err;
- if (!(mode & FMODE_WRITE))
+ if (!(mode & BLK_OPEN_WRITE))
return -EBADF;
if (copy_from_user(range, (void __user *)arg, sizeof(range)))
@@ -160,17 +176,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode,
return -EINVAL;
if (len & 511)
return -EINVAL;
- if (end >= (uint64_t)i_size_read(bdev->bd_inode))
+ if (end >= (uint64_t)bdev_nr_bytes(bdev))
return -EINVAL;
if (end < start)
return -EINVAL;
/* Invalidate the page cache, including dirty pages */
- mapping = bdev->bd_inode->i_mapping;
- truncate_inode_pages_range(mapping, start, end);
+ filemap_invalidate_lock(inode->i_mapping);
+ err = truncate_bdev_range(bdev, mode, start, end);
+ if (err)
+ goto fail;
- return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
- BLKDEV_ZERO_NOUNMAP);
+ err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL,
+ BLKDEV_ZERO_NOUNMAP);
+
+fail:
+ filemap_invalidate_unlock(inode->i_mapping);
+ return err;
}
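For illustration, not part of the patch, the userspace side of these ioctls: BLKDISCARD, BLKSECDISCARD and BLKZEROOUT all take a uint64_t[2] {start, length} in bytes, both 512-byte aligned and within the device, on a block device file descriptor opened for writing (matching the BLK_OPEN_WRITE check above). The helper name is a placeholder.

	#include <linux/fs.h>		/* BLKDISCARD, BLKSECDISCARD, BLKZEROOUT */
	#include <stdint.h>
	#include <sys/ioctl.h>

	/* fd must be open with write access (O_WRONLY or O_RDWR). */
	static int discard_range(int fd, uint64_t start, uint64_t len)
	{
		uint64_t range[2] = { start, len };	/* byte offsets, 512-aligned */

		return ioctl(fd, BLKDISCARD, range);
	}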
static int put_ushort(unsigned short __user *argp, unsigned short val)
@@ -215,30 +237,13 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val)
}
#endif
-int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg)
-{
- struct gendisk *disk = bdev->bd_disk;
-
- if (disk->fops->ioctl)
- return disk->fops->ioctl(bdev, mode, cmd, arg);
-
- return -ENOTTY;
-}
-/*
- * For the record: _GPL here is only because somebody decided to slap it
- * on the previous export. Sheer idiocy, since it wasn't copyrightable
- * at all and could be open-coded without any exports by anybody who cares.
- */
-EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl);
-
#ifdef CONFIG_COMPAT
/*
* This is the equivalent of compat_ptr_ioctl(), to be used by block
* drivers that implement only commands that are completely compatible
* between 32-bit and 64-bit user space
*/
-int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode,
+int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode,
unsigned cmd, unsigned long arg)
{
struct gendisk *disk = bdev->bd_disk;
@@ -252,13 +257,28 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode,
EXPORT_SYMBOL(blkdev_compat_ptr_ioctl);
#endif
-static int blkdev_pr_register(struct block_device *bdev,
+static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode)
+{
+ /* no sense to make reservations for partitions */
+ if (bdev_is_partition(bdev))
+ return false;
+
+ if (capable(CAP_SYS_ADMIN))
+ return true;
+ /*
+ * Only allow unprivileged reservations if the file descriptor is open
+ * for writing.
+ */
+ return mode & BLK_OPEN_WRITE;
+}
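
With blkdev_pr_allowed() in place, persistent-reservation ioctls no longer require CAP_SYS_ADMIN: a writable file descriptor on the whole disk is enough. A hedged user-space sketch, assuming a disk whose driver provides pr_ops (the NVMe device node here is illustrative):

#include <fcntl.h>
#include <linux/pr.h>		/* IOC_PR_REGISTER, struct pr_registration */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	/* Whole-disk node; reservations on partitions are rejected above. */
	int fd = open("/dev/nvme0n1", O_RDWR);
	struct pr_registration reg = {
		.old_key = 0,		/* no previous registration */
		.new_key = 0x1234,	/* key to register */
	};

	if (fd < 0)
		return 1;
	if (ioctl(fd, IOC_PR_REGISTER, &reg))
		perror("IOC_PR_REGISTER");
	close(fd);
	return 0;
}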
+
+static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode,
struct pr_registration __user *arg)
{
const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
struct pr_registration reg;
- if (!capable(CAP_SYS_ADMIN))
+ if (!blkdev_pr_allowed(bdev, mode))
return -EPERM;
if (!ops || !ops->pr_register)
return -EOPNOTSUPP;
@@ -270,13 +290,13 @@ static int blkdev_pr_register(struct block_device *bdev,
return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags);
}
-static int blkdev_pr_reserve(struct block_device *bdev,
+static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode,
struct pr_reservation __user *arg)
{
const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
struct pr_reservation rsv;
- if (!capable(CAP_SYS_ADMIN))
+ if (!blkdev_pr_allowed(bdev, mode))
return -EPERM;
if (!ops || !ops->pr_reserve)
return -EOPNOTSUPP;
@@ -288,13 +308,13 @@ static int blkdev_pr_reserve(struct block_device *bdev,
return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags);
}
-static int blkdev_pr_release(struct block_device *bdev,
+static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode,
struct pr_reservation __user *arg)
{
const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
struct pr_reservation rsv;
- if (!capable(CAP_SYS_ADMIN))
+ if (!blkdev_pr_allowed(bdev, mode))
return -EPERM;
if (!ops || !ops->pr_release)
return -EOPNOTSUPP;
@@ -306,13 +326,13 @@ static int blkdev_pr_release(struct block_device *bdev,
return ops->pr_release(bdev, rsv.key, rsv.type);
}
-static int blkdev_pr_preempt(struct block_device *bdev,
+static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode,
struct pr_preempt __user *arg, bool abort)
{
const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
struct pr_preempt p;
- if (!capable(CAP_SYS_ADMIN))
+ if (!blkdev_pr_allowed(bdev, mode))
return -EPERM;
if (!ops || !ops->pr_preempt)
return -EOPNOTSUPP;
@@ -324,13 +344,13 @@ static int blkdev_pr_preempt(struct block_device *bdev,
return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort);
}
-static int blkdev_pr_clear(struct block_device *bdev,
+static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode,
struct pr_clear __user *arg)
{
const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops;
struct pr_clear c;
- if (!capable(CAP_SYS_ADMIN))
+ if (!blkdev_pr_allowed(bdev, mode))
return -EPERM;
if (!ops || !ops->pr_clear)
return -EOPNOTSUPP;
@@ -342,57 +362,40 @@ static int blkdev_pr_clear(struct block_device *bdev,
return ops->pr_clear(bdev, c.key);
}
-/*
- * Is it an unrecognized ioctl? The correct returns are either
- * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a
- * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl
- * code before returning.
- *
- * Confused drivers sometimes return EINVAL, which is wrong. It
- * means "I understood the ioctl command, but the parameters to
- * it were wrong".
- *
- * We should aim to just fix the broken drivers, the EINVAL case
- * should go away.
- */
-static inline int is_unrecognized_ioctl(int ret)
-{
- return ret == -EINVAL ||
- ret == -ENOTTY ||
- ret == -ENOIOCTLCMD;
-}
-
-static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg)
+static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd,
+ unsigned long arg)
{
- int ret;
-
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- if (!is_unrecognized_ioctl(ret))
- return ret;
+ mutex_lock(&bdev->bd_holder_lock);
+ if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync)
+ bdev->bd_holder_ops->sync(bdev);
+ else {
+ mutex_unlock(&bdev->bd_holder_lock);
+ sync_blockdev(bdev);
+ }
- fsync_bdev(bdev);
invalidate_bdev(bdev);
return 0;
}
-static int blkdev_roset(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg)
+static int blkdev_roset(struct block_device *bdev, unsigned cmd,
+ unsigned long arg)
{
int ret, n;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
- ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg);
- if (!is_unrecognized_ioctl(ret))
- return ret;
if (get_user(n, (int __user *)arg))
return -EFAULT;
- set_device_ro(bdev, n);
+ if (bdev->bd_disk->fops->set_read_only) {
+ ret = bdev->bd_disk->fops->set_read_only(bdev, n);
+ if (ret)
+ return ret;
+ }
+ bdev->bd_read_only = n;
return 0;
}
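
blkdev_roset() now asks the driver via ->set_read_only() before flipping bd_read_only, but the BLKROSET/BLKROGET ABI is unchanged: an int flag passed by pointer, with CAP_SYS_ADMIN needed to set it. A short sketch equivalent to blockdev --setro (device path hypothetical):

#include <fcntl.h>
#include <linux/fs.h>		/* BLKROSET, BLKROGET */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDONLY);
	int ro = 1;

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKROSET, &ro))	/* mark the device read-only */
		perror("BLKROSET");
	if (!ioctl(fd, BLKROGET, &ro))	/* read the flag back */
		printf("read-only: %d\n", ro);
	close(fd);
	return 0;
}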
@@ -462,10 +465,11 @@ static int compat_hdio_getgeo(struct block_device *bdev,
#endif
/* set the logical block size */
-static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
+static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode,
int __user *argp)
{
int ret, n;
+ struct bdev_handle *handle;
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
@@ -474,15 +478,15 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
if (get_user(n, argp))
return -EFAULT;
- if (!(mode & FMODE_EXCL)) {
- bdgrab(bdev);
- if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0)
- return -EBUSY;
- }
+ if (mode & BLK_OPEN_EXCL)
+ return set_blocksize(bdev, n);
+ handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL);
+ if (IS_ERR(handle))
+ return -EBUSY;
ret = set_blocksize(bdev, n);
- if (!(mode & FMODE_EXCL))
- blkdev_put(bdev, mode | FMODE_EXCL);
+ bdev_release(handle);
+
return ret;
}
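
The temporary exclusive claim in blkdev_bszset() is now taken with bdev_open_by_dev()/bdev_release() rather than blkdev_get()/blkdev_put(), while the user-space side of BLKBSZSET stays a plain int passed by pointer (CAP_SYS_ADMIN required). A brief sketch, device path hypothetical:

#include <fcntl.h>
#include <linux/fs.h>		/* BLKBSZSET, BLKBSZGET */
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/dev/sdX", O_RDWR);
	int bsz = 4096;

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKBSZSET, &bsz))		/* set the soft block size */
		perror("BLKBSZSET");
	if (!ioctl(fd, BLKBSZGET, &bsz))	/* confirm the new value */
		printf("soft block size: %d\n", bsz);
	close(fd);
	return 0;
}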
@@ -491,25 +495,27 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode,
* user space. Note the separate arg/argp parameters that are needed
* to deal with the compat_ptr() conversion.
*/
-static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
- unsigned cmd, unsigned long arg, void __user *argp)
+static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode,
+ unsigned int cmd, unsigned long arg,
+ void __user *argp)
{
unsigned int max_sectors;
switch (cmd) {
case BLKFLSBUF:
- return blkdev_flushbuf(bdev, mode, cmd, arg);
+ return blkdev_flushbuf(bdev, cmd, arg);
case BLKROSET:
- return blkdev_roset(bdev, mode, cmd, arg);
+ return blkdev_roset(bdev, cmd, arg);
case BLKDISCARD:
- return blk_ioctl_discard(bdev, mode, arg, 0);
+ return blk_ioctl_discard(bdev, mode, arg);
case BLKSECDISCARD:
- return blk_ioctl_discard(bdev, mode, arg,
- BLKDEV_DISCARD_SECURE);
+ return blk_ioctl_secure_erase(bdev, mode, argp);
case BLKZEROOUT:
return blk_ioctl_zeroout(bdev, mode, arg);
+ case BLKGETDISKSEQ:
+ return put_u64(argp, bdev->bd_disk->diskseq);
case BLKREPORTZONE:
- return blkdev_report_zones_ioctl(bdev, mode, cmd, arg);
+ return blkdev_report_zones_ioctl(bdev, cmd, arg);
case BLKRESETZONE:
case BLKOPENZONE:
case BLKCLOSEZONE:
@@ -518,7 +524,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
case BLKGETZONESZ:
return put_uint(argp, bdev_zone_sectors(bdev));
case BLKGETNRZONES:
- return put_uint(argp, blkdev_nr_zones(bdev->bd_disk));
+ return put_uint(argp, bdev_nr_zones(bdev));
case BLKROGET:
return put_int(argp, bdev_read_only(bdev) != 0);
case BLKSSZGET: /* get block device logical block size */
@@ -538,31 +544,35 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
queue_max_sectors(bdev_get_queue(bdev)));
return put_ushort(argp, max_sectors);
case BLKROTATIONAL:
- return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev)));
+ return put_ushort(argp, !bdev_nonrot(bdev));
case BLKRASET:
case BLKFRASET:
if(!capable(CAP_SYS_ADMIN))
return -EACCES;
- bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE;
+ bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE;
return 0;
case BLKRRPART:
- return blkdev_reread_part(bdev);
+ if (!capable(CAP_SYS_ADMIN))
+ return -EACCES;
+ if (bdev_is_partition(bdev))
+ return -EINVAL;
+ return disk_scan_partitions(bdev->bd_disk, mode);
case BLKTRACESTART:
case BLKTRACESTOP:
case BLKTRACETEARDOWN:
return blk_trace_ioctl(bdev, cmd, argp);
case IOC_PR_REGISTER:
- return blkdev_pr_register(bdev, argp);
+ return blkdev_pr_register(bdev, mode, argp);
case IOC_PR_RESERVE:
- return blkdev_pr_reserve(bdev, argp);
+ return blkdev_pr_reserve(bdev, mode, argp);
case IOC_PR_RELEASE:
- return blkdev_pr_release(bdev, argp);
+ return blkdev_pr_release(bdev, mode, argp);
case IOC_PR_PREEMPT:
- return blkdev_pr_preempt(bdev, argp, false);
+ return blkdev_pr_preempt(bdev, mode, argp, false);
case IOC_PR_PREEMPT_ABORT:
- return blkdev_pr_preempt(bdev, argp, true);
+ return blkdev_pr_preempt(bdev, mode, argp, true);
case IOC_PR_CLEAR:
- return blkdev_pr_clear(bdev, argp);
+ return blkdev_pr_clear(bdev, mode, argp);
default:
return -ENOIOCTLCMD;
}
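
BLKGETDISKSEQ is the one genuinely new command in this switch: it reports the monotonically increasing disk sequence number, letting user space detect that a device node (a loop device, say) has been torn down and recreated. A sketch, assuming the uapi <linux/fs.h> on the build host is recent enough to define BLKGETDISKSEQ:

#include <fcntl.h>
#include <inttypes.h>
#include <linux/fs.h>		/* BLKGETDISKSEQ */
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	uint64_t seq;
	int fd = open("/dev/loop0", O_RDONLY);	/* illustrative device */

	if (fd < 0)
		return 1;
	if (ioctl(fd, BLKGETDISKSEQ, &seq))
		perror("BLKGETDISKSEQ");
	else
		printf("diskseq: %" PRIu64 "\n", seq);
	close(fd);
	return 0;
}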
@@ -574,12 +584,12 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode,
*
* New commands must be compatible and go into blkdev_common_ioctl
*/
-int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
- unsigned long arg)
+long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
- int ret;
- loff_t size;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
void __user *argp = (void __user *)arg;
+ blk_mode_t mode = file_to_blk_mode(file);
+ int ret;
switch (cmd) {
/* These need separate implementations for the data structure */
@@ -593,12 +603,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKFRAGET:
if (!argp)
return -EINVAL;
- return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512);
+ return put_long(argp,
+ (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
- size = i_size_read(bdev->bd_inode);
- if ((size >> 9) > ~0UL)
+ if (bdev_nr_sectors(bdev) > ~0UL)
return -EFBIG;
- return put_ulong(argp, size >> 9);
+ return put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */
case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */
@@ -606,7 +616,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
case BLKBSZSET:
return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64:
- return put_u64(argp, i_size_read(bdev->bd_inode));
+ return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */
case BLKTRACESETUP:
@@ -616,12 +626,13 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd,
}
ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp);
- if (ret == -ENOIOCTLCMD)
- return __blkdev_driver_ioctl(bdev, mode, cmd, arg);
+ if (ret != -ENOIOCTLCMD)
+ return ret;
- return ret;
+ if (!bdev->bd_disk->fops->ioctl)
+ return -ENOTTY;
+ return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg);
}
-EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */
#ifdef CONFIG_COMPAT
@@ -636,20 +647,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
{
int ret;
void __user *argp = compat_ptr(arg);
- struct inode *inode = file->f_mapping->host;
- struct block_device *bdev = inode->i_bdev;
+ struct block_device *bdev = I_BDEV(file->f_mapping->host);
struct gendisk *disk = bdev->bd_disk;
- fmode_t mode = file->f_mode;
- loff_t size;
-
- /*
- * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have
- * to updated it before every ioctl.
- */
- if (file->f_flags & O_NDELAY)
- mode |= FMODE_NDELAY;
- else
- mode &= ~FMODE_NDELAY;
+ blk_mode_t mode = file_to_blk_mode(file);
switch (cmd) {
/* These need separate implementations for the data structure */
@@ -664,12 +664,11 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
if (!argp)
return -EINVAL;
return compat_put_long(argp,
- (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512);
+ (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512);
case BLKGETSIZE:
- size = i_size_read(bdev->bd_inode);
- if ((size >> 9) > ~0UL)
+ if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0)
return -EFBIG;
- return compat_put_ulong(argp, size >> 9);
+ return compat_put_ulong(argp, bdev_nr_sectors(bdev));
/* The data is compatible, but the command number is different */
case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */
@@ -677,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg)
case BLKBSZSET_32:
return blkdev_bszset(bdev, mode, argp);
case BLKGETSIZE64_32:
- return put_u64(argp, i_size_read(bdev->bd_inode));
+ return put_u64(argp, bdev_nr_bytes(bdev));
/* Incompatible alignment on i386 */
case BLKTRACESETUP32:
diff --git a/block/ioprio.c b/block/ioprio.c
index 77bcab11dce5..73301a261429 100644
--- a/block/ioprio.c
+++ b/block/ioprio.c
@@ -22,68 +22,43 @@
*/
#include <linux/gfp.h>
#include <linux/kernel.h>
-#include <linux/export.h>
#include <linux/ioprio.h>
#include <linux/cred.h>
#include <linux/blkdev.h>
#include <linux/capability.h>
-#include <linux/sched/user.h>
-#include <linux/sched/task.h>
#include <linux/syscalls.h>
#include <linux/security.h>
#include <linux/pid_namespace.h>
-int set_task_ioprio(struct task_struct *task, int ioprio)
-{
- int err;
- struct io_context *ioc;
- const struct cred *cred = current_cred(), *tcred;
-
- rcu_read_lock();
- tcred = __task_cred(task);
- if (!uid_eq(tcred->uid, cred->euid) &&
- !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) {
- rcu_read_unlock();
- return -EPERM;
- }
- rcu_read_unlock();
-
- err = security_task_setioprio(task, ioprio);
- if (err)
- return err;
-
- ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE);
- if (ioc) {
- ioc->ioprio = ioprio;
- put_io_context(ioc);
- }
-
- return err;
-}
-EXPORT_SYMBOL_GPL(set_task_ioprio);
-
int ioprio_check_cap(int ioprio)
{
int class = IOPRIO_PRIO_CLASS(ioprio);
- int data = IOPRIO_PRIO_DATA(ioprio);
+ int level = IOPRIO_PRIO_LEVEL(ioprio);
switch (class) {
case IOPRIO_CLASS_RT:
- if (!capable(CAP_SYS_ADMIN))
+ /*
+ * Originally this only checked for CAP_SYS_ADMIN,
+ * which was implicitly allowed for pid 0 by security
+ * modules such as SELinux. Make sure we check
+ * CAP_SYS_ADMIN first to avoid a denial/avc for
+ * possibly missing CAP_SYS_NICE permission.
+ */
+ if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE))
return -EPERM;
- /* fall through */
+ fallthrough;
/* rt has prio field too */
case IOPRIO_CLASS_BE:
- if (data >= IOPRIO_BE_NR || data < 0)
+ if (level >= IOPRIO_NR_LEVELS)
return -EINVAL;
-
break;
case IOPRIO_CLASS_IDLE:
break;
case IOPRIO_CLASS_NONE:
- if (data)
+ if (level)
return -EINVAL;
break;
+ case IOPRIO_CLASS_INVALID:
default:
return -EINVAL;
}
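
The class/level pair validated here is exactly what user space packs with IOPRIO_PRIO_VALUE() and hands to the ioprio_set() syscall (there is no glibc wrapper, so syscall() is used directly). A sketch with the ABI constants spelled out so it does not depend on a uapi ioprio header:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

/* ioprio ABI constants, inlined to keep the sketch self-contained. */
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, level)	(((class) << IOPRIO_CLASS_SHIFT) | (level))
#define IOPRIO_CLASS_RT		1
#define IOPRIO_CLASS_BE		2
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	/* Best-effort class, level 4: no capability needed. */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 4)) < 0)
		perror("ioprio_set BE");

	/* Real-time class: needs CAP_SYS_ADMIN or CAP_SYS_NICE, as checked above. */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0)) < 0)
		perror("ioprio_set RT");
	return 0;
}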
@@ -119,11 +94,17 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
+
+ read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
ret = set_task_ioprio(p, ioprio);
- if (ret)
- break;
+ if (ret) {
+ read_unlock(&tasklist_lock);
+ goto out;
+ }
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ read_unlock(&tasklist_lock);
+
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
@@ -153,6 +134,7 @@ free_uid:
ret = -EINVAL;
}
+out:
rcu_read_unlock();
return ret;
}
@@ -164,22 +146,38 @@ static int get_task_ioprio(struct task_struct *p)
ret = security_task_getioprio(p);
if (ret)
goto out;
- ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM);
+ task_lock(p);
+ ret = __get_task_ioprio(p);
+ task_unlock(p);
+out:
+ return ret;
+}
+
+/*
+ * Return raw IO priority value as set by userspace. We use this for
+ * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and
+ * also so that userspace can distinguish unset IO priority (which just gets
+ * overridden based on task's nice value) from IO priority set to some value.
+ */
+static int get_task_raw_ioprio(struct task_struct *p)
+{
+ int ret;
+
+ ret = security_task_getioprio(p);
+ if (ret)
+ goto out;
task_lock(p);
if (p->io_context)
ret = p->io_context->ioprio;
+ else
+ ret = IOPRIO_DEFAULT;
task_unlock(p);
out:
return ret;
}
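
Returning the raw value keeps a distinction visible to ioprio_get(pid, IOPRIO_WHO_PROCESS) callers: class IOPRIO_CLASS_NONE means no priority was ever set and the effective one is derived from the nice value. A small sketch, with the same inlined constants as above:

#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_CLASS(v)	((v) >> IOPRIO_CLASS_SHIFT)
#define IOPRIO_CLASS_NONE	0
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	long prio = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);

	if (prio < 0) {
		perror("ioprio_get");
		return 1;
	}
	if (IOPRIO_PRIO_CLASS(prio) == IOPRIO_CLASS_NONE)
		printf("unset: effective priority follows the nice value\n");
	else
		printf("explicit ioprio 0x%lx\n", prio);
	return 0;
}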
-int ioprio_best(unsigned short aprio, unsigned short bprio)
+static int ioprio_best(unsigned short aprio, unsigned short bprio)
{
- if (!ioprio_valid(aprio))
- aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
- if (!ioprio_valid(bprio))
- bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM);
-
return min(aprio, bprio);
}
@@ -200,13 +198,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
p = find_task_by_vpid(who);
if (p)
- ret = get_task_ioprio(p);
+ ret = get_task_raw_ioprio(p);
break;
case IOPRIO_WHO_PGRP:
if (!who)
pgrp = task_pgrp(current);
else
pgrp = find_vpid(who);
+ read_lock(&tasklist_lock);
do_each_pid_thread(pgrp, PIDTYPE_PGID, p) {
tmpio = get_task_ioprio(p);
if (tmpio < 0)
@@ -216,6 +215,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who)
else
ret = ioprio_best(ret, tmpio);
} while_each_pid_thread(pgrp, PIDTYPE_PGID, p);
+ read_unlock(&tasklist_lock);
+
break;
case IOPRIO_WHO_USER:
uid = make_kuid(current_user_ns(), who);
diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c
deleted file mode 100644
index 35abcb1ec051..000000000000
--- a/block/keyslot-manager.c
+++ /dev/null
@@ -1,396 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright 2019 Google LLC
- */
-
-/**
- * DOC: The Keyslot Manager
- *
- * Many devices with inline encryption support have a limited number of "slots"
- * into which encryption contexts may be programmed, and requests can be tagged
- * with a slot number to specify the key to use for en/decryption.
- *
- * As the number of slots is limited, and programming keys is expensive on
- * many inline encryption hardware, we don't want to program the same key into
- * multiple slots - if multiple requests are using the same key, we want to
- * program just one slot with that key and use that slot for all requests.
- *
- * The keyslot manager manages these keyslots appropriately, and also acts as
- * an abstraction between the inline encryption hardware and the upper layers.
- *
- * Lower layer devices will set up a keyslot manager in their request queue
- * and tell it how to perform device specific operations like programming/
- * evicting keys from keyslots.
- *
- * Upper layers will call blk_ksm_get_slot_for_key() to program a
- * key into some slot in the inline encryption hardware.
- */
-
-#define pr_fmt(fmt) "blk-crypto: " fmt
-
-#include <linux/keyslot-manager.h>
-#include <linux/atomic.h>
-#include <linux/mutex.h>
-#include <linux/pm_runtime.h>
-#include <linux/wait.h>
-#include <linux/blkdev.h>
-
-struct blk_ksm_keyslot {
- atomic_t slot_refs;
- struct list_head idle_slot_node;
- struct hlist_node hash_node;
- const struct blk_crypto_key *key;
- struct blk_keyslot_manager *ksm;
-};
-
-static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm)
-{
- /*
- * Calling into the driver requires ksm->lock held and the device
- * resumed. But we must resume the device first, since that can acquire
- * and release ksm->lock via blk_ksm_reprogram_all_keys().
- */
- if (ksm->dev)
- pm_runtime_get_sync(ksm->dev);
- down_write(&ksm->lock);
-}
-
-static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm)
-{
- up_write(&ksm->lock);
- if (ksm->dev)
- pm_runtime_put_sync(ksm->dev);
-}
-
-/**
- * blk_ksm_init() - Initialize a keyslot manager
- * @ksm: The keyslot_manager to initialize.
- * @num_slots: The number of key slots to manage.
- *
- * Allocate memory for keyslots and initialize a keyslot manager. Called by
- * e.g. storage drivers to set up a keyslot manager in their request_queue.
- *
- * Return: 0 on success, or else a negative error code.
- */
-int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots)
-{
- unsigned int slot;
- unsigned int i;
- unsigned int slot_hashtable_size;
-
- memset(ksm, 0, sizeof(*ksm));
-
- if (num_slots == 0)
- return -EINVAL;
-
- ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL);
- if (!ksm->slots)
- return -ENOMEM;
-
- ksm->num_slots = num_slots;
-
- init_rwsem(&ksm->lock);
-
- init_waitqueue_head(&ksm->idle_slots_wait_queue);
- INIT_LIST_HEAD(&ksm->idle_slots);
-
- for (slot = 0; slot < num_slots; slot++) {
- ksm->slots[slot].ksm = ksm;
- list_add_tail(&ksm->slots[slot].idle_slot_node,
- &ksm->idle_slots);
- }
-
- spin_lock_init(&ksm->idle_slots_lock);
-
- slot_hashtable_size = roundup_pow_of_two(num_slots);
- ksm->log_slot_ht_size = ilog2(slot_hashtable_size);
- ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size,
- sizeof(ksm->slot_hashtable[0]),
- GFP_KERNEL);
- if (!ksm->slot_hashtable)
- goto err_destroy_ksm;
- for (i = 0; i < slot_hashtable_size; i++)
- INIT_HLIST_HEAD(&ksm->slot_hashtable[i]);
-
- return 0;
-
-err_destroy_ksm:
- blk_ksm_destroy(ksm);
- return -ENOMEM;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_init);
-
-static inline struct hlist_head *
-blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)];
-}
-
-static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot)
-{
- struct blk_keyslot_manager *ksm = slot->ksm;
- unsigned long flags;
-
- spin_lock_irqsave(&ksm->idle_slots_lock, flags);
- list_del(&slot->idle_slot_node);
- spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_keyslot(
- struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key);
- struct blk_ksm_keyslot *slotp;
-
- hlist_for_each_entry(slotp, head, hash_node) {
- if (slotp->key == key)
- return slotp;
- }
- return NULL;
-}
-
-static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot(
- struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- struct blk_ksm_keyslot *slot;
-
- slot = blk_ksm_find_keyslot(ksm, key);
- if (!slot)
- return NULL;
- if (atomic_inc_return(&slot->slot_refs) == 1) {
- /* Took first reference to this slot; remove it from LRU list */
- blk_ksm_remove_slot_from_lru_list(slot);
- }
- return slot;
-}
-
-unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot)
-{
- return slot - slot->ksm->slots;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx);
-
-/**
- * blk_ksm_get_slot_for_key() - Program a key into a keyslot.
- * @ksm: The keyslot manager to program the key into.
- * @key: Pointer to the key object to program, including the raw key, crypto
- * mode, and data unit size.
- * @slot_ptr: A pointer to return the pointer of the allocated keyslot.
- *
- * Get a keyslot that's been programmed with the specified key. If one already
- * exists, return it with incremented refcount. Otherwise, wait for a keyslot
- * to become idle and program it.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the
- * allocated keyslot), or some other blk_status_t otherwise (and
- * keyslot is set to NULL).
- */
-blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key,
- struct blk_ksm_keyslot **slot_ptr)
-{
- struct blk_ksm_keyslot *slot;
- int slot_idx;
- int err;
-
- *slot_ptr = NULL;
- down_read(&ksm->lock);
- slot = blk_ksm_find_and_grab_keyslot(ksm, key);
- up_read(&ksm->lock);
- if (slot)
- goto success;
-
- for (;;) {
- blk_ksm_hw_enter(ksm);
- slot = blk_ksm_find_and_grab_keyslot(ksm, key);
- if (slot) {
- blk_ksm_hw_exit(ksm);
- goto success;
- }
-
- /*
- * If we're here, that means there wasn't a slot that was
- * already programmed with the key. So try to program it.
- */
- if (!list_empty(&ksm->idle_slots))
- break;
-
- blk_ksm_hw_exit(ksm);
- wait_event(ksm->idle_slots_wait_queue,
- !list_empty(&ksm->idle_slots));
- }
-
- slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot,
- idle_slot_node);
- slot_idx = blk_ksm_get_slot_idx(slot);
-
- err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx);
- if (err) {
- wake_up(&ksm->idle_slots_wait_queue);
- blk_ksm_hw_exit(ksm);
- return errno_to_blk_status(err);
- }
-
- /* Move this slot to the hash list for the new key. */
- if (slot->key)
- hlist_del(&slot->hash_node);
- slot->key = key;
- hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key));
-
- atomic_set(&slot->slot_refs, 1);
-
- blk_ksm_remove_slot_from_lru_list(slot);
-
- blk_ksm_hw_exit(ksm);
-success:
- *slot_ptr = slot;
- return BLK_STS_OK;
-}
-
-/**
- * blk_ksm_put_slot() - Release a reference to a slot
- * @slot: The keyslot to release the reference of.
- *
- * Context: Any context.
- */
-void blk_ksm_put_slot(struct blk_ksm_keyslot *slot)
-{
- struct blk_keyslot_manager *ksm;
- unsigned long flags;
-
- if (!slot)
- return;
-
- ksm = slot->ksm;
-
- if (atomic_dec_and_lock_irqsave(&slot->slot_refs,
- &ksm->idle_slots_lock, flags)) {
- list_add_tail(&slot->idle_slot_node, &ksm->idle_slots);
- spin_unlock_irqrestore(&ksm->idle_slots_lock, flags);
- wake_up(&ksm->idle_slots_wait_queue);
- }
-}
-
-/**
- * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is
- * supported by a ksm.
- * @ksm: The keyslot manager to check
- * @cfg: The crypto configuration to check for.
- *
- * Checks for crypto_mode/data unit size/dun bytes support.
- *
- * Return: Whether or not this ksm supports the specified crypto config.
- */
-bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_config *cfg)
-{
- if (!ksm)
- return false;
- if (!(ksm->crypto_modes_supported[cfg->crypto_mode] &
- cfg->data_unit_size))
- return false;
- if (ksm->max_dun_bytes_supported < cfg->dun_bytes)
- return false;
- return true;
-}
-
-/**
- * blk_ksm_evict_key() - Evict a key from the lower layer device.
- * @ksm: The keyslot manager to evict from
- * @key: The key to evict
- *
- * Find the keyslot that the specified key was programmed into, and evict that
- * slot from the lower layer device. The slot must not be in use by any
- * in-flight IO when this function is called.
- *
- * Context: Process context. Takes and releases ksm->lock.
- * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY
- * if the keyslot is still in use, or another -errno value on other
- * error.
- */
-int blk_ksm_evict_key(struct blk_keyslot_manager *ksm,
- const struct blk_crypto_key *key)
-{
- struct blk_ksm_keyslot *slot;
- int err = 0;
-
- blk_ksm_hw_enter(ksm);
- slot = blk_ksm_find_keyslot(ksm, key);
- if (!slot)
- goto out_unlock;
-
- if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) {
- err = -EBUSY;
- goto out_unlock;
- }
- err = ksm->ksm_ll_ops.keyslot_evict(ksm, key,
- blk_ksm_get_slot_idx(slot));
- if (err)
- goto out_unlock;
-
- hlist_del(&slot->hash_node);
- slot->key = NULL;
- err = 0;
-out_unlock:
- blk_ksm_hw_exit(ksm);
- return err;
-}
-
-/**
- * blk_ksm_reprogram_all_keys() - Re-program all keyslots.
- * @ksm: The keyslot manager
- *
- * Re-program all keyslots that are supposed to have a key programmed. This is
- * intended only for use by drivers for hardware that loses its keys on reset.
- *
- * Context: Process context. Takes and releases ksm->lock.
- */
-void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm)
-{
- unsigned int slot;
-
- /* This is for device initialization, so don't resume the device */
- down_write(&ksm->lock);
- for (slot = 0; slot < ksm->num_slots; slot++) {
- const struct blk_crypto_key *key = ksm->slots[slot].key;
- int err;
-
- if (!key)
- continue;
-
- err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot);
- WARN_ON(err);
- }
- up_write(&ksm->lock);
-}
-EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys);
-
-void blk_ksm_destroy(struct blk_keyslot_manager *ksm)
-{
- if (!ksm)
- return;
- kvfree(ksm->slot_hashtable);
- kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots);
- memzero_explicit(ksm, sizeof(*ksm));
-}
-EXPORT_SYMBOL_GPL(blk_ksm_destroy);
-
-bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q)
-{
- if (blk_integrity_queue_supports_integrity(q)) {
- pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n");
- return false;
- }
- q->ksm = ksm;
- return true;
-}
-EXPORT_SYMBOL_GPL(blk_ksm_register);
-
-void blk_ksm_unregister(struct request_queue *q)
-{
- q->ksm = NULL;
-}
diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c
index a38c5ab103d1..4155594aefc6 100644
--- a/block/kyber-iosched.c
+++ b/block/kyber-iosched.c
@@ -8,16 +8,16 @@
#include <linux/kernel.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>
+#include <trace/events/block.h>
+
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
-#include "blk-mq-tag.h"
#define CREATE_TRACE_POINTS
#include <trace/events/kyber.h>
@@ -149,6 +149,7 @@ struct kyber_ctx_queue {
struct kyber_queue_data {
struct request_queue *q;
+ dev_t dev;
/*
* Each scheduling domain has a limited number of in-flight requests
@@ -192,9 +193,9 @@ struct kyber_hctx_data {
static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
void *key);
-static unsigned int kyber_sched_domain(unsigned int op)
+static unsigned int kyber_sched_domain(blk_opf_t opf)
{
- switch (op & REQ_OP_MASK) {
+ switch (opf & REQ_OP_MASK) {
case REQ_OP_READ:
return KYBER_READ;
case REQ_OP_WRITE:
@@ -255,7 +256,7 @@ static int calculate_percentile(struct kyber_queue_data *kqd,
}
memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type]));
- trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain],
+ trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain],
kyber_latency_type_names[type], percentile,
bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples);
@@ -268,7 +269,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd,
depth = clamp(depth, 1U, kyber_depth[sched_domain]);
if (depth != kqd->domain_tokens[sched_domain].sb.depth) {
sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
- trace_kyber_adjust(kqd->q, kyber_domain_names[sched_domain],
+ trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain],
depth);
}
}
@@ -353,19 +354,9 @@ static void kyber_timer_fn(struct timer_list *t)
}
}
-static unsigned int kyber_sched_tags_shift(struct request_queue *q)
-{
- /*
- * All of the hardware queues have the same depth, so we can just grab
- * the shift of the first one.
- */
- return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
-}
-
static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
struct kyber_queue_data *kqd;
- unsigned int shift;
int ret = -ENOMEM;
int i;
@@ -374,6 +365,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
goto err;
kqd->q = q;
+ kqd->dev = disk_devt(q->disk);
kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency,
GFP_KERNEL | __GFP_ZERO);
@@ -400,9 +392,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
kqd->latency_targets[i] = kyber_latency_targets[i];
}
- shift = kyber_sched_tags_shift(q);
- kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
-
return kqd;
err_buckets:
@@ -430,6 +419,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
blk_stat_enable_accounting(q);
+ blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q);
+
eq->elevator_data = kqd;
q->elevator = eq;
@@ -441,7 +432,8 @@ static void kyber_exit_sched(struct elevator_queue *e)
struct kyber_queue_data *kqd = e->elevator_data;
int i;
- del_timer_sync(&kqd->timer);
+ timer_shutdown_sync(&kqd->timer);
+ blk_stat_disable_accounting(kqd->q);
for (i = 0; i < KYBER_NUM_DOMAINS; i++)
sbitmap_queue_free(&kqd->domain_tokens[i]);
@@ -458,9 +450,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
INIT_LIST_HEAD(&kcq->rq_list[i]);
}
-static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx)
{
struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+ unsigned int shift = tags->bitmap_tags.sb.shift;
+
+ kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;
+
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth);
+}
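
To make the arithmetic concrete: with the default 64 scheduler tags per hardware queue the sbitmap shift is 6, so (1 << 6) * KYBER_ASYNC_PERCENT / 100 yields an async_depth of 48 (assuming KYBER_ASYNC_PERCENT is still 75), i.e. asynchronous requests may occupy at most three quarters of the tags; recomputing this in the depth_updated hook keeps the limit correct when nr_requests changes.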
+
+static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
struct kyber_hctx_data *khd;
int i;
@@ -479,7 +481,8 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
- ilog2(8), GFP_KERNEL, hctx->numa_node)) {
+ ilog2(8), GFP_KERNEL, hctx->numa_node,
+ false, false)) {
while (--i >= 0)
sbitmap_free(&khd->kcq_map[i]);
goto err_kcqs;
@@ -502,8 +505,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
khd->batching = 0;
hctx->sched_data = khd;
- sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
- kqd->async_depth);
+ kyber_depth_updated(hctx);
return 0;
@@ -549,31 +551,32 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd,
}
}
-static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
+static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
/*
* We use the scheduler tags as per-hardware queue queueing tokens.
* Async requests can be limited at this stage.
*/
- if (!op_is_sync(op)) {
+ if (!op_is_sync(opf)) {
struct kyber_queue_data *kqd = data->q->elevator->elevator_data;
data->shallow_depth = kqd->async_depth;
}
}
-static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+static bool kyber_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
+ struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
+ struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
struct kyber_hctx_data *khd = hctx->sched_data;
- struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]];
unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
struct list_head *rq_list = &kcq->rq_list[sched_domain];
bool merged;
spin_lock(&kcq->lock);
- merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
+ merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs);
spin_unlock(&kcq->lock);
return merged;
@@ -585,7 +588,8 @@ static void kyber_prepare_request(struct request *rq)
}
static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct list_head *rq_list, bool at_head)
+ struct list_head *rq_list,
+ blk_insert_t flags)
{
struct kyber_hctx_data *khd = hctx->sched_data;
struct request *rq, *next;
@@ -596,13 +600,13 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
struct list_head *head = &kcq->rq_list[sched_domain];
spin_lock(&kcq->lock);
- if (at_head)
+ trace_block_rq_insert(rq);
+ if (flags & BLK_MQ_INSERT_AT_HEAD)
list_move(&rq->queuelist, head);
else
list_move_tail(&rq->queuelist, head);
sbitmap_set_bit(&khd->kcq_map[sched_domain],
rq->mq_ctx->index_hw[hctx->type]);
- blk_mq_sched_request_inserted(rq);
spin_unlock(&kcq->lock);
}
}
@@ -774,7 +778,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
list_del_init(&rq->queuelist);
return rq;
} else {
- trace_kyber_throttled(kqd->q,
+ trace_kyber_throttled(kqd->dev,
kyber_domain_names[khd->cur_domain]);
}
} else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
@@ -787,7 +791,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
list_del_init(&rq->queuelist);
return rq;
} else {
- trace_kyber_throttled(kqd->q,
+ trace_kyber_throttled(kqd->dev,
kyber_domain_names[khd->cur_domain]);
}
}
@@ -1022,6 +1026,7 @@ static struct elevator_type kyber_sched = {
.completed_request = kyber_completed_request,
.dispatch_request = kyber_dispatch_request,
.has_work = kyber_has_work,
+ .depth_updated = kyber_depth_updated,
},
#ifdef CONFIG_BLK_DEBUG_FS
.queue_debugfs_attrs = kyber_queue_debugfs_attrs,
diff --git a/block/mq-deadline.c b/block/mq-deadline.c
index b57470e154c8..f958e79277b8 100644
--- a/block/mq-deadline.c
+++ b/block/mq-deadline.c
@@ -8,8 +8,6 @@
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
-#include <linux/blk-mq.h>
-#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
@@ -18,10 +16,12 @@
#include <linux/rbtree.h>
#include <linux/sbitmap.h>
+#include <trace/events/block.h>
+
+#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
-#include "blk-mq-tag.h"
#include "blk-mq-sched.h"
/*
@@ -29,45 +29,117 @@
*/
static const int read_expire = HZ / 2; /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
+/*
+ * Time after which to dispatch lower priority requests even if higher
+ * priority requests are pending.
+ */
+static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2; /* max times reads can starve a write */
static const int fifo_batch = 16; /* # of sequential requests treated as one
by the above parameters. For throughput. */
+enum dd_data_dir {
+ DD_READ = READ,
+ DD_WRITE = WRITE,
+};
+
+enum { DD_DIR_COUNT = 2 };
+
+enum dd_prio {
+ DD_RT_PRIO = 0,
+ DD_BE_PRIO = 1,
+ DD_IDLE_PRIO = 2,
+ DD_PRIO_MAX = 2,
+};
+
+enum { DD_PRIO_COUNT = 3 };
+
+/*
+ * I/O statistics per I/O priority. It is fine if these counters overflow.
+ * What matters is that these counters are at least as wide as
+ * log2(max_outstanding_requests).
+ */
+struct io_stats_per_prio {
+ uint32_t inserted;
+ uint32_t merged;
+ uint32_t dispatched;
+ atomic_t completed;
+};
+
+/*
+ * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
+ * present on both sort_list[] and fifo_list[].
+ */
+struct dd_per_prio {
+ struct list_head dispatch;
+ struct rb_root sort_list[DD_DIR_COUNT];
+ struct list_head fifo_list[DD_DIR_COUNT];
+ /* Position of the most recently dispatched request. */
+ sector_t latest_pos[DD_DIR_COUNT];
+ struct io_stats_per_prio stats;
+};
+
struct deadline_data {
/*
* run time data
*/
- /*
- * requests (deadline_rq s) are present on both sort_list and fifo_list
- */
- struct rb_root sort_list[2];
- struct list_head fifo_list[2];
+ struct dd_per_prio per_prio[DD_PRIO_COUNT];
- /*
- * next in sort order. read, write or both are NULL
- */
- struct request *next_rq[2];
+ /* Data direction of latest dispatched request. */
+ enum dd_data_dir last_dir;
unsigned int batching; /* number of sequential requests made */
unsigned int starved; /* times reads have starved writes */
/*
* settings that change how the i/o scheduler behaves
*/
- int fifo_expire[2];
+ int fifo_expire[DD_DIR_COUNT];
int fifo_batch;
int writes_starved;
int front_merges;
+ u32 async_depth;
+ int prio_aging_expire;
spinlock_t lock;
spinlock_t zone_lock;
- struct list_head dispatch;
+};
+
+/* Maps an I/O priority class to a deadline scheduler priority. */
+static const enum dd_prio ioprio_class_to_prio[] = {
+ [IOPRIO_CLASS_NONE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_RT] = DD_RT_PRIO,
+ [IOPRIO_CLASS_BE] = DD_BE_PRIO,
+ [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO,
};
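
As an example of the mapping: a request issued after ioprio_set() with IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 0) is queued and dispatched under DD_RT_PRIO, while requests carrying IOPRIO_CLASS_NONE share DD_BE_PRIO with explicit best-effort requests, so unclassified I/O is neither favoured over nor starved behind the best-effort class.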
static inline struct rb_root *
-deadline_rb_root(struct deadline_data *dd, struct request *rq)
+deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
{
- return &dd->sort_list[rq_data_dir(rq)];
+ return &per_prio->sort_list[rq_data_dir(rq)];
+}
+
+/*
+ * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
+ * request.
+ */
+static u8 dd_rq_ioclass(struct request *rq)
+{
+ return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
+}
+
+/*
+ * get the request before `rq' in sector-sorted order
+ */
+static inline struct request *
+deadline_earlier_request(struct request *rq)
+{
+ struct rb_node *node = rb_prev(&rq->rb_node);
+
+ if (node)
+ return rb_entry_rq(node);
+
+ return NULL;
}
/*
@@ -84,39 +156,68 @@ deadline_latter_request(struct request *rq)
return NULL;
}
+/*
+ * Return the first request for which blk_rq_pos() >= @pos. For zoned devices,
+ * return the first request after the start of the zone containing @pos.
+ */
+static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir, sector_t pos)
+{
+ struct rb_node *node = per_prio->sort_list[data_dir].rb_node;
+ struct request *rq, *res = NULL;
+
+ if (!node)
+ return NULL;
+
+ rq = rb_entry_rq(node);
+ /*
+ * A zoned write may have been requeued with a starting position that
+ * is below that of the most recently dispatched request. Hence, for
+ * zoned writes, start searching from the start of a zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq))
+ pos = round_down(pos, rq->q->limits.chunk_sectors);
+
+ while (node) {
+ rq = rb_entry_rq(node);
+ if (blk_rq_pos(rq) >= pos) {
+ res = rq;
+ node = node->rb_left;
+ } else {
+ node = node->rb_right;
+ }
+ }
+ return res;
+}
+
static void
-deadline_add_rq_rb(struct deadline_data *dd, struct request *rq)
+deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
- struct rb_root *root = deadline_rb_root(dd, rq);
+ struct rb_root *root = deadline_rb_root(per_prio, rq);
elv_rb_add(root, rq);
}
static inline void
-deadline_del_rq_rb(struct deadline_data *dd, struct request *rq)
+deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
- const int data_dir = rq_data_dir(rq);
-
- if (dd->next_rq[data_dir] == rq)
- dd->next_rq[data_dir] = deadline_latter_request(rq);
-
- elv_rb_del(deadline_rb_root(dd, rq), rq);
+ elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}
/*
* remove rq from rbtree and fifo.
*/
-static void deadline_remove_request(struct request_queue *q, struct request *rq)
+static void deadline_remove_request(struct request_queue *q,
+ struct dd_per_prio *per_prio,
+ struct request *rq)
{
- struct deadline_data *dd = q->elevator->elevator_data;
-
list_del_init(&rq->queuelist);
/*
* We might not be on the rbtree, if we are doing an insert merge
*/
if (!RB_EMPTY_NODE(&rq->rb_node))
- deadline_del_rq_rb(dd, rq);
+ deadline_del_rq_rb(per_prio, rq);
elv_rqhash_del(q, rq);
if (q->last_merge == rq)
@@ -127,19 +228,33 @@ static void dd_request_merged(struct request_queue *q, struct request *req,
enum elv_merge type)
{
struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(req);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
/*
* if the merge was a front merge, we need to reposition request
*/
if (type == ELEVATOR_FRONT_MERGE) {
- elv_rb_del(deadline_rb_root(dd, req), req);
- deadline_add_rq_rb(dd, req);
+ elv_rb_del(deadline_rb_root(per_prio, req), req);
+ deadline_add_rq_rb(per_prio, req);
}
}
+/*
+ * Callback function that is invoked after @next has been merged into @req.
+ */
static void dd_merged_requests(struct request_queue *q, struct request *req,
struct request *next)
{
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(next);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+
+ lockdep_assert_held(&dd->lock);
+
+ dd->per_prio[prio].stats.merged++;
+
/*
* if next expires before rq, assign its expire time to rq
* and move into next position (next will be deleted) in fifo
@@ -155,42 +270,72 @@ static void dd_merged_requests(struct request_queue *q, struct request *req,
/*
* kill knowledge of next, this one is a goner
*/
- deadline_remove_request(q, next);
+ deadline_remove_request(q, &dd->per_prio[prio], next);
}
/*
* move an entry to dispatch queue
*/
static void
-deadline_move_request(struct deadline_data *dd, struct request *rq)
+deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ struct request *rq)
{
- const int data_dir = rq_data_dir(rq);
-
- dd->next_rq[READ] = NULL;
- dd->next_rq[WRITE] = NULL;
- dd->next_rq[data_dir] = deadline_latter_request(rq);
-
/*
* take it off the sort and fifo list
*/
- deadline_remove_request(rq->q, rq);
+ deadline_remove_request(rq->q, per_prio, rq);
+}
+
+/* Number of requests queued for a given priority level. */
+static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
+{
+ const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
+
+ lockdep_assert_held(&dd->lock);
+
+ return stats->inserted - atomic_read(&stats->completed);
}
/*
- * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
- * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
+ * deadline_check_fifo returns true if and only if there are expired requests
+ * in the FIFO list. Requires !list_empty(&per_prio->fifo_list[data_dir]).
*/
-static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
+static inline bool deadline_check_fifo(struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
{
- struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next);
+ struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
- /*
- * rq is expired!
- */
- if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
- return 1;
+ return time_is_before_eq_jiffies((unsigned long)rq->fifo_time);
+}
- return 0;
+/*
+ * Check if rq has a sequential request preceding it.
+ */
+static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
+{
+ struct request *prev = deadline_earlier_request(rq);
+
+ if (!prev)
+ return false;
+
+ return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
+}
+
+/*
+ * Skip all write requests that are sequential from @rq, even if we cross
+ * a zone boundary.
+ */
+static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
+ struct request *rq)
+{
+ sector_t pos = blk_rq_pos(rq);
+
+ do {
+ pos += blk_rq_sectors(rq);
+ rq = deadline_latter_request(rq);
+ } while (rq && blk_rq_pos(rq) == pos);
+
+ return rq;
}
/*
@@ -198,28 +343,36 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir)
* dispatch using arrival ordered lists.
*/
static struct request *
-deadline_fifo_request(struct deadline_data *dd, int data_dir)
+deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
{
- struct request *rq;
+ struct request *rq, *rb_rq, *next;
unsigned long flags;
- if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
- return NULL;
-
- if (list_empty(&dd->fifo_list[data_dir]))
+ if (list_empty(&per_prio->fifo_list[data_dir]))
return NULL;
- rq = rq_entry_fifo(dd->fifo_list[data_dir].next);
- if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
- list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) {
- if (blk_req_can_dispatch_to_zone(rq))
+ list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE],
+ queuelist) {
+ /* Check whether a prior request exists for the same zone. */
+ rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq));
+ if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq))
+ rq = rb_rq;
+ if (blk_req_can_dispatch_to_zone(rq) &&
+ (blk_queue_nonrot(rq->q) ||
+ !deadline_is_seq_write(dd, rq)))
goto out;
}
rq = NULL;
@@ -234,30 +387,35 @@ out:
* dispatch using sector position sorted lists.
*/
static struct request *
-deadline_next_request(struct deadline_data *dd, int data_dir)
+deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
+ enum dd_data_dir data_dir)
{
struct request *rq;
unsigned long flags;
- if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE))
- return NULL;
-
- rq = dd->next_rq[data_dir];
+ rq = deadline_from_pos(per_prio, data_dir,
+ per_prio->latest_pos[data_dir]);
if (!rq)
return NULL;
- if (data_dir == READ || !blk_queue_is_zoned(rq->q))
+ if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
return rq;
/*
* Look for a write request that can be dispatched, that is one with
- * an unlocked target zone.
+ * an unlocked target zone. For some HDDs, breaking a sequential
+ * write stream can lead to lower throughput, so make sure to preserve
+ * sequential write streams, even if that stream crosses into the next
+ * zones and these zones are unlocked.
*/
spin_lock_irqsave(&dd->zone_lock, flags);
while (rq) {
if (blk_req_can_dispatch_to_zone(rq))
break;
- rq = deadline_latter_request(rq);
+ if (blk_queue_nonrot(rq->q))
+ rq = deadline_latter_request(rq);
+ else
+ rq = deadline_skip_seq_writes(dd, rq);
}
spin_unlock_irqrestore(&dd->zone_lock, flags);
@@ -265,48 +423,67 @@ deadline_next_request(struct deadline_data *dd, int data_dir)
}
/*
+ * Returns true if and only if @rq started after @latest_start where
+ * @latest_start is in jiffies.
+ */
+static bool started_after(struct deadline_data *dd, struct request *rq,
+ unsigned long latest_start)
+{
+ unsigned long start_time = (unsigned long)rq->fifo_time;
+
+ start_time -= dd->fifo_expire[rq_data_dir(rq)];
+
+ return time_after(start_time, latest_start);
+}
+
+/*
* deadline_dispatch_requests selects the best request according to
- * read/write expire, fifo_batch, etc
+ * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
*/
-static struct request *__dd_dispatch_request(struct deadline_data *dd)
+static struct request *__dd_dispatch_request(struct deadline_data *dd,
+ struct dd_per_prio *per_prio,
+ unsigned long latest_start)
{
struct request *rq, *next_rq;
- bool reads, writes;
- int data_dir;
+ enum dd_data_dir data_dir;
+ enum dd_prio prio;
+ u8 ioprio_class;
+
+ lockdep_assert_held(&dd->lock);
- if (!list_empty(&dd->dispatch)) {
- rq = list_first_entry(&dd->dispatch, struct request, queuelist);
+ if (!list_empty(&per_prio->dispatch)) {
+ rq = list_first_entry(&per_prio->dispatch, struct request,
+ queuelist);
+ if (started_after(dd, rq, latest_start))
+ return NULL;
list_del_init(&rq->queuelist);
+ data_dir = rq_data_dir(rq);
goto done;
}
- reads = !list_empty(&dd->fifo_list[READ]);
- writes = !list_empty(&dd->fifo_list[WRITE]);
-
/*
* batches are currently reads XOR writes
*/
- rq = deadline_next_request(dd, WRITE);
- if (!rq)
- rq = deadline_next_request(dd, READ);
-
- if (rq && dd->batching < dd->fifo_batch)
- /* we have a next request are still entitled to batch */
+ rq = deadline_next_request(dd, per_prio, dd->last_dir);
+ if (rq && dd->batching < dd->fifo_batch) {
+ /* we have a next request and are still entitled to batch */
+ data_dir = rq_data_dir(rq);
goto dispatch_request;
+ }
/*
* at this point we are not running a batch. select the appropriate
* data direction (read / write)
*/
- if (reads) {
- BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ]));
+ if (!list_empty(&per_prio->fifo_list[DD_READ])) {
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));
- if (deadline_fifo_request(dd, WRITE) &&
+ if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
(dd->starved++ >= dd->writes_starved))
goto dispatch_writes;
- data_dir = READ;
+ data_dir = DD_READ;
goto dispatch_find_request;
}
@@ -315,13 +492,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd)
* there are either no reads or writes have been starved
*/
- if (writes) {
+ if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
dispatch_writes:
- BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE]));
+ BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));
dd->starved = 0;
- data_dir = WRITE;
+ data_dir = DD_WRITE;
goto dispatch_find_request;
}
@@ -332,14 +509,14 @@ dispatch_find_request:
/*
* we are not running a batch, find best request for selected data_dir
*/
- next_rq = deadline_next_request(dd, data_dir);
- if (deadline_check_fifo(dd, data_dir) || !next_rq) {
+ next_rq = deadline_next_request(dd, per_prio, data_dir);
+ if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
/*
* A deadline has expired, the last request was in the other
* direction, or we have run out of higher-sectored requests.
* Start again from the request with the earliest expiry time.
*/
- rq = deadline_fifo_request(dd, data_dir);
+ rq = deadline_fifo_request(dd, per_prio, data_dir);
} else {
/*
* The last req was the same dir and we have a next request in
@@ -355,15 +532,23 @@ dispatch_find_request:
if (!rq)
return NULL;
+ dd->last_dir = data_dir;
dd->batching = 0;
dispatch_request:
+ if (started_after(dd, rq, latest_start))
+ return NULL;
+
/*
* rq is the selected appropriate request.
*/
dd->batching++;
- deadline_move_request(dd, rq);
+ deadline_move_request(dd, per_prio, rq);
done:
+ ioprio_class = dd_rq_ioclass(rq);
+ prio = ioprio_class_to_prio[ioprio_class];
+ dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq);
+ dd->per_prio[prio].stats.dispatched++;
/*
* If the request needs its target zone locked, do it.
*/
@@ -373,6 +558,36 @@ done:
}
/*
+ * Check whether there are any requests with priority other than DD_RT_PRIO
+ * that were inserted more than prio_aging_expire jiffies ago.
+ */
+static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
+ unsigned long now)
+{
+ struct request *rq;
+ enum dd_prio prio;
+ int prio_cnt;
+
+ lockdep_assert_held(&dd->lock);
+
+ prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
+ !!dd_queued(dd, DD_IDLE_PRIO);
+ if (prio_cnt < 2)
+ return NULL;
+
+ for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
+ now - dd->prio_aging_expire);
+ if (rq)
+ return rq;
+ }
+
+ return NULL;
+}
+
+/*
+ * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
+ *
* One confusing aspect here is that we get called for a specific
* hardware queue, but we may return a request that is for a
* different hardware queue. This is because mq-deadline has shared
@@ -381,21 +596,92 @@ done:
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ const unsigned long now = jiffies;
struct request *rq;
+ enum dd_prio prio;
spin_lock(&dd->lock);
- rq = __dd_dispatch_request(dd);
+ rq = dd_dispatch_prio_aged_requests(dd, now);
+ if (rq)
+ goto unlock;
+
+ /*
+ * Next, dispatch requests in priority order. Ignore lower priority
+ * requests if any higher priority requests are pending.
+ */
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
+ if (rq || dd_queued(dd, prio))
+ break;
+ }
+
+unlock:
spin_unlock(&dd->lock);
return rq;
}
-static void dd_exit_queue(struct elevator_queue *e)
+/*
+ * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
+ * function is used by __blk_mq_get_tag().
+ */
+static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
+{
+ struct deadline_data *dd = data->q->elevator->elevator_data;
+
+ /* Do not throttle synchronous reads. */
+ if (op_is_sync(opf) && !op_is_write(opf))
+ return;
+
+ /*
+ * Throttle asynchronous requests and writes such that these requests
+ * do not block the allocation of synchronous requests.
+ */
+ data->shallow_depth = dd->async_depth;
+}
+
+/* Called by blk_mq_update_nr_requests(). */
+static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
+{
+ struct request_queue *q = hctx->queue;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ struct blk_mq_tags *tags = hctx->sched_tags;
+ unsigned int shift = tags->bitmap_tags.sb.shift;
+
+ dd->async_depth = max(1U, 3 * (1U << shift) / 4);
+
+ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
+}
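
Worked example: with the default 64 scheduler tags per hardware queue the shift is 6, so dd->async_depth becomes max(1, 3 * 64 / 4) = 48; asynchronous requests and writes can therefore take at most 75% of the tags, leaving the remainder for the synchronous reads that dd_limit_depth() deliberately does not throttle.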
+
+/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
+static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
+{
+ dd_depth_updated(hctx);
+ return 0;
+}
+
+static void dd_exit_sched(struct elevator_queue *e)
{
struct deadline_data *dd = e->elevator_data;
+ enum dd_prio prio;
- BUG_ON(!list_empty(&dd->fifo_list[READ]));
- BUG_ON(!list_empty(&dd->fifo_list[WRITE]));
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+ const struct io_stats_per_prio *stats = &per_prio->stats;
+ uint32_t queued;
+
+ WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
+ WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));
+
+ spin_lock(&dd->lock);
+ queued = dd_queued(dd, prio);
+ spin_unlock(&dd->lock);
+
+ WARN_ONCE(queued != 0,
+ "statistics for priority %d: i %u m %u d %u c %u\n",
+ prio, stats->inserted, stats->merged,
+ stats->dispatched, atomic_read(&stats->completed));
+ }
kfree(dd);
}
@@ -403,55 +689,78 @@ static void dd_exit_queue(struct elevator_queue *e)
/*
* initialize elevator private data (deadline_data).
*/
-static int dd_init_queue(struct request_queue *q, struct elevator_type *e)
+static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
{
struct deadline_data *dd;
struct elevator_queue *eq;
+ enum dd_prio prio;
+ int ret = -ENOMEM;
eq = elevator_alloc(q, e);
if (!eq)
- return -ENOMEM;
+ return ret;
dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
- if (!dd) {
- kobject_put(&eq->kobj);
- return -ENOMEM;
- }
+ if (!dd)
+ goto put_eq;
+
eq->elevator_data = dd;
- INIT_LIST_HEAD(&dd->fifo_list[READ]);
- INIT_LIST_HEAD(&dd->fifo_list[WRITE]);
- dd->sort_list[READ] = RB_ROOT;
- dd->sort_list[WRITE] = RB_ROOT;
- dd->fifo_expire[READ] = read_expire;
- dd->fifo_expire[WRITE] = write_expire;
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ INIT_LIST_HEAD(&per_prio->dispatch);
+ INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
+ INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
+ per_prio->sort_list[DD_READ] = RB_ROOT;
+ per_prio->sort_list[DD_WRITE] = RB_ROOT;
+ }
+ dd->fifo_expire[DD_READ] = read_expire;
+ dd->fifo_expire[DD_WRITE] = write_expire;
dd->writes_starved = writes_starved;
dd->front_merges = 1;
+ dd->last_dir = DD_WRITE;
dd->fifo_batch = fifo_batch;
+ dd->prio_aging_expire = prio_aging_expire;
spin_lock_init(&dd->lock);
spin_lock_init(&dd->zone_lock);
- INIT_LIST_HEAD(&dd->dispatch);
+
+	/* We dispatch request-queue wide instead of per hw queue */
+ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);
q->elevator = eq;
return 0;
+
+put_eq:
+ kobject_put(&eq->kobj);
+ return ret;
}

+/*
+ * Try to merge @bio into an existing request. If @bio has been merged into
+ * an existing request, store the pointer to that request into *@rq.
+ */
static int dd_request_merge(struct request_queue *q, struct request **rq,
struct bio *bio)
{
struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
sector_t sector = bio_end_sector(bio);
struct request *__rq;
if (!dd->front_merges)
return ELEVATOR_NO_MERGE;
- __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector);
+ __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
if (__rq) {
BUG_ON(sector != blk_rq_pos(__rq));
if (elv_bio_merge_ok(__rq, bio)) {
*rq = __rq;
+ if (blk_discard_mergable(__rq))
+ return ELEVATOR_DISCARD_MERGE;
return ELEVATOR_FRONT_MERGE;
}
}
@@ -459,10 +768,13 @@ static int dd_request_merge(struct request_queue *q, struct request **rq,
return ELEVATOR_NO_MERGE;
}
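
Front merges are now looked up per priority class: the bio's I/O priority class selects one of the three per-priority rb-trees via the lookup table defined earlier in the patch, which (as far as this reviewer can tell) maps RT to queue 0, BE and unclassified bios to queue 1, and IDLE to queue 2. A stand-alone mirror of that mapping, with local enum values standing in for the kernel's ioprio constants:

	#include <stdio.h>

	enum { CLASS_NONE, CLASS_RT, CLASS_BE, CLASS_IDLE };	/* stand-ins */
	enum { PRIO_RT, PRIO_BE, PRIO_IDLE };

	static const int class_to_prio[] = {
		[CLASS_NONE] = PRIO_BE,		/* unclassified treated as best effort */
		[CLASS_RT]   = PRIO_RT,
		[CLASS_BE]   = PRIO_BE,
		[CLASS_IDLE] = PRIO_IDLE,
	};

	int main(void)
	{
		printf("RT bio -> per_prio[%d]\n", class_to_prio[CLASS_RT]);
		printf("unclassified bio -> per_prio[%d]\n", class_to_prio[CLASS_NONE]);
		return 0;
	}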
-static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
+/*
+ * Attempt to merge a bio into an existing request. This function is called
+ * before @bio is associated with a request.
+ */
+static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
unsigned int nr_segs)
{
- struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
struct request *free = NULL;
bool ret;
@@ -481,11 +793,17 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio,
* add rq to rbtree and fifo
*/
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
- bool at_head)
+ blk_insert_t flags, struct list_head *free)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
- const int data_dir = rq_data_dir(rq);
+ const enum dd_data_dir data_dir = rq_data_dir(rq);
+ u16 ioprio = req_get_ioprio(rq);
+ u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
+ struct dd_per_prio *per_prio;
+ enum dd_prio prio;
+
+ lockdep_assert_held(&dd->lock);
/*
* This may be a requeue of a write request that has locked its
@@ -493,18 +811,25 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
*/
blk_req_zone_write_unlock(rq);
- if (blk_mq_sched_try_insert_merge(q, rq))
+ prio = ioprio_class_to_prio[ioprio_class];
+ per_prio = &dd->per_prio[prio];
+ if (!rq->elv.priv[0]) {
+ per_prio->stats.inserted++;
+ rq->elv.priv[0] = (void *)(uintptr_t)1;
+ }
+
+ if (blk_mq_sched_try_insert_merge(q, rq, free))
return;
- blk_mq_sched_request_inserted(rq);
+ trace_block_rq_insert(rq);
- if (at_head || blk_rq_is_passthrough(rq)) {
- if (at_head)
- list_add(&rq->queuelist, &dd->dispatch);
- else
- list_add_tail(&rq->queuelist, &dd->dispatch);
+ if (flags & BLK_MQ_INSERT_AT_HEAD) {
+ list_add(&rq->queuelist, &per_prio->dispatch);
+ rq->fifo_time = jiffies;
} else {
- deadline_add_rq_rb(dd, rq);
+ struct list_head *insert_before;
+
+ deadline_add_rq_rb(per_prio, rq);
if (rq_mergeable(rq)) {
elv_rqhash_add(q, rq);
@@ -516,15 +841,33 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
* set expire time and add to fifo list
*/
rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
- list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]);
+ insert_before = &per_prio->fifo_list[data_dir];
+#ifdef CONFIG_BLK_DEV_ZONED
+ /*
+ * Insert zoned writes such that requests are sorted by
+ * position per zone.
+ */
+ if (blk_rq_is_seq_zoned_write(rq)) {
+ struct request *rq2 = deadline_latter_request(rq);
+
+ if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq))
+ insert_before = &rq2->queuelist;
+ }
+#endif
+ list_add_tail(&rq->queuelist, insert_before);
}
}
+/*
+ * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list().
+ */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
- struct list_head *list, bool at_head)
+ struct list_head *list,
+ blk_insert_t flags)
{
struct request_queue *q = hctx->queue;
struct deadline_data *dd = q->elevator->elevator_data;
+ LIST_HEAD(free);
spin_lock(&dd->lock);
while (!list_empty(list)) {
@@ -532,20 +875,34 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
rq = list_first_entry(list, struct request, queuelist);
list_del_init(&rq->queuelist);
- dd_insert_request(hctx, rq, at_head);
+ dd_insert_request(hctx, rq, flags, &free);
}
spin_unlock(&dd->lock);
+
+ blk_mq_free_requests(&free);
}
-/*
- * Nothing to do here. This is defined only to ensure that .finish_request
- * method is called upon request completion.
- */
+/* Callback from inside blk_mq_rq_ctx_init(). */
static void dd_prepare_request(struct request *rq)
{
+ rq->elv.priv[0] = NULL;
+}
+
+static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
+{
+ struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ enum dd_prio p;
+
+ for (p = 0; p <= DD_PRIO_MAX; p++)
+ if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
+ return true;
+
+ return false;
}
/*
+ * Callback from inside blk_mq_free_request().
+ *
* For zoned block devices, write unlock the target zone of
* completed write requests. Do this while holding the zone lock
* spinlock so that the zone is never unlocked while deadline_fifo_request()
@@ -562,83 +919,103 @@ static void dd_prepare_request(struct request *rq)
static void dd_finish_request(struct request *rq)
{
struct request_queue *q = rq->q;
+ struct deadline_data *dd = q->elevator->elevator_data;
+ const u8 ioprio_class = dd_rq_ioclass(rq);
+ const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
+ struct dd_per_prio *per_prio = &dd->per_prio[prio];
+
+ /*
+ * The block layer core may call dd_finish_request() without having
+ * called dd_insert_requests(). Skip requests that bypassed I/O
+ * scheduling. See also blk_mq_request_bypass_insert().
+ */
+ if (!rq->elv.priv[0])
+ return;
+
+ atomic_inc(&per_prio->stats.completed);
if (blk_queue_is_zoned(q)) {
- struct deadline_data *dd = q->elevator->elevator_data;
unsigned long flags;
spin_lock_irqsave(&dd->zone_lock, flags);
blk_req_zone_write_unlock(rq);
- if (!list_empty(&dd->fifo_list[WRITE]))
- blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
spin_unlock_irqrestore(&dd->zone_lock, flags);
+
+ if (dd_has_write_work(rq->mq_hctx))
+ blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
}
}
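
The rq->elv.priv[0] marker ties the three hooks together: dd_prepare_request() clears it, dd_insert_request() sets it once the request really went through the scheduler, and dd_finish_request() uses it to skip accounting for requests that bypassed insertion. A tiny stand-alone sketch of that pattern (illustrative names, not kernel code):

	#include <stdio.h>

	struct fake_rq { void *sched_mark; unsigned long completed; };

	static void prepare(struct fake_rq *rq) { rq->sched_mark = NULL; }
	static void insert(struct fake_rq *rq)  { rq->sched_mark = (void *)1UL; }
	static void finish(struct fake_rq *rq)
	{
		if (!rq->sched_mark)
			return;		/* bypassed the scheduler: nothing to account */
		rq->completed++;
	}

	int main(void)
	{
		struct fake_rq a = { 0 }, b = { 0 };

		prepare(&a); insert(&a); finish(&a);	/* counted */
		prepare(&b); finish(&b);		/* bypass: not counted */
		printf("%lu %lu\n", a.completed, b.completed);	/* 1 0 */
		return 0;
	}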
+static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
+{
+ return !list_empty_careful(&per_prio->dispatch) ||
+ !list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
+ !list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
+}
+
static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
struct deadline_data *dd = hctx->queue->elevator->elevator_data;
+ enum dd_prio prio;
- return !list_empty_careful(&dd->dispatch) ||
- !list_empty_careful(&dd->fifo_list[0]) ||
- !list_empty_careful(&dd->fifo_list[1]);
+ for (prio = 0; prio <= DD_PRIO_MAX; prio++)
+ if (dd_has_work_for_prio(&dd->per_prio[prio]))
+ return true;
+
+ return false;
}
/*
* sysfs parts below
*/
-static ssize_t
-deadline_var_show(int var, char *page)
-{
- return sprintf(page, "%d\n", var);
-}
-
-static void
-deadline_var_store(int *var, const char *page)
-{
- char *p = (char *) page;
-
- *var = simple_strtol(p, &p, 10);
-}
-
-#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \
+#define SHOW_INT(__FUNC, __VAR) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
struct deadline_data *dd = e->elevator_data; \
- int __data = __VAR; \
- if (__CONV) \
- __data = jiffies_to_msecs(__data); \
- return deadline_var_show(__data, (page)); \
-}
-SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1);
-SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1);
-SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0);
-SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0);
-SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0);
-#undef SHOW_FUNCTION
+ \
+ return sysfs_emit(page, "%d\n", __VAR); \
+}
+#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
+SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
+SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
+SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
+SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
+SHOW_INT(deadline_front_merges_show, dd->front_merges);
+SHOW_INT(deadline_async_depth_show, dd->async_depth);
+SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
+#undef SHOW_INT
+#undef SHOW_JIFFIES
#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
struct deadline_data *dd = e->elevator_data; \
- int __data; \
- deadline_var_store(&__data, (page)); \
+ int __data, __ret; \
+ \
+ __ret = kstrtoint(page, 0, &__data); \
+ if (__ret < 0) \
+ return __ret; \
if (__data < (MIN)) \
__data = (MIN); \
else if (__data > (MAX)) \
__data = (MAX); \
- if (__CONV) \
- *(__PTR) = msecs_to_jiffies(__data); \
- else \
- *(__PTR) = __data; \
+ *(__PTR) = __CONV(__data); \
return count; \
}
-STORE_FUNCTION(deadline_read_expire_store, &dd->fifo_expire[READ], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1);
-STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0);
-STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0);
-STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0);
+#define STORE_INT(__FUNC, __PTR, MIN, MAX) \
+ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
+#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \
+ STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
+STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
+STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
+STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
+STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
+STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
+STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
+STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#undef STORE_FUNCTION
+#undef STORE_INT
+#undef STORE_JIFFIES
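+
The reworked SHOW_*/STORE_* macros generate one sysfs show/store pair per tunable, and the *_JIFFIES variants additionally convert between milliseconds (the sysfs unit) and jiffies (the internal unit). Roughly, a generated store function parses an integer with kstrtoint(), clamps it to [MIN, MAX] and applies the conversion. A user-space approximation of that path follows; the tick rate is only an assumption made for the example:

	#include <errno.h>
	#include <limits.h>
	#include <stdio.h>
	#include <stdlib.h>

	#define EXAMPLE_HZ 250	/* assumed tick rate, for illustration only */

	static long example_msecs_to_jiffies(long ms)
	{
		return (ms * EXAMPLE_HZ + 999) / 1000;	/* round up */
	}

	static int store_read_expire(const char *page, long *fifo_expire)
	{
		char *end;
		long val = strtol(page, &end, 0);

		if (end == page)
			return -EINVAL;
		if (val < 0)
			val = 0;
		else if (val > INT_MAX)
			val = INT_MAX;
		*fifo_expire = example_msecs_to_jiffies(val);
		return 0;
	}

	int main(void)
	{
		long expire;

		store_read_expire("500", &expire);
		printf("read_expire = %ld jiffies\n", expire);	/* 125 at HZ=250 */
		return 0;
	}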
#define DD_ATTR(name) \
__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)
@@ -648,21 +1025,24 @@ static struct elv_fs_entry deadline_attrs[] = {
DD_ATTR(write_expire),
DD_ATTR(writes_starved),
DD_ATTR(front_merges),
+ DD_ATTR(async_depth),
DD_ATTR(fifo_batch),
+ DD_ATTR(prio_aging_expire),
__ATTR_NULL
};
#ifdef CONFIG_BLK_DEBUG_FS
-#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \
+#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
static void *deadline_##name##_fifo_start(struct seq_file *m, \
loff_t *pos) \
__acquires(&dd->lock) \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
\
spin_lock(&dd->lock); \
- return seq_list_start(&dd->fifo_list[ddir], *pos); \
+ return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \
} \
\
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
@@ -670,8 +1050,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
{ \
struct request_queue *q = m->private; \
struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
\
- return seq_list_next(v, &dd->fifo_list[ddir], pos); \
+ return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \
} \
\
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
@@ -695,14 +1076,22 @@ static int deadline_##name##_next_rq_show(void *data, \
{ \
struct request_queue *q = data; \
struct deadline_data *dd = q->elevator->elevator_data; \
- struct request *rq = dd->next_rq[ddir]; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ struct request *rq; \
\
+ rq = deadline_from_pos(per_prio, data_dir, \
+ per_prio->latest_pos[data_dir]); \
if (rq) \
__blk_mq_debugfs_rq_show(m, rq); \
return 0; \
}
-DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read)
-DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write)
+
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
+DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
#undef DEADLINE_DEBUGFS_DDIR_ATTRS
static int deadline_batching_show(void *data, struct seq_file *m)
@@ -723,49 +1112,130 @@ static int deadline_starved_show(void *data, struct seq_file *m)
return 0;
}
-static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos)
- __acquires(&dd->lock)
+static int dd_async_depth_show(void *data, struct seq_file *m)
{
- struct request_queue *q = m->private;
+ struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
- spin_lock(&dd->lock);
- return seq_list_start(&dd->dispatch, *pos);
+ seq_printf(m, "%u\n", dd->async_depth);
+ return 0;
}
-static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos)
+static int dd_queued_show(void *data, struct seq_file *m)
{
- struct request_queue *q = m->private;
+ struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
+ u32 rt, be, idle;
+
+ spin_lock(&dd->lock);
+ rt = dd_queued(dd, DD_RT_PRIO);
+ be = dd_queued(dd, DD_BE_PRIO);
+ idle = dd_queued(dd, DD_IDLE_PRIO);
+ spin_unlock(&dd->lock);
+
+ seq_printf(m, "%u %u %u\n", rt, be, idle);
+
+ return 0;
+}
+
+/* Number of requests owned by the block driver for a given priority. */
+static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
+{
+ const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;
- return seq_list_next(v, &dd->dispatch, pos);
+ lockdep_assert_held(&dd->lock);
+
+ return stats->dispatched + stats->merged -
+ atomic_read(&stats->completed);
}
-static void deadline_dispatch_stop(struct seq_file *m, void *v)
- __releases(&dd->lock)
+static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
- struct request_queue *q = m->private;
+ struct request_queue *q = data;
struct deadline_data *dd = q->elevator->elevator_data;
+ u32 rt, be, idle;
+ spin_lock(&dd->lock);
+ rt = dd_owned_by_driver(dd, DD_RT_PRIO);
+ be = dd_owned_by_driver(dd, DD_BE_PRIO);
+ idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
spin_unlock(&dd->lock);
+
+ seq_printf(m, "%u %u %u\n", rt, be, idle);
+
+ return 0;
}
-static const struct seq_operations deadline_dispatch_seq_ops = {
- .start = deadline_dispatch_start,
- .next = deadline_dispatch_next,
- .stop = deadline_dispatch_stop,
- .show = blk_mq_debugfs_rq_show,
-};
+#define DEADLINE_DISPATCH_ATTR(prio) \
+static void *deadline_dispatch##prio##_start(struct seq_file *m, \
+ loff_t *pos) \
+ __acquires(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ spin_lock(&dd->lock); \
+ return seq_list_start(&per_prio->dispatch, *pos); \
+} \
+ \
+static void *deadline_dispatch##prio##_next(struct seq_file *m, \
+ void *v, loff_t *pos) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
+ \
+ return seq_list_next(v, &per_prio->dispatch, pos); \
+} \
+ \
+static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \
+ __releases(&dd->lock) \
+{ \
+ struct request_queue *q = m->private; \
+ struct deadline_data *dd = q->elevator->elevator_data; \
+ \
+ spin_unlock(&dd->lock); \
+} \
+ \
+static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
+ .start = deadline_dispatch##prio##_start, \
+ .next = deadline_dispatch##prio##_next, \
+ .stop = deadline_dispatch##prio##_stop, \
+ .show = blk_mq_debugfs_rq_show, \
+}
+
+DEADLINE_DISPATCH_ATTR(0);
+DEADLINE_DISPATCH_ATTR(1);
+DEADLINE_DISPATCH_ATTR(2);
+#undef DEADLINE_DISPATCH_ATTR
-#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
- {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \
+#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
+ {#name "_fifo_list", 0400, \
+ .seq_ops = &deadline_##name##_fifo_seq_ops}
+#define DEADLINE_NEXT_RQ_ATTR(name) \
{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
- DEADLINE_QUEUE_DDIR_ATTRS(read),
- DEADLINE_QUEUE_DDIR_ATTRS(write),
+ DEADLINE_QUEUE_DDIR_ATTRS(read0),
+ DEADLINE_QUEUE_DDIR_ATTRS(write0),
+ DEADLINE_QUEUE_DDIR_ATTRS(read1),
+ DEADLINE_QUEUE_DDIR_ATTRS(write1),
+ DEADLINE_QUEUE_DDIR_ATTRS(read2),
+ DEADLINE_QUEUE_DDIR_ATTRS(write2),
+ DEADLINE_NEXT_RQ_ATTR(read0),
+ DEADLINE_NEXT_RQ_ATTR(write0),
+ DEADLINE_NEXT_RQ_ATTR(read1),
+ DEADLINE_NEXT_RQ_ATTR(write1),
+ DEADLINE_NEXT_RQ_ATTR(read2),
+ DEADLINE_NEXT_RQ_ATTR(write2),
{"batching", 0400, deadline_batching_show},
{"starved", 0400, deadline_starved_show},
- {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops},
+ {"async_depth", 0400, dd_async_depth_show},
+ {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
+ {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
+ {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
+ {"owned_by_driver", 0400, dd_owned_by_driver_show},
+ {"queued", 0400, dd_queued_show},
{},
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
@@ -773,6 +1243,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
static struct elevator_type mq_deadline = {
.ops = {
+ .depth_updated = dd_depth_updated,
+ .limit_depth = dd_limit_depth,
.insert_requests = dd_insert_requests,
.dispatch_request = dd_dispatch_request,
.prepare_request = dd_prepare_request,
@@ -784,8 +1256,9 @@ static struct elevator_type mq_deadline = {
.requests_merged = dd_merged_requests,
.request_merged = dd_request_merged,
.has_work = dd_has_work,
- .init_sched = dd_init_queue,
- .exit_sched = dd_exit_queue,
+ .init_sched = dd_init_sched,
+ .exit_sched = dd_exit_sched,
+ .init_hctx = dd_init_hctx,
},
#ifdef CONFIG_BLK_DEBUG_FS
@@ -812,6 +1285,6 @@ static void __exit deadline_exit(void)
module_init(deadline_init);
module_exit(deadline_exit);
-MODULE_AUTHOR("Jens Axboe");
+MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");
diff --git a/block/opal_proto.h b/block/opal_proto.h
index b486b3ec7dc4..d247a457bf6e 100644
--- a/block/opal_proto.h
+++ b/block/opal_proto.h
@@ -39,7 +39,12 @@ enum opal_response_token {
#define FIRST_TPER_SESSION_NUM 4096
#define TPER_SYNC_SUPPORTED 0x01
+/* FC_LOCKING features */
+#define LOCKING_SUPPORTED_MASK 0x01
+#define LOCKING_ENABLED_MASK 0x02
+#define LOCKED_MASK 0x04
#define MBR_ENABLED_MASK 0x10
+#define MBR_DONE_MASK 0x20
#define TINY_ATOM_DATA_MASK 0x3F
#define TINY_ATOM_SIGNED 0x40
@@ -66,6 +71,7 @@ enum opal_response_token {
#define SHORT_ATOM_BYTE 0xBF
#define MEDIUM_ATOM_BYTE 0xDF
#define LONG_ATOM_BYTE 0xE3
+#define EMPTY_ATOM_BYTE 0xFF
#define OPAL_INVAL_PARAM 12
#define OPAL_MANUFACTURED_INACTIVE 0x08
@@ -81,6 +87,15 @@ enum opal_response_token {
#define OPAL_MSID_KEYLEN 15
#define OPAL_UID_LENGTH_HALF 4
+/*
+ * Boolean operators from TCG Core spec 2.01 Section:
+ * 5.1.3.11
+ * Table 61
+ */
+#define OPAL_BOOLEAN_AND 0
+#define OPAL_BOOLEAN_OR 1
+#define OPAL_BOOLEAN_NOT 2
+
/* Enum to index OPALUID array */
enum opal_uid {
/* users */
@@ -100,6 +115,7 @@ enum opal_uid {
/* tables */
OPAL_TABLE_TABLE,
OPAL_LOCKINGRANGE_GLOBAL,
+ OPAL_LOCKINGRANGE_ACE_START_TO_KEY,
OPAL_LOCKINGRANGE_ACE_RDLOCKED,
OPAL_LOCKINGRANGE_ACE_WRLOCKED,
OPAL_MBRCONTROL,
@@ -210,6 +226,10 @@ enum opal_parameter {
OPAL_SUM_SET_LIST = 0x060000,
};
+enum opal_revertlsp {
+ OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000,
+};
+
/* Packets derived from:
* TCG_Storage_Architecture_Core_Spec_v2.01_r1.00
 * Section: 3.2.3 ComPackets, Packets & Subpackets
diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig
index 6e2a649669e5..7aff4eb81c60 100644
--- a/block/partitions/Kconfig
+++ b/block/partitions/Kconfig
@@ -2,6 +2,8 @@
#
# Partition configuration
#
+menu "Partition Types"
+
config PARTITION_ADVANCED
bool "Advanced partition selection"
help
@@ -264,7 +266,8 @@ config SYSV68_PARTITION
config CMDLINE_PARTITION
bool "Command line partition support" if PARTITION_ADVANCED
- select BLK_CMDLINE_PARSER
help
Say Y here if you want to read the partition table from bootargs.
The format for the command line is just like mtdparts.
+
+endmenu
diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c
index c64c57b958bf..d2fc122d7426 100644
--- a/block/partitions/acorn.c
+++ b/block/partitions/acorn.c
@@ -275,20 +275,20 @@ int adfspart_check_ADFS(struct parsed_partitions *state)
/*
* Work out start of non-adfs partition.
*/
- nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect;
+ nr_sects = get_capacity(state->disk) - start_sect;
if (start_sect) {
switch (id) {
#ifdef CONFIG_ACORN_PARTITION_RISCIX
case PARTITION_RISCIX_SCSI:
case PARTITION_RISCIX_MFM:
- slot = riscix_partition(state, start_sect, slot,
+ riscix_partition(state, start_sect, slot,
nr_sects);
break;
#endif
case PARTITION_LINUX:
- slot = linux_partition(state, start_sect, slot,
+ linux_partition(state, start_sect, slot,
nr_sects);
break;
}
@@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state)
if (i != 0) {
sector_t size;
- size = get_capacity(state->bdev->bd_disk);
+ size = get_capacity(state->disk);
put_partition(state, slot++, start, size - start);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
}
diff --git a/block/partitions/aix.c b/block/partitions/aix.c
index c7b4fd1a4a97..85f4b967565e 100644
--- a/block/partitions/aix.c
+++ b/block/partitions/aix.c
@@ -67,29 +67,13 @@ struct pvd {
#define LVM_MAXLVS 256
/**
- * last_lba(): return number of last logical block of device
- * @bdev: block device
- *
- * Description: Returns last LBA value on success, 0 on error.
- * This is stored (by sd and ide-geometry) in
- * the part[0] entry for this disk, and is the number of
- * physical sectors available on the disk.
- */
-static u64 last_lba(struct block_device *bdev)
-{
- if (!bdev || !bdev->bd_inode)
- return 0;
- return (bdev->bd_inode->i_size >> 9) - 1ULL;
-}
-
-/**
* read_lba(): Read bytes from disk, starting at given LBA
* @state
* @lba
* @buffer
* @count
*
- * Description: Reads @count bytes from @state->bdev into @buffer.
+ * Description: Reads @count bytes from @state->disk into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
@@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer,
{
size_t totalreadcount = 0;
- if (!buffer || lba + count / 512 > last_lba(state->bdev))
+ if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL)
return 0;
while (count) {
diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c
index 9526491d9aed..506921095412 100644
--- a/block/partitions/amiga.c
+++ b/block/partitions/amiga.c
@@ -11,10 +11,18 @@
#define pr_fmt(fmt) fmt
#include <linux/types.h>
+#include <linux/mm_types.h>
+#include <linux/overflow.h>
#include <linux/affs_hardblocks.h>
#include "check.h"
+/* magic offsets in partition DosEnvVec */
+#define NR_HD 3
+#define NR_SECT 5
+#define LO_CYL 9
+#define HI_CYL 10
+
static __inline__ u32
checksum_block(__be32 *m, int size)
{
@@ -31,18 +39,21 @@ int amiga_partition(struct parsed_partitions *state)
unsigned char *data;
struct RigidDiskBlock *rdb;
struct PartitionBlock *pb;
- int start_sect, nr_sects, blk, part, res = 0;
- int blksize = 1; /* Multiplier for disk block size */
+ u64 start_sect, nr_sects;
+ sector_t blk, end_sect;
+ u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */
+ u32 nr_hd, nr_sect, lo_cyl, hi_cyl;
+ int part, res = 0;
+ unsigned int blksize = 1; /* Multiplier for disk block size */
int slot = 1;
- char b[BDEVNAME_SIZE];
for (blk = 0; ; blk++, put_dev_sector(sect)) {
if (blk == RDB_ALLOCATION_LIMIT)
goto rdb_done;
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read RDB block %d\n",
- bdevname(state->bdev, b), blk);
+ pr_err("Dev %s: unable to read RDB block %llu\n",
+ state->disk->disk_name, blk);
res = -1;
goto rdb_done;
}
@@ -58,13 +69,13 @@ int amiga_partition(struct parsed_partitions *state)
*(__be32 *)(data+0xdc) = 0;
if (checksum_block((__be32 *)data,
be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) {
- pr_err("Trashed word at 0xd0 in block %d ignored in checksum calculation\n",
+ pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n",
blk);
break;
}
- pr_err("Dev %s: RDB in block %d has bad checksum\n",
- bdevname(state->bdev, b), blk);
+ pr_err("Dev %s: RDB in block %llu has bad checksum\n",
+ state->disk->disk_name, blk);
}
/* blksize is blocks per 512 byte standard block */
@@ -79,12 +90,17 @@ int amiga_partition(struct parsed_partitions *state)
}
blk = be32_to_cpu(rdb->rdb_PartitionList);
put_dev_sector(sect);
- for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) {
- blk *= blksize; /* Read in terms partition table understands */
+ for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) {
+		/* Read in terms the partition table understands */
+ if (check_mul_overflow(blk, (sector_t) blksize, &blk)) {
+ pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n",
+ state->disk->disk_name, blk, part);
+ break;
+ }
data = read_part_sector(state, blk, &sect);
if (!data) {
- pr_err("Dev %s: unable to read partition block %d\n",
- bdevname(state->bdev, b), blk);
+ pr_err("Dev %s: unable to read partition block %llu\n",
+ state->disk->disk_name, blk);
res = -1;
goto rdb_done;
}
@@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state)
if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 )
continue;
- /* Tell Kernel about it */
+ /* RDB gives us more than enough rope to hang ourselves with,
+ * many times over (2^128 bytes if all fields max out).
+ * Some careful checks are in order, so check for potential
+ * overflows.
+ * We are multiplying four 32 bit numbers to one sector_t!
+ */
+
+ nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]);
+ nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]);
+
+ /* CylBlocks is total number of blocks per cylinder */
+ if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) {
+ pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n",
+ state->disk->disk_name, cylblk);
+ continue;
+ }
+
+ /* check for consistency with RDB defined CylBlocks */
+ if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) {
+ pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n",
+ state->disk->disk_name, cylblk,
+ be32_to_cpu(rdb->rdb_CylBlocks));
+ }
+
+ /* RDB allows for variable logical block size -
+ * normalize to 512 byte blocks and check result.
+ */
+
+ if (check_mul_overflow(cylblk, blksize, &cylblk)) {
+ pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n",
+ state->disk->disk_name, part);
+ continue;
+ }
+
+ /* Calculate partition start and end. Limit of 32 bit on cylblk
+ * guarantees no overflow occurs if LBD support is enabled.
+ */
+
+ lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]);
+ start_sect = ((u64) lo_cyl * cylblk);
+
+ hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]);
+ nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk);
- nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 -
- be32_to_cpu(pb->pb_Environment[9])) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
if (!nr_sects)
continue;
- start_sect = be32_to_cpu(pb->pb_Environment[9]) *
- be32_to_cpu(pb->pb_Environment[3]) *
- be32_to_cpu(pb->pb_Environment[5]) *
- blksize;
+
+ /* Warn user if partition end overflows u32 (AmigaDOS limit) */
+
+ if ((start_sect + nr_sects) > UINT_MAX) {
+ pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n",
+ state->disk->disk_name, part,
+ start_sect, start_sect + nr_sects);
+ }
+
+ if (check_add_overflow(start_sect, nr_sects, &end_sect)) {
+ pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n",
+ state->disk->disk_name, part,
+ start_sect, end_sect);
+ continue;
+ }
+
+ /* Tell Kernel about it */
+
put_partition(state,slot++,start_sect,nr_sects);
{
/* Be even more informative to aid mounting */
diff --git a/block/partitions/atari.c b/block/partitions/atari.c
index 2305840c8522..9655c728262a 100644
--- a/block/partitions/atari.c
+++ b/block/partitions/atari.c
@@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state)
* ATARI partition scheme supports 512 lba only. If this is not
* the case, bail early to avoid miscalculating hd_size.
*/
- if (bdev_logical_block_size(state->bdev) != 512)
+ if (queue_logical_block_size(state->disk->queue) != 512)
return 0;
rs = read_part_sector(state, 0, &sect);
@@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state)
return -1;
/* Verify this is an Atari rootsector: */
- hd_size = state->bdev->bd_inode->i_size >> 9;
+ hd_size = get_capacity(state->disk);
if (!VALID_PARTITION(&rs->part[0], hd_size) &&
!VALID_PARTITION(&rs->part[1], hd_size) &&
!VALID_PARTITION(&rs->part[2], hd_size) &&
@@ -140,7 +140,6 @@ int atari_partition(struct parsed_partitions *state)
/* accept only GEM,BGM,RAW,LNX,SWP partitions */
if (!((pi->flg & 1) && OK_id(pi->id)))
continue;
- part_fmt = 2;
put_partition (state, slot,
be32_to_cpu(pi->st),
be32_to_cpu(pi->siz));
diff --git a/block/partitions/check.h b/block/partitions/check.h
index c577e9ee67f0..8d70a880c372 100644
--- a/block/partitions/check.h
+++ b/block/partitions/check.h
@@ -1,7 +1,6 @@
/* SPDX-License-Identifier: GPL-2.0 */
#include <linux/pagemap.h>
#include <linux/blkdev.h>
-#include <linux/genhd.h>
#include "../blk.h"
/*
@@ -9,7 +8,7 @@
* description.
*/
struct parsed_partitions {
- struct block_device *bdev;
+ struct gendisk *disk;
char name[BDEVNAME_SIZE];
struct {
sector_t from;
@@ -25,13 +24,13 @@ struct parsed_partitions {
};
typedef struct {
- struct page *v;
+ struct folio *v;
} Sector;
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p);
static inline void put_dev_sector(Sector p)
{
- put_page(p.v);
+ folio_put(p.v);
}
static inline void
diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c
index 8f545c36cde4..c03bc105e575 100644
--- a/block/partitions/cmdline.c
+++ b/block/partitions/cmdline.c
@@ -14,20 +14,245 @@
* For further information, see "Documentation/block/cmdline-partition.rst"
*
*/
+#include <linux/blkdev.h>
+#include <linux/fs.h>
+#include <linux/slab.h>
+#include "check.h"
-#include <linux/cmdline-parser.h>
-#include "check.h"
+/* partition flags */
+#define PF_RDONLY 0x01 /* Device is read only */
+#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */
+
+struct cmdline_subpart {
+ char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */
+ sector_t from;
+ sector_t size;
+ int flags;
+ struct cmdline_subpart *next_subpart;
+};
+
+struct cmdline_parts {
+ char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */
+ unsigned int nr_subparts;
+ struct cmdline_subpart *subpart;
+ struct cmdline_parts *next_parts;
+};
+
+static int parse_subpart(struct cmdline_subpart **subpart, char *partdef)
+{
+ int ret = 0;
+ struct cmdline_subpart *new_subpart;
+
+ *subpart = NULL;
+
+ new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL);
+ if (!new_subpart)
+ return -ENOMEM;
+
+ if (*partdef == '-') {
+ new_subpart->size = (sector_t)(~0ULL);
+ partdef++;
+ } else {
+ new_subpart->size = (sector_t)memparse(partdef, &partdef);
+ if (new_subpart->size < (sector_t)PAGE_SIZE) {
+ pr_warn("cmdline partition size is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+ }
+
+ if (*partdef == '@') {
+ partdef++;
+ new_subpart->from = (sector_t)memparse(partdef, &partdef);
+ } else {
+ new_subpart->from = (sector_t)(~0ULL);
+ }
+
+ if (*partdef == '(') {
+ int length;
+ char *next = strchr(++partdef, ')');
+
+ if (!next) {
+ pr_warn("cmdline partition format is invalid.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ length = min_t(int, next - partdef,
+ sizeof(new_subpart->name) - 1);
+ strscpy(new_subpart->name, partdef, length);
+
+ partdef = ++next;
+ } else
+ new_subpart->name[0] = '\0';
+
+ new_subpart->flags = 0;
+
+ if (!strncmp(partdef, "ro", 2)) {
+ new_subpart->flags |= PF_RDONLY;
+ partdef += 2;
+ }
+
+ if (!strncmp(partdef, "lk", 2)) {
+ new_subpart->flags |= PF_POWERUP_LOCK;
+ partdef += 2;
+ }
+
+ *subpart = new_subpart;
+ return 0;
+fail:
+ kfree(new_subpart);
+ return ret;
+}
+
+static void free_subpart(struct cmdline_parts *parts)
+{
+ struct cmdline_subpart *subpart;
+
+ while (parts->subpart) {
+ subpart = parts->subpart;
+ parts->subpart = subpart->next_subpart;
+ kfree(subpart);
+ }
+}
+
+static int parse_parts(struct cmdline_parts **parts, const char *bdevdef)
+{
+ int ret = -EINVAL;
+ char *next;
+ int length;
+ struct cmdline_subpart **next_subpart;
+ struct cmdline_parts *newparts;
+ char buf[BDEVNAME_SIZE + 32 + 4];
+
+ *parts = NULL;
+
+ newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL);
+ if (!newparts)
+ return -ENOMEM;
+
+ next = strchr(bdevdef, ':');
+ if (!next) {
+ pr_warn("cmdline partition has no block device.");
+ goto fail;
+ }
+
+ length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1);
+ strscpy(newparts->name, bdevdef, length);
+ newparts->nr_subparts = 0;
+
+ next_subpart = &newparts->subpart;
+
+ while (next && *(++next)) {
+ bdevdef = next;
+ next = strchr(bdevdef, ',');
+
+ length = (!next) ? (sizeof(buf) - 1) :
+ min_t(int, next - bdevdef, sizeof(buf) - 1);
+
+ strscpy(buf, bdevdef, length);
+
+ ret = parse_subpart(next_subpart, buf);
+ if (ret)
+ goto fail;
+
+ newparts->nr_subparts++;
+ next_subpart = &(*next_subpart)->next_subpart;
+ }
+
+ if (!newparts->subpart) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ *parts = newparts;
+
+ return 0;
+fail:
+ free_subpart(newparts);
+ kfree(newparts);
+ return ret;
+}
+
+static void cmdline_parts_free(struct cmdline_parts **parts)
+{
+ struct cmdline_parts *next_parts;
+
+ while (*parts) {
+ next_parts = (*parts)->next_parts;
+ free_subpart(*parts);
+ kfree(*parts);
+ *parts = next_parts;
+ }
+}
+
+static int cmdline_parts_parse(struct cmdline_parts **parts,
+ const char *cmdline)
+{
+ int ret;
+ char *buf;
+ char *pbuf;
+ char *next;
+ struct cmdline_parts **next_parts;
+
+ *parts = NULL;
+
+ next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL);
+ if (!buf)
+ return -ENOMEM;
+
+ next_parts = parts;
+
+ while (next && *pbuf) {
+ next = strchr(pbuf, ';');
+ if (next)
+ *next = '\0';
+
+ ret = parse_parts(next_parts, pbuf);
+ if (ret)
+ goto fail;
+
+ if (next)
+ pbuf = ++next;
+
+ next_parts = &(*next_parts)->next_parts;
+ }
+
+ if (!*parts) {
+ pr_warn("cmdline partition has no valid partition.");
+ ret = -EINVAL;
+ goto fail;
+ }
+
+ ret = 0;
+done:
+ kfree(buf);
+ return ret;
+
+fail:
+ cmdline_parts_free(parts);
+ goto done;
+}
+
+static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts,
+ const char *bdev)
+{
+ while (parts && strncmp(bdev, parts->name, sizeof(parts->name)))
+ parts = parts->next_parts;
+ return parts;
+}
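+
These helpers parse the blkdevparts= syntax described in "Documentation/block/cmdline-partition.rst": device definitions separated by ';', sub-partitions separated by ',', each written as <size>[@<offset>](<name>) with optional "ro" and "lk" suffixes. Sizes and offsets go through memparse(), so K/M/G suffixes work, and '-' means "use the remaining space". An example boot parameter (device names are hypothetical):

	blkdevparts=mmcblk0:1G(boot),512M(swap),-(rootfs);mmcblk1:-(data)

On mmcblk0 this creates a 1 GiB "boot" partition, a 512 MiB "swap" partition right after it, and a "rootfs" partition spanning the rest of the device; mmcblk1 gets a single partition covering the whole device.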
static char *cmdline;
static struct cmdline_parts *bdev_parts;
-static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
+static int add_part(int slot, struct cmdline_subpart *subpart,
+ struct parsed_partitions *state)
{
int label_min;
struct partition_meta_info *info;
char tmp[sizeof(info->volname) + 4];
- struct parsed_partitions *state = (struct parsed_partitions *)param;
if (slot >= state->limit)
return 1;
@@ -39,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
label_min = min_t(int, sizeof(info->volname) - 1,
sizeof(subpart->name));
- strncpy(info->volname, subpart->name, label_min);
- info->volname[label_min] = '\0';
+ strscpy(info->volname, subpart->name, label_min);
snprintf(tmp, sizeof(tmp), "(%s)", info->volname);
strlcat(state->pp_buf, tmp, PAGE_SIZE);
@@ -50,6 +274,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param)
return 0;
}
+static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size,
+ struct parsed_partitions *state)
+{
+ sector_t from = 0;
+ struct cmdline_subpart *subpart;
+ int slot = 1;
+
+ for (subpart = parts->subpart; subpart;
+ subpart = subpart->next_subpart, slot++) {
+ if (subpart->from == (sector_t)(~0ULL))
+ subpart->from = from;
+ else
+ from = subpart->from;
+
+ if (from >= disk_size)
+ break;
+
+ if (subpart->size > (disk_size - from))
+ subpart->size = disk_size - from;
+
+ from += subpart->size;
+
+ if (add_part(slot, subpart, state))
+ break;
+ }
+
+ return slot;
+}
+
static int __init cmdline_parts_setup(char *s)
{
cmdline = s;
@@ -123,7 +376,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state)
int cmdline_partition(struct parsed_partitions *state)
{
sector_t disk_size;
- char bdev[BDEVNAME_SIZE];
struct cmdline_parts *parts;
if (cmdline) {
@@ -140,14 +392,13 @@ int cmdline_partition(struct parsed_partitions *state)
if (!bdev_parts)
return 0;
- bdevname(state->bdev, bdev);
- parts = cmdline_parts_find(bdev_parts, bdev);
+ parts = cmdline_parts_find(bdev_parts, state->disk->disk_name);
if (!parts)
return 0;
- disk_size = get_capacity(state->bdev->bd_disk) << 9;
+ disk_size = get_capacity(state->disk) << 9;
- cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state);
+ cmdline_parts_set(parts, disk_size, state);
cmdline_parts_verifier(1, state);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
diff --git a/block/partitions/core.c b/block/partitions/core.c
index 78951e33b2d7..5f5ed5c75f04 100644
--- a/block/partitions/core.c
+++ b/block/partitions/core.c
@@ -2,17 +2,17 @@
/*
* Copyright (C) 1991-1998 Linus Torvalds
* Re-organised Feb 1998 Russell King
+ * Copyright (C) 2020 Christoph Hellwig
*/
#include <linux/fs.h>
+#include <linux/major.h>
#include <linux/slab.h>
#include <linux/ctype.h>
-#include <linux/genhd.h>
#include <linux/vmalloc.h>
-#include <linux/blktrace_api.h>
#include <linux/raid/detect.h>
#include "check.h"
-static int (*check_part[])(struct parsed_partitions *) = {
+static int (*const check_part[])(struct parsed_partitions *) = {
/*
* Probe partition formats with tables at disk address 0
* that also have an ADFS boot block at 0xdc0.
@@ -88,13 +88,12 @@ static int (*check_part[])(struct parsed_partitions *) = {
static struct parsed_partitions *allocate_partitions(struct gendisk *hd)
{
struct parsed_partitions *state;
- int nr;
+ int nr = DISK_MAX_PARTS;
state = kzalloc(sizeof(*state), GFP_KERNEL);
if (!state)
return NULL;
- nr = disk_max_parts(hd);
state->parts = vzalloc(array_size(nr, sizeof(state->parts[0])));
if (!state->parts) {
kfree(state);
@@ -112,8 +111,7 @@ static void free_partitions(struct parsed_partitions *state)
kfree(state);
}
-static struct parsed_partitions *check_partition(struct gendisk *hd,
- struct block_device *bdev)
+static struct parsed_partitions *check_partition(struct gendisk *hd)
{
struct parsed_partitions *state;
int i, res, err;
@@ -128,8 +126,8 @@ static struct parsed_partitions *check_partition(struct gendisk *hd,
}
state->pp_buf[0] = '\0';
- state->bdev = bdev;
- disk_name(hd, 0, state->name);
+ state->disk = hd;
+ snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name);
snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name);
if (isdigit(state->name[strlen(state->name)-1]))
sprintf(state->name, "p");
@@ -175,38 +173,31 @@ static struct parsed_partitions *check_partition(struct gendisk *hd,
static ssize_t part_partition_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%d\n", p->partno);
+ return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno);
}
static ssize_t part_start_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
-
- return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect);
+ return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect);
}
static ssize_t part_ro_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%d\n", p->policy ? 1 : 0);
+ return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev)));
}
static ssize_t part_alignment_offset_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset);
+ return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev)));
}
static ssize_t part_discard_alignment_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
- struct hd_struct *p = dev_to_part(dev);
- return sprintf(buf, "%u\n", p->discard_alignment);
+ return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev)));
}
static DEVICE_ATTR(partition, 0444, part_partition_show, NULL);
@@ -237,7 +228,7 @@ static struct attribute *part_attrs[] = {
NULL
};
-static struct attribute_group part_attr_group = {
+static const struct attribute_group part_attr_group = {
.attrs = part_attrs,
};
@@ -251,87 +242,36 @@ static const struct attribute_group *part_attr_groups[] = {
static void part_release(struct device *dev)
{
- struct hd_struct *p = dev_to_part(dev);
- blk_free_devt(dev->devt);
- hd_free_part(p);
- kfree(p);
+ put_disk(dev_to_bdev(dev)->bd_disk);
+ iput(dev_to_bdev(dev)->bd_inode);
}
-static int part_uevent(struct device *dev, struct kobj_uevent_env *env)
+static int part_uevent(const struct device *dev, struct kobj_uevent_env *env)
{
- struct hd_struct *part = dev_to_part(dev);
+ const struct block_device *part = dev_to_bdev(dev);
- add_uevent_var(env, "PARTN=%u", part->partno);
- if (part->info && part->info->volname[0])
- add_uevent_var(env, "PARTNAME=%s", part->info->volname);
+ add_uevent_var(env, "PARTN=%u", part->bd_partno);
+ if (part->bd_meta_info && part->bd_meta_info->volname[0])
+ add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname);
return 0;
}
-struct device_type part_type = {
+const struct device_type part_type = {
.name = "partition",
.groups = part_attr_groups,
.release = part_release,
.uevent = part_uevent,
};
-static void hd_struct_free_work(struct work_struct *work)
-{
- struct hd_struct *part =
- container_of(to_rcu_work(work), struct hd_struct, rcu_work);
-
- part->start_sect = 0;
- part->nr_sects = 0;
- part_stat_set_all(part, 0);
- put_device(part_to_dev(part));
-}
-
-static void hd_struct_free(struct percpu_ref *ref)
-{
- struct hd_struct *part = container_of(ref, struct hd_struct, ref);
- struct gendisk *disk = part_to_disk(part);
- struct disk_part_tbl *ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
-
- rcu_assign_pointer(ptbl->last_lookup, NULL);
- put_device(disk_to_dev(disk));
-
- INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work);
- queue_rcu_work(system_wq, &part->rcu_work);
-}
-
-int hd_ref_init(struct hd_struct *part)
-{
- if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL))
- return -ENOMEM;
- return 0;
-}
-
-/*
- * Must be called either with bd_mutex held, before a disk can be opened or
- * after all disk users are gone.
- */
-void delete_partition(struct gendisk *disk, struct hd_struct *part)
+void drop_partition(struct block_device *part)
{
- struct disk_part_tbl *ptbl =
- rcu_dereference_protected(disk->part_tbl, 1);
+ lockdep_assert_held(&part->bd_disk->open_mutex);
- /*
- * ->part_tbl is referenced in this part's release handler, so
- * we have to hold the disk device
- */
- get_device(disk_to_dev(part_to_disk(part)));
- rcu_assign_pointer(ptbl->part[part->partno], NULL);
- kobject_put(part->holder_dir);
- device_del(part_to_dev(part));
+ xa_erase(&part->bd_disk->part_tbl, part->bd_partno);
+ kobject_put(part->bd_holder_dir);
- /*
- * Remove gendisk pointer from idr so that it cannot be looked up
- * while RCU period before freeing gendisk is running to prevent
- * use-after-free issues. Note that the device number stays
- * "in-use" until we really free the gendisk.
- */
- blk_invalidate_devt(part_devt(part));
- percpu_ref_kill(&part->ref);
+ device_del(&part->bd_device);
+ put_device(&part->bd_device);
}
static ssize_t whole_disk_show(struct device *dev,
@@ -339,84 +279,53 @@ static ssize_t whole_disk_show(struct device *dev,
{
return 0;
}
-static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
+static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL);
/*
- * Must be called either with bd_mutex held, before a disk can be opened or
+ * Must be called either with open_mutex held, before a disk can be opened or
* after all disk users are gone.
*/
-static struct hd_struct *add_partition(struct gendisk *disk, int partno,
+static struct block_device *add_partition(struct gendisk *disk, int partno,
sector_t start, sector_t len, int flags,
struct partition_meta_info *info)
{
- struct hd_struct *p;
dev_t devt = MKDEV(0, 0);
struct device *ddev = disk_to_dev(disk);
struct device *pdev;
- struct disk_part_tbl *ptbl;
+ struct block_device *bdev;
const char *dname;
int err;
+ lockdep_assert_held(&disk->open_mutex);
+
+ if (partno >= DISK_MAX_PARTS)
+ return ERR_PTR(-EINVAL);
+
/*
* Partitions are not supported on zoned block devices that are used as
* such.
*/
- switch (disk->queue->limits.zoned) {
- case BLK_ZONED_HM:
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: partitions not supported on host managed zoned block device\n",
disk->disk_name);
return ERR_PTR(-ENXIO);
- case BLK_ZONED_HA:
- pr_info("%s: disabling host aware zoned block device support due to partitions\n",
- disk->disk_name);
- disk->queue->limits.zoned = BLK_ZONED_NONE;
- break;
- case BLK_ZONED_NONE:
- break;
}
- err = disk_expand_part_tbl(disk, partno);
- if (err)
- return ERR_PTR(err);
- ptbl = rcu_dereference_protected(disk->part_tbl, 1);
-
- if (ptbl->part[partno])
- return ERR_PTR(-EBUSY);
-
- p = kzalloc(sizeof(*p), GFP_KERNEL);
- if (!p)
+ if (xa_load(&disk->part_tbl, partno))
return ERR_PTR(-EBUSY);
- p->dkstats = alloc_percpu(struct disk_stats);
- if (!p->dkstats) {
- err = -ENOMEM;
- goto out_free;
- }
-
- hd_sects_seq_init(p);
- pdev = part_to_dev(p);
+ /* ensure we always have a reference to the whole disk */
+ get_device(disk_to_dev(disk));
- p->start_sect = start;
- p->alignment_offset =
- queue_limit_alignment_offset(&disk->queue->limits, start);
- p->discard_alignment =
- queue_limit_discard_alignment(&disk->queue->limits, start);
- p->nr_sects = len;
- p->partno = partno;
- p->policy = get_disk_ro(disk);
-
- if (info) {
- struct partition_meta_info *pinfo;
+ err = -ENOMEM;
+ bdev = bdev_alloc(disk, partno);
+ if (!bdev)
+ goto out_put_disk;
- pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id);
- if (!pinfo) {
- err = -ENOMEM;
- goto out_free_stats;
- }
- memcpy(pinfo, info, sizeof(*info));
- p->info = pinfo;
- }
+ bdev->bd_start_sect = start;
+ bdev_set_nr_sectors(bdev, len);
+ pdev = &bdev->bd_device;
dname = dev_name(ddev);
if (isdigit(dname[strlen(dname) - 1]))
dev_set_name(pdev, "%sp%d", dname, partno);
@@ -428,11 +337,24 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
pdev->type = &part_type;
pdev->parent = ddev;
- err = blk_alloc_devt(p, &devt);
- if (err)
- goto out_free_info;
+ /* in consecutive minor range? */
+ if (bdev->bd_partno < disk->minors) {
+ devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno);
+ } else {
+ err = blk_alloc_ext_minor();
+ if (err < 0)
+ goto out_put;
+ devt = MKDEV(BLOCK_EXT_MAJOR, err);
+ }
pdev->devt = devt;
+ if (info) {
+ err = -ENOMEM;
+ bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL);
+ if (!bdev->bd_meta_info)
+ goto out_put;
+ }
+
/* delay uevent until 'holders' subdir is created */
dev_set_uevent_suppress(pdev, 1);
err = device_add(pdev);
@@ -440,8 +362,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
goto out_put;
err = -ENOMEM;
- p->holder_dir = kobject_create_and_add("holders", &pdev->kobj);
- if (!p->holder_dir)
+ bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj);
+ if (!bdev->bd_holder_dir)
goto out_del;
dev_set_uevent_suppress(pdev, 0);
@@ -451,200 +373,168 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno,
goto out_del;
}
- err = hd_ref_init(p);
- if (err) {
- if (flags & ADDPART_FLAG_WHOLEDISK)
- goto out_remove_file;
- goto out_del;
- }
-
/* everything is up and running, commence */
- rcu_assign_pointer(ptbl->part[partno], p);
+ err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL);
+ if (err)
+ goto out_del;
+ bdev_add(bdev, devt);
/* suppress uevent if the disk suppresses it */
if (!dev_get_uevent_suppress(ddev))
kobject_uevent(&pdev->kobj, KOBJ_ADD);
- return p;
-
-out_free_info:
- kfree(p->info);
-out_free_stats:
- free_percpu(p->dkstats);
-out_free:
- kfree(p);
- return ERR_PTR(err);
-out_remove_file:
- device_remove_file(pdev, &dev_attr_whole_disk);
+ return bdev;
+
out_del:
- kobject_put(p->holder_dir);
+ kobject_put(bdev->bd_holder_dir);
device_del(pdev);
out_put:
put_device(pdev);
return ERR_PTR(err);
+out_put_disk:
+ put_disk(disk);
+ return ERR_PTR(err);
}
static bool partition_overlaps(struct gendisk *disk, sector_t start,
sector_t length, int skip_partno)
{
- struct disk_part_iter piter;
- struct hd_struct *part;
+ struct block_device *part;
bool overlap = false;
-
- disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter))) {
- if (part->partno == skip_partno ||
- start >= part->start_sect + part->nr_sects ||
- start + length <= part->start_sect)
- continue;
- overlap = true;
- break;
+ unsigned long idx;
+
+ rcu_read_lock();
+ xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+ if (part->bd_partno != skip_partno &&
+ start < part->bd_start_sect + bdev_nr_sectors(part) &&
+ start + length > part->bd_start_sect) {
+ overlap = true;
+ break;
+ }
}
+ rcu_read_unlock();
- disk_part_iter_exit(&piter);
return overlap;
}
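
The rewritten partition_overlaps() treats each partition as the half-open sector range [start, start + len) and reports a conflict when two such ranges intersect, skipping the partition identified by skip_partno (used when resizing). A stand-alone version of the interval test, not kernel code:

	#include <stdbool.h>
	#include <stdio.h>

	typedef unsigned long long sector;

	/* [a, a+alen) and [b, b+blen) intersect iff a < b+blen && b < a+alen */
	static bool ranges_overlap(sector a, sector alen, sector b, sector blen)
	{
		return a < b + blen && b < a + alen;
	}

	int main(void)
	{
		printf("%d\n", ranges_overlap(0, 100, 100, 50));	/* 0: touching only */
		printf("%d\n", ranges_overlap(0, 100, 99, 50));		/* 1: one sector shared */
		return 0;
	}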
-int bdev_add_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length)
+int bdev_add_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length)
{
- struct hd_struct *part;
+ sector_t capacity = get_capacity(disk), end;
+ struct block_device *part;
+ int ret;
- mutex_lock(&bdev->bd_mutex);
- if (partition_overlaps(bdev->bd_disk, start, length, -1)) {
- mutex_unlock(&bdev->bd_mutex);
- return -EBUSY;
+ mutex_lock(&disk->open_mutex);
+ if (check_add_overflow(start, length, &end)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (start >= capacity || end > capacity) {
+ ret = -EINVAL;
+ goto out;
}
- part = add_partition(bdev->bd_disk, partno, start, length,
+ if (!disk_live(disk)) {
+ ret = -ENXIO;
+ goto out;
+ }
+
+ if (disk->flags & GENHD_FL_NO_PART) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ if (partition_overlaps(disk, start, length, -1)) {
+ ret = -EBUSY;
+ goto out;
+ }
+
+ part = add_partition(disk, partno, start, length,
ADDPART_FLAG_NONE, NULL);
- mutex_unlock(&bdev->bd_mutex);
- return PTR_ERR_OR_ZERO(part);
+ ret = PTR_ERR_OR_ZERO(part);
+out:
+ mutex_unlock(&disk->open_mutex);
+ return ret;
}
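
The validation order in the new bdev_add_partition() is worth noting: first reject a start/length pair whose sum wraps, then reject ranges that begin at or run past the disk capacity, and only then check liveness, the GENHD_FL_NO_PART flag and overlaps. A stand-alone sketch of the range check, using the GCC/Clang overflow builtin in place of check_add_overflow():

	#include <stdbool.h>
	#include <stdio.h>

	static bool new_part_ok(unsigned long long start, unsigned long long length,
				unsigned long long capacity)
	{
		unsigned long long end;

		if (__builtin_add_overflow(start, length, &end))
			return false;		/* start + length wraps */
		return start < capacity && end <= capacity;
	}

	int main(void)
	{
		printf("%d\n", new_part_ok(2048, 1024, 4096));	/* 1: fits */
		printf("%d\n", new_part_ok(4096, 1, 4096));	/* 0: starts past the end */
		return 0;
	}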
-int bdev_del_partition(struct block_device *bdev, int partno)
+int bdev_del_partition(struct gendisk *disk, int partno)
{
- struct block_device *bdevp;
- struct hd_struct *part;
- int ret = 0;
+ struct block_device *part = NULL;
+ int ret = -ENXIO;
- part = disk_get_part(bdev->bd_disk, partno);
+ mutex_lock(&disk->open_mutex);
+ part = xa_load(&disk->part_tbl, partno);
if (!part)
- return -ENXIO;
-
- ret = -ENOMEM;
- bdevp = bdget(part_devt(part));
- if (!bdevp)
- goto out_put_part;
-
- mutex_lock(&bdevp->bd_mutex);
+ goto out_unlock;
ret = -EBUSY;
- if (bdevp->bd_openers)
+ if (atomic_read(&part->bd_openers))
goto out_unlock;
- sync_blockdev(bdevp);
- invalidate_bdev(bdevp);
-
- mutex_lock_nested(&bdev->bd_mutex, 1);
- delete_partition(bdev->bd_disk, part);
- mutex_unlock(&bdev->bd_mutex);
+ /*
+ * We verified that @part->bd_openers is zero above and so
+ * @part->bd_holder{_ops} can't be set. And since we hold
+ * @disk->open_mutex the device can't be claimed by anyone.
+ *
+ * So no need to call @part->bd_holder_ops->mark_dead() here.
+ * Just delete the partition and invalidate it.
+ */
+ remove_inode_hash(part->bd_inode);
+ invalidate_bdev(part);
+ drop_partition(part);
ret = 0;
out_unlock:
- mutex_unlock(&bdevp->bd_mutex);
- bdput(bdevp);
-out_put_part:
- disk_put_part(part);
+ mutex_unlock(&disk->open_mutex);
return ret;
}
-int bdev_resize_partition(struct block_device *bdev, int partno,
- sector_t start, sector_t length)
+int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start,
+ sector_t length)
{
- struct block_device *bdevp;
- struct hd_struct *part;
- int ret = 0;
+ struct block_device *part = NULL;
+ int ret = -ENXIO;
- part = disk_get_part(bdev->bd_disk, partno);
+ mutex_lock(&disk->open_mutex);
+ part = xa_load(&disk->part_tbl, partno);
if (!part)
- return -ENXIO;
-
- ret = -ENOMEM;
- bdevp = bdget(part_devt(part));
- if (!bdevp)
- goto out_put_part;
-
- mutex_lock(&bdevp->bd_mutex);
- mutex_lock_nested(&bdev->bd_mutex, 1);
+ goto out_unlock;
ret = -EINVAL;
- if (start != part->start_sect)
+ if (start != part->bd_start_sect)
goto out_unlock;
ret = -EBUSY;
- if (partition_overlaps(bdev->bd_disk, start, length, partno))
+ if (partition_overlaps(disk, start, length, partno))
goto out_unlock;
- part_nr_sects_write(part, (sector_t)length);
- i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT);
+ bdev_set_nr_sectors(part, length);
ret = 0;
out_unlock:
- mutex_unlock(&bdevp->bd_mutex);
- mutex_unlock(&bdev->bd_mutex);
- bdput(bdevp);
-out_put_part:
- disk_put_part(part);
+ mutex_unlock(&disk->open_mutex);
return ret;
}
static bool disk_unlock_native_capacity(struct gendisk *disk)
{
- const struct block_device_operations *bdops = disk->fops;
-
- if (bdops->unlock_native_capacity &&
- !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) {
- printk(KERN_CONT "enabling native capacity\n");
- bdops->unlock_native_capacity(disk);
- disk->flags |= GENHD_FL_NATIVE_CAPACITY;
- return true;
- } else {
+ if (!disk->fops->unlock_native_capacity ||
+ test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) {
printk(KERN_CONT "truncated\n");
return false;
}
-}
-
-int blk_drop_partitions(struct block_device *bdev)
-{
- struct disk_part_iter piter;
- struct hd_struct *part;
-
- if (!disk_part_scan_enabled(bdev->bd_disk))
- return 0;
- if (bdev->bd_part_count)
- return -EBUSY;
- sync_blockdev(bdev);
- invalidate_bdev(bdev);
-
- disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY);
- while ((part = disk_part_iter_next(&piter)))
- delete_partition(bdev->bd_disk, part);
- disk_part_iter_exit(&piter);
-
- return 0;
+ printk(KERN_CONT "enabling native capacity\n");
+ disk->fops->unlock_native_capacity(disk);
+ return true;
}
-#ifdef CONFIG_S390
-/* for historic reasons in the DASD driver */
-EXPORT_SYMBOL_GPL(blk_drop_partitions);
-#endif
-static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
+static bool blk_add_partition(struct gendisk *disk,
struct parsed_partitions *state, int p)
{
sector_t size = state->parts[p].size;
sector_t from = state->parts[p].from;
- struct hd_struct *part;
+ struct block_device *part;
if (!size)
return true;
@@ -677,27 +567,30 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev,
part = add_partition(disk, p, from, size, state->parts[p].flags,
&state->parts[p].info);
if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) {
- printk(KERN_ERR " %s: p%d could not be added: %ld\n",
- disk->disk_name, p, -PTR_ERR(part));
+ printk(KERN_ERR " %s: p%d could not be added: %pe\n",
+ disk->disk_name, p, part);
return true;
}
if (IS_BUILTIN(CONFIG_BLK_DEV_MD) &&
(state->parts[p].flags & ADDPART_FLAG_RAID))
- md_autodetect_dev(part_to_dev(part)->devt);
+ md_autodetect_dev(part->bd_dev);
return true;
}
-int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
+static int blk_add_partitions(struct gendisk *disk)
{
struct parsed_partitions *state;
- int ret = -EAGAIN, p, highest;
+ int ret = -EAGAIN, p;
+
+ if (disk->flags & GENHD_FL_NO_PART)
+ return 0;
- if (!disk_part_scan_enabled(disk))
+ if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state))
return 0;
- state = check_partition(disk, bdev);
+ state = check_partition(disk);
if (!state)
return 0;
if (IS_ERR(state)) {
@@ -717,7 +610,7 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
/*
* Partitions are not supported on host managed zoned block devices.
*/
- if (disk->queue->limits.zoned == BLK_ZONED_HM) {
+ if (bdev_is_zoned(disk->part0)) {
pr_warn("%s: ignoring partition table on host managed zoned block device\n",
disk->disk_name);
ret = 0;
@@ -740,17 +633,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev)
/* tell userspace that the media / partition table may have changed */
kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
- /*
- * Detect the highest partition number and preallocate disk->part_tbl.
- * This is an optimization and not strictly necessary.
- */
- for (p = 1, highest = 0; p < state->limit; p++)
- if (state->parts[p].size)
- highest = p;
- disk_expand_part_tbl(disk, highest);
-
for (p = 1; p < state->limit; p++)
- if (!blk_add_partition(disk, bdev, state, p))
+ if (!blk_add_partition(disk, state, p))
goto out_free_state;
ret = 0;
@@ -759,28 +643,92 @@ out_free_state:
return ret;
}
+int bdev_disk_changed(struct gendisk *disk, bool invalidate)
+{
+ struct block_device *part;
+ unsigned long idx;
+ int ret = 0;
+
+ lockdep_assert_held(&disk->open_mutex);
+
+ if (!disk_live(disk))
+ return -ENXIO;
+
+rescan:
+ if (disk->open_partitions)
+ return -EBUSY;
+ sync_blockdev(disk->part0);
+ invalidate_bdev(disk->part0);
+
+ xa_for_each_start(&disk->part_tbl, idx, part, 1) {
+ /*
+ * Remove the block device from the inode hash, so that
+ * it cannot be looked up any more even when openers
+ * still hold references.
+ */
+ remove_inode_hash(part->bd_inode);
+
+ /*
+ * If @disk->open_partitions isn't elevated but there's
+ * still an active holder of that block device, things
+ * are broken.
+ */
+ WARN_ON_ONCE(atomic_read(&part->bd_openers));
+ invalidate_bdev(part);
+ drop_partition(part);
+ }
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
+
+ /*
+ * Historically we only set the capacity to zero for devices that
+ * support partitions (independent of actually having partitions created).
+ * Doing that is rather inconsistent, but changing it broke legacy
+ * udisks polling for legacy ide-cdrom devices. Use the crude check
+ * below to get the sane behavior for most devices while not breaking
+ * userspace for this particular setup.
+ */
+ if (invalidate) {
+ if (!(disk->flags & GENHD_FL_NO_PART) ||
+ !(disk->flags & GENHD_FL_REMOVABLE))
+ set_capacity(disk, 0);
+ }
+
+ if (get_capacity(disk)) {
+ ret = blk_add_partitions(disk);
+ if (ret == -EAGAIN)
+ goto rescan;
+ } else if (invalidate) {
+ /*
+ * Tell userspace that the media / partition table may have
+ * changed.
+ */
+ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE);
+ }
+
+ return ret;
+}
+/*
+ * Only exported for loop and dasd for historic reasons. Don't use in new
+ * code!
+ */
+EXPORT_SYMBOL_GPL(bdev_disk_changed);
+
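
With the partition-scan entry point now private, outside callers go through bdev_disk_changed() or, from user space, the BLKRRPART ioctl. A minimal sketch of the latter:

/* Hedged sketch: request a partition-table re-read; the kernel ends
 * up in bdev_disk_changed(). Expect EBUSY while any partition of the
 * disk is still open (the disk->open_partitions check above).
 */
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>		/* BLKRRPART */

int main(int argc, char **argv)
{
	int fd = open(argc > 1 ? argv[1] : "/dev/sdX", O_RDONLY);

	if (fd < 0 || ioctl(fd, BLKRRPART) < 0) {
		perror("BLKRRPART");
		return 1;
	}
	close(fd);
	return 0;
}
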
void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p)
{
- struct address_space *mapping = state->bdev->bd_inode->i_mapping;
- struct page *page;
+ struct address_space *mapping = state->disk->part0->bd_inode->i_mapping;
+ struct folio *folio;
- if (n >= get_capacity(state->bdev->bd_disk)) {
+ if (n >= get_capacity(state->disk)) {
state->access_beyond_eod = true;
- return NULL;
+ goto out;
}
- page = read_mapping_page(mapping,
- (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL);
- if (IS_ERR(page))
+ folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL);
+ if (IS_ERR(folio))
goto out;
- if (PageError(page))
- goto out_put_page;
-
- p->v = page;
- return (unsigned char *)page_address(page) +
- ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT);
-out_put_page:
- put_page(page);
+
+ p->v = folio;
+ return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE);
out:
p->v = NULL;
return NULL;
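
The folio math above replaces the open-coded page arithmetic; as a worked example, assuming 4 KiB pages and an order-0 folio:

/*
 * For sector n = 11 with PAGE_SECTORS_SHIFT == 3 (eight 512-byte
 * sectors per 4 KiB page):
 *
 *   index into the mapping:  n >> PAGE_SECTORS_SHIFT = 11 >> 3 = 1
 *   byte offset in folio:    offset_in_folio(folio, 11 * 512)
 *                            = 5632 & (4096 - 1) = 1536
 *
 * i.e. the sector starts 1536 bytes into the second page of the
 * block device's page cache.
 */
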
diff --git a/block/partitions/efi.c b/block/partitions/efi.c
index b64bfdd4326c..5e9be13a56a8 100644
--- a/block/partitions/efi.c
+++ b/block/partitions/efi.c
@@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len)
/**
* last_lba(): return number of last logical block of device
- * @bdev: block device
+ * @disk: gendisk of the device
*
* Description: Returns last LBA value on success, 0 on error.
* This is stored (by sd and ide-geometry) in
* the part[0] entry for this disk, and is the number of
* physical sectors available on the disk.
*/
-static u64 last_lba(struct block_device *bdev)
+static u64 last_lba(struct gendisk *disk)
{
- if (!bdev || !bdev->bd_inode)
- return 0;
- return div_u64(bdev->bd_inode->i_size,
- bdev_logical_block_size(bdev)) - 1ULL;
+ return div_u64(bdev_nr_bytes(disk->part0),
+ queue_logical_block_size(disk->queue)) - 1ULL;
}
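
For example, with the helper above a 1 TiB disk exposing 4096-byte logical blocks reports:

/*
 *   last_lba = bdev_nr_bytes / logical_block_size - 1
 *            = (1 << 40) / 4096 - 1
 *            = 268435456 - 1 = 268435455
 *
 * which is the LBA where a standards-conforming backup GPT header
 * is expected to live.
 */
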
static inline int pmbr_part_valid(gpt_mbr_record *part)
@@ -231,17 +229,17 @@ done:
* @buffer: destination buffer
* @count: bytes to read
*
- * Description: Reads @count bytes from @state->bdev into @buffer.
+ * Description: Reads @count bytes from @state->disk into @buffer.
* Returns number of bytes read on success, 0 on error.
*/
static size_t read_lba(struct parsed_partitions *state,
u64 lba, u8 *buffer, size_t count)
{
size_t totalreadcount = 0;
- struct block_device *bdev = state->bdev;
- sector_t n = lba * (bdev_logical_block_size(bdev) / 512);
+ sector_t n = lba *
+ (queue_logical_block_size(state->disk->queue) / 512);
- if (!buffer || lba > last_lba(bdev))
+ if (!buffer || lba > last_lba(state->disk))
return 0;
while (count) {
@@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state,
* @lba: the Logical Block Address of the partition table
*
* Description: returns GPT header on success, NULL on error. Allocates
- * and fills a GPT header starting at @ from @state->bdev.
+ * and fills a GPT header starting at @ from @state->disk.
* Note: remember to free gpt when finished with it.
*/
static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state,
u64 lba)
{
gpt_header *gpt;
- unsigned ssz = bdev_logical_block_size(state->bdev);
+ unsigned ssz = queue_logical_block_size(state->disk->queue);
gpt = kmalloc(ssz, GFP_KERNEL);
if (!gpt)
@@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
/* Check the GUID Partition Table header size is too big */
if (le32_to_cpu((*gpt)->header_size) >
- bdev_logical_block_size(state->bdev)) {
+ queue_logical_block_size(state->disk->queue)) {
pr_debug("GUID Partition Table Header size is too large: %u > %u\n",
le32_to_cpu((*gpt)->header_size),
- bdev_logical_block_size(state->bdev));
+ queue_logical_block_size(state->disk->queue));
goto fail;
}
@@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba,
/* Check the first_usable_lba and last_usable_lba are
* within the disk.
*/
- lastlba = last_lba(state->bdev);
+ lastlba = last_lba(state->disk);
if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) {
pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n",
(unsigned long long)le64_to_cpu((*gpt)->first_usable_lba),
@@ -587,13 +585,15 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
gpt_header *pgpt = NULL, *agpt = NULL;
gpt_entry *pptes = NULL, *aptes = NULL;
legacy_mbr *legacymbr;
- sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9;
+ struct gendisk *disk = state->disk;
+ const struct block_device_operations *fops = disk->fops;
+ sector_t total_sectors = get_capacity(state->disk);
u64 lastlba;
if (!ptes)
return 0;
- lastlba = last_lba(state->bdev);
+ lastlba = last_lba(state->disk);
if (!force_gpt) {
/* This will be added to the EFI Spec. per Intel after v1.02. */
legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL);
@@ -621,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt,
if (!good_agpt && force_gpt)
good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes);
+ if (!good_agpt && force_gpt && fops->alternative_gpt_sector) {
+ sector_t agpt_sector;
+ int err;
+
+ err = fops->alternative_gpt_sector(disk, &agpt_sector);
+ if (!err)
+ good_agpt = is_gpt_valid(state, agpt_sector,
+ &agpt, &aptes);
+ }
+
/* The obviously unsuccessful case */
if (!good_pgpt && !good_agpt)
goto fail;
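
The new ->alternative_gpt_sector() hook lets a driver point the scanner at a backup GPT that is not at the last LBA. A hedged sketch of a hypothetical driver "foo" wiring it up; only the operation's shape is taken from the call above, and the reserved-area size is invented:

static int foo_alternative_gpt_sector(struct gendisk *disk,
				      sector_t *sector)
{
	sector_t capacity = get_capacity(disk);

	if (!capacity)
		return -ENODEV;
	/* e.g. firmware keeps 32 sectors of metadata at the very end */
	*sector = capacity - 33;
	return 0;
}

static const struct block_device_operations foo_fops = {
	.owner			= THIS_MODULE,
	.alternative_gpt_sector	= foo_alternative_gpt_sector,
	/* .open / .release / .getgeo etc. as usual */
};
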
@@ -682,7 +692,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out)
}
/**
- * efi_partition(struct parsed_partitions *state)
+ * efi_partition - scan for GPT partitions
* @state: disk parsed partitions
*
* Description: called from check.c, if the disk contains GPT
@@ -705,7 +715,7 @@ int efi_partition(struct parsed_partitions *state)
gpt_header *gpt = NULL;
gpt_entry *ptes = NULL;
u32 i;
- unsigned ssz = bdev_logical_block_size(state->bdev) / 512;
+ unsigned ssz = queue_logical_block_size(state->disk->queue) / 512;
if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) {
kfree(gpt);
@@ -722,7 +732,7 @@ int efi_partition(struct parsed_partitions *state)
u64 size = le64_to_cpu(ptes[i].ending_lba) -
le64_to_cpu(ptes[i].starting_lba) + 1ULL;
- if (!is_pte_valid(&ptes[i], last_lba(state->bdev)))
+ if (!is_pte_valid(&ptes[i], last_lba(state->disk)))
continue;
put_partition(state, i+1, start * ssz, size * ssz);
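
Since put_partition() works in 512-byte units, ssz rescales the GPT LBAs; for a 4Kn drive, for example:

/*
 *   ssz = logical_block_size / 512 = 4096 / 512 = 8
 *
 * so a GPT entry with starting_lba = 256 and 1048576 LBAs becomes
 *
 *   start = 256 * 8     = 2048      (512-byte sectors)
 *   size  = 1048576 * 8 = 8388608   (512-byte sectors, i.e. 4 GiB)
 */
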
diff --git a/block/partitions/efi.h b/block/partitions/efi.h
index 8cc2b88d0aa8..84b9f36b9e47 100644
--- a/block/partitions/efi.h
+++ b/block/partitions/efi.h
@@ -13,7 +13,6 @@
#include <linux/types.h>
#include <linux/fs.h>
-#include <linux/genhd.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/string.h>
diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c
index d6e18df9c53c..82d9c4c3fb41 100644
--- a/block/partitions/ibm.c
+++ b/block/partitions/ibm.c
@@ -61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo)
ptr->b;
}
+/* Volume Label Type/ID Length */
+#define DASD_VOL_TYPE_LEN 4
+#define DASD_VOL_ID_LEN 6
+
+/* Volume Label Types */
+#define DASD_VOLLBL_TYPE_VOL1 0
+#define DASD_VOLLBL_TYPE_LNX1 1
+#define DASD_VOLLBL_TYPE_CMS1 2
+
+struct dasd_vollabel {
+ char *type;
+ int idx;
+};
+
+static struct dasd_vollabel dasd_vollabels[] = {
+ [DASD_VOLLBL_TYPE_VOL1] = {
+ .type = "VOL1",
+ .idx = DASD_VOLLBL_TYPE_VOL1,
+ },
+ [DASD_VOLLBL_TYPE_LNX1] = {
+ .type = "LNX1",
+ .idx = DASD_VOLLBL_TYPE_LNX1,
+ },
+ [DASD_VOLLBL_TYPE_CMS1] = {
+ .type = "CMS1",
+ .idx = DASD_VOLLBL_TYPE_CMS1,
+ },
+};
+
+static int get_label_by_type(const char *type)
+{
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) {
+ if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN))
+ return dasd_vollabels[i].idx;
+ }
+
+ return -1;
+}
+
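
The table keeps the label magics in one place; a short usage note with illustrative values — the lookup is always done on the EBCASC()-converted buffer:

/*
 *   get_label_by_type("VOL1")  ->  DASD_VOLLBL_TYPE_VOL1
 *   get_label_by_type("CMS1")  ->  DASD_VOLLBL_TYPE_CMS1
 *   get_label_by_type("ABCD")  ->  -1  (no such label type)
 */
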
static int find_label(struct parsed_partitions *state,
dasd_information2_t *info,
struct hd_geometry *geo,
@@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state,
char type[],
union label_t *label)
{
- Sector sect;
- unsigned char *data;
sector_t testsect[3];
- unsigned char temp[5];
- int found = 0;
int i, testcount;
+ Sector sect;
+ void *data;
/* There are three places where we may find a valid label:
* - on an ECKD disk it's block 2
@@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state,
if (data == NULL)
continue;
memcpy(label, data, sizeof(*label));
- memcpy(temp, data, 4);
- temp[4] = 0;
- EBCASC(temp, 4);
+ memcpy(type, data, DASD_VOL_TYPE_LEN);
+ EBCASC(type, DASD_VOL_TYPE_LEN);
put_dev_sector(sect);
- if (!strcmp(temp, "VOL1") ||
- !strcmp(temp, "LNX1") ||
- !strcmp(temp, "CMS1")) {
- if (!strcmp(temp, "VOL1")) {
- strncpy(type, label->vol.vollbl, 4);
- strncpy(name, label->vol.volid, 6);
- } else {
- strncpy(type, label->lnx.vollbl, 4);
- strncpy(name, label->lnx.volid, 6);
- }
- EBCASC(type, 4);
- EBCASC(name, 6);
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
+ memcpy(name, label->vol.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
+ *labelsect = testsect[i];
+ return 1;
+ case DASD_VOLLBL_TYPE_LNX1:
+ case DASD_VOLLBL_TYPE_CMS1:
+ memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN);
+ EBCASC(name, DASD_VOL_ID_LEN);
*labelsect = testsect[i];
- found = 1;
+ return 1;
+ default:
break;
}
}
- if (!found)
- memset(label, 0, sizeof(*label));
- return found;
+ return 0;
}
static int find_vol1_partitions(struct parsed_partitions *state,
@@ -198,7 +233,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
char name[],
union label_t *label,
sector_t labelsect,
- loff_t i_size,
+ sector_t nr_sectors,
dasd_information2_t *info)
{
loff_t offset, geo_size, size;
@@ -213,14 +248,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
} else {
/*
* Formatted w/o large volume support. If the sanity check
- * 'size based on geo == size based on i_size' is true, then
+ * 'size based on geo == size based on nr_sectors' is true, then
* we can safely assume that we know the formatted size of
* the disk, otherwise we need additional information
* that we can only get from a real DASD device.
*/
geo_size = geo->cylinders * geo->heads
* geo->sectors * secperblk;
- size = i_size >> 9;
+ size = nr_sectors;
if (size != geo_size) {
if (!info) {
strlcat(state->pp_buf, "\n", PAGE_SIZE);
@@ -229,7 +264,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state,
if (!strcmp(info->type, "ECKD"))
if (geo_size < size)
size = geo_size;
- /* else keep size based on i_size */
+ /* else keep size based on nr_sectors */
}
}
/* first and only partition starts in the first block after the label */
@@ -290,14 +325,15 @@ static int find_cms1_partitions(struct parsed_partitions *state,
int ibm_partition(struct parsed_partitions *state)
{
int (*fn)(struct gendisk *disk, dasd_information2_t *info);
- struct block_device *bdev = state->bdev;
- struct gendisk *disk = bdev->bd_disk;
+ struct gendisk *disk = state->disk;
+ struct block_device *bdev = disk->part0;
int blocksize, res;
- loff_t i_size, offset, size;
+ loff_t offset, size;
+ sector_t nr_sectors;
dasd_information2_t *info;
struct hd_geometry *geo;
- char type[5] = {0,};
- char name[7] = {0,};
+ char type[DASD_VOL_TYPE_LEN + 1] = "";
+ char name[DASD_VOL_ID_LEN + 1] = "";
sector_t labelsect;
union label_t *label;
@@ -305,13 +341,11 @@ int ibm_partition(struct parsed_partitions *state)
if (!disk->fops->getgeo)
goto out_exit;
fn = symbol_get(dasd_biodasdinfo);
- if (!fn)
- goto out_exit;
blocksize = bdev_logical_block_size(bdev);
if (blocksize <= 0)
goto out_symbol;
- i_size = i_size_read(bdev->bd_inode);
- if (i_size == 0)
+ nr_sectors = bdev_nr_sectors(bdev);
+ if (nr_sectors == 0)
goto out_symbol;
info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL);
if (info == NULL)
@@ -326,23 +360,26 @@ int ibm_partition(struct parsed_partitions *state)
geo->start = get_start_sect(bdev);
if (disk->fops->getgeo(bdev, geo))
goto out_freeall;
- if (fn(disk, info)) {
+ if (!fn || fn(disk, info)) {
kfree(info);
info = NULL;
}
- if (find_label(state, info, geo, blocksize, &labelsect, name, type,
- label)) {
- if (!strncmp(type, "VOL1", 4)) {
+ if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) {
+ switch (get_label_by_type(type)) {
+ case DASD_VOLLBL_TYPE_VOL1:
res = find_vol1_partitions(state, geo, blocksize, name,
label);
- } else if (!strncmp(type, "LNX1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_LNX1:
res = find_lnx1_partitions(state, geo, blocksize, name,
- label, labelsect, i_size,
+ label, labelsect, nr_sectors,
info);
- } else if (!strncmp(type, "CMS1", 4)) {
+ break;
+ case DASD_VOLLBL_TYPE_CMS1:
res = find_cms1_partitions(state, geo, blocksize, name,
label, labelsect);
+ break;
}
} else if (info) {
/*
@@ -355,7 +392,7 @@ int ibm_partition(struct parsed_partitions *state)
res = 1;
if (info->format == DASD_FORMAT_LDL) {
strlcat(state->pp_buf, "(nonl)", PAGE_SIZE);
- size = i_size >> 9;
+ size = nr_sectors;
offset = (info->label_block + 1) * (blocksize >> 9);
put_partition(state, 1, offset, size-offset);
strlcat(state->pp_buf, "\n", PAGE_SIZE);
@@ -370,7 +407,8 @@ out_nolab:
out_nogeo:
kfree(info);
out_symbol:
- symbol_put(dasd_biodasdinfo);
+ if (fn)
+ symbol_put(dasd_biodasdinfo);
out_exit:
return res;
}
diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c
index d333786b5c7e..38e58960ae03 100644
--- a/block/partitions/ldm.c
+++ b/block/partitions/ldm.c
@@ -1,5 +1,5 @@
// SPDX-License-Identifier: GPL-2.0-or-later
-/**
+/*
* ldm - Support for Windows Logical Disk Manager (Dynamic Disks)
*
* Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org>
@@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state,
}
}
- num_sects = state->bdev->bd_inode->i_size >> 9;
+ num_sects = get_capacity(state->disk);
if ((ph[0]->config_start > num_sects) ||
((ph[0]->config_start + ph[0]->config_size) > num_sects)) {
@@ -339,11 +339,11 @@ out:
/**
* ldm_validate_tocblocks - Validate the table of contents and its backups
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* Find and compare the four tables of contents of the LDM Database stored on
- * @state->bdev and return the parsed information into @toc1.
+ * @state->disk and return the parsed information into @toc1.
*
* The offsets and sizes of the configs are range-checked against a privhead.
*
@@ -486,8 +486,8 @@ out:
* only likely to happen if the underlying device is strange. If that IS
* the case we should return zero to let someone else try.
*
- * Return: 'true' @state->bdev is a dynamic disk
- * 'false' @state->bdev is not a dynamic disk, or an error occurred
+ * Return: 'true' @state->disk is a dynamic disk
+ * 'false' @state->disk is not a dynamic disk, or an error occurred
*/
static bool ldm_validate_partition_table(struct parsed_partitions *state)
{
@@ -510,7 +510,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state)
p = (struct msdos_partition *)(data + 0x01BE);
for (i = 0; i < 4; i++, p++)
- if (SYS_IND (p) == LDM_PARTITION) {
+ if (p->sys_ind == LDM_PARTITION) {
result = true;
break;
}
@@ -736,7 +736,6 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb)
len = r_cols;
} else {
r_stripe = 0;
- r_cols = 0;
len = r_parent;
}
if (len < 0)
@@ -783,11 +782,8 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb)
r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid);
r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1);
len = r_id2;
- } else {
- r_id1 = 0;
- r_id2 = 0;
+ } else
len = r_diskid;
- }
if (len < 0)
return false;
@@ -826,11 +822,8 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb)
r_id1 = ldm_relative (buffer, buflen, 0x44, r_name);
r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1);
len = r_id2;
- } else {
- r_id1 = 0;
- r_id2 = 0;
+ } else
len = r_name;
- }
if (len < 0)
return false;
@@ -963,10 +956,8 @@ static bool ldm_parse_prt3(const u8 *buffer, int buflen, struct vblk *vb)
return false;
}
len = r_index;
- } else {
- r_index = 0;
+ } else
len = r_diskid;
- }
if (len < 0) {
ldm_error("len %d < 0", len);
return false;
@@ -1340,7 +1331,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb)
/**
* ldm_get_vblks - Read the on-disk database of VBLKs into memory
* @state: Partition check state including device holding the LDM Database
- * @base: Offset, into @state->bdev, of the database
+ * @base: Offset, into @state->disk, of the database
* @ldb: Cache of the database structures
*
* To use the information from the VBLKs, they need to be read from the disk,
@@ -1432,10 +1423,10 @@ static void ldm_free_vblks (struct list_head *lh)
* example, if the device is hda, we would have: hda1: LDM database, hda2, hda3,
* and so on: the actual data containing partitions.
*
- * Return: 1 Success, @state->bdev is a dynamic disk and we handled it
- * 0 Success, @state->bdev is not a dynamic disk
+ * Return: 1 Success, @state->disk is a dynamic disk and we handled it
+ * 0 Success, @state->disk is not a dynamic disk
* -1 An error occurred before enough information had been read
- * Or @state->bdev is a dynamic disk, but it may be corrupted
+ * Or @state->disk is a dynamic disk, but it may be corrupted
*/
int ldm_partition(struct parsed_partitions *state)
{
diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h
index d8d6beaa72c4..0a747a0c782d 100644
--- a/block/partitions/ldm.h
+++ b/block/partitions/ldm.h
@@ -14,7 +14,6 @@
#include <linux/types.h>
#include <linux/list.h>
-#include <linux/genhd.h>
#include <linux/fs.h>
#include <asm/unaligned.h>
#include <asm/byteorder.h>
@@ -84,9 +83,6 @@ struct parsed_partitions;
#define TOC_BITMAP1 "config" /* Names of the two defined */
#define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. */
-/* Borrowed from msdos.c */
-#define SYS_IND(p) (get_unaligned(&(p)->sys_ind))
-
struct frag { /* VBLK Fragment handling */
struct list_head list;
u32 group;
diff --git a/block/partitions/mac.c b/block/partitions/mac.c
index b6095335636c..7b521df00a39 100644
--- a/block/partitions/mac.c
+++ b/block/partitions/mac.c
@@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state)
}
#ifdef CONFIG_PPC_PMAC
if (found_root_goodness)
- note_bootable_part(state->bdev->bd_dev, found_root,
+ note_bootable_part(state->disk->part0->bd_dev, found_root,
found_root_goodness);
#endif
diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c
index 8f2fcc080264..b5d5c229cc3b 100644
--- a/block/partitions/msdos.c
+++ b/block/partitions/msdos.c
@@ -38,8 +38,6 @@
*/
#include <asm/unaligned.h>
-#define SYS_IND(p) get_unaligned(&p->sys_ind)
-
static inline sector_t nr_sects(struct msdos_partition *p)
{
return (sector_t)get_unaligned_le32(&p->nr_sects);
@@ -52,9 +50,9 @@ static inline sector_t start_sect(struct msdos_partition *p)
static inline int is_extended_partition(struct msdos_partition *p)
{
- return (SYS_IND(p) == DOS_EXTENDED_PARTITION ||
- SYS_IND(p) == WIN98_EXTENDED_PARTITION ||
- SYS_IND(p) == LINUX_EXTENDED_PARTITION);
+ return (p->sys_ind == DOS_EXTENDED_PARTITION ||
+ p->sys_ind == WIN98_EXTENDED_PARTITION ||
+ p->sys_ind == LINUX_EXTENDED_PARTITION);
}
#define MSDOS_LABEL_MAGIC1 0x55
@@ -137,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state,
Sector sect;
unsigned char *data;
sector_t this_sector, this_size;
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ sector_t sector_size;
int loopct = 0; /* number of links followed
without finding a data partition */
int i;
+ sector_size = queue_logical_block_size(state->disk->queue) / 512;
this_sector = first_sector;
this_size = first_size;
@@ -193,7 +192,7 @@ static void parse_extended(struct parsed_partitions *state,
put_partition(state, state->next, next, size);
set_info(state, state->next, disksig);
- if (SYS_IND(p) == LINUX_RAID_PARTITION)
+ if (p->sys_ind == LINUX_RAID_PARTITION)
state->parts[state->next].flags = ADDPART_FLAG_RAID;
loopct = 0;
if (++state->next == state->limit)
@@ -546,7 +545,7 @@ static void parse_minix(struct parsed_partitions *state,
* a secondary MBR describing its subpartitions, or
* the normal boot sector. */
if (msdos_magic_present(data + 510) &&
- SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */
+ p->sys_ind == MINIX_PARTITION) { /* subpartition table present */
char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1];
snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin);
@@ -555,7 +554,7 @@ static void parse_minix(struct parsed_partitions *state,
if (state->next == state->limit)
break;
/* add each partition in use */
- if (SYS_IND(p) == MINIX_PARTITION)
+ if (p->sys_ind == MINIX_PARTITION)
put_partition(state, state->next++,
start_sect(p), nr_sects(p));
}
@@ -581,7 +580,7 @@ static struct {
int msdos_partition(struct parsed_partitions *state)
{
- sector_t sector_size = bdev_logical_block_size(state->bdev) / 512;
+ sector_t sector_size;
Sector sect;
unsigned char *data;
struct msdos_partition *p;
@@ -589,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state)
int slot;
u32 disksig;
+ sector_size = queue_logical_block_size(state->disk->queue) / 512;
data = read_part_sector(state, 0, &sect);
if (!data)
return -1;
@@ -622,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state)
for (slot = 1; slot <= 4; slot++, p++) {
if (p->boot_ind != 0 && p->boot_ind != 0x80) {
/*
- * Even without a valid boot inidicator value
+ * Even without a valid boot indicator value
* it's still possible this is a valid FAT filesystem
* without a partition table.
*/
@@ -643,7 +643,7 @@ int msdos_partition(struct parsed_partitions *state)
p = (struct msdos_partition *) (data + 0x1be);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
/* If this is an EFI GPT disk, msdos should ignore it. */
- if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) {
+ if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) {
put_dev_sector(sect);
return 0;
}
@@ -685,11 +685,11 @@ int msdos_partition(struct parsed_partitions *state)
}
put_partition(state, slot, start, size);
set_info(state, slot, disksig);
- if (SYS_IND(p) == LINUX_RAID_PARTITION)
+ if (p->sys_ind == LINUX_RAID_PARTITION)
state->parts[slot].flags = ADDPART_FLAG_RAID;
- if (SYS_IND(p) == DM6_PARTITION)
+ if (p->sys_ind == DM6_PARTITION)
strlcat(state->pp_buf, "[DM]", PAGE_SIZE);
- if (SYS_IND(p) == EZD_PARTITION)
+ if (p->sys_ind == EZD_PARTITION)
strlcat(state->pp_buf, "[EZD]", PAGE_SIZE);
}
@@ -698,7 +698,7 @@ int msdos_partition(struct parsed_partitions *state)
/* second pass - output for each on a separate line */
p = (struct msdos_partition *) (0x1be + data);
for (slot = 1 ; slot <= 4 ; slot++, p++) {
- unsigned char id = SYS_IND(p);
+ unsigned char id = p->sys_ind;
int n;
if (!nr_sects(p))
diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c
index 4273f1bb0515..9cc6b8c1eea4 100644
--- a/block/partitions/sgi.c
+++ b/block/partitions/sgi.c
@@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state)
Sector sect;
struct sgi_disklabel *label;
struct sgi_partition *p;
- char b[BDEVNAME_SIZE];
label = read_part_sector(state, 0, &sect);
if (!label)
@@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state)
magic = label->magic_mushroom;
if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) {
/*printk("Dev %s SGI disklabel: bad magic %08x\n",
- bdevname(bdev, b), be32_to_cpu(magic));*/
+ state->disk->disk_name, be32_to_cpu(magic));*/
put_dev_sector(sect);
return 0;
}
@@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state)
}
if(csum) {
printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n",
- bdevname(state->bdev, b));
+ state->disk->disk_name);
put_dev_sector(sect);
return 0;
}
diff --git a/block/partitions/sun.c b/block/partitions/sun.c
index 47dc53eccf77..ddf9e6def4b2 100644
--- a/block/partitions/sun.c
+++ b/block/partitions/sun.c
@@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state)
} * label;
struct sun_partition *p;
unsigned long spc;
- char b[BDEVNAME_SIZE];
int use_vtoc;
int nparts;
@@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state)
p = label->partitions;
if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) {
/* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n",
- bdevname(bdev, b), be16_to_cpu(label->magic)); */
+ state->disk->disk_name, be16_to_cpu(label->magic)); */
put_dev_sector(sect);
return 0;
}
@@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state)
csum ^= *ush--;
if (csum) {
printk("Dev %s Sun disklabel: Csum bad, label corrupted\n",
- bdevname(state->bdev, b));
+ state->disk->disk_name);
put_dev_sector(sect);
return 0;
}
diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c
deleted file mode 100644
index ef722f04f88a..000000000000
--- a/block/scsi_ioctl.c
+++ /dev/null
@@ -1,900 +0,0 @@
-// SPDX-License-Identifier: GPL-2.0
-/*
- * Copyright (C) 2001 Jens Axboe <axboe@suse.de>
- */
-#include <linux/compat.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/string.h>
-#include <linux/module.h>
-#include <linux/blkdev.h>
-#include <linux/capability.h>
-#include <linux/completion.h>
-#include <linux/cdrom.h>
-#include <linux/ratelimit.h>
-#include <linux/slab.h>
-#include <linux/times.h>
-#include <linux/uio.h>
-#include <linux/uaccess.h>
-
-#include <scsi/scsi.h>
-#include <scsi/scsi_ioctl.h>
-#include <scsi/scsi_cmnd.h>
-#include <scsi/sg.h>
-
-struct blk_cmd_filter {
- unsigned long read_ok[BLK_SCSI_CMD_PER_LONG];
- unsigned long write_ok[BLK_SCSI_CMD_PER_LONG];
-};
-
-static struct blk_cmd_filter blk_default_cmd_filter;
-
-/* Command group 3 is reserved and should never be used. */
-const unsigned char scsi_command_size_tbl[8] =
-{
- 6, 10, 10, 12,
- 16, 12, 10, 10
-};
-EXPORT_SYMBOL(scsi_command_size_tbl);
-
-#include <scsi/sg.h>
-
-static int sg_get_version(int __user *p)
-{
- static const int sg_version_num = 30527;
- return put_user(sg_version_num, p);
-}
-
-static int scsi_get_idlun(struct request_queue *q, int __user *p)
-{
- return put_user(0, p);
-}
-
-static int scsi_get_bus(struct request_queue *q, int __user *p)
-{
- return put_user(0, p);
-}
-
-static int sg_get_timeout(struct request_queue *q)
-{
- return jiffies_to_clock_t(q->sg_timeout);
-}
-
-static int sg_set_timeout(struct request_queue *q, int __user *p)
-{
- int timeout, err = get_user(timeout, p);
-
- if (!err)
- q->sg_timeout = clock_t_to_jiffies(timeout);
-
- return err;
-}
-
-static int max_sectors_bytes(struct request_queue *q)
-{
- unsigned int max_sectors = queue_max_sectors(q);
-
- max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9);
-
- return max_sectors << 9;
-}
-
-static int sg_get_reserved_size(struct request_queue *q, int __user *p)
-{
- int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q));
-
- return put_user(val, p);
-}
-
-static int sg_set_reserved_size(struct request_queue *q, int __user *p)
-{
- int size, err = get_user(size, p);
-
- if (err)
- return err;
-
- if (size < 0)
- return -EINVAL;
-
- q->sg_reserved_size = min(size, max_sectors_bytes(q));
- return 0;
-}
-
-/*
- * will always return that we are ATAPI even for a real SCSI drive, I'm not
- * so sure this is worth doing anything about (why would you care??)
- */
-static int sg_emulated_host(struct request_queue *q, int __user *p)
-{
- return put_user(1, p);
-}
-
-static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter)
-{
- /* Basic read-only commands */
- __set_bit(TEST_UNIT_READY, filter->read_ok);
- __set_bit(REQUEST_SENSE, filter->read_ok);
- __set_bit(READ_6, filter->read_ok);
- __set_bit(READ_10, filter->read_ok);
- __set_bit(READ_12, filter->read_ok);
- __set_bit(READ_16, filter->read_ok);
- __set_bit(READ_BUFFER, filter->read_ok);
- __set_bit(READ_DEFECT_DATA, filter->read_ok);
- __set_bit(READ_CAPACITY, filter->read_ok);
- __set_bit(READ_LONG, filter->read_ok);
- __set_bit(INQUIRY, filter->read_ok);
- __set_bit(MODE_SENSE, filter->read_ok);
- __set_bit(MODE_SENSE_10, filter->read_ok);
- __set_bit(LOG_SENSE, filter->read_ok);
- __set_bit(START_STOP, filter->read_ok);
- __set_bit(GPCMD_VERIFY_10, filter->read_ok);
- __set_bit(VERIFY_16, filter->read_ok);
- __set_bit(REPORT_LUNS, filter->read_ok);
- __set_bit(SERVICE_ACTION_IN_16, filter->read_ok);
- __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok);
- __set_bit(MAINTENANCE_IN, filter->read_ok);
- __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok);
-
- /* Audio CD commands */
- __set_bit(GPCMD_PLAY_CD, filter->read_ok);
- __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok);
- __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok);
- __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok);
- __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok);
-
- /* CD/DVD data reading */
- __set_bit(GPCMD_READ_CD, filter->read_ok);
- __set_bit(GPCMD_READ_CD_MSF, filter->read_ok);
- __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok);
- __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok);
- __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok);
- __set_bit(GPCMD_READ_HEADER, filter->read_ok);
- __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok);
- __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok);
- __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok);
- __set_bit(GPCMD_REPORT_KEY, filter->read_ok);
- __set_bit(GPCMD_SCAN, filter->read_ok);
- __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok);
- __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok);
- __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok);
- __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok);
- __set_bit(GPCMD_SEEK, filter->read_ok);
- __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok);
-
- /* Basic writing commands */
- __set_bit(WRITE_6, filter->write_ok);
- __set_bit(WRITE_10, filter->write_ok);
- __set_bit(WRITE_VERIFY, filter->write_ok);
- __set_bit(WRITE_12, filter->write_ok);
- __set_bit(WRITE_VERIFY_12, filter->write_ok);
- __set_bit(WRITE_16, filter->write_ok);
- __set_bit(WRITE_LONG, filter->write_ok);
- __set_bit(WRITE_LONG_2, filter->write_ok);
- __set_bit(WRITE_SAME, filter->write_ok);
- __set_bit(WRITE_SAME_16, filter->write_ok);
- __set_bit(WRITE_SAME_32, filter->write_ok);
- __set_bit(ERASE, filter->write_ok);
- __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok);
- __set_bit(MODE_SELECT, filter->write_ok);
- __set_bit(LOG_SELECT, filter->write_ok);
- __set_bit(GPCMD_BLANK, filter->write_ok);
- __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok);
- __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok);
- __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok);
- __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok);
- __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok);
- __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok);
- __set_bit(GPCMD_SEND_EVENT, filter->write_ok);
- __set_bit(GPCMD_SEND_KEY, filter->write_ok);
- __set_bit(GPCMD_SEND_OPC, filter->write_ok);
- __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok);
- __set_bit(GPCMD_SET_SPEED, filter->write_ok);
- __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok);
- __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok);
- __set_bit(GPCMD_SET_STREAMING, filter->write_ok);
- __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok);
-
- /* ZBC Commands */
- __set_bit(ZBC_OUT, filter->write_ok);
- __set_bit(ZBC_IN, filter->read_ok);
-}
-
-int blk_verify_command(unsigned char *cmd, fmode_t mode)
-{
- struct blk_cmd_filter *filter = &blk_default_cmd_filter;
-
- /* root can do any command. */
- if (capable(CAP_SYS_RAWIO))
- return 0;
-
- /* Anybody who can open the device can do a read-safe command */
- if (test_bit(cmd[0], filter->read_ok))
- return 0;
-
- /* Write-safe commands require a writable open */
- if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE))
- return 0;
-
- return -EPERM;
-}
-EXPORT_SYMBOL(blk_verify_command);
-
-static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq,
- struct sg_io_hdr *hdr, fmode_t mode)
-{
- struct scsi_request *req = scsi_req(rq);
-
- if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len))
- return -EFAULT;
- if (blk_verify_command(req->cmd, mode))
- return -EPERM;
-
- /*
- * fill in request structure
- */
- req->cmd_len = hdr->cmd_len;
-
- rq->timeout = msecs_to_jiffies(hdr->timeout);
- if (!rq->timeout)
- rq->timeout = q->sg_timeout;
- if (!rq->timeout)
- rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
- if (rq->timeout < BLK_MIN_SG_TIMEOUT)
- rq->timeout = BLK_MIN_SG_TIMEOUT;
-
- return 0;
-}
-
-static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr,
- struct bio *bio)
-{
- struct scsi_request *req = scsi_req(rq);
- int r, ret = 0;
-
- /*
- * fill in all the output members
- */
- hdr->status = req->result & 0xff;
- hdr->masked_status = status_byte(req->result);
- hdr->msg_status = msg_byte(req->result);
- hdr->host_status = host_byte(req->result);
- hdr->driver_status = driver_byte(req->result);
- hdr->info = 0;
- if (hdr->masked_status || hdr->host_status || hdr->driver_status)
- hdr->info |= SG_INFO_CHECK;
- hdr->resid = req->resid_len;
- hdr->sb_len_wr = 0;
-
- if (req->sense_len && hdr->sbp) {
- int len = min((unsigned int) hdr->mx_sb_len, req->sense_len);
-
- if (!copy_to_user(hdr->sbp, req->sense, len))
- hdr->sb_len_wr = len;
- else
- ret = -EFAULT;
- }
-
- r = blk_rq_unmap_user(bio);
- if (!ret)
- ret = r;
-
- return ret;
-}
-
-static int sg_io(struct request_queue *q, struct gendisk *bd_disk,
- struct sg_io_hdr *hdr, fmode_t mode)
-{
- unsigned long start_time;
- ssize_t ret = 0;
- int writing = 0;
- int at_head = 0;
- struct request *rq;
- struct scsi_request *req;
- struct bio *bio;
-
- if (hdr->interface_id != 'S')
- return -EINVAL;
-
- if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9))
- return -EIO;
-
- if (hdr->dxfer_len)
- switch (hdr->dxfer_direction) {
- default:
- return -EINVAL;
- case SG_DXFER_TO_DEV:
- writing = 1;
- break;
- case SG_DXFER_TO_FROM_DEV:
- case SG_DXFER_FROM_DEV:
- break;
- }
- if (hdr->flags & SG_FLAG_Q_AT_HEAD)
- at_head = 1;
-
- ret = -ENOMEM;
- rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
- req = scsi_req(rq);
-
- if (hdr->cmd_len > BLK_MAX_CDB) {
- req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL);
- if (!req->cmd)
- goto out_put_request;
- }
-
- ret = blk_fill_sghdr_rq(q, rq, hdr, mode);
- if (ret < 0)
- goto out_free_cdb;
-
- ret = 0;
- if (hdr->iovec_count) {
- struct iov_iter i;
- struct iovec *iov = NULL;
-
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall())
- ret = compat_import_iovec(rq_data_dir(rq),
- hdr->dxferp, hdr->iovec_count,
- 0, &iov, &i);
- else
-#endif
- ret = import_iovec(rq_data_dir(rq),
- hdr->dxferp, hdr->iovec_count,
- 0, &iov, &i);
- if (ret < 0)
- goto out_free_cdb;
-
- /* SG_IO howto says that the shorter of the two wins */
- iov_iter_truncate(&i, hdr->dxfer_len);
-
- ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL);
- kfree(iov);
- } else if (hdr->dxfer_len)
- ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len,
- GFP_KERNEL);
-
- if (ret)
- goto out_free_cdb;
-
- bio = rq->bio;
- req->retries = 0;
-
- start_time = jiffies;
-
- /* ignore return value. All information is passed back to caller
- * (if he doesn't check that is his problem).
- * N.B. a non-zero SCSI status is _not_ necessarily an error.
- */
- blk_execute_rq(q, bd_disk, rq, at_head);
-
- hdr->duration = jiffies_to_msecs(jiffies - start_time);
-
- ret = blk_complete_sghdr_rq(rq, hdr, bio);
-
-out_free_cdb:
- scsi_req_free_cmd(req);
-out_put_request:
- blk_put_request(rq);
- return ret;
-}
-
-/**
- * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl
- * @q: request queue to send scsi commands down
- * @disk: gendisk to operate on (option)
- * @mode: mode used to open the file through which the ioctl has been
- * submitted
- * @sic: userspace structure describing the command to perform
- *
- * Send down the scsi command described by @sic to the device below
- * the request queue @q. If @file is non-NULL it's used to perform
- * fine-grained permission checks that allow users to send down
- * non-destructive SCSI commands. If the caller has a struct gendisk
- * available it should be passed in as @disk to allow the low level
- * driver to use the information contained in it. A non-NULL @disk
- * is only allowed if the caller knows that the low level driver doesn't
- * need it (e.g. in the scsi subsystem).
- *
- * Notes:
- * - This interface is deprecated - users should use the SG_IO
- * interface instead, as this is a more flexible approach to
- * performing SCSI commands on a device.
- * - The SCSI command length is determined by examining the 1st byte
- * of the given command. There is no way to override this.
- * - Data transfers are limited to PAGE_SIZE
- * - The length (x + y) must be at least OMAX_SB_LEN bytes long to
- * accommodate the sense buffer when an error occurs.
- * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that
- * old code will not be surprised.
- * - If a Unix error occurs (e.g. ENOMEM) then the user will receive
- * a negative return and the Unix error code in 'errno'.
- * If the SCSI command succeeds then 0 is returned.
- * Positive numbers returned are the compacted SCSI error codes (4
- * bytes in one int) where the lowest byte is the SCSI status.
- */
-int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode,
- struct scsi_ioctl_command __user *sic)
-{
- enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */
- struct request *rq;
- struct scsi_request *req;
- int err;
- unsigned int in_len, out_len, bytes, opcode, cmdlen;
- char *buffer = NULL;
-
- if (!sic)
- return -EINVAL;
-
- /*
- * get in an out lengths, verify they don't exceed a page worth of data
- */
- if (get_user(in_len, &sic->inlen))
- return -EFAULT;
- if (get_user(out_len, &sic->outlen))
- return -EFAULT;
- if (in_len > PAGE_SIZE || out_len > PAGE_SIZE)
- return -EINVAL;
- if (get_user(opcode, sic->data))
- return -EFAULT;
-
- bytes = max(in_len, out_len);
- if (bytes) {
- buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN);
- if (!buffer)
- return -ENOMEM;
-
- }
-
- rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0);
- if (IS_ERR(rq)) {
- err = PTR_ERR(rq);
- goto error_free_buffer;
- }
- req = scsi_req(rq);
-
- cmdlen = COMMAND_SIZE(opcode);
-
- /*
- * get command and data to send to device, if any
- */
- err = -EFAULT;
- req->cmd_len = cmdlen;
- if (copy_from_user(req->cmd, sic->data, cmdlen))
- goto error;
-
- if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len))
- goto error;
-
- err = blk_verify_command(req->cmd, mode);
- if (err)
- goto error;
-
- /* default. possible overriden later */
- req->retries = 5;
-
- switch (opcode) {
- case SEND_DIAGNOSTIC:
- case FORMAT_UNIT:
- rq->timeout = FORMAT_UNIT_TIMEOUT;
- req->retries = 1;
- break;
- case START_STOP:
- rq->timeout = START_STOP_TIMEOUT;
- break;
- case MOVE_MEDIUM:
- rq->timeout = MOVE_MEDIUM_TIMEOUT;
- break;
- case READ_ELEMENT_STATUS:
- rq->timeout = READ_ELEMENT_STATUS_TIMEOUT;
- break;
- case READ_DEFECT_DATA:
- rq->timeout = READ_DEFECT_DATA_TIMEOUT;
- req->retries = 1;
- break;
- default:
- rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
- break;
- }
-
- if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) {
- err = DRIVER_ERROR << 24;
- goto error;
- }
-
- blk_execute_rq(q, disk, rq, 0);
-
- err = req->result & 0xff; /* only 8 bit SCSI status */
- if (err) {
- if (req->sense_len && req->sense) {
- bytes = (OMAX_SB_LEN > req->sense_len) ?
- req->sense_len : OMAX_SB_LEN;
- if (copy_to_user(sic->data, req->sense, bytes))
- err = -EFAULT;
- }
- } else {
- if (copy_to_user(sic->data, buffer, out_len))
- err = -EFAULT;
- }
-
-error:
- blk_put_request(rq);
-
-error_free_buffer:
- kfree(buffer);
-
- return err;
-}
-EXPORT_SYMBOL_GPL(sg_scsi_ioctl);
-
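
The deleted sg_scsi_ioctl() documentation above already points users at SG_IO; a hedged user-space sketch of that replacement path (a simple INQUIRY), independent of the removed in-kernel code and using a placeholder device path:

/* Hedged sketch: issue a 6-byte INQUIRY through SG_IO instead of the
 * removed SCSI_IOCTL_SEND_COMMAND path. /dev/sdX is a placeholder.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <scsi/sg.h>

int main(void)
{
	unsigned char cdb[6] = { 0x12, 0, 0, 0, 96, 0 };	/* INQUIRY */
	unsigned char buf[96], sense[32];
	struct sg_io_hdr hdr;
	int fd = open("/dev/sdX", O_RDONLY);

	if (fd < 0)
		return 1;
	memset(&hdr, 0, sizeof(hdr));
	hdr.interface_id = 'S';
	hdr.dxfer_direction = SG_DXFER_FROM_DEV;
	hdr.cmdp = cdb;
	hdr.cmd_len = sizeof(cdb);
	hdr.dxferp = buf;
	hdr.dxfer_len = sizeof(buf);
	hdr.sbp = sense;
	hdr.mx_sb_len = sizeof(sense);
	hdr.timeout = 5000;			/* milliseconds */

	if (ioctl(fd, SG_IO, &hdr) < 0 || (hdr.info & SG_INFO_CHECK)) {
		perror("SG_IO INQUIRY");
		return 1;
	}
	printf("vendor: %.8s\n", buf + 8);
	close(fd);
	return 0;
}
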
-/* Send basic block requests */
-static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk,
- int cmd, int data)
-{
- struct request *rq;
- int err;
-
- rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0);
- if (IS_ERR(rq))
- return PTR_ERR(rq);
- rq->timeout = BLK_DEFAULT_SG_TIMEOUT;
- scsi_req(rq)->cmd[0] = cmd;
- scsi_req(rq)->cmd[4] = data;
- scsi_req(rq)->cmd_len = 6;
- blk_execute_rq(q, bd_disk, rq, 0);
- err = scsi_req(rq)->result ? -EIO : 0;
- blk_put_request(rq);
-
- return err;
-}
-
-static inline int blk_send_start_stop(struct request_queue *q,
- struct gendisk *bd_disk, int data)
-{
- return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data);
-}
-
-int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp)
-{
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall()) {
- struct compat_sg_io_hdr hdr32 = {
- .interface_id = hdr->interface_id,
- .dxfer_direction = hdr->dxfer_direction,
- .cmd_len = hdr->cmd_len,
- .mx_sb_len = hdr->mx_sb_len,
- .iovec_count = hdr->iovec_count,
- .dxfer_len = hdr->dxfer_len,
- .dxferp = (uintptr_t)hdr->dxferp,
- .cmdp = (uintptr_t)hdr->cmdp,
- .sbp = (uintptr_t)hdr->sbp,
- .timeout = hdr->timeout,
- .flags = hdr->flags,
- .pack_id = hdr->pack_id,
- .usr_ptr = (uintptr_t)hdr->usr_ptr,
- .status = hdr->status,
- .masked_status = hdr->masked_status,
- .msg_status = hdr->msg_status,
- .sb_len_wr = hdr->sb_len_wr,
- .host_status = hdr->host_status,
- .driver_status = hdr->driver_status,
- .resid = hdr->resid,
- .duration = hdr->duration,
- .info = hdr->info,
- };
-
- if (copy_to_user(argp, &hdr32, sizeof(hdr32)))
- return -EFAULT;
-
- return 0;
- }
-#endif
-
- if (copy_to_user(argp, hdr, sizeof(*hdr)))
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL(put_sg_io_hdr);
-
-int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp)
-{
-#ifdef CONFIG_COMPAT
- struct compat_sg_io_hdr hdr32;
-
- if (in_compat_syscall()) {
- if (copy_from_user(&hdr32, argp, sizeof(hdr32)))
- return -EFAULT;
-
- *hdr = (struct sg_io_hdr) {
- .interface_id = hdr32.interface_id,
- .dxfer_direction = hdr32.dxfer_direction,
- .cmd_len = hdr32.cmd_len,
- .mx_sb_len = hdr32.mx_sb_len,
- .iovec_count = hdr32.iovec_count,
- .dxfer_len = hdr32.dxfer_len,
- .dxferp = compat_ptr(hdr32.dxferp),
- .cmdp = compat_ptr(hdr32.cmdp),
- .sbp = compat_ptr(hdr32.sbp),
- .timeout = hdr32.timeout,
- .flags = hdr32.flags,
- .pack_id = hdr32.pack_id,
- .usr_ptr = compat_ptr(hdr32.usr_ptr),
- .status = hdr32.status,
- .masked_status = hdr32.masked_status,
- .msg_status = hdr32.msg_status,
- .sb_len_wr = hdr32.sb_len_wr,
- .host_status = hdr32.host_status,
- .driver_status = hdr32.driver_status,
- .resid = hdr32.resid,
- .duration = hdr32.duration,
- .info = hdr32.info,
- };
-
- return 0;
- }
-#endif
-
- if (copy_from_user(hdr, argp, sizeof(*hdr)))
- return -EFAULT;
-
- return 0;
-}
-EXPORT_SYMBOL(get_sg_io_hdr);
-
-#ifdef CONFIG_COMPAT
-struct compat_cdrom_generic_command {
- unsigned char cmd[CDROM_PACKET_SIZE];
- compat_caddr_t buffer;
- compat_uint_t buflen;
- compat_int_t stat;
- compat_caddr_t sense;
- unsigned char data_direction;
- compat_int_t quiet;
- compat_int_t timeout;
- compat_caddr_t reserved[1];
-};
-#endif
-
-static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc,
- const void __user *arg)
-{
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall()) {
- struct compat_cdrom_generic_command cgc32;
-
- if (copy_from_user(&cgc32, arg, sizeof(cgc32)))
- return -EFAULT;
-
- *cgc = (struct cdrom_generic_command) {
- .buffer = compat_ptr(cgc32.buffer),
- .buflen = cgc32.buflen,
- .stat = cgc32.stat,
- .sense = compat_ptr(cgc32.sense),
- .data_direction = cgc32.data_direction,
- .quiet = cgc32.quiet,
- .timeout = cgc32.timeout,
- .reserved[0] = compat_ptr(cgc32.reserved[0]),
- };
- memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE);
- return 0;
- }
-#endif
- if (copy_from_user(cgc, arg, sizeof(*cgc)))
- return -EFAULT;
-
- return 0;
-}
-
-static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc,
- void __user *arg)
-{
-#ifdef CONFIG_COMPAT
- if (in_compat_syscall()) {
- struct compat_cdrom_generic_command cgc32 = {
- .buffer = (uintptr_t)(cgc->buffer),
- .buflen = cgc->buflen,
- .stat = cgc->stat,
- .sense = (uintptr_t)(cgc->sense),
- .data_direction = cgc->data_direction,
- .quiet = cgc->quiet,
- .timeout = cgc->timeout,
- .reserved[0] = (uintptr_t)(cgc->reserved[0]),
- };
- memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE);
-
- if (copy_to_user(arg, &cgc32, sizeof(cgc32)))
- return -EFAULT;
-
- return 0;
- }
-#endif
- if (copy_to_user(arg, cgc, sizeof(*cgc)))
- return -EFAULT;
-
- return 0;
-}
-
-static int scsi_cdrom_send_packet(struct request_queue *q,
- struct gendisk *bd_disk,
- fmode_t mode, void __user *arg)
-{
- struct cdrom_generic_command cgc;
- struct sg_io_hdr hdr;
- int err;
-
- err = scsi_get_cdrom_generic_arg(&cgc, arg);
- if (err)
- return err;
-
- cgc.timeout = clock_t_to_jiffies(cgc.timeout);
- memset(&hdr, 0, sizeof(hdr));
- hdr.interface_id = 'S';
- hdr.cmd_len = sizeof(cgc.cmd);
- hdr.dxfer_len = cgc.buflen;
- switch (cgc.data_direction) {
- case CGC_DATA_UNKNOWN:
- hdr.dxfer_direction = SG_DXFER_UNKNOWN;
- break;
- case CGC_DATA_WRITE:
- hdr.dxfer_direction = SG_DXFER_TO_DEV;
- break;
- case CGC_DATA_READ:
- hdr.dxfer_direction = SG_DXFER_FROM_DEV;
- break;
- case CGC_DATA_NONE:
- hdr.dxfer_direction = SG_DXFER_NONE;
- break;
- default:
- return -EINVAL;
- }
-
- hdr.dxferp = cgc.buffer;
- hdr.sbp = cgc.sense;
- if (hdr.sbp)
- hdr.mx_sb_len = sizeof(struct request_sense);
- hdr.timeout = jiffies_to_msecs(cgc.timeout);
- hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd;
- hdr.cmd_len = sizeof(cgc.cmd);
-
- err = sg_io(q, bd_disk, &hdr, mode);
- if (err == -EFAULT)
- return -EFAULT;
-
- if (hdr.status)
- return -EIO;
-
- cgc.stat = err;
- cgc.buflen = hdr.resid;
- if (scsi_put_cdrom_generic_arg(&cgc, arg))
- return -EFAULT;
-
- return err;
-}
-
-int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode,
- unsigned int cmd, void __user *arg)
-{
- int err;
-
- if (!q)
- return -ENXIO;
-
- switch (cmd) {
- /*
- * new sgv3 interface
- */
- case SG_GET_VERSION_NUM:
- err = sg_get_version(arg);
- break;
- case SCSI_IOCTL_GET_IDLUN:
- err = scsi_get_idlun(q, arg);
- break;
- case SCSI_IOCTL_GET_BUS_NUMBER:
- err = scsi_get_bus(q, arg);
- break;
- case SG_SET_TIMEOUT:
- err = sg_set_timeout(q, arg);
- break;
- case SG_GET_TIMEOUT:
- err = sg_get_timeout(q);
- break;
- case SG_GET_RESERVED_SIZE:
- err = sg_get_reserved_size(q, arg);
- break;
- case SG_SET_RESERVED_SIZE:
- err = sg_set_reserved_size(q, arg);
- break;
- case SG_EMULATED_HOST:
- err = sg_emulated_host(q, arg);
- break;
- case SG_IO: {
- struct sg_io_hdr hdr;
-
- err = get_sg_io_hdr(&hdr, arg);
- if (err)
- break;
- err = sg_io(q, bd_disk, &hdr, mode);
- if (err == -EFAULT)
- break;
-
- if (put_sg_io_hdr(&hdr, arg))
- err = -EFAULT;
- break;
- }
- case CDROM_SEND_PACKET:
- err = scsi_cdrom_send_packet(q, bd_disk, mode, arg);
- break;
-
- /*
- * old junk scsi send command ioctl
- */
- case SCSI_IOCTL_SEND_COMMAND:
- printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm);
- err = -EINVAL;
- if (!arg)
- break;
-
- err = sg_scsi_ioctl(q, bd_disk, mode, arg);
- break;
- case CDROMCLOSETRAY:
- err = blk_send_start_stop(q, bd_disk, 0x03);
- break;
- case CDROMEJECT:
- err = blk_send_start_stop(q, bd_disk, 0x02);
- break;
- default:
- err = -ENOTTY;
- }
-
- return err;
-}
-EXPORT_SYMBOL(scsi_cmd_ioctl);
-
-int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd)
-{
- if (bd && bd == bd->bd_contains)
- return 0;
-
- if (capable(CAP_SYS_RAWIO))
- return 0;
-
- return -ENOIOCTLCMD;
-}
-EXPORT_SYMBOL(scsi_verify_blk_ioctl);
-
-int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode,
- unsigned int cmd, void __user *arg)
-{
- int ret;
-
- ret = scsi_verify_blk_ioctl(bd, cmd);
- if (ret < 0)
- return ret;
-
- return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg);
-}
-EXPORT_SYMBOL(scsi_cmd_blk_ioctl);
-
-/**
- * scsi_req_init - initialize certain fields of a scsi_request structure
- * @req: Pointer to a scsi_request structure.
- * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members
- * of struct scsi_request.
- */
-void scsi_req_init(struct scsi_request *req)
-{
- memset(req->__cmd, 0, sizeof(req->__cmd));
- req->cmd = req->__cmd;
- req->cmd_len = BLK_MAX_CDB;
- req->sense_len = 0;
-}
-EXPORT_SYMBOL(scsi_req_init);
-
-static int __init blk_scsi_ioctl_init(void)
-{
- blk_set_cmd_filter_defaults(&blk_default_cmd_filter);
- return 0;
-}
-fs_initcall(blk_scsi_ioctl_init);
diff --git a/block/sed-opal.c b/block/sed-opal.c
index daafadbb88ca..fa4dba5d8531 100644
--- a/block/sed-opal.c
+++ b/block/sed-opal.c
@@ -13,13 +13,17 @@
#include <linux/device.h>
#include <linux/kernel.h>
#include <linux/list.h>
-#include <linux/genhd.h>
+#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/uaccess.h>
#include <uapi/linux/sed-opal.h>
#include <linux/sed-opal.h>
+#include <linux/sed-opal-key.h>
#include <linux/string.h>
#include <linux/kdev_t.h>
+#include <linux/key.h>
+#include <linux/key-type.h>
+#include <keys/user-type.h>
#include "opal_proto.h"
@@ -29,6 +33,8 @@
/* Number of bytes needed by cmd_finalize. */
#define CMD_FINALIZE_BYTES_NEEDED 7
+static struct key *sed_opal_keyring;
+
struct opal_step {
int (*fn)(struct opal_dev *dev, void *data);
void *data;
@@ -74,8 +80,7 @@ struct parsed_resp {
};
struct opal_dev {
- bool supported;
- bool mbr_enabled;
+ u32 flags;
void *data;
sec_send_recv *send_recv;
@@ -84,12 +89,14 @@ struct opal_dev {
u16 comid;
u32 hsn;
u32 tsn;
- u64 align;
+ u64 align; /* alignment granularity */
u64 lowest_lba;
+ u32 logical_block_size;
+ u8 align_required; /* ALIGN: 0 or 1 */
size_t pos;
- u8 cmd[IO_BUFFER_LENGTH];
- u8 resp[IO_BUFFER_LENGTH];
+ u8 *cmd;
+ u8 *resp;
struct parsed_resp parsed;
size_t prev_d_len;
@@ -133,6 +140,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = {
{ 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 },
[OPAL_LOCKINGRANGE_GLOBAL] =
{ 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 },
+ [OPAL_LOCKINGRANGE_ACE_START_TO_KEY] =
+ { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xD0, 0x01 },
[OPAL_LOCKINGRANGE_ACE_RDLOCKED] =
{ 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 },
[OPAL_LOCKINGRANGE_ACE_WRLOCKED] =
@@ -266,6 +275,101 @@ static void print_buffer(const u8 *ptr, u32 length)
#endif
}
+/*
+ * Allocate/update a SED Opal key and add it to the SED Opal keyring.
+ */
+static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen)
+{
+ key_ref_t kr;
+
+ if (!sed_opal_keyring)
+ return -ENOKEY;
+
+ kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user",
+ desc, (const void *)key_data, keylen,
+ KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE,
+ KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN |
+ KEY_ALLOC_BYPASS_RESTRICTION);
+ if (IS_ERR(kr)) {
+ pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr));
+ return PTR_ERR(kr);
+ }
+
+ return 0;
+}
+
+/*
+ * Read a SED Opal key from the SED Opal keyring.
+ */
+static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen)
+{
+ int ret;
+ key_ref_t kref;
+ struct key *key;
+
+ if (!sed_opal_keyring)
+ return -ENOKEY;
+
+ kref = keyring_search(make_key_ref(sed_opal_keyring, true),
+ &key_type_user, key_name, true);
+
+ if (IS_ERR(kref))
+ return PTR_ERR(kref);
+
+ key = key_ref_to_ptr(kref);
+ down_read(&key->sem);
+ ret = key_validate(key);
+ if (ret == 0) {
+ if (buflen > key->datalen)
+ buflen = key->datalen;
+
+ ret = key->type->read(key, (char *)buffer, buflen);
+ }
+ up_read(&key->sem);
+
+ key_ref_put(kref);
+
+ return ret;
+}
+
+static int opal_get_key(struct opal_dev *dev, struct opal_key *key)
+{
+ int ret = 0;
+
+ switch (key->key_type) {
+ case OPAL_INCLUDED:
+ /* the key is ready to use */
+ break;
+ case OPAL_KEYRING:
+ /* the key is in the keyring */
+ ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX);
+ if (ret > 0) {
+ if (ret > U8_MAX) {
+ ret = -ENOSPC;
+ goto error;
+ }
+ key->key_len = ret;
+ key->key_type = OPAL_INCLUDED;
+ }
+ break;
+ default:
+ ret = -EINVAL;
+ break;
+ }
+ if (ret < 0)
+ goto error;
+
+ /* must have a PEK by now or it's an error */
+ if (key->key_type != OPAL_INCLUDED || key->key_len == 0) {
+ ret = -EINVAL;
+ goto error;
+ }
+ return 0;
+error:
+ pr_debug("Error getting password: %d\n", ret);
+ return ret;
+}
+
static bool check_tper(const void *data)
{
const struct d0_tper_features *tper = data;
@@ -280,6 +384,30 @@ static bool check_tper(const void *data)
return true;
}
+static bool check_lcksuppt(const void *data)
+{
+ const struct d0_locking_features *lfeat = data;
+ u8 sup_feat = lfeat->supported_features;
+
+ return !!(sup_feat & LOCKING_SUPPORTED_MASK);
+}
+
+static bool check_lckenabled(const void *data)
+{
+ const struct d0_locking_features *lfeat = data;
+ u8 sup_feat = lfeat->supported_features;
+
+ return !!(sup_feat & LOCKING_ENABLED_MASK);
+}
+
+static bool check_locked(const void *data)
+{
+ const struct d0_locking_features *lfeat = data;
+ u8 sup_feat = lfeat->supported_features;
+
+ return !!(sup_feat & LOCKED_MASK);
+}
+
static bool check_mbrenabled(const void *data)
{
const struct d0_locking_features *lfeat = data;
@@ -288,6 +416,14 @@ static bool check_mbrenabled(const void *data)
return !!(sup_feat & MBR_ENABLED_MASK);
}
+static bool check_mbrdone(const void *data)
+{
+ const struct d0_locking_features *lfeat = data;
+ u8 sup_feat = lfeat->supported_features;
+
+ return !!(sup_feat & MBR_DONE_MASK);
+}
+
static bool check_sum(const void *data)
{
const struct d0_single_user_mode *sum = data;
@@ -376,6 +512,8 @@ static void check_geometry(struct opal_dev *dev, const void *data)
dev->align = be64_to_cpu(geo->alignment_granularity);
dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba);
+ dev->logical_block_size = be32_to_cpu(geo->logical_block_size);
+ dev->align_required = geo->reserved01 & 1;
}
static int execute_step(struct opal_dev *dev,
@@ -426,8 +564,11 @@ out_error:
return error;
}
-static int opal_discovery0_end(struct opal_dev *dev)
+static int opal_discovery0_end(struct opal_dev *dev, void *data)
{
+ struct opal_discovery *discv_out = data; /* may be NULL */
+ u8 __user *buf_out;
+ u64 len_out;
bool found_com_id = false, supported = true, single_user = false;
const struct d0_header *hdr = (struct d0_header *)dev->resp;
const u8 *epos = dev->resp, *cpos = dev->resp;
@@ -435,7 +576,7 @@ static int opal_discovery0_end(struct opal_dev *dev)
u32 hlen = be32_to_cpu(hdr->length);
print_buffer(dev->resp, hlen);
- dev->mbr_enabled = false;
+ dev->flags &= OPAL_FL_SUPPORTED;
if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) {
pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n",
@@ -443,6 +584,15 @@ static int opal_discovery0_end(struct opal_dev *dev)
return -EFAULT;
}
+ if (discv_out) {
+ buf_out = (u8 __user *)(uintptr_t)discv_out->data;
+ len_out = min_t(u64, discv_out->size, hlen);
+ if (buf_out && copy_to_user(buf_out, dev->resp, len_out))
+ return -EFAULT;
+
+ discv_out->size = hlen; /* actual size of data */
+ }
+
epos += hlen; /* end of buffer */
cpos += sizeof(*hdr); /* current position on buffer */
@@ -456,12 +606,23 @@ static int opal_discovery0_end(struct opal_dev *dev)
break;
case FC_SINGLEUSER:
single_user = check_sum(body->features);
+ if (single_user)
+ dev->flags |= OPAL_FL_SUM_SUPPORTED;
break;
case FC_GEOMETRY:
check_geometry(dev, body);
break;
case FC_LOCKING:
- dev->mbr_enabled = check_mbrenabled(body->features);
+ if (check_lcksuppt(body->features))
+ dev->flags |= OPAL_FL_LOCKING_SUPPORTED;
+ if (check_lckenabled(body->features))
+ dev->flags |= OPAL_FL_LOCKING_ENABLED;
+ if (check_locked(body->features))
+ dev->flags |= OPAL_FL_LOCKED;
+ if (check_mbrenabled(body->features))
+ dev->flags |= OPAL_FL_MBR_ENABLED;
+ if (check_mbrdone(body->features))
+ dev->flags |= OPAL_FL_MBR_DONE;
break;
case FC_ENTERPRISE:
case FC_DATASTORE:
@@ -517,13 +678,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data)
if (ret)
return ret;
- return opal_discovery0_end(dev);
+ return opal_discovery0_end(dev, data);
}
static int opal_discovery0_step(struct opal_dev *dev)
{
const struct opal_step discovery0_step = {
- opal_discovery0,
+ opal_discovery0, NULL
};
return execute_step(dev, &discovery0_step, 0);
@@ -895,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length,
token_length = response_parse_medium(iter, pos);
else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */
token_length = response_parse_long(iter, pos);
+ else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */
+ token_length = 1;
else /* TOKEN */
token_length = response_parse_token(iter, pos);
if (token_length < 0)
return token_length;
+ if (pos[0] != EMPTY_ATOM_BYTE)
+ num_entries++;
+
pos += token_length;
total -= token_length;
iter++;
- num_entries++;
}
resp->num = num_entries;
@@ -1105,12 +1270,8 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont)
return opal_send_recv(dev, cont);
}
-/*
- * request @column from table @table on device @dev. On success, the column
- * data will be available in dev->resp->tok[4]
- */
-static int generic_get_column(struct opal_dev *dev, const u8 *table,
- u64 column)
+static int generic_get_columns(struct opal_dev *dev, const u8 *table,
+ u64 start_column, u64 end_column)
{
int err;
@@ -1120,12 +1281,12 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table,
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_STARTCOLUMN);
- add_token_u64(&err, dev, column);
+ add_token_u64(&err, dev, start_column);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_STARTNAME);
add_token_u8(&err, dev, OPAL_ENDCOLUMN);
- add_token_u64(&err, dev, column);
+ add_token_u64(&err, dev, end_column);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
@@ -1137,6 +1298,16 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table,
}
/*
+ * request @column from table @table on device @dev. On success, the column
+ * data will be available in dev->resp->tok[4]
+ */
+static int generic_get_column(struct opal_dev *dev, const u8 *table,
+ u64 column)
+{
+ return generic_get_columns(dev, table, column, column);
+}
+
+/*
* see TCG SAS 5.3.2.3 for a description of the available columns
*
* the result is provided in dev->resp->tok[4]
@@ -1395,6 +1566,129 @@ static int setup_locking_range(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
+static int response_get_column(const struct parsed_resp *resp,
+ int *iter,
+ u8 column,
+ u64 *value)
+{
+ const struct opal_resp_tok *tok;
+ int n = *iter;
+ u64 val;
+
+ tok = response_get_token(resp, n);
+ if (IS_ERR(tok))
+ return PTR_ERR(tok);
+
+ if (!response_token_matches(tok, OPAL_STARTNAME)) {
+ pr_debug("Unexpected response token type %d.\n", n);
+ return OPAL_INVAL_PARAM;
+ }
+ n++;
+
+ if (response_get_u64(resp, n) != column) {
+ pr_debug("Token %d does not match expected column %u.\n",
+ n, column);
+ return OPAL_INVAL_PARAM;
+ }
+ n++;
+
+ val = response_get_u64(resp, n);
+ n++;
+
+ tok = response_get_token(resp, n);
+ if (IS_ERR(tok))
+ return PTR_ERR(tok);
+
+ if (!response_token_matches(tok, OPAL_ENDNAME)) {
+ pr_debug("Unexpected response token type %d.\n", n);
+ return OPAL_INVAL_PARAM;
+ }
+ n++;
+
+ *value = val;
+ *iter = n;
+
+ return 0;
+}
+
+static int locking_range_status(struct opal_dev *dev, void *data)
+{
+ u8 lr_buffer[OPAL_UID_LENGTH];
+ u64 resp;
+ bool rlocked, wlocked;
+ int err, tok_n = 2;
+ struct opal_lr_status *lrst = data;
+
+ err = build_locking_range(lr_buffer, sizeof(lr_buffer),
+ lrst->session.opal_key.lr);
+ if (err)
+ return err;
+
+ err = generic_get_columns(dev, lr_buffer, OPAL_RANGESTART,
+ OPAL_WRITELOCKED);
+ if (err) {
+ pr_debug("Couldn't get lr %u table columns %d to %d.\n",
+ lrst->session.opal_key.lr, OPAL_RANGESTART,
+ OPAL_WRITELOCKED);
+ return err;
+ }
+
+ /* range start */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGESTART,
+ &lrst->range_start);
+ if (err)
+ return err;
+
+ /* range length */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGELENGTH,
+ &lrst->range_length);
+ if (err)
+ return err;
+
+ /* RLE */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKENABLED,
+ &resp);
+ if (err)
+ return err;
+
+ lrst->RLE = !!resp;
+
+ /* WLE */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKENABLED,
+ &resp);
+ if (err)
+ return err;
+
+ lrst->WLE = !!resp;
+
+ /* read locked */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKED, &resp);
+ if (err)
+ return err;
+
+ rlocked = !!resp;
+
+ /* write locked */
+ err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKED, &resp);
+ if (err)
+ return err;
+
+ wlocked = !!resp;
+
+ /* opal_lock_state cannot represent a read-locked-only state. */
+ lrst->l_state = OPAL_RW;
+ if (rlocked && wlocked)
+ lrst->l_state = OPAL_LK;
+ else if (wlocked)
+ lrst->l_state = OPAL_RO;
+ else if (rlocked) {
+ pr_debug("Can not report read locked only state.\n");
+ return -EINVAL;
+ }
+
+ return 0;
+}
+
static int start_generic_opal_session(struct opal_dev *dev,
enum opal_uid auth,
enum opal_uid sp_type,
@@ -1580,6 +1874,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
+static int revert_lsp(struct opal_dev *dev, void *data)
+{
+ struct opal_revert_lsp *rev = data;
+ int err;
+
+ err = cmd_start(dev, opaluid[OPAL_THISSP_UID],
+ opalmethod[OPAL_REVERTSP]);
+ add_token_u8(&err, dev, OPAL_STARTNAME);
+ add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY);
+ add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ?
+ OPAL_TRUE : OPAL_FALSE);
+ add_token_u8(&err, dev, OPAL_ENDNAME);
+ if (err) {
+ pr_debug("Error building REVERT SP command.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
static int erase_locking_range(struct opal_dev *dev, void *data)
{
struct opal_session_info *session = data;
@@ -1717,25 +2031,43 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
-static int add_user_to_lr(struct opal_dev *dev, void *data)
+static void add_authority_object_ref(int *err,
+ struct opal_dev *dev,
+ const u8 *uid,
+ size_t uid_len)
+{
+ add_token_u8(err, dev, OPAL_STARTNAME);
+ add_token_bytestring(err, dev,
+ opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
+ OPAL_UID_LENGTH/2);
+ add_token_bytestring(err, dev, uid, uid_len);
+ add_token_u8(err, dev, OPAL_ENDNAME);
+}
+
+static void add_boolean_object_ref(int *err,
+ struct opal_dev *dev,
+ u8 boolean_op)
+{
+ add_token_u8(err, dev, OPAL_STARTNAME);
+ add_token_bytestring(err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE],
+ OPAL_UID_LENGTH/2);
+ add_token_u8(err, dev, boolean_op);
+ add_token_u8(err, dev, OPAL_ENDNAME);
+}
+
+static int set_lr_boolean_ace(struct opal_dev *dev,
+ unsigned int opal_uid,
+ u8 lr,
+ const u8 *users,
+ size_t users_len)
{
u8 lr_buffer[OPAL_UID_LENGTH];
u8 user_uid[OPAL_UID_LENGTH];
- struct opal_lock_unlock *lkul = data;
+ u8 u;
int err;
- memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED],
- OPAL_UID_LENGTH);
-
- if (lkul->l_state == OPAL_RW)
- memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED],
- OPAL_UID_LENGTH);
-
- lr_buffer[7] = lkul->session.opal_key.lr;
-
- memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH);
-
- user_uid[7] = lkul->session.who;
+ memcpy(lr_buffer, opaluid[opal_uid], OPAL_UID_LENGTH);
+ lr_buffer[7] = lr;
err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]);
@@ -1748,35 +2080,49 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
add_token_u8(&err, dev, OPAL_STARTLIST);
+ for (u = 0; u < users_len; u++) {
+ if (users[u] == OPAL_ADMIN1)
+ memcpy(user_uid, opaluid[OPAL_ADMIN1_UID],
+ OPAL_UID_LENGTH);
+ else {
+ memcpy(user_uid, opaluid[OPAL_USER1_UID],
+ OPAL_UID_LENGTH);
+ user_uid[7] = users[u];
+ }
- add_token_u8(&err, dev, OPAL_STARTNAME);
- add_token_bytestring(&err, dev,
- opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
- OPAL_UID_LENGTH/2);
- add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
- add_token_u8(&err, dev, OPAL_ENDNAME);
-
-
- add_token_u8(&err, dev, OPAL_STARTNAME);
- add_token_bytestring(&err, dev,
- opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF],
- OPAL_UID_LENGTH/2);
- add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH);
- add_token_u8(&err, dev, OPAL_ENDNAME);
-
-
- add_token_u8(&err, dev, OPAL_STARTNAME);
- add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE],
- OPAL_UID_LENGTH/2);
- add_token_u8(&err, dev, 1);
- add_token_u8(&err, dev, OPAL_ENDNAME);
+ add_authority_object_ref(&err, dev, user_uid, sizeof(user_uid));
+ /*
+ * Add the boolean operator in postfix form only when
+ * two or more authorities are being added to the ACE
+ * expression.
+ */
+ if (u > 0)
+ add_boolean_object_ref(&err, dev, OPAL_BOOLEAN_OR);
+ }
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
add_token_u8(&err, dev, OPAL_ENDLIST);
add_token_u8(&err, dev, OPAL_ENDNAME);
+ return err;
+}
+
+static int add_user_to_lr(struct opal_dev *dev, void *data)
+{
+ int err;
+ struct opal_lock_unlock *lkul = data;
+ const u8 users[] = {
+ lkul->session.who
+ };
+
+ err = set_lr_boolean_ace(dev,
+ lkul->l_state == OPAL_RW ?
+ OPAL_LOCKINGRANGE_ACE_WRLOCKED :
+ OPAL_LOCKINGRANGE_ACE_RDLOCKED,
+ lkul->session.opal_key.lr, users,
+ ARRAY_SIZE(users));
if (err) {
pr_debug("Error building add user to locking range command.\n");
return err;
@@ -1785,6 +2131,27 @@ static int add_user_to_lr(struct opal_dev *dev, void *data)
return finalize_and_send(dev, parse_and_check_status);
}
+static int add_user_to_lr_ace(struct opal_dev *dev, void *data)
+{
+ int err;
+ struct opal_lock_unlock *lkul = data;
+ const u8 users[] = {
+ OPAL_ADMIN1,
+ lkul->session.who
+ };
+
+ err = set_lr_boolean_ace(dev, OPAL_LOCKINGRANGE_ACE_START_TO_KEY,
+ lkul->session.opal_key.lr, users,
+ ARRAY_SIZE(users));
+
+ if (err) {
+ pr_debug("Error building add user to locking ranges ACEs.\n");
+ return err;
+ }
+
+ return finalize_and_send(dev, parse_and_check_status);
+}
+
static int lock_unlock_locking_range(struct opal_dev *dev, void *data)
{
u8 lr_buffer[OPAL_UID_LENGTH];
@@ -2109,7 +2476,8 @@ static int check_opal_support(struct opal_dev *dev)
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = opal_discovery0_step(dev);
- dev->supported = !ret;
+ if (!ret)
+ dev->flags |= OPAL_FL_SUPPORTED;
mutex_unlock(&dev->dev_lock);
return ret;
@@ -2134,6 +2502,8 @@ void free_opal_dev(struct opal_dev *dev)
return;
clean_opal_dev(dev);
+ kfree(dev->resp);
+ kfree(dev->cmd);
kfree(dev);
}
EXPORT_SYMBOL(free_opal_dev);
@@ -2146,17 +2516,40 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv)
if (!dev)
return NULL;
+ /*
+ * Presumably DMA-able buffers must be cache-aligned. Kmalloc makes
+ * sure the allocated buffer is DMA-safe in that regard.
+ */
+ dev->cmd = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL);
+ if (!dev->cmd)
+ goto err_free_dev;
+
+ dev->resp = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL);
+ if (!dev->resp)
+ goto err_free_cmd;
+
INIT_LIST_HEAD(&dev->unlk_lst);
mutex_init(&dev->dev_lock);
+ dev->flags = 0;
dev->data = data;
dev->send_recv = send_recv;
if (check_opal_support(dev) != 0) {
pr_debug("Opal is not supported on this device\n");
- kfree(dev);
- return NULL;
+ goto err_free_resp;
}
return dev;
+
+err_free_resp:
+ kfree(dev->resp);
+
+err_free_cmd:
+ kfree(dev->cmd);
+
+err_free_dev:
+ kfree(dev);
+
+ return NULL;
}
EXPORT_SYMBOL(init_opal_dev);
@@ -2171,6 +2564,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
};
int ret;
+ ret = opal_get_key(dev, &opal_session->opal_key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
@@ -2179,6 +2575,42 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev,
return ret;
}
+static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv)
+{
+ const struct opal_step discovery0_step = {
+ opal_discovery0, discv
+ };
+ int ret = 0;
+
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_step(dev, &discovery0_step, 0);
+ mutex_unlock(&dev->dev_lock);
+ if (ret)
+ return ret;
+ return discv->size; /* modified to actual length of data */
+}
+
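A sketch of how userspace might drive the new IOC_OPAL_DISCOVERY path: the caller passes a buffer pointer and size in struct opal_discovery, the kernel copies up to that many bytes of the raw level-0 discovery response, and the full length is reported back as the ioctl return value. The device node and buffer size are illustrative.

#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

int main(void)
{
	unsigned char buf[2048];		/* room for the D0 header plus feature descriptors */
	struct opal_discovery d = {
		.data = (uintptr_t)buf,		/* user buffer the kernel copies into */
		.size = sizeof(buf),
	};
	int fd = open("/dev/nvme0n1", O_RDONLY);	/* illustrative device node */
	int len;

	if (fd < 0)
		return 1;
	len = ioctl(fd, IOC_OPAL_DISCOVERY, &d);
	if (len < 0)
		perror("IOC_OPAL_DISCOVERY");
	else
		printf("discovery response: %d bytes\n", len);
	return 0;
}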
+static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev)
+{
+ /* controller will terminate session */
+ const struct opal_step steps[] = {
+ { start_admin1LSP_opal_session, &rev->key },
+ { revert_lsp, rev }
+ };
+ int ret;
+
+ ret = opal_get_key(dev, &rev->key);
+ if (ret)
+ return ret;
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
+ mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
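A sketch of invoking the new REVERT SP path from userspace. Setting OPAL_PRESERVE asks the device to keep the global-range media encryption key (KeepGlobalRangeKey = TRUE above), so user data is not destroyed. The helper name and admin passphrase are illustrative.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

/* Revert the Locking SP while preserving the global range key. */
static int revert_lsp_preserve(int fd, const char *admin_pw)
{
	struct opal_revert_lsp rev;

	memset(&rev, 0, sizeof(rev));
	rev.key.key_type = OPAL_INCLUDED;
	rev.key.key_len = (unsigned char)strlen(admin_pw);
	memcpy(rev.key.key, admin_pw, rev.key.key_len);
	rev.options = OPAL_PRESERVE;	/* keep the global range key */

	return ioctl(fd, IOC_OPAL_REVERT_LSP, &rev);
}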
static int opal_erase_locking_range(struct opal_dev *dev,
struct opal_session_info *opal_session)
{
@@ -2189,6 +2621,9 @@ static int opal_erase_locking_range(struct opal_dev *dev,
};
int ret;
+ ret = opal_get_key(dev, &opal_session->opal_key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps));
@@ -2217,6 +2652,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev,
opal_mbr->enable_disable != OPAL_MBR_DISABLE)
return -EINVAL;
+ ret = opal_get_key(dev, &opal_mbr->key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
@@ -2242,6 +2680,9 @@ static int opal_set_mbr_done(struct opal_dev *dev,
mbr_done->done_flag != OPAL_MBR_NOT_DONE)
return -EINVAL;
+ ret = opal_get_key(dev, &mbr_done->key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
@@ -2263,6 +2704,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev,
if (info->size == 0)
return 0;
+ ret = opal_get_key(dev, &info->key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps));
@@ -2296,6 +2740,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
const struct opal_step steps[] = {
{ start_admin1LSP_opal_session, &lk_unlk->session.opal_key },
{ add_user_to_lr, lk_unlk },
+ { add_user_to_lr_ace, lk_unlk },
{ end_opal_session, }
};
int ret;
@@ -2319,6 +2764,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev,
return -EINVAL;
}
+ ret = opal_get_key(dev, &lk_unlk->session.opal_key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, steps, ARRAY_SIZE(steps));
@@ -2341,6 +2789,10 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psi
int ret;
+ ret = opal_get_key(dev, opal);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
if (psid)
@@ -2395,6 +2847,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key)
return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step));
}
+static void opal_lock_check_for_saved_key(struct opal_dev *dev,
+ struct opal_lock_unlock *lk_unlk)
+{
+ struct opal_suspend_data *iter;
+
+ if (lk_unlk->l_state != OPAL_LK ||
+ lk_unlk->session.opal_key.key_len > 0)
+ return;
+
+ /*
+ * Usually when closing a crypto device (e.g. dm-crypt with LUKS) the
+ * volume key is not required, as it requires root privileges anyway,
+ * and root can deny access to a disk in many ways regardless.
+ * Requiring the volume key to lock the device is a peculiarity of the
+ * OPAL specification. Given we might already have saved the key if
+ * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use
+ * that key to lock the device if no key was provided here, the
+ * locking range matches and the appropriate flag was passed with
+ * 'IOC_OPAL_SAVE'.
+ * This allows integrating OPAL with tools and libraries that are
+ * accustomed to the common behaviour and do not ask for the volume key
+ * when closing a device.
+ */
+ setup_opal_dev(dev);
+ list_for_each_entry(iter, &dev->unlk_lst, node) {
+ if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) &&
+ iter->lr == lk_unlk->session.opal_key.lr &&
+ iter->unlk.session.opal_key.key_len > 0) {
+ lk_unlk->session.opal_key.key_len =
+ iter->unlk.session.opal_key.key_len;
+ memcpy(lk_unlk->session.opal_key.key,
+ iter->unlk.session.opal_key.key,
+ iter->unlk.session.opal_key.key_len);
+ break;
+ }
+ }
+}
+
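A sketch of the flow this helper enables: save the unlock key once with IOC_OPAL_SAVE and the OPAL_SAVE_FOR_LOCK flag, then later lock the range without supplying the key again; the kernel reuses the saved copy for the matching locking range. Helper names and the passphrase are illustrative; an already-open device fd is assumed.

#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

static int save_key_for_lock(int fd, const char *pw, unsigned char lr)
{
	struct opal_lock_unlock lku;

	memset(&lku, 0, sizeof(lku));
	lku.session.who = OPAL_ADMIN1;
	lku.session.opal_key.lr = lr;
	lku.session.opal_key.key_type = OPAL_INCLUDED;
	lku.session.opal_key.key_len = (unsigned char)strlen(pw);
	memcpy(lku.session.opal_key.key, pw, lku.session.opal_key.key_len);
	lku.l_state = OPAL_RW;
	lku.flags = OPAL_SAVE_FOR_LOCK;		/* allow key reuse when locking */

	return ioctl(fd, IOC_OPAL_SAVE, &lku);
}

static int lock_without_key(int fd, unsigned char lr)
{
	struct opal_lock_unlock lku;

	memset(&lku, 0, sizeof(lku));
	lku.session.who = OPAL_ADMIN1;
	lku.session.opal_key.lr = lr;		/* key_len == 0: kernel falls back to the saved key */
	lku.l_state = OPAL_LK;

	return ioctl(fd, IOC_OPAL_LOCK_UNLOCK, &lku);
}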
static int opal_lock_unlock(struct opal_dev *dev,
struct opal_lock_unlock *lk_unlk)
{
@@ -2404,7 +2894,10 @@ static int opal_lock_unlock(struct opal_dev *dev,
return -EINVAL;
mutex_lock(&dev->dev_lock);
- ret = __opal_lock_unlock(dev, lk_unlk);
+ opal_lock_check_for_saved_key(dev, lk_unlk);
+ ret = opal_get_key(dev, &lk_unlk->session.opal_key);
+ if (!ret)
+ ret = __opal_lock_unlock(dev, lk_unlk);
mutex_unlock(&dev->dev_lock);
return ret;
@@ -2425,6 +2918,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal)
if (!dev)
return -ENODEV;
+ ret = opal_get_key(dev, opal);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps));
@@ -2447,6 +2943,9 @@ static int opal_activate_lsp(struct opal_dev *dev,
if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS)
return -EINVAL;
+ ret = opal_get_key(dev, &opal_lr_act->key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps));
@@ -2465,11 +2964,41 @@ static int opal_setup_locking_range(struct opal_dev *dev,
};
int ret;
+ ret = opal_get_key(dev, &opal_lrs->session.opal_key);
+ if (ret)
+ return ret;
+ mutex_lock(&dev->dev_lock);
+ setup_opal_dev(dev);
+ ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
+ mutex_unlock(&dev->dev_lock);
+
+ return ret;
+}
+
+static int opal_locking_range_status(struct opal_dev *dev,
+ struct opal_lr_status *opal_lrst,
+ void __user *data)
+{
+ const struct opal_step lr_steps[] = {
+ { start_auth_opal_session, &opal_lrst->session },
+ { locking_range_status, opal_lrst },
+ { end_opal_session, }
+ };
+ int ret;
+
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps));
mutex_unlock(&dev->dev_lock);
+ /* skip the session info when copying back to userspace */
+ if (!ret && copy_to_user(data + offsetof(struct opal_lr_status, range_start),
+ (void *)opal_lrst + offsetof(struct opal_lr_status, range_start),
+ sizeof(*opal_lrst) - offsetof(struct opal_lr_status, range_start))) {
+ pr_debug("Error copying status to userspace\n");
+ return -EFAULT;
+ }
+
return ret;
}
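A sketch of querying a locking range with the new IOC_OPAL_GET_LR_STATUS ioctl: the caller fills the session half of struct opal_lr_status for authentication, and the kernel copies back only the status fields from range_start onwards, as the copy_to_user() above shows. The helper name and passphrase are illustrative.

#include <inttypes.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

static int print_lr_status(int fd, const char *admin_pw, unsigned char lr)
{
	struct opal_lr_status st;

	memset(&st, 0, sizeof(st));
	st.session.who = OPAL_ADMIN1;
	st.session.opal_key.lr = lr;
	st.session.opal_key.key_type = OPAL_INCLUDED;
	st.session.opal_key.key_len = (unsigned char)strlen(admin_pw);
	memcpy(st.session.opal_key.key, admin_pw, st.session.opal_key.key_len);

	if (ioctl(fd, IOC_OPAL_GET_LR_STATUS, &st))
		return -1;

	printf("LR%u: start=%" PRIu64 " len=%" PRIu64 " RLE=%u WLE=%u state=%u\n",
	       lr, (uint64_t)st.range_start, (uint64_t)st.range_length,
	       st.RLE, st.WLE, st.l_state);
	return 0;
}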
@@ -2491,6 +3020,20 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw)
ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps));
mutex_unlock(&dev->dev_lock);
+ if (ret)
+ return ret;
+
+ /* update keyring and key store with new password */
+ ret = sed_write_key(OPAL_AUTH_KEY,
+ opal_pw->new_user_pw.opal_key.key,
+ opal_pw->new_user_pw.opal_key.key_len);
+ if (ret && ret != -EOPNOTSUPP)
+ pr_warn("error updating SED key: %d\n", ret);
+
+ ret = update_sed_opal_key(OPAL_AUTH_KEY,
+ opal_pw->new_user_pw.opal_key.key,
+ opal_pw->new_user_pw.opal_key.key_len);
+
return ret;
}
@@ -2511,6 +3054,9 @@ static int opal_activate_user(struct opal_dev *dev,
return -EINVAL;
}
+ ret = opal_get_key(dev, &opal_session->opal_key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps));
@@ -2528,7 +3074,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
if (!dev)
return false;
- if (!dev->supported)
+ if (!(dev->flags & OPAL_FL_SUPPORTED))
return false;
mutex_lock(&dev->dev_lock);
@@ -2546,7 +3092,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev)
was_failure = true;
}
- if (dev->mbr_enabled) {
+ if (dev->flags & OPAL_FL_MBR_ENABLED) {
ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key);
if (ret)
pr_debug("Failed to set MBR Done in S3 resume\n");
@@ -2597,6 +3143,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev,
{
int ret, bit_set;
+ ret = opal_get_key(dev, &rw_tbl->key);
+ if (ret)
+ return ret;
mutex_lock(&dev->dev_lock);
setup_opal_dev(dev);
@@ -2620,6 +3169,43 @@ static int opal_generic_read_write_table(struct opal_dev *dev,
return ret;
}
+static int opal_get_status(struct opal_dev *dev, void __user *data)
+{
+ struct opal_status sts = {0};
+
+ /*
+ * A check_opal_support() error is not fatal: a device that does not
+ * report OPAL support (OPAL_FL_SUPPORTED unset) is a valid condition.
+ */
+ if (!check_opal_support(dev))
+ sts.flags = dev->flags;
+ if (copy_to_user(data, &sts, sizeof(sts))) {
+ pr_debug("Error copying status to userspace\n");
+ return -EFAULT;
+ }
+ return 0;
+}
+
+static int opal_get_geometry(struct opal_dev *dev, void __user *data)
+{
+ struct opal_geometry geo = {0};
+
+ if (check_opal_support(dev))
+ return -EINVAL;
+
+ geo.align = dev->align_required;
+ geo.logical_block_size = dev->logical_block_size;
+ geo.alignment_granularity = dev->align;
+ geo.lowest_aligned_lba = dev->lowest_lba;
+
+ if (copy_to_user(data, &geo, sizeof(geo))) {
+ pr_debug("Error copying geometry data to userspace\n");
+ return -EFAULT;
+ }
+
+ return 0;
+}
+
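A sketch of the two query ioctls added above: IOC_OPAL_GET_STATUS exposes the discovery-derived OPAL_FL_* bits that replaced the old supported/mbr_enabled booleans, and IOC_OPAL_GET_GEOMETRY returns the geometry feature data now cached in struct opal_dev. The device node is illustrative.

#include <fcntl.h>
#include <inttypes.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/sed-opal.h>

int main(void)
{
	struct opal_status sts = { 0 };
	struct opal_geometry geo = { 0 };
	int fd = open("/dev/nvme0n1", O_RDONLY);	/* illustrative device node */

	if (fd < 0)
		return 1;

	if (ioctl(fd, IOC_OPAL_GET_STATUS, &sts) == 0)
		printf("flags 0x%x: %ssupported, %slocked, MBR %s\n",
		       sts.flags,
		       (sts.flags & OPAL_FL_SUPPORTED) ? "" : "not ",
		       (sts.flags & OPAL_FL_LOCKED) ? "" : "not ",
		       (sts.flags & OPAL_FL_MBR_ENABLED) ? "enabled" : "disabled");

	if (ioctl(fd, IOC_OPAL_GET_GEOMETRY, &geo) == 0)
		printf("lbs=%u align=%u granularity=%" PRIu64 " lowest_lba=%" PRIu64 "\n",
		       geo.logical_block_size, geo.align,
		       (uint64_t)geo.alignment_granularity,
		       (uint64_t)geo.lowest_aligned_lba);
	return 0;
}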
int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
{
void *p;
@@ -2628,13 +3214,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
if (!capable(CAP_SYS_ADMIN))
return -EACCES;
if (!dev)
- return -ENOTSUPP;
- if (!dev->supported)
- return -ENOTSUPP;
+ return -EOPNOTSUPP;
+ if (!(dev->flags & OPAL_FL_SUPPORTED))
+ return -EOPNOTSUPP;
- p = memdup_user(arg, _IOC_SIZE(cmd));
- if (IS_ERR(p))
- return PTR_ERR(p);
+ if (cmd & IOC_IN) {
+ p = memdup_user(arg, _IOC_SIZE(cmd));
+ if (IS_ERR(p))
+ return PTR_ERR(p);
+ }
switch (cmd) {
case IOC_OPAL_SAVE:
@@ -2685,11 +3273,54 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg)
case IOC_OPAL_GENERIC_TABLE_RW:
ret = opal_generic_read_write_table(dev, p);
break;
+ case IOC_OPAL_GET_STATUS:
+ ret = opal_get_status(dev, arg);
+ break;
+ case IOC_OPAL_GET_LR_STATUS:
+ ret = opal_locking_range_status(dev, p, arg);
+ break;
+ case IOC_OPAL_GET_GEOMETRY:
+ ret = opal_get_geometry(dev, arg);
+ break;
+ case IOC_OPAL_REVERT_LSP:
+ ret = opal_revertlsp(dev, p);
+ break;
+ case IOC_OPAL_DISCOVERY:
+ ret = opal_get_discv(dev, p);
+ break;
+
default:
break;
}
- kfree(p);
+ if (cmd & IOC_IN)
+ kfree(p);
return ret;
}
EXPORT_SYMBOL_GPL(sed_ioctl);
+
+static int __init sed_opal_init(void)
+{
+ struct key *kr;
+ char init_sed_key[OPAL_KEY_MAX];
+ int keylen = OPAL_KEY_MAX - 1;
+
+ kr = keyring_alloc(".sed_opal",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+ (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW |
+ KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE,
+ KEY_ALLOC_NOT_IN_QUOTA,
+ NULL, NULL);
+ if (IS_ERR(kr))
+ return PTR_ERR(kr);
+
+ sed_opal_keyring = kr;
+
+ if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) {
+ memset(init_sed_key, '\0', sizeof(init_sed_key));
+ keylen = OPAL_KEY_MAX - 1;
+ }
+
+ return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen);
+}
+late_initcall(sed_opal_init);
diff --git a/block/t10-pi.c b/block/t10-pi.c
index d910534b3a41..914d8cddd43a 100644
--- a/block/t10-pi.c
+++ b/block/t10-pi.c
@@ -5,10 +5,12 @@
*/
#include <linux/t10-pi.h>
-#include <linux/blkdev.h>
+#include <linux/blk-integrity.h>
#include <linux/crc-t10dif.h>
+#include <linux/crc64.h>
#include <linux/module.h>
#include <net/checksum.h>
+#include <asm/unaligned.h>
typedef __be16 (csum_fn) (void *, unsigned int);
@@ -44,7 +46,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter,
pi->ref_tag = 0;
iter->data_buf += iter->interval;
- iter->prot_buf += sizeof(struct t10_pi_tuple);
+ iter->prot_buf += iter->tuple_size;
iter->seed++;
}
@@ -93,7 +95,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter,
next:
iter->data_buf += iter->interval;
- iter->prot_buf += sizeof(struct t10_pi_tuple);
+ iter->prot_buf += iter->tuple_size;
iter->seed++;
}
@@ -147,11 +149,10 @@ static void t10_pi_type1_prepare(struct request *rq)
break;
bip_for_each_vec(iv, bip, iter) {
- void *p, *pmap;
unsigned int j;
+ void *p;
- pmap = kmap_atomic(iv.bv_page);
- p = pmap + iv.bv_offset;
+ p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
@@ -161,8 +162,7 @@ static void t10_pi_type1_prepare(struct request *rq)
ref_tag++;
p += tuple_sz;
}
-
- kunmap_atomic(pmap);
+ kunmap_local(p);
}
bip->bip_flags |= BIP_MAPPED_INTEGRITY;
@@ -195,11 +195,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
struct bvec_iter iter;
bip_for_each_vec(iv, bip, iter) {
- void *p, *pmap;
unsigned int j;
+ void *p;
- pmap = kmap_atomic(iv.bv_page);
- p = pmap + iv.bv_offset;
+ p = bvec_kmap_local(&iv);
for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
struct t10_pi_tuple *pi = p;
@@ -210,8 +209,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
intervals--;
p += tuple_sz;
}
-
- kunmap_atomic(pmap);
+ kunmap_local(p);
}
}
}
@@ -282,4 +280,196 @@ const struct blk_integrity_profile t10_pi_type3_ip = {
};
EXPORT_SYMBOL(t10_pi_type3_ip);
+static __be64 ext_pi_crc64(void *data, unsigned int len)
+{
+ return cpu_to_be64(crc64_rocksoft(data, len));
+}
+
+static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter,
+ enum t10_dif_type type)
+{
+ unsigned int i;
+
+ for (i = 0 ; i < iter->data_size ; i += iter->interval) {
+ struct crc64_pi_tuple *pi = iter->prot_buf;
+
+ pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval);
+ pi->app_tag = 0;
+
+ if (type == T10_PI_TYPE1_PROTECTION)
+ put_unaligned_be48(iter->seed, pi->ref_tag);
+ else
+ put_unaligned_be48(0ULL, pi->ref_tag);
+
+ iter->data_buf += iter->interval;
+ iter->prot_buf += iter->tuple_size;
+ iter->seed++;
+ }
+
+ return BLK_STS_OK;
+}
+
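The 64-bit guard CRC leaves only 48 bits for the reference tag, so the tuple stores it as a 6-byte big-endian field via put_unaligned_be48()/get_unaligned_be48(). A small standalone sketch of that packing, mirroring the kernel helpers for illustration only:

#include <stdint.h>
#include <stdio.h>

/* Pack the low 48 bits of val into 6 bytes, most significant byte first. */
static void put_be48(uint64_t val, uint8_t p[6])
{
	for (int i = 0; i < 6; i++)
		p[i] = val >> (8 * (5 - i));
}

/* Reassemble a 48-bit big-endian value from 6 bytes. */
static uint64_t get_be48(const uint8_t p[6])
{
	uint64_t val = 0;

	for (int i = 0; i < 6; i++)
		val = (val << 8) | p[i];
	return val;
}

int main(void)
{
	uint8_t ref[6];

	put_be48(0x123456789abcULL, ref);	/* lower 48 bits of a ref tag */
	printf("round trip: %llx\n", (unsigned long long)get_be48(ref));
	return 0;
}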
+static bool ext_pi_ref_escape(u8 *ref_tag)
+{
+ static u8 ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff };
+
+ return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0;
+}
+
+static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter,
+ enum t10_dif_type type)
+{
+ unsigned int i;
+
+ for (i = 0; i < iter->data_size; i += iter->interval) {
+ struct crc64_pi_tuple *pi = iter->prot_buf;
+ u64 ref, seed;
+ __be64 csum;
+
+ if (type == T10_PI_TYPE1_PROTECTION) {
+ if (pi->app_tag == T10_PI_APP_ESCAPE)
+ goto next;
+
+ ref = get_unaligned_be48(pi->ref_tag);
+ seed = lower_48_bits(iter->seed);
+ if (ref != seed) {
+ pr_err("%s: ref tag error at location %llu (rcvd %llu)\n",
+ iter->disk_name, seed, ref);
+ return BLK_STS_PROTECTION;
+ }
+ } else if (type == T10_PI_TYPE3_PROTECTION) {
+ if (pi->app_tag == T10_PI_APP_ESCAPE &&
+ ext_pi_ref_escape(pi->ref_tag))
+ goto next;
+ }
+
+ csum = ext_pi_crc64(iter->data_buf, iter->interval);
+ if (pi->guard_tag != csum) {
+ pr_err("%s: guard tag error at sector %llu " \
+ "(rcvd %016llx, want %016llx)\n",
+ iter->disk_name, (unsigned long long)iter->seed,
+ be64_to_cpu(pi->guard_tag), be64_to_cpu(csum));
+ return BLK_STS_PROTECTION;
+ }
+
+next:
+ iter->data_buf += iter->interval;
+ iter->prot_buf += iter->tuple_size;
+ iter->seed++;
+ }
+
+ return BLK_STS_OK;
+}
+
+static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION);
+}
+
+static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION);
+}
+
+static void ext_pi_type1_prepare(struct request *rq)
+{
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u64 ref_tag = ext_pi_ref_tag(rq);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u64 virt = lower_48_bits(bip_get_seed(bip));
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ /* Already remapped? */
+ if (bip->bip_flags & BIP_MAPPED_INTEGRITY)
+ break;
+
+ bip_for_each_vec(iv, bip, iter) {
+ unsigned int j;
+ void *p;
+
+ p = bvec_kmap_local(&iv);
+ for (j = 0; j < iv.bv_len; j += tuple_sz) {
+ struct crc64_pi_tuple *pi = p;
+ u64 ref = get_unaligned_be48(pi->ref_tag);
+
+ if (ref == virt)
+ put_unaligned_be48(ref_tag, pi->ref_tag);
+ virt++;
+ ref_tag++;
+ p += tuple_sz;
+ }
+ kunmap_local(p);
+ }
+
+ bip->bip_flags |= BIP_MAPPED_INTEGRITY;
+ }
+}
+
+static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes)
+{
+ unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp;
+ const int tuple_sz = rq->q->integrity.tuple_size;
+ u64 ref_tag = ext_pi_ref_tag(rq);
+ struct bio *bio;
+
+ __rq_for_each_bio(bio, rq) {
+ struct bio_integrity_payload *bip = bio_integrity(bio);
+ u64 virt = lower_48_bits(bip_get_seed(bip));
+ struct bio_vec iv;
+ struct bvec_iter iter;
+
+ bip_for_each_vec(iv, bip, iter) {
+ unsigned int j;
+ void *p;
+
+ p = bvec_kmap_local(&iv);
+ for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) {
+ struct crc64_pi_tuple *pi = p;
+ u64 ref = get_unaligned_be48(pi->ref_tag);
+
+ if (ref == ref_tag)
+ put_unaligned_be48(virt, pi->ref_tag);
+ virt++;
+ ref_tag++;
+ intervals--;
+ p += tuple_sz;
+ }
+ kunmap_local(p);
+ }
+ }
+}
+
+static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION);
+}
+
+static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter)
+{
+ return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION);
+}
+
+const struct blk_integrity_profile ext_pi_type1_crc64 = {
+ .name = "EXT-DIF-TYPE1-CRC64",
+ .generate_fn = ext_pi_type1_generate_crc64,
+ .verify_fn = ext_pi_type1_verify_crc64,
+ .prepare_fn = ext_pi_type1_prepare,
+ .complete_fn = ext_pi_type1_complete,
+};
+EXPORT_SYMBOL_GPL(ext_pi_type1_crc64);
+
+const struct blk_integrity_profile ext_pi_type3_crc64 = {
+ .name = "EXT-DIF-TYPE3-CRC64",
+ .generate_fn = ext_pi_type3_generate_crc64,
+ .verify_fn = ext_pi_type3_verify_crc64,
+ .prepare_fn = t10_pi_type3_prepare,
+ .complete_fn = t10_pi_type3_complete,
+};
+EXPORT_SYMBOL_GPL(ext_pi_type3_crc64);
+
+MODULE_LICENSE("GPL");