diff options
Diffstat (limited to 'block')
101 files changed, 23170 insertions, 15019 deletions
diff --git a/block/Kconfig b/block/Kconfig index 9357d7302398..1de4682d48cc 100644 --- a/block/Kconfig +++ b/block/Kconfig @@ -5,8 +5,8 @@ menuconfig BLOCK bool "Enable the block layer" if EXPERT default y + select FS_IOMAP select SBITMAP - select SRCU help Provide block layer support for the kernel. @@ -26,38 +26,34 @@ menuconfig BLOCK if BLOCK -config BLK_RQ_ALLOC_TIME - bool +config BLOCK_LEGACY_AUTOLOAD + bool "Legacy autoloading support" + default y + help + Enable loading modules and creating block device instances based on + accesses through their device special file. This is a historic Linux + feature and makes no sense in a udev world where device files are + created on demand, but scripts that manually create device nodes and + then call losetup might rely on this behavior. -config BLK_SCSI_REQUEST +config BLK_RQ_ALLOC_TIME bool config BLK_CGROUP_RWSTAT bool -config BLK_DEV_BSG - bool "Block layer SG support v4" - default y - select BLK_SCSI_REQUEST - help - Saying Y here will enable generic SG (SCSI generic) v4 support - for any block device. - - Unlike SG v3 (aka block/scsi_ioctl.c drivers/scsi/sg.c), SG v4 - can handle complicated SCSI commands: tagged variable length cdbs - with bidirectional data transfers and generic request/response - protocols (e.g. Task Management Functions and SMP in Serial - Attached SCSI). +config BLK_CGROUP_PUNT_BIO + bool - This option is required by recent UDEV versions to properly - access device serial numbers, etc. +config BLK_DEV_BSG_COMMON + tristate - If unsure, say Y. +config BLK_ICQ + bool config BLK_DEV_BSGLIB bool "Block layer SG support v4 helper lib" - select BLK_DEV_BSG - select BLK_SCSI_REQUEST + select BLK_DEV_BSG_COMMON help Subsystems will normally enable this if needed. Users will not normally need to manually enable this. 
@@ -80,19 +76,41 @@ config BLK_DEV_INTEGRITY_T10 tristate depends on BLK_DEV_INTEGRITY select CRC_T10DIF + select CRC64_ROCKSOFT + +config BLK_DEV_WRITE_MOUNTED + bool "Allow writing to mounted block devices" + default y + help + When a block device is mounted, writing to its buffer cache is very + likely going to cause filesystem corruption. It is also rather easy to + crash the kernel in this way since the filesystem has no practical way + of detecting these writes to buffer cache and verifying its metadata + integrity. However there are some setups that need this capability + like running fsck on read-only mounted root device, modifying some + features on mounted ext4 filesystem, and similar. If you say N, the + kernel will prevent processes from writing to block devices that are + mounted by filesystems which provides some more protection from runaway + privileged processes and generally makes it much harder to crash + filesystem drivers. Note however that this does not prevent + underlying device(s) from being modified by other means, e.g. by + directly submitting SCSI commands or through access to lower layers of + storage stack. If in doubt, say Y. The configuration can be overridden + with the bdev_allow_write_mounted boot option. config BLK_DEV_ZONED bool "Zoned block device support" select MQ_IOSCHED_DEADLINE help Block layer zoned block device support. This option enables - support for ZAC/ZBC host-managed and host-aware zoned block devices. + support for ZAC/ZBC/ZNS host-managed and host-aware zoned block + devices. - Say yes here if you have a ZAC or ZBC storage device. + Say yes here if you have a ZAC, ZBC, or ZNS storage device. config BLK_DEV_THROTTLING bool "Block layer bio throttling support" - depends on BLK_CGROUP=y + depends on BLK_CGROUP select BLK_CGROUP_RWSTAT help Block layer bio throttling support. 
It can be used to limit @@ -113,16 +131,6 @@ config BLK_DEV_THROTTLING_LOW Note, this is an experimental interface and could be changed someday. -config BLK_CMDLINE_PARSER - bool "Block device command line partition parser" - help - Enabling this option allows you to specify the partition layout from - the kernel boot args. This is typically of use for embedded devices - which don't otherwise have any standardized method for listing the - partitions on a block device. - - See Documentation/block/cmdline-partition.rst for more information. - config BLK_WBT bool "Enable support for block device writeback throttling" help @@ -132,9 +140,16 @@ config BLK_WBT dynamically on an algorithm loosely based on CoDel, factoring in the realtime performance of the disk. +config BLK_WBT_MQ + bool "Enable writeback throttling by default" + default y + depends on BLK_WBT + help + Enable writeback throttling by default for request-based block devices. + config BLK_CGROUP_IOLATENCY bool "Enable support for latency based cgroup IO protection" - depends on BLK_CGROUP=y + depends on BLK_CGROUP help Enabling this option enables the .latency interface for IO throttling. The IO controller will attempt to maintain average IO latencies below @@ -143,10 +158,18 @@ config BLK_CGROUP_IOLATENCY Note, this is an experimental interface and could be changed someday. +config BLK_CGROUP_FC_APPID + bool "Enable support to track FC I/O Traffic across cgroup applications" + depends on BLK_CGROUP && NVME_FC + help + Enabling this option enables the support to track FC I/O traffic across + cgroup applications. It enables the Fabric and the storage targets to + identify, monitor, and handle FC traffic based on VM tags by inserting + application specific identification into the FC frame. 
+ config BLK_CGROUP_IOCOST bool "Enable support for cost model based cgroup IO controller" - depends on BLK_CGROUP=y - select BLK_RQ_IO_DATA_LEN + depends on BLK_CGROUP select BLK_RQ_ALLOC_TIME help Enabling this option enables the .weight interface for cost @@ -154,14 +177,14 @@ config BLK_CGROUP_IOCOST distributes IO capacity between different groups based on their share of the overall weight distribution. -config BLK_WBT_MQ - bool "Multiqueue writeback throttling" - default y - depends on BLK_WBT +config BLK_CGROUP_IOPRIO + bool "Cgroup I/O controller for assigning an I/O priority class" + depends on BLK_CGROUP help - Enable writeback throttling by default on multiqueue devices. - Multiqueue currently doesn't have support for IO scheduling, - enabling this option is recommended. + Enable the .prio interface for assigning an I/O priority class to + requests. The I/O priority class affects the order in which an I/O + scheduler and block devices process requests. Only some I/O schedulers + and some block devices support I/O priorities. config BLK_DEBUG_FS bool "Block layer debugging information in debugfs" @@ -181,6 +204,9 @@ config BLK_DEBUG_FS_ZONED config BLK_SED_OPAL bool "Logic for interfacing with Opal enabled SEDs" + depends on KEYS + select PSERIES_PLPKS if PPC_PSERIES + select PSERIES_PLPKS_SED if PPC_PSERIES help Builds Logic for interfacing with Opal enabled controllers. Enabling this option enables users to setup/unlock/lock @@ -203,35 +229,26 @@ config BLK_INLINE_ENCRYPTION_FALLBACK by falling back to the kernel crypto API when inline encryption hardware is not present. 
-menu "Partition Types" - source "block/partitions/Kconfig" -endmenu - -endif # BLOCK - -config BLOCK_COMPAT - bool - depends on BLOCK && COMPAT - default y - config BLK_MQ_PCI - bool - depends on BLOCK && PCI - default y + def_bool PCI config BLK_MQ_VIRTIO bool - depends on BLOCK && VIRTIO + depends on VIRTIO default y -config BLK_MQ_RDMA +config BLK_PM + def_bool PM + +# do not use in new code +config BLOCK_HOLDER_DEPRECATED bool - depends on BLOCK && INFINIBAND - default y -config BLK_PM - def_bool BLOCK && PM +config BLK_MQ_STACKING + bool source "block/Kconfig.iosched" + +endif # BLOCK diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched index 2f2158e05a91..27f11320b8d1 100644 --- a/block/Kconfig.iosched +++ b/block/Kconfig.iosched @@ -1,6 +1,4 @@ # SPDX-License-Identifier: GPL-2.0 -if BLOCK - menu "IO Schedulers" config MQ_IOSCHED_DEADLINE @@ -20,6 +18,7 @@ config MQ_IOSCHED_KYBER config IOSCHED_BFQ tristate "BFQ I/O scheduler" + select BLK_ICQ help BFQ I/O scheduler for BLK-MQ. BFQ distributes the bandwidth of the device among all processes according to their weights, @@ -31,6 +30,7 @@ config IOSCHED_BFQ config BFQ_GROUP_IOSCHED bool "BFQ hierarchical scheduling support" depends on IOSCHED_BFQ && BLK_CGROUP + default y select BLK_CGROUP_RWSTAT help @@ -45,5 +45,3 @@ config BFQ_CGROUP_DEBUG files in a cgroup which can be useful for debugging.
endmenu - -endif diff --git a/block/Makefile b/block/Makefile index 78719169fb2a..46ada9dc8bbf 100644 --- a/block/Makefile +++ b/block/Makefile @@ -3,20 +3,22 @@ # Makefile for the kernel block layer # -obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-sysfs.o \ +obj-y := bdev.o fops.o bio.o elevator.o blk-core.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ - blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ + blk-merge.o blk-timeout.o \ blk-lib.o blk-mq.o blk-mq-tag.o blk-stat.o \ blk-mq-sysfs.o blk-mq-cpumap.o blk-mq-sched.o ioctl.o \ - genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o + genhd.o ioprio.o badblocks.o partitions/ blk-rq-qos.o \ + disk-events.o blk-ia-ranges.o early-lookup.o obj-$(CONFIG_BOUNCE) += bounce.o -obj-$(CONFIG_BLK_SCSI_REQUEST) += scsi_ioctl.o -obj-$(CONFIG_BLK_DEV_BSG) += bsg.o +obj-$(CONFIG_BLK_DEV_BSG_COMMON) += bsg.o obj-$(CONFIG_BLK_DEV_BSGLIB) += bsg-lib.o obj-$(CONFIG_BLK_CGROUP) += blk-cgroup.o obj-$(CONFIG_BLK_CGROUP_RWSTAT) += blk-cgroup-rwstat.o +obj-$(CONFIG_BLK_CGROUP_FC_APPID) += blk-cgroup-fc-appid.o obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o +obj-$(CONFIG_BLK_CGROUP_IOPRIO) += blk-ioprio.o obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o @@ -24,17 +26,17 @@ obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o obj-$(CONFIG_IOSCHED_BFQ) += bfq.o -obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o obj-$(CONFIG_BLK_DEV_INTEGRITY) += bio-integrity.o blk-integrity.o obj-$(CONFIG_BLK_DEV_INTEGRITY_T10) += t10-pi.o obj-$(CONFIG_BLK_MQ_PCI) += blk-mq-pci.o obj-$(CONFIG_BLK_MQ_VIRTIO) += blk-mq-virtio.o -obj-$(CONFIG_BLK_MQ_RDMA) += blk-mq-rdma.o obj-$(CONFIG_BLK_DEV_ZONED) += blk-zoned.o obj-$(CONFIG_BLK_WBT) += blk-wbt.o obj-$(CONFIG_BLK_DEBUG_FS) += blk-mq-debugfs.o obj-$(CONFIG_BLK_DEBUG_FS_ZONED)+= blk-mq-debugfs-zoned.o 
obj-$(CONFIG_BLK_SED_OPAL) += sed-opal.o obj-$(CONFIG_BLK_PM) += blk-pm.o -obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += keyslot-manager.o blk-crypto.o +obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o +obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o diff --git a/block/badblocks.c b/block/badblocks.c index 2e5f5697db35..db4ec8b9b2a8 100644 --- a/block/badblocks.c +++ b/block/badblocks.c @@ -16,120 +16,830 @@ #include <linux/types.h> #include <linux/slab.h> -/** - * badblocks_check() - check a given range for bad sectors - * @bb: the badblocks structure that holds all badblock information - * @s: sector (start) at which to check for badblocks - * @sectors: number of sectors to check for badblocks - * @first_bad: pointer to store location of the first badblock - * @bad_sectors: pointer to store number of badblocks after @first_bad +/* + * The purpose of badblocks set/clear is to manage bad blocks ranges which are + * identified by LBA addresses. * - * We can record which blocks on each device are 'bad' and so just - * fail those blocks, or that stripe, rather than the whole device. - * Entries in the bad-block table are 64bits wide. This comprises: - * Length of bad-range, in sectors: 0-511 for lengths 1-512 - * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) - * A 'shift' can be set so that larger blocks are tracked and - * consequently larger devices can be covered. - * 'Acknowledged' flag - 1 bit. - the most significant bit. + * When the caller of badblocks_set() wants to set a range of bad blocks, the + * setting range can be acked or unacked. And the setting range may merge, + * overwrite, skip the overlapped already set range, depends on who they are + * overlapped or adjacent, and the acknowledgment type of the ranges. 
It can be + * more complicated when the setting range covers multiple already set bad block + * ranges, with restrictions of maximum length of each bad range and the bad + * table space limitation. * - * Locking of the bad-block table uses a seqlock so badblocks_check - * might need to retry if it is very unlucky. - * We will sometimes want to check for bad blocks in a bi_end_io function, - * so we use the write_seqlock_irq variant. + * It is difficult and unnecessary to take care of all the possible situations, + * for setting a large range of bad blocks, we can handle it by dividing the + * large range into smaller ones when encounter overlap, max range length or + * bad table full conditions. Every time only a smaller piece of the bad range + * is handled with a limited number of conditions how it is interacted with + * possible overlapped or adjacent already set bad block ranges. Then the hard + * complicated problem can be much simpler to handle in proper way. * - * When looking for a bad block we specify a range and want to - * know if any block in the range is bad. So we binary-search - * to the last range that starts at-or-before the given endpoint, - * (or "before the sector after the target range") - * then see if it ends after the given start. + * When setting a range of bad blocks to the bad table, the simplified situations + * to be considered are, (The already set bad blocks ranges are naming with + * prefix E, and the setting bad blocks range is naming with prefix S) * - * Return: - * 0: there are no known bad blocks in the range - * 1: there are known bad block which are all acknowledged - * -1: there are bad blocks which have not yet been acknowledged in metadata. - * plus the start/length of the first bad section we overlap. + * 1) A setting range is not overlapped or adjacent to any other already set bad + * block range. 
+ * +--------+ + * | S | + * +--------+ + * +-------------+ +-------------+ + * | E1 | | E2 | + * +-------------+ +-------------+ + * For this situation if the bad blocks table is not full, just allocate a + * free slot from the bad blocks table to mark the setting range S. The + * result is, + * +-------------+ +--------+ +-------------+ + * | E1 | | S | | E2 | + * +-------------+ +--------+ +-------------+ + * 2) A setting range starts exactly at a start LBA of an already set bad blocks + * range. + * 2.1) The setting range size < already set range size + * +--------+ + * | S | + * +--------+ + * +-------------+ + * | E | + * +-------------+ + * 2.1.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.1.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.1.3) If S is acked setting and E is unacked, range S can overwrite on E. + * An extra slot from the bad blocks table will be allocated for S, and head + * of E will move to end of the inserted range S. The result is, + * +--------+----+ + * | S | E | + * +--------+----+ + * 2.2) The setting range size == already set range size + * 2.2.1) If S and E are both acked or unacked range, the setting range S can + * be merged into existing bad range E. The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.2.2) If S is unacked setting and E is acked, the setting will be denied, and + * the result is, + * +-------------+ + * | E | + * +-------------+ + * 2.2.3) If S is acked setting and E is unacked, range S can overwrite all of + bad blocks range E. 
The result is, + * +-------------+ + * | S | + * +-------------+ + * 2.3) The setting range size > already set range size + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For such situation, the setting range S can be treated as two parts, the + * first part (S1) is as same size as the already set range E, the second + * part (S2) is the rest of setting range. + * +-------------+-----+ +-------------+ +-----+ + * | S1 | S2 | | S1 | | S2 | + * +-------------+-----+ ===> +-------------+ +-----+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now we only focus on how to handle the setting range S1 and already set + * range E, which are already explained in 2.2), for the rest S2 it will be + * handled later in next loop. + * 3) A setting range starts before the start LBA of an already set bad blocks + * range. + * +-------------+ + * | S | + * +-------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation, the setting range S can be divided into two parts, the + * first (S1) ends at the start LBA of already set range E, the second part + * (S2) starts exactly at a start LBA of the already set range E. + * +----+---------+ +----+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +----+---------+ ===> +----+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now only the first part S1 should be handled in this loop, which is in + * similar condition as 1). The rest part S2 has exact same start LBA address + * of the already set range E, they will be handled in next loop in one of + * situations in 2). + * 4) A setting range starts after the start LBA of an already set bad blocks + * range. 
+ * 4.1) If the setting range S exactly matches the tail part of already set bad + * blocks range E, like the following chart shows, + * +---------+ + * | S | + * +---------+ + * +-------------+ + * | E | + * +-------------+ + * 4.1.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +-------------+ + * | S | + * +-------------+ + * 4.1.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is, + * +-------------+ + * | E | + * +-------------+ + * 4.1.3) If range E is unacked, and the setting range S is acked, then S may + * overwrite the overlapped range of E, the result is, + * +---+---------+ + * | E | S | + * +---+---------+ + * 4.2) If the setting range S stays in middle of an already set range E, like + * the following chart shows, + * +----+ + * | S | + * +----+ + * +--------------+ + * | E | + * +--------------+ + * 4.2.1) If range S and E have same acknowledge value (both acked or unacked), + * they will be merged into one, the result is, + * +--------------+ + * | S | + * +--------------+ + * 4.2.2) If range E is acked and the setting range S is unacked, the setting + * request of S will be rejected, the result is also, + * +--------------+ + * | E | + * +--------------+ + * 4.2.3) If range E is unacked, and the setting range S is acked, then S will + * inserted into middle of E and split previous range E into two parts (E1 + * and E2), the result is, + * +----+----+----+ + * | E1 | S | E2 | + * +----+----+----+ + * 4.3) If the setting bad blocks range S is overlapped with an already set bad + * blocks range E. 
The range S starts after the start LBA of range E, and + * ends after the end LBA of range E, as the following chart shows, + * +-------------------+ + * | S | + * +-------------------+ + * +-------------+ + * | E | + * +-------------+ + * For this situation the range S can be divided into two parts, the first + * part (S1) ends at end range E, and the second part (S2) has rest range of + * origin S. + * +---------+---------+ +---------+ +---------+ + * | S1 | S2 | | S1 | | S2 | + * +---------+---------+ ===> +---------+ +---------+ + * +-------------+ +-------------+ + * | E | | E | + * +-------------+ +-------------+ + * Now in this loop the setting range S1 and already set range E can be + * handled as the situations 4.1), the rest range S2 will be handled in next + * loop and ignored in this loop. + * 5) A setting bad blocks range S is adjacent to one or more already set bad + * blocks range(s), and they are all acked or unacked range. + * 5.1) Front merge: If the already set bad blocks range E is before setting + * range S and they are adjacent, + * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * 5.1.1) When total size of range S and E <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can front merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 5.1.2) Otherwise these two ranges cannot merge, just insert the setting + * range S right after already set range E into the bad blocks table. 
The + * result is, + * +--------+------+ + * | E | S | + * +--------+------+ + * 6) Special cases which above conditions cannot handle + * 6.1) Multiple already set ranges may merge into less ones in a full bad table + * +-------------------------------------------------------+ + * | S | + * +-------------------------------------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+ +-----+ +-----+ + * | E1 | | E2 | | E3 | + * +-----+ +-----+ +-----+ + * In the above example, when the bad blocks table is full, inserting the + * first part of setting range S will fail because no more available slot + * can be allocated from bad blocks table. In this situation a proper + * setting method should go through all the setting bad blocks range and + * look for chance to merge already set ranges into less ones. When there + * is available slot from bad blocks table, re-try again to handle more + * setting bad blocks ranges as many as possible. + * +------------------------+ + * | S3 | + * +------------------------+ + * |<----- BB_MAX_LEN ----->| + * +-----+-----+-----+---+-----+--+ + * | S1 | S2 | + * +-----+-----+-----+---+-----+--+ + * The above chart shows although the first part (S3) cannot be inserted due + * to no-space in bad blocks table, but the following E1, E2 and E3 ranges + * can be merged with rest part of S into less range S1 and S2. Now there is + * 1 free slot in bad blocks table. + * +------------------------+-----+-----+-----+---+-----+--+ + * | S3 | S1 | S2 | + * +------------------------+-----+-----+-----+---+-----+--+ + * Since the bad blocks table is not full anymore, re-try again for the + * origin setting range S. Now the setting range S3 can be inserted into the + * bad blocks table with previous freed slot from multiple ranges merge.
+ * 6.2) Front merge after overwrite + * In the following example, in bad blocks table, E1 is an acked bad blocks + * range and E2 is an unacked bad blocks range, therefore they are not able + * to merge into a larger range. The setting bad blocks range S is acked, + * therefore part of E2 can be overwritten by S. + * +--------+ + * | S | acknowledged + * +--------+ S: 1 + * +-------+-------------+ E1: 1 + * | E1 | E2 | E2: 0 + * +-------+-------------+ + * With previous simplified routines, after overwriting part of E2 with S, + * the bad blocks table should be (E3 is remaining part of E2 which is not + * overwritten by S), + * acknowledged + * +-------+--------+----+ S: 1 + * | E1 | S | E3 | E1: 1 + * +-------+--------+----+ E3: 0 + * The above result is correct but not perfect. Range E1 and S in the bad + * blocks table are all acked, merging them into a larger one range may + * occupy less bad blocks table space and make badblocks_check() faster. + * Therefore in such situation, after overwriting range S, the previous range + * E1 should be checked for possible front combination. Then the ideal + * result can be, + * +----------------+----+ acknowledged + * | E1 | E3 | E1: 1 + * +----------------+----+ E3: 0 + * 6.3) Behind merge: If the already set bad blocks range E is behind the setting + * range S and they are adjacent. Normally we don't need to care about this + * because front merge handles this while going through range S from head to + * tail, except for the tail part of range S. When the setting range S are + * fully handled, all the above simplified routine doesn't check whether the + * tail LBA of range S is adjacent to the next already set range and not + * merge them even it is possible.
+ * +------+ + * | S | + * +------+ + * +-------+ + * | E | + * +-------+ + * For the above special situation, when the setting range S are all handled + * and the loop ends, an extra check is necessary for whether next already + * set range E is right after S and mergeable. + * 6.3.1) When total size of range E and S <= BB_MAX_LEN, and their acknowledge + * values are same, the setting range S can behind merges into range E. The + * result is, + * +--------------+ + * | S | + * +--------------+ + * 6.3.2) Otherwise these two ranges cannot merge, just insert the setting range + * S in front of the already set range E in the bad blocks table. The result + * is, + * +------+-------+ + * | S | E | + * +------+-------+ + * + * All the above 5 simplified situations and 3 special cases may cover 99%+ of + * the bad block range setting conditions. Maybe some rare corner case + * is not considered and optimized, it won't hurt if badblocks_set() fails due + * to no space, or some ranges are not merged to save bad blocks table space. + * + * Inside badblocks_set() each loop starts by jumping to re_insert label, every + * time for the new loop prev_badblocks() is called to find an already set range + * which starts before or at current setting range. Since the setting bad blocks + * range is handled from head to tail, most of the cases it is unnecessary to do + * the binary search inside prev_badblocks(), it is possible to provide a hint + * to prev_badblocks() for a fast path, then the expensive binary search can be + * avoided. In my test with the hint to prev_badblocks(), except for the first + * loop, all remaining calls to prev_badblocks() can go into the fast path and + * return correct bad blocks table index immediately. + * + * + * Clearing a bad blocks range from the bad block table has similar idea as + * setting does, but much simpler.
The only thing needs to be noticed is + * when the clearing range hits middle of a bad block range, the existing bad + * block range will split into two, and one more item should be added into the + * bad block table. The simplified situations to be considered are, (The already + * set bad blocks ranges in bad block table are naming with prefix E, and the + * clearing bad blocks range is naming with prefix C) + * + * 1) A clearing range is not overlapped to any already set ranges in bad block + * table. + * +-----+ | +-----+ | +-----+ + * | C | | | C | | | C | + * +-----+ or +-----+ or +-----+ + * +---+ | +----+ +----+ | +---+ + * | E | | | E1 | | E2 | | | E | + * +---+ | +----+ +----+ | +---+ + * For the above situations, no bad block to be cleared and no failure + * happens, simply returns 0. + * 2) The clearing range hits middle of an already setting bad blocks range in + * the bad block table. + * +---+ + * | C | + * +---+ + * +-----------------+ + * | E | + * +-----------------+ + * In this situation if the bad block table is not full, the range E will be + * split into two ranges E1 and E2. The result is, + * +------+ +------+ + * | E1 | | E2 | + * +------+ +------+ + * 3) The clearing range starts exactly at same LBA as an already set bad block range + * from the bad block table. + * 3.1) Partially covered at head part + * +------------+ + * | C | + * +------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation, the overlapped already set range will update the + * start LBA to end of C and shrink the range to BB_LEN(E) - BB_LEN(C). No + * item deleted from bad block table. The result is, + * +----+ + * | E1 | + * +----+ + * 3.2) Exact fully covered + * +-----------------+ + * | C | + * +-----------------+ + * +-----------------+ + * | E | + * +-----------------+ + * For this situation the whole bad blocks range E will be cleared and its + * corresponded item is deleted from the bad block table. 
+ * 4) The clearing range exactly ends at same LBA as an already set bad block + * range. + * +-------+ + * | C | + * +-------+ + * +-----------------+ + * | E | + * +-----------------+ + * For the above situation, the already set range E is updated to shrink its + * end to the start of C, and reduce its length to BB_LEN(E) - BB_LEN(C). + * The result is, + * +---------+ + * | E | + * +---------+ + * 5) The clearing range is partially overlapped with an already set bad block + * range from the bad block table. + * 5.1) The already set bad block range is front overlapped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part ends at the start LBA of range E, and the second part starts at + * same LBA of range E. + * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part C1 can be handled as condition 1), and the second part C2 can be + * handled as condition 3.1) in next loop. + * 5.2) The already set bad block range is behind overlapped with the clearing + * range. + * +----------+ + * | C | + * +----------+ + * +------------+ + * | E | + * +------------+ + * For such situation, the clearing range C can be treated as two parts. The + * first part C1 ends at same end LBA of range E, and the second part starts + * at end LBA of range E. + * +----+-----+ +----+ +-----+ + * | C1 | C2 | | C1 | | C2 | + * +----+-----+ ===> +----+ +-----+ + * +------------+ +------------+ + * | E | | E | + * +------------+ +------------+ + * Now the first part clearing range C1 can be handled as condition 4), and + * the second part clearing range C2 can be handled as condition 1) in next + * loop.
+ * + * All bad blocks range clearing can be simplified into the above 5 situations + * by only handling the head part of the clearing range in each run of the + * while-loop. The idea is similar to bad blocks range setting but much + * simpler. */ -int badblocks_check(struct badblocks *bb, sector_t s, int sectors, - sector_t *first_bad, int *bad_sectors) + +/* + * Find the range starts at-or-before 's' from bad table. The search + * starts from index 'hint' and stops at index 'hint_end' from the bad + * table. + */ +static int prev_by_hint(struct badblocks *bb, sector_t s, int hint) { - int hi; - int lo; + int hint_end = hint + 2; u64 *p = bb->page; - int rv; - sector_t target = s + sectors; - unsigned seq; + int ret = -1; - if (bb->shift > 0) { - /* round the start down, and the end up */ - s >>= bb->shift; - target += (1<<bb->shift) - 1; - target >>= bb->shift; - sectors = target - s; + while ((hint < hint_end) && ((hint + 1) <= bb->count) && + (BB_OFFSET(p[hint]) <= s)) { + if ((hint + 1) == bb->count || BB_OFFSET(p[hint + 1]) > s) { + ret = hint; + break; + } + hint++; + } + + return ret; +} + +/* + * Find the range starts at-or-before bad->start. If 'hint' is provided + * (hint >= 0) then search in the bad table from hint firstly. It is + * very probably the wanted bad range can be found from the hint index, + * then the unnecessary while-loop iteration can be avoided. + */ +static int prev_badblocks(struct badblocks *bb, struct badblocks_context *bad, + int hint) +{ + sector_t s = bad->start; + int ret = -1; + int lo, hi; + u64 *p; + + if (!bb->count) + goto out; + + if (hint >= 0) { + ret = prev_by_hint(bb, s, hint); + if (ret >= 0) + goto out; } - /* 'target' is now the first block after the bad range */ -retry: - seq = read_seqbegin(&bb->lock); lo = 0; - rv = 0; hi = bb->count; + p = bb->page; - /* Binary search between lo and hi for 'target' - * i.e. 
for the last range that starts before 'target' - */ - /* INVARIANT: ranges before 'lo' and at-or-after 'hi' - * are known not to be the last range before target. - * VARIANT: hi-lo is the number of possible - * ranges, and decreases until it reaches 1 - */ + /* The following bisect search might be unnecessary */ + if (BB_OFFSET(p[lo]) > s) + return -1; + if (BB_OFFSET(p[hi - 1]) <= s) + return hi - 1; + + /* Do bisect search in bad table */ while (hi - lo > 1) { - int mid = (lo + hi) / 2; + int mid = (lo + hi)/2; sector_t a = BB_OFFSET(p[mid]); - if (a < target) - /* This could still be the one, earlier ranges - * could not. - */ + if (a == s) { + ret = mid; + goto out; + } + + if (a < s) lo = mid; else - /* This and later ranges are definitely out. */ hi = mid; } - /* 'lo' might be the last that started before target, but 'hi' isn't */ - if (hi > lo) { - /* need to check all range that end after 's' to see if - * any are unacknowledged. + + if (BB_OFFSET(p[lo]) <= s) + ret = lo; +out: + return ret; +} + +/* + * Return 'true' if the range indicated by 'bad' can be backward merged + * with the bad range (from the bad table) index by 'behind'. + */ +static bool can_merge_behind(struct badblocks *bb, + struct badblocks_context *bad, int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + + if ((s < BB_OFFSET(p[behind])) && + ((s + sectors) >= BB_OFFSET(p[behind])) && + ((BB_END(p[behind]) - s) <= BB_MAX_LEN) && + BB_ACK(p[behind]) == bad->ack) + return true; + return false; +} + +/* + * Do backward merge for range indicated by 'bad' and the bad range + * (from the bad table) indexed by 'behind'. The return value is merged + * sectors from bad->len. 
+ */ +static int behind_merge(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s >= BB_OFFSET(p[behind])); + WARN_ON((s + sectors) < BB_OFFSET(p[behind])); + + if (s < BB_OFFSET(p[behind])) { + merged = BB_OFFSET(p[behind]) - s; + p[behind] = BB_MAKE(s, BB_LEN(p[behind]) + merged, bad->ack); + + WARN_ON((BB_LEN(p[behind]) + merged) >= BB_MAX_LEN); + } + + return merged; +} + +/* + * Return 'true' if the range indicated by 'bad' can be forward + * merged with the bad range (from the bad table) indexed by 'prev'. + */ +static bool can_merge_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + sector_t s = bad->start; + u64 *p = bb->page; + + if (BB_ACK(p[prev]) == bad->ack && + (s < BB_END(p[prev]) || + (s == BB_END(p[prev]) && (BB_LEN(p[prev]) < BB_MAX_LEN)))) + return true; + return false; +} + +/* + * Do forward merge for range indicated by 'bad' and the bad range + * (from bad table) indexed by 'prev'. The return value is sectors + * merged from bad->len. 
+ */ +static int front_merge(struct badblocks *bb, int prev, struct badblocks_context *bad) +{ + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int merged = 0; + + WARN_ON(s > BB_END(p[prev])); + + if (s < BB_END(p[prev])) { + merged = min_t(sector_t, sectors, BB_END(p[prev]) - s); + } else { + merged = min_t(sector_t, sectors, BB_MAX_LEN - BB_LEN(p[prev])); + if ((prev + 1) < bb->count && + merged > (BB_OFFSET(p[prev + 1]) - BB_END(p[prev]))) { + merged = BB_OFFSET(p[prev + 1]) - BB_END(p[prev]); + } + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + merged, bad->ack); + } + + return merged; +} + +/* + * 'Combine' is a special case which can_merge_front() is not able to + * handle: If a bad range (indexed by 'prev' from bad table) exactly + * starts as bad->start, and the bad range ahead of 'prev' (indexed by + * 'prev - 1' from bad table) exactly ends at where 'prev' starts, and + * the sum of their lengths does not exceed BB_MAX_LEN limitation, then + * these two bad range (from bad table) can be combined. + * + * Return 'true' if bad ranges indexed by 'prev' and 'prev - 1' from bad + * table can be combined. + */ +static bool can_combine_front(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if ((prev > 0) && + (BB_OFFSET(p[prev]) == bad->start) && + (BB_END(p[prev - 1]) == BB_OFFSET(p[prev])) && + (BB_LEN(p[prev - 1]) + BB_LEN(p[prev]) <= BB_MAX_LEN) && + (BB_ACK(p[prev - 1]) == BB_ACK(p[prev]))) + return true; + return false; +} + +/* + * Combine the bad ranges indexed by 'prev' and 'prev - 1' (from bad + * table) into one larger bad range, and the new range is indexed by + * 'prev - 1'. + * The caller of front_combine() will decrease bb->count, therefore + * it is unnecessary to clear p[prev] after front merge. 
+ */ +static void front_combine(struct badblocks *bb, int prev) +{ + u64 *p = bb->page; + + p[prev - 1] = BB_MAKE(BB_OFFSET(p[prev - 1]), + BB_LEN(p[prev - 1]) + BB_LEN(p[prev]), + BB_ACK(p[prev])); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, (bb->count - prev - 1) * 8); +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly forward + * overlapped with the bad range (from bad table) indexed by 'front'. + * Exactly forward overlap means the bad range (from bad table) indexed + * by 'prev' does not cover the whole range indicated by 'bad'. + */ +static bool overlap_front(struct badblocks *bb, int front, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + + if (bad->start >= BB_OFFSET(p[front]) && + bad->start < BB_END(p[front])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' is exactly backward + * overlapped with the bad range (from bad table) indexed by 'behind'. + */ +static bool overlap_behind(struct badblocks *bb, struct badblocks_context *bad, + int behind) +{ + u64 *p = bb->page; + + if (bad->start < BB_OFFSET(p[behind]) && + (bad->start + bad->len) > BB_OFFSET(p[behind])) + return true; + return false; +} + +/* + * Return 'true' if the range indicated by 'bad' can overwrite the bad + * range (from bad table) indexed by 'prev'. + * + * The range indicated by 'bad' can overwrite the bad range indexed by + * 'prev' when, + * 1) The whole range indicated by 'bad' can cover partial or whole bad + * range (from bad table) indexed by 'prev'. + * 2) The ack value of 'bad' is larger or equal to the ack value of bad + * range 'prev'. + * + * If the overwriting doesn't cover the whole bad range (from bad table) + * indexed by 'prev', new range might be split from existing bad range, + * 1) The overwrite covers head or tail part of existing bad range, 1 + * extra bad range will be split and added into the bad table. 
+ * 2) The overwrite covers middle of existing bad range, 2 extra bad + * ranges will be split (ahead and after the overwritten range) and + * added into the bad table. + * The number of extra split ranges of the overwriting is stored in + * 'extra' and returned for the caller. + */ +static bool can_front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *extra) +{ + u64 *p = bb->page; + int len; + + WARN_ON(!overlap_front(bb, prev, bad)); + + if (BB_ACK(p[prev]) >= bad->ack) + return false; + + if (BB_END(p[prev]) <= (bad->start + bad->len)) { + len = BB_END(p[prev]) - bad->start; + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 0; + else + *extra = 1; + + bad->len = len; + } else { + if (BB_OFFSET(p[prev]) == bad->start) + *extra = 1; + else + /* + * prev range will be split into two, beside the overwritten + * one, an extra slot needed from bad table. */ - while (lo >= 0 && - BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) { - if (BB_OFFSET(p[lo]) < target) { - /* starts before the end, and finishes after - * the start, so they must overlap - */ - if (rv != -1 && BB_ACK(p[lo])) - rv = 1; - else - rv = -1; - *first_bad = BB_OFFSET(p[lo]); - *bad_sectors = BB_LEN(p[lo]); - } - lo--; + *extra = 2; + } + + if ((bb->count + (*extra)) >= MAX_BADBLOCKS) + return false; + + return true; +} + +/* + * Do the overwrite from the range indicated by 'bad' to the bad range + * (from bad table) indexed by 'prev'. + * The previously called can_front_overwrite() will provide how many + * extra bad range(s) might be split and added into the bad table. All + * the splitting cases in the bad table will be handled here. 
+ */ +static int front_overwrite(struct badblocks *bb, int prev, + struct badblocks_context *bad, int extra) +{ + u64 *p = bb->page; + sector_t orig_end = BB_END(p[prev]); + int orig_ack = BB_ACK(p[prev]); + + switch (extra) { + case 0: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), BB_LEN(p[prev]), + bad->ack); + break; + case 1: + if (BB_OFFSET(p[prev]) == bad->start) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->len, bad->ack); + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start + bad->len, + orig_end - BB_END(p[prev]), + orig_ack); + } else { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev +2 -> prev + 1 + 1, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 1: one more slot for extra being 1. + */ + memmove(p + prev + 2, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); } + break; + case 2: + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + bad->start - BB_OFFSET(p[prev]), + orig_ack); + /* + * prev + 3 -> prev + 1 + 2, which is for, + * 1) prev + 1: the slot index of the previous one + * 2) + 2: two more slots for extra being 2. + */ + memmove(p + prev + 3, p + prev + 1, + (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(bad->start, bad->len, bad->ack); + p[prev + 2] = BB_MAKE(BB_END(p[prev + 1]), + orig_end - BB_END(p[prev + 1]), + orig_ack); + break; + default: + break; } - if (read_seqretry(&bb->lock, seq)) - goto retry; + return bad->len; +} - return rv; +/* + * Explicitly insert a range indicated by 'bad' to the bad table, where + * the location is indexed by 'at'. 
+ */ +static int insert_at(struct badblocks *bb, int at, struct badblocks_context *bad) +{ + u64 *p = bb->page; + int len; + + WARN_ON(badblocks_full(bb)); + + len = min_t(sector_t, bad->len, BB_MAX_LEN); + if (at < bb->count) + memmove(p + at + 1, p + at, (bb->count - at) * 8); + p[at] = BB_MAKE(bad->start, len, bad->ack); + + return len; } -EXPORT_SYMBOL_GPL(badblocks_check); static void badblocks_update_acked(struct badblocks *bb) { + bool unacked = false; u64 *p = bb->page; int i; - bool unacked = false; if (!bb->unacked_exist) return; @@ -145,282 +855,602 @@ static void badblocks_update_acked(struct badblocks *bb) bb->unacked_exist = 0; } -/** - * badblocks_set() - Add a range of bad blocks to the table. - * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * @acknowledged: weather to mark the bad sectors as acknowledged - * - * This might extend the table, or might contract it if two adjacent ranges - * can be merged. We binary-search to find the 'insertion' point, then - * decide how best to handle it. 
- * - * Return: - * 0: success - * 1: failed to set badblocks (out of space) - */ -int badblocks_set(struct badblocks *bb, sector_t s, int sectors, - int acknowledged) +/* Do exact work to set bad block range into the bad block table */ +static int _badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) { - u64 *p; - int lo, hi; - int rv = 0; + int retried = 0, space_desired = 0; + int orig_len, len = 0, added = 0; + struct badblocks_context bad; + int prev = -1, hint = -1; + sector_t orig_start; unsigned long flags; + int rv = 0; + u64 *p; if (bb->shift < 0) /* badblocks are disabled */ return 1; + if (sectors == 0) + /* Invalid sectors number */ + return 1; + if (bb->shift) { /* round the start down, and the end up */ sector_t next = s + sectors; - s >>= bb->shift; - next += (1<<bb->shift) - 1; - next >>= bb->shift; + rounddown(s, bb->shift); + roundup(next, bb->shift); sectors = next - s; } write_seqlock_irqsave(&bb->lock, flags); + orig_start = s; + orig_len = sectors; + bad.ack = acknowledged; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts at-or-before 's' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a <= s) - lo = mid; - else - hi = mid; +re_insert: + bad.start = s; + bad.len = sectors; + len = 0; + + if (badblocks_empty(bb)) { + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + goto update_sectors; } - if (hi > lo && BB_OFFSET(p[lo]) > s) - hi = lo; - if (hi > lo) { - /* we found a range that might merge with the start - * of our new range - */ - sector_t a = BB_OFFSET(p[lo]); - sector_t e = a + BB_LEN(p[lo]); - int ack = BB_ACK(p[lo]); - - if (e >= s) { - /* Yes, we can merge with a previous range */ - if (s == a && s + sectors >= e) - /* new range covers old */ - ack = acknowledged; - else - ack = ack && acknowledged; - - if (e < s + sectors) - e = s + sectors; - if (e - a <= BB_MAX_LEN) { - p[lo] = BB_MAKE(a, e-a, ack); - s = e; + prev = 
prev_badblocks(bb, &bad, hint); + + /* start before all badblocks */ + if (prev < 0) { + if (!badblocks_full(bb)) { + /* insert on the first */ + if (bad.len > (BB_OFFSET(p[0]) - bad.start)) + bad.len = BB_OFFSET(p[0]) - bad.start; + len = insert_at(bb, 0, &bad); + bb->count++; + added++; + hint = 0; + goto update_sectors; + } + + /* No space, try to merge */ + if (overlap_behind(bb, &bad, 0)) { + if (can_merge_behind(bb, &bad, 0)) { + len = behind_merge(bb, &bad, 0); + added++; } else { - /* does not all fit in one range, - * make p[lo] maximal - */ - if (BB_LEN(p[lo]) != BB_MAX_LEN) - p[lo] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + len = BB_OFFSET(p[0]) - s; + space_desired = 1; } - sectors = e - s; + hint = 0; + goto update_sectors; } + + /* no table space and give up */ + goto out; } - if (sectors && hi < bb->count) { - /* 'hi' points to the first range that starts after 's'. - * Maybe we can merge with the start of that range - */ - sector_t a = BB_OFFSET(p[hi]); - sector_t e = a + BB_LEN(p[hi]); - int ack = BB_ACK(p[hi]); - - if (a <= s + sectors) { - /* merging is possible */ - if (e <= s + sectors) { - /* full overlap */ - e = s + sectors; - ack = acknowledged; - } else - ack = ack && acknowledged; - - a = s; - if (e - a <= BB_MAX_LEN) { - p[hi] = BB_MAKE(a, e-a, ack); - s = e; - } else { - p[hi] = BB_MAKE(a, BB_MAX_LEN, ack); - s = a + BB_MAX_LEN; + + /* in case p[prev-1] can be merged with p[prev] */ + if (can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; + added++; + hint = prev; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + } else { + int extra = 0; + + if (!can_front_overwrite(bb, prev, &bad, &extra)) { + len = min_t(sector_t, + BB_END(p[prev]) - s, sectors); + hint = prev; + goto update_sectors; + } + + len = front_overwrite(bb, prev, &bad, extra); + added++; + bb->count += extra; + + if 
(can_combine_front(bb, prev, &bad)) { + front_combine(bb, prev); + bb->count--; } - sectors = e - s; - lo = hi; - hi++; } + hint = prev; + goto update_sectors; + } + + if (can_merge_front(bb, prev, &bad)) { + len = front_merge(bb, prev, &bad); + added++; + hint = prev; + goto update_sectors; } - if (sectors == 0 && hi < bb->count) { - /* we might be able to combine lo and hi */ - /* Note: 's' is at the end of 'lo' */ - sector_t a = BB_OFFSET(p[hi]); - int lolen = BB_LEN(p[lo]); - int hilen = BB_LEN(p[hi]); - int newlen = lolen + hilen - (s - a); - - if (s >= a && newlen < BB_MAX_LEN) { - /* yes, we can combine them */ - int ack = BB_ACK(p[lo]) && BB_ACK(p[hi]); - - p[lo] = BB_MAKE(BB_OFFSET(p[lo]), newlen, ack); - memmove(p + hi, p + hi + 1, - (bb->count - hi - 1) * 8); - bb->count--; + + /* if no space in table, still try to merge in the covered range */ + if (badblocks_full(bb)) { + /* skip the cannot-merge range */ + if (((prev + 1) < bb->count) && + overlap_behind(bb, &bad, prev + 1) && + ((s + sectors) >= BB_END(p[prev + 1]))) { + len = BB_END(p[prev + 1]) - s; + hint = prev + 1; + goto update_sectors; } + + /* no retry any more */ + len = sectors; + space_desired = 1; + hint = -1; + goto update_sectors; } - while (sectors) { - /* didn't merge (it all). 
- * Need to add a range just before 'hi' - */ - if (bb->count >= MAX_BADBLOCKS) { - /* No room for more */ - rv = 1; - break; - } else { - int this_sectors = sectors; - memmove(p + hi + 1, p + hi, - (bb->count - hi) * 8); - bb->count++; + /* cannot merge and there is space in bad table */ + if ((prev + 1) < bb->count && + overlap_behind(bb, &bad, prev + 1)) + bad.len = min_t(sector_t, + bad.len, BB_OFFSET(p[prev + 1]) - bad.start); - if (this_sectors > BB_MAX_LEN) - this_sectors = BB_MAX_LEN; - p[hi] = BB_MAKE(s, this_sectors, acknowledged); - sectors -= this_sectors; - s += this_sectors; - } + len = insert_at(bb, prev + 1, &bad); + bb->count++; + added++; + hint = prev + 1; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_insert; + + WARN_ON(sectors < 0); + + /* + * Check whether the following already set range can be + * merged. (prev < 0) condition is not handled here, + * because it's already complicated enough. + */ + if (prev >= 0 && + (prev + 1) < bb->count && + BB_END(p[prev]) == BB_OFFSET(p[prev + 1]) && + (BB_LEN(p[prev]) + BB_LEN(p[prev + 1])) <= BB_MAX_LEN && + BB_ACK(p[prev]) == BB_ACK(p[prev + 1])) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + BB_LEN(p[prev]) + BB_LEN(p[prev + 1]), + BB_ACK(p[prev])); + + if ((prev + 2) < bb->count) + memmove(p + prev + 1, p + prev + 2, + (bb->count - (prev + 2)) * 8); + bb->count--; + } + + if (space_desired && !badblocks_full(bb)) { + s = orig_start; + sectors = orig_len; + space_desired = 0; + if (retried++ < 3) + goto re_insert; + } + +out: + if (added) { + set_changed(bb); + + if (!acknowledged) + bb->unacked_exist = 1; + else + badblocks_update_acked(bb); } - bb->changed = 1; - if (!acknowledged) - bb->unacked_exist = 1; - else - badblocks_update_acked(bb); write_sequnlock_irqrestore(&bb->lock, flags); + if (!added) + rv = 1; + return rv; } -EXPORT_SYMBOL_GPL(badblocks_set); -/** - * badblocks_clear() - Remove a range of bad blocks to the table. 
- * @bb: the badblocks structure that holds all badblock information - * @s: first sector to mark as bad - * @sectors: number of sectors to mark as bad - * - * This may involve extending the table if we spilt a region, - * but it must not fail. So if the table becomes full, we just - * drop the remove request. - * - * Return: - * 0: success - * 1: failed to clear badblocks +/* + * Clear the bad block range from bad block table which is front overlapped + * with the clearing range. The return value is how many sectors from an + * already set bad block range are cleared. If the whole bad block range is + * covered by the clearing range and fully cleared, 'delete' is set as 1 for + * the caller to reduce bb->count. */ -int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +static int front_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad, int *deleted) { - u64 *p; - int lo, hi; - sector_t target = s + sectors; + sector_t sectors = bad->len; + sector_t s = bad->start; + u64 *p = bb->page; + int cleared = 0; + + *deleted = 0; + if (s == BB_OFFSET(p[prev])) { + if (BB_LEN(p[prev]) > sectors) { + p[prev] = BB_MAKE(BB_OFFSET(p[prev]) + sectors, + BB_LEN(p[prev]) - sectors, + BB_ACK(p[prev])); + cleared = sectors; + } else { + /* BB_LEN(p[prev]) <= sectors */ + cleared = BB_LEN(p[prev]); + if ((prev + 1) < bb->count) + memmove(p + prev, p + prev + 1, + (bb->count - prev - 1) * 8); + *deleted = 1; + } + } else if (s > BB_OFFSET(p[prev])) { + if (BB_END(p[prev]) <= (s + sectors)) { + cleared = BB_END(p[prev]) - s; + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + BB_ACK(p[prev])); + } else { + /* Splitting is handled in front_splitting_clear() */ + BUG(); + } + } + + return cleared; +} + +/* + * Handle the condition that the clearing range hits middle of an already set + * bad block range from bad block table. In this condition the existing bad + * block range is split into two after the middle part is cleared. 
+ */ +static int front_splitting_clear(struct badblocks *bb, int prev, + struct badblocks_context *bad) +{ + u64 *p = bb->page; + u64 end = BB_END(p[prev]); + int ack = BB_ACK(p[prev]); + sector_t sectors = bad->len; + sector_t s = bad->start; + + p[prev] = BB_MAKE(BB_OFFSET(p[prev]), + s - BB_OFFSET(p[prev]), + ack); + memmove(p + prev + 2, p + prev + 1, (bb->count - prev - 1) * 8); + p[prev + 1] = BB_MAKE(s + sectors, end - s - sectors, ack); + return sectors; +} + +/* Do the exact work to clear bad block range from the bad block table */ +static int _badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + struct badblocks_context bad; + int prev = -1, hint = -1; + int len = 0, cleared = 0; int rv = 0; + u64 *p; + + if (bb->shift < 0) + /* badblocks are disabled */ + return 1; + + if (sectors == 0) + /* Invalid sectors number */ + return 1; + + if (bb->shift) { + sector_t target; - if (bb->shift > 0) { /* When clearing we round the start up and the end down. * This should not matter as the shift should align with * the block size and no rounding should ever be needed. * However it is better the think a block is bad when it * isn't than to think a block is not bad when it is. */ - s += (1<<bb->shift) - 1; - s >>= bb->shift; - target >>= bb->shift; + target = s + sectors; + roundup(s, bb->shift); + rounddown(target, bb->shift); sectors = target - s; } write_seqlock_irq(&bb->lock); + bad.ack = true; p = bb->page; - lo = 0; - hi = bb->count; - /* Find the last range that starts before 'target' */ - while (hi - lo > 1) { - int mid = (lo + hi) / 2; - sector_t a = BB_OFFSET(p[mid]); - if (a < target) - lo = mid; - else - hi = mid; +re_clear: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + cleared++; + goto update_sectors; } - if (hi > lo) { - /* p[lo] is the last range that could overlap the - * current range. Earlier ranges could also overlap, - * but only this one can overlap the end of the range. 
- */ - if ((BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > target) && - (BB_OFFSET(p[lo]) < target)) { - /* Partial overlap, leave the tail of this range */ - int ack = BB_ACK(p[lo]); - sector_t a = BB_OFFSET(p[lo]); - sector_t end = a + BB_LEN(p[lo]); - - if (a < s) { - /* we need to split this range */ - if (bb->count >= MAX_BADBLOCKS) { - rv = -ENOSPC; - goto out; - } - memmove(p+lo+1, p+lo, (bb->count - lo) * 8); - bb->count++; - p[lo] = BB_MAKE(a, s-a, ack); - lo++; - } - p[lo] = BB_MAKE(target, end - target, ack); - /* there is no longer an overlap */ - hi = lo; - lo--; - } - while (lo >= 0 && - (BB_OFFSET(p[lo]) + BB_LEN(p[lo]) > s) && - (BB_OFFSET(p[lo]) < target)) { - /* This range does overlap */ - if (BB_OFFSET(p[lo]) < s) { - /* Keep the early parts of this range. */ - int ack = BB_ACK(p[lo]); - sector_t start = BB_OFFSET(p[lo]); - - p[lo] = BB_MAKE(start, s - start, ack); - /* now low doesn't overlap, so.. */ - break; - } - lo--; + + + prev = prev_badblocks(bb, &bad, hint); + + /* Start before all badblocks */ + if (prev < 0) { + if (overlap_behind(bb, &bad, 0)) { + len = BB_OFFSET(p[0]) - s; + hint = 0; + } else { + len = sectors; } - /* 'lo' is strictly before, 'hi' is strictly after, - * anything between needs to be discarded + /* + * Both situations are to clear non-bad range, + * should be treated as successful */ - if (hi - lo > 1) { - memmove(p+lo+1, p+hi, (bb->count - hi) * 8); - bb->count -= (hi - lo - 1); + cleared++; + goto update_sectors; + } + + /* Start after all badblocks */ + if ((prev + 1) >= bb->count && !overlap_front(bb, prev, &bad)) { + len = sectors; + cleared++; + goto update_sectors; + } + + /* Clear will split a bad record but the table is full */ + if (badblocks_full(bb) && (BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + sectors))) { + len = sectors; + goto update_sectors; + } + + if (overlap_front(bb, prev, &bad)) { + if ((BB_OFFSET(p[prev]) < bad.start) && + (BB_END(p[prev]) > (bad.start + bad.len))) { + /* 
Splitting */ + if ((bb->count + 1) < MAX_BADBLOCKS) { + len = front_splitting_clear(bb, prev, &bad); + bb->count += 1; + cleared++; + } else { + /* No space to split, give up */ + len = sectors; + } + } else { + int deleted = 0; + + len = front_clear(bb, prev, &bad, &deleted); + bb->count -= deleted; + cleared++; + hint = prev; } + + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + /* Clear non-bad range should be treated as successful */ + cleared++; + goto update_sectors; + } + + /* Not cover any badblocks range in the table */ + len = sectors; + /* Clear non-bad range should be treated as successful */ + cleared++; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_clear; + + WARN_ON(sectors < 0); + + if (cleared) { + badblocks_update_acked(bb); + set_changed(bb); } - badblocks_update_acked(bb); - bb->changed = 1; -out: write_sequnlock_irq(&bb->lock); + + if (!cleared) + rv = 1; + + return rv; +} + +/* Do the exact work to check bad blocks range from the bad block table */ +static int _badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + int unacked_badblocks, acked_badblocks; + int prev = -1, hint = -1, set = 0; + struct badblocks_context bad; + unsigned int seq; + int len, rv; + u64 *p; + + WARN_ON(bb->shift < 0 || sectors == 0); + + if (bb->shift > 0) { + sector_t target; + + /* round the start down, and the end up */ + target = s + sectors; + rounddown(s, bb->shift); + roundup(target, bb->shift); + sectors = target - s; + } + +retry: + seq = read_seqbegin(&bb->lock); + + p = bb->page; + unacked_badblocks = 0; + acked_badblocks = 0; + +re_check: + bad.start = s; + bad.len = sectors; + + if (badblocks_empty(bb)) { + len = sectors; + goto update_sectors; + } + + prev = prev_badblocks(bb, &bad, hint); + + /* start after 
all badblocks */ + if ((prev >= 0) && + ((prev + 1) >= bb->count) && !overlap_front(bb, prev, &bad)) { + len = sectors; + goto update_sectors; + } + + /* Overlapped with front badblocks record */ + if ((prev >= 0) && overlap_front(bb, prev, &bad)) { + if (BB_ACK(p[prev])) + acked_badblocks++; + else + unacked_badblocks++; + + if (BB_END(p[prev]) >= (s + sectors)) + len = sectors; + else + len = BB_END(p[prev]) - s; + + if (set == 0) { + *first_bad = BB_OFFSET(p[prev]); + *bad_sectors = BB_LEN(p[prev]); + set = 1; + } + goto update_sectors; + } + + /* Not front overlap, but behind overlap */ + if ((prev + 1) < bb->count && overlap_behind(bb, &bad, prev + 1)) { + len = BB_OFFSET(p[prev + 1]) - bad.start; + hint = prev + 1; + goto update_sectors; + } + + /* not cover any badblocks range in the table */ + len = sectors; + +update_sectors: + s += len; + sectors -= len; + + if (sectors > 0) + goto re_check; + + WARN_ON(sectors < 0); + + if (unacked_badblocks > 0) + rv = -1; + else if (acked_badblocks > 0) + rv = 1; + else + rv = 0; + + if (read_seqretry(&bb->lock, seq)) + goto retry; + return rv; } + +/** + * badblocks_check() - check a given range for bad sectors + * @bb: the badblocks structure that holds all badblock information + * @s: sector (start) at which to check for badblocks + * @sectors: number of sectors to check for badblocks + * @first_bad: pointer to store location of the first badblock + * @bad_sectors: pointer to store number of badblocks after @first_bad + * + * We can record which blocks on each device are 'bad' and so just + * fail those blocks, or that stripe, rather than the whole device. + * Entries in the bad-block table are 64bits wide. This comprises: + * Length of bad-range, in sectors: 0-511 for lengths 1-512 + * Start of bad-range, sector offset, 54 bits (allows 8 exbibytes) + * A 'shift' can be set so that larger blocks are tracked and + * consequently larger devices can be covered. + * 'Acknowledged' flag - 1 bit. 
- the most significant bit. + * + * Locking of the bad-block table uses a seqlock so badblocks_check + * might need to retry if it is very unlucky. + * We will sometimes want to check for bad blocks in a bi_end_io function, + * so we use the write_seqlock_irq variant. + * + * When looking for a bad block we specify a range and want to + * know if any block in the range is bad. So we binary-search + * to the last range that starts at-or-before the given endpoint, + * (or "before the sector after the target range") + * then see if it ends after the given start. + * + * Return: + * 0: there are no known bad blocks in the range + * 1: there are known bad blocks which are all acknowledged + * -1: there are bad blocks which have not yet been acknowledged in metadata. + * plus the start/length of the first bad section we overlap. + */ +int badblocks_check(struct badblocks *bb, sector_t s, int sectors, + sector_t *first_bad, int *bad_sectors) +{ + return _badblocks_check(bb, s, sectors, first_bad, bad_sectors); +} +EXPORT_SYMBOL_GPL(badblocks_check); + +/** + * badblocks_set() - Add a range of bad blocks to the table. + * @bb: the badblocks structure that holds all badblock information + * @s: first sector to mark as bad + * @sectors: number of sectors to mark as bad + * @acknowledged: whether to mark the bad sectors as acknowledged + * + * This might extend the table, or might contract it if two adjacent ranges + * can be merged. We binary-search to find the 'insertion' point, then + * decide how best to handle it. + * + * Return: + * 0: success + * 1: failed to set badblocks (out of space) + */ +int badblocks_set(struct badblocks *bb, sector_t s, int sectors, + int acknowledged) +{ + return _badblocks_set(bb, s, sectors, acknowledged); +} +EXPORT_SYMBOL_GPL(badblocks_set); + +/** + * badblocks_clear() - Remove a range of bad blocks from the table. 
+ * @bb: the badblocks structure that holds all badblock information + * @s: first sector to clear + * @sectors: number of sectors to clear + * + * This may involve extending the table if we split a region, + * but it must not fail. So if the table becomes full, we just + * drop the remove request. + * + * Return: + * 0: success + * 1: failed to clear badblocks + */ +int badblocks_clear(struct badblocks *bb, sector_t s, int sectors) +{ + return _badblocks_clear(bb, s, sectors); +} EXPORT_SYMBOL_GPL(badblocks_clear); /** @@ -525,7 +1555,7 @@ ssize_t badblocks_store(struct badblocks *bb, const char *page, size_t len, case 3: if (newline != '\n') return -EINVAL; - /* fall through */ + fallthrough; case 2: if (length <= 0) return -EINVAL; diff --git a/block/bdev.c b/block/bdev.c new file mode 100644 index 000000000000..e9f1b12bd75c --- /dev/null +++ b/block/bdev.c @@ -0,0 +1,1148 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig + */ + +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/slab.h> +#include <linux/kmod.h> +#include <linux/major.h> +#include <linux/device_cgroup.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> +#include <linux/backing-dev.h> +#include <linux/module.h> +#include <linux/blkpg.h> +#include <linux/magic.h> +#include <linux/buffer_head.h> +#include <linux/swap.h> +#include <linux/writeback.h> +#include <linux/mount.h> +#include <linux/pseudo_fs.h> +#include <linux/uio.h> +#include <linux/namei.h> +#include <linux/part_stat.h> +#include <linux/uaccess.h> +#include <linux/stat.h> +#include "../fs/internal.h" +#include "blk.h" + +/* Should we allow writing to mounted block devices? 
*/ +static bool bdev_allow_write_mounted = IS_ENABLED(CONFIG_BLK_DEV_WRITE_MOUNTED); + +struct bdev_inode { + struct block_device bdev; + struct inode vfs_inode; +}; + +static inline struct bdev_inode *BDEV_I(struct inode *inode) +{ + return container_of(inode, struct bdev_inode, vfs_inode); +} + +struct block_device *I_BDEV(struct inode *inode) +{ + return &BDEV_I(inode)->bdev; +} +EXPORT_SYMBOL(I_BDEV); + +static void bdev_write_inode(struct block_device *bdev) +{ + struct inode *inode = bdev->bd_inode; + int ret; + + spin_lock(&inode->i_lock); + while (inode->i_state & I_DIRTY) { + spin_unlock(&inode->i_lock); + ret = write_inode_now(inode, true); + if (ret) + pr_warn_ratelimited( + "VFS: Dirty inode writeback failed for block device %pg (err=%d).\n", + bdev, ret); + spin_lock(&inode->i_lock); + } + spin_unlock(&inode->i_lock); +} + +/* Kill _all_ buffers and pagecache , dirty or not.. */ +static void kill_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping_empty(mapping)) + return; + + invalidate_bh_lrus(); + truncate_inode_pages(mapping, 0); +} + +/* Invalidate clean unused buffers and pagecache. */ +void invalidate_bdev(struct block_device *bdev) +{ + struct address_space *mapping = bdev->bd_inode->i_mapping; + + if (mapping->nrpages) { + invalidate_bh_lrus(); + lru_add_drain_all(); /* make sure all lru add caches are flushed */ + invalidate_mapping_pages(mapping, 0, -1); + } +} +EXPORT_SYMBOL(invalidate_bdev); + +/* + * Drop all buffers & page cache for given bdev range. This function bails + * with error if bdev has other exclusive owner (such as filesystem). + */ +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, + loff_t lstart, loff_t lend) +{ + /* + * If we don't hold exclusive handle for the device, upgrade to it + * while we discard the buffer cache to avoid discarding buffers + * under live filesystem. 
+ */ + if (!(mode & BLK_OPEN_EXCL)) { + int err = bd_prepare_to_claim(bdev, truncate_bdev_range, NULL); + if (err) + goto invalidate; + } + + truncate_inode_pages_range(bdev->bd_inode->i_mapping, lstart, lend); + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(bdev, truncate_bdev_range); + return 0; + +invalidate: + /* + * Someone else has handle exclusively open. Try invalidating instead. + * The 'end' argument is inclusive so the rounding is safe. + */ + return invalidate_inode_pages2_range(bdev->bd_inode->i_mapping, + lstart >> PAGE_SHIFT, + lend >> PAGE_SHIFT); +} + +static void set_init_blocksize(struct block_device *bdev) +{ + unsigned int bsize = bdev_logical_block_size(bdev); + loff_t size = i_size_read(bdev->bd_inode); + + while (bsize < PAGE_SIZE) { + if (size & bsize) + break; + bsize <<= 1; + } + bdev->bd_inode->i_blkbits = blksize_bits(bsize); +} + +int set_blocksize(struct block_device *bdev, int size) +{ + /* Size must be a power of two, and between 512 and PAGE_SIZE */ + if (size > PAGE_SIZE || size < 512 || !is_power_of_2(size)) + return -EINVAL; + + /* Size cannot be smaller than the size supported by the device */ + if (size < bdev_logical_block_size(bdev)) + return -EINVAL; + + /* Don't change the size if it is same as current */ + if (bdev->bd_inode->i_blkbits != blksize_bits(size)) { + sync_blockdev(bdev); + bdev->bd_inode->i_blkbits = blksize_bits(size); + kill_bdev(bdev); + } + return 0; +} + +EXPORT_SYMBOL(set_blocksize); + +int sb_set_blocksize(struct super_block *sb, int size) +{ + if (set_blocksize(sb->s_bdev, size)) + return 0; + /* If we get here, we know size is a power of two + * and its value is between 512 and PAGE_SIZE */ + sb->s_blocksize = size; + sb->s_blocksize_bits = blksize_bits(size); + return sb->s_blocksize; +} + +EXPORT_SYMBOL(sb_set_blocksize); + +int sb_min_blocksize(struct super_block *sb, int size) +{ + int minsize = bdev_logical_block_size(sb->s_bdev); + if (size < minsize) + size = minsize; + return 
sb_set_blocksize(sb, size); +} + +EXPORT_SYMBOL(sb_min_blocksize); + +int sync_blockdev_nowait(struct block_device *bdev) +{ + if (!bdev) + return 0; + return filemap_flush(bdev->bd_inode->i_mapping); +} +EXPORT_SYMBOL_GPL(sync_blockdev_nowait); + +/* + * Write out and wait upon all the dirty data associated with a block + * device via its mapping. Does not take the superblock lock. + */ +int sync_blockdev(struct block_device *bdev) +{ + if (!bdev) + return 0; + return filemap_write_and_wait(bdev->bd_inode->i_mapping); +} +EXPORT_SYMBOL(sync_blockdev); + +int sync_blockdev_range(struct block_device *bdev, loff_t lstart, loff_t lend) +{ + return filemap_write_and_wait_range(bdev->bd_inode->i_mapping, + lstart, lend); +} +EXPORT_SYMBOL(sync_blockdev_range); + +/** + * bdev_freeze - lock a filesystem and force it into a consistent state + * @bdev: blockdevice to lock + * + * If a superblock is found on this device, we take the s_umount semaphore + * on it to make sure nobody unmounts until the snapshot creation is done. + * The reference counter (bd_fsfreeze_count) guarantees that only the last + * unfreeze process can unfreeze the frozen filesystem actually when multiple + * freeze requests arrive simultaneously. It counts up in bdev_freeze() and + * counts down in bdev_thaw(). When it becomes 0, bdev_thaw() performs the + * actual unfreeze. + * + * Return: On success zero is returned, negative error code on failure. 
+ */ +int bdev_freeze(struct block_device *bdev) +{ + int error = 0; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + + if (atomic_inc_return(&bdev->bd_fsfreeze_count) > 1) { + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return 0; + } + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->freeze) { + error = bdev->bd_holder_ops->freeze(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + error = sync_blockdev(bdev); + } + + if (error) + atomic_dec(&bdev->bd_fsfreeze_count); + + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; +} +EXPORT_SYMBOL(bdev_freeze); + +/** + * bdev_thaw - unlock filesystem + * @bdev: blockdevice to unlock + * + * Unlocks the filesystem and marks it writeable again after bdev_freeze(). + * + * Return: On success zero is returned, negative error code on failure. + */ +int bdev_thaw(struct block_device *bdev) +{ + int error = -EINVAL, nr_freeze; + + mutex_lock(&bdev->bd_fsfreeze_mutex); + + /* + * If this returns < 0 it means that @bd_fsfreeze_count was + * already 0 and no decrement was performed. 
+ */ + nr_freeze = atomic_dec_if_positive(&bdev->bd_fsfreeze_count); + if (nr_freeze < 0) + goto out; + + error = 0; + if (nr_freeze > 0) + goto out; + + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->thaw) { + error = bdev->bd_holder_ops->thaw(bdev); + lockdep_assert_not_held(&bdev->bd_holder_lock); + } else { + mutex_unlock(&bdev->bd_holder_lock); + } + + if (error) + atomic_inc(&bdev->bd_fsfreeze_count); +out: + mutex_unlock(&bdev->bd_fsfreeze_mutex); + return error; +} +EXPORT_SYMBOL(bdev_thaw); + +/* + * pseudo-fs + */ + +static __cacheline_aligned_in_smp DEFINE_MUTEX(bdev_lock); +static struct kmem_cache *bdev_cachep __ro_after_init; + +static struct inode *bdev_alloc_inode(struct super_block *sb) +{ + struct bdev_inode *ei = alloc_inode_sb(sb, bdev_cachep, GFP_KERNEL); + + if (!ei) + return NULL; + memset(&ei->bdev, 0, sizeof(ei->bdev)); + return &ei->vfs_inode; +} + +static void bdev_free_inode(struct inode *inode) +{ + struct block_device *bdev = I_BDEV(inode); + + free_percpu(bdev->bd_stats); + kfree(bdev->bd_meta_info); + + if (!bdev_is_partition(bdev)) { + if (bdev->bd_disk && bdev->bd_disk->bdi) + bdi_put(bdev->bd_disk->bdi); + kfree(bdev->bd_disk); + } + + if (MAJOR(bdev->bd_dev) == BLOCK_EXT_MAJOR) + blk_free_ext_minor(MINOR(bdev->bd_dev)); + + kmem_cache_free(bdev_cachep, BDEV_I(inode)); +} + +static void init_once(void *data) +{ + struct bdev_inode *ei = data; + + inode_init_once(&ei->vfs_inode); +} + +static void bdev_evict_inode(struct inode *inode) +{ + truncate_inode_pages_final(&inode->i_data); + invalidate_inode_buffers(inode); /* is it needed here? 
*/ + clear_inode(inode); +} + +static const struct super_operations bdev_sops = { + .statfs = simple_statfs, + .alloc_inode = bdev_alloc_inode, + .free_inode = bdev_free_inode, + .drop_inode = generic_delete_inode, + .evict_inode = bdev_evict_inode, +}; + +static int bd_init_fs_context(struct fs_context *fc) +{ + struct pseudo_fs_context *ctx = init_pseudo(fc, BDEVFS_MAGIC); + if (!ctx) + return -ENOMEM; + fc->s_iflags |= SB_I_CGROUPWB; + ctx->ops = &bdev_sops; + return 0; +} + +static struct file_system_type bd_type = { + .name = "bdev", + .init_fs_context = bd_init_fs_context, + .kill_sb = kill_anon_super, +}; + +struct super_block *blockdev_superblock __ro_after_init; +EXPORT_SYMBOL_GPL(blockdev_superblock); + +void __init bdev_cache_init(void) +{ + int err; + static struct vfsmount *bd_mnt __ro_after_init; + + bdev_cachep = kmem_cache_create("bdev_cache", sizeof(struct bdev_inode), + 0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT| + SLAB_MEM_SPREAD|SLAB_ACCOUNT|SLAB_PANIC), + init_once); + err = register_filesystem(&bd_type); + if (err) + panic("Cannot register bdev pseudo-fs"); + bd_mnt = kern_mount(&bd_type); + if (IS_ERR(bd_mnt)) + panic("Cannot create bdev pseudo-fs"); + blockdev_superblock = bd_mnt->mnt_sb; /* For writeback */ +} + +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno) +{ + struct block_device *bdev; + struct inode *inode; + + inode = new_inode(blockdev_superblock); + if (!inode) + return NULL; + inode->i_mode = S_IFBLK; + inode->i_rdev = 0; + inode->i_data.a_ops = &def_blk_aops; + mapping_set_gfp_mask(&inode->i_data, GFP_USER); + + bdev = I_BDEV(inode); + mutex_init(&bdev->bd_fsfreeze_mutex); + spin_lock_init(&bdev->bd_size_lock); + mutex_init(&bdev->bd_holder_lock); + bdev->bd_partno = partno; + bdev->bd_inode = inode; + bdev->bd_queue = disk->queue; + if (partno) + bdev->bd_has_submit_bio = disk->part0->bd_has_submit_bio; + else + bdev->bd_has_submit_bio = false; + bdev->bd_stats = alloc_percpu(struct disk_stats); + if 
(!bdev->bd_stats) { + iput(inode); + return NULL; + } + bdev->bd_disk = disk; + return bdev; +} + +void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors) +{ + spin_lock(&bdev->bd_size_lock); + i_size_write(bdev->bd_inode, (loff_t)sectors << SECTOR_SHIFT); + bdev->bd_nr_sectors = sectors; + spin_unlock(&bdev->bd_size_lock); +} + +void bdev_add(struct block_device *bdev, dev_t dev) +{ + if (bdev_stable_writes(bdev)) + mapping_set_stable_writes(bdev->bd_inode->i_mapping); + bdev->bd_dev = dev; + bdev->bd_inode->i_rdev = dev; + bdev->bd_inode->i_ino = dev; + insert_inode_hash(bdev->bd_inode); +} + +long nr_blockdev_pages(void) +{ + struct inode *inode; + long ret = 0; + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + ret += inode->i_mapping->nrpages; + spin_unlock(&blockdev_superblock->s_inode_list_lock); + + return ret; +} + +/** + * bd_may_claim - test whether a block device can be claimed + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * @hops: holder ops + * + * Test whether @bdev can be claimed by @holder. + * + * RETURNS: + * %true if @bdev can be claimed, %false otherwise. + */ +static bool bd_may_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) +{ + struct block_device *whole = bdev_whole(bdev); + + lockdep_assert_held(&bdev_lock); + + if (bdev->bd_holder) { + /* + * The same holder can always re-claim. + */ + if (bdev->bd_holder == holder) { + if (WARN_ON_ONCE(bdev->bd_holder_ops != hops)) + return false; + return true; + } + return false; + } + + /* + * If the whole devices holder is set to bd_may_claim, a partition on + * the device is claimed, but not the whole device. 
+ */ + if (whole != bdev && + whole->bd_holder && whole->bd_holder != bd_may_claim) + return false; + return true; +} + +/** + * bd_prepare_to_claim - claim a block device + * @bdev: block device of interest + * @holder: holder trying to claim @bdev + * @hops: holder ops. + * + * Claim @bdev. This function fails if @bdev is already claimed by another + * holder and waits if another claiming is in progress. On success, the caller + * has ownership of bd_claiming and bd_holder[s]. + * + * RETURNS: + * 0 if @bdev can be claimed, -EINVAL if @holder is NULL, -EBUSY otherwise. + */ +int bd_prepare_to_claim(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) +{ + struct block_device *whole = bdev_whole(bdev); + + if (WARN_ON_ONCE(!holder)) + return -EINVAL; +retry: + mutex_lock(&bdev_lock); + /* if someone else claimed, fail */ + if (!bd_may_claim(bdev, holder, hops)) { + mutex_unlock(&bdev_lock); + return -EBUSY; + } + + /* if claiming is already in progress, wait for it to finish */ + if (whole->bd_claiming) { + wait_queue_head_t *wq = bit_waitqueue(&whole->bd_claiming, 0); + DEFINE_WAIT(wait); + + prepare_to_wait(wq, &wait, TASK_UNINTERRUPTIBLE); + mutex_unlock(&bdev_lock); + schedule(); + finish_wait(wq, &wait); + goto retry; + } + + /* yay, all mine */ + whole->bd_claiming = holder; + mutex_unlock(&bdev_lock); + return 0; +} +EXPORT_SYMBOL_GPL(bd_prepare_to_claim); /* only for the loop driver */ + +static void bd_clear_claiming(struct block_device *whole, void *holder) +{ + lockdep_assert_held(&bdev_lock); + /* tell others that we're done */ + BUG_ON(whole->bd_claiming != holder); + whole->bd_claiming = NULL; + wake_up_bit(&whole->bd_claiming, 0); +} + +/** + * bd_finish_claiming - finish claiming of a block device + * @bdev: block device of interest + * @holder: holder that has claimed @bdev + * @hops: block device holder operations + * + * Finish exclusive open of a block device. 
Mark the device as exclusively + * open by the holder and wake up all waiters for exclusive open to finish. + */ +static void bd_finish_claiming(struct block_device *bdev, void *holder, + const struct blk_holder_ops *hops) +{ + struct block_device *whole = bdev_whole(bdev); + + mutex_lock(&bdev_lock); + BUG_ON(!bd_may_claim(bdev, holder, hops)); + /* + * Note that for a whole device bd_holders will be incremented twice, + * and bd_holder will be set to bd_may_claim before being set to holder + */ + whole->bd_holders++; + whole->bd_holder = bd_may_claim; + bdev->bd_holders++; + mutex_lock(&bdev->bd_holder_lock); + bdev->bd_holder = holder; + bdev->bd_holder_ops = hops; + mutex_unlock(&bdev->bd_holder_lock); + bd_clear_claiming(whole, holder); + mutex_unlock(&bdev_lock); +} + +/** + * bd_abort_claiming - abort claiming of a block device + * @bdev: block device of interest + * @holder: holder that has claimed @bdev + * + * Abort claiming of a block device when the exclusive open failed. This can + * also be used when exclusive open is not actually desired and we just needed + * to block other exclusive openers for a while. + */ +void bd_abort_claiming(struct block_device *bdev, void *holder) +{ + mutex_lock(&bdev_lock); + bd_clear_claiming(bdev_whole(bdev), holder); + mutex_unlock(&bdev_lock); +} +EXPORT_SYMBOL(bd_abort_claiming); + +static void bd_end_claim(struct block_device *bdev, void *holder) +{ + struct block_device *whole = bdev_whole(bdev); + bool unblock = false; + + /* + * Release a claim on the device. The holder fields are protected with + * bdev_lock. open_mutex is used to synchronize disk_holder unlinking. 
+ */ + mutex_lock(&bdev_lock); + WARN_ON_ONCE(bdev->bd_holder != holder); + WARN_ON_ONCE(--bdev->bd_holders < 0); + WARN_ON_ONCE(--whole->bd_holders < 0); + if (!bdev->bd_holders) { + mutex_lock(&bdev->bd_holder_lock); + bdev->bd_holder = NULL; + bdev->bd_holder_ops = NULL; + mutex_unlock(&bdev->bd_holder_lock); + if (bdev->bd_write_holder) + unblock = true; + } + if (!whole->bd_holders) + whole->bd_holder = NULL; + mutex_unlock(&bdev_lock); + + /* + * If this was the last claim, remove holder link and unblock evpoll if + * it was a write holder. + */ + if (unblock) { + disk_unblock_events(bdev->bd_disk); + bdev->bd_write_holder = false; + } +} + +static void blkdev_flush_mapping(struct block_device *bdev) +{ + WARN_ON_ONCE(bdev->bd_holders); + sync_blockdev(bdev); + kill_bdev(bdev); + bdev_write_inode(bdev); +} + +static int blkdev_get_whole(struct block_device *bdev, blk_mode_t mode) +{ + struct gendisk *disk = bdev->bd_disk; + int ret; + + if (disk->fops->open) { + ret = disk->fops->open(disk, mode); + if (ret) { + /* avoid ghost partitions on a removed medium */ + if (ret == -ENOMEDIUM && + test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, true); + return ret; + } + } + + if (!atomic_read(&bdev->bd_openers)) + set_init_blocksize(bdev); + if (test_bit(GD_NEED_PART_SCAN, &disk->state)) + bdev_disk_changed(disk, false); + atomic_inc(&bdev->bd_openers); + return 0; +} + +static void blkdev_put_whole(struct block_device *bdev) +{ + if (atomic_dec_and_test(&bdev->bd_openers)) + blkdev_flush_mapping(bdev); + if (bdev->bd_disk->fops->release) + bdev->bd_disk->fops->release(bdev->bd_disk); +} + +static int blkdev_get_part(struct block_device *part, blk_mode_t mode) +{ + struct gendisk *disk = part->bd_disk; + int ret; + + ret = blkdev_get_whole(bdev_whole(part), mode); + if (ret) + return ret; + + ret = -ENXIO; + if (!bdev_nr_sectors(part)) + goto out_blkdev_put; + + if (!atomic_read(&part->bd_openers)) { + disk->open_partitions++; + 
set_init_blocksize(part); + } + atomic_inc(&part->bd_openers); + return 0; + +out_blkdev_put: + blkdev_put_whole(bdev_whole(part)); + return ret; +} + +static void blkdev_put_part(struct block_device *part) +{ + struct block_device *whole = bdev_whole(part); + + if (atomic_dec_and_test(&part->bd_openers)) { + blkdev_flush_mapping(part); + whole->bd_disk->open_partitions--; + } + blkdev_put_whole(whole); +} + +struct block_device *blkdev_get_no_open(dev_t dev) +{ + struct block_device *bdev; + struct inode *inode; + + inode = ilookup(blockdev_superblock, dev); + if (!inode && IS_ENABLED(CONFIG_BLOCK_LEGACY_AUTOLOAD)) { + blk_request_module(dev); + inode = ilookup(blockdev_superblock, dev); + if (inode) + pr_warn_ratelimited( +"block device autoloading is deprecated and will be removed.\n"); + } + if (!inode) + return NULL; + + /* switch from the inode reference to a device mode one: */ + bdev = &BDEV_I(inode)->bdev; + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + bdev = NULL; + iput(inode); + return bdev; +} + +void blkdev_put_no_open(struct block_device *bdev) +{ + put_device(&bdev->bd_device); +} + +static bool bdev_writes_blocked(struct block_device *bdev) +{ + return bdev->bd_writers == -1; +} + +static void bdev_block_writes(struct block_device *bdev) +{ + bdev->bd_writers = -1; +} + +static void bdev_unblock_writes(struct block_device *bdev) +{ + bdev->bd_writers = 0; +} + +static bool bdev_may_open(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return true; + /* Writes blocked? */ + if (mode & BLK_OPEN_WRITE && bdev_writes_blocked(bdev)) + return false; + if (mode & BLK_OPEN_RESTRICT_WRITES && bdev->bd_writers > 0) + return false; + return true; +} + +static void bdev_claim_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Claim exclusive or shared write access. 
*/ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_block_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers++; +} + +static void bdev_yield_write_access(struct block_device *bdev, blk_mode_t mode) +{ + if (bdev_allow_write_mounted) + return; + + /* Yield exclusive or shared write access. */ + if (mode & BLK_OPEN_RESTRICT_WRITES) + bdev_unblock_writes(bdev); + else if (mode & BLK_OPEN_WRITE) + bdev->bd_writers--; +} + +/** + * bdev_open_by_dev - open a block device by device number + * @dev: device number of block device to open + * @mode: open mode (BLK_OPEN_*) + * @holder: exclusive holder identifier + * @hops: holder operations + * + * Open the block device described by device number @dev. If @holder is not + * %NULL, the block device is opened with exclusive access. Exclusive opens may + * nest for the same @holder. + * + * Use this interface ONLY if you really do not have anything better - i.e. when + * you are behind a truly sucky interface and all you are given is a device + * number. Everything else should use bdev_open_by_path(). + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. + */ +struct bdev_handle *bdev_open_by_dev(dev_t dev, blk_mode_t mode, void *holder, + const struct blk_holder_ops *hops) +{ + struct bdev_handle *handle = kmalloc(sizeof(struct bdev_handle), + GFP_KERNEL); + struct block_device *bdev; + bool unblock_events = true; + struct gendisk *disk; + int ret; + + if (!handle) + return ERR_PTR(-ENOMEM); + + ret = devcgroup_check_permission(DEVCG_DEV_BLOCK, + MAJOR(dev), MINOR(dev), + ((mode & BLK_OPEN_READ) ? DEVCG_ACC_READ : 0) | + ((mode & BLK_OPEN_WRITE) ? 
DEVCG_ACC_WRITE : 0)); + if (ret) + goto free_handle; + + /* Blocking writes requires exclusive opener */ + if (mode & BLK_OPEN_RESTRICT_WRITES && !holder) { + ret = -EINVAL; + goto free_handle; + } + + bdev = blkdev_get_no_open(dev); + if (!bdev) { + ret = -ENXIO; + goto free_handle; + } + disk = bdev->bd_disk; + + if (holder) { + mode |= BLK_OPEN_EXCL; + ret = bd_prepare_to_claim(bdev, holder, hops); + if (ret) + goto put_blkdev; + } else { + if (WARN_ON_ONCE(mode & BLK_OPEN_EXCL)) { + ret = -EIO; + goto put_blkdev; + } + } + + disk_block_events(disk); + + mutex_lock(&disk->open_mutex); + ret = -ENXIO; + if (!disk_live(disk)) + goto abort_claiming; + if (!try_module_get(disk->fops->owner)) + goto abort_claiming; + ret = -EBUSY; + if (!bdev_may_open(bdev, mode)) + goto abort_claiming; + if (bdev_is_partition(bdev)) + ret = blkdev_get_part(bdev, mode); + else + ret = blkdev_get_whole(bdev, mode); + if (ret) + goto put_module; + bdev_claim_write_access(bdev, mode); + if (holder) { + bd_finish_claiming(bdev, holder, hops); + + /* + * Block event polling for write claims if requested. Any write + * holder makes the write_holder state stick until all are + * released. This is good enough and tracking individual + * writeable reference is too fragile given the way @mode is + * used in blkdev_get/put(). 
+ */ + if ((mode & BLK_OPEN_WRITE) && !bdev->bd_write_holder && + (disk->event_flags & DISK_EVENT_FLAG_BLOCK_ON_EXCL_WRITE)) { + bdev->bd_write_holder = true; + unblock_events = false; + } + } + mutex_unlock(&disk->open_mutex); + + if (unblock_events) + disk_unblock_events(disk); + handle->bdev = bdev; + handle->holder = holder; + handle->mode = mode; + return handle; +put_module: + module_put(disk->fops->owner); +abort_claiming: + if (holder) + bd_abort_claiming(bdev, holder); + mutex_unlock(&disk->open_mutex); + disk_unblock_events(disk); +put_blkdev: + blkdev_put_no_open(bdev); +free_handle: + kfree(handle); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(bdev_open_by_dev); + +/** + * bdev_open_by_path - open a block device by name + * @path: path to the block device to open + * @mode: open mode (BLK_OPEN_*) + * @holder: exclusive holder identifier + * @hops: holder operations + * + * Open the block device described by the device file at @path. If @holder is + * not %NULL, the block device is opened with exclusive access. Exclusive opens + * may nest for the same @holder. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * Handle with a reference to the block_device on success, ERR_PTR(-errno) on + * failure. + */ +struct bdev_handle *bdev_open_by_path(const char *path, blk_mode_t mode, + void *holder, const struct blk_holder_ops *hops) +{ + struct bdev_handle *handle; + dev_t dev; + int error; + + error = lookup_bdev(path, &dev); + if (error) + return ERR_PTR(error); + + handle = bdev_open_by_dev(dev, mode, holder, hops); + if (!IS_ERR(handle) && (mode & BLK_OPEN_WRITE) && + bdev_read_only(handle->bdev)) { + bdev_release(handle); + return ERR_PTR(-EACCES); + } + + return handle; +} +EXPORT_SYMBOL(bdev_open_by_path); + +void bdev_release(struct bdev_handle *handle) +{ + struct block_device *bdev = handle->bdev; + struct gendisk *disk = bdev->bd_disk; + + /* + * Sync early if it looks like we're the last one. 
If someone else + * opens the block device between now and the decrement of bd_openers + * then we did a sync that we didn't need to, but that's not the end + * of the world and we want to avoid long (could be several minute) + * syncs while holding the mutex. + */ + if (atomic_read(&bdev->bd_openers) == 1) + sync_blockdev(bdev); + + mutex_lock(&disk->open_mutex); + bdev_yield_write_access(bdev, handle->mode); + + if (handle->holder) + bd_end_claim(bdev, handle->holder); + + /* + * Trigger event checking and tell drivers to flush MEDIA_CHANGE + * event. This is to ensure detection of media removal commanded + * from userland - e.g. eject(1). + */ + disk_flush_events(disk, DISK_EVENT_MEDIA_CHANGE); + + if (bdev_is_partition(bdev)) + blkdev_put_part(bdev); + else + blkdev_put_whole(bdev); + mutex_unlock(&disk->open_mutex); + + module_put(disk->fops->owner); + blkdev_put_no_open(bdev); + kfree(handle); +} +EXPORT_SYMBOL(bdev_release); + +/** + * lookup_bdev() - Look up a struct block_device by name. + * @pathname: Name of the block device in the filesystem. + * @dev: Pointer to the block device's dev_t, if found. + * + * Lookup the block device's dev_t at @pathname in the current + * namespace if possible and return it in @dev. + * + * Context: May sleep. + * Return: 0 if succeeded, negative errno otherwise. 
+ */ +int lookup_bdev(const char *pathname, dev_t *dev) +{ + struct inode *inode; + struct path path; + int error; + + if (!pathname || !*pathname) + return -EINVAL; + + error = kern_path(pathname, LOOKUP_FOLLOW, &path); + if (error) + return error; + + inode = d_backing_inode(path.dentry); + error = -ENOTBLK; + if (!S_ISBLK(inode->i_mode)) + goto out_path_put; + error = -EACCES; + if (!may_open_dev(&path)) + goto out_path_put; + + *dev = inode->i_rdev; + error = 0; +out_path_put: + path_put(&path); + return error; +} +EXPORT_SYMBOL(lookup_bdev); + +/** + * bdev_mark_dead - mark a block device as dead + * @bdev: block device to operate on + * @surprise: indicate a surprise removal + * + * Tell the file system that this device or media is dead. If @surprise is set + * to %true the device or media is already gone, if not we are preparing for an + * orderly removal. + * + * This calls into the file system, which then typically syncs out all dirty data + * and writes back inodes and then invalidates any cached data in the inodes on + * the file system. In addition we also invalidate the block device mapping. + */ +void bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->mark_dead) + bdev->bd_holder_ops->mark_dead(bdev, surprise); + else { + mutex_unlock(&bdev->bd_holder_lock); + sync_blockdev(bdev); + } + + invalidate_bdev(bdev); +} +/* + * New drivers should not use this directly. There are some drivers, however, + * that need this for historical reasons. For example, the DASD driver has + * historically had a shutdown to offline mode that doesn't actually remove the + * gendisk that otherwise looks a lot like a safe device removal. 
+ */ +EXPORT_SYMBOL_GPL(bdev_mark_dead); + +void sync_bdevs(bool wait) +{ + struct inode *inode, *old_inode = NULL; + + spin_lock(&blockdev_superblock->s_inode_list_lock); + list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { + struct address_space *mapping = inode->i_mapping; + struct block_device *bdev; + + spin_lock(&inode->i_lock); + if (inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW) || + mapping->nrpages == 0) { + spin_unlock(&inode->i_lock); + continue; + } + __iget(inode); + spin_unlock(&inode->i_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); + /* + * We hold a reference to 'inode' so it couldn't have been + * removed from s_inodes list while we dropped the + * s_inode_list_lock We cannot iput the inode now as we can + * be holding the last reference and we cannot iput it under + * s_inode_list_lock. So we keep the reference and iput it + * later. + */ + iput(old_inode); + old_inode = inode; + bdev = I_BDEV(inode); + + mutex_lock(&bdev->bd_disk->open_mutex); + if (!atomic_read(&bdev->bd_openers)) { + ; /* skip */ + } else if (wait) { + /* + * We keep the error status of individual mapping so + * that applications can catch the writeback error using + * fsync(2). See filemap_fdatawait_keep_errors() for + * details. + */ + filemap_fdatawait_keep_errors(inode->i_mapping); + } else { + filemap_fdatawrite(inode->i_mapping); + } + mutex_unlock(&bdev->bd_disk->open_mutex); + + spin_lock(&blockdev_superblock->s_inode_list_lock); + } + spin_unlock(&blockdev_superblock->s_inode_list_lock); + iput(old_inode); +} + +/* + * Handle STATX_DIOALIGN for block devices. + * + * Note that the inode passed to this is the inode of a block device node file, + * not the block device's internal inode. Therefore it is *not* valid to use + * I_BDEV() here; the block device has to be looked up by i_rdev instead. 
+ */ +void bdev_statx_dioalign(struct inode *inode, struct kstat *stat) +{ + struct block_device *bdev; + + bdev = blkdev_get_no_open(inode->i_rdev); + if (!bdev) + return; + + stat->dio_mem_align = bdev_dma_alignment(bdev) + 1; + stat->dio_offset_align = bdev_logical_block_size(bdev); + stat->result_mask |= STATX_DIOALIGN; + + blkdev_put_no_open(bdev); +} + +static int __init setup_bdev_allow_write_mounted(char *str) +{ + if (kstrtobool(str, &bdev_allow_write_mounted)) + pr_warn("Invalid option string for bdev_allow_write_mounted:" + " '%s'\n", str); + return 1; +} +__setup("bdev_allow_write_mounted=", setup_bdev_allow_write_mounted); diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c index 68882b9b8f11..2c90e5de0acd 100644 --- a/block/bfq-cgroup.c +++ b/block/bfq-cgroup.c @@ -6,13 +6,13 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include <linux/cgroup.h> -#include <linux/elevator.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> #include <linux/sbitmap.h> #include <linux/delay.h> +#include "elevator.h" #include "bfq-iosched.h" #ifdef CONFIG_BFQ_CGROUP_DEBUG @@ -220,51 +220,46 @@ void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) } void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) + blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.queued, op, 1); + blkg_rwstat_add(&bfqg->stats.queued, opf, 1); bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) + if (!(bfqq == bfqg->bfqd->in_service_queue)) bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.queued, op, -1); + blkg_rwstat_add(&bfqg->stats.queued, opf, -1); } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) +void bfqg_stats_update_io_merged(struct 
bfq_group *bfqg, blk_opf_t opf) { - blkg_rwstat_add(&bfqg->stats.merged, op, 1); + blkg_rwstat_add(&bfqg->stats.merged, opf, 1); } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op) + u64 io_start_time_ns, blk_opf_t opf) { struct bfqg_stats *stats = &bfqg->stats; u64 now = ktime_get_ns(); if (now > io_start_time_ns) - blkg_rwstat_add(&stats->service_time, op, + blkg_rwstat_add(&stats->service_time, opf, now - io_start_time_ns); if (io_start_time_ns > start_time_ns) - blkg_rwstat_add(&stats->wait_time, op, + blkg_rwstat_add(&stats->wait_time, opf, io_start_time_ns - start_time_ns); } #else /* CONFIG_BFQ_CGROUP_DEBUG */ -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op) { } -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf) { } +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf) { } void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op) { } + u64 io_start_time_ns, blk_opf_t opf) { } void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } #endif /* CONFIG_BFQ_CGROUP_DEBUG */ @@ -321,18 +316,16 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) static void bfqg_get(struct bfq_group *bfqg) { - bfqg->ref++; + refcount_inc(&bfqg->ref); } static void bfqg_put(struct bfq_group *bfqg) { - bfqg->ref--; - - if (bfqg->ref == 0) + if (refcount_dec_and_test(&bfqg->ref)) kfree(bfqg); } -void bfqg_and_blkg_get(struct bfq_group *bfqg) +static void 
bfqg_and_blkg_get(struct bfq_group *bfqg) { /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ bfqg_get(bfqg); @@ -463,7 +456,7 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) { if (blkg_rwstat_init(&stats->bytes, gfp) || blkg_rwstat_init(&stats->ios, gfp)) - return -ENOMEM; + goto error; #ifdef CONFIG_BFQ_CGROUP_DEBUG if (blkg_rwstat_init(&stats->merged, gfp) || @@ -476,13 +469,15 @@ static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) bfq_stat_init(&stats->dequeue, gfp) || bfq_stat_init(&stats->group_wait_time, gfp) || bfq_stat_init(&stats->idle_time, gfp) || - bfq_stat_init(&stats->empty_time, gfp)) { - bfqg_stats_exit(stats); - return -ENOMEM; - } + bfq_stat_init(&stats->empty_time, gfp)) + goto error; #endif return 0; + +error: + bfqg_stats_exit(stats); + return -ENOMEM; } static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) @@ -502,15 +497,9 @@ static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) bgd = kzalloc(sizeof(*bgd), gfp); if (!bgd) return NULL; - return &bgd->pd; -} - -static void bfq_cpd_init(struct blkcg_policy_data *cpd) -{ - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? 
- CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; + bgd->weight = CGROUP_WEIGHT_DFL; + return &bgd->pd; } static void bfq_cpd_free(struct blkcg_policy_data *cpd) @@ -518,12 +507,12 @@ static void bfq_cpd_free(struct blkcg_policy_data *cpd) kfree(cpd_to_bfqgd(cpd)); } -static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *bfq_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct bfq_group *bfqg; - bfqg = kzalloc_node(sizeof(*bfqg), gfp, q->node); + bfqg = kzalloc_node(sizeof(*bfqg), gfp, disk->node_id); if (!bfqg) return NULL; @@ -533,7 +522,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, struct request_queue *q, } /* see comments in bfq_bic_update_cgroup for why refcounting */ - bfqg_get(bfqg); + refcount_set(&bfqg->ref, 1); return &bfqg->pd; } @@ -547,12 +536,15 @@ static void bfq_pd_init(struct blkg_policy_data *pd) entity->orig_weight = entity->weight = entity->new_weight = d->weight; entity->my_sched_data = &bfqg->sched_data; + entity->last_bfqq_created = NULL; + bfqg->my_entity = entity; /* * the root_group's will be set to NULL * in bfq_init_queue() */ bfqg->bfqd = bfqd; bfqg->active_entities = 0; + bfqg->num_queues_with_pending_reqs = 0; bfqg->rq_pos_tree = RB_ROOT; } @@ -581,28 +573,11 @@ static void bfq_group_set_parent(struct bfq_group *bfqg, entity->sched_data = &parent->sched_data; } -static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, - struct blkcg *blkcg) -{ - struct blkcg_gq *blkg; - - blkg = blkg_lookup(blkcg, bfqd->queue); - if (likely(blkg)) - return blkg_to_bfqg(blkg); - return NULL; -} - -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg) +static void bfq_link_bfqg(struct bfq_data *bfqd, struct bfq_group *bfqg) { - struct bfq_group *bfqg, *parent; + struct bfq_group *parent; struct bfq_entity *entity; - bfqg = bfq_lookup_bfqg(bfqd, blkcg); - - if (unlikely(!bfqg)) - return NULL; - /* * Update 
chain of bfq_groups as we might be handling a leaf group * which, along with some of its relatives, has not been hooked yet @@ -619,8 +594,28 @@ struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, bfq_group_set_parent(curr_bfqg, parent); } } +} - return bfqg; +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + struct bfq_group *bfqg; + + while (blkg) { + if (!blkg->online) { + blkg = blkg->parent; + continue; + } + bfqg = blkg_to_bfqg(blkg); + if (bfqg->pd.online) { + bio_associate_blkg_from_css(bio, &blkg->blkcg->css); + return bfqg; + } + blkg = blkg->parent; + } + bio_associate_blkg_from_css(bio, + &bfqg_to_blkg(bfqd->root_group)->blkcg->css); + return bfqd->root_group; } /** @@ -641,13 +636,33 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg) { struct bfq_entity *entity = &bfqq->entity; + struct bfq_group *old_parent = bfqq_group(bfqq); + bool has_pending_reqs = false; /* + * No point to move bfqq to the same group, which can happen when + * root group is offlined + */ + if (old_parent == bfqg) + return; + + /* + * oom_bfqq is not allowed to move, oom_bfqq will hold ref to root_group + * until elevator exit. + */ + if (bfqq == &bfqd->oom_bfqq) + return; + /* * Get extra reference to prevent bfqq from being freed in * next possible expire or deactivate. */ bfqq->ref++; + if (entity->in_groups_with_pending_reqs) { + has_pending_reqs = true; + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + } + /* If bfqq is empty, then bfq_bfqq_expire also invokes * bfq_del_bfqq_busy, thereby removing bfqq and its entity * from data structures related to current group. 
Otherwise we @@ -662,88 +677,126 @@ void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, false, false); else if (entity->on_st_or_in_serv) bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); - bfqg_and_blkg_put(bfqq_group(bfqq)); + bfqg_and_blkg_put(old_parent); + + if (entity->parent && + entity->parent->last_bfqq_created == bfqq) + entity->parent->last_bfqq_created = NULL; + else if (bfqd->last_bfqq_created == bfqq) + bfqd->last_bfqq_created = NULL; entity->parent = bfqg->my_entity; entity->sched_data = &bfqg->sched_data; /* pin down bfqg and its associated blkg */ bfqg_and_blkg_get(bfqg); + if (has_pending_reqs) + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); + if (bfq_bfqq_busy(bfqq)) { if (unlikely(!bfqd->nonrot_with_queueing)) bfq_pos_tree_add_move(bfqd, bfqq); bfq_activate_bfqq(bfqd, bfqq); } - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) + if (!bfqd->in_service_queue && !bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); /* release extra ref taken above, bfqq may happen to be freed now */ bfq_put_queue(bfqq); } +static void bfq_sync_bfqq_move(struct bfq_data *bfqd, + struct bfq_queue *sync_bfqq, + struct bfq_io_cq *bic, + struct bfq_group *bfqg, + unsigned int act_idx) +{ + struct bfq_queue *bfqq; + + if (!sync_bfqq->new_bfqq && !bfq_bfqq_coop(sync_bfqq)) { + /* We are the only user of this bfqq, just move it */ + if (sync_bfqq->entity.sched_data != &bfqg->sched_data) + bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + return; + } + + /* + * The queue was merged to a different queue. Check + * that the merge chain still belongs to the same + * cgroup. + */ + for (bfqq = sync_bfqq; bfqq; bfqq = bfqq->new_bfqq) + if (bfqq->entity.sched_data != &bfqg->sched_data) + break; + if (bfqq) { + /* + * Some queue changed cgroup so the merge is not valid + * anymore. 
We cannot easily just cancel the merge (by + * clearing new_bfqq) as there may be other processes + * using this queue and holding refs to all queues + * below sync_bfqq->new_bfqq. Similarly if the merge + * already happened, we need to detach from bfqq now + * so that we cannot merge bio to a request from the + * old cgroup. + */ + bfq_put_cooperator(sync_bfqq); + bic_set_bfqq(bic, NULL, true, act_idx); + bfq_release_process_ref(bfqd, sync_bfqq); + } +} + /** - * __bfq_bic_change_cgroup - move @bic to @cgroup. + * __bfq_bic_change_cgroup - move @bic to @bfqg. * @bfqd: the queue descriptor. * @bic: the bic to move. - * @blkcg: the blk-cgroup to move to. + * @bfqg: the group to move to. * * Move bic to blkcg, assuming that bfqd->lock is held; which makes * sure that the reference to cgroup is valid across the call (see * comments in bfq_bic_update_cgroup on this issue) - * - * NOTE: an alternative approach might have been to store the current - * cgroup in bfqq and getting a reference to it, reducing the lookup - * time here, at the price of slightly more complex code. 
*/ -static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - struct bfq_io_cq *bic, - struct blkcg *blkcg) +static void __bfq_bic_change_cgroup(struct bfq_data *bfqd, + struct bfq_io_cq *bic, + struct bfq_group *bfqg) { - struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); - struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); - struct bfq_group *bfqg; - struct bfq_entity *entity; - - bfqg = bfq_find_set_group(bfqd, blkcg); + unsigned int act_idx; - if (unlikely(!bfqg)) - bfqg = bfqd->root_group; + for (act_idx = 0; act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *async_bfqq = bic_to_bfqq(bic, false, act_idx); + struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, true, act_idx); - if (async_bfqq) { - entity = &async_bfqq->entity; - - if (entity->sched_data != &bfqg->sched_data) { - bic_set_bfqq(bic, NULL, 0); + if (async_bfqq && + async_bfqq->entity.sched_data != &bfqg->sched_data) { + bic_set_bfqq(bic, NULL, false, act_idx); bfq_release_process_ref(bfqd, async_bfqq); } - } - if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) - bfq_bfqq_move(bfqd, sync_bfqq, bfqg); + if (sync_bfqq) + bfq_sync_bfqq_move(bfqd, sync_bfqq, bic, bfqg, act_idx); } - - return bfqg; } void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) { struct bfq_data *bfqd = bic_to_bfqd(bic); - struct bfq_group *bfqg = NULL; + struct bfq_group *bfqg = bfq_bio_bfqg(bfqd, bio); uint64_t serial_nr; - rcu_read_lock(); - serial_nr = __bio_blkcg(bio)->css.serial_nr; + serial_nr = bfqg_to_blkg(bfqg)->blkcg->css.serial_nr; /* * Check whether blkcg has changed. The condition may trigger * spuriously on a newly created cic but there's no harm. */ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) - goto out; + return; - bfqg = __bfq_bic_change_cgroup(bfqd, bic, __bio_blkcg(bio)); + /* + * New cgroup for this process. Make sure it is linked to bfq internal + * cgroup hierarchy. 
+ */ + bfq_link_bfqg(bfqd, bfqg); + __bfq_bic_change_cgroup(bfqd, bic, bfqg); /* * Update blkg_path for bfq_log_* functions. We cache this * path, and update it here, for the following @@ -796,8 +849,6 @@ void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) */ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); bic->blkcg_serial_nr = serial_nr; -out: - rcu_read_unlock(); } /** @@ -817,6 +868,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) * @bfqd: the device data structure with the root group. * @entity: the entity to move, if entity is a leaf; or the parent entity * of an active leaf entity to move, if entity is not a leaf. + * @ioprio_class: I/O priority class to reparent. */ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, struct bfq_entity *entity, @@ -846,6 +898,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, * @bfqd: the device data structure with the root group. * @bfqg: the group to move from. * @st: the service tree to start the search from. + * @ioprio_class: I/O priority class to reparent. 
*/ static void bfq_reparent_active_queues(struct bfq_data *bfqd, struct bfq_group *bfqg, @@ -1052,9 +1105,11 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, struct bfq_group *bfqg; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_bfq, &ctx); if (ret) - return ret; + goto out; if (sscanf(ctx.body, "%llu", &v) == 1) { /* require "default" on dfl */ @@ -1076,7 +1131,7 @@ static ssize_t bfq_io_set_device_weight(struct kernfs_open_file *of, ret = 0; } out: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1230,7 +1285,7 @@ struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) { int ret; - ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); + ret = blkcg_activate_policy(bfqd->queue->disk, &blkcg_policy_bfq); if (ret) return NULL; @@ -1242,8 +1297,6 @@ struct blkcg_policy blkcg_policy_bfq = { .legacy_cftypes = bfq_blkcg_legacy_files, .cpd_alloc_fn = bfq_cpd_alloc, - .cpd_init_fn = bfq_cpd_init, - .cpd_bind_fn = bfq_cpd_init, .cpd_free_fn = bfq_cpd_free, .pd_alloc_fn = bfq_pd_alloc, @@ -1414,7 +1467,7 @@ void bfq_end_wr_async(struct bfq_data *bfqd) bfq_end_wr_async_queues(bfqd, bfqd->root_group); } -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, struct blkcg *blkcg) +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio) { return bfqd->root_group; } @@ -1424,8 +1477,6 @@ struct bfq_group *bfqq_group(struct bfq_queue *bfqq) return bfqq->bfqd->root_group; } -void bfqg_and_blkg_get(struct bfq_group *bfqg) {} - void bfqg_and_blkg_put(struct bfq_group *bfqg) {} struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c index 50c8f034c01c..3cce6de464a7 100644 --- a/block/bfq-iosched.c +++ b/block/bfq-iosched.c @@ -117,7 +117,6 @@ #include <linux/slab.h> #include <linux/blkdev.h> #include 
<linux/cgroup.h> -#include <linux/elevator.h> #include <linux/ktime.h> #include <linux/rbtree.h> #include <linux/ioprio.h> @@ -125,9 +124,11 @@ #include <linux/delay.h> #include <linux/backing-dev.h> +#include <trace/events/block.h> + +#include "elevator.h" #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" #include "bfq-iosched.h" #include "blk-wbt.h" @@ -158,10 +159,9 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS \ -/* Expiration time of sync (0) and async (1) requests, in ns. */ +/* Expiration time of async (0) and sync (1) requests, in ns. */ static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; /* Maximum backwards seek (magic number lifted from CFQ), in KiB. */ @@ -363,17 +363,74 @@ static int ref_wr_duration[2]; */ static const unsigned long max_service_from_wr = 120000; -#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) +/* + * Maximum time between the creation of two queues, for stable merge + * to be activated (in ms) + */ +static const unsigned long bfq_activation_stable_merging = 600; +/* + * Minimum time to be waited before evaluating delayed stable merge (in ms) + */ +static const unsigned long bfq_late_stable_merging = 600; + +#define RQ_BIC(rq) ((struct bfq_io_cq *)((rq)->elv.priv[0])) #define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - return bic->bfqq[is_sync]; + if (is_sync) + return bic->bfqq[1][actuator_idx]; + + return bic->bfqq[0][actuator_idx]; } -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync) +static void bfq_put_stable_ref(struct bfq_queue *bfqq); + +void bic_set_bfqq(struct bfq_io_cq *bic, + struct bfq_queue *bfqq, + bool is_sync, + unsigned int actuator_idx) { - bic->bfqq[is_sync] = 
bfqq; + struct bfq_queue *old_bfqq = bic->bfqq[is_sync][actuator_idx]; + + /* + * If bfqq != NULL, then a non-stable queue merge between + * bic->bfqq and bfqq is happening here. This causes troubles + * in the following case: bic->bfqq has also been scheduled + * for a possible stable merge with bic->stable_merge_bfqq, + * and bic->stable_merge_bfqq == bfqq happens to + * hold. Troubles occur because bfqq may then undergo a split, + * thereby becoming eligible for a stable merge. Yet, if + * bic->stable_merge_bfqq points exactly to bfqq, then bfqq + * would be stably merged with itself. To avoid this anomaly, + * we cancel the stable merge if + * bic->stable_merge_bfqq == bfqq. + */ + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[actuator_idx]; + + /* Clear bic pointer if bfqq is detached from this bic */ + if (old_bfqq && old_bfqq->bic == bic) + old_bfqq->bic = NULL; + + if (is_sync) + bic->bfqq[1][actuator_idx] = bfqq; + else + bic->bfqq[0][actuator_idx] = bfqq; + + if (bfqq && bfqq_data->stable_merge_bfqq == bfqq) { + /* + * Actually, these same instructions are executed also + * in bfq_setup_cooperator, in case of abort or actual + * execution of a stable merge. We could avoid + * repeating these instructions there too, but if we + * did so, we would nest even more complexity in this + * function. + */ + bfq_put_stable_ref(bfqq_data->stable_merge_bfqq); + + bfqq_data->stable_merge_bfqq = NULL; + } } struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) @@ -393,26 +450,21 @@ static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) /** * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. - * @bfqd: the lookup key. - * @ioc: the io_context of the process doing I/O. * @q: the request queue. 
*/ -static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, - struct io_context *ioc, - struct request_queue *q) +static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) { - if (ioc) { - unsigned long flags; - struct bfq_io_cq *icq; + struct bfq_io_cq *icq; + unsigned long flags; - spin_lock_irqsave(&q->queue_lock, flags); - icq = icq_to_bic(ioc_lookup_icq(ioc, q)); - spin_unlock_irqrestore(&q->queue_lock, flags); + if (!current->io_context) + return NULL; - return icq; - } + spin_lock_irqsave(&q->queue_lock, flags); + icq = icq_to_bic(ioc_lookup_icq(q)); + spin_unlock_irqrestore(&q->queue_lock, flags); - return NULL; + return icq; } /* @@ -421,6 +473,8 @@ static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, */ void bfq_schedule_dispatch(struct bfq_data *bfqd) { + lockdep_assert_held(&bfqd->lock); + if (bfqd->queued != 0) { bfq_log(bfqd, "schedule dispatch"); blk_mq_run_hw_queues(bfqd->queue, true); @@ -525,26 +579,149 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, } } +#define BFQ_LIMIT_INLINE_DEPTH 16 + +#ifdef CONFIG_BFQ_GROUP_IOSCHED +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + struct bfq_data *bfqd = bfqq->bfqd; + struct bfq_entity *entity = &bfqq->entity; + struct bfq_entity *inline_entities[BFQ_LIMIT_INLINE_DEPTH]; + struct bfq_entity **entities = inline_entities; + int depth, level, alloc_depth = BFQ_LIMIT_INLINE_DEPTH; + int class_idx = bfqq->ioprio_class - 1; + struct bfq_sched_data *sched_data; + unsigned long wsum; + bool ret = false; + + if (!entity->on_st_or_in_serv) + return false; + +retry: + spin_lock_irq(&bfqd->lock); + /* +1 for bfqq entity, root cgroup not included */ + depth = bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css.cgroup->level + 1; + if (depth > alloc_depth) { + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + entities = kmalloc_array(depth, sizeof(*entities), GFP_NOIO); + if (!entities) + return false; + alloc_depth = depth; + 
goto retry; + } + + sched_data = entity->sched_data; + /* Gather our ancestors as we need to traverse them in reverse order */ + level = 0; + for_each_entity(entity) { + /* + * If at some level entity is not even active, allow request + * queueing so that BFQ knows there's work to do and activate + * entities. + */ + if (!entity->on_st_or_in_serv) + goto out; + /* Uh, more parents than cgroup subsystem thinks? */ + if (WARN_ON_ONCE(level >= depth)) + break; + entities[level++] = entity; + } + WARN_ON_ONCE(level != depth); + for (level--; level >= 0; level--) { + entity = entities[level]; + if (level > 0) { + wsum = bfq_entity_service_tree(entity)->wsum; + } else { + int i; + /* + * For bfqq itself we take into account service trees + * of all higher priority classes and multiply their + * weights so that low prio queue from higher class + * gets more requests than high prio queue from lower + * class. + */ + wsum = 0; + for (i = 0; i <= class_idx; i++) { + wsum = wsum * IOPRIO_BE_NR + + sched_data->service_tree[i].wsum; + } + } + if (!wsum) + continue; + limit = DIV_ROUND_CLOSEST(limit * entity->weight, wsum); + if (entity->allocated >= limit) { + bfq_log_bfqq(bfqq->bfqd, bfqq, + "too many requests: allocated %d limit %d level %d", + entity->allocated, limit, level); + ret = true; + break; + } + } +out: + spin_unlock_irq(&bfqd->lock); + if (entities != inline_entities) + kfree(entities); + return ret; +} +#else +static bool bfqq_request_over_limit(struct bfq_queue *bfqq, int limit) +{ + return false; +} +#endif + /* * Async I/O can easily starve sync I/O (both sync reads and sync * writes), by consuming all tags. Similarly, storms of sync writes, * such as those that sync(2) may trigger, can starve sync reads. * Limit depths of async I/O and sync writes so as to counter both * problems. + * + * Also if a bfq queue or its parent cgroup consume more tags than would be + * appropriate for their weight, we trim the available tag depth to 1. 
This + * avoids a situation where one cgroup can starve another cgroup from tags and + * thus block service differentiation among cgroups. Note that because the + * queue / cgroup already has many requests allocated and queued, this does not + * significantly affect service guarantees coming from the BFQ scheduling + * algorithm. */ -static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +static void bfq_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { struct bfq_data *bfqd = data->q->elevator->elevator_data; + struct bfq_io_cq *bic = bfq_bic_lookup(data->q); + int depth; + unsigned limit = data->q->nr_requests; + unsigned int act_idx; + + /* Sync reads have full depth available */ + if (op_is_sync(opf) && !op_is_write(opf)) { + depth = 0; + } else { + depth = bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(opf)]; + limit = (limit * depth) >> bfqd->full_depth_shift; + } - if (op_is_sync(op) && !op_is_write(op)) - return; - - data->shallow_depth = - bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; + for (act_idx = 0; bic && act_idx < bfqd->num_actuators; act_idx++) { + struct bfq_queue *bfqq = + bic_to_bfqq(bic, op_is_sync(opf), act_idx); + /* + * Does queue (or any parent entity) exceed number of + * requests that should be available to it? Heavily + * limit depth so that it cannot consume more + * available requests and thus starve other entities. 
+ */ + if (bfqq && bfqq_request_over_limit(bfqq, limit)) { + depth = 1; + break; + } + } bfq_log(bfqd, "[%s] wr_busy %d sync %d depth %u", - __func__, bfqd->wr_busy_queues, op_is_sync(op), - data->shallow_depth); + __func__, bfqd->wr_busy_queues, op_is_sync(opf), depth); + if (depth) + data->shallow_depth = depth; } static struct bfq_queue * @@ -631,7 +808,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) if (!bfqq->next_rq) return; - bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + bfqq->pos_root = &bfqq_group(bfqq)->rq_pos_tree; __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, blk_rq_pos(bfqq->next_rq), &parent, &p); if (!__bfqq) { @@ -669,7 +846,7 @@ bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) * much easier to maintain the needed state: * 1) all active queues have the same weight, * 2) all active queues belong to the same I/O-priority class, - * 3) there are no active groups. + * 3) there is at most one active group. * In particular, the last condition is always true if hierarchical * support or the cgroups interface are not enabled, thus no state * needs to be maintained in this case. @@ -701,7 +878,7 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, return varied_queue_weights || multiple_classes_busy #ifdef CONFIG_BFQ_GROUP_IOSCHED - || bfqd->num_groups_with_pending_reqs > 0 + || bfqd->num_groups_with_pending_reqs > 1 #endif ; } @@ -719,9 +896,9 @@ static bool bfq_asymmetric_scenario(struct bfq_data *bfqd, * In most scenarios, the rate at which nodes are created/destroyed * should be low too. 
*/ -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_add(struct bfq_queue *bfqq) { + struct rb_root_cached *root = &bfqq->bfqd->queue_weights_tree; struct bfq_entity *entity = &bfqq->entity; struct rb_node **new = &(root->rb_root.rb_node), *parent = NULL; bool leftmost = true; @@ -793,13 +970,14 @@ inc_counter: * See the comments to the function bfq_weights_tree_add() for considerations * about overhead. */ -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root) +void bfq_weights_tree_remove(struct bfq_queue *bfqq) { + struct rb_root_cached *root; + if (!bfqq->weight_counter) return; + root = &bfqq->bfqd->queue_weights_tree; bfqq->weight_counter->num_active--; if (bfqq->weight_counter->num_active > 0) goto reset_entity_pointer; @@ -813,59 +991,6 @@ reset_entity_pointer: } /* - * Invoke __bfq_weights_tree_remove on bfqq and decrement the number - * of active groups for each queue's inactive parent entity. - */ -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq) -{ - struct bfq_entity *entity = bfqq->entity.parent; - - for_each_entity(entity) { - struct bfq_sched_data *sd = entity->my_sched_data; - - if (sd->next_in_service || sd->in_service_entity) { - /* - * entity is still active, because either - * next_in_service or in_service_entity is not - * NULL (see the comments on the definition of - * next_in_service for details on why - * in_service_entity must be checked too). - * - * As a consequence, its parent entities are - * active as well, and thus this loop must - * stop here. - */ - break; - } - - /* - * The decrement of num_groups_with_pending_reqs is - * not performed immediately upon the deactivation of - * entity, but it is delayed to when it also happens - * that the first leaf descendant bfqq of entity gets - * all its pending requests completed. 
The following - * instructions perform this delayed decrement, if - * needed. See the comments on - * num_groups_with_pending_reqs for details. - */ - if (entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = false; - bfqd->num_groups_with_pending_reqs--; - } - } - - /* - * Next function is invoked last, because it causes bfqq to be - * freed if the following holds: bfqq is not in service and - * has no dispatched request. DO NOT use bfqq after the next - * function invocation. - */ - __bfq_weights_tree_remove(bfqd, bfqq, - &bfqd->queue_weights_tree); -} - -/* * Return expired entry, or NULL to just start from scratch in rbtree. */ static struct request *bfq_check_fifo(struct bfq_queue *bfqq, @@ -969,9 +1094,6 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) { u64 dur; - if (bfqd->bfq_wr_max_time > 0) - return bfqd->bfq_wr_max_time; - dur = bfqd->rate_dur_prod; do_div(dur, bfqd->peak_rate); @@ -1011,25 +1133,41 @@ static void bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, struct bfq_io_cq *bic, bool bfq_already_existing) { - unsigned int old_wr_coeff = bfqq->wr_coeff; + unsigned int old_wr_coeff = 1; bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; - if (bic->saved_has_short_ttime) + if (bfqq_data->saved_has_short_ttime) bfq_mark_bfqq_has_short_ttime(bfqq); else bfq_clear_bfqq_has_short_ttime(bfqq); - if (bic->saved_IO_bound) + if (bfqq_data->saved_IO_bound) bfq_mark_bfqq_IO_bound(bfqq); else bfq_clear_bfqq_IO_bound(bfqq); - bfqq->entity.new_weight = bic->saved_weight; - bfqq->ttime = bic->saved_ttime; - bfqq->wr_coeff = bic->saved_wr_coeff; - bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; - bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; - bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; + bfqq->last_serv_time_ns = bfqq_data->saved_last_serv_time_ns; 
+ bfqq->inject_limit = bfqq_data->saved_inject_limit; + bfqq->decrease_time_jif = bfqq_data->saved_decrease_time_jif; + + bfqq->entity.new_weight = bfqq_data->saved_weight; + bfqq->ttime = bfqq_data->saved_ttime; + bfqq->io_start_time = bfqq_data->saved_io_start_time; + bfqq->tot_idle_time = bfqq_data->saved_tot_idle_time; + /* + * Restore weight coefficient only if low_latency is on + */ + if (bfqd->low_latency) { + old_wr_coeff = bfqq->wr_coeff; + bfqq->wr_coeff = bfqq_data->saved_wr_coeff; + } + bfqq->service_from_wr = bfqq_data->saved_service_from_wr; + bfqq->wr_start_at_switch_to_srt = + bfqq_data->saved_wr_start_at_switch_to_srt; + bfqq->last_wr_start_finish = bfqq_data->saved_last_wr_start_finish; + bfqq->wr_cur_max_time = bfqq_data->saved_wr_cur_max_time; if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || time_is_before_jiffies(bfqq->last_wr_start_finish + @@ -1060,8 +1198,9 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, static int bfqq_process_refs(struct bfq_queue *bfqq) { - return bfqq->ref - bfqq->allocated - bfqq->entity.on_st_or_in_serv - - (bfqq->weight_counter != NULL); + return bfqq->ref - bfqq->entity.allocated - + bfqq->entity.on_st_or_in_serv - + (bfqq->weight_counter != NULL) - bfqq->stable_ref; } /* Empty burst list and add just bfqq (see comments on bfq_handle_burst) */ @@ -1647,6 +1786,35 @@ static bool bfq_bfqq_higher_class_or_weight(struct bfq_queue *bfqq, return bfqq_weight > in_serv_weight; } +/* + * Get the index of the actuator that will serve bio. 
+ */ +static unsigned int bfq_actuator_index(struct bfq_data *bfqd, struct bio *bio) +{ + unsigned int i; + sector_t end; + + /* no search needed if one or zero ranges present */ + if (bfqd->num_actuators == 1) + return 0; + + /* bio_end_sector(bio) gives the sector after the last one */ + end = bio_end_sector(bio) - 1; + + for (i = 0; i < bfqd->num_actuators; i++) { + if (end >= bfqd->sector[i] && + end < bfqd->sector[i] + bfqd->nr_sectors[i]) + return i; + } + + WARN_ONCE(true, + "bfq_actuator_index: bio sector out of ranges: end=%llu\n", + end); + return 0; +} + +static bool bfq_better_to_idle(struct bfq_queue *bfqq); + static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, struct bfq_queue *bfqq, int old_wr_coeff, @@ -1664,26 +1832,44 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, arrived_in_time = ktime_get_ns() <= bfqq->ttime.last_end_request + bfqd->bfq_slice_idle * 3; - + unsigned int act_idx = bfq_actuator_index(bfqd, rq->bio); + bool bfqq_non_merged_or_stably_merged = + bfqq->bic || RQ_BIC(rq)->bfqq_data[act_idx].stably_merged; /* * bfqq deserves to be weight-raised if: * - it is sync, * - it does not belong to a large burst, * - it has been idle for enough time or is soft real-time, - * - is linked to a bfq_io_cq (it is not shared in any sense). + * - is linked to a bfq_io_cq (it is not shared in any sense), + * - has a default weight (otherwise we assume the user wanted + * to control its weight explicitly) */ in_burst = bfq_bfqq_in_large_burst(bfqq); soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && !BFQQ_TOTALLY_SEEKY(bfqq) && !in_burst && time_is_before_jiffies(bfqq->soft_rt_next_start) && - bfqq->dispatched == 0; - *interactive = !in_burst && idle_for_long_time; + bfqq->dispatched == 0 && + bfqq->entity.new_weight == 40; + *interactive = !in_burst && idle_for_long_time && + bfqq->entity.new_weight == 40; + /* + * Merged bfq_queues are kept out of weight-raising + * (low-latency) mechanisms. 
The reason is that these queues + * are usually created for non-interactive and + * non-soft-real-time tasks. Yet this is not the case for + * stably-merged queues. These queues are merged just because + * they are created shortly after each other. So they may + * easily serve the I/O of an interactive or soft-real time + * application, if the application happens to spawn multiple + * processes. So let also stably-merged queued enjoy weight + * raising. + */ wr_or_deserves_wr = bfqd->low_latency && (bfqq->wr_coeff > 1 || - (bfq_bfqq_sync(bfqq) && - bfqq->bic && (*interactive || soft_rt))); + (bfq_bfqq_sync(bfqq) && bfqq_non_merged_or_stably_merged && + (*interactive || soft_rt))); /* * Using the last flag, update budget and check whether bfqq @@ -1717,17 +1903,6 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfq_clear_bfqq_just_created(bfqq); - - if (!bfq_bfqq_IO_bound(bfqq)) { - if (arrived_in_time) { - bfqq->requests_within_timer++; - if (bfqq->requests_within_timer >= - bfqd->bfq_requests_within_timer) - bfq_mark_bfqq_IO_bound(bfqq); - } else - bfqq->requests_within_timer = 0; - } - if (bfqd->low_latency) { if (unlikely(time_is_after_jiffies(bfqq->split_time))) /* wraparound */ @@ -1752,13 +1927,13 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, bfqq->service_from_backlogged = 0; bfq_clear_bfqq_softrt_update(bfqq); - bfq_add_bfqq_busy(bfqd, bfqq); + bfq_add_bfqq_busy(bfqq); /* - * Expire in-service queue only if preemption may be needed - * for guarantees. In particular, we care only about two - * cases. The first is that bfqq has to recover a service - * hole, as explained in the comments on + * Expire in-service queue if preemption may be needed for + * guarantees or throughput. As for guarantees, we care + * explicitly about two cases. The first is that bfqq has to + * recover a service hole, as explained in the comments on * bfq_bfqq_update_budg_for_activation(), i.e., that * bfqq_wants_to_preempt is true. 
However, if bfqq does not * carry time-critical I/O, then bfqq's bandwidth is less @@ -1785,11 +1960,23 @@ static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, * timestamps of the in-service queue would need to be * updated, and this operation is quite costly (see the * comments on bfq_bfqq_update_budg_for_activation()). + * + * As for throughput, we ask bfq_better_to_idle() whether we + * still need to plug I/O dispatching. If bfq_better_to_idle() + * says no, then plugging is not needed any longer, either to + * boost throughput or to perserve service guarantees. Then + * the best option is to stop plugging I/O, as not doing so + * would certainly lower throughput. We may end up in this + * case if: (1) upon a dispatch attempt, we detected that it + * was better to plug I/O dispatch, and to wait for a new + * request to arrive for the currently in-service queue, but + * (2) this switch of bfqq to busy changes the scenario. */ if (bfqd->in_service_queue && ((bfqq_wants_to_preempt && bfqq->wr_coeff >= bfqd->in_service_queue->wr_coeff) || - bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue)) && + bfq_bfqq_higher_class_or_weight(bfqq, bfqd->in_service_queue) || + !bfq_better_to_idle(bfqd->in_service_queue)) && next_queue_may_preempt(bfqd)) bfq_bfqq_expire(bfqd, bfqd->in_service_queue, false, BFQQE_PREEMPTED); @@ -1861,6 +2048,159 @@ static void bfq_reset_inject_limit(struct bfq_data *bfqd, bfqq->decrease_time_jif = jiffies; } +static void bfq_update_io_intensity(struct bfq_queue *bfqq, u64 now_ns) +{ + u64 tot_io_time = now_ns - bfqq->io_start_time; + + if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq->dispatched == 0) + bfqq->tot_idle_time += + now_ns - bfqq->ttime.last_end_request; + + if (unlikely(bfq_bfqq_just_created(bfqq))) + return; + + /* + * Must be busy for at least about 80% of the time to be + * considered I/O bound. 
+ */ + if (bfqq->tot_idle_time * 5 > tot_io_time) + bfq_clear_bfqq_IO_bound(bfqq); + else + bfq_mark_bfqq_IO_bound(bfqq); + + /* + * Keep an observation window of at most 200 ms in the past + * from now. + */ + if (tot_io_time > 200 * NSEC_PER_MSEC) { + bfqq->io_start_time = now_ns - (tot_io_time>>1); + bfqq->tot_idle_time >>= 1; + } +} + +/* + * Detect whether bfqq's I/O seems synchronized with that of some + * other queue, i.e., whether bfqq, after remaining empty, happens to + * receive new I/O only right after some I/O request of the other + * queue has been completed. We call waker queue the other queue, and + * we assume, for simplicity, that bfqq may have at most one waker + * queue. + * + * A remarkable throughput boost can be reached by unconditionally + * injecting the I/O of the waker queue, every time a new + * bfq_dispatch_request happens to be invoked while I/O is being + * plugged for bfqq. In addition to boosting throughput, this + * unblocks bfqq's I/O, thereby improving bandwidth and latency for + * bfqq. Note that these same results may be achieved with the general + * injection mechanism, but less effectively. For details on this + * aspect, see the comments on the choice of the queue for injection + * in bfq_select_queue(). + * + * Turning back to the detection of a waker queue, a queue Q is deemed as a + * waker queue for bfqq if, for three consecutive times, bfqq happens to become + * non empty right after a request of Q has been completed within given + * timeout. In this respect, even if bfqq is empty, we do not check for a waker + * if it still has some in-flight I/O. In fact, in this case bfqq is actually + * still being served by the drive, and may receive new I/O on the completion + * of some of the in-flight requests. 
In particular, on the first time, Q is + * tentatively set as a candidate waker queue, while on the third consecutive + * time that Q is detected, the field waker_bfqq is set to Q, to confirm that Q + * is a waker queue for bfqq. These detection steps are performed only if bfqq + * has a long think time, so as to make it more likely that bfqq's I/O is + * actually being blocked by a synchronization. This last filter, plus the + * above three-times requirement and time limit for detection, make false + * positives less likely. + * + * NOTE + * + * The sooner a waker queue is detected, the sooner throughput can be + * boosted by injecting I/O from the waker queue. Fortunately, + * detection is likely to be actually fast, for the following + * reasons. While blocked by synchronization, bfqq has a long think + * time. This implies that bfqq's inject limit is at least equal to 1 + * (see the comments in bfq_update_inject_limit()). So, thanks to + * injection, the waker queue is likely to be served during the very + * first I/O-plugging time interval for bfqq. This triggers the first + * step of the detection mechanism. Thanks again to injection, the + * candidate waker queue is then likely to be confirmed no later than + * during the next I/O-plugging interval for bfqq. + * + * ISSUE + * + * On queue merging all waker information is lost. + */ +static void bfq_check_waker(struct bfq_data *bfqd, struct bfq_queue *bfqq, + u64 now_ns) +{ + char waker_name[MAX_BFQQ_NAME_LENGTH]; + + if (!bfqd->last_completed_rq_bfqq || + bfqd->last_completed_rq_bfqq == bfqq || + bfq_bfqq_has_short_ttime(bfqq) || + now_ns - bfqd->last_completion >= 4 * NSEC_PER_MSEC || + bfqd->last_completed_rq_bfqq == &bfqd->oom_bfqq || + bfqq == &bfqd->oom_bfqq) + return; + + /* + * We reset waker detection logic also if too much time has passed + * since the first detection. If wakeups are rare, pointless idling + * doesn't hurt throughput that much. 
The condition below makes sure + * we do not uselessly idle blocking waker in more than 1/64 cases. + */ + if (bfqd->last_completed_rq_bfqq != + bfqq->tentative_waker_bfqq || + now_ns > bfqq->waker_detection_started + + 128 * (u64)bfqd->bfq_slice_idle) { + /* + * First synchronization detected with a + * candidate waker queue, or with a different + * candidate waker queue from the current one. + */ + bfqq->tentative_waker_bfqq = + bfqd->last_completed_rq_bfqq; + bfqq->num_waker_detections = 1; + bfqq->waker_detection_started = now_ns; + bfq_bfqq_name(bfqq->tentative_waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set tentative waker %s", waker_name); + } else /* Same tentative waker queue detected again */ + bfqq->num_waker_detections++; + + if (bfqq->num_waker_detections == 3) { + bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; + bfqq->tentative_waker_bfqq = NULL; + bfq_bfqq_name(bfqq->waker_bfqq, waker_name, + MAX_BFQQ_NAME_LENGTH); + bfq_log_bfqq(bfqd, bfqq, "set waker %s", waker_name); + + /* + * If the waker queue disappears, then + * bfqq->waker_bfqq must be reset. To + * this goal, we maintain in each + * waker queue a list, woken_list, of + * all the queues that reference the + * waker queue through their + * waker_bfqq pointer. When the waker + * queue exits, the waker_bfqq pointer + * of all the queues in the woken_list + * is reset. + * + * In addition, if bfqq is already in + * the woken_list of a waker queue, + * then, before being inserted into + * the woken_list of a new waker + * queue, bfqq must be removed from + * the woken_list of the old waker + * queue. 
+ */ + if (!hlist_unhashed(&bfqq->woken_list_node)) + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqd->last_completed_rq_bfqq->woken_list); + } +} + static void bfq_add_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); @@ -1868,117 +2208,18 @@ static void bfq_add_request(struct request *rq) struct request *next_rq, *prev; unsigned int old_wr_coeff = bfqq->wr_coeff; bool interactive = false; + u64 now_ns = ktime_get_ns(); bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued + 1); - if (RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_sync(bfqq)) { - /* - * Detect whether bfqq's I/O seems synchronized with - * that of some other queue, i.e., whether bfqq, after - * remaining empty, happens to receive new I/O only - * right after some I/O request of the other queue has - * been completed. We call waker queue the other - * queue, and we assume, for simplicity, that bfqq may - * have at most one waker queue. - * - * A remarkable throughput boost can be reached by - * unconditionally injecting the I/O of the waker - * queue, every time a new bfq_dispatch_request - * happens to be invoked while I/O is being plugged - * for bfqq. In addition to boosting throughput, this - * unblocks bfqq's I/O, thereby improving bandwidth - * and latency for bfqq. Note that these same results - * may be achieved with the general injection - * mechanism, but less effectively. For details on - * this aspect, see the comments on the choice of the - * queue for injection in bfq_select_queue(). 
- * - * Turning back to the detection of a waker queue, a - * queue Q is deemed as a waker queue for bfqq if, for - * two consecutive times, bfqq happens to become non - * empty right after a request of Q has been - * completed. In particular, on the first time, Q is - * tentatively set as a candidate waker queue, while - * on the second time, the flag - * bfq_bfqq_has_waker(bfqq) is set to confirm that Q - * is a waker queue for bfqq. These detection steps - * are performed only if bfqq has a long think time, - * so as to make it more likely that bfqq's I/O is - * actually being blocked by a synchronization. This - * last filter, plus the above two-times requirement, - * make false positives less likely. - * - * NOTE - * - * The sooner a waker queue is detected, the sooner - * throughput can be boosted by injecting I/O from the - * waker queue. Fortunately, detection is likely to be - * actually fast, for the following reasons. While - * blocked by synchronization, bfqq has a long think - * time. This implies that bfqq's inject limit is at - * least equal to 1 (see the comments in - * bfq_update_inject_limit()). So, thanks to - * injection, the waker queue is likely to be served - * during the very first I/O-plugging time interval - * for bfqq. This triggers the first step of the - * detection mechanism. Thanks again to injection, the - * candidate waker queue is then likely to be - * confirmed no later than during the next - * I/O-plugging interval for bfqq. - */ - if (bfqd->last_completed_rq_bfqq && - !bfq_bfqq_has_short_ttime(bfqq) && - ktime_get_ns() - bfqd->last_completion < - 200 * NSEC_PER_USEC) { - if (bfqd->last_completed_rq_bfqq != bfqq && - bfqd->last_completed_rq_bfqq != - bfqq->waker_bfqq) { - /* - * First synchronization detected with - * a candidate waker queue, or with a - * different candidate waker queue - * from the current one. 
- */ - bfqq->waker_bfqq = bfqd->last_completed_rq_bfqq; - - /* - * If the waker queue disappears, then - * bfqq->waker_bfqq must be reset. To - * this goal, we maintain in each - * waker queue a list, woken_list, of - * all the queues that reference the - * waker queue through their - * waker_bfqq pointer. When the waker - * queue exits, the waker_bfqq pointer - * of all the queues in the woken_list - * is reset. - * - * In addition, if bfqq is already in - * the woken_list of a waker queue, - * then, before being inserted into - * the woken_list of a new waker - * queue, bfqq must be removed from - * the woken_list of the old waker - * queue. - */ - if (!hlist_unhashed(&bfqq->woken_list_node)) - hlist_del_init(&bfqq->woken_list_node); - hlist_add_head(&bfqq->woken_list_node, - &bfqd->last_completed_rq_bfqq->woken_list); - - bfq_clear_bfqq_has_waker(bfqq); - } else if (bfqd->last_completed_rq_bfqq == - bfqq->waker_bfqq && - !bfq_bfqq_has_waker(bfqq)) { - /* - * synchronization with waker_bfqq - * seen for the second time - */ - bfq_mark_bfqq_has_waker(bfqq); - } - } + if (bfq_bfqq_sync(bfqq) && RQ_BIC(rq)->requests <= 1) { + bfq_check_waker(bfqd, bfqq, now_ns); /* * Periodically reset inject limit, to make sure that @@ -2016,9 +2257,9 @@ static void bfq_add_request(struct request *rq) * elapsed. */ if (bfqq == bfqd->in_service_queue && - (bfqd->rq_in_driver == 0 || + (bfqd->tot_rq_in_driver == 0 || (bfqq->last_serv_time_ns > 0 && - bfqd->rqs_injected && bfqd->rq_in_driver > 0)) && + bfqd->rqs_injected && bfqd->tot_rq_in_driver > 0)) && time_is_before_eq_jiffies(bfqq->decrease_time_jif + msecs_to_jiffies(10))) { bfqd->last_empty_occupied_ns = ktime_get_ns(); @@ -2042,11 +2283,14 @@ static void bfq_add_request(struct request *rq) * will be set in case injection is performed * on bfqq before rq is completed). 
*/ - if (bfqd->rq_in_driver == 0) + if (bfqd->tot_rq_in_driver == 0) bfqd->rqs_injected = false; } } + if (bfq_bfqq_sync(bfqq)) + bfq_update_io_intensity(bfqq, now_ns); + elv_rb_add(&bfqq->sort_list, rq); /* @@ -2133,22 +2377,6 @@ static sector_t get_sdist(sector_t last_pos, struct request *rq) return 0; } -#if 0 /* Still not clear if we can do without next two functions */ -static void bfq_activate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver++; -} - -static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -{ - struct bfq_data *bfqd = q->elevator->elevator_data; - - bfqd->rq_in_driver--; -} -#endif - static void bfq_remove_request(struct request_queue *q, struct request *rq) { @@ -2164,7 +2392,11 @@ static void bfq_remove_request(struct request_queue *q, if (rq->queuelist.prev != &rq->queuelist) list_del_init(&rq->queuelist); bfqq->queued[sync]--; - bfqd->queued--; + /* + * Updating of 'bfqd->queued' is protected by 'bfqd->lock', however, it + * may be read without holding the lock in bfq_has_work(). + */ + WRITE_ONCE(bfqd->queued, bfqd->queued - 1); elv_rb_del(&bfqq->sort_list, rq); elv_rqhash_del(q, rq); @@ -2175,7 +2407,7 @@ static void bfq_remove_request(struct request_queue *q, bfqq->next_rq = NULL; if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); /* * bfqq emptied. 
In normal operation, when * bfqq is empty, bfqq->entity.service and @@ -2210,10 +2442,9 @@ static void bfq_remove_request(struct request_queue *q, } -static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct request *free = NULL; /* @@ -2223,22 +2454,30 @@ static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, * returned by bfq_bic_lookup does not go away before * bfqd->lock is taken. */ - struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); + struct bfq_io_cq *bic = bfq_bic_lookup(q); bool ret; spin_lock_irq(&bfqd->lock); - if (bic) - bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - else + if (bic) { + /* + * Make sure cgroup info is uptodate for current process before + * considering the merge. + */ + bfq_bic_update_cgroup(bic, bio); + + bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf), + bfq_actuator_index(bfqd, bio)); + } else { bfqd->bio_bfqq = NULL; + } bfqd->bio_bic = bic; ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock_irq(&bfqd->lock); if (free) blk_mq_free_request(free); - spin_unlock_irq(&bfqd->lock); return ret; } @@ -2252,14 +2491,15 @@ static int bfq_request_merge(struct request_queue *q, struct request **req, __rq = bfq_find_rq_fmerge(bfqd, bio, q); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } return ELEVATOR_NO_MERGE; } -static struct bfq_queue *bfq_init_rq(struct request *rq); - static void bfq_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { @@ -2268,7 +2508,7 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, blk_rq_pos(req) < blk_rq_pos(container_of(rb_prev(&req->rb_node), struct request, 
rb_node))) { - struct bfq_queue *bfqq = bfq_init_rq(req); + struct bfq_queue *bfqq = RQ_BFQQ(req); struct bfq_data *bfqd; struct request *prev, *next_rq; @@ -2320,11 +2560,11 @@ static void bfq_request_merged(struct request_queue *q, struct request *req, static void bfq_requests_merged(struct request_queue *q, struct request *rq, struct request *next) { - struct bfq_queue *bfqq = bfq_init_rq(rq), - *next_bfqq = bfq_init_rq(next); + struct bfq_queue *bfqq = RQ_BFQQ(rq), + *next_bfqq = RQ_BFQQ(next); if (!bfqq) - return; + goto remove; /* * If next and rq belong to the same bfq_queue and next is older @@ -2347,11 +2587,37 @@ static void bfq_requests_merged(struct request_queue *q, struct request *rq, bfqq->next_rq = rq; bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); +remove: + /* Merged request may be in the IO scheduler. Remove it. */ + if (!RB_EMPTY_NODE(&next->rb_node)) { + bfq_remove_request(next->q, next); + if (next_bfqq) + bfqg_stats_update_io_remove(bfqq_group(next_bfqq), + next->cmd_flags); + } } /* Must be called with bfqq != NULL */ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) { + /* + * If bfqq has been enjoying interactive weight-raising, then + * reset soft_rt_next_start. We do it for the following + * reason. bfqq may have been conveying the I/O needed to load + * a soft real-time application. Such an application actually + * exhibits a soft real-time I/O pattern after it finishes + * loading, and finally starts doing its job. But, if bfqq has + * been receiving a lot of bandwidth so far (likely to happen + * on a fast device), then soft_rt_next_start now contains a + * high value. So, without this reset, bfqq would be + * prevented from being possibly considered as soft_rt for a + * very long time.
+ */ + + if (bfqq->wr_cur_max_time != + bfqq->bfqd->bfq_wr_rt_max_time) + bfqq->soft_rt_next_start = jiffies; + if (bfq_bfqq_busy(bfqq)) bfqq->bfqd->wr_busy_queues--; bfqq->wr_coeff = 1; @@ -2367,24 +2633,29 @@ static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - if (bfqg->async_bfqq[i][j]) - bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); - if (bfqg->async_idle_bfqq) - bfq_bfqq_end_wr(bfqg->async_idle_bfqq); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + if (bfqg->async_bfqq[i][j][k]) + bfq_bfqq_end_wr(bfqg->async_bfqq[i][j][k]); + if (bfqg->async_idle_bfqq[k]) + bfq_bfqq_end_wr(bfqg->async_idle_bfqq[k]); + } } static void bfq_end_wr(struct bfq_data *bfqd) { struct bfq_queue *bfqq; + int i; spin_lock_irq(&bfqd->lock); - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - bfq_bfqq_end_wr(bfqq); + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + bfq_bfqq_end_wr(bfqq); + } list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) bfq_bfqq_end_wr(bfqq); bfq_end_wr_async(bfqd); @@ -2411,7 +2682,7 @@ static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, struct bfq_queue *bfqq, sector_t sector) { - struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; + struct rb_root *root = &bfqq_group(bfqq)->rq_pos_tree; struct rb_node *parent, *node; struct bfq_queue *__bfqq; @@ -2500,6 +2771,14 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) if (process_refs == 0 || new_process_refs == 0) return NULL; + /* + * Make sure merged queues belong to the same parent. Parents could + * have changed since the time we decided the two queues are suitable + * for merging. 
+ */ + if (new_bfqq->entity.parent != bfqq->entity.parent) + return NULL; + bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", new_bfqq->pid); @@ -2524,6 +2803,15 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) * are likely to increase the throughput. */ bfqq->new_bfqq = new_bfqq; + /* + * The above assignment schedules the following redirections: + * each time some I/O for bfqq arrives, the process that + * generated that I/O is disassociated from bfqq and + * associated with new_bfqq. Here we increase new_bfqq->ref + * in advance, adding the number of processes that are + * expected to be associated with new_bfqq as they happen to + * issue I/O. + */ new_bfqq->ref += process_refs; return new_bfqq; } @@ -2557,6 +2845,43 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, return true; } +static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, + struct bfq_queue *bfqq); + +static struct bfq_queue * +bfq_setup_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_queue *stable_merge_bfqq, + struct bfq_iocq_bfqq_data *bfqq_data) +{ + int proc_ref = min(bfqq_process_refs(bfqq), + bfqq_process_refs(stable_merge_bfqq)); + struct bfq_queue *new_bfqq = NULL; + + bfqq_data->stable_merge_bfqq = NULL; + if (idling_boosts_thr_without_issues(bfqd, bfqq) || proc_ref == 0) + goto out; + + /* next function will take at least one ref */ + new_bfqq = bfq_setup_merge(bfqq, stable_merge_bfqq); + + if (new_bfqq) { + bfqq_data->stably_merged = true; + if (new_bfqq->bic) { + unsigned int new_a_idx = new_bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *new_bfqq_data = + &new_bfqq->bic->bfqq_data[new_a_idx]; + + new_bfqq_data->stably_merged = true; + } + } + +out: + /* deschedule stable merge, because done or aborted here */ + bfq_put_stable_ref(stable_merge_bfqq); + + return new_bfqq; +} + /* * Attempt to schedule a merge of bfqq with the currently in-service * queue or with a close queue among the
scheduled queues. Return @@ -2579,9 +2904,46 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, */ static struct bfq_queue * bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - void *io_struct, bool request) + void *io_struct, bool request, struct bfq_io_cq *bic) { struct bfq_queue *in_service_bfqq, *new_bfqq; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; + + /* if a merge has already been setup, then proceed with that first */ + if (bfqq->new_bfqq) + return bfqq->new_bfqq; + + /* + * Check delayed stable merge for rotational or non-queueing + * devs. For this branch to be executed, bfqq must not be + * currently merged with some other queue (i.e., bfqq->bic + * must be non null). If we considered also merged queues, + * then we should also check whether bfqq has already been + * merged with bic->stable_merge_bfqq. But this would be + * costly and complicated. + */ + if (unlikely(!bfqd->nonrot_with_queueing)) { + /* + * Make sure also that bfqq is sync, because + * bic->stable_merge_bfqq may point to some queue (for + * stable merging) also if bic is associated with a + * sync queue, but this bfqq is async + */ + if (bfq_bfqq_sync(bfqq) && bfqq_data->stable_merge_bfqq && + !bfq_bfqq_just_created(bfqq) && + time_is_before_jiffies(bfqq->split_time + + msecs_to_jiffies(bfq_late_stable_merging)) && + time_is_before_jiffies(bfqq->creation_time + + msecs_to_jiffies(bfq_late_stable_merging))) { + struct bfq_queue *stable_merge_bfqq = + bfqq_data->stable_merge_bfqq; + + return bfq_setup_stable_merge(bfqd, bfqq, + stable_merge_bfqq, + bfqq_data); + } + } /* * Do not perform queue merging if the device is non @@ -2637,9 +2999,6 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfq_too_late_for_merging(bfqq)) return NULL; - if (bfqq->new_bfqq) - return bfqq->new_bfqq; - if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) return NULL; @@ -2677,6 +3036,8 @@ 
bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, static void bfq_bfqq_save_state(struct bfq_queue *bfqq) { struct bfq_io_cq *bic = bfqq->bic; + unsigned int a_idx = bfqq->actuator_idx; + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[a_idx]; /* * If !bfqq->bic, the queue is already shared or its requests @@ -2686,12 +3047,21 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) if (!bic) return; - bic->saved_weight = bfqq->entity.orig_weight; - bic->saved_ttime = bfqq->ttime; - bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); - bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); - bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); - bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); + bfqq_data->saved_last_serv_time_ns = bfqq->last_serv_time_ns; + bfqq_data->saved_inject_limit = bfqq->inject_limit; + bfqq_data->saved_decrease_time_jif = bfqq->decrease_time_jif; + + bfqq_data->saved_weight = bfqq->entity.orig_weight; + bfqq_data->saved_ttime = bfqq->ttime; + bfqq_data->saved_has_short_ttime = + bfq_bfqq_has_short_ttime(bfqq); + bfqq_data->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); + bfqq_data->saved_io_start_time = bfqq->io_start_time; + bfqq_data->saved_tot_idle_time = bfqq->tot_idle_time; + bfqq_data->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); + bfqq_data->was_in_burst_list = + !hlist_unhashed(&bfqq->burst_list_node); + if (unlikely(bfq_bfqq_just_created(bfqq) && !bfq_bfqq_in_large_burst(bfqq) && bfqq->bfqd->low_latency)) { @@ -2704,19 +3074,35 @@ static void bfq_bfqq_save_state(struct bfq_queue *bfqq) * to bfqq, so that to avoid that bfqq unjustly fails * to enjoy weight raising if split soon. 
*/ - bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; - bic->saved_wr_start_at_switch_to_srt = bfq_smallest_from_now(); - bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); - bic->saved_last_wr_start_finish = jiffies; + bfqq_data->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = + bfq_smallest_from_now(); + bfqq_data->saved_wr_cur_max_time = + bfq_wr_duration(bfqq->bfqd); + bfqq_data->saved_last_wr_start_finish = jiffies; } else { - bic->saved_wr_coeff = bfqq->wr_coeff; - bic->saved_wr_start_at_switch_to_srt = + bfqq_data->saved_wr_coeff = bfqq->wr_coeff; + bfqq_data->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; - bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; - bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; + bfqq_data->saved_service_from_wr = + bfqq->service_from_wr; + bfqq_data->saved_last_wr_start_finish = + bfqq->last_wr_start_finish; + bfqq_data->saved_wr_cur_max_time = bfqq->wr_cur_max_time; } } + +static void +bfq_reassign_last_bfqq(struct bfq_queue *cur_bfqq, struct bfq_queue *new_bfqq) +{ + if (cur_bfqq->entity.parent && + cur_bfqq->entity.parent->last_bfqq_created == cur_bfqq) + cur_bfqq->entity.parent->last_bfqq_created = new_bfqq; + else if (cur_bfqq->bfqd && cur_bfqq->bfqd->last_bfqq_created == cur_bfqq) + cur_bfqq->bfqd->last_bfqq_created = new_bfqq; +} + void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) { /* @@ -2732,7 +3118,9 @@ void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq) */ if (bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) && bfqq != bfqd->in_service_queue) - bfq_del_bfqq_busy(bfqd, bfqq, false); + bfq_del_bfqq_busy(bfqq, false); + + bfq_reassign_last_bfqq(bfqq, NULL); bfq_put_queue(bfqq); } @@ -2751,6 +3139,29 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, bfq_clear_bfqq_IO_bound(bfqq); /* + * The processes associated with bfqq are cooperators of the + * processes associated 
with new_bfqq. So, if bfqq has a + * waker, then assume that all these processes will be happy + * to let bfqq's waker freely inject I/O when they have no + * I/O. + */ + if (bfqq->waker_bfqq && !new_bfqq->waker_bfqq && + bfqq->waker_bfqq != new_bfqq) { + new_bfqq->waker_bfqq = bfqq->waker_bfqq; + new_bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be reset. So insert + * new_bfqq into the woken_list of the waker. See + * bfq_check_waker for details. + */ + hlist_add_head(&new_bfqq->woken_list_node, + &new_bfqq->waker_bfqq->woken_list); + + } + + /* * If bfqq is weight-raised, then let new_bfqq inherit * weight-raising. To reduce false positives, neglect the case * where bfqq has just been created, but has not yet made it @@ -2783,7 +3194,7 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, /* * Merge queues (that is, let bic redirect its requests to new_bfqq) */ - bic_set_bfqq(bic, new_bfqq, 1); + bic_set_bfqq(bic, new_bfqq, true, bfqq->actuator_idx); bfq_mark_bfqq_coop(new_bfqq); /* * new_bfqq now belongs to at least two bics (it is a shared queue): @@ -2807,6 +3218,9 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, */ new_bfqq->pid = -1; bfqq->bic = NULL; + + bfq_reassign_last_bfqq(bfqq, new_bfqq); + bfq_release_process_ref(bfqd, bfqq); } @@ -2834,7 +3248,7 @@ static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, * We take advantage of this function to perform an early merge * of the queues of possible cooperating processes. 
*/ - new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); + new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false, bfqd->bio_bic); if (new_bfqq) { /* * bic still points to bfqq, then it has not yet been @@ -2937,6 +3351,7 @@ static void __bfq_set_in_service_queue(struct bfq_data *bfqd, } bfqd->in_service_queue = bfqq; + bfqd->in_serv_last_pos = 0; } /* @@ -3197,13 +3612,13 @@ static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) * - start a new observation interval with this dispatch */ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && - bfqd->rq_in_driver == 0) + bfqd->tot_rq_in_driver == 0) goto update_rate_and_reset; /* Update sampling information */ bfqd->peak_rate_samples++; - if ((bfqd->rq_in_driver > 0 || + if ((bfqd->tot_rq_in_driver > 0 || now_ns - bfqd->last_completion < BFQ_MIN_TT) && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) bfqd->sequential_samples++; @@ -3442,20 +3857,36 @@ static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) * order until all the requests already queued in the device have been * served. The last sub-condition commented above somewhat mitigates * this problem for weight-raised queues. + * + * However, as an additional mitigation for this problem, we preserve + * plugging for a special symmetric case that may suddenly turn into + * asymmetric: the case where only bfqq is busy. In this case, not + * expiring bfqq does not cause any harm to any other queues in terms + * of service guarantees. In contrast, it avoids the following unlucky + * sequence of events: (1) bfqq is expired, (2) a new queue with a + * lower weight than bfqq becomes busy (or more queues), (3) the new + * queue is served until a new request arrives for bfqq, (4) when bfqq + * is finally served, there are so many requests of the new queue in + * the drive that the pending requests for bfqq take a lot of time to + * be served. 
In particular, event (2) may cause even already + * dispatched requests of bfqq to be delayed, inside the drive. So, to + * avoid this series of events, the scenario is preventively declared + * as asymmetric also if bfqq is the only busy queue */ static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, struct bfq_queue *bfqq) { + int tot_busy_queues = bfq_tot_busy_queues(bfqd); + /* No point in idling for bfqq if it won't get requests any longer */ if (unlikely(!bfqq_process_refs(bfqq))) return false; return (bfqq->wr_coeff > 1 && - (bfqd->wr_busy_queues < - bfq_tot_busy_queues(bfqd) || - bfqd->rq_in_driver >= - bfqq->dispatched + 4)) || - bfq_asymmetric_scenario(bfqd, bfqq); + (bfqd->wr_busy_queues < tot_busy_queues || + bfqd->tot_rq_in_driver >= bfqq->dispatched + 4)) || + bfq_asymmetric_scenario(bfqd, bfqq) || + tot_busy_queues == 1; } static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, @@ -3495,7 +3926,7 @@ static bool __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, */ bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, true); + bfq_del_bfqq_busy(bfqq, true); } else { bfq_requeue_bfqq(bfqd, bfqq, true); /* @@ -3719,8 +4150,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, * function to evaluate the I/O speed of a process. */ static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool compensate, enum bfqq_expiration reason, - unsigned long *delta_ms) + bool compensate, unsigned long *delta_ms) { ktime_t delta_ktime; u32 delta_usecs; @@ -3916,7 +4346,7 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, /* * Check whether the process is slow (see bfq_bfqq_is_slow).
*/ - slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); + slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, &delta); /* * As above explained, charge slow (typically seeky) and @@ -3939,10 +4369,6 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) bfq_bfqq_charge_time(bfqd, bfqq, delta); - if (reason == BFQQE_TOO_IDLE && - entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - if (bfqd->low_latency && bfqq->wr_coeff == 1) bfqq->last_wr_start_finish = jiffies; @@ -3952,30 +4378,15 @@ void bfq_bfqq_expire(struct bfq_data *bfqd, * If we get here, and there are no outstanding * requests, then the request pattern is isochronous * (see the comments on the function - * bfq_bfqq_softrt_next_start()). Thus we can compute - * soft_rt_next_start. And we do it, unless bfqq is in - * interactive weight raising. We do not do it in the - * latter subcase, for the following reason. bfqq may - * be conveying the I/O needed to load a soft - * real-time application. Such an application will - * actually exhibit a soft real-time I/O pattern after - * it finally starts doing its job. But, if - * soft_rt_next_start is computed here for an - * interactive bfqq, and bfqq had received a lot of - * service before remaining with no outstanding - * request (likely to happen on a fast device), then - * soft_rt_next_start would be assigned such a high - * value that, for a very long time, bfqq would be - * prevented from being possibly considered as soft - * real time. + * bfq_bfqq_softrt_next_start()). Therefore we can + * compute soft_rt_next_start. * * If, instead, the queue still has outstanding * requests, then we have to wait for the completion * of all the outstanding requests to discover whether * the request pattern is actually isochronous. 
*/ - if (bfqq->dispatched == 0 && - bfqq->wr_coeff != bfqd->bfq_wr_coeff) + if (bfqq->dispatched == 0) bfqq->soft_rt_next_start = bfq_bfqq_softrt_next_start(bfqd, bfqq); else if (bfqq->dispatched > 0) { @@ -4243,6 +4654,8 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) { struct bfq_queue *bfqq, *in_serv_bfqq = bfqd->in_service_queue; unsigned int limit = in_serv_bfqq->inject_limit; + int i; + /* * If * - bfqq is not weight-raised and therefore does not carry @@ -4274,7 +4687,7 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ) limit = 1; - if (bfqd->rq_in_driver >= limit) + if (bfqd->tot_rq_in_driver >= limit) return NULL; /* @@ -4289,11 +4702,12 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) * (and re-added only if it gets new requests, but then it * is assigned again enough budget for its new backlog). */ - list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) - if (!RB_EMPTY_ROOT(&bfqq->sort_list) && - (in_serv_always_inject || bfqq->wr_coeff > 1) && - bfq_serv_to_charge(bfqq->next_rq, bfqq) <= - bfq_bfqq_budget_left(bfqq)) { + for (i = 0; i < bfqd->num_actuators; i++) { + list_for_each_entry(bfqq, &bfqd->active_list[i], bfqq_list) + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + (in_serv_always_inject || bfqq->wr_coeff > 1) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { /* * Allow for only one large in-flight request * on non-rotational devices, for the @@ -4313,27 +4727,80 @@ bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) */ if (blk_queue_nonrot(bfqd->queue) && blk_rq_sectors(bfqq->next_rq) >= - BFQQ_SECT_THR_NONROT) - limit = min_t(unsigned int, 1, limit); - else - limit = in_serv_bfqq->inject_limit; - - if (bfqd->rq_in_driver < limit) { + BFQQ_SECT_THR_NONROT && + bfqd->tot_rq_in_driver >= 1) + continue; + else { bfqd->rqs_injected = true; return bfqq; } } + } + + return NULL; +} + +static struct bfq_queue * +bfq_find_active_bfqq_for_actuator(struct bfq_data *bfqd, int idx) +{ + struct bfq_queue 
*bfqq; + + if (bfqd->in_service_queue && + bfqd->in_service_queue->actuator_idx == idx) + return bfqd->in_service_queue; + + list_for_each_entry(bfqq, &bfqd->active_list[idx], bfqq_list) { + if (!RB_EMPTY_ROOT(&bfqq->sort_list) && + bfq_serv_to_charge(bfqq->next_rq, bfqq) <= + bfq_bfqq_budget_left(bfqq)) { + return bfqq; + } + } + + return NULL; +} + +/* + * Perform a linear scan of each actuator, until an actuator is found + * for which the following three conditions hold: the load of the + * actuator is below the threshold (see comments on + * actuator_load_threshold for details) and lower than that of the + * next actuator (comments on this extra condition below), and there + * is a queue that contains I/O for that actuator. On success, return + * that queue. + * + * Performing a plain linear scan entails a prioritization among + * actuators. The extra condition above breaks this prioritization and + * tends to distribute injection uniformly across actuators. + */ +static struct bfq_queue * +bfq_find_bfqq_for_underused_actuator(struct bfq_data *bfqd) +{ + int i; + + for (i = 0 ; i < bfqd->num_actuators; i++) { + if (bfqd->rq_in_driver[i] < bfqd->actuator_load_threshold && + (i == bfqd->num_actuators - 1 || + bfqd->rq_in_driver[i] < bfqd->rq_in_driver[i+1])) { + struct bfq_queue *bfqq = + bfq_find_active_bfqq_for_actuator(bfqd, i); + + if (bfqq) + return bfqq; + } + } return NULL; } + /* * Select a queue for service. If we have a current queue in service, * check whether to continue servicing it, or retrieve and set a new one. 
*/ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) { - struct bfq_queue *bfqq; + struct bfq_queue *bfqq, *inject_bfqq; struct request *next_rq; enum bfqq_expiration reason = BFQQE_BUDGET_TIMEOUT; @@ -4356,6 +4823,15 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) check_queue: /* + * If some actuator is underutilized, but the in-service + * queue does not contain I/O for that actuator, then try to + * inject I/O for that actuator. + */ + inject_bfqq = bfq_find_bfqq_for_underused_actuator(bfqd); + if (inject_bfqq && inject_bfqq != bfqq) + return inject_bfqq; + + /* * This loop is rarely executed more than once. Even when it * happens, it is much more convenient to re-execute this loop * than to return NULL and trigger a new dispatch to get a @@ -4414,14 +4890,21 @@ check_queue: */ if (bfq_bfqq_wait_request(bfqq) || (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { - struct bfq_queue *async_bfqq = - bfqq->bic && bfqq->bic->bfqq[0] && - bfq_bfqq_busy(bfqq->bic->bfqq[0]) && - bfqq->bic->bfqq[0]->next_rq ? - bfqq->bic->bfqq[0] : NULL; - + unsigned int act_idx = bfqq->actuator_idx; + struct bfq_queue *async_bfqq = NULL; + struct bfq_queue *blocked_bfqq = + !hlist_empty(&bfqq->woken_list) ? + container_of(bfqq->woken_list.first, + struct bfq_queue, + woken_list_node) + : NULL; + + if (bfqq->bic && bfqq->bic->bfqq[0][act_idx] && + bfq_bfqq_busy(bfqq->bic->bfqq[0][act_idx]) && + bfqq->bic->bfqq[0][act_idx]->next_rq) + async_bfqq = bfqq->bic->bfqq[0][act_idx]; /* - * The next three mutually-exclusive ifs decide + * The next four mutually-exclusive ifs decide * whether to try injection, and choose the queue to * pick an I/O request from. * @@ -4454,7 +4937,15 @@ check_queue: * next bfqq's I/O is brought forward dramatically, * for it is not blocked for milliseconds. * - * The third if checks whether bfqq is a queue for + * The third if checks whether there is a queue woken + * by bfqq, and currently with pending I/O. 
Such a + * woken queue does not steal bandwidth from bfqq, + * because it remains soon without I/O if bfqq is not + * served. So there is virtually no risk of loss of + * bandwidth for bfqq if this woken queue has I/O + * dispatched while bfqq is waiting for new I/O. + * + * The fourth if checks whether bfqq is a queue for * which it is better to avoid injection. It is so if * bfqq delivers more throughput when served without * any further I/O from other queues in the middle, or @@ -4474,11 +4965,11 @@ check_queue: * bfq_update_has_short_ttime(), it is rather likely * that, if I/O is being plugged for bfqq and the * waker queue has pending I/O requests that are - * blocking bfqq's I/O, then the third alternative + * blocking bfqq's I/O, then the fourth alternative * above lets the waker queue get served before the * I/O-plugging timeout fires. So one may deem the * second alternative superfluous. It is not, because - * the third alternative may be way less effective in + * the fourth alternative may be way less effective in * case of a synchronization. For two main * reasons. First, throughput may be low because the * inject limit may be too low to guarantee the same @@ -4487,7 +4978,7 @@ check_queue: * guarantees (the second alternative unconditionally * injects a pending I/O request of the waker queue * for each bfq_dispatch_request()). Second, with the - * third alternative, the duration of the plugging, + * fourth alternative, the duration of the plugging, * i.e., the time before bfqq finally receives new I/O, * may not be minimized, because the waker queue may * happen to be served only after other queues. 
@@ -4496,15 +4987,23 @@ check_queue: icq_to_bic(async_bfqq->next_rq->elv.icq) == bfqq->bic && bfq_serv_to_charge(async_bfqq->next_rq, async_bfqq) <= bfq_bfqq_budget_left(async_bfqq)) - bfqq = bfqq->bic->bfqq[0]; - else if (bfq_bfqq_has_waker(bfqq) && + bfqq = async_bfqq; + else if (bfqq->waker_bfqq && bfq_bfqq_busy(bfqq->waker_bfqq) && - bfqq->next_rq && + bfqq->waker_bfqq->next_rq && bfq_serv_to_charge(bfqq->waker_bfqq->next_rq, bfqq->waker_bfqq) <= bfq_bfqq_budget_left(bfqq->waker_bfqq) ) bfqq = bfqq->waker_bfqq; + else if (blocked_bfqq && + bfq_bfqq_busy(blocked_bfqq) && + blocked_bfqq->next_rq && + bfq_serv_to_charge(blocked_bfqq->next_rq, + blocked_bfqq) <= + bfq_bfqq_budget_left(blocked_bfqq) + ) + bfqq = blocked_bfqq; else if (!idling_boosts_thr_without_issues(bfqd, bfqq) && (bfqq->wr_coeff == 1 || bfqd->wr_busy_queues > 1 || !bfq_bfqq_has_short_ttime(bfqq))) @@ -4559,9 +5058,21 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfqq->wr_cur_max_time)) { if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + - bfq_wr_duration(bfqd))) + bfq_wr_duration(bfqd))) { + /* + * Either in interactive weight + * raising, or in soft_rt weight + * raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). 
+ */ bfq_bfqq_end_wr(bfqq); - else { + } else { /* + * soft_rt finishing while still in + * interactive period, switch back to + * interactive weight raising + */ switch_back_to_interactive_wr(bfqq, bfqd); bfqq->entity.prio_changed = 1; } @@ -4607,7 +5118,7 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, bfq_dispatch_remove(bfqd->queue, rq); if (bfqq != bfqd->in_service_queue) - goto return_rq; + return rq; /* * If weight raising has to terminate for bfqq, then next @@ -4627,12 +5138,9 @@ static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, * belongs to CLASS_IDLE and other queues are waiting for * service. */ - if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) - goto return_rq; - - bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); + if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) + bfq_bfqq_expire(bfqd, bfqq, false, BFQQE_BUDGET_EXHAUSTED); -return_rq: return rq; } @@ -4641,11 +5149,11 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; /* - * Avoiding lock: a race on bfqd->busy_queues should cause at + * Avoiding lock: a race on bfqd->queued should cause at * most a call to dispatch for nothing */ return !list_empty_careful(&bfqd->dispatch) || - bfq_tot_busy_queues(bfqd) > 0; + READ_ONCE(bfqd->queued); } static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) @@ -4675,11 +5183,11 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) /* * We exploit the bfq_finish_requeue_request hook to - * decrement rq_in_driver, but + * decrement tot_rq_in_driver, but * bfq_finish_requeue_request will not be invoked on * this request. So, to avoid unbalance, just start - * this request, without incrementing rq_in_driver. As - * a negative consequence, rq_in_driver is deceptively + * this request, without incrementing tot_rq_in_driver. 
As + * a negative consequence, tot_rq_in_driver is deceptively * lower than it should be while this request is in * service. This may cause bfq_schedule_dispatch to be * invoked uselessly. @@ -4688,7 +5196,7 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * bfq_finish_requeue_request hook, if defined, is * probably invoked also on this request. So, by * exploiting this hook, we could 1) increment - * rq_in_driver here, and 2) decrement it in + * tot_rq_in_driver here, and 2) decrement it in * bfq_finish_requeue_request. Such a solution would * let the value of the counter be always accurate, * but it would entail using an extra interface @@ -4714,10 +5222,10 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) * some unlucky request wait for as long as the device * wishes. * - * Of course, serving one request at at time may cause loss of + * Of course, serving one request at a time may cause loss of * throughput. */ - if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) + if (bfqd->strict_guarantees && bfqd->tot_rq_in_driver > 0) goto exit; bfqq = bfq_select_queue(bfqd); @@ -4728,7 +5236,8 @@ static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) if (rq) { inc_in_driver_start_rq: - bfqd->rq_in_driver++; + bfqd->rq_in_driver[bfqq->actuator_idx]++; + bfqd->tot_rq_in_driver++; start_rq: rq->rq_flags |= RQF_STARTED; } @@ -4793,7 +5302,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct request *rq; struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled; + bool waiting_rq, idle_timer_disabled = false; spin_lock_irq(&bfqd->lock); @@ -4801,14 +5310,15 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); rq = __bfq_dispatch_request(hctx); - - idle_timer_disabled = - waiting_rq && 
!bfq_bfqq_wait_request(in_serv_queue); + if (in_serv_queue == bfqd->in_service_queue) { + idle_timer_disabled = + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } spin_unlock_irq(&bfqd->lock); - - bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, - idle_timer_disabled); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, + idle_timer_disabled); return rq; } @@ -4826,9 +5336,7 @@ void bfq_put_queue(struct bfq_queue *bfqq) struct hlist_node *n; struct bfq_group *bfqg = bfqq_group(bfqq); - if (bfqq->bfqd) - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", - bfqq, bfqq->ref); + bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); bfqq->ref--; if (bfqq->ref) @@ -4889,18 +5397,27 @@ void bfq_put_queue(struct bfq_queue *bfqq) hlist_for_each_entry_safe(item, n, &bfqq->woken_list, woken_list_node) { item->waker_bfqq = NULL; - bfq_clear_bfqq_has_waker(item); hlist_del_init(&item->woken_list_node); } - if (bfqq->bfqd && bfqq->bfqd->last_completed_rq_bfqq == bfqq) + if (bfqq->bfqd->last_completed_rq_bfqq == bfqq) bfqq->bfqd->last_completed_rq_bfqq = NULL; + WARN_ON_ONCE(!list_empty(&bfqq->fifo)); + WARN_ON_ONCE(!RB_EMPTY_ROOT(&bfqq->sort_list)); + WARN_ON_ONCE(bfqq->dispatched); + kmem_cache_free(bfq_pool, bfqq); bfqg_and_blkg_put(bfqg); } -static void bfq_put_cooperator(struct bfq_queue *bfqq) +static void bfq_put_stable_ref(struct bfq_queue *bfqq) +{ + bfqq->stable_ref--; + bfq_put_queue(bfqq); +} + +void bfq_put_cooperator(struct bfq_queue *bfqq) { struct bfq_queue *__bfqq, *next; @@ -4911,8 +5428,6 @@ static void bfq_put_cooperator(struct bfq_queue *bfqq) */ __bfqq = bfqq->new_bfqq; while (__bfqq) { - if (__bfqq == bfqq) - break; next = __bfqq->new_bfqq; bfq_put_queue(__bfqq); __bfqq = next; @@ -4933,31 +5448,55 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_release_process_ref(bfqd, bfqq); } -static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) +static 
void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, actuator_idx); struct bfq_data *bfqd; if (bfqq) bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ if (bfqq && bfqd) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - bfqq->bic = NULL; + bic_set_bfqq(bic, NULL, is_sync, actuator_idx); bfq_exit_bfqq(bfqd, bfqq); - bic_set_bfqq(bic, NULL, is_sync); - spin_unlock_irqrestore(&bfqd->lock, flags); } } static void bfq_exit_icq(struct io_cq *icq) { struct bfq_io_cq *bic = icq_to_bic(icq); + struct bfq_data *bfqd = bic_to_bfqd(bic); + unsigned long flags; + unsigned int act_idx; + /* + * If bfqd and thus bfqd->num_actuators is not available any + * longer, then cycle over all possible per-actuator bfqqs in + * next loop. We rely on bic being zeroed on creation, and + * therefore on its unused per-actuator fields being NULL. + */ + unsigned int num_actuators = BFQ_MAX_ACTUATORS; + struct bfq_iocq_bfqq_data *bfqq_data = bic->bfqq_data; + + /* + * bfqd is NULL if scheduler already exited, and in that case + * this is the last time these queues are accessed. 
+ */ + if (bfqd) { + spin_lock_irqsave(&bfqd->lock, flags); + num_actuators = bfqd->num_actuators; + } + + for (act_idx = 0; act_idx < num_actuators; act_idx++) { + if (bfqq_data[act_idx].stable_merge_bfqq) + bfq_put_stable_ref(bfqq_data[act_idx].stable_merge_bfqq); + + bfq_exit_icq_bfqq(bic, true, act_idx); + bfq_exit_icq_bfqq(bic, false, act_idx); + } - bfq_exit_icq_bfqq(bic, true); - bfq_exit_icq_bfqq(bic, false); + if (bfqd) + spin_unlock_irqrestore(&bfqd->lock, flags); } /* @@ -4978,9 +5517,9 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) switch (ioprio_class) { default: pr_err("bdi %s: bfq: bad prio class %d\n", - bdi_dev_name(bfqq->bfqd->queue->backing_dev_info), - ioprio_class); - /* fall through */ + bdi_dev_name(bfqq->bfqd->queue->disk->bdi), + ioprio_class); + fallthrough; case IOPRIO_CLASS_NONE: /* * No prio set, inherit CPU scheduling settings. @@ -4989,32 +5528,35 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) bfqq->new_ioprio_class = task_nice_ioclass(tsk); break; case IOPRIO_CLASS_RT: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_RT; break; case IOPRIO_CLASS_BE: - bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + bfqq->new_ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); bfqq->new_ioprio_class = IOPRIO_CLASS_BE; break; case IOPRIO_CLASS_IDLE: bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; - bfqq->new_ioprio = 7; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; break; } - if (bfqq->new_ioprio >= IOPRIO_BE_NR) { + if (bfqq->new_ioprio >= IOPRIO_NR_LEVELS) { pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", bfqq->new_ioprio); - bfqq->new_ioprio = IOPRIO_BE_NR; + bfqq->new_ioprio = IOPRIO_NR_LEVELS - 1; } bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); + bfq_log_bfqq(bfqd, bfqq, "new_ioprio %d new_weight %d", + bfqq->new_ioprio, bfqq->entity.new_weight); bfqq->entity.prio_changed = 1; } static 
struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic); + struct bfq_io_cq *bic, + bool respawn); static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) { @@ -5031,21 +5573,27 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) bic->ioprio = ioprio; - bfqq = bic_to_bfqq(bic, false); + bfqq = bic_to_bfqq(bic, false, bfq_actuator_index(bfqd, bio)); if (bfqq) { - bfq_release_process_ref(bfqd, bfqq); - bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); - bic_set_bfqq(bic, bfqq, false); + struct bfq_queue *old_bfqq = bfqq; + + bfqq = bfq_get_queue(bfqd, bio, false, bic, true); + bic_set_bfqq(bic, bfqq, false, bfq_actuator_index(bfqd, bio)); + bfq_release_process_ref(bfqd, old_bfqq); } - bfqq = bic_to_bfqq(bic, true); + bfqq = bic_to_bfqq(bic, true, bfq_actuator_index(bfqd, bio)); if (bfqq) bfq_set_next_ioprio_data(bfqq, bic); } static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct bfq_io_cq *bic, pid_t pid, int is_sync) + struct bfq_io_cq *bic, pid_t pid, int is_sync, + unsigned int act_idx) { + u64 now_ns = ktime_get_ns(); + + bfqq->actuator_idx = act_idx; RB_CLEAR_NODE(&bfqq->entity.rb_node); INIT_LIST_HEAD(&bfqq->fifo); INIT_HLIST_NODE(&bfqq->burst_list_node); @@ -5073,7 +5621,11 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_clear_bfqq_sync(bfqq); /* set end request to minus infinity from now */ - bfqq->ttime.last_end_request = ktime_get_ns() + 1; + bfqq->ttime.last_end_request = now_ns + 1; + + bfqq->creation_time = jiffies; + + bfqq->io_start_time = now_ns; bfq_mark_bfqq_IO_bound(bfqq); @@ -5101,48 +5653,198 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, /* first request is almost certainly seeky */ bfqq->seek_history = 1; + + bfqq->decrease_time_jif = jiffies; } static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, struct bfq_group *bfqg, - int 
ioprio_class, int ioprio) + int ioprio_class, int ioprio, int act_idx) { switch (ioprio_class) { case IOPRIO_CLASS_RT: - return &bfqg->async_bfqq[0][ioprio]; + return &bfqg->async_bfqq[0][ioprio][act_idx]; case IOPRIO_CLASS_NONE: - ioprio = IOPRIO_NORM; - /* fall through */ + ioprio = IOPRIO_BE_NORM; + fallthrough; case IOPRIO_CLASS_BE: - return &bfqg->async_bfqq[1][ioprio]; + return &bfqg->async_bfqq[1][ioprio][act_idx]; case IOPRIO_CLASS_IDLE: - return &bfqg->async_idle_bfqq; + return &bfqg->async_idle_bfqq[act_idx]; default: return NULL; } } +static struct bfq_queue * +bfq_do_early_stable_merge(struct bfq_data *bfqd, struct bfq_queue *bfqq, + struct bfq_io_cq *bic, + struct bfq_queue *last_bfqq_created) +{ + unsigned int a_idx = last_bfqq_created->actuator_idx; + struct bfq_queue *new_bfqq = + bfq_setup_merge(bfqq, last_bfqq_created); + + if (!new_bfqq) + return bfqq; + + if (new_bfqq->bic) + new_bfqq->bic->bfqq_data[a_idx].stably_merged = true; + bic->bfqq_data[a_idx].stably_merged = true; + + /* + * Reusing merge functions. This implies that + * bfqq->bic must be set too, for + * bfq_merge_bfqqs to correctly save bfqq's + * state before killing it. + */ + bfqq->bic = bic; + bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); + + return new_bfqq; +} + +/* + * Many throughput-sensitive workloads are made of several parallel + * I/O flows, with all flows generated by the same application, or + * more generically by the same task (e.g., system boot). The most + * counterproductive action with these workloads is plugging I/O + * dispatch when one of the bfq_queues associated with these flows + * remains temporarily empty. + * + * To avoid this plugging, BFQ has been using a burst-handling + * mechanism for years now. This mechanism has proven effective for + * throughput, and not detrimental for service guarantees. The + * following function pushes this mechanism a little bit further, + * basing on the following two facts. 
+ * + * First, all the I/O flows of the same application or task + * contribute to the execution/completion of that common application + * or task. So the performance figures that matter are total + * throughput of the flows and task-wide I/O latency. In particular, + * these flows do not need to be protected from each other, in terms + * of individual bandwidth or latency. + * + * Second, the above fact holds regardless of the number of flows. + * + * Putting these two facts together, this commit merges stably the + * bfq_queues associated with these I/O flows, i.e., with the + * processes that generate these I/O flows, regardless of how many the + * involved processes are. + * + * To decide whether a set of bfq_queues is actually associated with + * the I/O flows of a common application or task, and to merge these + * queues stably, this function operates as follows: given a bfq_queue, + * say Q2, currently being created, and the last bfq_queue, say Q1, + * created before Q2, Q2 is merged stably with Q1 if + * - very little time has elapsed since when Q1 was created + * - Q2 has the same ioprio as Q1 + * - Q2 belongs to the same group as Q1 + * + * Merging bfq_queues also reduces scheduling overhead. A fio test + * with ten random readers on /dev/nullb shows a throughput boost of + * 40%, with a quadcore. Since BFQ's execution time amounts to ~50% of + * the total per-request processing time, the above throughput boost + * implies that BFQ's overhead is reduced by more than 50%. + * + * This new mechanism most certainly obsoletes the current + * burst-handling heuristics. We keep those heuristics for the moment. + */ +static struct bfq_queue *bfq_do_or_sched_stable_merge(struct bfq_data *bfqd, + struct bfq_queue *bfqq, + struct bfq_io_cq *bic) +{ + struct bfq_queue **source_bfqq = bfqq->entity.parent ?
+ &bfqq->entity.parent->last_bfqq_created : + &bfqd->last_bfqq_created; + + struct bfq_queue *last_bfqq_created = *source_bfqq; + + /* + * If last_bfqq_created has not been set yet, then init it. If + * it has been set already, but too long ago, then move it + * forward to bfqq. Finally, move also if bfqq belongs to a + * different group than last_bfqq_created, or if bfqq has a + * different ioprio, ioprio_class or actuator_idx. If none of + * these conditions holds true, then try an early stable merge + * or schedule a delayed stable merge. As for the condition on + * actuator_idx, the reason is that, if queues associated with + * different actuators are merged, then control is lost on + * each actuator. Therefore some actuator may be + * underutilized, and throughput may decrease. + * + * A delayed merge is scheduled (instead of performing an + * early merge), in case bfqq might soon prove to be more + * throughput-beneficial if not merged. Currently this is + * possible only if bfqd is rotational with no queueing. For + * such a drive, not merging bfqq is better for throughput if + * bfqq happens to contain sequential I/O. So, we wait a + * little bit for enough I/O to flow through bfqq. After that, + * if such an I/O is sequential, then the merge is + * canceled. Otherwise the merge is finally performed. 
+ */ + if (!last_bfqq_created || + time_before(last_bfqq_created->creation_time + + msecs_to_jiffies(bfq_activation_stable_merging), + bfqq->creation_time) || + bfqq->entity.parent != last_bfqq_created->entity.parent || + bfqq->ioprio != last_bfqq_created->ioprio || + bfqq->ioprio_class != last_bfqq_created->ioprio_class || + bfqq->actuator_idx != last_bfqq_created->actuator_idx) + *source_bfqq = bfqq; + else if (time_after_eq(last_bfqq_created->creation_time + + bfqd->bfq_burst_interval, + bfqq->creation_time)) { + if (likely(bfqd->nonrot_with_queueing)) + /* + * With this type of drive, leaving + * bfqq alone may provide no + * throughput benefits compared with + * merging bfqq. So merge bfqq now. + */ + bfqq = bfq_do_early_stable_merge(bfqd, bfqq, + bic, + last_bfqq_created); + else { /* schedule tentative stable merge */ + /* + * get reference on last_bfqq_created, + * to prevent it from being freed, + * until we decide whether to merge + */ + last_bfqq_created->ref++; + /* + * need to keep track of stable refs, to + * compute process refs correctly + */ + last_bfqq_created->stable_ref++; + /* + * Record the bfqq to merge to. 
+ */ + bic->bfqq_data[last_bfqq_created->actuator_idx].stable_merge_bfqq = + last_bfqq_created; + } + } + + return bfqq; +} + + static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, struct bio *bio, bool is_sync, - struct bfq_io_cq *bic) + struct bfq_io_cq *bic, + bool respawn) { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); + const int ioprio = IOPRIO_PRIO_LEVEL(bic->ioprio); const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); struct bfq_queue **async_bfqq = NULL; struct bfq_queue *bfqq; struct bfq_group *bfqg; - rcu_read_lock(); - - bfqg = bfq_find_set_group(bfqd, __bio_blkcg(bio)); - if (!bfqg) { - bfqq = &bfqd->oom_bfqq; - goto out; - } - + bfqg = bfq_bio_bfqg(bfqd, bio); if (!is_sync) { async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); + ioprio, + bfq_actuator_index(bfqd, bio)); bfqq = *async_bfqq; if (bfqq) goto out; @@ -5154,7 +5856,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, if (bfqq) { bfq_init_bfqq(bfqd, bfqq, bic, current->pid, - is_sync); + is_sync, bfq_actuator_index(bfqd, bio)); bfq_init_entity(&bfqq->entity, bfqg); bfq_log_bfqq(bfqd, bfqq, "allocated"); } else { @@ -5182,8 +5884,9 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, out: bfqq->ref++; /* get a process reference to this queue */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); - rcu_read_unlock(); + + if (bfqq != &bfqd->oom_bfqq && is_sync && !respawn) + bfqq = bfq_do_or_sched_stable_merge(bfqd, bfqq, bic); return bfqq; } @@ -5191,11 +5894,19 @@ static void bfq_update_io_thinktime(struct bfq_data *bfqd, struct bfq_queue *bfqq) { struct bfq_ttime *ttime = &bfqq->ttime; - u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; + u64 elapsed; + /* + * We are really interested in how long it takes for the queue to + * become busy when there is no outstanding IO for this queue. So + * ignore cases when the bfq queue has already IO queued. 
+ */ + if (bfqq->dispatched || bfq_bfqq_busy(bfqq)) + return; + elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; elapsed = min_t(u64, elapsed, 2ULL * bfqd->bfq_slice_idle); - ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; + ttime->ttime_samples = (7*ttime->ttime_samples + 256) / 8; ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ttime->ttime_samples); @@ -5210,8 +5921,26 @@ bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, if (bfqq->wr_coeff > 1 && bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && - BFQQ_TOTALLY_SEEKY(bfqq)) - bfq_bfqq_end_wr(bfqq); + BFQQ_TOTALLY_SEEKY(bfqq)) { + if (time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + + bfq_wr_duration(bfqd))) { + /* + * In soft_rt weight raising with the + * interactive-weight-raising period + * elapsed (so no switch back to + * interactive weight raising). + */ + bfq_bfqq_end_wr(bfqq); + } else { /* + * stopping soft_rt weight raising + * while still in interactive period, + * switch back to interactive weight + * raising + */ + switch_back_to_interactive_wr(bfqq, bfqd); + bfqq->entity.prio_changed = 1; + } + } } static void bfq_update_has_short_ttime(struct bfq_data *bfqd, @@ -5235,12 +5964,13 @@ static void bfq_update_has_short_ttime(struct bfq_data *bfqd, return; /* Think time is infinite if no process is linked to - * bfqq. Otherwise check average think time to - * decide whether to mark as has_short_ttime + * bfqq. Otherwise check average think time to decide whether + * to mark as has_short_ttime. To this goal, compare average + * think time with half the I/O-plugging timeout. 
*/ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || (bfq_sample_valid(bfqq->ttime.ttime_samples) && - bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) + bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle>>1)) has_short_ttime = false; state_changed = has_short_ttime != bfq_bfqq_has_short_ttime(bfqq); @@ -5401,11 +6131,28 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, } } +static void bfqq_request_allocated(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated++; +} + +static void bfqq_request_freed(struct bfq_queue *bfqq) +{ + struct bfq_entity *entity = &bfqq->entity; + + for_each_entity(entity) + entity->allocated--; +} + /* returns true if it causes the idle timer to be disabled */ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq), - *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); + *new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true, + RQ_BIC(rq)); bool waiting, idle_timer_disabled = false; if (new_bfqq) { @@ -5413,8 +6160,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * Release the request's reference to the old bfqq * and make sure one is taken to the shared queue. */ - new_bfqq->allocated++; - bfqq->allocated--; + bfqq_request_allocated(new_bfqq); + bfqq_request_freed(bfqq); new_bfqq->ref++; /* * If the bic associated with the process @@ -5424,7 +6171,8 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) * then complete the merge and redirect it to * new_bfqq. 
*/ - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) + if (bic_to_bfqq(RQ_BIC(rq), true, + bfq_actuator_index(bfqd, rq->bio)) == bfqq) bfq_merge_bfqqs(bfqd, RQ_BIC(rq), bfqq, new_bfqq); @@ -5458,7 +6206,7 @@ static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) static void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, - unsigned int cmd_flags) + blk_opf_t cmd_flags) { if (!bfqq) return; @@ -5483,39 +6231,39 @@ static void bfq_update_insert_stats(struct request_queue *q, static inline void bfq_update_insert_stats(struct request_queue *q, struct bfq_queue *bfqq, bool idle_timer_disabled, - unsigned int cmd_flags) {} + blk_opf_t cmd_flags) {} #endif /* CONFIG_BFQ_CGROUP_DEBUG */ +static struct bfq_queue *bfq_init_rq(struct request *rq); + static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) + blk_insert_t flags) { struct request_queue *q = hctx->queue; struct bfq_data *bfqd = q->elevator->elevator_data; struct bfq_queue *bfqq; bool idle_timer_disabled = false; - unsigned int cmd_flags; + blk_opf_t cmd_flags; + LIST_HEAD(free); #ifdef CONFIG_BFQ_GROUP_IOSCHED if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) bfqg_stats_update_legacy_io(q, rq); #endif spin_lock_irq(&bfqd->lock); - if (blk_mq_sched_try_insert_merge(q, rq)) { + bfqq = bfq_init_rq(rq); + if (blk_mq_sched_try_insert_merge(q, rq, &free)) { spin_unlock_irq(&bfqd->lock); + blk_mq_free_requests(&free); return; } - spin_unlock_irq(&bfqd->lock); - - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); - spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); - if (!bfqq || at_head || blk_rq_is_passthrough(rq)) { - if (at_head) - list_add(&rq->queuelist, &bfqd->dispatch); - else - list_add_tail(&rq->queuelist, &bfqd->dispatch); + if (flags & BLK_MQ_INSERT_AT_HEAD) { + list_add(&rq->queuelist, &bfqd->dispatch); + } else if (!bfqq) { + list_add_tail(&rq->queuelist, &bfqd->dispatch); } else { 
idle_timer_disabled = __bfq_insert_request(bfqd, rq); /* @@ -5538,7 +6286,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * merge). */ cmd_flags = rq->cmd_flags; - spin_unlock_irq(&bfqd->lock); bfq_update_insert_stats(q, bfqq, idle_timer_disabled, @@ -5546,14 +6293,15 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, } static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) + struct list_head *list, + blk_insert_t flags) { while (!list_empty(list)) { struct request *rq; rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - bfq_insert_request(hctx, rq, at_head); + bfq_insert_request(hctx, rq, flags); } } @@ -5562,7 +6310,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) struct bfq_queue *bfqq = bfqd->in_service_queue; bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, - bfqd->rq_in_driver); + bfqd->tot_rq_in_driver); if (bfqd->hw_tag == 1) return; @@ -5573,7 +6321,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) * sum is not exact, as it's not taking into account deactivated * requests. 
*/ - if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) + if (bfqd->tot_rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) return; /* @@ -5584,7 +6332,7 @@ static void bfq_update_hw_tag(struct bfq_data *bfqd) if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < BFQ_HW_QUEUE_THRESHOLD && - bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) + bfqd->tot_rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) return; if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) @@ -5605,7 +6353,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) bfq_update_hw_tag(bfqd); - bfqd->rq_in_driver--; + bfqd->rq_in_driver[bfqq->actuator_idx]--; + bfqd->tot_rq_in_driver--; bfqq->dispatched--; if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { @@ -5617,7 +6366,8 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) */ bfqq->budget_timeout = jiffies; - bfq_weights_tree_remove(bfqd, bfqq); + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + bfq_weights_tree_remove(bfqq); } now_ns = ktime_get_ns(); @@ -5651,7 +6401,19 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) 1UL<<(BFQ_RATE_SHIFT - 10)) bfq_update_rate_reset(bfqd, NULL); bfqd->last_completion = now_ns; - bfqd->last_completed_rq_bfqq = bfqq; + /* + * Shared queues are likely to receive I/O at a high + * rate. This may deceptively let them be considered as wakers + * of other queues. But a false waker will unjustly steal + * bandwidth to its supposedly woken queue. So considering + * also shared queues in the waking mechanism may cause more + * control troubles than throughput benefits. Then reset + * last_completed_rq_bfqq if bfqq is a shared queue. 
+ */ + if (!bfq_bfqq_coop(bfqq)) + bfqd->last_completed_rq_bfqq = bfqq; + else + bfqd->last_completed_rq_bfqq = NULL; /* * If we are waiting to discover whether the request pattern @@ -5712,17 +6474,10 @@ static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) BFQQE_NO_MORE_REQUESTS); } - if (!bfqd->rq_in_driver) + if (!bfqd->tot_rq_in_driver) bfq_schedule_dispatch(bfqd); } -static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) -{ - bfqq->allocated--; - - bfq_put_queue(bfqq); -} - /* * The processes associated with bfqq may happen to generate their * cumulative I/O at a lower rate than the rate at which the device @@ -5850,13 +6605,13 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, * conditions to do it, or we can lower the last base value * computed. * - * NOTE: (bfqd->rq_in_driver == 1) means that there is no I/O + * NOTE: (bfqd->tot_rq_in_driver == 1) means that there is no I/O * request in flight, because this function is in the code * path that handles the completion of a request of bfqq, and, * in particular, this function is executed before - * bfqd->rq_in_driver is decremented in such a code path. + * bfqd->tot_rq_in_driver is decremented in such a code path. 
*/ - if ((bfqq->last_serv_time_ns == 0 && bfqd->rq_in_driver == 1) || + if ((bfqq->last_serv_time_ns == 0 && bfqd->tot_rq_in_driver == 1) || tot_time_ns < bfqq->last_serv_time_ns) { if (bfqq->last_serv_time_ns == 0) { /* @@ -5866,7 +6621,7 @@ static void bfq_update_inject_limit(struct bfq_data *bfqd, bfqq->inject_limit = max_t(unsigned int, 1, old_limit); } bfqq->last_serv_time_ns = tot_time_ns; - } else if (!bfqd->rqs_injected && bfqd->rq_in_driver == 1) + } else if (!bfqd->rqs_injected && bfqd->tot_rq_in_driver == 1) /* * No I/O injected and no request still in service in * the drive: these are the exact conditions for @@ -5894,18 +6649,7 @@ static void bfq_finish_requeue_request(struct request *rq) { struct bfq_queue *bfqq = RQ_BFQQ(rq); struct bfq_data *bfqd; - - /* - * Requeue and finish hooks are invoked in blk-mq without - * checking whether the involved request is actually still - * referenced in the scheduler. To handle this fact, the - * following two checks make this function exit in case of - * spurious invocations, for which there is nothing to do. - * - * First, check whether rq has nothing to do with an elevator. - */ - if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) - return; + unsigned long flags; /* * rq either is not associated with any icq, or is an already @@ -5923,39 +6667,17 @@ static void bfq_finish_requeue_request(struct request *rq) rq->io_start_time_ns, rq->cmd_flags); + spin_lock_irqsave(&bfqd->lock, flags); if (likely(rq->rq_flags & RQF_STARTED)) { - unsigned long flags; - - spin_lock_irqsave(&bfqd->lock, flags); - if (rq == bfqd->waited_rq) bfq_update_inject_limit(bfqd, bfqq); bfq_completed_request(bfqq, bfqd); - bfq_finish_requeue_request_body(bfqq); - - spin_unlock_irqrestore(&bfqd->lock, flags); - } else { - /* - * Request rq may be still/already in the scheduler, - * in which case we need to remove it (this should - * never happen in case of requeue). 
And we cannot - * defer such a check and removal, to avoid - * inconsistencies in the time interval from the end - * of this function to the start of the deferred work. - * This situation seems to occur only in process - * context, as a consequence of a merge. In the - * current version of the code, this implies that the - * lock is held. - */ - - if (!RB_EMPTY_NODE(&rq->rb_node)) { - bfq_remove_request(rq->q, rq); - bfqg_stats_update_io_remove(bfqq_group(bfqq), - rq->cmd_flags); - } - bfq_finish_requeue_request_body(bfqq); } + bfqq_request_freed(bfqq); + bfq_put_queue(bfqq); + RQ_BIC(rq)->requests--; + spin_unlock_irqrestore(&bfqd->lock, flags); /* * Reset private fields. In case of a requeue, this allows @@ -5978,6 +6700,16 @@ static void bfq_finish_requeue_request(struct request *rq) rq->elv.priv[1] = NULL; } +static void bfq_finish_request(struct request *rq) +{ + bfq_finish_requeue_request(rq); + + if (rq->elv.icq) { + put_io_context(rq->elv.icq->ioc); + rq->elv.icq = NULL; + } +} + /* * Removes the association between the current task and bfqq, assuming * that bic points to the bfq iocontext of the task. 
@@ -5996,7 +6728,7 @@ bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) return bfqq; } - bic_set_bfqq(bic, NULL, 1); + bic_set_bfqq(bic, NULL, true, bfqq->actuator_idx); bfq_put_cooperator(bfqq); @@ -6010,7 +6742,9 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, bool split, bool is_sync, bool *new_queue) { - struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); + unsigned int act_idx = bfq_actuator_index(bfqd, bio); + struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync, act_idx); + struct bfq_iocq_bfqq_data *bfqq_data = &bic->bfqq_data[act_idx]; if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) return bfqq; @@ -6020,16 +6754,16 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, if (bfqq) bfq_put_queue(bfqq); - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); + bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, split); - bic_set_bfqq(bic, bfqq, is_sync); + bic_set_bfqq(bic, bfqq, is_sync, act_idx); if (split && is_sync) { - if ((bic->was_in_burst_list && bfqd->large_burst) || - bic->saved_in_large_burst) + if ((bfqq_data->was_in_burst_list && bfqd->large_burst) || + bfqq_data->saved_in_large_burst) bfq_mark_bfqq_in_large_burst(bfqq); else { bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) + if (bfqq_data->was_in_burst_list) /* * If bfqq was in the current * burst list before being @@ -6075,6 +6809,8 @@ static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, */ static void bfq_prepare_request(struct request *rq) { + rq->elv.icq = ioc_find_get_icq(rq->q); + /* * Regardless of whether we have an icq attached, we have to * clear the scheduler pointers, as they might point to @@ -6116,19 +6852,20 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) struct bfq_queue *bfqq; bool new_queue = false; bool bfqq_already_existing = false, split = false; + unsigned int a_idx = bfq_actuator_index(bfqd, bio); if (unlikely(!rq->elv.icq)) return NULL; /* - * Assuming that elv.priv[1] is set 
only if everything is set + * Assuming that RQ_BFQQ(rq) is set only if everything is set * for this rq. This holds true, because this function is * invoked only for insertion or merging, and, after such * events, a request cannot be manipulated any longer before * being removed from bfq. */ - if (rq->elv.priv[1]) - return rq->elv.priv[1]; + if (RQ_BFQQ(rq)) + return RQ_BFQQ(rq); bic = icq_to_bic(rq->elv.icq); @@ -6141,27 +6878,48 @@ static struct bfq_queue *bfq_init_rq(struct request *rq) if (likely(!new_queue)) { /* If the queue was seeky for too long, break it apart. */ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); + if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq) && + !bic->bfqq_data[a_idx].stably_merged) { + struct bfq_queue *old_bfqq = bfqq; /* Update bic before losing reference to bfqq */ if (bfq_bfqq_in_large_burst(bfqq)) - bic->saved_in_large_burst = true; + bic->bfqq_data[a_idx].saved_in_large_burst = + true; bfqq = bfq_split_bfqq(bic, bfqq); split = true; - if (!bfqq) + if (!bfqq) { bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, true, is_sync, NULL); - else + if (unlikely(bfqq == &bfqd->oom_bfqq)) + bfqq_already_existing = true; + } else bfqq_already_existing = true; + + if (!bfqq_already_existing) { + bfqq->waker_bfqq = old_bfqq->waker_bfqq; + bfqq->tentative_waker_bfqq = NULL; + + /* + * If the waker queue disappears, then + * new_bfqq->waker_bfqq must be + * reset. So insert new_bfqq into the + * woken_list of the waker. See + * bfq_check_waker for details. 
+ */ + if (bfqq->waker_bfqq) + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } } } - bfqq->allocated++; + bfqq_request_allocated(bfqq); bfqq->ref++; + bic->requests++; bfq_log_bfqq(bfqd, bfqq, "get_request %p: bfqq %p, %d", rq, bfqq, bfqq->ref); @@ -6258,8 +7016,8 @@ bfq_idle_slice_timer_body(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_bfqq_expire(bfqd, bfqq, true, reason); schedule_dispatch: - spin_unlock_irqrestore(&bfqd->lock, flags); bfq_schedule_dispatch(bfqd); + spin_unlock_irqrestore(&bfqd->lock, flags); } /* @@ -6310,24 +7068,26 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, */ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) { - int i, j; + int i, j, k; - for (i = 0; i < 2; i++) - for (j = 0; j < IOPRIO_BE_NR; j++) - __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); + for (k = 0; k < bfqd->num_actuators; k++) { + for (i = 0; i < 2; i++) + for (j = 0; j < IOPRIO_NR_LEVELS; j++) + __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j][k]); - __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); + __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq[k]); + } } /* * See the comments on bfq_limit_depth for the purpose of * the depths set in the function. Return minimum shallow depth we'll use. */ -static unsigned int bfq_update_depths(struct bfq_data *bfqd, - struct sbitmap_queue *bt) +static void bfq_update_depths(struct bfq_data *bfqd, struct sbitmap_queue *bt) { - unsigned int i, j, min_shallow = UINT_MAX; + unsigned int depth = 1U << bt->sb.shift; + bfqd->full_depth_shift = bt->sb.shift; /* * In-word depths if no bfq_queue is being weight-raised: * leaving 25% of tags only for sync reads. @@ -6339,13 +7099,13 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * limit 'something'. 
*/ /* no more than 50% of tags for async I/O */ - bfqd->word_depths[0][0] = max((1U << bt->sb.shift) >> 1, 1U); + bfqd->word_depths[0][0] = max(depth >> 1, 1U); /* * no more than 75% of tags for sync writes (25% extra tags * w.r.t. async I/O, to prevent async I/O from starving sync * writes) */ - bfqd->word_depths[0][1] = max(((1U << bt->sb.shift) * 3) >> 2, 1U); + bfqd->word_depths[0][1] = max((depth * 3) >> 2, 1U); /* * In-word depths in case some bfq_queue is being weight- @@ -6355,25 +7115,18 @@ static unsigned int bfq_update_depths(struct bfq_data *bfqd, * shortage. */ /* no more than ~18% of tags for async I/O */ - bfqd->word_depths[1][0] = max(((1U << bt->sb.shift) * 3) >> 4, 1U); + bfqd->word_depths[1][0] = max((depth * 3) >> 4, 1U); /* no more than ~37% of tags for sync writes (~20% extra tags) */ - bfqd->word_depths[1][1] = max(((1U << bt->sb.shift) * 6) >> 4, 1U); - - for (i = 0; i < 2; i++) - for (j = 0; j < 2; j++) - min_shallow = min(min_shallow, bfqd->word_depths[i][j]); - - return min_shallow; + bfqd->word_depths[1][1] = max((depth * 6) >> 4, 1U); } static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) { struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; struct blk_mq_tags *tags = hctx->sched_tags; - unsigned int min_shallow; - min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); - sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); + bfq_update_depths(bfqd, &tags->bitmap_tags); + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); } static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) @@ -6386,6 +7139,7 @@ static void bfq_exit_queue(struct elevator_queue *e) { struct bfq_data *bfqd = e->elevator_data; struct bfq_queue *bfqq, *n; + unsigned int actuator; hrtimer_cancel(&bfqd->idle_slice_timer); @@ -6394,13 +7148,17 @@ static void bfq_exit_queue(struct elevator_queue *e) bfq_deactivate_bfqq(bfqd, bfqq, false, false); spin_unlock_irq(&bfqd->lock); + for (actuator = 0; actuator < 
bfqd->num_actuators; actuator++) + WARN_ON_ONCE(bfqd->rq_in_driver[actuator]); + WARN_ON_ONCE(bfqd->tot_rq_in_driver); + hrtimer_cancel(&bfqd->idle_slice_timer); /* release oom-queue reference to root group */ bfqg_and_blkg_put(bfqd->root_group); #ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); + blkcg_deactivate_policy(bfqd->queue->disk, &blkcg_policy_bfq); #else spin_lock_irq(&bfqd->lock); bfq_put_async_queues(bfqd, bfqd->root_group); @@ -6408,6 +7166,10 @@ static void bfq_exit_queue(struct elevator_queue *e) spin_unlock_irq(&bfqd->lock); #endif + blk_stat_disable_accounting(bfqd->queue); + clear_bit(ELEVATOR_FLAG_DISABLE_WBT, &e->flags); + wbt_enable_default(bfqd->queue->disk); + kfree(bfqd); } @@ -6431,6 +7193,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) { struct bfq_data *bfqd; struct elevator_queue *eq; + unsigned int i; + struct blk_independent_access_ranges *ia_ranges = q->disk->ia_ranges; eq = elevator_alloc(q, e); if (!eq) @@ -6451,8 +7215,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. * Grab a permanent reference to it, so that the normal code flow * will not attempt to free it. + * Set zero as actuator index: we will pretend that + * all I/O requests are for the same actuator. */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); + bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0, 0); bfqd->oom_bfqq.ref++; bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; @@ -6471,6 +7237,39 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->queue = q; + bfqd->num_actuators = 1; + /* + * If the disk supports multiple actuators, copy independent + * access ranges from the request queue structure. 
+ */ + spin_lock_irq(&q->queue_lock); + if (ia_ranges) { + /* + * Check if the disk ia_ranges size exceeds the current bfq + * actuator limit. + */ + if (ia_ranges->nr_ia_ranges > BFQ_MAX_ACTUATORS) { + pr_crit("nr_ia_ranges higher than act limit: iars=%d, max=%d.\n", + ia_ranges->nr_ia_ranges, BFQ_MAX_ACTUATORS); + pr_crit("Falling back to single actuator mode.\n"); + } else { + bfqd->num_actuators = ia_ranges->nr_ia_ranges; + + for (i = 0; i < bfqd->num_actuators; i++) { + bfqd->sector[i] = ia_ranges->ia_range[i].sector; + bfqd->nr_sectors[i] = + ia_ranges->ia_range[i].nr_sectors; + } + } + } + + /* Otherwise use single-actuator dev info */ + if (bfqd->num_actuators == 1) { + bfqd->sector[0] = 0; + bfqd->nr_sectors[0] = get_capacity(q->disk); + } + spin_unlock_irq(&q->queue_lock); + INIT_LIST_HEAD(&bfqd->dispatch); hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, @@ -6478,9 +7277,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->idle_slice_timer.function = bfq_idle_slice_timer; bfqd->queue_weights_tree = RB_ROOT_CACHED; +#ifdef CONFIG_BFQ_GROUP_IOSCHED bfqd->num_groups_with_pending_reqs = 0; +#endif - INIT_LIST_HEAD(&bfqd->active_list); + INIT_LIST_HEAD(&bfqd->active_list[0]); + INIT_LIST_HEAD(&bfqd->active_list[1]); INIT_LIST_HEAD(&bfqd->idle_list); INIT_HLIST_HEAD(&bfqd->burst_list); @@ -6496,8 +7298,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfqd->bfq_slice_idle = bfq_slice_idle; bfqd->bfq_timeout = bfq_timeout; - bfqd->bfq_requests_within_timer = 120; - bfqd->bfq_large_burst_thresh = 8; bfqd->bfq_burst_interval = msecs_to_jiffies(180); @@ -6508,7 +7308,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) */ bfqd->bfq_wr_coeff = 30; bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); bfqd->bfq_wr_max_softrt_rate = 
7000; /* @@ -6527,6 +7326,9 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + spin_lock_init(&bfqd->lock); /* @@ -6550,7 +7352,13 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) bfq_init_root_group(bfqd->root_group, bfqd); bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); - wbt_disable_default(q); + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); + + set_bit(ELEVATOR_FLAG_DISABLE_WBT, &eq->flags); + wbt_disable_default(q->disk); + blk_stat_enable_accounting(q); + return 0; out_free: @@ -6790,7 +7598,7 @@ static struct elevator_type iosched_bfq_mq = { .limit_depth = bfq_limit_depth, .prepare_request = bfq_prepare_request, .requeue_request = bfq_finish_requeue_request, - .finish_request = bfq_finish_requeue_request, + .finish_request = bfq_finish_request, .exit_icq = bfq_exit_icq, .insert_requests = bfq_insert_requests, .dispatch_request = bfq_dispatch_request, diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h index cd224aaf9f52..467e8cfc41a2 100644 --- a/block/bfq-iosched.h +++ b/block/bfq-iosched.h @@ -8,7 +8,6 @@ #include <linux/blktrace_api.h> #include <linux/hrtimer.h> -#include <linux/blk-cgroup.h> #include "blk-cgroup-rwstat.h" @@ -21,11 +20,10 @@ #define BFQ_DEFAULT_QUEUE_IOPRIO 4 -#define BFQ_WEIGHT_LEGACY_DFL 100 #define BFQ_DEFAULT_GRP_IOPRIO 0 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -#define MAX_PID_STR_LENGTH 12 +#define MAX_BFQQ_NAME_LENGTH 16 /* * Soft real-time applications are extremely more latency sensitive @@ -34,6 +32,14 @@ */ #define BFQ_SOFTRT_WEIGHT_FACTOR 100 +/* + * Maximum number of actuators supported. 
This constant is used simply + * to define the size of the static array that will contain + * per-actuator data. The current value is hopefully a good upper + * bound to the possible number of actuators of any actual drive. + */ +#define BFQ_MAX_ACTUATORS 8 + struct bfq_entity; /** @@ -170,6 +176,9 @@ struct bfq_entity { /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ int budget; + /* Number of requests allocated in the subtree of this entity */ + int allocated; + /* device weight, if non-zero, it overrides the default weight of * bfq_group_data */ int dev_weight; @@ -195,8 +204,13 @@ struct bfq_entity { /* flag, set to request a weight, ioprio or ioprio_class change */ int prio_changed; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* flag, set if the entity is counted in groups_with_pending_reqs */ bool in_groups_with_pending_reqs; +#endif + + /* last child queue of entity created (for non-leaf entities) */ + struct bfq_queue *last_bfqq_created; }; struct bfq_group; @@ -220,16 +234,20 @@ struct bfq_ttime { * struct bfq_queue - leaf schedulable entity. * * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating - * processes. @cgroup holds a reference to the cgroup, to be sure that it - * does not disappear while a bfqq still references it (mostly to avoid - * races between request issuing and task migration followed by cgroup - * destruction). - * All the fields are protected by the queue lock of the containing bfqd. + * io_context or more, if it is async or shared between cooperating + * processes. Besides, it contains I/O requests for only one actuator + * (an io_context is associated with a different bfq_queue for each + * actuator it generates I/O for). 
@cgroup holds a reference to the + * cgroup, to be sure that it does not disappear while a bfqq still + * references it (mostly to avoid races between request issuing and + * task migration followed by cgroup destruction). All the fields are + * protected by the queue lock of the containing bfqd. */ struct bfq_queue { /* reference counter */ int ref; + /* counter of references from other queues for delayed stable merge */ + int stable_ref; /* parent bfq_data */ struct bfq_data *bfqd; @@ -261,8 +279,6 @@ struct bfq_queue { struct request *next_rq; /* number of sync and async requests queued */ int queued[2]; - /* number of requests currently allocated */ - int allocated; /* number of pending metadata requests */ int meta_pending; /* fifo list of requests in sort_list */ @@ -291,6 +307,11 @@ struct bfq_queue { /* associated @bfq_ttime struct */ struct bfq_ttime ttime; + /* when bfqq started to do I/O within the last observation window */ + u64 io_start_time; + /* how long bfqq has remained empty during the last observ. window */ + u64 tot_idle_time; + /* bit vector: a 1 for each seeky requests in history */ u32 seek_history; @@ -359,9 +380,7 @@ struct bfq_queue { unsigned long split_time; /* time of last split */ unsigned long first_IO_time; /* time of first I/O for this queue */ - - /* max service rate measured so far */ - u32 max_service_rate; + unsigned long creation_time; /* when this queue is created */ /* * Pointer to the waker queue for this queue, i.e., to the @@ -371,6 +390,13 @@ struct bfq_queue { * bfq_select_queue(). */ struct bfq_queue *waker_bfqq; + /* pointer to the curr. 
tentative waker queue, see bfq_check_waker() */ + struct bfq_queue *tentative_waker_bfqq; + /* number of times the same tentative waker has been detected */ + unsigned int num_waker_detections; + /* time when we started considering this waker */ + u64 waker_detection_started; + /* node for woken_list, see below */ struct hlist_node woken_list_node; /* @@ -380,24 +406,18 @@ struct bfq_queue { * the woken queues when this queue exits. */ struct hlist_head woken_list; + + /* index of the actuator this queue is associated with */ + unsigned int actuator_idx; }; /** - * struct bfq_io_cq - per (request_queue, io_context) structure. - */ -struct bfq_io_cq { - /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ - /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; - /* per (request_queue, blkcg) ioprio */ - int ioprio; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_serial_nr; /* the current blkcg serial */ -#endif +* struct bfq_iocq_bfqq_data - bfqq data unique and persistent for associated bfq_io_cq +*/ +struct bfq_iocq_bfqq_data { /* * Snapshot of the has_short_time flag before merging; taken - * to remember its value while the queue is merged, so as to + * to remember its values while the queue is merged, so as to * be able to restore it in case of split. 
*/ bool saved_has_short_ttime; @@ -407,8 +427,11 @@ struct bfq_io_cq { */ bool saved_IO_bound; + u64 saved_io_start_time; + u64 saved_tot_idle_time; + /* - * Same purpose as the previous fields for the value of the + * Same purpose as the previous fields for the values of the * field keeping the queue's belonging to a large burst */ bool saved_in_large_burst; @@ -432,9 +455,53 @@ struct bfq_io_cq { */ unsigned long saved_wr_coeff; unsigned long saved_last_wr_start_finish; + unsigned long saved_service_from_wr; unsigned long saved_wr_start_at_switch_to_srt; unsigned int saved_wr_cur_max_time; struct bfq_ttime saved_ttime; + + /* Save also injection state */ + u64 saved_last_serv_time_ns; + unsigned int saved_inject_limit; + unsigned long saved_decrease_time_jif; + + /* candidate queue for a stable merge (due to close creation time) */ + struct bfq_queue *stable_merge_bfqq; + + bool stably_merged; /* non splittable if true */ +}; + +/** + * struct bfq_io_cq - per (request_queue, io_context) structure. + */ +struct bfq_io_cq { + /* associated io_cq structure */ + struct io_cq icq; /* must be the first member */ + /* + * Matrix of associated process queues: first row for async + * queues, second row sync queues. Each row contains one + * column for each actuator. An I/O request generated by the + * process is inserted into the queue pointed by bfqq[i][j] if + * the request is to be served by the j-th actuator of the + * drive, where i==0 or i==1, depending on whether the request + * is async or sync. So there is a distinct queue for each + * actuator. + */ + struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; + /* per (request_queue, blkcg) ioprio */ + int ioprio; +#ifdef CONFIG_BFQ_GROUP_IOSCHED + uint64_t blkcg_serial_nr; /* the current blkcg serial */ +#endif + + /* + * Persistent data for associated synchronous process queues + * (one queue per actuator, see field bfqq above). In + * particular, each of these queues may undergo a merge. 
+ */ + struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; + + unsigned int requests; /* Number of requests this process has in flight */ }; /** @@ -461,28 +528,29 @@ struct bfq_data { */ struct rb_root_cached queue_weights_tree; +#ifdef CONFIG_BFQ_GROUP_IOSCHED /* - * Number of groups with at least one descendant process that + * Number of groups with at least one process that * has at least one request waiting for completion. Note that * this accounts for also requests already dispatched, but not * yet completed. Therefore this number of groups may differ * (be larger) than the number of active groups, as a group is * considered active only if its corresponding entity has - * descendant queues with at least one request queued. This + * queues with at least one request queued. This * number is used to decide whether a scenario is symmetric. * For a detailed explanation see comments on the computation * of the variable asymmetric_scenario in the function * bfq_better_to_idle(). * * However, it is hard to compute this number exactly, for - * groups with multiple descendant processes. Consider a group - * that is inactive, i.e., that has no descendant process with + * groups with multiple processes. Consider a group + * that is inactive, i.e., that has no process with * pending I/O inside BFQ queues. Then suppose that * num_groups_with_pending_reqs is still accounting for this - * group, because the group has descendant processes with some + * group, because the group has processes with some * I/O request still in flight. num_groups_with_pending_reqs * should be decremented when the in-flight request of the - * last descendant process is finally completed (assuming that + * last process is finally completed (assuming that * nothing else has changed for the group in the meantime, in * terms of composition of the group and active/inactive state of child * groups and processes). 
To accomplish this, an additional @@ -491,7 +559,7 @@ struct bfq_data { * we resort to the following tradeoff between simplicity and * accuracy: for an inactive group that is still counted in * num_groups_with_pending_reqs, we decrement - * num_groups_with_pending_reqs when the first descendant + * num_groups_with_pending_reqs when the first * process of the group remains with no request waiting for * completion. * @@ -499,15 +567,16 @@ struct bfq_data { * carefulness: to avoid multiple decrements, we flag a group, * more precisely an entity representing a group, as still * counted in num_groups_with_pending_reqs when it becomes - * inactive. Then, when the first descendant queue of the + * inactive. Then, when the first queue of the * entity remains with no request waiting for completion, * num_groups_with_pending_reqs is decremented, and this flag * is reset. After this flag is reset for the entity, * num_groups_with_pending_reqs won't be decremented any - * longer in case a new descendant queue of the entity remains + * longer in case a new queue of the entity remains * with no request waiting for completion. 
*/ unsigned int num_groups_with_pending_reqs; +#endif /* * Per-class (RT, BE, IDLE) number of bfq_queues containing @@ -520,7 +589,12 @@ struct bfq_data { /* number of queued requests */ int queued; /* number of requests dispatched and waiting for completion */ - int rq_in_driver; + int tot_rq_in_driver; + /* + * number of requests dispatched and waiting for completion + * for each actuator + */ + int rq_in_driver[BFQ_MAX_ACTUATORS]; /* true if the device is non rotational and performs queueing */ bool nonrot_with_queueing; @@ -559,6 +633,9 @@ struct bfq_data { /* bfqq owning the last completed rq */ struct bfq_queue *last_completed_rq_bfqq; + /* last bfqq created, among those in the root group */ + struct bfq_queue *last_bfqq_created; + /* time of last transition from empty to non-empty (ns) */ u64 last_empty_occupied_ns; @@ -611,8 +688,13 @@ struct bfq_data { /* maximum budget allotted to a bfq_queue before rescheduling */ int bfq_max_budget; - /* list of all the bfq_queues active on the device */ - struct list_head active_list; + /* + * List of all the bfq_queues active for a specific actuator + * on the device. Keeping active queues separate on a + * per-actuator basis helps implementing per-actuator + * injection more efficiently. + */ + struct list_head active_list[BFQ_MAX_ACTUATORS]; /* list of all the bfq_queues idle on the device */ struct list_head idle_list; @@ -642,14 +724,6 @@ struct bfq_data { unsigned int bfq_timeout; /* - * Number of consecutive requests that must be issued within - * the idle time slice to set again idling to a queue which - * was marked as non-I/O-bound (see the definition of the - * IO_bound flag for further details). - */ - unsigned int bfq_requests_within_timer; - - /* * Force device idling whenever needed to provide accurate * service guarantees, without caring about throughput * issues. CAVEAT: this may even increase latencies, in case @@ -694,8 +768,6 @@ struct bfq_data { * is multiplied. 
*/ unsigned int bfq_wr_coeff; - /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; /* Maximum weight-raising duration for soft real-time processes */ unsigned int bfq_wr_rt_max_time; @@ -742,6 +814,43 @@ struct bfq_data { * function) */ unsigned int word_depths[2][2]; + unsigned int full_depth_shift; + + /* + * Number of independent actuators. This is equal to 1 in + * case of single-actuator drives. + */ + unsigned int num_actuators; + /* + * Disk independent access ranges for each actuator + * in this device. + */ + sector_t sector[BFQ_MAX_ACTUATORS]; + sector_t nr_sectors[BFQ_MAX_ACTUATORS]; + struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; + + /* + * If the number of I/O requests queued in the device for a + * given actuator is below next threshold, then the actuator + * is deemed as underutilized. If this condition is found to + * hold for some actuator upon a dispatch, but (i) the + * in-service queue does not contain I/O for that actuator, + * while (ii) some other queue does contain I/O for that + * actuator, then the head I/O request of the latter queue is + * returned (injected), instead of the head request of the + * currently in-service queue. + * + * We set the threshold, empirically, to the minimum possible + * value for which an actuator is fully utilized, or close to + * be fully utilized. By doing so, injected I/O 'steals' as + * few drive-queue slots as possibile to the in-service + * queue. This reduces as much as possible the probability + * that the service of I/O from the in-service bfq_queue gets + * delayed because of slot exhaustion, i.e., because all the + * slots of the drive queue are filled with I/O injected from + * other queues (NCQ provides for 32 slots). 
+ */ + unsigned int actuator_load_threshold; }; enum bfqq_state_flags { @@ -770,7 +879,6 @@ enum bfqq_state_flags { */ BFQQF_coop, /* bfqq is shared */ BFQQF_split_coop, /* shared bfqq will be split */ - BFQQF_has_waker /* bfqq has a waker queue */ }; #define BFQ_BFQQ_FNS(name) \ @@ -790,7 +898,6 @@ BFQ_BFQQ_FNS(in_large_burst); BFQ_BFQQ_FNS(coop); BFQ_BFQQ_FNS(split_coop); BFQ_BFQQ_FNS(softrt_update); -BFQ_BFQQ_FNS(has_waker); #undef BFQ_BFQQ_FNS /* Expiration reasons. */ @@ -900,19 +1007,20 @@ struct bfq_group { char blkg_path[128]; /* reference counter (see comments in bfq_bic_update_cgroup) */ - int ref; + refcount_t ref; struct bfq_entity entity; struct bfq_sched_data sched_data; - void *bfqd; + struct bfq_data *bfqd; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct bfq_entity *my_entity; int active_entities; + int num_queues_with_pending_reqs; struct rb_root rq_pos_tree; @@ -924,15 +1032,13 @@ struct bfq_group { struct bfq_entity entity; struct bfq_sched_data sched_data; - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; + struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; + struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; struct rb_root rq_pos_tree; }; #endif -struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - /* --------------- main algorithm interface ----------------- */ #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ @@ -940,20 +1046,18 @@ struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); extern const int bfq_timeout; -struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); -void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); +struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, + unsigned int actuator_idx); +void bic_set_bfqq(struct 
bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, + unsigned int actuator_idx); struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); -void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, - struct rb_root_cached *root); -void __bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct rb_root_cached *root); -void bfq_weights_tree_remove(struct bfq_data *bfqd, - struct bfq_queue *bfqq); +void bfq_weights_tree_add(struct bfq_queue *bfqq); +void bfq_weights_tree_remove(struct bfq_queue *bfqq); void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool compensate, enum bfqq_expiration reason); void bfq_put_queue(struct bfq_queue *bfqq); +void bfq_put_cooperator(struct bfq_queue *bfqq); void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_schedule_dispatch(struct bfq_data *bfqd); @@ -964,29 +1068,30 @@ void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); /* ---------------- cgroups-support interface ---------------- */ void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); -void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, - unsigned int op); -void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); -void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); +void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); +void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, - u64 io_start_time_ns, unsigned int op); + u64 io_start_time_ns, blk_opf_t opf); void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -void bfqg_stats_update_idle_time(struct 
bfq_group *bfqg); void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); -void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, struct bfq_group *bfqg); +#ifdef CONFIG_BFQ_CGROUP_DEBUG +void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, + blk_opf_t opf); +void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); +void bfqg_stats_update_idle_time(struct bfq_group *bfqg); +void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); +#endif + void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); void bfq_end_wr_async(struct bfq_data *bfqd); -struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, - struct blkcg *blkcg); +struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); struct bfq_group *bfqq_group(struct bfq_queue *bfqq); struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); -void bfqg_and_blkg_get(struct bfq_group *bfqg); void bfqg_and_blkg_put(struct bfq_group *bfqg); #ifdef CONFIG_BFQ_GROUP_IOSCHED @@ -1026,7 +1131,6 @@ extern struct blkcg_policy blkcg_policy_bfq; for (parent = NULL; entity ; entity = parent) #endif /* CONFIG_BFQ_GROUP_IOSCHED */ -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); @@ -1051,50 +1155,50 @@ void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bool expiration); -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration); -void 
bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); +void bfq_add_bfqq_busy(struct bfq_queue *bfqq); +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); /* --------------- end of interface of B-WF2Q+ ---------------- */ /* Logging facilities. */ -static inline void bfq_pid_to_str(int pid, char *str, int len) +static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) { - if (pid != -1) - snprintf(str, len, "%d", pid); + char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A'; + + if (bfqq->pid != -1) + snprintf(str, len, "bfq%d%c", bfqq->pid, type); else - snprintf(str, len, "SHARED-"); + snprintf(str, len, "bfqSHARED-%c", type); } #ifdef CONFIG_BFQ_GROUP_IOSCHED struct bfq_group *bfqq_group(struct bfq_queue *bfqq); #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ - "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ + &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ + "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ blk_add_cgroup_trace_msg((bfqd)->queue, \ - bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ + &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ } while (0) #else /* CONFIG_BFQ_GROUP_IOSCHED */ #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) 
do { \ - char pid_str[MAX_PID_STR_LENGTH]; \ + char pid_str[MAX_BFQQ_NAME_LENGTH]; \ if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ break; \ - bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ - blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ - bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ - ##args); \ + bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ + blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ } while (0) #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) diff --git a/block/bfq-wf2q.c b/block/bfq-wf2q.c index eb0e2a6daabe..7941b6f07391 100644 --- a/block/bfq-wf2q.c +++ b/block/bfq-wf2q.c @@ -137,24 +137,11 @@ static bool bfq_update_next_in_service(struct bfq_sched_data *sd, sd->next_in_service = next_in_service; - if (!next_in_service) - return parent_sched_may_change; - return parent_sched_may_change; } #ifdef CONFIG_BFQ_GROUP_IOSCHED -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -{ - struct bfq_entity *group_entity = bfqq->entity.parent; - - if (!group_entity) - group_entity = &bfqq->bfqd->root_group->entity; - - return container_of(group_entity, struct bfq_group, entity); -} - /* * Returns true if this budget changes may let next_in_service->parent * become the next_in_service entity for its parent entity. 
@@ -231,13 +218,26 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return false; } -#else /* CONFIG_BFQ_GROUP_IOSCHED */ +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities++; +} -struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) +static void bfq_dec_active_entities(struct bfq_entity *entity) { - return bfqq->bfqd->root_group; + struct bfq_sched_data *sd = entity->sched_data; + struct bfq_group *bfqg = container_of(sd, struct bfq_group, sched_data); + + if (bfqg != bfqg->bfqd->root_group) + bfqg->active_entities--; } +#else /* CONFIG_BFQ_GROUP_IOSCHED */ + static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) { return false; @@ -248,6 +248,14 @@ static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) return true; } +static void bfq_inc_active_entities(struct bfq_entity *entity) +{ +} + +static void bfq_dec_active_entities(struct bfq_entity *entity) +{ +} + #endif /* CONFIG_BFQ_GROUP_IOSCHED */ /* @@ -474,11 +482,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node = &entity->rb_node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif bfq_insert(&st->active, entity); @@ -489,17 +492,10 @@ static void bfq_active_insert(struct bfq_service_tree *st, bfq_update_active_tree(node); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) - list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities++; -#endif + 
list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list[bfqq->actuator_idx]); + + bfq_inc_active_entities(entity); } /** @@ -508,7 +504,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, */ unsigned short bfq_ioprio_to_weight(int ioprio) { - return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; + return (IOPRIO_NR_LEVELS - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; } /** @@ -517,12 +513,12 @@ unsigned short bfq_ioprio_to_weight(int ioprio) * * To preserve as much as possible the old only-ioprio user interface, * 0 is used as an escape ioprio value for weights (numerically) equal or - * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. + * larger than IOPRIO_NR_LEVELS * BFQ_WEIGHT_CONVERSION_COEFF. */ static unsigned short bfq_weight_to_ioprio(int weight) { return max_t(int, 0, - IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight); + IOPRIO_NR_LEVELS - weight / BFQ_WEIGHT_CONVERSION_COEFF); } static void bfq_get_entity(struct bfq_entity *entity) @@ -533,9 +529,7 @@ static void bfq_get_entity(struct bfq_entity *entity) bfqq->ref++; bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", bfqq, bfqq->ref); - } else - bfqg_and_blkg_get(container_of(entity, struct bfq_group, - entity)); + } } /** @@ -578,29 +572,16 @@ static void bfq_active_extract(struct bfq_service_tree *st, { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); struct rb_node *node; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd = NULL; - struct bfq_group *bfqg = NULL; - struct bfq_data *bfqd = NULL; -#endif node = bfq_find_deepest(&entity->rb_node); bfq_extract(&st->active, entity); if (node) bfq_update_active_tree(node); - -#ifdef CONFIG_BFQ_GROUP_IOSCHED - sd = entity->sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; -#endif if (bfqq) list_del(&bfqq->bfqq_list); -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (bfqg != bfqd->root_group) - bfqg->active_entities--; -#endif + + bfq_dec_active_entities(entity); } /** @@ 
-649,14 +630,8 @@ static void bfq_forget_entity(struct bfq_service_tree *st, entity->on_st_or_in_serv = false; st->wsum -= entity->weight; - if (is_in_service) - return; - - if (bfqq) + if (bfqq && !is_in_service) bfq_put_queue(bfqq); - else - bfqg_and_blkg_put(container_of(entity, struct bfq_group, - entity)); } /** @@ -732,22 +707,6 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, if (entity->prio_changed) { struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root_cached *root; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_sched_data *sd; - struct bfq_group *bfqg; -#endif - - if (bfqq) - bfqd = bfqq->bfqd; -#ifdef CONFIG_BFQ_GROUP_IOSCHED - else { - sd = entity->my_sched_data; - bfqg = container_of(sd, struct bfq_group, sched_data); - bfqd = (struct bfq_data *)bfqg->bfqd; - } -#endif /* Matches the smp_wmb() in bfq_group_set_weight. */ smp_rmb(); @@ -796,19 +755,15 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, * queue, remove the entity from its old weight counter (if * there is a counter associated with the entity). */ - if (prev_weight != new_weight && bfqq) { - root = &bfqd->queue_weights_tree; - __bfq_weights_tree_remove(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq) + bfq_weights_tree_remove(bfqq); entity->weight = new_weight; /* * Add the entity, if it is not a weight-raised queue, * to the counter associated with its new weight. */ - if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { - /* If we get here, root has been initialized. 
*/ - bfq_weights_tree_add(bfqd, bfqq, root); - } + if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) + bfq_weights_tree_add(bfqq); new_st->wsum += entity->weight; @@ -1010,19 +965,6 @@ static void __bfq_activate_entity(struct bfq_entity *entity, entity->on_st_or_in_serv = true; } -#ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ - struct bfq_group *bfqg = - container_of(entity, struct bfq_group, entity); - struct bfq_data *bfqd = bfqg->bfqd; - - if (!entity->in_groups_with_pending_reqs) { - entity->in_groups_with_pending_reqs = true; - bfqd->num_groups_with_pending_reqs++; - } - } -#endif - bfq_update_fin_time_enqueue(entity, st, backshifted); } @@ -1108,12 +1050,12 @@ static void __bfq_requeue_entity(struct bfq_entity *entity) } static void __bfq_activate_requeue_entity(struct bfq_entity *entity, - struct bfq_sched_data *sd, bool non_blocking_wait_rq) { struct bfq_service_tree *st = bfq_entity_service_tree(entity); - if (sd->in_service_entity == entity || entity->tree == &st->active) + if (entity->sched_data->in_service_entity == entity || + entity->tree == &st->active) /* * in service or already queued on the active tree, * requeue or reposition @@ -1145,14 +1087,10 @@ static void bfq_activate_requeue_entity(struct bfq_entity *entity, bool non_blocking_wait_rq, bool requeue, bool expiration) { - struct bfq_sched_data *sd; - for_each_entity(entity) { - sd = entity->sched_data; - __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); - - if (!bfq_update_next_in_service(sd, entity, expiration) && - !requeue) + __bfq_activate_requeue_entity(entity, non_blocking_wait_rq); + if (!bfq_update_next_in_service(entity->sched_data, entity, + expiration) && !requeue) break; } } @@ -1386,6 +1324,8 @@ left: /** * __bfq_lookup_next_entity - return the first eligible entity in @st. * @st: the service tree. + * @in_service: whether or not there is an in-service entity for the sched_data + * this active tree belongs to. 
* * If there is no in-service entity for the sched_data st belongs to, * then return the entity that will be set in service if: @@ -1498,9 +1438,6 @@ static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, break; } - if (!entity) - return NULL; - return entity; } @@ -1673,14 +1610,41 @@ void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfqq == bfqd->in_service_queue, expiration); } +void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_entity *entity = &bfqq->entity; + + if (!entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = true; + if (!(bfqq_group(bfqq)->num_queues_with_pending_reqs++)) + bfqq->bfqd->num_groups_with_pending_reqs++; + } +#endif +} + +void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq) +{ +#ifdef CONFIG_BFQ_GROUP_IOSCHED + struct bfq_entity *entity = &bfqq->entity; + + if (entity->in_groups_with_pending_reqs) { + entity->in_groups_with_pending_reqs = false; + if (!(--bfqq_group(bfqq)->num_queues_with_pending_reqs)) + bfqq->bfqd->num_groups_with_pending_reqs--; + } +#endif +} + /* * Called when the bfqq no longer has requests pending, remove it from * the service tree. As a special case, it can be invoked during an * expiration. */ -void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bool expiration) +void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "del from busy"); bfq_clear_bfqq_busy(bfqq); @@ -1694,15 +1658,23 @@ void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); - if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, bfqq); + if (!bfqq->dispatched) { + bfq_del_bfqq_in_groups_with_pending_reqs(bfqq); + /* + * Next function is invoked last, because it causes bfqq to be + * freed. 
DO NOT use bfqq after the next function invocation. + */ + bfq_weights_tree_remove(bfqq); + } } /* * Called when an inactive queue receives a new request. */ -void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) +void bfq_add_bfqq_busy(struct bfq_queue *bfqq) { + struct bfq_data *bfqd = bfqq->bfqd; + bfq_log_bfqq(bfqd, bfqq, "add to busy"); bfq_activate_bfqq(bfqd, bfqq); @@ -1710,11 +1682,20 @@ void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) bfq_mark_bfqq_busy(bfqq); bfqd->busy_queues[bfqq->ioprio_class - 1]++; - if (!bfqq->dispatched) + if (!bfqq->dispatched) { + bfq_add_bfqq_in_groups_with_pending_reqs(bfqq); if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, bfqq, - &bfqd->queue_weights_tree); + bfq_weights_tree_add(bfqq); + } if (bfqq->wr_coeff > 1) bfqd->wr_busy_queues++; + + /* Move bfqq to the head of the woken list of its waker */ + if (!hlist_unhashed(&bfqq->woken_list_node) && + &bfqq->woken_list_node != bfqq->waker_bfqq->woken_list.first) { + hlist_del_init(&bfqq->woken_list_node); + hlist_add_head(&bfqq->woken_list_node, + &bfqq->waker_bfqq->woken_list); + } } diff --git a/block/bio-integrity.c b/block/bio-integrity.c index 9ffd7e289554..c9a16fba58b9 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. 
Petersen <martin.petersen@oracle.com> */ -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/mempool.h> #include <linux/export.h> #include <linux/bio.h> @@ -14,8 +14,6 @@ #include <linux/slab.h> #include "blk.h" -#define BIP_INLINE_VECS 4 - static struct kmem_cache *bip_slab; static struct workqueue_struct *kintegrityd_wq; @@ -30,7 +28,7 @@ static void __bio_integrity_free(struct bio_set *bs, if (bs && mempool_initialized(&bs->bio_integrity_pool)) { if (bip->bip_vec) bvec_free(&bs->bvec_integrity_pool, bip->bip_vec, - bip->bip_slab); + bip->bip_max_vcnt); mempool_free(bip, &bs->bio_integrity_pool); } else { kfree(bip); @@ -63,7 +61,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, inline_vecs = nr_vecs; } else { bip = mempool_alloc(&bs->bio_integrity_pool, gfp_mask); - inline_vecs = BIP_INLINE_VECS; + inline_vecs = BIO_INLINE_VECS; } if (unlikely(!bip)) @@ -71,18 +69,15 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, memset(bip, 0, sizeof(*bip)); + /* always report as many vecs as asked explicitly, not inline vecs */ + bip->bip_max_vcnt = nr_vecs; if (nr_vecs > inline_vecs) { - unsigned long idx = 0; - - bip->bip_vec = bvec_alloc(gfp_mask, nr_vecs, &idx, - &bs->bvec_integrity_pool); + bip->bip_vec = bvec_alloc(&bs->bvec_integrity_pool, + &bip->bip_max_vcnt, gfp_mask); if (!bip->bip_vec) goto err; - bip->bip_max_vcnt = bvec_nr_vecs(idx); - bip->bip_slab = idx; } else { bip->bip_vec = bip->bip_inline_vecs; - bip->bip_max_vcnt = inline_vecs; } bip->bip_bio = bio; @@ -96,6 +91,47 @@ err: } EXPORT_SYMBOL(bio_integrity_alloc); +static void bio_integrity_unpin_bvec(struct bio_vec *bv, int nr_vecs, + bool dirty) +{ + int i; + + for (i = 0; i < nr_vecs; i++) { + if (dirty && !PageCompound(bv[i].bv_page)) + set_page_dirty_lock(bv[i].bv_page); + unpin_user_page(bv[i].bv_page); + } +} + +static void bio_integrity_uncopy_user(struct bio_integrity_payload *bip) +{ + unsigned short nr_vecs = bip->bip_max_vcnt - 
1; + struct bio_vec *copy = &bip->bip_vec[1]; + size_t bytes = bip->bip_iter.bi_size; + struct iov_iter iter; + int ret; + + iov_iter_bvec(&iter, ITER_DEST, copy, nr_vecs, bytes); + ret = copy_to_iter(bvec_virt(bip->bip_vec), bytes, &iter); + WARN_ON_ONCE(ret != bytes); + + bio_integrity_unpin_bvec(copy, nr_vecs, true); +} + +static void bio_integrity_unmap_user(struct bio_integrity_payload *bip) +{ + bool dirty = bio_data_dir(bip->bip_bio) == READ; + + if (bip->bip_flags & BIP_COPY_USER) { + if (dirty) + bio_integrity_uncopy_user(bip); + kfree(bvec_virt(bip->bip_vec)); + return; + } + + bio_integrity_unpin_bvec(bip->bip_vec, bip->bip_max_vcnt, dirty); +} + /** * bio_integrity_free - Free bio integrity payload * @bio: bio containing bip to be freed @@ -109,8 +145,9 @@ void bio_integrity_free(struct bio *bio) struct bio_set *bs = bio->bi_pool; if (bip->bip_flags & BIP_BLOCK_INTEGRITY) - kfree(page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset); + kfree(bvec_virt(bip->bip_vec)); + else if (bip->bip_flags & BIP_INTEGRITY_USER) + bio_integrity_unmap_user(bip); __bio_integrity_free(bs, bip); bio->bi_integrity = NULL; @@ -129,30 +166,214 @@ void bio_integrity_free(struct bio *bio) int bio_integrity_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int offset) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); struct bio_integrity_payload *bip = bio_integrity(bio); - struct bio_vec *iv; - if (bip->bip_vcnt >= bip->bip_max_vcnt) { - printk(KERN_ERR "%s: bip_vec full\n", __func__); + if (((bip->bip_iter.bi_size + len) >> SECTOR_SHIFT) > + queue_max_hw_sectors(q)) return 0; - } - iv = bip->bip_vec + bip->bip_vcnt; + if (bip->bip_vcnt > 0) { + struct bio_vec *bv = &bip->bip_vec[bip->bip_vcnt - 1]; + bool same_page = false; - if (bip->bip_vcnt && - bvec_gap_to_prev(bio->bi_disk->queue, - &bip->bip_vec[bip->bip_vcnt - 1], offset)) - return 0; + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + &same_page)) { + 
bip->bip_iter.bi_size += len; + return len; + } + + if (bip->bip_vcnt >= + min(bip->bip_max_vcnt, queue_max_integrity_segments(q))) + return 0; - iv->bv_page = page; - iv->bv_len = len; - iv->bv_offset = offset; + /* + * If the queue doesn't support SG gaps and adding this segment + * would create a gap, disallow it. + */ + if (bvec_gap_to_prev(&q->limits, bv, offset)) + return 0; + } + + bvec_set_page(&bip->bip_vec[bip->bip_vcnt], page, len, offset); bip->bip_vcnt++; + bip->bip_iter.bi_size += len; return len; } EXPORT_SYMBOL(bio_integrity_add_page); +static int bio_integrity_copy_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, + unsigned int direction, u32 seed) +{ + bool write = direction == ITER_SOURCE; + struct bio_integrity_payload *bip; + struct iov_iter iter; + void *buf; + int ret; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + if (write) { + iov_iter_bvec(&iter, direction, bvec, nr_vecs, len); + if (!copy_from_iter_full(buf, len, &iter)) { + ret = -EFAULT; + goto free_buf; + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + } else { + memset(buf, 0, len); + + /* + * We need to preserve the original bvec and the number of vecs + * in it for completion handling + */ + bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs + 1); + } + + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto free_buf; + } + + if (write) + bio_integrity_unpin_bvec(bvec, nr_vecs, false); + else + memcpy(&bip->bip_vec[1], bvec, nr_vecs * sizeof(*bvec)); + + ret = bio_integrity_add_page(bio, virt_to_page(buf), len, + offset_in_page(buf)); + if (ret != len) { + ret = -ENOMEM; + goto free_bip; + } + + bip->bip_flags |= BIP_INTEGRITY_USER | BIP_COPY_USER; + bip->bip_iter.bi_sector = seed; + return 0; +free_bip: + bio_integrity_free(bio); +free_buf: + kfree(buf); + return ret; +} + +static int bio_integrity_init_user(struct bio *bio, struct bio_vec *bvec, + int nr_vecs, unsigned int len, u32 seed) +{ + struct bio_integrity_payload *bip; + 
+ bip = bio_integrity_alloc(bio, GFP_KERNEL, nr_vecs); + if (IS_ERR(bip)) + return PTR_ERR(bip); + + memcpy(bip->bip_vec, bvec, nr_vecs * sizeof(*bvec)); + bip->bip_flags |= BIP_INTEGRITY_USER; + bip->bip_iter.bi_sector = seed; + bip->bip_iter.bi_size = len; + return 0; +} + +static unsigned int bvec_from_pages(struct bio_vec *bvec, struct page **pages, + int nr_vecs, ssize_t bytes, ssize_t offset) +{ + unsigned int nr_bvecs = 0; + int i, j; + + for (i = 0; i < nr_vecs; i = j) { + size_t size = min_t(size_t, bytes, PAGE_SIZE - offset); + struct folio *folio = page_folio(pages[i]); + + bytes -= size; + for (j = i + 1; j < nr_vecs; j++) { + size_t next = min_t(size_t, PAGE_SIZE, bytes); + + if (page_folio(pages[j]) != folio || + pages[j] != pages[j - 1] + 1) + break; + unpin_user_page(pages[j]); + size += next; + bytes -= next; + } + + bvec_set_page(&bvec[nr_bvecs], pages[i], size, offset); + offset = 0; + nr_bvecs++; + } + + return nr_bvecs; +} + +int bio_integrity_map_user(struct bio *bio, void __user *ubuf, ssize_t bytes, + u32 seed) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + unsigned int align = q->dma_pad_mask | queue_dma_alignment(q); + struct page *stack_pages[UIO_FASTIOV], **pages = stack_pages; + struct bio_vec stack_vec[UIO_FASTIOV], *bvec = stack_vec; + unsigned int direction, nr_bvecs; + struct iov_iter iter; + int ret, nr_vecs; + size_t offset; + bool copy; + + if (bio_integrity(bio)) + return -EINVAL; + if (bytes >> SECTOR_SHIFT > queue_max_hw_sectors(q)) + return -E2BIG; + + if (bio_data_dir(bio) == READ) + direction = ITER_DEST; + else + direction = ITER_SOURCE; + + iov_iter_ubuf(&iter, direction, ubuf, bytes); + nr_vecs = iov_iter_npages(&iter, BIO_MAX_VECS + 1); + if (nr_vecs > BIO_MAX_VECS) + return -E2BIG; + if (nr_vecs > UIO_FASTIOV) { + bvec = kcalloc(nr_vecs, sizeof(*bvec), GFP_KERNEL); + if (!bvec) + return -ENOMEM; + pages = NULL; + } + + copy = !iov_iter_is_aligned(&iter, align, align); + ret = 
iov_iter_extract_pages(&iter, &pages, bytes, nr_vecs, 0, &offset); + if (unlikely(ret < 0)) + goto free_bvec; + + nr_bvecs = bvec_from_pages(bvec, pages, nr_vecs, bytes, offset); + if (pages != stack_pages) + kvfree(pages); + if (nr_bvecs > queue_max_integrity_segments(q)) + copy = true; + + if (copy) + ret = bio_integrity_copy_user(bio, bvec, nr_bvecs, bytes, + direction, seed); + else + ret = bio_integrity_init_user(bio, bvec, nr_bvecs, bytes, seed); + if (ret) + goto release_pages; + if (bvec != stack_vec) + kfree(bvec); + + return 0; + +release_pages: + bio_integrity_unpin_bvec(bvec, nr_bvecs, false); +free_bvec: + if (bvec != stack_vec) + kfree(bvec); + return ret; +} +EXPORT_SYMBOL_GPL(bio_integrity_map_user); + /** * bio_integrity_process - Process integrity metadata for a bio * @bio: bio to generate/verify integrity metadata for @@ -162,33 +383,30 @@ EXPORT_SYMBOL(bio_integrity_add_page); static blk_status_t bio_integrity_process(struct bio *bio, struct bvec_iter *proc_iter, integrity_processing_fn *proc_fn) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct blk_integrity_iter iter; struct bvec_iter bviter; struct bio_vec bv; struct bio_integrity_payload *bip = bio_integrity(bio); blk_status_t ret = BLK_STS_OK; - void *prot_buf = page_address(bip->bip_vec->bv_page) + - bip->bip_vec->bv_offset; - iter.disk_name = bio->bi_disk->disk_name; + iter.disk_name = bio->bi_bdev->bd_disk->disk_name; iter.interval = 1 << bi->interval_exp; + iter.tuple_size = bi->tuple_size; iter.seed = proc_iter->bi_sector; - iter.prot_buf = prot_buf; + iter.prot_buf = bvec_virt(bip->bip_vec); __bio_for_each_segment(bv, bio, bviter, *proc_iter) { - void *kaddr = kmap_atomic(bv.bv_page); + void *kaddr = bvec_kmap_local(&bv); - iter.data_buf = kaddr + bv.bv_offset; + iter.data_buf = kaddr; iter.data_size = bv.bv_len; - ret = proc_fn(&iter); - if (ret) { - kunmap_atomic(kaddr); - return ret; - } + 
kunmap_local(kaddr); + + if (ret) + break; - kunmap_atomic(kaddr); } return ret; } @@ -208,14 +426,11 @@ static blk_status_t bio_integrity_process(struct bio *bio, bool bio_integrity_prep(struct bio *bio) { struct bio_integrity_payload *bip; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); - struct request_queue *q = bio->bi_disk->queue; + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); void *buf; unsigned long start, end; unsigned int len, nr_pages; unsigned int bytes, offset, i; - unsigned int intervals; - blk_status_t status; if (!bi) return true; @@ -239,12 +454,10 @@ bool bio_integrity_prep(struct bio *bio) !(bi->flags & BLK_INTEGRITY_GENERATE)) return true; } - intervals = bio_integrity_intervals(bi, bio_sectors(bio)); /* Allocate kernel buffer for protection data */ - len = intervals * bi->tuple_size; - buf = kmalloc(len, GFP_NOIO | q->bounce_gfp); - status = BLK_STS_RESOURCE; + len = bio_integrity_bytes(bi, bio_sectors(bio)); + buf = kmalloc(len, GFP_NOIO); if (unlikely(buf == NULL)) { printk(KERN_ERR "could not allocate integrity buffer\n"); goto err_end_io; @@ -259,12 +472,10 @@ bool bio_integrity_prep(struct bio *bio) if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - status = BLK_STS_RESOURCE; goto err_end_io; } bip->bip_flags |= BIP_BLOCK_INTEGRITY; - bip->bip_iter.bi_size = len; bip_set_seed(bip, bio->bi_iter.bi_sector); if (bi->flags & BLK_INTEGRITY_IP_CHECKSUM) @@ -272,28 +483,18 @@ bool bio_integrity_prep(struct bio *bio) /* Map it */ offset = offset_in_page(buf); - for (i = 0 ; i < nr_pages ; i++) { - int ret; + for (i = 0; i < nr_pages && len > 0; i++) { bytes = PAGE_SIZE - offset; - if (len <= 0) - break; - if (bytes > len) bytes = len; - ret = bio_integrity_add_page(bio, virt_to_page(buf), - bytes, offset); - - if (ret == 0) { + if (bio_integrity_add_page(bio, virt_to_page(buf), + bytes, offset) < bytes) { printk(KERN_ERR "could not attach integrity payload\n"); - 
status = BLK_STS_RESOURCE; goto err_end_io; } - if (ret < bytes) - break; - buf += bytes; len -= bytes; offset = 0; @@ -309,10 +510,9 @@ bool bio_integrity_prep(struct bio *bio) return true; err_end_io: - bio->bi_status = status; + bio->bi_status = BLK_STS_RESOURCE; bio_endio(bio); return false; - } EXPORT_SYMBOL(bio_integrity_prep); @@ -329,7 +529,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) struct bio_integrity_payload *bip = container_of(work, struct bio_integrity_payload, bip_work); struct bio *bio = bip->bip_bio; - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); /* * At the moment verify is called bio's iterator was advanced @@ -355,7 +555,7 @@ static void bio_integrity_verify_fn(struct work_struct *work) */ bool __bio_integrity_endio(struct bio *bio) { - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); struct bio_integrity_payload *bip = bio_integrity(bio); if (bio_op(bio) == REQ_OP_READ && !bio->bi_status && @@ -381,10 +581,10 @@ bool __bio_integrity_endio(struct bio *bio) void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = blk_get_integrity(bio->bi_bdev->bd_disk); unsigned bytes = bio_integrity_bytes(bi, bytes_done >> 9); - bip->bip_iter.bi_sector += bytes_done >> 9; + bip->bip_iter.bi_sector += bio_integrity_intervals(bi, bytes_done >> 9); bvec_iter_advance(bip->bip_vec, &bip->bip_iter, bytes); } @@ -397,7 +597,7 @@ void bio_integrity_advance(struct bio *bio, unsigned int bytes_done) void bio_integrity_trim(struct bio *bio) { struct bio_integrity_payload *bip = bio_integrity(bio); - struct blk_integrity *bi = blk_get_integrity(bio->bi_disk); + struct blk_integrity *bi = 
blk_get_integrity(bio->bi_bdev->bd_disk); bip->bip_iter.bi_size = bio_integrity_bytes(bi, bio_sectors(bio)); } @@ -428,10 +628,10 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, bip->bip_vcnt = bip_src->bip_vcnt; bip->bip_iter = bip_src->bip_iter; + bip->bip_flags = bip_src->bip_flags & ~BIP_BLOCK_INTEGRITY; return 0; } -EXPORT_SYMBOL(bio_integrity_clone); int bioset_integrity_create(struct bio_set *bs, int pool_size) { @@ -470,6 +670,6 @@ void __init bio_integrity_init(void) bip_slab = kmem_cache_create("bio_integrity_payload", sizeof(struct bio_integrity_payload) + - sizeof(struct bio_vec) * BIP_INLINE_VECS, + sizeof(struct bio_vec) * BIO_INLINE_VECS, 0, SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); } diff --git a/block/bio.c b/block/bio.c index 496aff366767..b9642a41f286 100644 --- a/block/bio.c +++ b/block/bio.c @@ -15,31 +15,54 @@ #include <linux/mempool.h> #include <linux/workqueue.h> #include <linux/cgroup.h> -#include <linux/blk-cgroup.h> #include <linux/highmem.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> +#include <linux/xarray.h> #include <trace/events/block.h> #include "blk.h" #include "blk-rq-qos.h" +#include "blk-cgroup.h" -/* - * Test patch to inline a certain number of bi_io_vec's inside the bio - * itself, to shrink a bio data allocation from two mempool calls to one - */ -#define BIO_INLINE_VECS 4 +#define ALLOC_CACHE_THRESHOLD 16 +#define ALLOC_CACHE_MAX 256 -/* - * if you change this list, also change bvec_alloc or things will - * break badly! 
cannot be bigger than what you can fit into an - * unsigned short - */ -#define BV(x, n) { .nr_vecs = x, .name = "biovec-"#n } -static struct biovec_slab bvec_slabs[BVEC_POOL_NR] __read_mostly = { - BV(1, 1), BV(4, 4), BV(16, 16), BV(64, 64), BV(128, 128), BV(BIO_MAX_PAGES, max), +struct bio_alloc_cache { + struct bio *free_list; + struct bio *free_list_irq; + unsigned int nr; + unsigned int nr_irq; }; -#undef BV + +static struct biovec_slab { + int nr_vecs; + char *name; + struct kmem_cache *slab; +} bvec_slabs[] __read_mostly = { + { .nr_vecs = 16, .name = "biovec-16" }, + { .nr_vecs = 64, .name = "biovec-64" }, + { .nr_vecs = 128, .name = "biovec-128" }, + { .nr_vecs = BIO_MAX_VECS, .name = "biovec-max" }, +}; + +static struct biovec_slab *biovec_slab(unsigned short nr_vecs) +{ + switch (nr_vecs) { + /* smaller bios use inline vecs */ + case 5 ... 16: + return &bvec_slabs[0]; + case 17 ... 64: + return &bvec_slabs[1]; + case 65 ... 128: + return &bvec_slabs[2]; + case 129 ... BIO_MAX_VECS: + return &bvec_slabs[3]; + default: + BUG(); + return NULL; + } +} /* * fs_bio_set is the bio_set containing bio and iovec memory pools used by @@ -58,184 +81,144 @@ struct bio_slab { char name[8]; }; static DEFINE_MUTEX(bio_slab_lock); -static struct bio_slab *bio_slabs; -static unsigned int bio_slab_nr, bio_slab_max; +static DEFINE_XARRAY(bio_slabs); -static struct kmem_cache *bio_find_or_create_slab(unsigned int extra_size) +static struct bio_slab *create_bio_slab(unsigned int size) { - unsigned int sz = sizeof(struct bio) + extra_size; - struct kmem_cache *slab = NULL; - struct bio_slab *bslab, *new_bio_slabs; - unsigned int new_bio_slab_max; - unsigned int i, entry = -1; + struct bio_slab *bslab = kzalloc(sizeof(*bslab), GFP_KERNEL); - mutex_lock(&bio_slab_lock); + if (!bslab) + return NULL; - i = 0; - while (i < bio_slab_nr) { - bslab = &bio_slabs[i]; + snprintf(bslab->name, sizeof(bslab->name), "bio-%d", size); + bslab->slab = kmem_cache_create(bslab->name, size, + 
ARCH_KMALLOC_MINALIGN, + SLAB_HWCACHE_ALIGN | SLAB_TYPESAFE_BY_RCU, NULL); + if (!bslab->slab) + goto fail_alloc_slab; - if (!bslab->slab && entry == -1) - entry = i; - else if (bslab->slab_size == sz) { - slab = bslab->slab; - bslab->slab_ref++; - break; - } - i++; - } + bslab->slab_ref = 1; + bslab->slab_size = size; - if (slab) - goto out_unlock; - - if (bio_slab_nr == bio_slab_max && entry == -1) { - new_bio_slab_max = bio_slab_max << 1; - new_bio_slabs = krealloc(bio_slabs, - new_bio_slab_max * sizeof(struct bio_slab), - GFP_KERNEL); - if (!new_bio_slabs) - goto out_unlock; - bio_slab_max = new_bio_slab_max; - bio_slabs = new_bio_slabs; - } - if (entry == -1) - entry = bio_slab_nr++; + if (!xa_err(xa_store(&bio_slabs, size, bslab, GFP_KERNEL))) + return bslab; - bslab = &bio_slabs[entry]; + kmem_cache_destroy(bslab->slab); - snprintf(bslab->name, sizeof(bslab->name), "bio-%d", entry); - slab = kmem_cache_create(bslab->name, sz, ARCH_KMALLOC_MINALIGN, - SLAB_HWCACHE_ALIGN, NULL); - if (!slab) - goto out_unlock; +fail_alloc_slab: + kfree(bslab); + return NULL; +} - bslab->slab = slab; - bslab->slab_ref = 1; - bslab->slab_size = sz; -out_unlock: +static inline unsigned int bs_bio_slab_size(struct bio_set *bs) +{ + return bs->front_pad + sizeof(struct bio) + bs->back_pad; +} + +static struct kmem_cache *bio_find_or_create_slab(struct bio_set *bs) +{ + unsigned int size = bs_bio_slab_size(bs); + struct bio_slab *bslab; + + mutex_lock(&bio_slab_lock); + bslab = xa_load(&bio_slabs, size); + if (bslab) + bslab->slab_ref++; + else + bslab = create_bio_slab(size); mutex_unlock(&bio_slab_lock); - return slab; + + if (bslab) + return bslab->slab; + return NULL; } static void bio_put_slab(struct bio_set *bs) { struct bio_slab *bslab = NULL; - unsigned int i; + unsigned int slab_size = bs_bio_slab_size(bs); mutex_lock(&bio_slab_lock); - for (i = 0; i < bio_slab_nr; i++) { - if (bs->bio_slab == bio_slabs[i].slab) { - bslab = &bio_slabs[i]; - break; - } - } - + bslab = 
xa_load(&bio_slabs, slab_size); if (WARN(!bslab, KERN_ERR "bio: unable to find slab!\n")) goto out; + WARN_ON_ONCE(bslab->slab != bs->bio_slab); + WARN_ON(!bslab->slab_ref); if (--bslab->slab_ref) goto out; + xa_erase(&bio_slabs, slab_size); + kmem_cache_destroy(bslab->slab); - bslab->slab = NULL; + kfree(bslab); out: mutex_unlock(&bio_slab_lock); } -unsigned int bvec_nr_vecs(unsigned short idx) +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs) { - return bvec_slabs[--idx].nr_vecs; -} + BUG_ON(nr_vecs > BIO_MAX_VECS); -void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned int idx) -{ - if (!idx) - return; - idx--; - - BIO_BUG_ON(idx >= BVEC_POOL_NR); - - if (idx == BVEC_POOL_MAX) { + if (nr_vecs == BIO_MAX_VECS) mempool_free(bv, pool); - } else { - struct biovec_slab *bvs = bvec_slabs + idx; + else if (nr_vecs > BIO_INLINE_VECS) + kmem_cache_free(biovec_slab(nr_vecs)->slab, bv); +} - kmem_cache_free(bvs->slab, bv); - } +/* + * Make the first allocation restricted and don't dump info on allocation + * failures, since we'll fall back to the mempool in case of failure. + */ +static inline gfp_t bvec_alloc_gfp(gfp_t gfp) +{ + return (gfp & ~(__GFP_DIRECT_RECLAIM | __GFP_IO)) | + __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; } -struct bio_vec *bvec_alloc(gfp_t gfp_mask, int nr, unsigned long *idx, - mempool_t *pool) +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask) { - struct bio_vec *bvl; + struct biovec_slab *bvs = biovec_slab(*nr_vecs); - /* - * see comment near bvec_array define! - */ - switch (nr) { - case 1: - *idx = 0; - break; - case 2 ... 4: - *idx = 1; - break; - case 5 ... 16: - *idx = 2; - break; - case 17 ... 64: - *idx = 3; - break; - case 65 ... 128: - *idx = 4; - break; - case 129 ... BIO_MAX_PAGES: - *idx = 5; - break; - default: + if (WARN_ON_ONCE(!bvs)) return NULL; - } /* - * idx now points to the pool we want to allocate from. only the - * 1-vec entry pool is mempool backed. 
+ * Upgrade the nr_vecs request to take full advantage of the allocation. + * We also rely on this in the bvec_free path. */ - if (*idx == BVEC_POOL_MAX) { -fallback: - bvl = mempool_alloc(pool, gfp_mask); - } else { - struct biovec_slab *bvs = bvec_slabs + *idx; - gfp_t __gfp_mask = gfp_mask & ~(__GFP_DIRECT_RECLAIM | __GFP_IO); + *nr_vecs = bvs->nr_vecs; - /* - * Make this allocation restricted and don't dump info on - * allocation failures, since we'll fallback to the mempool - * in case of failure. - */ - __gfp_mask |= __GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN; + /* + * Try a slab allocation first for all smaller allocations. If that + * fails and __GFP_DIRECT_RECLAIM is set retry with the mempool. + * The mempool is sized to handle up to BIO_MAX_VECS entries. + */ + if (*nr_vecs < BIO_MAX_VECS) { + struct bio_vec *bvl; - /* - * Try a slab allocation. If this fails and __GFP_DIRECT_RECLAIM - * is set, retry with the 1-entry mempool - */ - bvl = kmem_cache_alloc(bvs->slab, __gfp_mask); - if (unlikely(!bvl && (gfp_mask & __GFP_DIRECT_RECLAIM))) { - *idx = BVEC_POOL_MAX; - goto fallback; - } + bvl = kmem_cache_alloc(bvs->slab, bvec_alloc_gfp(gfp_mask)); + if (likely(bvl) || !(gfp_mask & __GFP_DIRECT_RECLAIM)) + return bvl; + *nr_vecs = BIO_MAX_VECS; } - (*idx)++; - return bvl; + return mempool_alloc(pool, gfp_mask); } void bio_uninit(struct bio *bio) { - bio_disassociate_blkg(bio); - +#ifdef CONFIG_BLK_CGROUP + if (bio->bi_blkg) { + blkg_put(bio->bi_blkg); + bio->bi_blkg = NULL; + } +#endif if (bio_integrity(bio)) bio_integrity_free(bio); @@ -246,24 +229,13 @@ EXPORT_SYMBOL(bio_uninit); static void bio_free(struct bio *bio) { struct bio_set *bs = bio->bi_pool; - void *p; + void *p = bio; - bio_uninit(bio); + WARN_ON_ONCE(!bs); - if (bs) { - bvec_free(&bs->bvec_pool, bio->bi_io_vec, BVEC_POOL_IDX(bio)); - - /* - * If we have front padding, adjust the bio pointer before freeing - */ - p = bio; - p -= bs->front_pad; - - mempool_free(p, &bs->bio_pool); - } else 
{ - /* Bio was allocated by bio_kmalloc() */ - kfree(bio); - } + bio_uninit(bio); + bvec_free(&bs->bvec_pool, bio->bi_io_vec, bio->bi_max_vecs); + mempool_free(p - bs->front_pad, &bs->bio_pool); } /* @@ -271,21 +243,53 @@ static void bio_free(struct bio *bio) * they must remember to pair any call to bio_init() with bio_uninit() * when IO has completed, or when the bio is released. */ -void bio_init(struct bio *bio, struct bio_vec *table, - unsigned short max_vecs) +void bio_init(struct bio *bio, struct block_device *bdev, struct bio_vec *table, + unsigned short max_vecs, blk_opf_t opf) { - memset(bio, 0, sizeof(*bio)); + bio->bi_next = NULL; + bio->bi_bdev = bdev; + bio->bi_opf = opf; + bio->bi_flags = 0; + bio->bi_ioprio = 0; + bio->bi_status = 0; + bio->bi_iter.bi_sector = 0; + bio->bi_iter.bi_size = 0; + bio->bi_iter.bi_idx = 0; + bio->bi_iter.bi_bvec_done = 0; + bio->bi_end_io = NULL; + bio->bi_private = NULL; +#ifdef CONFIG_BLK_CGROUP + bio->bi_blkg = NULL; + bio->bi_issue.value = 0; + if (bdev) + bio_associate_blkg(bio); +#ifdef CONFIG_BLK_CGROUP_IOCOST + bio->bi_iocost_cost = 0; +#endif +#endif +#ifdef CONFIG_BLK_INLINE_ENCRYPTION + bio->bi_crypt_context = NULL; +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + bio->bi_integrity = NULL; +#endif + bio->bi_vcnt = 0; + atomic_set(&bio->__bi_remaining, 1); atomic_set(&bio->__bi_cnt, 1); + bio->bi_cookie = BLK_QC_T_NONE; - bio->bi_io_vec = table; bio->bi_max_vecs = max_vecs; + bio->bi_io_vec = table; + bio->bi_pool = NULL; } EXPORT_SYMBOL(bio_init); /** * bio_reset - reinitialize a bio * @bio: bio to reset + * @bdev: block device to use the bio for + * @opf: operation and flags for bio * * Description: * After calling bio_reset(), @bio will be in the same state as a freshly @@ -293,15 +297,15 @@ EXPORT_SYMBOL(bio_init); * preserved are the ones that are initialized by bio_alloc_bioset(). See * comment in struct bio. 
*/ -void bio_reset(struct bio *bio) +void bio_reset(struct bio *bio, struct block_device *bdev, blk_opf_t opf) { - unsigned long flags = bio->bi_flags & (~0UL << BIO_RESET_BITS); - bio_uninit(bio); - memset(bio, 0, BIO_RESET_BYTES); - bio->bi_flags = flags; atomic_set(&bio->__bi_remaining, 1); + bio->bi_bdev = bdev; + if (bio->bi_bdev) + bio_associate_blkg(bio); + bio->bi_opf = opf; } EXPORT_SYMBOL(bio_reset); @@ -309,7 +313,7 @@ static struct bio *__bio_chain_endio(struct bio *bio) { struct bio *parent = bio->bi_private; - if (!parent->bi_status) + if (bio->bi_status && !parent->bi_status) parent->bi_status = bio->bi_status; bio_put(bio); return parent; @@ -323,7 +327,7 @@ static void bio_chain_endio(struct bio *bio) /** * bio_chain - chain bio completions * @bio: the target bio - * @parent: the @bio's parent bio + * @parent: the parent bio of @bio * * The caller won't have a bi_end_io called when @bio completes - instead, * @parent's bi_end_io won't be called until both @parent and @bio have @@ -341,6 +345,20 @@ void bio_chain(struct bio *bio, struct bio *parent) } EXPORT_SYMBOL(bio_chain); +struct bio *blk_next_bio(struct bio *bio, struct block_device *bdev, + unsigned int nr_pages, blk_opf_t opf, gfp_t gfp) +{ + struct bio *new = bio_alloc(bdev, nr_pages, opf, gfp); + + if (bio) { + bio_chain(bio, new); + submit_bio(bio); + } + + return new; +} +EXPORT_SYMBOL_GPL(blk_next_bio); + static void bio_alloc_rescue(struct work_struct *work) { struct bio_set *bs = container_of(work, struct bio_set, rescue_work); @@ -354,7 +372,7 @@ static void bio_alloc_rescue(struct work_struct *work) if (!bio) break; - generic_make_request(bio); + submit_bio_noacct(bio); } } @@ -395,130 +413,165 @@ static void punt_bios_to_rescuer(struct bio_set *bs) queue_work(bs->rescue_workqueue, &bs->rescue_work); } +static void bio_alloc_irq_cache_splice(struct bio_alloc_cache *cache) +{ + unsigned long flags; + + /* cache->free_list must be empty */ + if (WARN_ON_ONCE(cache->free_list)) + 
return; + + local_irq_save(flags); + cache->free_list = cache->free_list_irq; + cache->free_list_irq = NULL; + cache->nr += cache->nr_irq; + cache->nr_irq = 0; + local_irq_restore(flags); +} + +static struct bio *bio_alloc_percpu_cache(struct block_device *bdev, + unsigned short nr_vecs, blk_opf_t opf, gfp_t gfp, + struct bio_set *bs) +{ + struct bio_alloc_cache *cache; + struct bio *bio; + + cache = per_cpu_ptr(bs->cache, get_cpu()); + if (!cache->free_list) { + if (READ_ONCE(cache->nr_irq) >= ALLOC_CACHE_THRESHOLD) + bio_alloc_irq_cache_splice(cache); + if (!cache->free_list) { + put_cpu(); + return NULL; + } + } + bio = cache->free_list; + cache->free_list = bio->bi_next; + cache->nr--; + put_cpu(); + + bio_init(bio, bdev, nr_vecs ? bio->bi_inline_vecs : NULL, nr_vecs, opf); + bio->bi_pool = bs; + return bio; +} + /** * bio_alloc_bioset - allocate a bio for I/O + * @bdev: block device to allocate the bio for (can be %NULL) + * @nr_vecs: number of bvecs to pre-allocate + * @opf: operation and flags for bio * @gfp_mask: the GFP_* mask given to the slab allocator - * @nr_iovecs: number of iovecs to pre-allocate * @bs: the bio_set to allocate from. * - * Description: - * If @bs is NULL, uses kmalloc() to allocate the bio; else the allocation is - * backed by the @bs's mempool. + * Allocate a bio from the mempools in @bs. * - * When @bs is not NULL, if %__GFP_DIRECT_RECLAIM is set then bio_alloc will - * always be able to allocate a bio. This is due to the mempool guarantees. - * To make this work, callers must never allocate more than 1 bio at a time - * from this pool. Callers that need to allocate more than 1 bio must always - * submit the previously allocated bio for IO before attempting to allocate - * a new one. Failure to do so can cause deadlocks under memory pressure. + * If %__GFP_DIRECT_RECLAIM is set then bio_alloc will always be able to + * allocate a bio. This is due to the mempool guarantees. 
To make this work, + * callers must never allocate more than 1 bio at a time from the general pool. + * Callers that need to allocate more than 1 bio must always submit the + * previously allocated bio for IO before attempting to allocate a new one. + * Failure to do so can cause deadlocks under memory pressure. * - * Note that when running under generic_make_request() (i.e. any block - * driver), bios are not submitted until after you return - see the code in - * generic_make_request() that converts recursion into iteration, to prevent - * stack overflows. + * Note that when running under submit_bio_noacct() (i.e. any block driver), + * bios are not submitted until after you return - see the code in + * submit_bio_noacct() that converts recursion into iteration, to prevent + * stack overflows. * - * This would normally mean allocating multiple bios under - * generic_make_request() would be susceptible to deadlocks, but we have - * deadlock avoidance code that resubmits any blocked bios from a rescuer - * thread. + * This would normally mean allocating multiple bios under submit_bio_noacct() + * would be susceptible to deadlocks, but we have + * deadlock avoidance code that resubmits any blocked bios from a rescuer + * thread. * - * However, we do not guarantee forward progress for allocations from other - * mempools. Doing multiple allocations from the same mempool under - * generic_make_request() should be avoided - instead, use bio_set's front_pad - * for per bio allocations. + * However, we do not guarantee forward progress for allocations from other + * mempools. Doing multiple allocations from the same mempool under + * submit_bio_noacct() should be avoided - instead, use bio_set's front_pad + * for per bio allocations. * - * RETURNS: - * Pointer to new bio on success, NULL on failure. + * Returns: Pointer to new bio on success, NULL on failure. 
*/ -struct bio *bio_alloc_bioset(gfp_t gfp_mask, unsigned int nr_iovecs, +struct bio *bio_alloc_bioset(struct block_device *bdev, unsigned short nr_vecs, + blk_opf_t opf, gfp_t gfp_mask, struct bio_set *bs) { gfp_t saved_gfp = gfp_mask; - unsigned front_pad; - unsigned inline_vecs; - struct bio_vec *bvl = NULL; struct bio *bio; void *p; - if (!bs) { - if (nr_iovecs > UIO_MAXIOV) - return NULL; - - p = kmalloc(sizeof(struct bio) + - nr_iovecs * sizeof(struct bio_vec), - gfp_mask); - front_pad = 0; - inline_vecs = nr_iovecs; - } else { - /* should not use nobvec bioset for nr_iovecs > 0 */ - if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && - nr_iovecs > 0)) - return NULL; - /* - * generic_make_request() converts recursion to iteration; this - * means if we're running beneath it, any bios we allocate and - * submit will not be submitted (and thus freed) until after we - * return. - * - * This exposes us to a potential deadlock if we allocate - * multiple bios from the same bio_set() while running - * underneath generic_make_request(). If we were to allocate - * multiple bios (say a stacking block driver that was splitting - * bios), we would deadlock if we exhausted the mempool's - * reserve. - * - * We solve this, and guarantee forward progress, with a rescuer - * workqueue per bio_set. If we go to allocate and there are - * bios on current->bio_list, we first try the allocation - * without __GFP_DIRECT_RECLAIM; if that fails, we punt those - * bios we would be blocking to the rescuer workqueue before - * we retry with the original gfp_flags. 
- */ - - if (current->bio_list && - (!bio_list_empty(&current->bio_list[0]) || - !bio_list_empty(&current->bio_list[1])) && - bs->rescue_workqueue) - gfp_mask &= ~__GFP_DIRECT_RECLAIM; + /* should not use nobvec bioset for nr_vecs > 0 */ + if (WARN_ON_ONCE(!mempool_initialized(&bs->bvec_pool) && nr_vecs > 0)) + return NULL; - p = mempool_alloc(&bs->bio_pool, gfp_mask); - if (!p && gfp_mask != saved_gfp) { - punt_bios_to_rescuer(bs); - gfp_mask = saved_gfp; - p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (opf & REQ_ALLOC_CACHE) { + if (bs->cache && nr_vecs <= BIO_INLINE_VECS) { + bio = bio_alloc_percpu_cache(bdev, nr_vecs, opf, + gfp_mask, bs); + if (bio) + return bio; + /* + * No cached bio available, bio returned below marked with + * REQ_ALLOC_CACHE to particpate in per-cpu alloc cache. + */ + } else { + opf &= ~REQ_ALLOC_CACHE; } - - front_pad = bs->front_pad; - inline_vecs = BIO_INLINE_VECS; } + /* + * submit_bio_noacct() converts recursion to iteration; this means if + * we're running beneath it, any bios we allocate and submit will not be + * submitted (and thus freed) until after we return. + * + * This exposes us to a potential deadlock if we allocate multiple bios + * from the same bio_set() while running underneath submit_bio_noacct(). + * If we were to allocate multiple bios (say a stacking block driver + * that was splitting bios), we would deadlock if we exhausted the + * mempool's reserve. + * + * We solve this, and guarantee forward progress, with a rescuer + * workqueue per bio_set. If we go to allocate and there are bios on + * current->bio_list, we first try the allocation without + * __GFP_DIRECT_RECLAIM; if that fails, we punt those bios we would be + * blocking to the rescuer workqueue before we retry with the original + * gfp_flags. 
+ */ + if (current->bio_list && + (!bio_list_empty(&current->bio_list[0]) || + !bio_list_empty(&current->bio_list[1])) && + bs->rescue_workqueue) + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + + p = mempool_alloc(&bs->bio_pool, gfp_mask); + if (!p && gfp_mask != saved_gfp) { + punt_bios_to_rescuer(bs); + gfp_mask = saved_gfp; + p = mempool_alloc(&bs->bio_pool, gfp_mask); + } if (unlikely(!p)) return NULL; + if (!mempool_is_saturated(&bs->bio_pool)) + opf &= ~REQ_ALLOC_CACHE; - bio = p + front_pad; - bio_init(bio, NULL, 0); - - if (nr_iovecs > inline_vecs) { - unsigned long idx = 0; + bio = p + bs->front_pad; + if (nr_vecs > BIO_INLINE_VECS) { + struct bio_vec *bvl = NULL; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); if (!bvl && gfp_mask != saved_gfp) { punt_bios_to_rescuer(bs); gfp_mask = saved_gfp; - bvl = bvec_alloc(gfp_mask, nr_iovecs, &idx, &bs->bvec_pool); + bvl = bvec_alloc(&bs->bvec_pool, &nr_vecs, gfp_mask); } - if (unlikely(!bvl)) goto err_free; - bio->bi_flags |= idx << BVEC_POOL_OFFSET; - } else if (nr_iovecs) { - bvl = bio->bi_inline_vecs; + bio_init(bio, bdev, bvl, nr_vecs, opf); + } else if (nr_vecs) { + bio_init(bio, bdev, bio->bi_inline_vecs, BIO_INLINE_VECS, opf); + } else { + bio_init(bio, bdev, NULL, 0, opf); } bio->bi_pool = bs; - bio->bi_max_vecs = nr_iovecs; - bio->bi_io_vec = bvl; return bio; err_free: @@ -527,59 +580,41 @@ err_free: } EXPORT_SYMBOL(bio_alloc_bioset); -void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) +/** + * bio_kmalloc - kmalloc a bio + * @nr_vecs: number of bio_vecs to allocate + * @gfp_mask: the GFP_* mask given to the slab allocator + * + * Use kmalloc to allocate a bio (including bvecs). The bio must be initialized + * using bio_init() before use. To free a bio returned from this function use + * kfree() after calling bio_uninit(). A bio returned from this function can + * be reused by calling bio_uninit() before calling bio_init() again. 
+ * + * Note that unlike bio_alloc() or bio_alloc_bioset() allocations from this + * function are not backed by a mempool can fail. Do not use this function + * for allocations in the file system I/O path. + * + * Returns: Pointer to new bio on success, NULL on failure. + */ +struct bio *bio_kmalloc(unsigned short nr_vecs, gfp_t gfp_mask) { - unsigned long flags; - struct bio_vec bv; - struct bvec_iter iter; + struct bio *bio; - __bio_for_each_segment(bv, bio, iter, start) { - char *data = bvec_kmap_irq(&bv, &flags); - memset(data, 0, bv.bv_len); - flush_dcache_page(bv.bv_page); - bvec_kunmap_irq(data, &flags); - } + if (nr_vecs > UIO_MAXIOV) + return NULL; + return kmalloc(struct_size(bio, bi_inline_vecs, nr_vecs), gfp_mask); } -EXPORT_SYMBOL(zero_fill_bio_iter); +EXPORT_SYMBOL(bio_kmalloc); -void bio_truncate(struct bio *bio, unsigned new_size) +void zero_fill_bio_iter(struct bio *bio, struct bvec_iter start) { struct bio_vec bv; struct bvec_iter iter; - unsigned int done = 0; - bool truncated = false; - - if (new_size >= bio->bi_iter.bi_size) - return; - if (bio_data_dir(bio) != READ) - goto exit; - - bio_for_each_segment(bv, bio, iter) { - if (done + bv.bv_len > new_size) { - unsigned offset; - - if (!truncated) - offset = new_size - done; - else - offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); - truncated = true; - } - done += bv.bv_len; - } - - exit: - /* - * Don't touch bvec table here and make it really immutable, since - * fs bio user has to retrieve all pages via bio_for_each_segment_all - * in its .end_bio() callback. - * - * It is enough to truncate bio by updating .bi_size since we can make - * correct bvec with the updated .bi_size for drivers. 
- */ - bio->bi_iter.bi_size = new_size; + __bio_for_each_segment(bv, bio, iter, start) + memzero_bvec(&bv); } +EXPORT_SYMBOL(zero_fill_bio_iter); /** * bio_truncate - truncate the bio to small size of @new_size @@ -591,7 +626,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) * REQ_OP_READ, zero the truncated part. This function should only * be used for handling corner cases, such as bio eod. */ -void bio_truncate(struct bio *bio, unsigned new_size) +static void bio_truncate(struct bio *bio, unsigned new_size) { struct bio_vec bv; struct bvec_iter iter; @@ -612,7 +647,8 @@ void bio_truncate(struct bio *bio, unsigned new_size) offset = new_size - done; else offset = 0; - zero_user(bv.bv_page, offset, bv.bv_len - offset); + zero_user(bv.bv_page, bv.bv_offset + offset, + bv.bv_len - offset); truncated = true; } done += bv.bv_len; @@ -644,16 +680,7 @@ void bio_truncate(struct bio *bio, unsigned new_size) */ void guard_bio_eod(struct bio *bio) { - sector_t maxsector; - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (part) - maxsector = part_nr_sects_read(part); - else - maxsector = get_capacity(bio->bi_disk); - rcu_read_unlock(); + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); if (!maxsector) return; @@ -673,6 +700,93 @@ void guard_bio_eod(struct bio *bio) bio_truncate(bio, maxsector << 9); } +static int __bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + unsigned int i = 0; + struct bio *bio; + + while ((bio = cache->free_list) != NULL) { + cache->free_list = bio->bi_next; + cache->nr--; + bio_free(bio); + if (++i == nr) + break; + } + return i; +} + +static void bio_alloc_cache_prune(struct bio_alloc_cache *cache, + unsigned int nr) +{ + nr -= __bio_alloc_cache_prune(cache, nr); + if (!READ_ONCE(cache->free_list)) { + bio_alloc_irq_cache_splice(cache); + __bio_alloc_cache_prune(cache, nr); + } +} + +static int bio_cpu_dead(unsigned int cpu, struct hlist_node *node) +{ 
+ struct bio_set *bs; + + bs = hlist_entry_safe(node, struct bio_set, cpuhp_dead); + if (bs->cache) { + struct bio_alloc_cache *cache = per_cpu_ptr(bs->cache, cpu); + + bio_alloc_cache_prune(cache, -1U); + } + return 0; +} + +static void bio_alloc_cache_destroy(struct bio_set *bs) +{ + int cpu; + + if (!bs->cache) + return; + + cpuhp_state_remove_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + for_each_possible_cpu(cpu) { + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bs->cache, cpu); + bio_alloc_cache_prune(cache, -1U); + } + free_percpu(bs->cache); + bs->cache = NULL; +} + +static inline void bio_put_percpu_cache(struct bio *bio) +{ + struct bio_alloc_cache *cache; + + cache = per_cpu_ptr(bio->bi_pool->cache, get_cpu()); + if (READ_ONCE(cache->nr_irq) + cache->nr > ALLOC_CACHE_MAX) { + put_cpu(); + bio_free(bio); + return; + } + + bio_uninit(bio); + + if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { + bio->bi_next = cache->free_list; + bio->bi_bdev = NULL; + cache->free_list = bio; + cache->nr++; + } else { + unsigned long flags; + + local_irq_save(flags); + bio->bi_next = cache->free_list_irq; + cache->free_list_irq = bio; + cache->nr_irq++; + local_irq_restore(flags); + } + put_cpu(); +} + /** * bio_put - release a reference to a bio * @bio: bio to release reference to @@ -683,112 +797,135 @@ void guard_bio_eod(struct bio *bio) **/ void bio_put(struct bio *bio) { - if (!bio_flagged(bio, BIO_REFFED)) - bio_free(bio); - else { - BIO_BUG_ON(!atomic_read(&bio->__bi_cnt)); - - /* - * last put frees it - */ - if (atomic_dec_and_test(&bio->__bi_cnt)) - bio_free(bio); + if (unlikely(bio_flagged(bio, BIO_REFFED))) { + BUG_ON(!atomic_read(&bio->__bi_cnt)); + if (!atomic_dec_and_test(&bio->__bi_cnt)) + return; } + if (bio->bi_opf & REQ_ALLOC_CACHE) + bio_put_percpu_cache(bio); + else + bio_free(bio); } EXPORT_SYMBOL(bio_put); -/** - * __bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: destination bio - * 
@bio_src: bio to clone - * - * Clone a &bio. Caller will own the returned bio, but not - * the actual data it points to. Reference count of returned - * bio will be one. - * - * Caller must ensure that @bio_src is not freed before @bio. - */ -void __bio_clone_fast(struct bio *bio, struct bio *bio_src) +static int __bio_clone(struct bio *bio, struct bio *bio_src, gfp_t gfp) { - BUG_ON(bio->bi_pool && BVEC_POOL_IDX(bio)); - - /* - * most users will be overriding ->bi_disk with a new target, - * so we don't set nor calculate new physical/hw segment counts here - */ - bio->bi_disk = bio_src->bi_disk; - bio->bi_partno = bio_src->bi_partno; bio_set_flag(bio, BIO_CLONED); - if (bio_flagged(bio_src, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); - bio->bi_opf = bio_src->bi_opf; bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter = bio_src->bi_iter; - bio->bi_io_vec = bio_src->bi_io_vec; - bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); + if (bio->bi_bdev) { + if (bio->bi_bdev == bio_src->bi_bdev && + bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); + bio_clone_blkg_association(bio, bio_src); + } + + if (bio_crypt_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, gfp) < 0) + return -ENOMEM; + return 0; } -EXPORT_SYMBOL(__bio_clone_fast); /** - * bio_clone_fast - clone a bio that shares the original bio's biovec - * @bio: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from + * bio_alloc_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio_src: bio to clone from + * @gfp: allocation priority + * @bs: bio_set to allocate from + * + * Allocate a new bio that is a clone of @bio_src. The caller owns the returned + * bio, but not the actual data it points to. 
* - * Like __bio_clone_fast, only also allocates the returned bio + * The caller must ensure that the return bio is not freed before @bio_src. */ -struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) +struct bio *bio_alloc_clone(struct block_device *bdev, struct bio *bio_src, + gfp_t gfp, struct bio_set *bs) { - struct bio *b; + struct bio *bio; - b = bio_alloc_bioset(gfp_mask, 0, bs); - if (!b) + bio = bio_alloc_bioset(bdev, 0, bio_src->bi_opf, gfp, bs); + if (!bio) return NULL; - __bio_clone_fast(b, bio); - - bio_crypt_clone(b, bio, gfp_mask); - - if (bio_integrity(bio)) { - int ret; + if (__bio_clone(bio, bio_src, gfp) < 0) { + bio_put(bio); + return NULL; + } + bio->bi_io_vec = bio_src->bi_io_vec; - ret = bio_integrity_clone(b, bio, gfp_mask); + return bio; +} +EXPORT_SYMBOL(bio_alloc_clone); - if (ret < 0) { - bio_put(b); - return NULL; - } - } +/** + * bio_init_clone - clone a bio that shares the original bio's biovec + * @bdev: block_device to clone onto + * @bio: bio to clone into + * @bio_src: bio to clone from + * @gfp: allocation priority + * + * Initialize a new bio in caller provided memory that is a clone of @bio_src. + * The caller owns the returned bio, but not the actual data it points to. + * + * The caller must ensure that @bio_src is not freed before @bio. 
+ */ +int bio_init_clone(struct block_device *bdev, struct bio *bio, + struct bio *bio_src, gfp_t gfp) +{ + int ret; - return b; + bio_init(bio, bdev, bio_src->bi_io_vec, 0, bio_src->bi_opf); + ret = __bio_clone(bio, bio_src, gfp); + if (ret) + bio_uninit(bio); + return ret; } -EXPORT_SYMBOL(bio_clone_fast); +EXPORT_SYMBOL(bio_init_clone); -const char *bio_devname(struct bio *bio, char *buf) +/** + * bio_full - check if the bio is full + * @bio: bio to check + * @len: length of one segment to be added + * + * Return true if @bio is full and one segment with @len bytes can't be + * added to the bio, otherwise return false + */ +static inline bool bio_full(struct bio *bio, unsigned len) { - return disk_name(bio->bi_disk, bio->bi_partno, buf); + if (bio->bi_vcnt >= bio->bi_max_vecs) + return true; + if (bio->bi_iter.bi_size > UINT_MAX - len) + return true; + return false; } -EXPORT_SYMBOL(bio_devname); -static inline bool page_is_mergeable(const struct bio_vec *bv, - struct page *page, unsigned int len, unsigned int off, - bool *same_page) +static bool bvec_try_merge_page(struct bio_vec *bv, struct page *page, + unsigned int len, unsigned int off, bool *same_page) { - phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + - bv->bv_offset + bv->bv_len - 1; + size_t bv_end = bv->bv_offset + bv->bv_len; + phys_addr_t vec_end_addr = page_to_phys(bv->bv_page) + bv_end - 1; phys_addr_t page_addr = page_to_phys(page); if (vec_end_addr + 1 != page_addr + off) return false; if (xen_domain() && !xen_biovec_phys_mergeable(bv, page)) return false; + if (!zone_device_pages_have_same_pgmap(bv->bv_page, page)) + return false; *same_page = ((vec_end_addr & PAGE_MASK) == page_addr); - if (!*same_page && pfn_to_page(PFN_DOWN(vec_end_addr)) + 1 != page) - return false; + if (!*same_page) { + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (bv->bv_page + bv_end / PAGE_SIZE != page + off / PAGE_SIZE) + return false; + } + + bv->bv_len += len; return true; } @@ -797,20 +934,19 @@ static 
inline bool page_is_mergeable(const struct bio_vec *bv, * size limit. This is not for normal read/write bios, but for passthrough * or Zone Append operations that we can't split. */ -static bool bio_try_merge_hw_seg(struct request_queue *q, struct bio *bio, - struct page *page, unsigned len, - unsigned offset, bool *same_page) +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; unsigned long mask = queue_segment_boundary(q); phys_addr_t addr1 = page_to_phys(bv->bv_page) + bv->bv_offset; phys_addr_t addr2 = page_to_phys(page) + offset + len - 1; if ((addr1 | mask) != (addr2 | mask)) return false; - if (bv->bv_len + len > queue_max_segment_size(q)) + if (len > queue_max_segment_size(q) - bv->bv_len) return false; - return __bio_try_merge_page(bio, page, len, offset, same_page); + return bvec_try_merge_page(bv, page, len, offset, same_page); } /** @@ -830,37 +966,37 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct page *page, unsigned int len, unsigned int offset, unsigned int max_sectors, bool *same_page) { - struct bio_vec *bvec; + unsigned int max_size = max_sectors << SECTOR_SHIFT; if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) return 0; - if (((bio->bi_iter.bi_size + len) >> 9) > max_sectors) + len = min3(len, max_size, queue_max_segment_size(q)); + if (len > max_size - bio->bi_iter.bi_size) return 0; if (bio->bi_vcnt > 0) { - if (bio_try_merge_hw_seg(q, bio, page, len, offset, same_page)) + struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + + if (bvec_try_merge_hw_page(q, bv, page, len, offset, + same_page)) { + bio->bi_iter.bi_size += len; return len; + } + + if (bio->bi_vcnt >= + min(bio->bi_max_vecs, queue_max_segments(q))) + return 0; /* * If the queue doesn't support SG gaps and adding this segment * would create a gap, disallow it. 
*/ - bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; - if (bvec_gap_to_prev(q, bvec, offset)) + if (bvec_gap_to_prev(&q->limits, bv, offset)) return 0; } - if (bio_full(bio, len)) - return 0; - - if (bio->bi_vcnt >= queue_max_segments(q)) - return 0; - - bvec = &bio->bi_io_vec[bio->bi_vcnt]; - bvec->bv_page = page; - bvec->bv_len = len; - bvec->bv_offset = offset; + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, offset); bio->bi_vcnt++; bio->bi_iter.bi_size += len; return len; @@ -891,41 +1027,37 @@ int bio_add_pc_page(struct request_queue *q, struct bio *bio, EXPORT_SYMBOL(bio_add_pc_page); /** - * __bio_try_merge_page - try appending data to an existing bvec. + * bio_add_zone_append_page - attempt to add page to zone-append bio * @bio: destination bio - * @page: start page to add - * @len: length of the data to add - * @off: offset of the data relative to @page - * @same_page: return if the segment has been merged inside the same page - * - * Try to add the data at @page + @off to the last bvec of @bio. This is a - * a useful optimisation for file systems with a block size smaller than the - * page size. + * @page: page to add + * @len: vec entry length + * @offset: vec entry offset * - * Warn if (@len, @off) crosses pages in case that @same_page is true. + * Attempt to add a page to the bio_vec maplist of a bio that will be submitted + * for a zone-append request. This can fail for a number of reasons, such as the + * bio being full or the target block device is not a zoned block device or + * other limitations of the target block device. The target block device must + * allow bio's up to PAGE_SIZE, so it is always possible to add a single page + * to an empty bio. * - * Return %true on success or %false on failure. + * Returns: number of bytes added to the bio, or 0 in case of a failure. 
*/ -bool __bio_try_merge_page(struct bio *bio, struct page *page, - unsigned int len, unsigned int off, bool *same_page) +int bio_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) { - if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) - return false; + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + bool same_page = false; - if (bio->bi_vcnt > 0) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt - 1]; + if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_ZONE_APPEND)) + return 0; - if (page_is_mergeable(bv, page, len, off, same_page)) { - if (bio->bi_iter.bi_size > UINT_MAX - len) - return false; - bv->bv_len += len; - bio->bi_iter.bi_size += len; - return true; - } - } - return false; + if (WARN_ON_ONCE(!bdev_is_zoned(bio->bi_bdev))) + return 0; + + return bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page); } -EXPORT_SYMBOL_GPL(__bio_try_merge_page); +EXPORT_SYMBOL_GPL(bio_add_zone_append_page); /** * __bio_add_page - add page(s) to a bio in a new segment @@ -940,20 +1072,12 @@ EXPORT_SYMBOL_GPL(__bio_try_merge_page); void __bio_add_page(struct bio *bio, struct page *page, unsigned int len, unsigned int off) { - struct bio_vec *bv = &bio->bi_io_vec[bio->bi_vcnt]; - WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED)); WARN_ON_ONCE(bio_full(bio, len)); - bv->bv_page = page; - bv->bv_offset = off; - bv->bv_len = len; - + bvec_set_page(&bio->bi_io_vec[bio->bi_vcnt], page, len, off); bio->bi_iter.bi_size += len; bio->bi_vcnt++; - - if (!bio_flagged(bio, BIO_WORKINGSET) && unlikely(PageWorkingset(page))) - bio_set_flag(bio, BIO_WORKINGSET); } EXPORT_SYMBOL_GPL(__bio_add_page); @@ -972,46 +1096,129 @@ int bio_add_page(struct bio *bio, struct page *page, { bool same_page = false; - if (!__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (bio_full(bio, len)) - return 0; - __bio_add_page(bio, page, len, offset); + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return 0; + if 
(bio->bi_iter.bi_size > UINT_MAX - len) + return 0; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + return len; } + + if (bio->bi_vcnt >= bio->bi_max_vecs) + return 0; + __bio_add_page(bio, page, len, offset); return len; } EXPORT_SYMBOL(bio_add_page); -void bio_release_pages(struct bio *bio, bool mark_dirty) +void bio_add_folio_nofail(struct bio *bio, struct folio *folio, size_t len, + size_t off) { - struct bvec_iter_all iter_all; - struct bio_vec *bvec; + WARN_ON_ONCE(len > UINT_MAX); + WARN_ON_ONCE(off > UINT_MAX); + __bio_add_page(bio, &folio->page, len, off); +} - if (bio_flagged(bio, BIO_NO_PAGE_REF)) - return; +/** + * bio_add_folio - Attempt to add part of a folio to a bio. + * @bio: BIO to add to. + * @folio: Folio to add. + * @len: How many bytes from the folio to add. + * @off: First byte in this folio to add. + * + * Filesystems that use folios can call this function instead of calling + * bio_add_page() for each page in the folio. If @off is bigger than + * PAGE_SIZE, this function can create a bio_vec that starts in a page + * after the bv_page. BIOs do not support folios that are 4GiB or larger. + * + * Return: Whether the addition was successful. 
+ */ +bool bio_add_folio(struct bio *bio, struct folio *folio, size_t len, + size_t off) +{ + if (len > UINT_MAX || off > UINT_MAX) + return false; + return bio_add_page(bio, &folio->page, len, off) > 0; +} +EXPORT_SYMBOL(bio_add_folio); - bio_for_each_segment_all(bvec, bio, iter_all) { - if (mark_dirty && !PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); - put_page(bvec->bv_page); +void __bio_release_pages(struct bio *bio, bool mark_dirty) +{ + struct folio_iter fi; + + bio_for_each_folio_all(fi, bio) { + struct page *page; + size_t done = 0; + + if (mark_dirty) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); + } + page = folio_page(fi.folio, fi.offset / PAGE_SIZE); + do { + bio_release_page(bio, page++); + done += PAGE_SIZE; + } while (done < fi.length); } } -EXPORT_SYMBOL_GPL(bio_release_pages); +EXPORT_SYMBOL_GPL(__bio_release_pages); -static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) +void bio_iov_bvec_set(struct bio *bio, struct iov_iter *iter) { - const struct bio_vec *bv = iter->bvec; - unsigned int len; - size_t size; + size_t size = iov_iter_count(iter); - if (WARN_ON_ONCE(iter->iov_offset > bv->bv_len)) - return -EINVAL; + WARN_ON_ONCE(bio->bi_max_vecs); + + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + size_t max_sectors = queue_max_zone_append_sectors(q); + + size = min(size, max_sectors << SECTOR_SHIFT); + } - len = min_t(size_t, bv->bv_len - iter->iov_offset, iter->count); - size = bio_add_page(bio, bv->bv_page, len, - bv->bv_offset + iter->iov_offset); - if (unlikely(size != len)) + bio->bi_vcnt = iter->nr_segs; + bio->bi_io_vec = (struct bio_vec *)iter->bvec; + bio->bi_iter.bi_bvec_done = iter->iov_offset; + bio->bi_iter.bi_size = size; + bio_set_flag(bio, BIO_CLONED); +} + +static int bio_iov_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + bool same_page = false; + + if 
(WARN_ON_ONCE(bio->bi_iter.bi_size > UINT_MAX - len)) + return -EIO; + + if (bio->bi_vcnt > 0 && + bvec_try_merge_page(&bio->bi_io_vec[bio->bi_vcnt - 1], + page, len, offset, &same_page)) { + bio->bi_iter.bi_size += len; + if (same_page) + bio_release_page(bio, page); + return 0; + } + __bio_add_page(bio, page, len, offset); + return 0; +} + +static int bio_iov_add_zone_append_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + bool same_page = false; + + if (bio_add_hw_page(q, bio, page, len, offset, + queue_max_zone_append_sectors(q), &same_page) != len) return -EINVAL; - iov_iter_advance(iter, size); + if (same_page) + bio_release_page(bio, page); return 0; } @@ -1022,96 +1229,81 @@ static int __bio_iov_bvec_add_pages(struct bio *bio, struct iov_iter *iter) * @bio: bio to add pages to * @iter: iov iterator describing the region to be mapped * - * Pins pages from *iter and appends them to @bio's bvec array. The - * pages will have to be released using put_page() when done. - * For multi-segment *iter, this function only adds pages from the - * the next non-empty segment of the iov iterator. + * Extracts pages from *iter and appends them to @bio's bvec array. The pages + * will have to be cleaned up in the way indicated by the BIO_PAGE_PINNED flag. + * For a multi-segment *iter, this function only adds pages from the next + * non-empty segment of the iov iterator. 
*/ static int __bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { + iov_iter_extraction_t extraction_flags = 0; unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; struct page **pages = (struct page **)bv; - bool same_page = false; ssize_t size, left; - unsigned len, i; + unsigned len, i = 0; size_t offset; + int ret = 0; /* * Move page array up in the allocated memory for the bio vecs as far as * possible so that we can start filling biovecs from the beginning * without overwriting the temporary page array. - */ + */ BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); + if (bio->bi_bdev && blk_queue_pci_p2pdma(bio->bi_bdev->bd_disk->queue)) + extraction_flags |= ITER_ALLOW_P2PDMA; + + /* + * Each segment in the iov is required to be a block size multiple. + * However, we may not be able to get the entire segment if it spans + * more pages than bi_max_vecs allows, so we have to ALIGN_DOWN the + * result to ensure the bio's total size is correct. The remainder of + * the iov data will be picked up in the next bio iteration. + */ + size = iov_iter_extract_pages(iter, &pages, + UINT_MAX - bio->bi_iter.bi_size, + nr_pages, extraction_flags, &offset); if (unlikely(size <= 0)) return size ? 
size : -EFAULT; - for (left = size, i = 0; left > 0; left -= len, i++) { - struct page *page = pages[i]; - - len = min_t(size_t, PAGE_SIZE - offset, left); + nr_pages = DIV_ROUND_UP(offset + size, PAGE_SIZE); - if (__bio_try_merge_page(bio, page, len, offset, &same_page)) { - if (same_page) - put_page(page); - } else { - if (WARN_ON_ONCE(bio_full(bio, len))) - return -EINVAL; - __bio_add_page(bio, page, len, offset); - } - offset = 0; + if (bio->bi_bdev) { + size_t trim = size & (bdev_logical_block_size(bio->bi_bdev) - 1); + iov_iter_revert(iter, trim); + size -= trim; } - iov_iter_advance(iter, size); - return 0; -} - -static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) -{ - unsigned short nr_pages = bio->bi_max_vecs - bio->bi_vcnt; - unsigned short entries_left = bio->bi_max_vecs - bio->bi_vcnt; - struct request_queue *q = bio->bi_disk->queue; - unsigned int max_append_sectors = queue_max_zone_append_sectors(q); - struct bio_vec *bv = bio->bi_io_vec + bio->bi_vcnt; - struct page **pages = (struct page **)bv; - ssize_t size, left; - unsigned len, i; - size_t offset; - - if (WARN_ON_ONCE(!max_append_sectors)) - return 0; - - /* - * Move page array up in the allocated memory for the bio vecs as far as - * possible so that we can start filling biovecs from the beginning - * without overwriting the temporary page array. - */ - BUILD_BUG_ON(PAGE_PTRS_PER_BVEC < 2); - pages += entries_left * (PAGE_PTRS_PER_BVEC - 1); - - size = iov_iter_get_pages(iter, pages, LONG_MAX, nr_pages, &offset); - if (unlikely(size <= 0)) - return size ? 
size : -EFAULT; + if (unlikely(!size)) { + ret = -EFAULT; + goto out; + } for (left = size, i = 0; left > 0; left -= len, i++) { struct page *page = pages[i]; - bool same_page = false; len = min_t(size_t, PAGE_SIZE - offset, left); - if (bio_add_hw_page(q, bio, page, len, offset, - max_append_sectors, &same_page) != len) - return -EINVAL; - if (same_page) - put_page(page); + if (bio_op(bio) == REQ_OP_ZONE_APPEND) { + ret = bio_iov_add_zone_append_page(bio, page, len, + offset); + if (ret) + break; + } else + bio_iov_add_page(bio, page, len, offset); + offset = 0; } - iov_iter_advance(iter, size); - return 0; + iov_iter_revert(iter, left); +out: + while (i < nr_pages) + bio_release_page(bio, pages[i++]); + + return ret; } /** @@ -1122,41 +1314,37 @@ static int __bio_iov_append_get_pages(struct bio *bio, struct iov_iter *iter) * This takes either an iterator pointing to user memory, or one pointing to * kernel pages (BVEC iterator). If we're adding user pages, we pin them and * map them into the kernel. On IO completion, the caller should put those - * pages. If we're adding kernel pages, and the caller told us it's safe to - * do so, we just have to add the pages to the bio directly. We don't grab an - * extra reference to those pages (the user should already have that), and we - * don't put the page on IO completion. The caller needs to check if the bio is - * flagged BIO_NO_PAGE_REF on IO completion. If it isn't, then pages should be - * released. + * pages. For bvec based iterators bio_iov_iter_get_pages() uses the provided + * bvecs rather than copying them. Hence anyone issuing kiocb based IO needs + * to ensure the bvecs and pages stay referenced until the submitted I/O is + * completed by a call to ->ki_complete() or returns with an error other than + * -EIOCBQUEUED. The caller needs to check if the bio is flagged BIO_NO_PAGE_REF + * on IO completion. If it isn't, then pages should be released. 
* * The function tries, but does not guarantee, to pin as many pages as - * fit into the bio, or are requested in *iter, whatever is smaller. If + * fit into the bio, or are requested in @iter, whatever is smaller. If * MM encounters an error pinning the requested pages, it stops. Error * is returned only if 0 pages could be pinned. */ int bio_iov_iter_get_pages(struct bio *bio, struct iov_iter *iter) { - const bool is_bvec = iov_iter_is_bvec(iter); - int ret; + int ret = 0; - if (WARN_ON_ONCE(bio->bi_vcnt)) - return -EINVAL; + if (WARN_ON_ONCE(bio_flagged(bio, BIO_CLONED))) + return -EIO; + + if (iov_iter_is_bvec(iter)) { + bio_iov_bvec_set(bio, iter); + iov_iter_advance(iter, bio->bi_iter.bi_size); + return 0; + } + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); do { - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { - if (WARN_ON_ONCE(is_bvec)) - return -EINVAL; - ret = __bio_iov_append_get_pages(bio, iter); - } else { - if (is_bvec) - ret = __bio_iov_bvec_add_pages(bio, iter); - else - ret = __bio_iov_iter_get_pages(bio, iter); - } + ret = __bio_iov_iter_get_pages(bio, iter); } while (!ret && iov_iter_count(iter) && !bio_full(bio, 0)); - if (is_bvec) - bio_set_flag(bio, BIO_NO_PAGE_REF); return bio->bi_vcnt ? 
0 : ret; } EXPORT_SYMBOL_GPL(bio_iov_iter_get_pages); @@ -1179,7 +1367,8 @@ static void submit_bio_wait_endio(struct bio *bio) */ int submit_bio_wait(struct bio *bio) { - DECLARE_COMPLETION_ONSTACK_MAP(done, bio->bi_disk->lockdep_map); + DECLARE_COMPLETION_ONSTACK_MAP(done, + bio->bi_bdev->bd_disk->lockdep_map); unsigned long hang_check; bio->bi_private = &done; @@ -1200,18 +1389,7 @@ int submit_bio_wait(struct bio *bio) } EXPORT_SYMBOL(submit_bio_wait); -/** - * bio_advance - increment/complete a bio by some number of bytes - * @bio: bio to advance - * @bytes: number of bytes to complete - * - * This updates bi_sector, bi_size and bi_idx; if the number of bytes to - * complete doesn't align with a bvec boundary, then bv_len and bv_offset will - * be updated on the last bvec as well. - * - * @bio will then represent the remaining, uncompleted portion of the io. - */ -void bio_advance(struct bio *bio, unsigned bytes) +void __bio_advance(struct bio *bio, unsigned bytes) { if (bio_integrity(bio)) bio_integrity_advance(bio, bytes); @@ -1219,35 +1397,25 @@ void bio_advance(struct bio *bio, unsigned bytes) bio_crypt_advance(bio, bytes); bio_advance_iter(bio, &bio->bi_iter, bytes); } -EXPORT_SYMBOL(bio_advance); +EXPORT_SYMBOL(__bio_advance); void bio_copy_data_iter(struct bio *dst, struct bvec_iter *dst_iter, struct bio *src, struct bvec_iter *src_iter) { - struct bio_vec src_bv, dst_bv; - void *src_p, *dst_p; - unsigned bytes; - while (src_iter->bi_size && dst_iter->bi_size) { - src_bv = bio_iter_iovec(src, *src_iter); - dst_bv = bio_iter_iovec(dst, *dst_iter); + struct bio_vec src_bv = bio_iter_iovec(src, *src_iter); + struct bio_vec dst_bv = bio_iter_iovec(dst, *dst_iter); + unsigned int bytes = min(src_bv.bv_len, dst_bv.bv_len); + void *src_buf = bvec_kmap_local(&src_bv); + void *dst_buf = bvec_kmap_local(&dst_bv); - bytes = min(src_bv.bv_len, dst_bv.bv_len); + memcpy(dst_buf, src_buf, bytes); - src_p = kmap_atomic(src_bv.bv_page); - dst_p = 
kmap_atomic(dst_bv.bv_page); + kunmap_local(dst_buf); + kunmap_local(src_buf); - memcpy(dst_p + dst_bv.bv_offset, - src_p + src_bv.bv_offset, - bytes); - - kunmap_atomic(dst_p); - kunmap_atomic(src_p); - - flush_dcache_page(dst_bv.bv_page); - - bio_advance_iter(src, src_iter, bytes); - bio_advance_iter(dst, dst_iter, bytes); + bio_advance_iter_single(src, src_iter, bytes); + bio_advance_iter_single(dst, dst_iter, bytes); } } EXPORT_SYMBOL(bio_copy_data_iter); @@ -1269,43 +1437,6 @@ void bio_copy_data(struct bio *dst, struct bio *src) } EXPORT_SYMBOL(bio_copy_data); -/** - * bio_list_copy_data - copy contents of data buffers from one chain of bios to - * another - * @src: source bio list - * @dst: destination bio list - * - * Stops when it reaches the end of either the @src list or @dst list - that is, - * copies min(src->bi_size, dst->bi_size) bytes (or the equivalent for lists of - * bios). - */ -void bio_list_copy_data(struct bio *dst, struct bio *src) -{ - struct bvec_iter src_iter = src->bi_iter; - struct bvec_iter dst_iter = dst->bi_iter; - - while (1) { - if (!src_iter.bi_size) { - src = src->bi_next; - if (!src) - break; - - src_iter = src->bi_iter; - } - - if (!dst_iter.bi_size) { - dst = dst->bi_next; - if (!dst) - break; - - dst_iter = dst->bi_iter; - } - - bio_copy_data_iter(dst, &dst_iter, src, &src_iter); - } -} -EXPORT_SYMBOL(bio_list_copy_data); - void bio_free_pages(struct bio *bio) { struct bio_vec *bvec; @@ -1320,18 +1451,12 @@ EXPORT_SYMBOL(bio_free_pages); * bio_set_pages_dirty() and bio_check_pages_dirty() are support functions * for performing direct-IO in BIOs. * - * The problem is that we cannot run set_page_dirty() from interrupt context + * The problem is that we cannot run folio_mark_dirty() from interrupt context * because the required locks are not interrupt-safe. So what we can do is to * mark the pages dirty _before_ performing IO. And in interrupt context, * check that the pages are still dirty. If so, fine. 
If not, redirty them * in process context. * - * We special-case compound pages here: normally this means reads into hugetlb - * pages. The logic in here doesn't really work right for compound pages - * because the VM does not uniformly chase down the head page in all cases. - * But dirtiness of compound pages is pretty meaningless anyway: the VM doesn't - * handle them at all. So we skip compound pages here at an early stage. - * * Note that this code is very hard to test under normal circumstances because * direct-io pins the pages with get_user_pages(). This makes * is_page_cache_freeable return false, and the VM will not clean the pages. @@ -1347,14 +1472,15 @@ EXPORT_SYMBOL(bio_free_pages); */ void bio_set_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageCompound(bvec->bv_page)) - set_page_dirty_lock(bvec->bv_page); + bio_for_each_folio_all(fi, bio) { + folio_lock(fi.folio); + folio_mark_dirty(fi.folio); + folio_unlock(fi.folio); } } +EXPORT_SYMBOL_GPL(bio_set_pages_dirty); /* * bio_check_pages_dirty() will check that all the BIO's pages are still dirty. @@ -1363,8 +1489,8 @@ void bio_set_pages_dirty(struct bio *bio) * the BIO and re-dirty the pages in process context. * * It is expected that bio_check_pages_dirty() will wholly own the BIO from - * here on. It will run one put_page() against each page and will run one - * bio_put() against the BIO. + * here on. It will unpin each page and will run one bio_put() against the + * BIO. 
*/ static void bio_dirty_fn(struct work_struct *work); @@ -1395,12 +1521,11 @@ static void bio_dirty_fn(struct work_struct *work) void bio_check_pages_dirty(struct bio *bio) { - struct bio_vec *bvec; + struct folio_iter fi; unsigned long flags; - struct bvec_iter_all iter_all; - bio_for_each_segment_all(bvec, bio, iter_all) { - if (!PageDirty(bvec->bv_page) && !PageCompound(bvec->bv_page)) + bio_for_each_folio_all(fi, bio) { + if (!folio_test_dirty(fi.folio)) goto defer; } @@ -1414,6 +1539,7 @@ defer: spin_unlock_irqrestore(&bio_dirty_lock, flags); schedule_work(&bio_dirty_work); } +EXPORT_SYMBOL_GPL(bio_check_pages_dirty); static inline bool bio_remaining_done(struct bio *bio) { @@ -1445,8 +1571,7 @@ static inline bool bio_remaining_done(struct bio *bio) * * bio_endio() can be called several times on a bio that has been chained * using bio_chain(). The ->bi_end_io() function will only be called the - * last time. At this point the BLK_TA_COMPLETE tracing event will be - * generated if BIO_TRACE_COMPLETION is set. + * last time. 
**/ void bio_endio(struct bio *bio) { @@ -1456,8 +1581,12 @@ again: if (!bio_integrity_endio(bio)) return; - if (bio->bi_disk) - rq_qos_done_bio(bio->bi_disk->queue, bio); + rq_qos_done_bio(bio); + + if (bio->bi_bdev && bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_complete(bdev_get_queue(bio->bi_bdev), bio); + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + } /* * Need to have a real endio function for chained bios, otherwise @@ -1472,11 +1601,6 @@ again: goto again; } - if (bio->bi_disk && bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_complete(bio->bi_disk->queue, bio); - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - } - blk_throtl_bio_endio(bio); /* release cgroup info */ bio_uninit(bio); @@ -1511,7 +1635,7 @@ struct bio *bio_split(struct bio *bio, int sectors, if (WARN_ON_ONCE(bio_op(bio) == REQ_OP_ZONE_APPEND)) return NULL; - split = bio_clone_fast(bio, gfp, bs); + split = bio_alloc_clone(bio->bi_bdev, bio, gfp, bs); if (!split) return NULL; @@ -1534,12 +1658,15 @@ EXPORT_SYMBOL(bio_split); * @bio: bio to trim * @offset: number of sectors to trim from the front of @bio * @size: size we want to trim @bio to, in sectors + * + * This function is typically used for bios that are cloned and submitted + * to the underlying device in parts. */ -void bio_trim(struct bio *bio, int offset, int size) +void bio_trim(struct bio *bio, sector_t offset, sector_t size) { - /* 'bio' is a cloned bio which we need to trim to match - * the given offset and size. 
- */ + if (WARN_ON_ONCE(offset > BIO_MAX_SECTORS || size > BIO_MAX_SECTORS || + offset + size > bio_sectors(bio))) + return; size <<= 9; if (offset == 0 && size == bio->bi_iter.bi_size) @@ -1550,7 +1677,6 @@ void bio_trim(struct bio *bio, int offset, int size) if (bio_integrity(bio)) bio_integrity_trim(bio); - } EXPORT_SYMBOL_GPL(bio_trim); @@ -1560,7 +1686,7 @@ EXPORT_SYMBOL_GPL(bio_trim); */ int biovec_init_pool(mempool_t *pool, int pool_entries) { - struct biovec_slab *bp = bvec_slabs + BVEC_POOL_MAX; + struct biovec_slab *bp = bvec_slabs + ARRAY_SIZE(bvec_slabs) - 1; return mempool_init_slab_pool(pool, pool_entries, bp->slab); } @@ -1573,6 +1699,7 @@ int biovec_init_pool(mempool_t *pool, int pool_entries) */ void bioset_exit(struct bio_set *bs) { + bio_alloc_cache_destroy(bs); if (bs->rescue_workqueue) destroy_workqueue(bs->rescue_workqueue); bs->rescue_workqueue = NULL; @@ -1603,9 +1730,9 @@ EXPORT_SYMBOL(bioset_exit); * Note that the bio must be embedded at the END of that structure always, * or things will break badly. * If %BIOSET_NEED_BVECS is set in @flags, a separate pool will be allocated - * for allocating iovecs. This pool is not needed e.g. for bio_clone_fast(). - * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used to - * dispatch queued requests when the mempool runs out of space. + * for allocating iovecs. This pool is not needed e.g. for bio_init_clone(). + * If %BIOSET_NEED_RESCUER is set, a workqueue is created which can be used + * to dispatch queued requests when the mempool runs out of space. 
* */ int bioset_init(struct bio_set *bs, @@ -1613,15 +1740,17 @@ int bioset_init(struct bio_set *bs, unsigned int front_pad, int flags) { - unsigned int back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); - bs->front_pad = front_pad; + if (flags & BIOSET_NEED_BVECS) + bs->back_pad = BIO_INLINE_VECS * sizeof(struct bio_vec); + else + bs->back_pad = 0; spin_lock_init(&bs->rescue_lock); bio_list_init(&bs->rescue_list); INIT_WORK(&bs->rescue_work, bio_alloc_rescue); - bs->bio_slab = bio_find_or_create_slab(front_pad + back_pad); + bs->bio_slab = bio_find_or_create_slab(bs); if (!bs->bio_slab) return -ENOMEM; @@ -1632,12 +1761,18 @@ int bioset_init(struct bio_set *bs, biovec_init_pool(&bs->bvec_pool, pool_size)) goto bad; - if (!(flags & BIOSET_NEED_RESCUER)) - return 0; - - bs->rescue_workqueue = alloc_workqueue("bioset", WQ_MEM_RECLAIM, 0); - if (!bs->rescue_workqueue) - goto bad; + if (flags & BIOSET_NEED_RESCUER) { + bs->rescue_workqueue = alloc_workqueue("bioset", + WQ_MEM_RECLAIM, 0); + if (!bs->rescue_workqueue) + goto bad; + } + if (flags & BIOSET_PERCPU_CACHE) { + bs->cache = alloc_percpu(struct bio_alloc_cache); + if (!bs->cache) + goto bad; + cpuhp_state_add_instance_nocalls(CPUHP_BIO_DEAD, &bs->cpuhp_dead); + } return 0; bad: @@ -1646,194 +1781,27 @@ bad: } EXPORT_SYMBOL(bioset_init); -/* - * Initialize and setup a new bio_set, based on the settings from - * another bio_set. - */ -int bioset_init_from_src(struct bio_set *bs, struct bio_set *src) -{ - int flags; - - flags = 0; - if (src->bvec_pool.min_nr) - flags |= BIOSET_NEED_BVECS; - if (src->rescue_workqueue) - flags |= BIOSET_NEED_RESCUER; - - return bioset_init(bs, src->bio_pool.min_nr, src->front_pad, flags); -} -EXPORT_SYMBOL(bioset_init_from_src); - -#ifdef CONFIG_BLK_CGROUP - -/** - * bio_disassociate_blkg - puts back the blkg reference if associated - * @bio: target bio - * - * Helper to disassociate the blkg from @bio if a blkg is associated. 
- */ -void bio_disassociate_blkg(struct bio *bio) -{ - if (bio->bi_blkg) { - blkg_put(bio->bi_blkg); - bio->bi_blkg = NULL; - } -} -EXPORT_SYMBOL_GPL(bio_disassociate_blkg); - -/** - * __bio_associate_blkg - associate a bio with the a blkg - * @bio: target bio - * @blkg: the blkg to associate - * - * This tries to associate @bio with the specified @blkg. Association failure - * is handled by walking up the blkg tree. Therefore, the blkg associated can - * be anything between @blkg and the root_blkg. This situation only happens - * when a cgroup is dying and then the remaining bios will spill to the closest - * alive blkg. - * - * A reference will be taken on the @blkg and will be released when @bio is - * freed. - */ -static void __bio_associate_blkg(struct bio *bio, struct blkcg_gq *blkg) -{ - bio_disassociate_blkg(bio); - - bio->bi_blkg = blkg_tryget_closest(blkg); -} - -/** - * bio_associate_blkg_from_css - associate a bio with a specified css - * @bio: target bio - * @css: target css - * - * Associate @bio with the blkg found by combining the css's blkg and the - * request_queue of the @bio. This falls back to the queue's root_blkg if - * the association fails with the css. - */ -void bio_associate_blkg_from_css(struct bio *bio, - struct cgroup_subsys_state *css) -{ - struct request_queue *q = bio->bi_disk->queue; - struct blkcg_gq *blkg; - - rcu_read_lock(); - - if (!css || !css->parent) - blkg = q->root_blkg; - else - blkg = blkg_lookup_create(css_to_blkcg(css), q); - - __bio_associate_blkg(bio, blkg); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); - -#ifdef CONFIG_MEMCG -/** - * bio_associate_blkg_from_page - associate a bio with the page's blkg - * @bio: target bio - * @page: the page to lookup the blkcg from - * - * Associate @bio with the blkg from @page's owning memcg and the respective - * request_queue. If cgroup_e_css returns %NULL, fall back to the queue's - * root_blkg. 
- */ -void bio_associate_blkg_from_page(struct bio *bio, struct page *page) -{ - struct cgroup_subsys_state *css; - - if (!page->mem_cgroup) - return; - - rcu_read_lock(); - - css = cgroup_e_css(page->mem_cgroup->css.cgroup, &io_cgrp_subsys); - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -#endif /* CONFIG_MEMCG */ - -/** - * bio_associate_blkg - associate a bio with a blkg - * @bio: target bio - * - * Associate @bio with the blkg found from the bio's css and request_queue. - * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is - * already associated, the css is reused and association redone as the - * request_queue may have changed. - */ -void bio_associate_blkg(struct bio *bio) -{ - struct cgroup_subsys_state *css; - - rcu_read_lock(); - - if (bio->bi_blkg) - css = &bio_blkcg(bio)->css; - else - css = blkcg_css(); - - bio_associate_blkg_from_css(bio, css); - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_associate_blkg); - -/** - * bio_clone_blkg_association - clone blkg association from src to dst bio - * @dst: destination bio - * @src: source bio - */ -void bio_clone_blkg_association(struct bio *dst, struct bio *src) +static int __init init_bio(void) { - rcu_read_lock(); - - if (src->bi_blkg) - __bio_associate_blkg(dst, src->bi_blkg); + int i; - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(bio_clone_blkg_association); -#endif /* CONFIG_BLK_CGROUP */ + BUILD_BUG_ON(BIO_FLAG_LAST > 8 * sizeof_field(struct bio, bi_flags)); -static void __init biovec_init_slabs(void) -{ - int i; + bio_integrity_init(); - for (i = 0; i < BVEC_POOL_NR; i++) { - int size; + for (i = 0; i < ARRAY_SIZE(bvec_slabs); i++) { struct biovec_slab *bvs = bvec_slabs + i; - if (bvs->nr_vecs <= BIO_INLINE_VECS) { - bvs->slab = NULL; - continue; - } - - size = bvs->nr_vecs * sizeof(struct bio_vec); - bvs->slab = kmem_cache_create(bvs->name, size, 0, - SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL); + bvs->slab = kmem_cache_create(bvs->name, + bvs->nr_vecs * sizeof(struct 
bio_vec), 0, + SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL); } -} - -static int __init init_bio(void) -{ - bio_slab_max = 2; - bio_slab_nr = 0; - bio_slabs = kcalloc(bio_slab_max, sizeof(struct bio_slab), - GFP_KERNEL); - BUILD_BUG_ON(BIO_FLAG_LAST > BVEC_POOL_OFFSET); - - if (!bio_slabs) - panic("bio: can't allocate bios\n"); - - bio_integrity_init(); - biovec_init_slabs(); + cpuhp_setup_state_multi(CPUHP_BIO_DEAD, "block/bio:dead", NULL, + bio_cpu_dead); - if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS)) + if (bioset_init(&fs_bio_set, BIO_POOL_SIZE, 0, + BIOSET_NEED_BVECS | BIOSET_PERCPU_CACHE)) panic("bio: can't allocate bios\n"); if (bioset_integrity_create(&fs_bio_set, BIO_POOL_SIZE)) diff --git a/block/blk-cgroup-fc-appid.c b/block/blk-cgroup-fc-appid.c new file mode 100644 index 000000000000..3ec21333f393 --- /dev/null +++ b/block/blk-cgroup-fc-appid.c @@ -0,0 +1,57 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "blk-cgroup.h" + +/** + * blkcg_set_fc_appid - set the fc_app_id field associted to blkcg + * @app_id: application identifier + * @cgrp_id: cgroup id + * @app_id_len: size of application identifier + */ +int blkcg_set_fc_appid(char *app_id, u64 cgrp_id, size_t app_id_len) +{ + struct cgroup *cgrp; + struct cgroup_subsys_state *css; + struct blkcg *blkcg; + int ret = 0; + + if (app_id_len > FC_APPID_LEN) + return -EINVAL; + + cgrp = cgroup_get_from_id(cgrp_id); + if (IS_ERR(cgrp)) + return PTR_ERR(cgrp); + css = cgroup_get_e_css(cgrp, &io_cgrp_subsys); + if (!css) { + ret = -ENOENT; + goto out_cgrp_put; + } + blkcg = css_to_blkcg(css); + /* + * There is a slight race condition on setting the appid. + * Worst case an I/O may not find the right id. + * This is no different from the I/O we let pass while obtaining + * the vmid from the fabric. + * Adding the overhead of a lock is not necessary. 
+ */ + strscpy(blkcg->fc_app_id, app_id, app_id_len); + css_put(css); +out_cgrp_put: + cgroup_put(cgrp); + return ret; +} +EXPORT_SYMBOL_GPL(blkcg_set_fc_appid); + +/** + * blkcg_get_fc_appid - get the fc app identifier associated with a bio + * @bio: target bio + * + * On success return the fc_app_id, on failure return NULL + */ +char *blkcg_get_fc_appid(struct bio *bio) +{ + if (!bio->bi_blkg || bio->bi_blkg->blkcg->fc_app_id[0] == '\0') + return NULL; + return bio->bi_blkg->blkcg->fc_app_id; +} +EXPORT_SYMBOL_GPL(blkcg_get_fc_appid); diff --git a/block/blk-cgroup-rwstat.c b/block/blk-cgroup-rwstat.c index 85d5790ac49b..3304e841df7c 100644 --- a/block/blk-cgroup-rwstat.c +++ b/block/blk-cgroup-rwstat.c @@ -109,6 +109,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, lockdep_assert_held(&blkg->q->queue_lock); + memset(sum, 0, sizeof(*sum)); rcu_read_lock(); blkg_for_each_descendant_pre(pos_blkg, pos_css, blkg) { struct blkg_rwstat *rwstat; @@ -122,7 +123,7 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, rwstat = (void *)pos_blkg + off; for (i = 0; i < BLKG_RWSTAT_NR; i++) - sum->cnt[i] = blkg_rwstat_read_counter(rwstat, i); + sum->cnt[i] += blkg_rwstat_read_counter(rwstat, i); } rcu_read_unlock(); } diff --git a/block/blk-cgroup-rwstat.h b/block/blk-cgroup-rwstat.h index ee746919c41f..022527b0b043 100644 --- a/block/blk-cgroup-rwstat.h +++ b/block/blk-cgroup-rwstat.h @@ -6,7 +6,7 @@ #ifndef _BLK_CGROUP_RWSTAT_H #define _BLK_CGROUP_RWSTAT_H -#include <linux/blk-cgroup.h> +#include "blk-cgroup.h" enum blkg_rwstat_type { BLKG_RWSTAT_READ, @@ -59,20 +59,20 @@ void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol, * caller is responsible for synchronizing calls to this function. 
*/ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat, - unsigned int op, uint64_t val) + blk_opf_t opf, uint64_t val) { struct percpu_counter *cnt; - if (op_is_discard(op)) + if (op_is_discard(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD]; - else if (op_is_write(op)) + else if (op_is_write(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ]; percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH); - if (op_is_sync(op)) + if (op_is_sync(opf)) cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC]; else cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC]; diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 0ecc897b225c..ff93c385ba5a 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -23,16 +23,18 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/slab.h> -#include <linux/genhd.h> #include <linux/delay.h> #include <linux/atomic.h> #include <linux/ctype.h> -#include <linux/blk-cgroup.h> -#include <linux/tracehook.h> +#include <linux/resume_user_mode.h> #include <linux/psi.h> +#include <linux/part_stat.h> #include "blk.h" +#include "blk-cgroup.h" +#include "blk-ioprio.h" +#include "blk-throttle.h" -#define MAX_KEY_LEN 100 +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu); /* * blkcg_pol_mutex protects blkcg_policy[] and policy [de]activation. @@ -55,7 +57,58 @@ static struct blkcg_policy *blkcg_policy[BLKCG_MAX_POLS]; static LIST_HEAD(all_blkcgs); /* protected by blkcg_pol_mutex */ bool blkcg_debug_stats = false; -static struct workqueue_struct *blkcg_punt_bio_wq; + +static DEFINE_RAW_SPINLOCK(blkg_stat_lock); + +#define BLKG_DESTROY_BATCH_SIZE 64 + +/* + * Lockless lists for tracking IO stats update + * + * New IO stats are stored in the percpu iostat_cpu within blkcg_gq (blkg). + * There are multiple blkg's (one for each block device) attached to each + * blkcg. The rstat code keeps track of which cpu has IO stats updated, + * but it doesn't know which blkg has the updated stats. 
If there are many + * block devices in a system, the cost of iterating all the blkg's to flush + * out the IO stats can be high. To reduce such overhead, a set of percpu + * lockless lists (lhead) per blkcg are used to track the set of recently + * updated iostat_cpu's since the last flush. An iostat_cpu will be put + * onto the lockless list on the update side [blk_cgroup_bio_start()] if + * not there yet and then removed when being flushed [blkcg_rstat_flush()]. + * References to blkg are gotten and then put back in the process to + * protect against blkg removal. + * + * Return: 0 if successful or -ENOMEM if allocation fails. + */ +static int init_blkcg_llists(struct blkcg *blkcg) +{ + int cpu; + + blkcg->lhead = alloc_percpu_gfp(struct llist_head, GFP_KERNEL); + if (!blkcg->lhead) + return -ENOMEM; + + for_each_possible_cpu(cpu) + init_llist_head(per_cpu_ptr(blkcg->lhead, cpu)); + return 0; +} + +/** + * blkcg_css - find the current css + * + * Find the css associated with either the kthread or the current task. + * This may return a dying css, so it is up to the caller to use tryget logic + * to confirm it is alive and well. 
+ */ +static struct cgroup_subsys_state *blkcg_css(void) +{ + struct cgroup_subsys_state *css; + + css = kthread_blkcg(); + if (css) + return css; + return task_css(current, io_cgrp_id); +} static bool blkcg_policy_enabled(struct request_queue *q, const struct blkcg_policy *pol) @@ -63,6 +116,37 @@ static bool blkcg_policy_enabled(struct request_queue *q, return pol && test_bit(pol->plid, q->blkcg_pols); } +static void blkg_free_workfn(struct work_struct *work) +{ + struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, + free_work); + struct request_queue *q = blkg->q; + int i; + + /* + * pd_free_fn() can also be called from blkcg_deactivate_policy(), + * in order to make sure pd_free_fn() is called in order, the deletion + * of the list blkg->q_node is delayed to here from blkg_destroy(), and + * blkcg_mutex is used to synchronize blkg_free_workfn() and + * blkcg_deactivate_policy(). + */ + mutex_lock(&q->blkcg_mutex); + for (i = 0; i < BLKCG_MAX_POLS; i++) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + if (blkg->parent) + blkg_put(blkg->parent); + spin_lock_irq(&q->queue_lock); + list_del_init(&blkg->q_node); + spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); + + blk_put_queue(q); + free_percpu(blkg->iostat_cpu); + percpu_ref_exit(&blkg->refcnt); + kfree(blkg); +} + /** * blkg_free - free a blkg * @blkg: blkg to free @@ -71,33 +155,37 @@ static bool blkcg_policy_enabled(struct request_queue *q, */ static void blkg_free(struct blkcg_gq *blkg) { - int i; - if (!blkg) return; - for (i = 0; i < BLKCG_MAX_POLS; i++) - if (blkg->pd[i]) - blkcg_policy[i]->pd_free_fn(blkg->pd[i]); - - free_percpu(blkg->iostat_cpu); - percpu_ref_exit(&blkg->refcnt); - kfree(blkg); + /* + * Both ->pd_free_fn() and request queue's release handler may + * sleep, so free us by scheduling one work func + */ + INIT_WORK(&blkg->free_work, blkg_free_workfn); + schedule_work(&blkg->free_work); } static void __blkg_release(struct rcu_head *rcu) { struct 
blkcg_gq *blkg = container_of(rcu, struct blkcg_gq, rcu_head); + struct blkcg *blkcg = blkg->blkcg; + int cpu; +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO WARN_ON(!bio_list_empty(&blkg->async_bios)); +#endif + /* + * Flush all the non-empty percpu lockless lists before releasing + * us, given these stat belongs to us. + * + * blkg_stat_lock is for serializing blkg stat update + */ + for_each_possible_cpu(cpu) + __blkcg_rstat_flush(blkcg, cpu); /* release the blkcg and parent blkg refs this blkg has been holding */ css_put(&blkg->blkcg->css); - if (blkg->parent) - blkg_put(blkg->parent); - - wb_congested_put(blkg->wb_congested); - blkg_free(blkg); } @@ -116,125 +204,184 @@ static void blkg_release(struct percpu_ref *ref) call_rcu(&blkg->rcu_head, __blkg_release); } +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO +static struct workqueue_struct *blkcg_punt_bio_wq; + static void blkg_async_bio_workfn(struct work_struct *work) { struct blkcg_gq *blkg = container_of(work, struct blkcg_gq, async_bio_work); struct bio_list bios = BIO_EMPTY_LIST; struct bio *bio; + struct blk_plug plug; + bool need_plug = false; /* as long as there are pending bios, @blkg can't go away */ - spin_lock_bh(&blkg->async_bio_lock); + spin_lock(&blkg->async_bio_lock); bio_list_merge(&bios, &blkg->async_bios); bio_list_init(&blkg->async_bios); - spin_unlock_bh(&blkg->async_bio_lock); + spin_unlock(&blkg->async_bio_lock); + /* start plug only when bio_list contains at least 2 bios */ + if (bios.head && bios.head->bi_next) { + need_plug = true; + blk_start_plug(&plug); + } while ((bio = bio_list_pop(&bios))) submit_bio(bio); + if (need_plug) + blk_finish_plug(&plug); +} + +/* + * When a shared kthread issues a bio for a cgroup, doing so synchronously can + * lead to priority inversions as the kthread can be trapped waiting for that + * cgroup. Use this helper instead of submit_bio to punt the actual issuing to + * a dedicated per-blkcg work item to avoid such priority inversions. 
+ */ +void blkcg_punt_bio_submit(struct bio *bio) +{ + struct blkcg_gq *blkg = bio->bi_blkg; + + if (blkg->parent) { + spin_lock(&blkg->async_bio_lock); + bio_list_add(&blkg->async_bios, bio); + spin_unlock(&blkg->async_bio_lock); + queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); + } else { + /* never bounce for the root cgroup */ + submit_bio(bio); + } +} +EXPORT_SYMBOL_GPL(blkcg_punt_bio_submit); + +static int __init blkcg_punt_bio_init(void) +{ + blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", + WQ_MEM_RECLAIM | WQ_FREEZABLE | + WQ_UNBOUND | WQ_SYSFS, 0); + if (!blkcg_punt_bio_wq) + return -ENOMEM; + return 0; +} +subsys_initcall(blkcg_punt_bio_init); +#endif /* CONFIG_BLK_CGROUP_PUNT_BIO */ + +/** + * bio_blkcg_css - return the blkcg CSS associated with a bio + * @bio: target bio + * + * This returns the CSS for the blkcg associated with a bio, or %NULL if not + * associated. Callers are expected to either handle %NULL or know association + * has been done prior to calling this. + */ +struct cgroup_subsys_state *bio_blkcg_css(struct bio *bio) +{ + if (!bio || !bio->bi_blkg) + return NULL; + return &bio->bi_blkg->blkcg->css; +} +EXPORT_SYMBOL_GPL(bio_blkcg_css); + +/** + * blkcg_parent - get the parent of a blkcg + * @blkcg: blkcg of interest + * + * Return the parent blkcg of @blkcg. Can be called anytime. + */ +static inline struct blkcg *blkcg_parent(struct blkcg *blkcg) +{ + return css_to_blkcg(blkcg->css.parent); } /** * blkg_alloc - allocate a blkg * @blkcg: block cgroup the new blkg is associated with - * @q: request_queue the new blkg is associated with + * @disk: gendisk the new blkg is associated with * @gfp_mask: allocation mask to use * - * Allocate a new blkg assocating @blkcg and @q. + * Allocate a new blkg associating @blkcg and @disk. 
*/ -static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct request_queue *q, +static struct blkcg_gq *blkg_alloc(struct blkcg *blkcg, struct gendisk *disk, gfp_t gfp_mask) { struct blkcg_gq *blkg; int i, cpu; /* alloc and init base part */ - blkg = kzalloc_node(sizeof(*blkg), gfp_mask, q->node); + blkg = kzalloc_node(sizeof(*blkg), gfp_mask, disk->queue->node); if (!blkg) return NULL; - if (percpu_ref_init(&blkg->refcnt, blkg_release, 0, gfp_mask)) - goto err_free; - + goto out_free_blkg; blkg->iostat_cpu = alloc_percpu_gfp(struct blkg_iostat_set, gfp_mask); if (!blkg->iostat_cpu) - goto err_free; + goto out_exit_refcnt; + if (!blk_get_queue(disk->queue)) + goto out_free_iostat; - blkg->q = q; + blkg->q = disk->queue; INIT_LIST_HEAD(&blkg->q_node); + blkg->blkcg = blkcg; +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO spin_lock_init(&blkg->async_bio_lock); bio_list_init(&blkg->async_bios); INIT_WORK(&blkg->async_bio_work, blkg_async_bio_workfn); - blkg->blkcg = blkcg; +#endif u64_stats_init(&blkg->iostat.sync); - for_each_possible_cpu(cpu) + for_each_possible_cpu(cpu) { u64_stats_init(&per_cpu_ptr(blkg->iostat_cpu, cpu)->sync); + per_cpu_ptr(blkg->iostat_cpu, cpu)->blkg = blkg; + } for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkg_policy_data *pd; - if (!blkcg_policy_enabled(q, pol)) + if (!blkcg_policy_enabled(disk->queue, pol)) continue; /* alloc per-policy data and attach it to blkg */ - pd = pol->pd_alloc_fn(gfp_mask, q, blkcg); + pd = pol->pd_alloc_fn(disk, blkcg, gfp_mask); if (!pd) - goto err_free; - + goto out_free_pds; blkg->pd[i] = pd; pd->blkg = blkg; pd->plid = i; + pd->online = false; } return blkg; -err_free: - blkg_free(blkg); - return NULL; -} - -struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg, - struct request_queue *q, bool update_hint) -{ - struct blkcg_gq *blkg; - - /* - * Hint didn't match. Look up from the radix tree. 
Note that the - * hint can only be updated under queue_lock as otherwise @blkg - * could have already been removed from blkg_tree. The caller is - * responsible for grabbing queue_lock if @update_hint. - */ - blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); - if (blkg && blkg->q == q) { - if (update_hint) { - lockdep_assert_held(&q->queue_lock); - rcu_assign_pointer(blkcg->blkg_hint, blkg); - } - return blkg; - } - +out_free_pds: + while (--i >= 0) + if (blkg->pd[i]) + blkcg_policy[i]->pd_free_fn(blkg->pd[i]); + blk_put_queue(disk->queue); +out_free_iostat: + free_percpu(blkg->iostat_cpu); +out_exit_refcnt: + percpu_ref_exit(&blkg->refcnt); +out_free_blkg: + kfree(blkg); return NULL; } -EXPORT_SYMBOL_GPL(blkg_lookup_slowpath); /* * If @new_blkg is %NULL, this function tries to allocate a new one as * necessary using %GFP_NOWAIT. @new_blkg is always consumed on return. */ -static struct blkcg_gq *blkg_create(struct blkcg *blkcg, - struct request_queue *q, +static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk, struct blkcg_gq *new_blkg) { struct blkcg_gq *blkg; - struct bdi_writeback_congested *wb_congested; int i, ret; - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); + lockdep_assert_held(&disk->queue->queue_lock); /* request_queue is dying, do not create/recreate a blkg */ - if (blk_queue_dying(q)) { + if (blk_queue_dying(disk->queue)) { ret = -ENODEV; goto err_free_blkg; } @@ -245,31 +392,22 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, goto err_free_blkg; } - wb_congested = wb_congested_get_create(q->backing_dev_info, - blkcg->css.id, - GFP_NOWAIT | __GFP_NOWARN); - if (!wb_congested) { - ret = -ENOMEM; - goto err_put_css; - } - /* allocate */ if (!new_blkg) { - new_blkg = blkg_alloc(blkcg, q, GFP_NOWAIT | __GFP_NOWARN); + new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto err_put_congested; + goto err_put_css; } } blkg = new_blkg; - 
blkg->wb_congested = wb_congested; /* link parent */ if (blkcg_parent(blkcg)) { - blkg->parent = __blkg_lookup(blkcg_parent(blkcg), q, false); + blkg->parent = blkg_lookup(blkcg_parent(blkcg), disk->queue); if (WARN_ON_ONCE(!blkg->parent)) { ret = -ENODEV; - goto err_put_congested; + goto err_put_css; } blkg_get(blkg->parent); } @@ -284,16 +422,19 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, /* insert */ spin_lock(&blkcg->lock); - ret = radix_tree_insert(&blkcg->blkg_tree, q->id, blkg); + ret = radix_tree_insert(&blkcg->blkg_tree, disk->queue->id, blkg); if (likely(!ret)) { hlist_add_head_rcu(&blkg->blkcg_node, &blkcg->blkg_list); - list_add(&blkg->q_node, &q->blkg_list); + list_add(&blkg->q_node, &disk->queue->blkg_list); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_online_fn) - pol->pd_online_fn(blkg->pd[i]); + if (blkg->pd[i]) { + if (pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[i]); + blkg->pd[i]->online = true; + } } } blkg->online = true; @@ -306,40 +447,49 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, blkg_put(blkg); return ERR_PTR(ret); -err_put_congested: - wb_congested_put(wb_congested); err_put_css: css_put(&blkcg->css); err_free_blkg: - blkg_free(new_blkg); + if (new_blkg) + blkg_free(new_blkg); return ERR_PTR(ret); } /** - * __blkg_lookup_create - lookup blkg, try to create one if not there + * blkg_lookup_create - lookup blkg, try to create one if not there * @blkcg: blkcg of interest - * @q: request_queue of interest + * @disk: gendisk of interest * - * Lookup blkg for the @blkcg - @q pair. If it doesn't exist, try to + * Lookup blkg for the @blkcg - @disk pair. If it doesn't exist, try to * create one. blkg creation is performed recursively from blkcg_root such * that all non-root blkg's have access to the parent blkg. This function - * should be called under RCU read lock and @q->queue_lock. 
+ * should be called under RCU read lock and takes @disk->queue->queue_lock. * * Returns the blkg or the closest blkg if blkg_create() fails as it walks * down from root. */ -struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) +static struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, + struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; + unsigned long flags; WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - blkg = __blkg_lookup(blkcg, q, true); + blkg = blkg_lookup(blkcg, q); if (blkg) return blkg; + spin_lock_irqsave(&q->queue_lock, flags); + blkg = blkg_lookup(blkcg, q); + if (blkg) { + if (blkcg != &blkcg_root && + blkg != rcu_dereference(blkcg->blkg_hint)) + rcu_assign_pointer(blkcg->blkg_hint, blkg); + goto found; + } + /* * Create blkgs walking down from blkcg_root to @blkcg, so that all * non-root blkgs have access to their parents. Returns the closest @@ -351,7 +501,7 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, struct blkcg_gq *ret_blkg = q->root_blkg; while (parent) { - blkg = __blkg_lookup(parent, q, false); + blkg = blkg_lookup(parent, q); if (blkg) { /* remember closest blkg */ ret_blkg = blkg; @@ -361,35 +511,17 @@ struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg, parent = blkcg_parent(parent); } - blkg = blkg_create(pos, q, NULL); - if (IS_ERR(blkg)) - return ret_blkg; + blkg = blkg_create(pos, disk, NULL); + if (IS_ERR(blkg)) { + blkg = ret_blkg; + break; + } if (pos == blkcg) - return blkg; - } -} - -/** - * blkg_lookup_create - find or create a blkg - * @blkcg: target block cgroup - * @q: target request_queue - * - * This looks up or creates the blkg representing the unique pair - * of the blkcg and the request_queue. 
- */ -struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg, - struct request_queue *q) -{ - struct blkcg_gq *blkg = blkg_lookup(blkcg, q); - - if (unlikely(!blkg)) { - unsigned long flags; - - spin_lock_irqsave(&q->queue_lock, flags); - blkg = __blkg_lookup_create(blkcg, q); - spin_unlock_irqrestore(&q->queue_lock, flags); + break; } +found: + spin_unlock_irqrestore(&q->queue_lock, flags); return blkg; } @@ -401,21 +533,28 @@ static void blkg_destroy(struct blkcg_gq *blkg) lockdep_assert_held(&blkg->q->queue_lock); lockdep_assert_held(&blkcg->lock); - /* Something wrong if we are trying to remove same group twice */ - WARN_ON_ONCE(list_empty(&blkg->q_node)); - WARN_ON_ONCE(hlist_unhashed(&blkg->blkcg_node)); + /* + * blkg stays on the queue list until blkg_free_workfn(), see details in + * blkg_free_workfn(), hence this function can be called from + * blkcg_destroy_blkgs() first and again from blkg_destroy_all() before + * blkg_free_workfn(). + */ + if (hlist_unhashed(&blkg->blkcg_node)) + return; for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; - if (blkg->pd[i] && pol->pd_offline_fn) - pol->pd_offline_fn(blkg->pd[i]); + if (blkg->pd[i] && blkg->pd[i]->online) { + blkg->pd[i]->online = false; + if (pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[i]); + } } blkg->online = false; radix_tree_delete(&blkcg->blkg_tree, blkg->q->id); - list_del_init(&blkg->q_node); hlist_del_init_rcu(&blkg->blkcg_node); /* @@ -433,23 +572,47 @@ static void blkg_destroy(struct blkcg_gq *blkg) percpu_ref_kill(&blkg->refcnt); } -/** - * blkg_destroy_all - destroy all blkgs associated with a request_queue - * @q: request_queue of interest - * - * Destroy all blkgs associated with @q. 
- */ -static void blkg_destroy_all(struct request_queue *q) +static void blkg_destroy_all(struct gendisk *disk) { - struct blkcg_gq *blkg, *n; + struct request_queue *q = disk->queue; + struct blkcg_gq *blkg; + int count = BLKG_DESTROY_BATCH_SIZE; + int i; +restart: spin_lock_irq(&q->queue_lock); - list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) { + list_for_each_entry(blkg, &q->blkg_list, q_node) { struct blkcg *blkcg = blkg->blkcg; + if (hlist_unhashed(&blkg->blkcg_node)) + continue; + spin_lock(&blkcg->lock); blkg_destroy(blkg); spin_unlock(&blkcg->lock); + + /* + * in order to avoid holding the spin lock for too long, release + * it when a batch of blkgs are destroyed. + */ + if (!(--count)) { + count = BLKG_DESTROY_BATCH_SIZE; + spin_unlock_irq(&q->queue_lock); + cond_resched(); + goto restart; + } + } + + /* + * Mark policy deactivated since policy offline has been done, and + * the free is scheduled, so future blkcg_deactivate_policy() can + * be bypassed + */ + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (pol) + __clear_bit(pol->plid, q->blkcg_pols); } q->root_blkg = NULL; @@ -476,8 +639,13 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, struct blkg_iostat_set *bis = per_cpu_ptr(blkg->iostat_cpu, cpu); memset(bis, 0, sizeof(*bis)); + + /* Re-initialize the cleared blkg_iostat_set */ + u64_stats_init(&bis->sync); + bis->blkg = blkg; } memset(&blkg->iostat, 0, sizeof(blkg->iostat)); + u64_stats_init(&blkg->iostat.sync); for (i = 0; i < BLKCG_MAX_POLS; i++) { struct blkcg_policy *pol = blkcg_policy[i]; @@ -494,10 +662,9 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css, const char *blkg_dev_name(struct blkcg_gq *blkg) { - /* some drivers (floppy) instantiate a queue w/o disk registered */ - if (blkg->q->backing_dev_info->dev) - return bdi_dev_name(blkg->q->backing_dev_info); - return NULL; + if (!blkg->q->disk) + return NULL; + return bdi_dev_name(blkg->q->disk->bdi); } /** 
@@ -547,7 +714,7 @@ EXPORT_SYMBOL_GPL(blkcg_print_blkgs); * @pd: policy private data of interest * @v: value to print * - * Print @v to @sf for the device assocaited with @pd. + * Print @v to @sf for the device associated with @pd. */ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) { @@ -561,93 +728,119 @@ u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v) } EXPORT_SYMBOL_GPL(__blkg_prfill_u64); -/* Performs queue bypass and policy enabled checks then looks up blkg. */ -static struct blkcg_gq *blkg_lookup_check(struct blkcg *blkcg, - const struct blkcg_policy *pol, - struct request_queue *q) +/** + * blkg_conf_init - initialize a blkg_conf_ctx + * @ctx: blkg_conf_ctx to initialize + * @input: input string + * + * Initialize @ctx which can be used to parse blkg config input string @input. + * Once initialized, @ctx can be used with blkg_conf_open_bdev() and + * blkg_conf_prep(), and must be cleaned up with blkg_conf_exit(). + */ +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input) { - WARN_ON_ONCE(!rcu_read_lock_held()); - lockdep_assert_held(&q->queue_lock); - - if (!blkcg_policy_enabled(q, pol)) - return ERR_PTR(-EOPNOTSUPP); - return __blkg_lookup(blkcg, q, true /* update_hint */); + *ctx = (struct blkg_conf_ctx){ .input = input }; } +EXPORT_SYMBOL_GPL(blkg_conf_init); /** - * blkg_conf_prep - parse and prepare for per-blkg config update - * @inputp: input string pointer + * blkg_conf_open_bdev - parse and open bdev for per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Parse the device node prefix part, MAJ:MIN, of per-blkg config update - * from @input and get and return the matching gendisk. *@inputp is - * updated to point past the device node prefix. Returns an ERR_PTR() - * value on error. + * Parse the device node prefix part, MAJ:MIN, of per-blkg config update from + * @ctx->input and get and store the matching bdev in @ctx->bdev. 
@ctx->body is + * set to point past the device node prefix. * - * Use this function iff blkg_conf_prep() can't be used for some reason. + * This function may be called multiple times on @ctx and the extra calls become + * NOOPs. blkg_conf_prep() implicitly calls this function. Use this function + * explicitly if bdev access is needed without resolving the blkcg / policy part + * of @ctx->input. Returns -errno on error. */ -struct gendisk *blkcg_conf_get_disk(char **inputp) +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx) { - char *input = *inputp; + char *input = ctx->input; unsigned int major, minor; - struct gendisk *disk; - int key_len, part; + struct block_device *bdev; + int key_len; + + if (ctx->bdev) + return 0; if (sscanf(input, "%u:%u%n", &major, &minor, &key_len) != 2) - return ERR_PTR(-EINVAL); + return -EINVAL; input += key_len; if (!isspace(*input)) - return ERR_PTR(-EINVAL); + return -EINVAL; input = skip_spaces(input); - disk = get_gendisk(MKDEV(major, minor), &part); - if (!disk) - return ERR_PTR(-ENODEV); - if (part) { - put_disk_and_module(disk); - return ERR_PTR(-ENODEV); + bdev = blkdev_get_no_open(MKDEV(major, minor)); + if (!bdev) + return -ENODEV; + if (bdev_is_partition(bdev)) { + blkdev_put_no_open(bdev); + return -ENODEV; + } + + mutex_lock(&bdev->bd_queue->rq_qos_mutex); + if (!disk_live(bdev->bd_disk)) { + blkdev_put_no_open(bdev); + mutex_unlock(&bdev->bd_queue->rq_qos_mutex); + return -ENODEV; } - *inputp = input; - return disk; + ctx->body = input; + ctx->bdev = bdev; + return 0; } /** * blkg_conf_prep - parse and prepare for per-blkg config update * @blkcg: target block cgroup * @pol: target policy - * @input: input string - * @ctx: blkg_conf_ctx to be filled + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() + * + * Parse per-blkg config update from @ctx->input and initialize @ctx + * accordingly. 
On success, @ctx->body points to the part of @ctx->input + * following MAJ:MIN, @ctx->bdev points to the target block device and + * @ctx->blkg to the blkg being configured. * - * Parse per-blkg config update from @input and initialize @ctx with the - * result. @ctx->blkg points to the blkg to be updated and @ctx->body the - * part of @input following MAJ:MIN. This function returns with RCU read - * lock and queue lock held and must be paired with blkg_conf_finish(). + * blkg_conf_open_bdev() may be called on @ctx beforehand. On success, this + * function returns with queue lock held and must be followed by + * blkg_conf_exit(). */ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, - char *input, struct blkg_conf_ctx *ctx) - __acquires(rcu) __acquires(&disk->queue->queue_lock) + struct blkg_conf_ctx *ctx) + __acquires(&bdev->bd_queue->queue_lock) { struct gendisk *disk; struct request_queue *q; struct blkcg_gq *blkg; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + ret = blkg_conf_open_bdev(ctx); + if (ret) + return ret; + disk = ctx->bdev->bd_disk; q = disk->queue; - rcu_read_lock(); + /* + * blkcg_deactivate_policy() requires queue to be frozen, we can grab + * q_usage_counter to prevent concurrent with blkcg_deactivate_policy(). + */ + ret = blk_queue_enter(q, 0); + if (ret) + goto fail; + spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(blkcg, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); + if (!blkcg_policy_enabled(q, pol)) { + ret = -EOPNOTSUPP; goto fail_unlock; } + blkg = blkg_lookup(blkcg, q); if (blkg) goto success; @@ -661,54 +854,62 @@ int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, struct blkcg_gq *new_blkg; parent = blkcg_parent(blkcg); - while (parent && !__blkg_lookup(parent, q, false)) { + while (parent && !blkg_lookup(parent, q)) { pos = parent; parent = blkcg_parent(parent); } /* Drop locks to do new blkg allocation with GFP_KERNEL. 
*/ spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); - new_blkg = blkg_alloc(pos, q, GFP_KERNEL); + new_blkg = blkg_alloc(pos, disk, GFP_KERNEL); if (unlikely(!new_blkg)) { ret = -ENOMEM; - goto fail; + goto fail_exit_queue; + } + + if (radix_tree_preload(GFP_KERNEL)) { + blkg_free(new_blkg); + ret = -ENOMEM; + goto fail_exit_queue; } - rcu_read_lock(); spin_lock_irq(&q->queue_lock); - blkg = blkg_lookup_check(pos, pol, q); - if (IS_ERR(blkg)) { - ret = PTR_ERR(blkg); - goto fail_unlock; + if (!blkcg_policy_enabled(q, pol)) { + blkg_free(new_blkg); + ret = -EOPNOTSUPP; + goto fail_preloaded; } + blkg = blkg_lookup(pos, q); if (blkg) { blkg_free(new_blkg); } else { - blkg = blkg_create(pos, q, new_blkg); + blkg = blkg_create(pos, disk, new_blkg); if (IS_ERR(blkg)) { ret = PTR_ERR(blkg); - goto fail_unlock; + goto fail_preloaded; } } + radix_tree_preload_end(); + if (pos == blkcg) goto success; } success: - ctx->disk = disk; + blk_queue_exit(q); ctx->blkg = blkg; - ctx->body = input; return 0; +fail_preloaded: + radix_tree_preload_end(); fail_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); +fail_exit_queue: + blk_queue_exit(q); fail: - put_disk_and_module(disk); /* * If queue was bypassing, we should retry. Do so after a * short msleep(). It isn't strictly necessary but queue @@ -724,108 +925,251 @@ fail: EXPORT_SYMBOL_GPL(blkg_conf_prep); /** - * blkg_conf_finish - finish up per-blkg config update - * @ctx: blkg_conf_ctx intiailized by blkg_conf_prep() + * blkg_conf_exit - clean up per-blkg config update + * @ctx: blkg_conf_ctx initialized with blkg_conf_init() * - * Finish up after per-blkg config update. This function must be paired - * with blkg_conf_prep(). + * Clean up after per-blkg config update. This function must be called on all + * blkg_conf_ctx's initialized with blkg_conf_init(). 
*/ -void blkg_conf_finish(struct blkg_conf_ctx *ctx) - __releases(&ctx->disk->queue->queue_lock) __releases(rcu) +void blkg_conf_exit(struct blkg_conf_ctx *ctx) + __releases(&ctx->bdev->bd_queue->queue_lock) + __releases(&ctx->bdev->bd_queue->rq_qos_mutex) { - spin_unlock_irq(&ctx->disk->queue->queue_lock); - rcu_read_unlock(); - put_disk_and_module(ctx->disk); + if (ctx->blkg) { + spin_unlock_irq(&bdev_get_queue(ctx->bdev)->queue_lock); + ctx->blkg = NULL; + } + + if (ctx->bdev) { + mutex_unlock(&ctx->bdev->bd_queue->rq_qos_mutex); + blkdev_put_no_open(ctx->bdev); + ctx->body = NULL; + ctx->bdev = NULL; + } } -EXPORT_SYMBOL_GPL(blkg_conf_finish); +EXPORT_SYMBOL_GPL(blkg_conf_exit); -static int blkcg_print_stat(struct seq_file *sf, void *v) +static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct blkcg_gq *blkg; + int i; - cgroup_rstat_flush(blkcg->css.cgroup); - rcu_read_lock(); + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] = src->bytes[i]; + dst->ios[i] = src->ios[i]; + } +} - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkg_iostat_set *bis = &blkg->iostat; - const char *dname; - char *buf; - u64 rbytes, wbytes, rios, wios, dbytes, dios; - size_t size = seq_get_buf(sf, &buf), off = 0; - int i; - bool has_stats = false; - unsigned seq; +static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; - spin_lock_irq(&blkg->q->queue_lock); + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] += src->bytes[i]; + dst->ios[i] += src->ios[i]; + } +} - if (!blkg->online) - goto skip; +static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +{ + int i; - dname = blkg_dev_name(blkg); - if (!dname) - goto skip; + for (i = 0; i < BLKG_IOSTAT_NR; i++) { + dst->bytes[i] -= src->bytes[i]; + dst->ios[i] -= src->ios[i]; + } +} - /* - * Hooray string manipulation, count is the size written NOT - * INCLUDING 
THE \0, so size is now count+1 less than what we - * had before, but we want to start writing the next bit from - * the \0 so we only add count to buf. - */ - off += scnprintf(buf+off, size-off, "%s ", dname); +static void blkcg_iostat_update(struct blkcg_gq *blkg, struct blkg_iostat *cur, + struct blkg_iostat *last) +{ + struct blkg_iostat delta; + unsigned long flags; + + /* propagate percpu delta to global */ + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&delta, cur); + blkg_iostat_sub(&delta, last); + blkg_iostat_add(&blkg->iostat.cur, &delta); + blkg_iostat_add(last, &delta); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); +} +static void __blkcg_rstat_flush(struct blkcg *blkcg, int cpu) +{ + struct llist_head *lhead = per_cpu_ptr(blkcg->lhead, cpu); + struct llist_node *lnode; + struct blkg_iostat_set *bisc, *next_bisc; + unsigned long flags; + + rcu_read_lock(); + + lnode = llist_del_all(lhead); + if (!lnode) + goto out; + + /* + * For covering concurrent parent blkg update from blkg_release(). + * + * When flushing from cgroup, cgroup_rstat_lock is always held, so + * this lock won't cause contention most of time. + */ + raw_spin_lock_irqsave(&blkg_stat_lock, flags); + + /* + * Iterate only the iostat_cpu's queued in the lockless list. 
+ */ + llist_for_each_entry_safe(bisc, next_bisc, lnode, lnode) { + struct blkcg_gq *blkg = bisc->blkg; + struct blkcg_gq *parent = blkg->parent; + struct blkg_iostat cur; + unsigned int seq; + + WRITE_ONCE(bisc->lqueued, false); + + /* fetch the current per-cpu values */ do { - seq = u64_stats_fetch_begin(&bis->sync); - - rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; - wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; - dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; - rios = bis->cur.ios[BLKG_IOSTAT_READ]; - wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; - dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; - } while (u64_stats_fetch_retry(&bis->sync, seq)); - - if (rbytes || wbytes || rios || wios) { - has_stats = true; - off += scnprintf(buf+off, size-off, - "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", - rbytes, wbytes, rios, wios, - dbytes, dios); - } + seq = u64_stats_fetch_begin(&bisc->sync); + blkg_iostat_set(&cur, &bisc->cur); + } while (u64_stats_fetch_retry(&bisc->sync, seq)); - if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { - has_stats = true; - off += scnprintf(buf+off, size-off, - " use_delay=%d delay_nsec=%llu", - atomic_read(&blkg->use_delay), - (unsigned long long)atomic64_read(&blkg->delay_nsec)); - } + blkcg_iostat_update(blkg, &cur, &bisc->last); - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - size_t written; + /* propagate global delta to parent (unless that's root) */ + if (parent && parent->parent) + blkcg_iostat_update(parent, &blkg->iostat.cur, + &blkg->iostat.last); + } + raw_spin_unlock_irqrestore(&blkg_stat_lock, flags); +out: + rcu_read_unlock(); +} + +static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) +{ + /* Root-level stats are sourced from system-wide IO stats */ + if (cgroup_parent(css->cgroup)) + __blkcg_rstat_flush(css_to_blkcg(css), cpu); +} - if (!blkg->pd[i] || !pol->pd_stat_fn) - continue; +/* + * We source root cgroup stats from the system-wide stats to 
avoid + * tracking the same information twice and incurring overhead when no + * cgroups are defined. For that reason, cgroup_rstat_flush in + * blkcg_print_stat does not actually fill out the iostat in the root + * cgroup's blkcg_gq. + * + * However, we would like to re-use the printing code between the root and + * non-root cgroups to the extent possible. For that reason, we simulate + * flushing the root cgroup's stats by explicitly filling in the iostat + * with disk level statistics. + */ +static void blkcg_fill_root_iostats(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct block_device *bdev = dev_to_bdev(dev); + struct blkcg_gq *blkg = bdev->bd_disk->queue->root_blkg; + struct blkg_iostat tmp; + int cpu; + unsigned long flags; - written = pol->pd_stat_fn(blkg->pd[i], buf+off, size-off); - if (written) - has_stats = true; - off += written; + memset(&tmp, 0, sizeof(tmp)); + for_each_possible_cpu(cpu) { + struct disk_stats *cpu_dkstats; + + cpu_dkstats = per_cpu_ptr(bdev->bd_stats, cpu); + tmp.ios[BLKG_IOSTAT_READ] += + cpu_dkstats->ios[STAT_READ]; + tmp.ios[BLKG_IOSTAT_WRITE] += + cpu_dkstats->ios[STAT_WRITE]; + tmp.ios[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->ios[STAT_DISCARD]; + // convert sectors to bytes + tmp.bytes[BLKG_IOSTAT_READ] += + cpu_dkstats->sectors[STAT_READ] << 9; + tmp.bytes[BLKG_IOSTAT_WRITE] += + cpu_dkstats->sectors[STAT_WRITE] << 9; + tmp.bytes[BLKG_IOSTAT_DISCARD] += + cpu_dkstats->sectors[STAT_DISCARD] << 9; } - if (has_stats) { - if (off < size - 1) { - off += scnprintf(buf+off, size-off, "\n"); - seq_commit(sf, off); - } else { - seq_commit(sf, -1); - } - } - skip: - spin_unlock_irq(&blkg->q->queue_lock); + flags = u64_stats_update_begin_irqsave(&blkg->iostat.sync); + blkg_iostat_set(&blkg->iostat.cur, &tmp); + u64_stats_update_end_irqrestore(&blkg->iostat.sync, flags); } +} + +static void 
blkcg_print_one_stat(struct blkcg_gq *blkg, struct seq_file *s) +{ + struct blkg_iostat_set *bis = &blkg->iostat; + u64 rbytes, wbytes, rios, wios, dbytes, dios; + const char *dname; + unsigned seq; + int i; + + if (!blkg->online) + return; + + dname = blkg_dev_name(blkg); + if (!dname) + return; + + seq_printf(s, "%s ", dname); + do { + seq = u64_stats_fetch_begin(&bis->sync); + + rbytes = bis->cur.bytes[BLKG_IOSTAT_READ]; + wbytes = bis->cur.bytes[BLKG_IOSTAT_WRITE]; + dbytes = bis->cur.bytes[BLKG_IOSTAT_DISCARD]; + rios = bis->cur.ios[BLKG_IOSTAT_READ]; + wios = bis->cur.ios[BLKG_IOSTAT_WRITE]; + dios = bis->cur.ios[BLKG_IOSTAT_DISCARD]; + } while (u64_stats_fetch_retry(&bis->sync, seq)); + + if (rbytes || wbytes || rios || wios) { + seq_printf(s, "rbytes=%llu wbytes=%llu rios=%llu wios=%llu dbytes=%llu dios=%llu", + rbytes, wbytes, rios, wios, + dbytes, dios); + } + + if (blkcg_debug_stats && atomic_read(&blkg->use_delay)) { + seq_printf(s, " use_delay=%d delay_nsec=%llu", + atomic_read(&blkg->use_delay), + atomic64_read(&blkg->delay_nsec)); + } + + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + + if (!blkg->pd[i] || !pol->pd_stat_fn) + continue; + + pol->pd_stat_fn(blkg->pd[i], s); + } + + seq_puts(s, "\n"); +} + +static int blkcg_print_stat(struct seq_file *sf, void *v) +{ + struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); + struct blkcg_gq *blkg; + + if (!seq_css(sf)->parent) + blkcg_fill_root_iostats(); + else + cgroup_rstat_flush(blkcg->css.cgroup); + + rcu_read_lock(); + hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { + spin_lock_irq(&blkg->q->queue_lock); + blkcg_print_one_stat(blkg, sf); + spin_unlock_irq(&blkg->q->queue_lock); + } rcu_read_unlock(); return 0; } @@ -833,7 +1177,6 @@ static int blkcg_print_stat(struct seq_file *sf, void *v) static struct cftype blkcg_files[] = { { .name = "stat", - .flags = CFTYPE_NOT_ON_ROOT, .seq_show = blkcg_print_stat, }, { } /* terminate */ @@ -847,6 +1190,13 
@@ static struct cftype blkcg_legacy_files[] = { { } /* terminate */ }; +#ifdef CONFIG_CGROUP_WRITEBACK +struct list_head *blkcg_get_cgwb_list(struct cgroup_subsys_state *css) +{ + return &css_to_blkcg(css)->cgwb_list; +} +#endif + /* * blkcg destruction is a three-stage process. * @@ -869,25 +1219,6 @@ static struct cftype blkcg_legacy_files[] = { */ /** - * blkcg_css_offline - cgroup css_offline callback - * @css: css of interest - * - * This function is called when @css is about to go away. Here the cgwbs are - * offlined first and only once writeback associated with the blkcg has - * finished do we start step 2 (see above). - */ -static void blkcg_css_offline(struct cgroup_subsys_state *css) -{ - struct blkcg *blkcg = css_to_blkcg(css); - - /* this prevents anyone from attaching or migrating to this blkcg */ - wb_blkcg_offline(blkcg); - - /* put the base online pin allowing step 2 to be triggered */ - blkcg_unpin_online(blkcg); -} - -/** * blkcg_destroy_blkgs - responsible for shooting down blkgs * @blkcg: blkcg of interest * @@ -898,8 +1229,10 @@ static void blkcg_css_offline(struct cgroup_subsys_state *css) * * This is the blkcg counterpart of ioc_release_fn(). */ -void blkcg_destroy_blkgs(struct blkcg *blkcg) +static void blkcg_destroy_blkgs(struct blkcg *blkcg) { + might_sleep(); + spin_lock_irq(&blkcg->lock); while (!hlist_empty(&blkcg->blkg_list)) { @@ -907,19 +1240,76 @@ void blkcg_destroy_blkgs(struct blkcg *blkcg) struct blkcg_gq, blkcg_node); struct request_queue *q = blkg->q; - if (spin_trylock(&q->queue_lock)) { - blkg_destroy(blkg); - spin_unlock(&q->queue_lock); - } else { + if (need_resched() || !spin_trylock(&q->queue_lock)) { + /* + * Given that the system can accumulate a huge number + * of blkgs in pathological cases, check to see if we + * need to rescheduling to avoid softlockup. 
+ */ spin_unlock_irq(&blkcg->lock); - cpu_relax(); + cond_resched(); spin_lock_irq(&blkcg->lock); + continue; } + + blkg_destroy(blkg); + spin_unlock(&q->queue_lock); } spin_unlock_irq(&blkcg->lock); } +/** + * blkcg_pin_online - pin online state + * @blkcg_css: blkcg of interest + * + * While pinned, a blkcg is kept online. This is primarily used to + * impedance-match blkg and cgwb lifetimes so that blkg doesn't go offline + * while an associated cgwb is still active. + */ +void blkcg_pin_online(struct cgroup_subsys_state *blkcg_css) +{ + refcount_inc(&css_to_blkcg(blkcg_css)->online_pin); +} + +/** + * blkcg_unpin_online - unpin online state + * @blkcg_css: blkcg of interest + * + * This is primarily used to impedance-match blkg and cgwb lifetimes so + * that blkg doesn't go offline while an associated cgwb is still active. + * When this count goes to zero, all active cgwbs have finished so the + * blkcg can continue destruction by calling blkcg_destroy_blkgs(). + */ +void blkcg_unpin_online(struct cgroup_subsys_state *blkcg_css) +{ + struct blkcg *blkcg = css_to_blkcg(blkcg_css); + + do { + if (!refcount_dec_and_test(&blkcg->online_pin)) + break; + blkcg_destroy_blkgs(blkcg); + blkcg = blkcg_parent(blkcg); + } while (blkcg); +} + +/** + * blkcg_css_offline - cgroup css_offline callback + * @css: css of interest + * + * This function is called when @css is about to go away. Here the cgwbs are + * offlined first and only once writeback associated with the blkcg has + * finished do we start step 2 (see above). 
+ */ +static void blkcg_css_offline(struct cgroup_subsys_state *css) +{ + /* this prevents anyone from attaching or migrating to this blkcg */ + wb_blkcg_offline(css); + + /* put the base online pin allowing step 2 to be triggered */ + blkcg_unpin_online(css); +} + static void blkcg_css_free(struct cgroup_subsys_state *css) { struct blkcg *blkcg = css_to_blkcg(css); @@ -935,6 +1325,7 @@ static void blkcg_css_free(struct cgroup_subsys_state *css) mutex_unlock(&blkcg_pol_mutex); + free_percpu(blkcg->lhead); kfree(blkcg); } @@ -942,7 +1333,6 @@ static struct cgroup_subsys_state * blkcg_css_alloc(struct cgroup_subsys_state *parent_css) { struct blkcg *blkcg; - struct cgroup_subsys_state *ret; int i; mutex_lock(&blkcg_pol_mutex); @@ -951,12 +1341,13 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) blkcg = &blkcg_root; } else { blkcg = kzalloc(sizeof(*blkcg), GFP_KERNEL); - if (!blkcg) { - ret = ERR_PTR(-ENOMEM); + if (!blkcg) goto unlock; - } } + if (init_blkcg_llists(blkcg)) + goto free_blkcg; + for (i = 0; i < BLKCG_MAX_POLS ; i++) { struct blkcg_policy *pol = blkcg_policy[i]; struct blkcg_policy_data *cpd; @@ -971,15 +1362,12 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css) continue; cpd = pol->cpd_alloc_fn(GFP_KERNEL); - if (!cpd) { - ret = ERR_PTR(-ENOMEM); + if (!cpd) goto free_pd_blkcg; - } + blkcg->cpd[i] = cpd; cpd->blkcg = blkcg; cpd->plid = i; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } spin_lock_init(&blkcg->lock); @@ -998,18 +1386,18 @@ free_pd_blkcg: for (i--; i >= 0; i--) if (blkcg->cpd[i]) blkcg_policy[i]->cpd_free_fn(blkcg->cpd[i]); - + free_percpu(blkcg->lhead); +free_blkcg: if (blkcg != &blkcg_root) kfree(blkcg); unlock: mutex_unlock(&blkcg_pol_mutex); - return ret; + return ERR_PTR(-ENOMEM); } static int blkcg_css_online(struct cgroup_subsys_state *css) { - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg *parent = blkcg_parent(blkcg); + struct blkcg *parent = blkcg_parent(css_to_blkcg(css)); /* * blkcg_pin_online() 
is used to delay blkcg offline so that blkgs @@ -1017,199 +1405,71 @@ static int blkcg_css_online(struct cgroup_subsys_state *css) * parent so that offline always happens towards the root. */ if (parent) - blkcg_pin_online(parent); + blkcg_pin_online(&parent->css); return 0; } -/** - * blkcg_init_queue - initialize blkcg part of request queue - * @q: request_queue to initialize - * - * Called from __blk_alloc_queue(). Responsible for initializing blkcg - * part of new request_queue @q. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int blkcg_init_queue(struct request_queue *q) +int blkcg_init_disk(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blkcg_gq *new_blkg, *blkg; bool preloaded; int ret; - new_blkg = blkg_alloc(&blkcg_root, q, GFP_KERNEL); + INIT_LIST_HEAD(&q->blkg_list); + mutex_init(&q->blkcg_mutex); + + new_blkg = blkg_alloc(&blkcg_root, disk, GFP_KERNEL); if (!new_blkg) return -ENOMEM; preloaded = !radix_tree_preload(GFP_KERNEL); /* Make sure the root blkg exists. */ - rcu_read_lock(); + /* spin_lock_irq can serve as RCU read-side critical section. 
*/ spin_lock_irq(&q->queue_lock); - blkg = blkg_create(&blkcg_root, q, new_blkg); + blkg = blkg_create(&blkcg_root, disk, new_blkg); if (IS_ERR(blkg)) goto err_unlock; q->root_blkg = blkg; spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); - ret = blk_iolatency_init(q); + ret = blk_ioprio_init(disk); if (ret) goto err_destroy_all; - ret = blk_throtl_init(q); + ret = blk_throtl_init(disk); if (ret) - goto err_destroy_all; + goto err_ioprio_exit; + return 0; +err_ioprio_exit: + blk_ioprio_exit(disk); err_destroy_all: - blkg_destroy_all(q); + blkg_destroy_all(disk); return ret; err_unlock: spin_unlock_irq(&q->queue_lock); - rcu_read_unlock(); if (preloaded) radix_tree_preload_end(); return PTR_ERR(blkg); } -/** - * blkcg_exit_queue - exit and release blkcg part of request_queue - * @q: request_queue being released - * - * Called from blk_exit_queue(). Responsible for exiting blkcg part. - */ -void blkcg_exit_queue(struct request_queue *q) -{ - blkg_destroy_all(q); - blk_throtl_exit(q); -} - -/* - * We cannot support shared io contexts, as we have no mean to support - * two tasks with the same ioc in two different groups without major rework - * of the main cic data structures. For now we allow a task to change - * its cgroup only if it's the only owner of its ioc. 
- */ -static int blkcg_can_attach(struct cgroup_taskset *tset) -{ - struct task_struct *task; - struct cgroup_subsys_state *dst_css; - struct io_context *ioc; - int ret = 0; - - /* task_lock() is needed to avoid races with exit_io_context() */ - cgroup_taskset_for_each(task, dst_css, tset) { - task_lock(task); - ioc = task->io_context; - if (ioc && atomic_read(&ioc->nr_tasks) > 1) - ret = -EINVAL; - task_unlock(task); - if (ret) - break; - } - return ret; -} - -static void blkg_iostat_set(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] = src->bytes[i]; - dst->ios[i] = src->ios[i]; - } -} - -static void blkg_iostat_add(struct blkg_iostat *dst, struct blkg_iostat *src) -{ - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] += src->bytes[i]; - dst->ios[i] += src->ios[i]; - } -} - -static void blkg_iostat_sub(struct blkg_iostat *dst, struct blkg_iostat *src) +void blkcg_exit_disk(struct gendisk *disk) { - int i; - - for (i = 0; i < BLKG_IOSTAT_NR; i++) { - dst->bytes[i] -= src->bytes[i]; - dst->ios[i] -= src->ios[i]; - } -} - -static void blkcg_rstat_flush(struct cgroup_subsys_state *css, int cpu) -{ - struct blkcg *blkcg = css_to_blkcg(css); - struct blkcg_gq *blkg; - - rcu_read_lock(); - - hlist_for_each_entry_rcu(blkg, &blkcg->blkg_list, blkcg_node) { - struct blkcg_gq *parent = blkg->parent; - struct blkg_iostat_set *bisc = per_cpu_ptr(blkg->iostat_cpu, cpu); - struct blkg_iostat cur, delta; - unsigned seq; - - /* fetch the current per-cpu values */ - do { - seq = u64_stats_fetch_begin(&bisc->sync); - blkg_iostat_set(&cur, &bisc->cur); - } while (u64_stats_fetch_retry(&bisc->sync, seq)); - - /* propagate percpu delta to global */ - u64_stats_update_begin(&blkg->iostat.sync); - blkg_iostat_set(&delta, &cur); - blkg_iostat_sub(&delta, &bisc->last); - blkg_iostat_add(&blkg->iostat.cur, &delta); - blkg_iostat_add(&bisc->last, &delta); - u64_stats_update_end(&blkg->iostat.sync); - - 
/* propagate global delta to parent */ - if (parent) { - u64_stats_update_begin(&parent->iostat.sync); - blkg_iostat_set(&delta, &blkg->iostat.cur); - blkg_iostat_sub(&delta, &blkg->iostat.last); - blkg_iostat_add(&parent->iostat.cur, &delta); - blkg_iostat_add(&blkg->iostat.last, &delta); - u64_stats_update_end(&parent->iostat.sync); - } - } - - rcu_read_unlock(); -} - -static void blkcg_bind(struct cgroup_subsys_state *root_css) -{ - int i; - - mutex_lock(&blkcg_pol_mutex); - - for (i = 0; i < BLKCG_MAX_POLS; i++) { - struct blkcg_policy *pol = blkcg_policy[i]; - struct blkcg *blkcg; - - if (!pol || !pol->cpd_bind_fn) - continue; - - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) - if (blkcg->cpd[pol->plid]) - pol->cpd_bind_fn(blkcg->cpd[pol->plid]); - } - mutex_unlock(&blkcg_pol_mutex); + blkg_destroy_all(disk); + blk_throtl_exit(disk); } static void blkcg_exit(struct task_struct *tsk) { - if (tsk->throttle_queue) - blk_put_queue(tsk->throttle_queue); - tsk->throttle_queue = NULL; + if (tsk->throttle_disk) + put_disk(tsk->throttle_disk); + tsk->throttle_disk = NULL; } struct cgroup_subsys io_cgrp_subsys = { @@ -1217,9 +1477,7 @@ struct cgroup_subsys io_cgrp_subsys = { .css_online = blkcg_css_online, .css_offline = blkcg_css_offline, .css_free = blkcg_css_free, - .can_attach = blkcg_can_attach, .css_rstat_flush = blkcg_rstat_flush, - .bind = blkcg_bind, .dfl_cftypes = blkcg_files, .legacy_cftypes = blkcg_legacy_files, .legacy_name = "blkio", @@ -1236,14 +1494,14 @@ struct cgroup_subsys io_cgrp_subsys = { EXPORT_SYMBOL_GPL(io_cgrp_subsys); /** - * blkcg_activate_policy - activate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_activate_policy - activate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to activate * - * Activate @pol on @q. Requires %GFP_KERNEL context. @q goes through + * Activate @pol on @disk. Requires %GFP_KERNEL context. 
@disk goes through * bypass mode to populate its blkgs with policy_data for @pol. * - * Activation happens with @q bypassed, so nobody would be accessing blkgs + * Activation happens with @disk bypassed, so nobody would be accessing blkgs * from IO path. Update of each blkg is protected by both queue and blkcg * locks so that holding either lock and testing blkcg_policy_enabled() is * always enough for dereferencing policy data. @@ -1251,9 +1509,9 @@ EXPORT_SYMBOL_GPL(io_cgrp_subsys); * The caller is responsible for synchronizing [de]activations and policy * [un]registerations. Returns 0 on success, -errno on failure. */ -int blkcg_activate_policy(struct request_queue *q, - const struct blkcg_policy *pol) +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkg_policy_data *pd_prealloc = NULL; struct blkcg_gq *blkg, *pinned_blkg = NULL; int ret; @@ -1266,7 +1524,7 @@ int blkcg_activate_policy(struct request_queue *q, retry: spin_lock_irq(&q->queue_lock); - /* blkg_list is pushed at the head, reverse walk to allocate parents first */ + /* blkg_list is pushed at the head, reverse walk to initialize parents first */ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { struct blkg_policy_data *pd; @@ -1278,8 +1536,8 @@ retry: pd = pd_prealloc; pd_prealloc = NULL; } else { - pd = pol->pd_alloc_fn(GFP_NOWAIT | __GFP_NOWARN, q, - blkg->blkcg); + pd = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_NOWAIT | __GFP_NOWARN); } if (!pd) { @@ -1296,23 +1554,29 @@ retry: if (pd_prealloc) pol->pd_free_fn(pd_prealloc); - pd_prealloc = pol->pd_alloc_fn(GFP_KERNEL, q, - blkg->blkcg); + pd_prealloc = pol->pd_alloc_fn(disk, blkg->blkcg, + GFP_KERNEL); if (pd_prealloc) goto retry; else goto enomem; } - blkg->pd[pol->plid] = pd; + spin_lock(&blkg->blkcg->lock); + pd->blkg = blkg; pd->plid = pol->plid; - } + blkg->pd[pol->plid] = pd; - /* all allocated, init in the same order */ - if (pol->pd_init_fn) - 
list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) - pol->pd_init_fn(blkg->pd[pol->plid]); + if (pol->pd_init_fn) + pol->pd_init_fn(pd); + + if (pol->pd_online_fn) + pol->pd_online_fn(pd); + pd->online = true; + + spin_unlock(&blkg->blkcg->lock); + } __set_bit(pol->plid, q->blkcg_pols); ret = 0; @@ -1328,13 +1592,22 @@ out: return ret; enomem: - /* alloc failed, nothing's initialized yet, free everything */ + /* alloc failed, take down everything */ spin_lock_irq(&q->queue_lock); list_for_each_entry(blkg, &q->blkg_list, q_node) { - if (blkg->pd[pol->plid]) { - pol->pd_free_fn(blkg->pd[pol->plid]); + struct blkcg *blkcg = blkg->blkcg; + struct blkg_policy_data *pd; + + spin_lock(&blkcg->lock); + pd = blkg->pd[pol->plid]; + if (pd) { + if (pd->online && pol->pd_offline_fn) + pol->pd_offline_fn(pd); + pd->online = false; + pol->pd_free_fn(pd); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); ret = -ENOMEM; @@ -1343,16 +1616,17 @@ enomem: EXPORT_SYMBOL_GPL(blkcg_activate_policy); /** - * blkcg_deactivate_policy - deactivate a blkcg policy on a request_queue - * @q: request_queue of interest + * blkcg_deactivate_policy - deactivate a blkcg policy on a gendisk + * @disk: gendisk of interest * @pol: blkcg policy to deactivate * - * Deactivate @pol on @q. Follows the same synchronization rules as + * Deactivate @pol on @disk. Follows the same synchronization rules as * blkcg_activate_policy(). 
*/ -void blkcg_deactivate_policy(struct request_queue *q, +void blkcg_deactivate_policy(struct gendisk *disk, const struct blkcg_policy *pol) { + struct request_queue *q = disk->queue; struct blkcg_gq *blkg; if (!blkcg_policy_enabled(q, pol)) @@ -1361,26 +1635,44 @@ void blkcg_deactivate_policy(struct request_queue *q, if (queue_is_mq(q)) blk_mq_freeze_queue(q); + mutex_lock(&q->blkcg_mutex); spin_lock_irq(&q->queue_lock); __clear_bit(pol->plid, q->blkcg_pols); list_for_each_entry(blkg, &q->blkg_list, q_node) { + struct blkcg *blkcg = blkg->blkcg; + + spin_lock(&blkcg->lock); if (blkg->pd[pol->plid]) { - if (pol->pd_offline_fn) + if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) pol->pd_offline_fn(blkg->pd[pol->plid]); pol->pd_free_fn(blkg->pd[pol->plid]); blkg->pd[pol->plid] = NULL; } + spin_unlock(&blkcg->lock); } spin_unlock_irq(&q->queue_lock); + mutex_unlock(&q->blkcg_mutex); if (queue_is_mq(q)) blk_mq_unfreeze_queue(q); } EXPORT_SYMBOL_GPL(blkcg_deactivate_policy); +static void blkcg_free_all_cpd(struct blkcg_policy *pol) +{ + struct blkcg *blkcg; + + list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { + if (blkcg->cpd[pol->plid]) { + pol->cpd_free_fn(blkcg->cpd[pol->plid]); + blkcg->cpd[pol->plid] = NULL; + } + } +} + /** * blkcg_policy_register - register a blkcg policy * @pol: blkcg policy to register @@ -1427,8 +1719,6 @@ int blkcg_policy_register(struct blkcg_policy *pol) blkcg->cpd[pol->plid] = cpd; cpd->blkcg = blkcg; cpd->plid = pol->plid; - if (pol->cpd_init_fn) - pol->cpd_init_fn(cpd); } } @@ -1445,14 +1735,9 @@ int blkcg_policy_register(struct blkcg_policy *pol) return 0; err_free_cpds: - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; err_unlock: mutex_unlock(&blkcg_pol_mutex); @@ -1469,8 +1754,6 @@ 
EXPORT_SYMBOL_GPL(blkcg_policy_register); */ void blkcg_policy_unregister(struct blkcg_policy *pol) { - struct blkcg *blkcg; - mutex_lock(&blkcg_pol_register_mutex); if (WARN_ON(blkcg_policy[pol->plid] != pol)) @@ -1485,14 +1768,9 @@ void blkcg_policy_unregister(struct blkcg_policy *pol) /* remove cpds and unregister */ mutex_lock(&blkcg_pol_mutex); - if (pol->cpd_free_fn) { - list_for_each_entry(blkcg, &all_blkcgs, all_blkcgs_node) { - if (blkcg->cpd[pol->plid]) { - pol->cpd_free_fn(blkcg->cpd[pol->plid]); - blkcg->cpd[pol->plid] = NULL; - } - } - } + if (pol->cpd_free_fn) + blkcg_free_all_cpd(pol); + blkcg_policy[pol->plid] = NULL; mutex_unlock(&blkcg_pol_mutex); @@ -1501,25 +1779,6 @@ out_unlock: } EXPORT_SYMBOL_GPL(blkcg_policy_unregister); -bool __blkcg_punt_bio_submit(struct bio *bio) -{ - struct blkcg_gq *blkg = bio->bi_blkg; - - /* consume the flag first */ - bio->bi_opf &= ~REQ_CGROUP_PUNT; - - /* never bounce for the root cgroup */ - if (!blkg->parent) - return false; - - spin_lock_bh(&blkg->async_bio_lock); - bio_list_add(&blkg->async_bios, bio); - spin_unlock_bh(&blkg->async_bio_lock); - - queue_work(blkcg_punt_bio_wq, &blkg->async_bio_work); - return true; -} - /* * Scale the accumulated delay based on how long it has been since we updated * the delay. We only call this when we are adding delay, in case it's been a @@ -1548,7 +1807,7 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) * everybody is happy with their IO latencies. 
*/ if (time_before64(old + NSEC_PER_SEC, now) && - atomic64_cmpxchg(&blkg->delay_start, old, now) == old) { + atomic64_try_cmpxchg(&blkg->delay_start, &old, now)) { u64 cur = atomic64_read(&blkg->delay_nsec); u64 sub = min_t(u64, blkg->last_delay, now - old); int cur_use = atomic_read(&blkg->use_delay); @@ -1586,16 +1845,24 @@ static void blkcg_scale_delay(struct blkcg_gq *blkg, u64 now) static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) { unsigned long pflags; + bool clamp; u64 now = ktime_to_ns(ktime_get()); u64 exp; u64 delay_nsec = 0; int tok; while (blkg->parent) { - if (atomic_read(&blkg->use_delay)) { + int use_delay = atomic_read(&blkg->use_delay); + + if (use_delay) { + u64 this_delay; + blkcg_scale_delay(blkg, now); - delay_nsec = max_t(u64, delay_nsec, - atomic64_read(&blkg->delay_nsec)); + this_delay = atomic64_read(&blkg->delay_nsec); + if (this_delay > delay_nsec) { + delay_nsec = this_delay; + clamp = use_delay > 0; + } } blkg = blkg->parent; } @@ -1607,10 +1874,13 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * Let's not sleep for all eternity if we've amassed a huge delay. * Swapping or metadata IO can accumulate 10's of seconds worth of * delay, and we want userspace to be able to do _something_ so cap the - * delays at 1 second. If there's 10's of seconds worth of delay then - * the tasks will be delayed for 1 second for every syscall. + * delays at 0.25s. If there's 10's of seconds worth of delay then the + * tasks will be delayed for 0.25 second for every syscall. If + * blkcg_set_delay() was used as indicated by negative use_delay, the + * caller is responsible for regulating the range. 
*/ - delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); + if (clamp) + delay_nsec = min_t(u64, delay_nsec, 250 * NSEC_PER_MSEC); if (use_memdelay) psi_memstall_enter(&pflags); @@ -1633,35 +1903,29 @@ static void blkcg_maybe_throttle_blkg(struct blkcg_gq *blkg, bool use_memdelay) * * This is only called if we've been marked with set_notify_resume(). Obviously * we can be set_notify_resume() for reasons other than blkcg throttling, so we - * check to see if current->throttle_queue is set and if not this doesn't do + * check to see if current->throttle_disk is set and if not this doesn't do * anything. This should only ever be called by the resume code, it's not meant * to be called by people willy-nilly as it will actually do the work to * throttle the task if it is setup for throttling. */ void blkcg_maybe_throttle_current(void) { - struct request_queue *q = current->throttle_queue; - struct cgroup_subsys_state *css; + struct gendisk *disk = current->throttle_disk; struct blkcg *blkcg; struct blkcg_gq *blkg; bool use_memdelay = current->use_memdelay; - if (!q) + if (!disk) return; - current->throttle_queue = NULL; + current->throttle_disk = NULL; current->use_memdelay = false; rcu_read_lock(); - css = kthread_blkcg(); - if (css) - blkcg = css_to_blkcg(css); - else - blkcg = css_to_blkcg(task_css(current, io_cgrp_id)); - + blkcg = css_to_blkcg(blkcg_css()); if (!blkcg) goto out; - blkg = blkg_lookup(blkcg, q); + blkg = blkg_lookup(blkcg, disk->queue); if (!blkg) goto out; if (!blkg_tryget(blkg)) @@ -1670,22 +1934,21 @@ void blkcg_maybe_throttle_current(void) blkcg_maybe_throttle_blkg(blkg, use_memdelay); blkg_put(blkg); - blk_put_queue(q); + put_disk(disk); return; out: rcu_read_unlock(); - blk_put_queue(q); } /** * blkcg_schedule_throttle - this task needs to check for throttling - * @q: the request queue IO was submitted on + * @disk: disk to throttle * @use_memdelay: do we charge this to memory delay for PSI * * This is called by the IO controller when we 
know there's delay accumulated * for the blkg for this task. We do not pass the blkg because there are places * we call this that may not have that information, the swapping code for - * instance will only have a request_queue at that point. This set's the + * instance will only have a block_device at that point. This set's the * notify_resume for the task to check and see if it requires throttling before * returning to user space. * @@ -1694,17 +1957,21 @@ out: * throttle once. If the task needs to be throttled again it'll need to be * re-set at the next time we see the task. */ -void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) +void blkcg_schedule_throttle(struct gendisk *disk, bool use_memdelay) { if (unlikely(current->flags & PF_KTHREAD)) return; - if (!blk_get_queue(q)) - return; + if (current->throttle_disk != disk) { + if (test_bit(GD_DEAD, &disk->state)) + return; + get_device(disk_to_dev(disk)); + + if (current->throttle_disk) + put_disk(current->throttle_disk); + current->throttle_disk = disk; + } - if (current->throttle_queue) - blk_put_queue(current->throttle_queue); - current->throttle_queue = q; if (use_memdelay) current->use_memdelay = use_memdelay; set_notify_resume(current); @@ -1727,16 +1994,172 @@ void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta) atomic64_add(delta, &blkg->delay_nsec); } -static int __init blkcg_init(void) +/** + * blkg_tryget_closest - try and get a blkg ref on the closet blkg + * @bio: target bio + * @css: target css + * + * As the failure mode here is to walk up the blkg tree, this ensure that the + * blkg->parent pointers are always valid. This returns the blkg that it ended + * up taking a reference on or %NULL if no reference was taken. 
+ */ +static inline struct blkcg_gq *blkg_tryget_closest(struct bio *bio, + struct cgroup_subsys_state *css) { - blkcg_punt_bio_wq = alloc_workqueue("blkcg_punt_bio", - WQ_MEM_RECLAIM | WQ_FREEZABLE | - WQ_UNBOUND | WQ_SYSFS, 0); - if (!blkcg_punt_bio_wq) - return -ENOMEM; - return 0; + struct blkcg_gq *blkg, *ret_blkg = NULL; + + rcu_read_lock(); + blkg = blkg_lookup_create(css_to_blkcg(css), bio->bi_bdev->bd_disk); + while (blkg) { + if (blkg_tryget(blkg)) { + ret_blkg = blkg; + break; + } + blkg = blkg->parent; + } + rcu_read_unlock(); + + return ret_blkg; +} + +/** + * bio_associate_blkg_from_css - associate a bio with a specified css + * @bio: target bio + * @css: target css + * + * Associate @bio with the blkg found by combining the css's blkg and the + * request_queue of the @bio. An association failure is handled by walking up + * the blkg tree. Therefore, the blkg associated can be anything between @blkg + * and q->root_blkg. This situation only happens when a cgroup is dying and + * then the remaining bios will spill to the closest alive blkg. + * + * A reference will be taken on the blkg and will be released when @bio is + * freed. + */ +void bio_associate_blkg_from_css(struct bio *bio, + struct cgroup_subsys_state *css) +{ + if (bio->bi_blkg) + blkg_put(bio->bi_blkg); + + if (css && css->parent) { + bio->bi_blkg = blkg_tryget_closest(bio, css); + } else { + blkg_get(bdev_get_queue(bio->bi_bdev)->root_blkg); + bio->bi_blkg = bdev_get_queue(bio->bi_bdev)->root_blkg; + } +} +EXPORT_SYMBOL_GPL(bio_associate_blkg_from_css); + +/** + * bio_associate_blkg - associate a bio with a blkg + * @bio: target bio + * + * Associate @bio with the blkg found from the bio's css and request_queue. + * If one is not found, bio_lookup_blkg() creates the blkg. If a blkg is + * already associated, the css is reused and association redone as the + * request_queue may have changed. 
+ */ +void bio_associate_blkg(struct bio *bio) +{ + struct cgroup_subsys_state *css; + + if (blk_op_is_passthrough(bio->bi_opf)) + return; + + rcu_read_lock(); + + if (bio->bi_blkg) + css = bio_blkcg_css(bio); + else + css = blkcg_css(); + + bio_associate_blkg_from_css(bio, css); + + rcu_read_unlock(); +} +EXPORT_SYMBOL_GPL(bio_associate_blkg); + +/** + * bio_clone_blkg_association - clone blkg association from src to dst bio + * @dst: destination bio + * @src: source bio + */ +void bio_clone_blkg_association(struct bio *dst, struct bio *src) +{ + if (src->bi_blkg) + bio_associate_blkg_from_css(dst, bio_blkcg_css(src)); +} +EXPORT_SYMBOL_GPL(bio_clone_blkg_association); + +static int blk_cgroup_io_type(struct bio *bio) +{ + if (op_is_discard(bio->bi_opf)) + return BLKG_IOSTAT_DISCARD; + if (op_is_write(bio->bi_opf)) + return BLKG_IOSTAT_WRITE; + return BLKG_IOSTAT_READ; +} + +void blk_cgroup_bio_start(struct bio *bio) +{ + struct blkcg *blkcg = bio->bi_blkg->blkcg; + int rwd = blk_cgroup_io_type(bio), cpu; + struct blkg_iostat_set *bis; + unsigned long flags; + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) + return; + + /* Root-level stats are sourced from system-wide IO stats */ + if (!cgroup_parent(blkcg->css.cgroup)) + return; + + cpu = get_cpu(); + bis = per_cpu_ptr(bio->bi_blkg->iostat_cpu, cpu); + flags = u64_stats_update_begin_irqsave(&bis->sync); + + /* + * If the bio is flagged with BIO_CGROUP_ACCT it means this is a split + * bio and we would have already accounted for the size of the bio. + */ + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + bis->cur.bytes[rwd] += bio->bi_iter.bi_size; + } + bis->cur.ios[rwd]++; + + /* + * If the iostat_cpu isn't in a lockless list, put it into the + * list to indicate that a stat update is pending. 
+ */ + if (!READ_ONCE(bis->lqueued)) { + struct llist_head *lhead = this_cpu_ptr(blkcg->lhead); + + llist_add(&bis->lnode, lhead); + WRITE_ONCE(bis->lqueued, true); + } + + u64_stats_update_end_irqrestore(&bis->sync, flags); + cgroup_rstat_updated(blkcg->css.cgroup, cpu); + put_cpu(); +} + +bool blk_cgroup_congested(void) +{ + struct cgroup_subsys_state *css; + bool ret = false; + + rcu_read_lock(); + for (css = blkcg_css(); css; css = css->parent) { + if (atomic_read(&css->cgroup->congestion_count)) { + ret = true; + break; + } + } + rcu_read_unlock(); + return ret; } -subsys_initcall(blkcg_init); module_param(blkcg_debug_stats, bool, 0644); MODULE_PARM_DESC(blkcg_debug_stats, "True if you want debug stats, false if not"); diff --git a/block/blk-cgroup.h b/block/blk-cgroup.h new file mode 100644 index 000000000000..b927a4a0ad03 --- /dev/null +++ b/block/blk-cgroup.h @@ -0,0 +1,508 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BLK_CGROUP_PRIVATE_H +#define _BLK_CGROUP_PRIVATE_H +/* + * block cgroup private header + * + * Based on ideas and code from CFQ, CFS and BFQ: + * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> + * + * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> + * Paolo Valente <paolo.valente@unimore.it> + * + * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com> + * Nauman Rafique <nauman@google.com> + */ + +#include <linux/blk-cgroup.h> +#include <linux/cgroup.h> +#include <linux/kthread.h> +#include <linux/blk-mq.h> +#include <linux/llist.h> + +struct blkcg_gq; +struct blkg_policy_data; + + +/* percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter */ +#define BLKG_STAT_CPU_BATCH (INT_MAX / 2) + +#ifdef CONFIG_BLK_CGROUP + +enum blkg_iostat_type { + BLKG_IOSTAT_READ, + BLKG_IOSTAT_WRITE, + BLKG_IOSTAT_DISCARD, + + BLKG_IOSTAT_NR, +}; + +struct blkg_iostat { + u64 bytes[BLKG_IOSTAT_NR]; + u64 ios[BLKG_IOSTAT_NR]; +}; + +struct blkg_iostat_set { + struct u64_stats_sync sync; + struct blkcg_gq *blkg; + struct llist_node 
lnode; + int lqueued; /* queued in llist */ + struct blkg_iostat cur; + struct blkg_iostat last; +}; + +/* association between a blk cgroup and a request queue */ +struct blkcg_gq { + /* Pointer to the associated request_queue */ + struct request_queue *q; + struct list_head q_node; + struct hlist_node blkcg_node; + struct blkcg *blkcg; + + /* all non-root blkcg_gq's are guaranteed to have access to parent */ + struct blkcg_gq *parent; + + /* reference count */ + struct percpu_ref refcnt; + + /* is this blkg online? protected by both blkcg and q locks */ + bool online; + + struct blkg_iostat_set __percpu *iostat_cpu; + struct blkg_iostat_set iostat; + + struct blkg_policy_data *pd[BLKCG_MAX_POLS]; +#ifdef CONFIG_BLK_CGROUP_PUNT_BIO + spinlock_t async_bio_lock; + struct bio_list async_bios; +#endif + union { + struct work_struct async_bio_work; + struct work_struct free_work; + }; + + atomic_t use_delay; + atomic64_t delay_nsec; + atomic64_t delay_start; + u64 last_delay; + int last_use; + + struct rcu_head rcu_head; +}; + +struct blkcg { + struct cgroup_subsys_state css; + spinlock_t lock; + refcount_t online_pin; + + struct radix_tree_root blkg_tree; + struct blkcg_gq __rcu *blkg_hint; + struct hlist_head blkg_list; + + struct blkcg_policy_data *cpd[BLKCG_MAX_POLS]; + + struct list_head all_blkcgs_node; + + /* + * List of updated percpu blkg_iostat_set's since the last flush. + */ + struct llist_head __percpu *lhead; + +#ifdef CONFIG_BLK_CGROUP_FC_APPID + char fc_app_id[FC_APPID_LEN]; +#endif +#ifdef CONFIG_CGROUP_WRITEBACK + struct list_head cgwb_list; +#endif +}; + +static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css) +{ + return css ? container_of(css, struct blkcg, css) : NULL; +} + +/* + * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a + * request_queue (q). This is used by blkcg policies which need to track + * information per blkcg - q pair. 
+ * + * There can be multiple active blkcg policies and each blkg:policy pair is + * represented by a blkg_policy_data which is allocated and freed by each + * policy's pd_alloc/free_fn() methods. A policy can allocate private data + * area by allocating larger data structure which embeds blkg_policy_data + * at the beginning. + */ +struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; + bool online; +}; + +/* + * Policies that need to keep per-blkcg data which is independent from any + * request_queue associated to it should implement cpd_alloc/free_fn() + * methods. A policy can allocate private data area by allocating larger + * data structure which embeds blkcg_policy_data at the beginning. + * cpd_init() is invoked to let each policy handle per-blkcg data. + */ +struct blkcg_policy_data { + /* the blkcg and policy id this per-policy data belongs to */ + struct blkcg *blkcg; + int plid; +}; + +typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp); +typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd); +typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd); +typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp); +typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd); +typedef void (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, + struct seq_file *s); + +struct blkcg_policy { + int plid; + /* cgroup files for the policy */ + struct cftype *dfl_cftypes; + struct cftype *legacy_cftypes; + + /* operations */ + blkcg_pol_alloc_cpd_fn 
*cpd_alloc_fn; + blkcg_pol_free_cpd_fn *cpd_free_fn; + + blkcg_pol_alloc_pd_fn *pd_alloc_fn; + blkcg_pol_init_pd_fn *pd_init_fn; + blkcg_pol_online_pd_fn *pd_online_fn; + blkcg_pol_offline_pd_fn *pd_offline_fn; + blkcg_pol_free_pd_fn *pd_free_fn; + blkcg_pol_reset_pd_stats_fn *pd_reset_stats_fn; + blkcg_pol_stat_pd_fn *pd_stat_fn; +}; + +extern struct blkcg blkcg_root; +extern bool blkcg_debug_stats; + +int blkcg_init_disk(struct gendisk *disk); +void blkcg_exit_disk(struct gendisk *disk); + +/* Blkio controller policy registration */ +int blkcg_policy_register(struct blkcg_policy *pol); +void blkcg_policy_unregister(struct blkcg_policy *pol); +int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol); +void blkcg_deactivate_policy(struct gendisk *disk, + const struct blkcg_policy *pol); + +const char *blkg_dev_name(struct blkcg_gq *blkg); +void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg, + u64 (*prfill)(struct seq_file *, + struct blkg_policy_data *, int), + const struct blkcg_policy *pol, int data, + bool show_total); +u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v); + +struct blkg_conf_ctx { + char *input; + char *body; + struct block_device *bdev; + struct blkcg_gq *blkg; +}; + +void blkg_conf_init(struct blkg_conf_ctx *ctx, char *input); +int blkg_conf_open_bdev(struct blkg_conf_ctx *ctx); +int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol, + struct blkg_conf_ctx *ctx); +void blkg_conf_exit(struct blkg_conf_ctx *ctx); + +/** + * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg + * @return: true if this bio needs to be submitted with the root blkg context. + * + * In order to avoid priority inversions we sometimes need to issue a bio as if + * it were attached to the root blkg, and then backcharge to the actual owning + * blkg. The idea is we do bio_blkcg_css() to look up the actual context for + * the bio and attach the appropriate blkg to the bio. 
Then we call this helper + * and if it is true run with the root blkg for that queue and then do any + * backcharging to the originating cgroup once the io is complete. + */ +static inline bool bio_issue_as_root_blkg(struct bio *bio) +{ + return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0; +} + +/** + * blkg_lookup - lookup blkg for the specified blkcg - q pair + * @blkcg: blkcg of interest + * @q: request_queue of interest + * + * Lookup blkg for the @blkcg - @q pair. + + * Must be called in a RCU critical section. + */ +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, + struct request_queue *q) +{ + struct blkcg_gq *blkg; + + if (blkcg == &blkcg_root) + return q->root_blkg; + + blkg = rcu_dereference_check(blkcg->blkg_hint, + lockdep_is_held(&q->queue_lock)); + if (blkg && blkg->q == q) + return blkg; + + blkg = radix_tree_lookup(&blkcg->blkg_tree, q->id); + if (blkg && blkg->q != q) + blkg = NULL; + return blkg; +} + +/** + * blkg_to_pdata - get policy private data + * @blkg: blkg of interest + * @pol: policy of interest + * + * Return pointer to private data associated with the @blkg-@pol pair. + */ +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) +{ + return blkg ? blkg->pd[pol->plid] : NULL; +} + +static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg, + struct blkcg_policy *pol) +{ + return blkcg ? blkcg->cpd[pol->plid] : NULL; +} + +/** + * pdata_to_blkg - get blkg associated with policy private data + * @pd: policy private data of interest + * + * @pd is policy private data. Determine the blkg it's associated with. + */ +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) +{ + return pd ? pd->blkg : NULL; +} + +static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd) +{ + return cpd ? 
cpd->blkcg : NULL; +} + +/** + * blkg_path - format cgroup path of blkg + * @blkg: blkg of interest + * @buf: target buffer + * @buflen: target buffer length + * + * Format the path of the cgroup of @blkg into @buf. + */ +static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen) +{ + return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen); +} + +/** + * blkg_get - get a blkg reference + * @blkg: blkg to get + * + * The caller should be holding an existing reference. + */ +static inline void blkg_get(struct blkcg_gq *blkg) +{ + percpu_ref_get(&blkg->refcnt); +} + +/** + * blkg_tryget - try and get a blkg reference + * @blkg: blkg to get + * + * This is for use when doing an RCU lookup of the blkg. We may be in the midst + * of freeing this blkg, so we can only use it if the refcnt is not zero. + */ +static inline bool blkg_tryget(struct blkcg_gq *blkg) +{ + return blkg && percpu_ref_tryget(&blkg->refcnt); +} + +/** + * blkg_put - put a blkg reference + * @blkg: blkg to put + */ +static inline void blkg_put(struct blkcg_gq *blkg) +{ + percpu_ref_put(&blkg->refcnt); +} + +/** + * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Walk @c_blkg through the descendants of @p_blkg. Must be used with RCU + * read locked. If called under either blkcg or queue lock, the iteration + * is guaranteed to include all and only online blkgs. The caller may + * update @pos_css by calling css_rightmost_descendant() to skip subtree. + * @p_blkg is included in the iteration and the first node to be visited. 
+ */ +#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) + +/** + * blkg_for_each_descendant_post - post-order walk of a blkg's descendants + * @d_blkg: loop cursor pointing to the current descendant + * @pos_css: used for iteration + * @p_blkg: target blkg to walk descendants of + * + * Similar to blkg_for_each_descendant_pre() but performs post-order + * traversal instead. Synchronization rules are the same. @p_blkg is + * included in the iteration and the last node to be visited. + */ +#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg) \ + css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css) \ + if (((d_blkg) = blkg_lookup(css_to_blkcg(pos_css), \ + (p_blkg)->q))) + +static inline void blkcg_bio_issue_init(struct bio *bio) +{ + bio_issue_init(&bio->bi_issue, bio_sectors(bio)); +} + +static inline void blkcg_use_delay(struct blkcg_gq *blkg) +{ + if (WARN_ON_ONCE(atomic_read(&blkg->use_delay) < 0)) + return; + if (atomic_add_return(1, &blkg->use_delay) == 1) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); +} + +static inline int blkcg_unuse_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + if (WARN_ON_ONCE(old < 0)) + return 0; + if (old == 0) + return 0; + + /* + * We do this song and dance because we can race with somebody else + * adding or removing delay. If we just did an atomic_dec we'd end up + * negative and we'd already be in trouble. We need to subtract 1 and + * then check to see if we were the last delay so we can drop the + * congestion count on the cgroup. 
+ */ + while (old && !atomic_try_cmpxchg(&blkg->use_delay, &old, old - 1)) + ; + + if (old == 0) + return 0; + if (old == 1) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); + return 1; +} + +/** + * blkcg_set_delay - Enable allocator delay mechanism with the specified delay amount + * @blkg: target blkg + * @delay: delay duration in nsecs + * + * When enabled with this function, the delay is not decayed and must be + * explicitly cleared with blkcg_clear_delay(). Must not be mixed with + * blkcg_[un]use_delay() and blkcg_add_delay() usages. + */ +static inline void blkcg_set_delay(struct blkcg_gq *blkg, u64 delay) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person setting the congestion count for this blkg. */ + if (!old && atomic_try_cmpxchg(&blkg->use_delay, &old, -1)) + atomic_inc(&blkg->blkcg->css.cgroup->congestion_count); + + atomic64_set(&blkg->delay_nsec, delay); +} + +/** + * blkcg_clear_delay - Disable allocator delay mechanism + * @blkg: target blkg + * + * Disable use_delay mechanism. See blkcg_set_delay(). + */ +static inline void blkcg_clear_delay(struct blkcg_gq *blkg) +{ + int old = atomic_read(&blkg->use_delay); + + /* We only want 1 person clearing the congestion count for this blkg. */ + if (old && atomic_try_cmpxchg(&blkg->use_delay, &old, 0)) + atomic_dec(&blkg->blkcg->css.cgroup->congestion_count); +} + +/** + * blk_cgroup_mergeable - Determine whether to allow or disallow merges + * @rq: request to merge into + * @bio: bio to merge + * + * @bio and @rq should belong to the same cgroup and their issue_as_root should + * match. The latter is necessary as we don't want to throttle e.g. a metadata + * update because it happens to be next to a regular IO. 
+ */ +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) +{ + return rq->bio->bi_blkg == bio->bi_blkg && + bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio); +} + +void blk_cgroup_bio_start(struct bio *bio); +void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta); +#else /* CONFIG_BLK_CGROUP */ + +struct blkg_policy_data { +}; + +struct blkcg_policy_data { +}; + +struct blkcg_policy { +}; + +struct blkcg { +}; + +static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; } +static inline int blkcg_init_disk(struct gendisk *disk) { return 0; } +static inline void blkcg_exit_disk(struct gendisk *disk) { } +static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; } +static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { } +static inline int blkcg_activate_policy(struct gendisk *disk, + const struct blkcg_policy *pol) { return 0; } +static inline void blkcg_deactivate_policy(struct gendisk *disk, + const struct blkcg_policy *pol) { } + +static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg, + struct blkcg_policy *pol) { return NULL; } +static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; } +static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; } +static inline void blkg_get(struct blkcg_gq *blkg) { } +static inline void blkg_put(struct blkcg_gq *blkg) { } +static inline void blkcg_bio_issue_init(struct bio *bio) { } +static inline void blk_cgroup_bio_start(struct bio *bio) { } +static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; } + +#define blk_queue_for_each_rl(rl, q) \ + for ((rl) = &(q)->root_rl; (rl); (rl) = NULL) + +#endif /* CONFIG_BLK_CGROUP */ + +#endif /* _BLK_CGROUP_PRIVATE_H */ diff --git a/block/blk-core.c b/block/blk-core.c index 03252af8c82c..de771093b526 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -14,10 +14,10 @@ */ 
#include <linux/kernel.h> #include <linux/module.h> -#include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> +#include <linux/blk-pm.h> +#include <linux/blk-integrity.h> #include <linux/highmem.h> #include <linux/mm.h> #include <linux/pagemap.h> @@ -34,11 +34,10 @@ #include <linux/delay.h> #include <linux/ratelimit.h> #include <linux/pm_runtime.h> -#include <linux/blk-cgroup.h> #include <linux/t10-pi.h> #include <linux/debugfs.h> #include <linux/bpf.h> -#include <linux/psi.h> +#include <linux/part_stat.h> #include <linux/sched/sysctl.h> #include <linux/blk-crypto.h> @@ -46,27 +45,27 @@ #include <trace/events/block.h> #include "blk.h" -#include "blk-mq.h" #include "blk-mq-sched.h" #include "blk-pm.h" -#include "blk-rq-qos.h" +#include "blk-cgroup.h" +#include "blk-throttle.h" +#include "blk-ioprio.h" -#ifdef CONFIG_DEBUG_FS struct dentry *blk_debugfs_root; -#endif EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_remap); EXPORT_TRACEPOINT_SYMBOL_GPL(block_bio_complete); EXPORT_TRACEPOINT_SYMBOL_GPL(block_split); EXPORT_TRACEPOINT_SYMBOL_GPL(block_unplug); +EXPORT_TRACEPOINT_SYMBOL_GPL(block_rq_insert); -DEFINE_IDA(blk_queue_ida); +static DEFINE_IDA(blk_queue_ida); /* * For queue allocation */ -struct kmem_cache *blk_requestq_cachep; +static struct kmem_cache *blk_requestq_cachep; /* * Controlling structure to kblockd @@ -109,24 +108,6 @@ bool blk_queue_flag_test_and_set(unsigned int flag, struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_queue_flag_test_and_set); -void blk_rq_init(struct request_queue *q, struct request *rq) -{ - memset(rq, 0, sizeof(*rq)); - - INIT_LIST_HEAD(&rq->queuelist); - rq->q = q; - rq->__sector = (sector_t) -1; - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->tag = -1; - rq->internal_tag = -1; - rq->start_time_ns = ktime_get_ns(); - rq->part = NULL; - refcount_set(&rq->ref, 1); - blk_crypto_rq_set_defaults(rq); -} 
-EXPORT_SYMBOL(blk_rq_init); - #define REQ_OP_NAME(name) [REQ_OP_##name] = #name static const char *const blk_op_name[] = { REQ_OP_NAME(READ), @@ -140,10 +121,7 @@ static const char *const blk_op_name[] = { REQ_OP_NAME(ZONE_CLOSE), REQ_OP_NAME(ZONE_FINISH), REQ_OP_NAME(ZONE_APPEND), - REQ_OP_NAME(WRITE_SAME), REQ_OP_NAME(WRITE_ZEROES), - REQ_OP_NAME(SCSI_IN), - REQ_OP_NAME(SCSI_OUT), REQ_OP_NAME(DRV_IN), REQ_OP_NAME(DRV_OUT), }; @@ -157,7 +135,7 @@ static const char *const blk_op_name[] = { * string format. Useful in the debugging and tracing bio or request. For * invalid REQ_OP_XXX it returns string "UNKNOWN". */ -inline const char *blk_op_str(unsigned int op) +inline const char *blk_op_str(enum req_op op) { const char *op_str = "UNKNOWN"; @@ -178,16 +156,24 @@ static const struct { [BLK_STS_NOSPC] = { -ENOSPC, "critical space allocation" }, [BLK_STS_TRANSPORT] = { -ENOLINK, "recoverable transport" }, [BLK_STS_TARGET] = { -EREMOTEIO, "critical target" }, - [BLK_STS_NEXUS] = { -EBADE, "critical nexus" }, + [BLK_STS_RESV_CONFLICT] = { -EBADE, "reservation conflict" }, [BLK_STS_MEDIUM] = { -ENODATA, "critical medium" }, [BLK_STS_PROTECTION] = { -EILSEQ, "protection" }, [BLK_STS_RESOURCE] = { -ENOMEM, "kernel resource" }, [BLK_STS_DEV_RESOURCE] = { -EBUSY, "device resource" }, [BLK_STS_AGAIN] = { -EAGAIN, "nonblocking retry" }, + [BLK_STS_OFFLINE] = { -ENODEV, "device offline" }, /* device mapper special case, should not leak out: */ [BLK_STS_DM_REQUEUE] = { -EREMCHG, "dm internal retry" }, + /* zone device specific errors */ + [BLK_STS_ZONE_OPEN_RESOURCE] = { -ETOOMANYREFS, "open zones exceeded" }, + [BLK_STS_ZONE_ACTIVE_RESOURCE] = { -EOVERFLOW, "active zones exceeded" }, + + /* Command duration limit device-side timeout */ + [BLK_STS_DURATION_LIMIT] = { -ETIME, "duration limit exceeded" }, + /* everything else not covered above: */ [BLK_STS_IOERR] = { -EIO, "I/O" }, }; @@ -215,65 +201,15 @@ int blk_status_to_errno(blk_status_t status) } 
EXPORT_SYMBOL_GPL(blk_status_to_errno); -static void print_req_error(struct request *req, blk_status_t status, - const char *caller) +const char *blk_status_to_str(blk_status_t status) { int idx = (__force int)status; if (WARN_ON_ONCE(idx >= ARRAY_SIZE(blk_errors))) - return; - - printk_ratelimited(KERN_ERR - "%s: %s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " - "phys_seg %u prio class %u\n", - caller, blk_errors[idx].name, - req->rq_disk ? req->rq_disk->disk_name : "?", - blk_rq_pos(req), req_op(req), blk_op_str(req_op(req)), - req->cmd_flags & ~REQ_OP_MASK, - req->nr_phys_segments, - IOPRIO_PRIO_CLASS(req->ioprio)); -} - -static void req_bio_endio(struct request *rq, struct bio *bio, - unsigned int nbytes, blk_status_t error) -{ - if (error) - bio->bi_status = error; - - if (unlikely(rq->rq_flags & RQF_QUIET)) - bio_set_flag(bio, BIO_QUIET); - - bio_advance(bio, nbytes); - - if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { - /* - * Partial zone append completions cannot be supported as the - * BIO fragments may end up not being written sequentially. - */ - if (bio->bi_iter.bi_size) - bio->bi_status = BLK_STS_IOERR; - else - bio->bi_iter.bi_sector = rq->__sector; - } - - /* don't actually finish bio if it's part of flush sequence */ - if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) - bio_endio(bio); + return "<null>"; + return blk_errors[idx].name; } - -void blk_dump_rq_flags(struct request *rq, char *msg) -{ - printk(KERN_INFO "%s: dev %s: flags=%llx\n", msg, - rq->rq_disk ? 
rq->rq_disk->disk_name : "?", - (unsigned long long) rq->cmd_flags); - - printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", - (unsigned long long)blk_rq_pos(rq), - blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); - printk(KERN_INFO " bio %p, biotail %p, len %u\n", - rq->bio, rq->biotail, blk_rq_bytes(rq)); -} -EXPORT_SYMBOL(blk_dump_rq_flags); +EXPORT_SYMBOL_GPL(blk_status_to_str); /** * blk_sync_queue - cancel any pending callbacks on a queue @@ -285,7 +221,7 @@ EXPORT_SYMBOL(blk_dump_rq_flags); * A block device may call blk_sync_queue to ensure that any * such activity is cancelled, thus allowing it to release resources * that the callbacks might use. The caller must already have made sure - * that its ->make_request_fn will not re-add plugging prior to calling + * that its ->submit_bio will not re-add plugging prior to calling * this function. * * This function does not cancel any asynchronous activity arising @@ -321,156 +257,117 @@ void blk_clear_pm_only(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_clear_pm_only); -void blk_put_queue(struct request_queue *q) +static void blk_free_queue_rcu(struct rcu_head *rcu_head) { - kobject_put(&q->kobj); + struct request_queue *q = container_of(rcu_head, + struct request_queue, rcu_head); + + percpu_ref_exit(&q->q_usage_counter); + kmem_cache_free(blk_requestq_cachep, q); } -EXPORT_SYMBOL(blk_put_queue); -void blk_set_queue_dying(struct request_queue *q) +static void blk_free_queue(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_DYING, q); - - /* - * When queue DYING flag is set, we need to block new req - * entering queue, so we call blk_freeze_queue_start() to - * prevent I/O from crossing blk_queue_enter(). - */ - blk_freeze_queue_start(q); - + blk_free_queue_stats(q->stats); if (queue_is_mq(q)) - blk_mq_wake_waiters(q); + blk_mq_release(q); - /* Make blk_queue_enter() reexamine the DYING flag. 
*/ - wake_up_all(&q->mq_freeze_wq); + ida_free(&blk_queue_ida, q->id); + call_rcu(&q->rcu_head, blk_free_queue_rcu); } -EXPORT_SYMBOL_GPL(blk_set_queue_dying); /** - * blk_cleanup_queue - shutdown a request queue - * @q: request queue to shutdown + * blk_put_queue - decrement the request_queue refcount + * @q: the request_queue structure to decrement the refcount for * - * Mark @q DYING, drain all pending requests, mark @q DEAD, destroy and - * put it. All future requests will be failed immediately with -ENODEV. + * Decrements the refcount of the request_queue and free it when the refcount + * reaches 0. */ -void blk_cleanup_queue(struct request_queue *q) +void blk_put_queue(struct request_queue *q) { - WARN_ON_ONCE(blk_queue_registered(q)); - - /* mark @q DYING, no new request or merges will be allowed afterwards */ - blk_set_queue_dying(q); - - blk_queue_flag_set(QUEUE_FLAG_NOMERGES, q); - blk_queue_flag_set(QUEUE_FLAG_NOXMERGES, q); + if (refcount_dec_and_test(&q->refs)) + blk_free_queue(q); +} +EXPORT_SYMBOL(blk_put_queue); +void blk_queue_start_drain(struct request_queue *q) +{ /* - * Drain all requests queued before DYING marking. Set DEAD flag to - * prevent that blk_mq_run_hw_queues() accesses the hardware queues - * after draining finished. + * When queue DYING flag is set, we need to block new req + * entering queue, so we call blk_freeze_queue_start() to + * prevent I/O from crossing blk_queue_enter(). */ - blk_freeze_queue(q); - - rq_qos_exit(q); - - blk_queue_flag_set(QUEUE_FLAG_DEAD, q); - - /* for synchronous bio-based driver finish in-flight integrity i/o */ - blk_flush_integrity(); - - /* @q won't process any more request, flush async actions */ - del_timer_sync(&q->backing_dev_info->laptop_mode_wb_timer); - blk_sync_queue(q); - + blk_freeze_queue_start(q); if (queue_is_mq(q)) - blk_mq_exit_queue(q); - - /* - * In theory, request pool of sched_tags belongs to request queue. 
- * However, the current implementation requires tag_set for freeing - * requests, so free the pool now. - * - * Queue has become frozen, there can't be any in-queue requests, so - * it is safe to free requests now. - */ - mutex_lock(&q->sysfs_lock); - if (q->elevator) - blk_mq_sched_free_requests(q); - mutex_unlock(&q->sysfs_lock); - - percpu_ref_exit(&q->q_usage_counter); - - /* @q is and will stay empty, shutdown and put */ - blk_put_queue(q); + blk_mq_wake_waiters(q); + /* Make blk_queue_enter() reexamine the DYING flag. */ + wake_up_all(&q->mq_freeze_wq); } -EXPORT_SYMBOL(blk_cleanup_queue); /** * blk_queue_enter() - try to increase q->q_usage_counter * @q: request queue pointer - * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PREEMPT + * @flags: BLK_MQ_REQ_NOWAIT and/or BLK_MQ_REQ_PM */ int blk_queue_enter(struct request_queue *q, blk_mq_req_flags_t flags) { - const bool pm = flags & BLK_MQ_REQ_PREEMPT; - - while (true) { - bool success = false; - - rcu_read_lock(); - if (percpu_ref_tryget_live(&q->q_usage_counter)) { - /* - * The code that increments the pm_only counter is - * responsible for ensuring that that counter is - * globally visible before the queue is unfrozen. - */ - if (pm || !blk_queue_pm_only(q)) { - success = true; - } else { - percpu_ref_put(&q->q_usage_counter); - } - } - rcu_read_unlock(); - - if (success) - return 0; + const bool pm = flags & BLK_MQ_REQ_PM; + while (!blk_try_enter_queue(q, pm)) { if (flags & BLK_MQ_REQ_NOWAIT) - return -EBUSY; + return -EAGAIN; /* - * read pair of barrier in blk_freeze_queue_start(), - * we need to order reading __PERCPU_REF_DEAD flag of - * .q_usage_counter and reading .mq_freeze_depth or - * queue dying flag, otherwise the following wait may - * never return if the two reads are reordered. 
+ * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. */ smp_rmb(); - wait_event(q->mq_freeze_wq, (!q->mq_freeze_depth && - (pm || (blk_pm_request_resume(q), - !blk_queue_pm_only(q)))) || + blk_pm_resume_queue(pm, q)) || blk_queue_dying(q)); if (blk_queue_dying(q)) return -ENODEV; } + + return 0; } -static inline int bio_queue_enter(struct bio *bio) +int __bio_queue_enter(struct request_queue *q, struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; - bool nowait = bio->bi_opf & REQ_NOWAIT; - int ret; + while (!blk_try_enter_queue(q, false)) { + struct gendisk *disk = bio->bi_bdev->bd_disk; - ret = blk_queue_enter(q, nowait ? BLK_MQ_REQ_NOWAIT : 0); - if (unlikely(ret)) { - if (nowait && !blk_queue_dying(q)) + if (bio->bi_opf & REQ_NOWAIT) { + if (test_bit(GD_DEAD, &disk->state)) + goto dead; bio_wouldblock_error(bio); - else - bio_io_error(bio); + return -EAGAIN; + } + + /* + * read pair of barrier in blk_freeze_queue_start(), we need to + * order reading __PERCPU_REF_DEAD flag of .q_usage_counter and + * reading .mq_freeze_depth or queue dying flag, otherwise the + * following wait may never return if the two reads are + * reordered. 
+ */ + smp_rmb(); + wait_event(q->mq_freeze_wq, + (!q->mq_freeze_depth && + blk_pm_resume_queue(false, q)) || + test_bit(GD_DEAD, &disk->state)); + if (test_bit(GD_DEAD, &disk->state)) + goto dead; } - return ret; + return 0; +dead: + bio_io_error(bio); + return -ENODEV; } void blk_queue_exit(struct request_queue *q) @@ -497,54 +394,38 @@ static void blk_timeout_work(struct work_struct *work) { } -struct request_queue *__blk_alloc_queue(int node_id) +struct request_queue *blk_alloc_queue(int node_id) { struct request_queue *q; - int ret; - q = kmem_cache_alloc_node(blk_requestq_cachep, - GFP_KERNEL | __GFP_ZERO, node_id); + q = kmem_cache_alloc_node(blk_requestq_cachep, GFP_KERNEL | __GFP_ZERO, + node_id); if (!q) return NULL; q->last_merge = NULL; - q->id = ida_simple_get(&blk_queue_ida, 0, 0, GFP_KERNEL); + q->id = ida_alloc(&blk_queue_ida, GFP_KERNEL); if (q->id < 0) goto fail_q; - ret = bioset_init(&q->bio_split, BIO_POOL_SIZE, 0, BIOSET_NEED_BVECS); - if (ret) - goto fail_id; - - q->backing_dev_info = bdi_alloc(node_id); - if (!q->backing_dev_info) - goto fail_split; - q->stats = blk_alloc_queue_stats(); if (!q->stats) - goto fail_stats; + goto fail_id; - q->backing_dev_info->ra_pages = VM_READAHEAD_PAGES; - q->backing_dev_info->capabilities = BDI_CAP_CGROUP_WRITEBACK; q->node = node_id; - timer_setup(&q->backing_dev_info->laptop_mode_wb_timer, - laptop_mode_timer_fn, 0); + atomic_set(&q->nr_active_requests_shared_tags, 0); + timer_setup(&q->timeout, blk_rq_timed_out_timer, 0); INIT_WORK(&q->timeout_work, blk_timeout_work); INIT_LIST_HEAD(&q->icq_list); -#ifdef CONFIG_BLK_CGROUP - INIT_LIST_HEAD(&q->blkg_list); -#endif - kobject_init(&q->kobj, &blk_queue_ktype); - -#ifdef CONFIG_BLK_DEV_IO_TRACE - mutex_init(&q->blk_trace_mutex); -#endif + refcount_set(&q->refs, 1); + mutex_init(&q->debugfs_mutex); mutex_init(&q->sysfs_lock); mutex_init(&q->sysfs_dir_lock); + mutex_init(&q->rq_qos_mutex); spin_lock_init(&q->queue_lock); init_waitqueue_head(&q->mq_freeze_wq); 
@@ -557,252 +438,38 @@ struct request_queue *__blk_alloc_queue(int node_id) if (percpu_ref_init(&q->q_usage_counter, blk_queue_usage_counter_release, PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) - goto fail_bdi; - - if (blkcg_init_queue(q)) - goto fail_ref; + goto fail_stats; - blk_queue_dma_alignment(q, 511); blk_set_default_limits(&q->limits); + q->nr_requests = BLKDEV_DEFAULT_RQ; return q; -fail_ref: - percpu_ref_exit(&q->q_usage_counter); -fail_bdi: - blk_free_queue_stats(q->stats); fail_stats: - bdi_put(q->backing_dev_info); -fail_split: - bioset_exit(&q->bio_split); + blk_free_queue_stats(q->stats); fail_id: - ida_simple_remove(&blk_queue_ida, q->id); + ida_free(&blk_queue_ida, q->id); fail_q: kmem_cache_free(blk_requestq_cachep, q); return NULL; } -struct request_queue *blk_alloc_queue(make_request_fn make_request, int node_id) -{ - struct request_queue *q; - - if (WARN_ON_ONCE(!make_request)) - return NULL; - - q = __blk_alloc_queue(node_id); - if (!q) - return NULL; - q->make_request_fn = make_request; - q->nr_requests = BLKDEV_MAX_RQ; - return q; -} -EXPORT_SYMBOL(blk_alloc_queue); - -bool blk_get_queue(struct request_queue *q) -{ - if (likely(!blk_queue_dying(q))) { - __blk_get_queue(q); - return true; - } - - return false; -} -EXPORT_SYMBOL(blk_get_queue); - -/** - * blk_get_request - allocate a request - * @q: request queue to allocate a request for - * @op: operation (REQ_OP_*) and REQ_* flags, e.g. REQ_SYNC. - * @flags: BLK_MQ_REQ_* flags, e.g. BLK_MQ_REQ_NOWAIT. 
- */ -struct request *blk_get_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) -{ - struct request *req; - - WARN_ON_ONCE(op & REQ_NOWAIT); - WARN_ON_ONCE(flags & ~(BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_PREEMPT)); - - req = blk_mq_alloc_request(q, op, flags); - if (!IS_ERR(req) && q->mq_ops->initialize_rq_fn) - q->mq_ops->initialize_rq_fn(req); - - return req; -} -EXPORT_SYMBOL(blk_get_request); - -void blk_put_request(struct request *req) -{ - blk_mq_free_request(req); -} -EXPORT_SYMBOL(blk_put_request); - -static void blk_account_io_merge_bio(struct request *req) -{ - if (!blk_do_io_stat(req)) - return; - - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); -} - -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_back_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_backmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_free_ctx(bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs) -{ - const int ff = bio->bi_opf & REQ_FAILFAST_MASK; - - if (!ll_front_merge_fn(req, bio, nr_segs)) - return false; - - trace_block_bio_frontmerge(req->q, req, bio); - rq_qos_merge(req->q, req, bio); - - if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) - blk_rq_set_mixed_merge(req); - - bio->bi_next = req->bio; - req->bio = bio; - - req->__sector = bio->bi_iter.bi_sector; - req->__data_len += bio->bi_iter.bi_size; - - bio_crypt_do_front_merge(req, bio); - - blk_account_io_merge_bio(req); - return true; -} - -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio 
*bio) -{ - unsigned short segments = blk_rq_nr_discard_segments(req); - - if (segments >= queue_max_discard_segments(q)) - goto no_merge; - if (blk_rq_sectors(req) + bio_sectors(bio) > - blk_rq_get_max_sectors(req, blk_rq_pos(req))) - goto no_merge; - - rq_qos_merge(q, req, bio); - - req->biotail->bi_next = bio; - req->biotail = bio; - req->__data_len += bio->bi_iter.bi_size; - req->nr_phys_segments = segments + 1; - - blk_account_io_merge_bio(req); - return true; -no_merge: - req_set_nomerge(q, req); - return false; -} - /** - * blk_attempt_plug_merge - try to merge with %current's plugged list - * @q: request_queue new bio is being queued at - * @bio: new bio being queued - * @nr_segs: number of segments in @bio - * @same_queue_rq: pointer to &struct request that gets filled in when - * another request associated with @q is found on the plug list - * (optional, may be %NULL) - * - * Determine whether @bio being queued on @q can be merged with a request - * on %current's plugged list. Returns %true if merge was successful, - * otherwise %false. + * blk_get_queue - increment the request_queue refcount + * @q: the request_queue structure to increment the refcount for * - * Plugging coalesces IOs from the same issuer for the same purpose without - * going through @q->queue_lock. As such it's more of an issuing mechanism - * than scheduling, and the request, while may have elvpriv data, is not - * added on the elevator at this point. In addition, we don't have - * reliable access to the elevator outside queue lock. Only check basic - * merging parameters without querying the elevator. + * Increment the refcount of the request_queue kobject. * - * Caller must ensure !blk_queue_nomerges(q) beforehand. + * Context: Any context. 
*/ -bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq) +bool blk_get_queue(struct request_queue *q) { - struct blk_plug *plug; - struct request *rq; - struct list_head *plug_list; - - plug = blk_mq_plug(q, bio); - if (!plug) + if (unlikely(blk_queue_dying(q))) return false; - - plug_list = &plug->mq_list; - - list_for_each_entry_reverse(rq, plug_list, queuelist) { - bool merged = false; - - if (rq->q == q && same_queue_rq) { - /* - * Only blk-mq multiple hardware queues case checks the - * rq in the same queue, there should be only one such - * rq in a queue - **/ - *same_queue_rq = rq; - } - - if (rq->q != q || !blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - merged = bio_attempt_back_merge(rq, bio, nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - merged = bio_attempt_front_merge(rq, bio, nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - break; - } - - if (merged) - return true; - } - - return false; -} - -static void handle_bad_sector(struct bio *bio, sector_t maxsector) -{ - char b[BDEVNAME_SIZE]; - - printk(KERN_INFO "attempt to access beyond end of device\n"); - printk(KERN_INFO "%s: rw=%d, want=%Lu, limit=%Lu\n", - bio_devname(bio, b), bio->bi_opf, - (unsigned long long)bio_end_sector(bio), - (long long)maxsector); + refcount_inc(&q->refs); + return true; } +EXPORT_SYMBOL(blk_get_queue); #ifdef CONFIG_FAIL_MAKE_REQUEST @@ -814,9 +481,9 @@ static int __init setup_fail_make_request(char *str) } __setup("fail_make_request=", setup_fail_make_request); -static bool should_fail_request(struct hd_struct *part, unsigned int bytes) +bool should_fail_request(struct block_device *part, unsigned int bytes) { - return part->make_it_fail && should_fail(&fail_make_request, bytes); + return part->bd_make_it_fail && should_fail(&fail_make_request, bytes); } static int 
__init fail_make_request_debugfs(void) @@ -828,41 +495,30 @@ static int __init fail_make_request_debugfs(void) } late_initcall(fail_make_request_debugfs); - -#else /* CONFIG_FAIL_MAKE_REQUEST */ - -static inline bool should_fail_request(struct hd_struct *part, - unsigned int bytes) -{ - return false; -} - #endif /* CONFIG_FAIL_MAKE_REQUEST */ -static inline bool bio_check_ro(struct bio *bio, struct hd_struct *part) +static inline void bio_check_ro(struct bio *bio) { - const int op = bio_op(bio); - - if (part->policy && op_is_write(op)) { - char b[BDEVNAME_SIZE]; - + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) - return false; + return; - WARN_ONCE(1, - "generic_make_request: Trying to write " - "to read-only block-device %s (partno %d)\n", - bio_devname(bio, b), part->partno); - /* Older lvm-tools actually trigger this */ - return false; - } + if (bio->bi_bdev->bd_ro_warned) + return; - return false; + bio->bi_bdev->bd_ro_warned = true; + /* + * Use ioctl to set underlying disk of raid/dm to read-only + * will trigger this. + */ + pr_warn("Trying to write to read-only block-device %pg\n", + bio->bi_bdev); + } } static noinline int should_fail_bio(struct bio *bio) { - if (should_fail_request(&bio->bi_disk->part0, bio->bi_iter.bi_size)) + if (should_fail_request(bdev_whole(bio->bi_bdev), bio->bi_iter.bi_size)) return -EIO; return 0; } @@ -873,14 +529,18 @@ ALLOW_ERROR_INJECTION(should_fail_bio, ERRNO); * This may well happen - the kernel calls bread() without checking the size of * the device, e.g., when mounting a file system. 
*/ -static inline int bio_check_eod(struct bio *bio, sector_t maxsector) +static inline int bio_check_eod(struct bio *bio) { + sector_t maxsector = bdev_nr_sectors(bio->bi_bdev); unsigned int nr_sectors = bio_sectors(bio); - if (nr_sectors && maxsector && + if (nr_sectors && (nr_sectors > maxsector || bio->bi_iter.bi_sector > maxsector - nr_sectors)) { - handle_bad_sector(bio, maxsector); + pr_info_ratelimited("%s: attempt to access beyond end of device\n" + "%pg: rw=%d, sector=%llu, nr_sectors = %u limit=%llu\n", + current->comm, bio->bi_bdev, bio->bi_opf, + bio->bi_iter.bi_sector, nr_sectors, maxsector); return -EIO; } return 0; @@ -889,32 +549,20 @@ static inline int bio_check_eod(struct bio *bio, sector_t maxsector) /* * Remap block n of partition p to block n+start(p) of the disk. */ -static inline int blk_partition_remap(struct bio *bio) +static int blk_partition_remap(struct bio *bio) { - struct hd_struct *p; - int ret = -EIO; + struct block_device *p = bio->bi_bdev; - rcu_read_lock(); - p = __disk_get_part(bio->bi_disk, bio->bi_partno); - if (unlikely(!p)) - goto out; if (unlikely(should_fail_request(p, bio->bi_iter.bi_size))) - goto out; - if (unlikely(bio_check_ro(bio, p))) - goto out; - + return -EIO; if (bio_sectors(bio)) { - if (bio_check_eod(bio, part_nr_sects_read(p))) - goto out; - bio->bi_iter.bi_sector += p->start_sect; - trace_block_bio_remap(bio->bi_disk->queue, bio, part_devt(p), - bio->bi_iter.bi_sector - p->start_sect); + bio->bi_iter.bi_sector += p->bd_start_sect; + trace_block_bio_remap(bio, p->bd_dev, + bio->bi_iter.bi_sector - + p->bd_start_sect); } - bio->bi_partno = 0; - ret = 0; -out: - rcu_read_unlock(); - return ret; + bio_set_flag(bio, BIO_REMAPPED); + return 0; } /* @@ -923,16 +571,15 @@ out: static inline blk_status_t blk_check_zone_append(struct request_queue *q, struct bio *bio) { - sector_t pos = bio->bi_iter.bi_sector; int nr_sectors = bio_sectors(bio); /* Only applicable to zoned block devices */ - if (!blk_queue_is_zoned(q)) 
+ if (!bdev_is_zoned(bio->bi_bdev)) return BLK_STS_NOTSUPP; /* The bio sector must point to the start of a sequential zone */ - if (pos & (blk_queue_zone_sectors(q) - 1) || - !blk_queue_zone_is_seq(q, pos)) + if (!bdev_is_zone_start(bio->bi_bdev, bio->bi_iter.bi_sector) || + !bio_zone_is_seq(bio)) return BLK_STS_IOERR; /* @@ -952,73 +599,195 @@ static inline blk_status_t blk_check_zone_append(struct request_queue *q, return BLK_STS_OK; } -static noinline_for_stack bool -generic_make_request_checks(struct bio *bio) +static void __submit_bio(struct bio *bio) { - struct request_queue *q; - int nr_sectors = bio_sectors(bio); - blk_status_t status = BLK_STS_IOERR; - char b[BDEVNAME_SIZE]; + if (unlikely(!blk_crypto_bio_prep(&bio))) + return; - might_sleep(); + if (!bio->bi_bdev->bd_has_submit_bio) { + blk_mq_submit_bio(bio); + } else if (likely(bio_queue_enter(bio) == 0)) { + struct gendisk *disk = bio->bi_bdev->bd_disk; - q = bio->bi_disk->queue; - if (unlikely(!q)) { - printk(KERN_ERR - "generic_make_request: Trying to access " - "nonexistent block-device %s (%Lu)\n", - bio_devname(bio, b), (long long)bio->bi_iter.bi_sector); - goto end_io; + disk->fops->submit_bio(bio); + blk_queue_exit(disk->queue); + } +} + +/* + * The loop in this function may be a bit non-obvious, and so deserves some + * explanation: + * + * - Before entering the loop, bio->bi_next is NULL (as all callers ensure + * that), so we have a list with a single bio. + * - We pretend that we have just taken it off a longer list, so we assign + * bio_list to a pointer to the bio_list_on_stack, thus initialising the + * bio_list of new bios to be added. ->submit_bio() may indeed add some more + * bios through a recursive call to submit_bio_noacct. If it did, we find a + * non-NULL value in bio_list and re-enter the loop from the top. + * - In this case we really did just take the bio of the top of the list (no + * pretending) and so remove it from bio_list, and call into ->submit_bio() + * again. 
+ * + * bio_list_on_stack[0] contains bios submitted by the current ->submit_bio. + * bio_list_on_stack[1] contains bios that were submitted before the current + * ->submit_bio, but that haven't been processed yet. + */ +static void __submit_bio_noacct(struct bio *bio) +{ + struct bio_list bio_list_on_stack[2]; + + BUG_ON(bio->bi_next); + + bio_list_init(&bio_list_on_stack[0]); + current->bio_list = bio_list_on_stack; + + do { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct bio_list lower, same; + + /* + * Create a fresh bio_list for all subordinate requests. + */ + bio_list_on_stack[1] = bio_list_on_stack[0]; + bio_list_init(&bio_list_on_stack[0]); + + __submit_bio(bio); + + /* + * Sort new bios into those for a lower level and those for the + * same level. + */ + bio_list_init(&lower); + bio_list_init(&same); + while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) + if (q == bdev_get_queue(bio->bi_bdev)) + bio_list_add(&same, bio); + else + bio_list_add(&lower, bio); + + /* + * Now assemble so we handle the lowest level first. + */ + bio_list_merge(&bio_list_on_stack[0], &lower); + bio_list_merge(&bio_list_on_stack[0], &same); + bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); + } while ((bio = bio_list_pop(&bio_list_on_stack[0]))); + + current->bio_list = NULL; +} + +static void __submit_bio_noacct_mq(struct bio *bio) +{ + struct bio_list bio_list[2] = { }; + + current->bio_list = bio_list; + + do { + __submit_bio(bio); + } while ((bio = bio_list_pop(&bio_list[0]))); + + current->bio_list = NULL; +} + +void submit_bio_noacct_nocheck(struct bio *bio) +{ + blk_cgroup_bio_start(bio); + blkcg_bio_issue_init(bio); + + if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { + trace_block_bio_queue(bio); + /* + * Now that enqueuing has been traced, we need to trace + * completion as well. 
+ */ + bio_set_flag(bio, BIO_TRACE_COMPLETION); } /* + * We only want one ->submit_bio to be active at a time, else stack + * usage with stacked devices could be a problem. Use current->bio_list + * to collect a list of requests submited by a ->submit_bio method while + * it is active, and then process them after it returned. + */ + if (current->bio_list) + bio_list_add(&current->bio_list[0], bio); + else if (!bio->bi_bdev->bd_has_submit_bio) + __submit_bio_noacct_mq(bio); + else + __submit_bio_noacct(bio); +} + +/** + * submit_bio_noacct - re-submit a bio to the block device layer for I/O + * @bio: The bio describing the location in memory and on the device. + * + * This is a version of submit_bio() that shall only be used for I/O that is + * resubmitted to lower level drivers by stacking block drivers. All file + * systems and other upper level users of the block layer should use + * submit_bio() instead. + */ +void submit_bio_noacct(struct bio *bio) +{ + struct block_device *bdev = bio->bi_bdev; + struct request_queue *q = bdev_get_queue(bdev); + blk_status_t status = BLK_STS_IOERR; + + might_sleep(); + + /* * For a REQ_NOWAIT based request, return -EOPNOTSUPP - * if queue is not a request based queue. + * if queue does not support NOWAIT. */ - if ((bio->bi_opf & REQ_NOWAIT) && !queue_is_mq(q)) + if ((bio->bi_opf & REQ_NOWAIT) && !bdev_nowait(bdev)) goto not_supported; if (should_fail_bio(bio)) goto end_io; - - if (bio->bi_partno) { - if (unlikely(blk_partition_remap(bio))) - goto end_io; - } else { - if (unlikely(bio_check_ro(bio, &bio->bi_disk->part0))) + bio_check_ro(bio); + if (!bio_flagged(bio, BIO_REMAPPED)) { + if (unlikely(bio_check_eod(bio))) goto end_io; - if (unlikely(bio_check_eod(bio, get_capacity(bio->bi_disk)))) + if (bdev->bd_partno && unlikely(blk_partition_remap(bio))) goto end_io; } /* - * Filter flush bio's early so that make_request based - * drivers without flush support don't have to worry - * about them.
+ * Filter flush bio's early so that bio based drivers without flush + * support don't have to worry about them. */ - if (op_is_flush(bio->bi_opf) && - !test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { - bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); - if (!nr_sectors) { - status = BLK_STS_OK; + if (op_is_flush(bio->bi_opf)) { + if (WARN_ON_ONCE(bio_op(bio) != REQ_OP_WRITE && + bio_op(bio) != REQ_OP_ZONE_APPEND)) goto end_io; + if (!test_bit(QUEUE_FLAG_WC, &q->queue_flags)) { + bio->bi_opf &= ~(REQ_PREFLUSH | REQ_FUA); + if (!bio_sectors(bio)) { + status = BLK_STS_OK; + goto end_io; + } } } if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) - bio->bi_opf &= ~REQ_HIPRI; + bio_clear_polled(bio); switch (bio_op(bio)) { + case REQ_OP_READ: + case REQ_OP_WRITE: + break; + case REQ_OP_FLUSH: + /* + * REQ_OP_FLUSH can't be submitted through bios, it is only + * synthetized in struct request by the flush state machine. + */ + goto not_supported; case REQ_OP_DISCARD: - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) goto not_supported; break; case REQ_OP_SECURE_ERASE: - if (!blk_queue_secure_erase(q)) - goto not_supported; - break; - case REQ_OP_WRITE_SAME: - if (!q->limits.max_write_same_sectors) + if (!bdev_max_secure_erase_sectors(bdev)) goto not_supported; break; case REQ_OP_ZONE_APPEND: @@ -1026,186 +795,52 @@ generic_make_request_checks(struct bio *bio) if (status != BLK_STS_OK) goto end_io; break; + case REQ_OP_WRITE_ZEROES: + if (!q->limits.max_write_zeroes_sectors) + goto not_supported; + break; case REQ_OP_ZONE_RESET: case REQ_OP_ZONE_OPEN: case REQ_OP_ZONE_CLOSE: case REQ_OP_ZONE_FINISH: - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bio->bi_bdev)) goto not_supported; break; case REQ_OP_ZONE_RESET_ALL: - if (!blk_queue_is_zoned(q) || !blk_queue_zone_resetall(q)) - goto not_supported; - break; - case REQ_OP_WRITE_ZEROES: - if (!q->limits.max_write_zeroes_sectors) + if (!bdev_is_zoned(bio->bi_bdev) || !blk_queue_zone_resetall(q)) goto not_supported; 
break; + case REQ_OP_DRV_IN: + case REQ_OP_DRV_OUT: + /* + * Driver private operations are only used with passthrough + * requests. + */ + fallthrough; default: - break; + goto not_supported; } - /* - * Various block parts want %current->io_context, so allocate it up - * front rather than dealing with lots of pain to allocate it only - * where needed. This may fail and the block layer knows how to live - * with it. - */ - if (unlikely(!current->io_context)) - create_task_io_context(current, GFP_ATOMIC, q->node); - - if (!blkcg_bio_issue_check(q, bio)) - return false; - - if (!bio_flagged(bio, BIO_TRACE_COMPLETION)) { - trace_block_bio_queue(q, bio); - /* Now that enqueuing has been traced, we need to trace - * completion as well. - */ - bio_set_flag(bio, BIO_TRACE_COMPLETION); - } - return true; + if (blk_throtl_bio(bio)) + return; + submit_bio_noacct_nocheck(bio); + return; not_supported: status = BLK_STS_NOTSUPP; end_io: bio->bi_status = status; bio_endio(bio); - return false; -} - -static blk_qc_t do_make_request(struct bio *bio) -{ - struct request_queue *q = bio->bi_disk->queue; - blk_qc_t ret = BLK_QC_T_NONE; - - if (blk_crypto_bio_prep(&bio)) { - if (!q->make_request_fn) - return blk_mq_make_request(q, bio); - ret = q->make_request_fn(q, bio); - } - blk_queue_exit(q); - return ret; -} - -/** - * generic_make_request - re-submit a bio to the block device layer for I/O - * @bio: The bio describing the location in memory and on the device. - * - * This is a version of submit_bio() that shall only be used for I/O that is - * resubmitted to lower level drivers by stacking block drivers. All file - * systems and other upper level users of the block layer should use - * submit_bio() instead. - */ -blk_qc_t generic_make_request(struct bio *bio) -{ - /* - * bio_list_on_stack[0] contains bios submitted by the current - * make_request_fn. 
- * bio_list_on_stack[1] contains bios that were submitted before - * the current make_request_fn, but that haven't been processed - * yet. - */ - struct bio_list bio_list_on_stack[2]; - blk_qc_t ret = BLK_QC_T_NONE; - - if (!generic_make_request_checks(bio)) - goto out; - - /* - * We only want one ->make_request_fn to be active at a time, else - * stack usage with stacked devices could be a problem. So use - * current->bio_list to keep a list of requests submited by a - * make_request_fn function. current->bio_list is also used as a - * flag to say if generic_make_request is currently active in this - * task or not. If it is NULL, then no make_request is active. If - * it is non-NULL, then a make_request is active, and new requests - * should be added at the tail - */ - if (current->bio_list) { - bio_list_add(&current->bio_list[0], bio); - goto out; - } - - /* following loop may be a bit non-obvious, and so deserves some - * explanation. - * Before entering the loop, bio->bi_next is NULL (as all callers - * ensure that) so we have a list with a single bio. - * We pretend that we have just taken it off a longer list, so - * we assign bio_list to a pointer to the bio_list_on_stack, - * thus initialising the bio_list of new bios to be - * added. ->make_request() may indeed add some more bios - * through a recursive call to generic_make_request. If it - * did, we find a non-NULL value in bio_list and re-enter the loop - * from the top. In this case we really did just take the bio - * of the top of the list (no pretending) and so remove it from - * bio_list, and call into ->make_request() again.
- */ - BUG_ON(bio->bi_next); - bio_list_init(&bio_list_on_stack[0]); - current->bio_list = bio_list_on_stack; - do { - struct request_queue *q = bio->bi_disk->queue; - - if (likely(bio_queue_enter(bio) == 0)) { - struct bio_list lower, same; - - /* Create a fresh bio_list for all subordinate requests */ - bio_list_on_stack[1] = bio_list_on_stack[0]; - bio_list_init(&bio_list_on_stack[0]); - ret = do_make_request(bio); - - /* sort new bios into those for a lower level - * and those for the same level - */ - bio_list_init(&lower); - bio_list_init(&same); - while ((bio = bio_list_pop(&bio_list_on_stack[0])) != NULL) - if (q == bio->bi_disk->queue) - bio_list_add(&same, bio); - else - bio_list_add(&lower, bio); - /* now assemble so we handle the lowest level first */ - bio_list_merge(&bio_list_on_stack[0], &lower); - bio_list_merge(&bio_list_on_stack[0], &same); - bio_list_merge(&bio_list_on_stack[0], &bio_list_on_stack[1]); - } - bio = bio_list_pop(&bio_list_on_stack[0]); - } while (bio); - current->bio_list = NULL; /* deactivate */ - -out: - return ret; } -EXPORT_SYMBOL(generic_make_request); +EXPORT_SYMBOL(submit_bio_noacct); -/** - * direct_make_request - hand a buffer directly to its device driver for I/O - * @bio: The bio describing the location in memory and on the device. - * - * This function behaves like generic_make_request(), but does not protect - * against recursion. Must only be used if the called driver is known - * to be blk-mq based. - */ -blk_qc_t direct_make_request(struct bio *bio) +static void bio_set_ioprio(struct bio *bio) { - struct request_queue *q = bio->bi_disk->queue; - - if (WARN_ON_ONCE(q->make_request_fn)) { - bio_io_error(bio); - return BLK_QC_T_NONE; - } - if (!generic_make_request_checks(bio)) - return BLK_QC_T_NONE; - if (unlikely(bio_queue_enter(bio))) - return BLK_QC_T_NONE; - if (!blk_crypto_bio_prep(&bio)) { - blk_queue_exit(q); - return BLK_QC_T_NONE; - } - return blk_mq_make_request(q, bio); + /* Nobody set ioprio so far? 
Initialize it based on task's nice value */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) == IOPRIO_CLASS_NONE) + bio->bi_ioprio = get_current_ioprio(); + blkcg_set_ioprio(bio); } -EXPORT_SYMBOL_GPL(direct_make_request); /** * submit_bio - submit a bio to the block device layer for I/O @@ -1213,429 +848,189 @@ EXPORT_SYMBOL_GPL(direct_make_request); * * submit_bio() is used to submit I/O requests to block devices. It is passed a * fully set up &struct bio that describes the I/O that needs to be done. The - * bio will be send to the device described by the bi_disk and bi_partno fields. + * bio will be send to the device described by the bi_bdev field. * * The success/failure status of the request, along with notification of * completion, is delivered asynchronously through the ->bi_end_io() callback - * in @bio. The bio must NOT be touched by thecaller until ->bi_end_io() has + * in @bio. The bio must NOT be touched by the caller until ->bi_end_io() has * been called. */ -blk_qc_t submit_bio(struct bio *bio) +void submit_bio(struct bio *bio) { - if (blkcg_punt_bio_submit(bio)) - return BLK_QC_T_NONE; - - /* - * If it's a regular read/write or a barrier with data attached, - * go through the normal accounting stuff before submission. - */ - if (bio_has_data(bio)) { - unsigned int count; - - if (unlikely(bio_op(bio) == REQ_OP_WRITE_SAME)) - count = queue_logical_block_size(bio->bi_disk->queue) >> 9; - else - count = bio_sectors(bio); - - if (op_is_write(bio_op(bio))) { - count_vm_events(PGPGOUT, count); - } else { - task_io_account_read(bio->bi_iter.bi_size); - count_vm_events(PGPGIN, count); - } - - if (unlikely(block_dump)) { - char b[BDEVNAME_SIZE]; - printk(KERN_DEBUG "%s(%d): %s block %Lu on %s (%u sectors)\n", - current->comm, task_pid_nr(current), - op_is_write(bio_op(bio)) ? 
"WRITE" : "READ", - (unsigned long long)bio->bi_iter.bi_sector, - bio_devname(bio, b), count); - } - } - - /* - * If we're reading data that is part of the userspace workingset, count - * submission time as memory stall. When the device is congested, or - * the submitting cgroup IO-throttled, submission can be a significant - * part of overall IO time. - */ - if (unlikely(bio_op(bio) == REQ_OP_READ && - bio_flagged(bio, BIO_WORKINGSET))) { - unsigned long pflags; - blk_qc_t ret; - - psi_memstall_enter(&pflags); - ret = generic_make_request(bio); - psi_memstall_leave(&pflags); - - return ret; + if (bio_op(bio) == REQ_OP_READ) { + task_io_account_read(bio->bi_iter.bi_size); + count_vm_events(PGPGIN, bio_sectors(bio)); + } else if (bio_op(bio) == REQ_OP_WRITE) { + count_vm_events(PGPGOUT, bio_sectors(bio)); } - return generic_make_request(bio); + bio_set_ioprio(bio); + submit_bio_noacct(bio); } EXPORT_SYMBOL(submit_bio); /** - * blk_cloned_rq_check_limits - Helper function to check a cloned request - * for the new queue limits - * @q: the queue - * @rq: the request being checked + * bio_poll - poll for BIO completions + * @bio: bio to poll for + * @iob: batches of IO + * @flags: BLK_POLL_* flags that control the behavior * - * Description: - * @rq may have been made based on weaker limitations of upper-level queues - * in request stacking drivers, and it may violate the limitation of @q. - * Since the block layer and the underlying device driver trust @rq - * after it is inserted to @q, it should be checked against @q before - * the insertion using this generic function. + * Poll for completions on queue associated with the bio. Returns number of + * completed entries found. * - * Request stacking drivers like request-based dm may change the queue - * limits when retrying requests on other queues. Those requests need - * to be checked against the new queue limits again during dispatch. 
- */ -static int blk_cloned_rq_check_limits(struct request_queue *q, - struct request *rq) -{ - if (blk_rq_sectors(rq) > blk_queue_get_max_sectors(q, req_op(rq))) { - printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", - __func__, blk_rq_sectors(rq), - blk_queue_get_max_sectors(q, req_op(rq))); - return -EIO; - } - - /* - * queue's settings related to segment counting like q->bounce_pfn - * may differ from that of other stacking queues. - * Recalculate it to check the request correctly on this queue's - * limitation. - */ - rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); - return -EIO; - } - - return 0; -} - -/** - * blk_insert_cloned_request - Helper for stacking drivers to submit a request - * @q: the queue to submit the request - * @rq: the request being queued + * Note: the caller must either be the context that submitted @bio, or + * be in a RCU critical section to prevent freeing of @bio. 
*/ -blk_status_t blk_insert_cloned_request(struct request_queue *q, struct request *rq) +int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - if (blk_cloned_rq_check_limits(q, rq)) - return BLK_STS_IOERR; + blk_qc_t cookie = READ_ONCE(bio->bi_cookie); + struct block_device *bdev; + struct request_queue *q; + int ret = 0; - if (rq->rq_disk && - should_fail_request(&rq->rq_disk->part0, blk_rq_bytes(rq))) - return BLK_STS_IOERR; + bdev = READ_ONCE(bio->bi_bdev); + if (!bdev) + return 0; - if (blk_crypto_insert_cloned_request(rq)) - return BLK_STS_IOERR; + q = bdev_get_queue(bdev); + if (cookie == BLK_QC_T_NONE || + !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + return 0; - if (blk_queue_io_stat(q)) - blk_account_io_start(rq); + /* + * As the requests that require a zone lock are not plugged in the + * first place, directly accessing the plug instead of using + * blk_mq_plug() should not have any consequences during flushing for + * zoned devices. + */ + blk_flush_plug(current->plug, false); /* - * Since we have a scheduler attached on the top device, - * bypass a potential scheduler on the bottom device for - * insert. + * We need to be able to enter a frozen queue, similar to how + * timeouts also need to do that. If that is blocked, then we can + * have pending IO when a queue freeze is started, and then the + * wait for the freeze to finish will wait for polled requests to + * timeout as the poller is preventer from entering the queue and + * completing them. As long as we prevent new IO from being queued, + * that should be all that matters. 
*/ - return blk_mq_request_issue_directly(rq, true); + if (!percpu_ref_tryget(&q->q_usage_counter)) + return 0; + if (queue_is_mq(q)) { + ret = blk_mq_poll(q, cookie, iob, flags); + } else { + struct gendisk *disk = q->disk; + + if (disk && disk->fops->poll_bio) + ret = disk->fops->poll_bio(bio, iob, flags); + } + blk_queue_exit(q); + return ret; } -EXPORT_SYMBOL_GPL(blk_insert_cloned_request); +EXPORT_SYMBOL_GPL(bio_poll); -/** - * blk_rq_err_bytes - determine number of bytes till the next failure boundary - * @rq: request to examine - * - * Description: - * A request could be merge of IOs which require different failure - * handling. This function determines the number of bytes which - * can be failed from the beginning of the request without - * crossing into area which need to be retried further. - * - * Return: - * The number of bytes to fail. +/* + * Helper to implement file_operations.iopoll. Requires the bio to be stored + * in iocb->private, and cleared before freeing the bio. */ -unsigned int blk_rq_err_bytes(const struct request *rq) +int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, + unsigned int flags) { - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; - unsigned int bytes = 0; struct bio *bio; - - if (!(rq->rq_flags & RQF_MIXED_MERGE)) - return blk_rq_bytes(rq); + int ret = 0; /* - * Currently the only 'mixing' which can happen is between - * different fastfail types. We can safely fail portions - * which have all the failfast bits that the first one has - - * the ones which are at least as eager to fail as the first - * one. + * Note: the bio cache only uses SLAB_TYPESAFE_BY_RCU, so bio can + * point to a freshly allocated bio at this point. If that happens + * we have a few cases to consider: + * + * 1) the bio is beeing initialized and bi_bdev is NULL. We can just + * simply nothing in this case + * 2) the bio points to a not poll enabled device. 
bio_poll will catch + * this and return 0 + * 3) the bio points to a poll capable device, including but not + * limited to the one that the original bio pointed to. In this + * case we will call into the actual poll method and poll for I/O, + * even if we don't need to, but it won't cause harm either. + * + * For cases 2) and 3) above the RCU grace period ensures that bi_bdev + * is still allocated. Because partitions hold a reference to the whole + * device bdev and thus disk, the disk is also still valid. Grabbing + * a reference to the queue in bio_poll() ensures the hctxs and requests + * are still valid as well. */ - for (bio = rq->bio; bio; bio = bio->bi_next) { - if ((bio->bi_opf & ff) != ff) - break; - bytes += bio->bi_iter.bi_size; - } + rcu_read_lock(); + bio = READ_ONCE(kiocb->private); + if (bio) + ret = bio_poll(bio, iob, flags); + rcu_read_unlock(); - /* this could lead to infinite loop */ - BUG_ON(blk_rq_bytes(rq) && !bytes); - return bytes; + return ret; } -EXPORT_SYMBOL_GPL(blk_rq_err_bytes); +EXPORT_SYMBOL_GPL(iocb_bio_iopoll); -static void update_io_ticks(struct hd_struct *part, unsigned long now, bool end) +void update_io_ticks(struct block_device *part, unsigned long now, bool end) { unsigned long stamp; again: - stamp = READ_ONCE(part->stamp); - if (unlikely(stamp != now)) { - if (likely(cmpxchg(&part->stamp, stamp, now) == stamp)) + stamp = READ_ONCE(part->bd_stamp); + if (unlikely(time_after(now, stamp))) { + if (likely(try_cmpxchg(&part->bd_stamp, &stamp, now))) __part_stat_add(part, io_ticks, end ? 
now - stamp : 1); } - if (part->partno) { - part = &part_to_disk(part)->part0; + if (part->bd_partno) { + part = bdev_whole(part); goto again; } } -static void blk_account_io_completion(struct request *req, unsigned int bytes) -{ - if (req->part && blk_do_io_stat(req)) { - const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; - - part_stat_lock(); - part = req->part; - part_stat_add(part, sectors[sgrp], bytes >> 9); - part_stat_unlock(); - } -} - -void blk_account_io_done(struct request *req, u64 now) +unsigned long bdev_start_io_acct(struct block_device *bdev, enum req_op op, + unsigned long start_time) { - /* - * Account IO completion. flush_rq isn't accounted as a - * normal IO on queueing nor completion. Accounting the - * containing request is enough. - */ - if (req->part && blk_do_io_stat(req) && - !(req->rq_flags & RQF_FLUSH_SEQ)) { - const int sgrp = op_stat_group(req_op(req)); - struct hd_struct *part; - - part_stat_lock(); - part = req->part; - - update_io_ticks(part, jiffies, true); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, nsecs[sgrp], now - req->start_time_ns); - part_stat_unlock(); - - hd_struct_put(part); - } -} - -void blk_account_io_start(struct request *rq) -{ - if (!blk_do_io_stat(rq)) - return; - - rq->part = disk_map_sector_rcu(rq->rq_disk, blk_rq_pos(rq)); - part_stat_lock(); - update_io_ticks(rq->part, jiffies, false); + update_io_ticks(bdev, start_time, false); + part_stat_local_inc(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); + + return start_time; } +EXPORT_SYMBOL(bdev_start_io_acct); -unsigned long disk_start_io_acct(struct gendisk *disk, unsigned int sectors, - unsigned int op) +/** + * bio_start_io_acct - start I/O accounting for bio based drivers + * @bio: bio to start account for + * + * Returns the start time that should be passed back to bio_end_io_acct(). 
+ */ +unsigned long bio_start_io_acct(struct bio *bio) { - struct hd_struct *part = &disk->part0; - const int sgrp = op_stat_group(op); - unsigned long now = READ_ONCE(jiffies); - - part_stat_lock(); - update_io_ticks(part, now, false); - part_stat_inc(part, ios[sgrp]); - part_stat_add(part, sectors[sgrp], sectors); - part_stat_local_inc(part, in_flight[op_is_write(op)]); - part_stat_unlock(); - - return now; + return bdev_start_io_acct(bio->bi_bdev, bio_op(bio), jiffies); } -EXPORT_SYMBOL(disk_start_io_acct); +EXPORT_SYMBOL_GPL(bio_start_io_acct); -void disk_end_io_acct(struct gendisk *disk, unsigned int op, - unsigned long start_time) +void bdev_end_io_acct(struct block_device *bdev, enum req_op op, + unsigned int sectors, unsigned long start_time) { - struct hd_struct *part = &disk->part0; const int sgrp = op_stat_group(op); unsigned long now = READ_ONCE(jiffies); unsigned long duration = now - start_time; part_stat_lock(); - update_io_ticks(part, now, true); - part_stat_add(part, nsecs[sgrp], jiffies_to_nsecs(duration)); - part_stat_local_dec(part, in_flight[op_is_write(op)]); + update_io_ticks(bdev, now, true); + part_stat_inc(bdev, ios[sgrp]); + part_stat_add(bdev, sectors[sgrp], sectors); + part_stat_add(bdev, nsecs[sgrp], jiffies_to_nsecs(duration)); + part_stat_local_dec(bdev, in_flight[op_is_write(op)]); part_stat_unlock(); } -EXPORT_SYMBOL(disk_end_io_acct); - -/* - * Steal bios from a request and add them to a bio list. - * The request must not have been partially completed before. 
- */ -void blk_steal_bios(struct bio_list *list, struct request *rq) -{ - if (rq->bio) { - if (list->tail) - list->tail->bi_next = rq->bio; - else - list->head = rq->bio; - list->tail = rq->biotail; - - rq->bio = NULL; - rq->biotail = NULL; - } - - rq->__data_len = 0; -} -EXPORT_SYMBOL_GPL(blk_steal_bios); - -/** - * blk_update_request - Special helper function for request stacking drivers - * @req: the request being processed - * @error: block status code - * @nr_bytes: number of bytes to complete @req - * - * Description: - * Ends I/O on a number of bytes attached to @req, but doesn't complete - * the request structure even if @req doesn't have leftover. - * If @req has leftover, sets it up for the next range of segments. - * - * This special helper function is only for request stacking drivers - * (e.g. request-based dm) so that they can handle partial completion. - * Actual device drivers should use blk_mq_end_request instead. - * - * Passing the result of blk_rq_bytes() as @nr_bytes guarantees - * %false return from this function. - * - * Note: - * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in both - * blk_rq_bytes() and in blk_update_request(). 
- * - * Return: - * %false - this request doesn't have any more data - * %true - this request has more data - **/ -bool blk_update_request(struct request *req, blk_status_t error, - unsigned int nr_bytes) -{ - int total_bytes; - - trace_block_rq_complete(req, blk_status_to_errno(error), nr_bytes); - - if (!req->bio) - return false; - -#ifdef CONFIG_BLK_DEV_INTEGRITY - if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && - error == BLK_STS_OK) - req->q->integrity.profile->complete_fn(req, nr_bytes); -#endif - - if (unlikely(error && !blk_rq_is_passthrough(req) && - !(req->rq_flags & RQF_QUIET))) - print_req_error(req, error, __func__); - - blk_account_io_completion(req, nr_bytes); - - total_bytes = 0; - while (req->bio) { - struct bio *bio = req->bio; - unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); - - if (bio_bytes == bio->bi_iter.bi_size) - req->bio = bio->bi_next; - - /* Completion has already been traced */ - bio_clear_flag(bio, BIO_TRACE_COMPLETION); - req_bio_endio(req, bio, bio_bytes, error); - - total_bytes += bio_bytes; - nr_bytes -= bio_bytes; - - if (!nr_bytes) - break; - } - - /* - * completely done - */ - if (!req->bio) { - /* - * Reset counters so that the request stacking driver - * can find how many bytes remain in the request - * later. - */ - req->__data_len = 0; - return false; - } - - req->__data_len -= total_bytes; - - /* update sector only for requests with clear definition of sector */ - if (!blk_rq_is_passthrough(req)) - req->__sector += total_bytes >> 9; - - /* mixed attributes always follow the first bio */ - if (req->rq_flags & RQF_MIXED_MERGE) { - req->cmd_flags &= ~REQ_FAILFAST_MASK; - req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; - } - - if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { - /* - * If total number of sectors is less than the first segment - * size, something has gone terribly wrong. 
- */ - if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { - blk_dump_rq_flags(req, "request botched"); - req->__data_len = blk_rq_cur_bytes(req); - } - - /* recalculate the number of segments */ - req->nr_phys_segments = blk_recalc_rq_segments(req); - } - - return true; -} -EXPORT_SYMBOL_GPL(blk_update_request); +EXPORT_SYMBOL(bdev_end_io_acct); -#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE -/** - * rq_flush_dcache_pages - Helper function to flush all pages in a request - * @rq: the request to be flushed - * - * Description: - * Flush all pages in @rq. - */ -void rq_flush_dcache_pages(struct request *rq) +void bio_end_io_acct_remapped(struct bio *bio, unsigned long start_time, + struct block_device *orig_bdev) { - struct req_iterator iter; - struct bio_vec bvec; - - rq_for_each_segment(bvec, rq, iter) - flush_dcache_page(bvec.bv_page); + bdev_end_io_acct(orig_bdev, bio_op(bio), bio_sectors(bio), start_time); } -EXPORT_SYMBOL_GPL(rq_flush_dcache_pages); -#endif +EXPORT_SYMBOL_GPL(bio_end_io_acct_remapped); /** * blk_lld_busy - Check if underlying low-level drivers of a device are busy @@ -1665,91 +1060,6 @@ int blk_lld_busy(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_lld_busy); -/** - * blk_rq_unprep_clone - Helper function to free all bios in a cloned request - * @rq: the clone request to be cleaned up - * - * Description: - * Free all bios in @rq for a cloned request. - */ -void blk_rq_unprep_clone(struct request *rq) -{ - struct bio *bio; - - while ((bio = rq->bio) != NULL) { - rq->bio = bio->bi_next; - - bio_put(bio); - } -} -EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); - -/** - * blk_rq_prep_clone - Helper function to setup clone request - * @rq: the request to be setup - * @rq_src: original request to be cloned - * @bs: bio_set that bios for clone are allocated from - * @gfp_mask: memory allocation mask for bio - * @bio_ctr: setup function to be called for each clone bio. - * Returns %0 for success, non %0 for failure. 
- * @data: private data to be passed to @bio_ctr - * - * Description: - * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. - * Also, pages which the original bios are pointing to are not copied - * and the cloned bios just point same pages. - * So cloned bios must be completed before original bios, which means - * the caller must complete @rq before @rq_src. - */ -int blk_rq_prep_clone(struct request *rq, struct request *rq_src, - struct bio_set *bs, gfp_t gfp_mask, - int (*bio_ctr)(struct bio *, struct bio *, void *), - void *data) -{ - struct bio *bio, *bio_src; - - if (!bs) - bs = &fs_bio_set; - - __rq_for_each_bio(bio_src, rq_src) { - bio = bio_clone_fast(bio_src, gfp_mask, bs); - if (!bio) - goto free_and_out; - - if (bio_ctr && bio_ctr(bio, bio_src, data)) - goto free_and_out; - - if (rq->bio) { - rq->biotail->bi_next = bio; - rq->biotail = bio; - } else - rq->bio = rq->biotail = bio; - } - - /* Copy attributes of the original request to the clone request. */ - rq->__sector = blk_rq_pos(rq_src); - rq->__data_len = blk_rq_bytes(rq_src); - if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { - rq->rq_flags |= RQF_SPECIAL_PAYLOAD; - rq->special_vec = rq_src->special_vec; - } - rq->nr_phys_segments = rq_src->nr_phys_segments; - rq->ioprio = rq_src->ioprio; - - if (rq->bio) - blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask); - - return 0; - -free_and_out: - if (bio) - bio_put(bio); - blk_rq_unprep_clone(rq); - - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_rq_prep_clone); - int kblockd_schedule_work(struct work_struct *work) { return queue_work(kblockd_workqueue, work); @@ -1763,6 +1073,31 @@ int kblockd_mod_delayed_work_on(int cpu, struct delayed_work *dwork, } EXPORT_SYMBOL(kblockd_mod_delayed_work_on); +void blk_start_plug_nr_ios(struct blk_plug *plug, unsigned short nr_ios) +{ + struct task_struct *tsk = current; + + /* + * If this is a nested plug, don't actually assign it. 
+ */ + if (tsk->plug) + return; + + plug->mq_list = NULL; + plug->cached_rq = NULL; + plug->nr_ios = min_t(unsigned short, nr_ios, BLK_MAX_REQUEST_COUNT); + plug->rq_count = 0; + plug->multiple_queues = false; + plug->has_elevator = false; + INIT_LIST_HEAD(&plug->cb_list); + + /* + * Store ordering should not be needed here, since a potential + * preempt will imply a full memory barrier + */ + tsk->plug = plug; +} + /** * blk_start_plug - initialize blk_plug and track it inside the task_struct * @plug: The &struct blk_plug that needs to be initialized @@ -1788,24 +1123,7 @@ EXPORT_SYMBOL(kblockd_mod_delayed_work_on); */ void blk_start_plug(struct blk_plug *plug) { - struct task_struct *tsk = current; - - /* - * If this is a nested plug, don't actually assign it. - */ - if (tsk->plug) - return; - - INIT_LIST_HEAD(&plug->mq_list); - INIT_LIST_HEAD(&plug->cb_list); - plug->rq_count = 0; - plug->multiple_queues = false; - - /* - * Store ordering should not be needed here, since a potential - * preempt will imply a full memory barrier - */ - tsk->plug = plug; + blk_start_plug_nr_ios(plug, 1); } EXPORT_SYMBOL(blk_start_plug); @@ -1851,12 +1169,19 @@ struct blk_plug_cb *blk_check_plugged(blk_plug_cb_fn unplug, void *data, } EXPORT_SYMBOL(blk_check_plugged); -void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) +void __blk_flush_plug(struct blk_plug *plug, bool from_schedule) { - flush_plug_callbacks(plug, from_schedule); - - if (!list_empty(&plug->mq_list)) - blk_mq_flush_plug_list(plug, from_schedule); + if (!list_empty(&plug->cb_list)) + flush_plug_callbacks(plug, from_schedule); + blk_mq_flush_plug_list(plug, from_schedule); + /* + * Unconditionally flush out cached requests, even if the unplug + * event came from schedule. Since we know hold references to the + * queue for cached requests, we don't want a blocked task holding + * up a queue freeze/quiesce event. 
+ */ + if (unlikely(!rq_list_empty(plug->cached_rq))) + blk_mq_free_plug_rqs(plug); } /** @@ -1871,11 +1196,10 @@ void blk_flush_plug_list(struct blk_plug *plug, bool from_schedule) */ void blk_finish_plug(struct blk_plug *plug) { - if (plug != current->plug) - return; - blk_flush_plug_list(plug, false); - - current->plug = NULL; + if (plug == current->plug) { + __blk_flush_plug(plug, false); + current->plug = NULL; + } } EXPORT_SYMBOL(blk_finish_plug); @@ -1893,7 +1217,7 @@ EXPORT_SYMBOL_GPL(blk_io_schedule); int __init blk_dev_init(void) { - BUILD_BUG_ON(REQ_OP_LAST >= (1 << REQ_OP_BITS)); + BUILD_BUG_ON((__force u32)REQ_OP_LAST >= (1 << REQ_OP_BITS)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * sizeof_field(struct request, cmd_flags)); BUILD_BUG_ON(REQ_OP_BITS + REQ_FLAG_BITS > 8 * @@ -1908,9 +1232,7 @@ int __init blk_dev_init(void) blk_requestq_cachep = kmem_cache_create("request_queue", sizeof(struct request_queue), 0, SLAB_PANIC, NULL); -#ifdef CONFIG_DEBUG_FS blk_debugfs_root = debugfs_create_dir("block", NULL); -#endif return 0; } diff --git a/block/blk-crypto-fallback.c b/block/blk-crypto-fallback.c index 6e49688a2d80..e6468eab2681 100644 --- a/block/blk-crypto-fallback.c +++ b/block/blk-crypto-fallback.c @@ -10,15 +10,16 @@ #define pr_fmt(fmt) "blk-crypto-fallback: " fmt #include <crypto/skcipher.h> -#include <linux/blk-cgroup.h> #include <linux/blk-crypto.h> +#include <linux/blk-crypto-profile.h> #include <linux/blkdev.h> #include <linux/crypto.h> -#include <linux/keyslot-manager.h> #include <linux/mempool.h> #include <linux/module.h> #include <linux/random.h> +#include <linux/scatterlist.h> +#include "blk-cgroup.h" #include "blk-crypto-internal.h" static unsigned int num_prealloc_bounce_pg = 32; @@ -72,14 +73,15 @@ static mempool_t *bio_fallback_crypt_ctx_pool; static DEFINE_MUTEX(tfms_init_lock); static bool tfms_inited[BLK_ENCRYPTION_MODE_MAX]; -static struct blk_crypto_keyslot { +static struct blk_crypto_fallback_keyslot { enum blk_crypto_mode_num 
crypto_mode; struct crypto_skcipher *tfms[BLK_ENCRYPTION_MODE_MAX]; } *blk_crypto_keyslots; -static struct blk_keyslot_manager blk_crypto_ksm; +static struct blk_crypto_profile *blk_crypto_fallback_profile; static struct workqueue_struct *blk_crypto_wq; static mempool_t *blk_crypto_bounce_page_pool; +static struct bio_set crypto_bio_split; /* * This is the key we set when evicting a keyslot. This *should* be the all 0's @@ -87,9 +89,9 @@ static mempool_t *blk_crypto_bounce_page_pool; */ static u8 blank_key[BLK_CRYPTO_MAX_KEY_SIZE]; -static void blk_crypto_evict_keyslot(unsigned int slot) +static void blk_crypto_fallback_evict_keyslot(unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; enum blk_crypto_mode_num crypto_mode = slotp->crypto_mode; int err; @@ -102,45 +104,41 @@ static void blk_crypto_evict_keyslot(unsigned int slot) slotp->crypto_mode = BLK_ENCRYPTION_MODE_INVALID; } -static int blk_crypto_keyslot_program(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - unsigned int slot) +static int +blk_crypto_fallback_keyslot_program(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - struct blk_crypto_keyslot *slotp = &blk_crypto_keyslots[slot]; + struct blk_crypto_fallback_keyslot *slotp = &blk_crypto_keyslots[slot]; const enum blk_crypto_mode_num crypto_mode = key->crypto_cfg.crypto_mode; int err; if (crypto_mode != slotp->crypto_mode && slotp->crypto_mode != BLK_ENCRYPTION_MODE_INVALID) - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); slotp->crypto_mode = crypto_mode; err = crypto_skcipher_setkey(slotp->tfms[crypto_mode], key->raw, key->size); if (err) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return err; } return 0; } -static int blk_crypto_keyslot_evict(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - 
unsigned int slot) +static int blk_crypto_fallback_keyslot_evict(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + unsigned int slot) { - blk_crypto_evict_keyslot(slot); + blk_crypto_fallback_evict_keyslot(slot); return 0; } -/* - * The crypto API fallback KSM ops - only used for a bio when it specifies a - * blk_crypto_key that was not supported by the device's inline encryption - * hardware. - */ -static const struct blk_ksm_ll_ops blk_crypto_ksm_ll_ops = { - .keyslot_program = blk_crypto_keyslot_program, - .keyslot_evict = blk_crypto_keyslot_evict, +static const struct blk_crypto_ll_ops blk_crypto_fallback_ll_ops = { + .keyslot_program = blk_crypto_fallback_keyslot_program, + .keyslot_evict = blk_crypto_fallback_keyslot_evict, }; static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) @@ -154,23 +152,26 @@ static void blk_crypto_fallback_encrypt_endio(struct bio *enc_bio) src_bio->bi_status = enc_bio->bi_status; - bio_put(enc_bio); + bio_uninit(enc_bio); + kfree(enc_bio); bio_endio(src_bio); } -static struct bio *blk_crypto_clone_bio(struct bio *bio_src) +static struct bio *blk_crypto_fallback_clone_bio(struct bio *bio_src) { + unsigned int nr_segs = bio_segments(bio_src); struct bvec_iter iter; struct bio_vec bv; struct bio *bio; - bio = bio_alloc_bioset(GFP_NOIO, bio_segments(bio_src), NULL); + bio = bio_kmalloc(nr_segs, GFP_NOIO); if (!bio) return NULL; - bio->bi_disk = bio_src->bi_disk; - bio->bi_opf = bio_src->bi_opf; + bio_init(bio, bio_src->bi_bdev, bio->bi_inline_vecs, nr_segs, + bio_src->bi_opf); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -178,18 +179,18 @@ static struct bio *blk_crypto_clone_bio(struct bio *bio_src) bio->bi_io_vec[bio->bi_vcnt++] = bv; bio_clone_blkg_association(bio, bio_src); - 
blkcg_bio_issue_init(bio); return bio; } -static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, - struct skcipher_request **ciph_req_ret, - struct crypto_wait *wait) +static bool +blk_crypto_fallback_alloc_cipher_req(struct blk_crypto_keyslot *slot, + struct skcipher_request **ciph_req_ret, + struct crypto_wait *wait) { struct skcipher_request *ciph_req; - const struct blk_crypto_keyslot *slotp; - int keyslot_idx = blk_ksm_get_slot_idx(slot); + const struct blk_crypto_fallback_keyslot *slotp; + int keyslot_idx = blk_crypto_keyslot_index(slot); slotp = &blk_crypto_keyslots[keyslot_idx]; ciph_req = skcipher_request_alloc(slotp->tfms[slotp->crypto_mode], @@ -206,7 +207,7 @@ static bool blk_crypto_alloc_cipher_req(struct blk_ksm_keyslot *slot, return true; } -static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) +static bool blk_crypto_fallback_split_bio_if_needed(struct bio **bio_ptr) { struct bio *bio = *bio_ptr; unsigned int i = 0; @@ -216,19 +217,20 @@ static bool blk_crypto_split_bio_if_needed(struct bio **bio_ptr) bio_for_each_segment(bv, bio, iter) { num_sectors += bv.bv_len >> SECTOR_SHIFT; - if (++i == BIO_MAX_PAGES) + if (++i == BIO_MAX_VECS) break; } if (num_sectors < bio_sectors(bio)) { struct bio *split_bio; - split_bio = bio_split(bio, num_sectors, GFP_NOIO, NULL); + split_bio = bio_split(bio, num_sectors, GFP_NOIO, + &crypto_bio_split); if (!split_bio) { bio->bi_status = BLK_STS_RESOURCE; return false; } bio_chain(split_bio, bio); - generic_make_request(bio); + submit_bio_noacct(bio); *bio_ptr = split_bio; } @@ -260,7 +262,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) { struct bio *src_bio, *enc_bio; struct bio_crypt_ctx *bc; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; int data_unit_size; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); @@ -272,7 +274,7 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) blk_status_t blk_st; /* Split the bio if 
it's too big for single page bvec */ - if (!blk_crypto_split_bio_if_needed(bio_ptr)) + if (!blk_crypto_fallback_split_bio_if_needed(bio_ptr)) return false; src_bio = *bio_ptr; @@ -280,24 +282,25 @@ static bool blk_crypto_fallback_encrypt_bio(struct bio **bio_ptr) data_unit_size = bc->bc_key->crypto_cfg.data_unit_size; /* Allocate bounce bio for encryption */ - enc_bio = blk_crypto_clone_bio(src_bio); + enc_bio = blk_crypto_fallback_clone_bio(src_bio); if (!enc_bio) { src_bio->bi_status = BLK_STS_RESOURCE; return false; } /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { src_bio->bi_status = blk_st; goto out_put_enc_bio; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { src_bio->bi_status = BLK_STS_RESOURCE; goto out_release_keyslot; } @@ -358,11 +361,11 @@ out_free_bounce_pages: out_free_ciph_req: skcipher_request_free(ciph_req); out_release_keyslot: - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_put_enc_bio: if (enc_bio) - bio_put(enc_bio); - + bio_uninit(enc_bio); + kfree(enc_bio); return ret; } @@ -376,7 +379,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) container_of(work, struct bio_fallback_crypt_ctx, work); struct bio *bio = f_ctx->bio; struct bio_crypt_ctx *bc = &f_ctx->crypt_ctx; - struct blk_ksm_keyslot *slot; + struct blk_crypto_keyslot *slot; struct skcipher_request *ciph_req = NULL; DECLARE_CRYPTO_WAIT(wait); u64 curr_dun[BLK_CRYPTO_DUN_ARRAY_SIZE]; @@ -389,17 +392,18 @@ static void 
blk_crypto_fallback_decrypt_bio(struct work_struct *work) blk_status_t blk_st; /* - * Use the crypto API fallback keyslot manager to get a crypto_skcipher - * for the algorithm and key specified for this bio. + * Get a blk-crypto-fallback keyslot that contains a crypto_skcipher for + * this bio's algorithm and key. */ - blk_st = blk_ksm_get_slot_for_key(&blk_crypto_ksm, bc->bc_key, &slot); + blk_st = blk_crypto_get_keyslot(blk_crypto_fallback_profile, + bc->bc_key, &slot); if (blk_st != BLK_STS_OK) { bio->bi_status = blk_st; goto out_no_keyslot; } /* and then allocate an skcipher_request for it */ - if (!blk_crypto_alloc_cipher_req(slot, &ciph_req, &wait)) { + if (!blk_crypto_fallback_alloc_cipher_req(slot, &ciph_req, &wait)) { bio->bi_status = BLK_STS_RESOURCE; goto out; } @@ -430,7 +434,7 @@ static void blk_crypto_fallback_decrypt_bio(struct work_struct *work) out: skcipher_request_free(ciph_req); - blk_ksm_put_slot(slot); + blk_crypto_put_keyslot(slot); out_no_keyslot: mempool_free(f_ctx, bio_fallback_crypt_ctx_pool); bio_endio(bio); @@ -469,9 +473,9 @@ static void blk_crypto_fallback_decrypt_endio(struct bio *bio) * @bio_ptr: pointer to the bio to prepare * * If bio is doing a WRITE operation, this splits the bio into two parts if it's - * too big (see blk_crypto_split_bio_if_needed). It then allocates a bounce bio - * for the first part, encrypts it, and update bio_ptr to point to the bounce - * bio. + * too big (see blk_crypto_fallback_split_bio_if_needed()). It then allocates a + * bounce bio for the first part, encrypts it, and updates bio_ptr to point to + * the bounce bio. * * For a READ operation, we mark the bio for decryption by using bi_private and * bi_end_io. 
@@ -495,8 +499,8 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) return false; } - if (!blk_ksm_crypto_cfg_supported(&blk_crypto_ksm, - &bc->bc_key->crypto_cfg)) { + if (!__blk_crypto_cfg_supported(blk_crypto_fallback_profile, + &bc->bc_key->crypto_cfg)) { bio->bi_status = BLK_STS_NOTSUPP; return false; } @@ -522,7 +526,7 @@ bool blk_crypto_fallback_bio_prep(struct bio **bio_ptr) int blk_crypto_fallback_evict_key(const struct blk_crypto_key *key) { - return blk_ksm_evict_key(&blk_crypto_ksm, key); + return __blk_crypto_evict_key(blk_crypto_fallback_profile, key); } static bool blk_crypto_fallback_inited; @@ -534,26 +538,39 @@ static int blk_crypto_fallback_init(void) if (blk_crypto_fallback_inited) return 0; - prandom_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); + get_random_bytes(blank_key, BLK_CRYPTO_MAX_KEY_SIZE); - err = blk_ksm_init(&blk_crypto_ksm, blk_crypto_num_keyslots); + err = bioset_init(&crypto_bio_split, 64, 0, 0); if (err) goto out; + + /* Dynamic allocation is needed because of lockdep_register_key(). */ + blk_crypto_fallback_profile = + kzalloc(sizeof(*blk_crypto_fallback_profile), GFP_KERNEL); + if (!blk_crypto_fallback_profile) { + err = -ENOMEM; + goto fail_free_bioset; + } + + err = blk_crypto_profile_init(blk_crypto_fallback_profile, + blk_crypto_num_keyslots); + if (err) + goto fail_free_profile; err = -ENOMEM; - blk_crypto_ksm.ksm_ll_ops = blk_crypto_ksm_ll_ops; - blk_crypto_ksm.max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; + blk_crypto_fallback_profile->ll_ops = blk_crypto_fallback_ll_ops; + blk_crypto_fallback_profile->max_dun_bytes_supported = BLK_CRYPTO_MAX_IV_SIZE; /* All blk-crypto modes have a crypto API fallback. 
*/ for (i = 0; i < BLK_ENCRYPTION_MODE_MAX; i++) - blk_crypto_ksm.crypto_modes_supported[i] = 0xFFFFFFFF; - blk_crypto_ksm.crypto_modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; + blk_crypto_fallback_profile->modes_supported[i] = 0xFFFFFFFF; + blk_crypto_fallback_profile->modes_supported[BLK_ENCRYPTION_MODE_INVALID] = 0; blk_crypto_wq = alloc_workqueue("blk_crypto_wq", WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, num_online_cpus()); if (!blk_crypto_wq) - goto fail_free_ksm; + goto fail_destroy_profile; blk_crypto_keyslots = kcalloc(blk_crypto_num_keyslots, sizeof(blk_crypto_keyslots[0]), @@ -587,8 +604,12 @@ fail_free_keyslots: kfree(blk_crypto_keyslots); fail_free_wq: destroy_workqueue(blk_crypto_wq); -fail_free_ksm: - blk_ksm_destroy(&blk_crypto_ksm); +fail_destroy_profile: + blk_crypto_profile_destroy(blk_crypto_fallback_profile); +fail_free_profile: + kfree(blk_crypto_fallback_profile); +fail_free_bioset: + bioset_exit(&crypto_bio_split); out: return err; } @@ -600,7 +621,7 @@ out: int blk_crypto_fallback_start_using_mode(enum blk_crypto_mode_num mode_num) { const char *cipher_str = blk_crypto_modes[mode_num].cipher_str; - struct blk_crypto_keyslot *slotp; + struct blk_crypto_fallback_keyslot *slotp; unsigned int i; int err = 0; diff --git a/block/blk-crypto-internal.h b/block/blk-crypto-internal.h index d2b0f565d83c..93a141979694 100644 --- a/block/blk-crypto-internal.h +++ b/block/blk-crypto-internal.h @@ -7,10 +7,11 @@ #define __LINUX_BLK_CRYPTO_INTERNAL_H #include <linux/bio.h> -#include <linux/blkdev.h> +#include <linux/blk-mq.h> /* Represents a crypto mode supported by blk-crypto */ struct blk_crypto_mode { + const char *name; /* name of this mode, shown in sysfs */ const char *cipher_str; /* crypto API name (for fallback case) */ unsigned int keysize; /* key size in bytes */ unsigned int ivsize; /* iv size in bytes */ @@ -20,6 +21,10 @@ extern const struct blk_crypto_mode blk_crypto_modes[]; #ifdef CONFIG_BLK_INLINE_ENCRYPTION +int 
blk_crypto_sysfs_register(struct gendisk *disk); + +void blk_crypto_sysfs_unregister(struct gendisk *disk); + void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], unsigned int inc); @@ -60,8 +65,34 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return rq->crypt_ctx; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return rq->crypt_keyslot; +} + +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr); + +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot); + +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key); + +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg); + #else /* CONFIG_BLK_INLINE_ENCRYPTION */ +static inline int blk_crypto_sysfs_register(struct gendisk *disk) +{ + return 0; +} + +static inline void blk_crypto_sysfs_unregister(struct gendisk *disk) +{ +} + static inline bool bio_crypt_rq_ctx_compatible(struct request *rq, struct bio *bio) { @@ -93,6 +124,11 @@ static inline bool blk_crypto_rq_is_encrypted(struct request *rq) return false; } +static inline bool blk_crypto_rq_has_keyslot(struct request *rq) +{ + return false; +} + #endif /* CONFIG_BLK_INLINE_ENCRYPTION */ void __bio_crypt_advance(struct bio *bio, unsigned int bytes); @@ -127,14 +163,21 @@ static inline bool blk_crypto_bio_prep(struct bio **bio_ptr) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq); -static inline blk_status_t blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq); +static inline blk_status_t blk_crypto_rq_get_keyslot(struct request *rq) { if (blk_crypto_rq_is_encrypted(rq)) - return __blk_crypto_init_request(rq); + return __blk_crypto_rq_get_keyslot(rq); return BLK_STS_OK; } +void __blk_crypto_rq_put_keyslot(struct request *rq); +static 
inline void blk_crypto_rq_put_keyslot(struct request *rq) +{ + if (blk_crypto_rq_has_keyslot(rq)) + __blk_crypto_rq_put_keyslot(rq); +} + void __blk_crypto_free_request(struct request *rq); static inline void blk_crypto_free_request(struct request *rq) { @@ -142,28 +185,24 @@ static inline void blk_crypto_free_request(struct request *rq) __blk_crypto_free_request(rq); } -void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, - gfp_t gfp_mask); -static inline void blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, - gfp_t gfp_mask) -{ - if (bio_has_crypt_ctx(bio)) - __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); -} - +int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask); /** - * blk_crypto_insert_cloned_request - Prepare a cloned request to be inserted - * into a request queue. - * @rq: the request being queued + * blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio + * is inserted + * @rq: The request to prepare + * @bio: The first bio being inserted into the request + * @gfp_mask: Memory allocation flags * - * Return: BLK_STS_OK on success, nonzero on error. + * Return: 0 on success, -ENOMEM if out of memory. -ENOMEM is only possible if + * @gfp_mask doesn't include %__GFP_DIRECT_RECLAIM. 
*/ -static inline blk_status_t blk_crypto_insert_cloned_request(struct request *rq) +static inline int blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) { - - if (blk_crypto_rq_is_encrypted(rq)) - return blk_crypto_init_request(rq); - return BLK_STS_OK; + if (bio_has_crypt_ctx(bio)) + return __blk_crypto_rq_bio_prep(rq, bio, gfp_mask); + return 0; } #ifdef CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK diff --git a/block/blk-crypto-profile.c b/block/blk-crypto-profile.c new file mode 100644 index 000000000000..7fabc883e39f --- /dev/null +++ b/block/blk-crypto-profile.c @@ -0,0 +1,559 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2019 Google LLC + */ + +/** + * DOC: blk-crypto profiles + * + * 'struct blk_crypto_profile' contains all generic inline encryption-related + * state for a particular inline encryption device. blk_crypto_profile serves + * as the way that drivers for inline encryption hardware expose their crypto + * capabilities and certain functions (e.g., functions to program and evict + * keys) to upper layers. Device drivers that want to support inline encryption + * construct a crypto profile, then associate it with the disk's request_queue. + * + * If the device has keyslots, then its blk_crypto_profile also handles managing + * these keyslots in a device-independent way, using the driver-provided + * functions to program and evict keys as needed. This includes keeping track + * of which key and how many I/O requests are using each keyslot, getting + * keyslots for I/O requests, and handling key eviction requests. + * + * For more information, see Documentation/block/inline-encryption.rst. 
+ */ + +#define pr_fmt(fmt) "blk-crypto: " fmt + +#include <linux/blk-crypto-profile.h> +#include <linux/device.h> +#include <linux/atomic.h> +#include <linux/mutex.h> +#include <linux/pm_runtime.h> +#include <linux/wait.h> +#include <linux/blkdev.h> +#include <linux/blk-integrity.h> +#include "blk-crypto-internal.h" + +struct blk_crypto_keyslot { + atomic_t slot_refs; + struct list_head idle_slot_node; + struct hlist_node hash_node; + const struct blk_crypto_key *key; + struct blk_crypto_profile *profile; +}; + +static inline void blk_crypto_hw_enter(struct blk_crypto_profile *profile) +{ + /* + * Calling into the driver requires profile->lock held and the device + * resumed. But we must resume the device first, since that can acquire + * and release profile->lock via blk_crypto_reprogram_all_keys(). + */ + if (profile->dev) + pm_runtime_get_sync(profile->dev); + down_write(&profile->lock); +} + +static inline void blk_crypto_hw_exit(struct blk_crypto_profile *profile) +{ + up_write(&profile->lock); + if (profile->dev) + pm_runtime_put_sync(profile->dev); +} + +/** + * blk_crypto_profile_init() - Initialize a blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Storage drivers must call this when starting to set up a blk_crypto_profile, + * before filling in additional fields. + * + * Return: 0 on success, or else a negative error code. + */ +int blk_crypto_profile_init(struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + unsigned int slot; + unsigned int i; + unsigned int slot_hashtable_size; + + memset(profile, 0, sizeof(*profile)); + + /* + * profile->lock of an underlying device can nest inside profile->lock + * of a device-mapper device, so use a dynamic lock class to avoid + * false-positive lockdep reports. 
+ */ + lockdep_register_key(&profile->lockdep_key); + __init_rwsem(&profile->lock, "&profile->lock", &profile->lockdep_key); + + if (num_slots == 0) + return 0; + + /* Initialize keyslot management data. */ + + profile->slots = kvcalloc(num_slots, sizeof(profile->slots[0]), + GFP_KERNEL); + if (!profile->slots) + goto err_destroy; + + profile->num_slots = num_slots; + + init_waitqueue_head(&profile->idle_slots_wait_queue); + INIT_LIST_HEAD(&profile->idle_slots); + + for (slot = 0; slot < num_slots; slot++) { + profile->slots[slot].profile = profile; + list_add_tail(&profile->slots[slot].idle_slot_node, + &profile->idle_slots); + } + + spin_lock_init(&profile->idle_slots_lock); + + slot_hashtable_size = roundup_pow_of_two(num_slots); + /* + * hash_ptr() assumes bits != 0, so ensure the hash table has at least 2 + * buckets. This only makes a difference when there is only 1 keyslot. + */ + if (slot_hashtable_size < 2) + slot_hashtable_size = 2; + + profile->log_slot_ht_size = ilog2(slot_hashtable_size); + profile->slot_hashtable = + kvmalloc_array(slot_hashtable_size, + sizeof(profile->slot_hashtable[0]), GFP_KERNEL); + if (!profile->slot_hashtable) + goto err_destroy; + for (i = 0; i < slot_hashtable_size; i++) + INIT_HLIST_HEAD(&profile->slot_hashtable[i]); + + return 0; + +err_destroy: + blk_crypto_profile_destroy(profile); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_init); + +static void blk_crypto_profile_destroy_callback(void *profile) +{ + blk_crypto_profile_destroy(profile); +} + +/** + * devm_blk_crypto_profile_init() - Resource-managed blk_crypto_profile_init() + * @dev: the device which owns the blk_crypto_profile + * @profile: the blk_crypto_profile to initialize + * @num_slots: the number of keyslots + * + * Like blk_crypto_profile_init(), but causes blk_crypto_profile_destroy() to be + * called automatically on driver detach. + * + * Return: 0 on success, or else a negative error code. 
+ */ +int devm_blk_crypto_profile_init(struct device *dev, + struct blk_crypto_profile *profile, + unsigned int num_slots) +{ + int err = blk_crypto_profile_init(profile, num_slots); + + if (err) + return err; + + return devm_add_action_or_reset(dev, + blk_crypto_profile_destroy_callback, + profile); +} +EXPORT_SYMBOL_GPL(devm_blk_crypto_profile_init); + +static inline struct hlist_head * +blk_crypto_hash_bucket_for_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + return &profile->slot_hashtable[ + hash_ptr(key, profile->log_slot_ht_size)]; +} + +static void +blk_crypto_remove_slot_from_lru_list(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile = slot->profile; + unsigned long flags; + + spin_lock_irqsave(&profile->idle_slots_lock, flags); + list_del(&slot->idle_slot_node); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); +} + +static struct blk_crypto_keyslot * +blk_crypto_find_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + const struct hlist_head *head = + blk_crypto_hash_bucket_for_key(profile, key); + struct blk_crypto_keyslot *slotp; + + hlist_for_each_entry(slotp, head, hash_node) { + if (slotp->key == key) + return slotp; + } + return NULL; +} + +static struct blk_crypto_keyslot * +blk_crypto_find_and_grab_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) + return NULL; + if (atomic_inc_return(&slot->slot_refs) == 1) { + /* Took first reference to this slot; remove it from LRU list */ + blk_crypto_remove_slot_from_lru_list(slot); + } + return slot; +} + +/** + * blk_crypto_keyslot_index() - Get the index of a keyslot + * @slot: a keyslot that blk_crypto_get_keyslot() returned + * + * Return: the 0-based index of the keyslot within the device's keyslots. 
+ */ +unsigned int blk_crypto_keyslot_index(struct blk_crypto_keyslot *slot) +{ + return slot - slot->profile->slots; +} +EXPORT_SYMBOL_GPL(blk_crypto_keyslot_index); + +/** + * blk_crypto_get_keyslot() - Get a keyslot for a key, if needed. + * @profile: the crypto profile of the device the key will be used on + * @key: the key that will be used + * @slot_ptr: If a keyslot is allocated, an opaque pointer to the keyslot struct + * will be stored here. blk_crypto_put_keyslot() must be called + * later to release it. Otherwise, NULL will be stored here. + * + * If the device has keyslots, this gets a keyslot that's been programmed with + * the specified key. If the key is already in a slot, this reuses it; + * otherwise this waits for a slot to become idle and programs the key into it. + * + * Context: Process context. Takes and releases profile->lock. + * Return: BLK_STS_OK on success, meaning that either a keyslot was allocated or + * one wasn't needed; or a blk_status_t error on failure. + */ +blk_status_t blk_crypto_get_keyslot(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key, + struct blk_crypto_keyslot **slot_ptr) +{ + struct blk_crypto_keyslot *slot; + int slot_idx; + int err; + + *slot_ptr = NULL; + + /* + * If the device has no concept of "keyslots", then there is no need to + * get one. + */ + if (profile->num_slots == 0) + return BLK_STS_OK; + + down_read(&profile->lock); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + up_read(&profile->lock); + if (slot) + goto success; + + for (;;) { + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_and_grab_keyslot(profile, key); + if (slot) { + blk_crypto_hw_exit(profile); + goto success; + } + + /* + * If we're here, that means there wasn't a slot that was + * already programmed with the key. So try to program it. 
+ */ + if (!list_empty(&profile->idle_slots)) + break; + + blk_crypto_hw_exit(profile); + wait_event(profile->idle_slots_wait_queue, + !list_empty(&profile->idle_slots)); + } + + slot = list_first_entry(&profile->idle_slots, struct blk_crypto_keyslot, + idle_slot_node); + slot_idx = blk_crypto_keyslot_index(slot); + + err = profile->ll_ops.keyslot_program(profile, key, slot_idx); + if (err) { + wake_up(&profile->idle_slots_wait_queue); + blk_crypto_hw_exit(profile); + return errno_to_blk_status(err); + } + + /* Move this slot to the hash list for the new key. */ + if (slot->key) + hlist_del(&slot->hash_node); + slot->key = key; + hlist_add_head(&slot->hash_node, + blk_crypto_hash_bucket_for_key(profile, key)); + + atomic_set(&slot->slot_refs, 1); + + blk_crypto_remove_slot_from_lru_list(slot); + + blk_crypto_hw_exit(profile); +success: + *slot_ptr = slot; + return BLK_STS_OK; +} + +/** + * blk_crypto_put_keyslot() - Release a reference to a keyslot + * @slot: The keyslot to release the reference of + * + * Context: Any context. + */ +void blk_crypto_put_keyslot(struct blk_crypto_keyslot *slot) +{ + struct blk_crypto_profile *profile = slot->profile; + unsigned long flags; + + if (atomic_dec_and_lock_irqsave(&slot->slot_refs, + &profile->idle_slots_lock, flags)) { + list_add_tail(&slot->idle_slot_node, &profile->idle_slots); + spin_unlock_irqrestore(&profile->idle_slots_lock, flags); + wake_up(&profile->idle_slots_wait_queue); + } +} + +/** + * __blk_crypto_cfg_supported() - Check whether the given crypto profile + * supports the given crypto configuration. + * @profile: the crypto profile to check + * @cfg: the crypto configuration to check for + * + * Return: %true if @profile supports the given @cfg. 
+ */ +bool __blk_crypto_cfg_supported(struct blk_crypto_profile *profile, + const struct blk_crypto_config *cfg) +{ + if (!profile) + return false; + if (!(profile->modes_supported[cfg->crypto_mode] & cfg->data_unit_size)) + return false; + if (profile->max_dun_bytes_supported < cfg->dun_bytes) + return false; + return true; +} + +/* + * This is an internal function that evicts a key from an inline encryption + * device that can be either a real device or the blk-crypto-fallback "device". + * It is used only by blk_crypto_evict_key(); see that function for details. + */ +int __blk_crypto_evict_key(struct blk_crypto_profile *profile, + const struct blk_crypto_key *key) +{ + struct blk_crypto_keyslot *slot; + int err; + + if (profile->num_slots == 0) { + if (profile->ll_ops.keyslot_evict) { + blk_crypto_hw_enter(profile); + err = profile->ll_ops.keyslot_evict(profile, key, -1); + blk_crypto_hw_exit(profile); + return err; + } + return 0; + } + + blk_crypto_hw_enter(profile); + slot = blk_crypto_find_keyslot(profile, key); + if (!slot) { + /* + * Not an error, since a key not in use by I/O is not guaranteed + * to be in a keyslot. There can be more keys than keyslots. + */ + err = 0; + goto out; + } + + if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { + /* BUG: key is still in use by I/O */ + err = -EBUSY; + goto out_remove; + } + err = profile->ll_ops.keyslot_evict(profile, key, + blk_crypto_keyslot_index(slot)); +out_remove: + /* + * Callers free the key even on error, so unlink the key from the hash + * table and clear slot->key even on error. + */ + hlist_del(&slot->hash_node); + slot->key = NULL; +out: + blk_crypto_hw_exit(profile); + return err; +} + +/** + * blk_crypto_reprogram_all_keys() - Re-program all keyslots. + * @profile: The crypto profile + * + * Re-program all keyslots that are supposed to have a key programmed. This is + * intended only for use by drivers for hardware that loses its keys on reset. + * + * Context: Process context. 
Takes and releases profile->lock. + */ +void blk_crypto_reprogram_all_keys(struct blk_crypto_profile *profile) +{ + unsigned int slot; + + if (profile->num_slots == 0) + return; + + /* This is for device initialization, so don't resume the device */ + down_write(&profile->lock); + for (slot = 0; slot < profile->num_slots; slot++) { + const struct blk_crypto_key *key = profile->slots[slot].key; + int err; + + if (!key) + continue; + + err = profile->ll_ops.keyslot_program(profile, key, slot); + WARN_ON(err); + } + up_write(&profile->lock); +} +EXPORT_SYMBOL_GPL(blk_crypto_reprogram_all_keys); + +void blk_crypto_profile_destroy(struct blk_crypto_profile *profile) +{ + if (!profile) + return; + lockdep_unregister_key(&profile->lockdep_key); + kvfree(profile->slot_hashtable); + kvfree_sensitive(profile->slots, + sizeof(profile->slots[0]) * profile->num_slots); + memzero_explicit(profile, sizeof(*profile)); +} +EXPORT_SYMBOL_GPL(blk_crypto_profile_destroy); + +bool blk_crypto_register(struct blk_crypto_profile *profile, + struct request_queue *q) +{ + if (blk_integrity_queue_supports_integrity(q)) { + pr_warn("Integrity and hardware inline encryption are not supported together. Disabling hardware inline encryption.\n"); + return false; + } + q->crypto_profile = profile; + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_register); + +/** + * blk_crypto_intersect_capabilities() - restrict supported crypto capabilities + * by child device + * @parent: the crypto profile for the parent device + * @child: the crypto profile for the child device, or NULL + * + * This clears all crypto capabilities in @parent that aren't set in @child. If + * @child is NULL, then this clears all parent capabilities. + * + * Only use this when setting up the crypto profile for a layered device, before + * it's been exposed yet. 
+ */ +void blk_crypto_intersect_capabilities(struct blk_crypto_profile *parent, + const struct blk_crypto_profile *child) +{ + if (child) { + unsigned int i; + + parent->max_dun_bytes_supported = + min(parent->max_dun_bytes_supported, + child->max_dun_bytes_supported); + for (i = 0; i < ARRAY_SIZE(child->modes_supported); i++) + parent->modes_supported[i] &= child->modes_supported[i]; + } else { + parent->max_dun_bytes_supported = 0; + memset(parent->modes_supported, 0, + sizeof(parent->modes_supported)); + } +} +EXPORT_SYMBOL_GPL(blk_crypto_intersect_capabilities); + +/** + * blk_crypto_has_capabilities() - Check whether @target supports at least all + * the crypto capabilities that @reference does. + * @target: the target profile + * @reference: the reference profile + * + * Return: %true if @target supports all the crypto capabilities of @reference. + */ +bool blk_crypto_has_capabilities(const struct blk_crypto_profile *target, + const struct blk_crypto_profile *reference) +{ + int i; + + if (!reference) + return true; + + if (!target) + return false; + + for (i = 0; i < ARRAY_SIZE(target->modes_supported); i++) { + if (reference->modes_supported[i] & ~target->modes_supported[i]) + return false; + } + + if (reference->max_dun_bytes_supported > + target->max_dun_bytes_supported) + return false; + + return true; +} +EXPORT_SYMBOL_GPL(blk_crypto_has_capabilities); + +/** + * blk_crypto_update_capabilities() - Update the capabilities of a crypto + * profile to match those of another crypto + * profile. + * @dst: The crypto profile whose capabilities to update. + * @src: The crypto profile whose capabilities this function will update @dst's + * capabilities to. + * + * Blk-crypto requires that crypto capabilities that were + * advertised when a bio was created continue to be supported by the + * device until that bio is ended. 
This in turn means that a device cannot + * shrink its advertised crypto capabilities without any explicit + * synchronization with upper layers. So if there's no such explicit + * synchronization, @src must support all the crypto capabilities that + * @dst does (i.e. we need blk_crypto_has_capabilities(@src, @dst)). + * + * Note also that as long as the crypto capabilities are being expanded, the + * order of updates becoming visible is not important because it's alright + * for blk-crypto to see stale values - they only cause blk-crypto to + * believe that a crypto capability isn't supported when it actually is (which + * might result in blk-crypto-fallback being used if available, or the bio being + * failed). + */ +void blk_crypto_update_capabilities(struct blk_crypto_profile *dst, + const struct blk_crypto_profile *src) +{ + memcpy(dst->modes_supported, src->modes_supported, + sizeof(dst->modes_supported)); + + dst->max_dun_bytes_supported = src->max_dun_bytes_supported; +} +EXPORT_SYMBOL_GPL(blk_crypto_update_capabilities); diff --git a/block/blk-crypto-sysfs.c b/block/blk-crypto-sysfs.c new file mode 100644 index 000000000000..a304434489ba --- /dev/null +++ b/block/blk-crypto-sysfs.c @@ -0,0 +1,173 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright 2021 Google LLC + * + * sysfs support for blk-crypto. This file contains the code which exports the + * crypto capabilities of devices via /sys/block/$disk/queue/crypto/. 
+ */ + +#include <linux/blk-crypto-profile.h> + +#include "blk-crypto-internal.h" + +struct blk_crypto_kobj { + struct kobject kobj; + struct blk_crypto_profile *profile; +}; + +struct blk_crypto_attr { + struct attribute attr; + ssize_t (*show)(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page); +}; + +static struct blk_crypto_profile *kobj_to_crypto_profile(struct kobject *kobj) +{ + return container_of(kobj, struct blk_crypto_kobj, kobj)->profile; +} + +static struct blk_crypto_attr *attr_to_crypto_attr(struct attribute *attr) +{ + return container_of(attr, struct blk_crypto_attr, attr); +} + +static ssize_t max_dun_bits_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", 8 * profile->max_dun_bytes_supported); +} + +static ssize_t num_keyslots_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + return sysfs_emit(page, "%u\n", profile->num_slots); +} + +#define BLK_CRYPTO_RO_ATTR(_name) \ + static struct blk_crypto_attr _name##_attr = __ATTR_RO(_name) + +BLK_CRYPTO_RO_ATTR(max_dun_bits); +BLK_CRYPTO_RO_ATTR(num_keyslots); + +static struct attribute *blk_crypto_attrs[] = { + &max_dun_bits_attr.attr, + &num_keyslots_attr.attr, + NULL, +}; + +static const struct attribute_group blk_crypto_attr_group = { + .attrs = blk_crypto_attrs, +}; + +/* + * The encryption mode attributes. To avoid hard-coding the list of encryption + * modes, these are initialized at boot time by blk_crypto_sysfs_init(). 
+ */ +static struct blk_crypto_attr __blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX]; +static struct attribute *blk_crypto_mode_attrs[BLK_ENCRYPTION_MODE_MAX + 1]; + +static umode_t blk_crypto_mode_is_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + int mode_num = a - __blk_crypto_mode_attrs; + + if (profile->modes_supported[mode_num]) + return 0444; + return 0; +} + +static ssize_t blk_crypto_mode_show(struct blk_crypto_profile *profile, + struct blk_crypto_attr *attr, char *page) +{ + int mode_num = attr - __blk_crypto_mode_attrs; + + return sysfs_emit(page, "0x%x\n", profile->modes_supported[mode_num]); +} + +static const struct attribute_group blk_crypto_modes_attr_group = { + .name = "modes", + .attrs = blk_crypto_mode_attrs, + .is_visible = blk_crypto_mode_is_visible, +}; + +static const struct attribute_group *blk_crypto_attr_groups[] = { + &blk_crypto_attr_group, + &blk_crypto_modes_attr_group, + NULL, +}; + +static ssize_t blk_crypto_attr_show(struct kobject *kobj, + struct attribute *attr, char *page) +{ + struct blk_crypto_profile *profile = kobj_to_crypto_profile(kobj); + struct blk_crypto_attr *a = attr_to_crypto_attr(attr); + + return a->show(profile, a, page); +} + +static const struct sysfs_ops blk_crypto_attr_ops = { + .show = blk_crypto_attr_show, +}; + +static void blk_crypto_release(struct kobject *kobj) +{ + kfree(container_of(kobj, struct blk_crypto_kobj, kobj)); +} + +static const struct kobj_type blk_crypto_ktype = { + .default_groups = blk_crypto_attr_groups, + .sysfs_ops = &blk_crypto_attr_ops, + .release = blk_crypto_release, +}; + +/* + * If the request_queue has a blk_crypto_profile, create the "crypto" + * subdirectory in sysfs (/sys/block/$disk/queue/crypto/). 
+ */ +int blk_crypto_sysfs_register(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_crypto_kobj *obj; + int err; + + if (!q->crypto_profile) + return 0; + + obj = kzalloc(sizeof(*obj), GFP_KERNEL); + if (!obj) + return -ENOMEM; + obj->profile = q->crypto_profile; + + err = kobject_init_and_add(&obj->kobj, &blk_crypto_ktype, + &disk->queue_kobj, "crypto"); + if (err) { + kobject_put(&obj->kobj); + return err; + } + q->crypto_kobject = &obj->kobj; + return 0; +} + +void blk_crypto_sysfs_unregister(struct gendisk *disk) +{ + kobject_put(disk->queue->crypto_kobject); +} + +static int __init blk_crypto_sysfs_init(void) +{ + int i; + + BUILD_BUG_ON(BLK_ENCRYPTION_MODE_INVALID != 0); + for (i = 1; i < BLK_ENCRYPTION_MODE_MAX; i++) { + struct blk_crypto_attr *attr = &__blk_crypto_mode_attrs[i]; + + attr->attr.name = blk_crypto_modes[i].name; + attr->attr.mode = 0444; + attr->show = blk_crypto_mode_show; + blk_crypto_mode_attrs[i - 1] = &attr->attr; + } + return 0; +} +subsys_initcall(blk_crypto_sysfs_init); diff --git a/block/blk-crypto.c b/block/blk-crypto.c index 6533c9b36ab8..4d760b092deb 100644 --- a/block/blk-crypto.c +++ b/block/blk-crypto.c @@ -11,28 +11,38 @@ #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/keyslot-manager.h> +#include <linux/blk-crypto-profile.h> #include <linux/module.h> +#include <linux/ratelimit.h> #include <linux/slab.h> #include "blk-crypto-internal.h" const struct blk_crypto_mode blk_crypto_modes[] = { [BLK_ENCRYPTION_MODE_AES_256_XTS] = { + .name = "AES-256-XTS", .cipher_str = "xts(aes)", .keysize = 64, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_AES_128_CBC_ESSIV] = { + .name = "AES-128-CBC-ESSIV", .cipher_str = "essiv(cbc(aes),sha256)", .keysize = 16, .ivsize = 16, }, [BLK_ENCRYPTION_MODE_ADIANTUM] = { + .name = "Adiantum", .cipher_str = "adiantum(xchacha12,aes)", .keysize = 32, .ivsize = 32, }, + [BLK_ENCRYPTION_MODE_SM4_XTS] = { + .name = "SM4-XTS", + .cipher_str = "xts(sm4)", + .keysize = 32, + 
.ivsize = 16, + }, }; /* @@ -81,7 +91,15 @@ subsys_initcall(bio_crypt_ctx_init); void bio_crypt_set_ctx(struct bio *bio, const struct blk_crypto_key *key, const u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], gfp_t gfp_mask) { - struct bio_crypt_ctx *bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + struct bio_crypt_ctx *bc; + + /* + * The caller must use a gfp_mask that contains __GFP_DIRECT_RECLAIM so + * that the mempool_alloc() can't fail. + */ + WARN_ON_ONCE(!(gfp_mask & __GFP_DIRECT_RECLAIM)); + + bc = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); bc->bc_key = key; memcpy(bc->bc_dun, dun, sizeof(bc->bc_dun)); @@ -95,12 +113,14 @@ void __bio_crypt_free_ctx(struct bio *bio) bio->bi_crypt_context = NULL; } -void __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) +int __bio_crypt_clone(struct bio *dst, struct bio *src, gfp_t gfp_mask) { dst->bi_crypt_context = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + if (!dst->bi_crypt_context) + return -ENOMEM; *dst->bi_crypt_context = *src->bi_crypt_context; + return 0; } -EXPORT_SYMBOL_GPL(__bio_crypt_clone); /* Increments @dun by @inc, treating @dun as a multi-limb integer. */ void bio_crypt_dun_increment(u64 dun[BLK_CRYPTO_DUN_ARRAY_SIZE], @@ -205,26 +225,27 @@ static bool bio_crypt_check_alignment(struct bio *bio) return true; } -blk_status_t __blk_crypto_init_request(struct request *rq) +blk_status_t __blk_crypto_rq_get_keyslot(struct request *rq) { - return blk_ksm_get_slot_for_key(rq->q->ksm, rq->crypt_ctx->bc_key, - &rq->crypt_keyslot); + return blk_crypto_get_keyslot(rq->q->crypto_profile, + rq->crypt_ctx->bc_key, + &rq->crypt_keyslot); +} + +void __blk_crypto_rq_put_keyslot(struct request *rq) +{ + blk_crypto_put_keyslot(rq->crypt_keyslot); + rq->crypt_keyslot = NULL; } -/** - * __blk_crypto_free_request - Uninitialize the crypto fields of a request. - * - * @rq: The request whose crypto fields to uninitialize. - * - * Completely uninitializes the crypto fields of a request. 
If a keyslot has - * been programmed into some inline encryption hardware, that keyslot is - * released. The rq->crypt_ctx is also freed. - */ void __blk_crypto_free_request(struct request *rq) { - blk_ksm_put_slot(rq->crypt_keyslot); + /* The keyslot, if one was needed, should have been released earlier. */ + if (WARN_ON_ONCE(rq->crypt_keyslot)) + __blk_crypto_rq_put_keyslot(rq); + mempool_free(rq->crypt_ctx, bio_crypt_ctx_pool); - blk_crypto_rq_set_defaults(rq); + rq->crypt_ctx = NULL; } /** @@ -239,7 +260,7 @@ void __blk_crypto_free_request(struct request *rq) * kernel crypto API. When the crypto API fallback is used for encryption, * blk-crypto may choose to split the bio into 2 - the first one that will * continue to be processed and the second one that will be resubmitted via - * generic_make_request. A bounce bio will be allocated to encrypt the contents + * submit_bio_noacct. A bounce bio will be allocated to encrypt the contents * of the aforementioned "first one", and *bio_ptr will be updated to this * bounce bio. * @@ -269,10 +290,9 @@ bool __blk_crypto_bio_prep(struct bio **bio_ptr) * Success if device supports the encryption context, or if we succeeded * in falling back to the crypto API. 
*/ - if (blk_ksm_crypto_cfg_supported(bio->bi_disk->queue->ksm, - &bc_key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bio->bi_bdev, + &bc_key->crypto_cfg)) return true; - if (blk_crypto_fallback_bio_prep(bio_ptr)) return true; fail: @@ -280,20 +300,16 @@ fail: return false; } -/** - * __blk_crypto_rq_bio_prep - Prepare a request's crypt_ctx when its first bio - * is inserted - * - * @rq: The request to prepare - * @bio: The first bio being inserted into the request - * @gfp_mask: gfp mask - */ -void __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, - gfp_t gfp_mask) +int __blk_crypto_rq_bio_prep(struct request *rq, struct bio *bio, + gfp_t gfp_mask) { - if (!rq->crypt_ctx) + if (!rq->crypt_ctx) { rq->crypt_ctx = mempool_alloc(bio_crypt_ctx_pool, gfp_mask); + if (!rq->crypt_ctx) + return -ENOMEM; + } *rq->crypt_ctx = *bio->bi_crypt_context; + return 0; } /** @@ -325,7 +341,7 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, if (mode->keysize == 0) return -EINVAL; - if (dun_bytes == 0 || dun_bytes > BLK_CRYPTO_MAX_IV_SIZE) + if (dun_bytes == 0 || dun_bytes > mode->ivsize) return -EINVAL; if (!is_power_of_2(data_unit_size)) @@ -341,22 +357,29 @@ int blk_crypto_init_key(struct blk_crypto_key *blk_key, const u8 *raw_key, return 0; } +bool blk_crypto_config_supported_natively(struct block_device *bdev, + const struct blk_crypto_config *cfg) +{ + return __blk_crypto_cfg_supported(bdev_get_queue(bdev)->crypto_profile, + cfg); +} + /* * Check if bios with @cfg can be en/decrypted by blk-crypto (i.e. either the - * request queue it's submitted to supports inline crypto, or the + * block_device it's submitted to supports inline crypto, or the * blk-crypto-fallback is enabled and supports the cfg). 
*/ -bool blk_crypto_config_supported(struct request_queue *q, +bool blk_crypto_config_supported(struct block_device *bdev, const struct blk_crypto_config *cfg) { return IS_ENABLED(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) || - blk_ksm_crypto_cfg_supported(q->ksm, cfg); + blk_crypto_config_supported_natively(bdev, cfg); } /** * blk_crypto_start_using_key() - Start using a blk_crypto_key on a device + * @bdev: block device to operate on * @key: A key to use on the device - * @q: the request queue for the device * * Upper layers must call this function to ensure that either the hardware * supports the key's crypto settings, or the crypto API fallback has transforms @@ -368,37 +391,48 @@ bool blk_crypto_config_supported(struct request_queue *q, * blk-crypto-fallback is either disabled or the needed algorithm * is disabled in the crypto API; or another -errno code. */ -int blk_crypto_start_using_key(const struct blk_crypto_key *key, - struct request_queue *q) +int blk_crypto_start_using_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) return 0; return blk_crypto_fallback_start_using_mode(key->crypto_cfg.crypto_mode); } /** - * blk_crypto_evict_key() - Evict a key from any inline encryption hardware - * it may have been programmed into - * @q: The request queue who's associated inline encryption hardware this key - * might have been programmed into - * @key: The key to evict + * blk_crypto_evict_key() - Evict a blk_crypto_key from a block_device + * @bdev: a block_device on which I/O using the key may have been done + * @key: the key to evict + * + * For a given block_device, this function removes the given blk_crypto_key from + * the keyslot management structures and evicts it from any underlying hardware + * keyslot(s) or blk-crypto-fallback keyslot it may have been programmed into. 
* - * Upper layers (filesystems) must call this function to ensure that a key is - * evicted from any hardware that it might have been programmed into. The key - * must not be in use by any in-flight IO when this function is called. + * Upper layers must call this before freeing the blk_crypto_key. It must be + * called for every block_device the key may have been used on. The key must no + * longer be in use by any I/O when this function is called. * - * Return: 0 on success or if key is not present in the q's ksm, -err on error. + * Context: May sleep. */ -int blk_crypto_evict_key(struct request_queue *q, - const struct blk_crypto_key *key) +void blk_crypto_evict_key(struct block_device *bdev, + const struct blk_crypto_key *key) { - if (blk_ksm_crypto_cfg_supported(q->ksm, &key->crypto_cfg)) - return blk_ksm_evict_key(q->ksm, key); + struct request_queue *q = bdev_get_queue(bdev); + int err; + if (blk_crypto_config_supported_natively(bdev, &key->crypto_cfg)) + err = __blk_crypto_evict_key(q->crypto_profile, key); + else + err = blk_crypto_fallback_evict_key(key); /* - * If the request queue's associated inline encryption hardware didn't - * have support for the key, then the key might have been programmed - * into the fallback keyslot manager, so try to evict from there. + * An error can only occur here if the key failed to be evicted from a + * keyslot (due to a hardware or driver issue) or is allegedly still in + * use by I/O (due to a kernel bug). Even in these cases, the key is + * still unlinked from the keyslot management structures, and the caller + * is allowed and expected to free it right away. There's nothing + * callers can do to handle errors, so just log them and return void. 
*/ - return blk_crypto_fallback_evict_key(key); + if (err) + pr_warn_ratelimited("%pg: error %d evicting key\n", bdev, err); } +EXPORT_SYMBOL_GPL(blk_crypto_evict_key); diff --git a/block/blk-exec.c b/block/blk-exec.c deleted file mode 100644 index 85324d53d072..000000000000 --- a/block/blk-exec.c +++ /dev/null @@ -1,95 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to setting various queue properties from drivers - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/sched/sysctl.h> - -#include "blk.h" -#include "blk-mq-sched.h" - -/** - * blk_end_sync_rq - executes a completion event on a request - * @rq: request to complete - * @error: end I/O status of the request - */ -static void blk_end_sync_rq(struct request *rq, blk_status_t error) -{ - struct completion *waiting = rq->end_io_data; - - rq->end_io_data = NULL; - - /* - * complete last, if this is a stack request the process (and thus - * the rq pointer) could be invalid right after this complete() - */ - complete(waiting); -} - -/** - * blk_execute_rq_nowait - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * @done: I/O completion handler - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution. Don't wait for completion. - * - * Note: - * This function will invoke @done directly if the queue is dead. 
- */ -void blk_execute_rq_nowait(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head, - rq_end_io_fn *done) -{ - WARN_ON(irqs_disabled()); - WARN_ON(!blk_rq_is_passthrough(rq)); - - rq->rq_disk = bd_disk; - rq->end_io = done; - - blk_account_io_start(rq); - - /* - * don't check dying flag for MQ because the request won't - * be reused after dying flag is set - */ - blk_mq_sched_insert_request(rq, at_head, true, false); -} -EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); - -/** - * blk_execute_rq - insert a request into queue for execution - * @q: queue to insert the request in - * @bd_disk: matching gendisk - * @rq: request to insert - * @at_head: insert request at head or tail of queue - * - * Description: - * Insert a fully prepared request at the back of the I/O scheduler queue - * for execution and wait for completion. - */ -void blk_execute_rq(struct request_queue *q, struct gendisk *bd_disk, - struct request *rq, int at_head) -{ - DECLARE_COMPLETION_ONSTACK(wait); - unsigned long hang_check; - - rq->end_io_data = &wait; - blk_execute_rq_nowait(q, bd_disk, rq, at_head, blk_end_sync_rq); - - /* Prevent hang_check timer from firing at us during very long I/O */ - hang_check = sysctl_hung_task_timeout_secs; - if (hang_check) - while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); - else - wait_for_completion_io(&wait); -} -EXPORT_SYMBOL(blk_execute_rq); diff --git a/block/blk-flush.c b/block/blk-flush.c index 15ae0155ec07..3f4d41952ef2 100644 --- a/block/blk-flush.c +++ b/block/blk-flush.c @@ -68,12 +68,10 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/gfp.h> -#include <linux/blk-mq.h> -#include <linux/lockdep.h> +#include <linux/part_stat.h> #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" /* PREFLUSH/FUA sequences */ @@ -94,7 +92,13 @@ enum { }; static void blk_kick_flush(struct request_queue *q, - struct blk_flush_queue *fq, unsigned int flags); + struct 
blk_flush_queue *fq, blk_opf_t flags); + +static inline struct blk_flush_queue * +blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) +{ + return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +} static unsigned int blk_flush_policy(unsigned long fflags, struct request *rq) { @@ -132,14 +136,9 @@ static void blk_flush_restore_request(struct request *rq) rq->end_io = rq->flush.saved_end_io; } -static void blk_flush_queue_rq(struct request *rq, bool add_front) -{ - blk_mq_add_to_requeue_list(rq, add_front, true); -} - static void blk_account_io_flush(struct request *rq) { - struct hd_struct *part = &rq->rq_disk->part0; + struct block_device *part = rq->q->disk->part0; part_stat_lock(); part_stat_inc(part, ios[STAT_FLUSH]); @@ -167,7 +166,7 @@ static void blk_flush_complete_seq(struct request *rq, { struct request_queue *q = rq->q; struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; - unsigned int cmd_flags; + blk_opf_t cmd_flags; BUG_ON(rq->flush.seq & seq); rq->flush.seq |= seq; @@ -184,12 +183,15 @@ static void blk_flush_complete_seq(struct request *rq, /* queue for flush */ if (list_empty(pending)) fq->flush_pending_since = jiffies; - list_move_tail(&rq->flush.list, pending); + list_move_tail(&rq->queuelist, pending); break; case REQ_FSEQ_DATA: - list_move_tail(&rq->flush.list, &fq->flush_data_in_flight); - blk_flush_queue_rq(rq, true); + fq->flush_data_in_flight++; + spin_lock(&q->requeue_lock); + list_move(&rq->queuelist, &q->requeue_list); + spin_unlock(&q->requeue_lock); + blk_mq_kick_requeue_list(q); break; case REQ_FSEQ_DONE: @@ -199,8 +201,7 @@ static void blk_flush_complete_seq(struct request *rq, * flush data request completion path. Restore @rq for * normal completion and end it. 
*/ - BUG_ON(!list_empty(&rq->queuelist)); - list_del_init(&rq->flush.list); + list_del_init(&rq->queuelist); blk_flush_restore_request(rq); blk_mq_end_request(rq, error); break; @@ -212,36 +213,41 @@ static void blk_flush_complete_seq(struct request *rq, blk_kick_flush(q, fq, cmd_flags); } -static void flush_end_io(struct request *flush_rq, blk_status_t error) +static enum rq_end_io_ret flush_end_io(struct request *flush_rq, + blk_status_t error) { struct request_queue *q = flush_rq->q; struct list_head *running; struct request *rq, *n; unsigned long flags = 0; struct blk_flush_queue *fq = blk_get_flush_queue(q, flush_rq->mq_ctx); - struct blk_mq_hw_ctx *hctx; - - blk_account_io_flush(flush_rq); /* release the tag's ownership to the req cloned from */ spin_lock_irqsave(&fq->mq_flush_lock, flags); - if (!refcount_dec_and_test(&flush_rq->ref)) { + if (!req_ref_put_and_test(flush_rq)) { fq->rq_status = error; spin_unlock_irqrestore(&fq->mq_flush_lock, flags); - return; + return RQ_END_IO_NONE; } - if (fq->rq_status != BLK_STS_OK) + blk_account_io_flush(flush_rq); + /* + * Flush request has to be marked as IDLE when it is really ended + * because its .end_io() is called from timeout code path too for + * avoiding use-after-free. 
+ */ + WRITE_ONCE(flush_rq->state, MQ_RQ_IDLE); + if (fq->rq_status != BLK_STS_OK) { error = fq->rq_status; + fq->rq_status = BLK_STS_OK; + } - hctx = flush_rq->mq_hctx; if (!q->elevator) { - blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); - flush_rq->tag = -1; + flush_rq->tag = BLK_MQ_NO_TAG; } else { blk_mq_put_driver_tag(flush_rq); - flush_rq->internal_tag = -1; + flush_rq->internal_tag = BLK_MQ_NO_TAG; } running = &fq->flush_queue[fq->flush_running_idx]; @@ -251,7 +257,7 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) fq->flush_running_idx ^= 1; /* and push the waiting requests to the next stage */ - list_for_each_entry_safe(rq, n, running, flush.list) { + list_for_each_entry_safe(rq, n, running, queuelist) { unsigned int seq = blk_flush_cur_seq(rq); BUG_ON(seq != REQ_FSEQ_PREFLUSH && seq != REQ_FSEQ_POSTFLUSH); @@ -259,6 +265,12 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) } spin_unlock_irqrestore(&fq->mq_flush_lock, flags); + return RQ_END_IO_NONE; +} + +bool is_flush_rq(struct request *rq) +{ + return rq->end_io == flush_end_io; } /** @@ -275,24 +287,19 @@ static void flush_end_io(struct request *flush_rq, blk_status_t error) * */ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, - unsigned int flags) + blk_opf_t flags) { struct list_head *pending = &fq->flush_queue[fq->flush_pending_idx]; struct request *first_rq = - list_first_entry(pending, struct request, flush.list); + list_first_entry(pending, struct request, queuelist); struct request *flush_rq = fq->flush_rq; /* C1 described at the top of this file */ if (fq->flush_pending_idx != fq->flush_running_idx || list_empty(pending)) return; - /* C2 and C3 - * - * For blk-mq + scheduling, we can risk having all driver tags - * assigned to empty flushes, and we deadlock if we are expecting - * other requests to make progress. Don't defer for that case. 
- */ - if (!list_empty(&fq->flush_data_in_flight) && q->elevator && + /* C2 and C3 */ + if (fq->flush_data_in_flight && time_before(jiffies, fq->flush_pending_since + FLUSH_PENDING_TIMEOUT)) return; @@ -316,24 +323,33 @@ static void blk_kick_flush(struct request_queue *q, struct blk_flush_queue *fq, flush_rq->mq_ctx = first_rq->mq_ctx; flush_rq->mq_hctx = first_rq->mq_hctx; - if (!q->elevator) { - fq->orig_rq = first_rq; + if (!q->elevator) flush_rq->tag = first_rq->tag; - blk_mq_tag_set_rq(flush_rq->mq_hctx, first_rq->tag, flush_rq); - } else { + else flush_rq->internal_tag = first_rq->internal_tag; - } flush_rq->cmd_flags = REQ_OP_FLUSH | REQ_PREFLUSH; flush_rq->cmd_flags |= (flags & REQ_DRV) | (flags & REQ_FAILFAST_MASK); flush_rq->rq_flags |= RQF_FLUSH_SEQ; - flush_rq->rq_disk = first_rq->rq_disk; flush_rq->end_io = flush_end_io; + /* + * Order WRITE ->end_io and WRITE rq->ref, and its pair is the one + * implied in refcount_inc_not_zero() called from + * blk_mq_find_and_get_req(), which orders WRITE/READ flush_rq->ref + * and READ flush_rq->end_io + */ + smp_wmb(); + req_ref_set(flush_rq, 1); - blk_flush_queue_rq(flush_rq, false); + spin_lock(&q->requeue_lock); + list_add_tail(&flush_rq->queuelist, &q->flush_list); + spin_unlock(&q->requeue_lock); + + blk_mq_kick_requeue_list(q); } -static void mq_flush_data_end_io(struct request *rq, blk_status_t error) +static enum rq_end_io_ret mq_flush_data_end_io(struct request *rq, + blk_status_t error) { struct request_queue *q = rq->q; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; @@ -351,28 +367,42 @@ static void mq_flush_data_end_io(struct request *rq, blk_status_t error) * the comment in flush_end_io(). */ spin_lock_irqsave(&fq->mq_flush_lock, flags); + fq->flush_data_in_flight--; + /* + * May have been corrupted by rq->rq_next reuse, we need to + * re-initialize rq->queuelist before reusing it here. 
+ */ + INIT_LIST_HEAD(&rq->queuelist); blk_flush_complete_seq(rq, fq, REQ_FSEQ_DATA, error); spin_unlock_irqrestore(&fq->mq_flush_lock, flags); blk_mq_sched_restart(hctx); + return RQ_END_IO_NONE; } -/** - * blk_insert_flush - insert a new PREFLUSH/FUA request - * @rq: request to insert - * - * To be called from __elv_add_request() for %ELEVATOR_INSERT_FLUSH insertions. - * or __blk_mq_run_hw_queue() to dispatch request. - * @rq is being submitted. Analyze what needs to be done and put it on the - * right queue. +static void blk_rq_init_flush(struct request *rq) +{ + rq->flush.seq = 0; + rq->rq_flags |= RQF_FLUSH_SEQ; + rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ + rq->end_io = mq_flush_data_end_io; +} + +/* + * Insert a PREFLUSH/FUA request into the flush state machine. + * Returns true if the request has been consumed by the flush state machine, + * or false if the caller should continue to process it. */ -void blk_insert_flush(struct request *rq) +bool blk_insert_flush(struct request *rq) { struct request_queue *q = rq->q; unsigned long fflags = q->queue_flags; /* may change, cache */ unsigned int policy = blk_flush_policy(fflags, rq); struct blk_flush_queue *fq = blk_get_flush_queue(q, rq->mq_ctx); + /* FLUSH/FUA request must never be merged */ + WARN_ON_ONCE(rq->bio != rq->biotail); + /* * @policy now records what operations need to be done. Adjust * REQ_PREFLUSH and FUA for the driver. @@ -388,66 +418,60 @@ void blk_insert_flush(struct request *rq) */ rq->cmd_flags |= REQ_SYNC; - /* - * An empty flush handed down from a stacking driver may - * translate into nothing if the underlying device does not - * advertise a write-back cache. In this case, simply - * complete the request. - */ - if (!policy) { + switch (policy) { + case 0: + /* + * An empty flush handed down from a stacking driver may + * translate into nothing if the underlying device does not + * advertise a write-back cache. In this case, simply + * complete the request. 
+ */ blk_mq_end_request(rq, 0); - return; - } - - BUG_ON(rq->bio != rq->biotail); /*assumes zero or single bio rq */ - - /* - * If there's data but flush is not necessary, the request can be - * processed directly without going through flush machinery. Queue - * for normal execution. - */ - if ((policy & REQ_FSEQ_DATA) && - !(policy & (REQ_FSEQ_PREFLUSH | REQ_FSEQ_POSTFLUSH))) { - blk_mq_request_bypass_insert(rq, false, false); - return; + return true; + case REQ_FSEQ_DATA: + /* + * If there's data, but no flush is necessary, the request can + * be processed directly without going through flush machinery. + * Queue for normal execution. + */ + return false; + case REQ_FSEQ_DATA | REQ_FSEQ_POSTFLUSH: + /* + * Initialize the flush fields and completion handler to trigger + * the post flush, and then just pass the command on. + */ + blk_rq_init_flush(rq); + rq->flush.seq |= REQ_FSEQ_PREFLUSH; + spin_lock_irq(&fq->mq_flush_lock); + fq->flush_data_in_flight++; + spin_unlock_irq(&fq->mq_flush_lock); + return false; + default: + /* + * Mark the request as part of a flush sequence and submit it + * for further processing to the flush state machine. + */ + blk_rq_init_flush(rq); + spin_lock_irq(&fq->mq_flush_lock); + blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); + spin_unlock_irq(&fq->mq_flush_lock); + return true; } - - /* - * @rq should go through flush machinery. Mark it part of flush - * sequence and submit for further processing. 
- */ - memset(&rq->flush, 0, sizeof(rq->flush)); - INIT_LIST_HEAD(&rq->flush.list); - rq->rq_flags |= RQF_FLUSH_SEQ; - rq->flush.saved_end_io = rq->end_io; /* Usually NULL */ - - rq->end_io = mq_flush_data_end_io; - - spin_lock_irq(&fq->mq_flush_lock); - blk_flush_complete_seq(rq, fq, REQ_FSEQ_ACTIONS & ~policy, 0); - spin_unlock_irq(&fq->mq_flush_lock); } /** * blkdev_issue_flush - queue a flush * @bdev: blockdev to issue flush for - * @gfp_mask: memory allocation flags (for bio_alloc) * * Description: * Issue a flush for the block device in question. */ -int blkdev_issue_flush(struct block_device *bdev, gfp_t gfp_mask) +int blkdev_issue_flush(struct block_device *bdev) { - struct bio *bio; - int ret = 0; + struct bio bio; - bio = bio_alloc(gfp_mask, 0); - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE | REQ_PREFLUSH; - - ret = submit_bio_wait(bio); - bio_put(bio); - return ret; + bio_init(&bio, bdev, NULL, 0, REQ_OP_WRITE | REQ_PREFLUSH); + return submit_bio_wait(&bio); } EXPORT_SYMBOL(blkdev_issue_flush); @@ -470,10 +494,6 @@ struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, INIT_LIST_HEAD(&fq->flush_queue[0]); INIT_LIST_HEAD(&fq->flush_queue[1]); - INIT_LIST_HEAD(&fq->flush_data_in_flight); - - lockdep_register_key(&fq->key); - lockdep_set_class(&fq->mq_flush_lock, &fq->key); return fq; @@ -489,7 +509,31 @@ void blk_free_flush_queue(struct blk_flush_queue *fq) if (!fq) return; - lockdep_unregister_key(&fq->key); kfree(fq->flush_rq); kfree(fq); } + +/* + * Allow driver to set its own lock class to fq->mq_flush_lock for + * avoiding lockdep complaint. + * + * flush_end_io() may be called recursively from some driver, such as + * nvme-loop, so lockdep may complain 'possible recursive locking' because + * all 'struct blk_flush_queue' instance share same mq_flush_lock lock class + * key. We need to assign different lock class for these driver's + * fq->mq_flush_lock for avoiding the lockdep warning. 
+ * + * Using a dynamically allocated lock class key for each 'blk_flush_queue' + * instance is overkill, and worse, it introduces a horrible boot delay + * issue because synchronize_rcu() is implied in lockdep_unregister_key(), which + * is called for each hctx release. SCSI probing may synchronously create and + * destroy lots of MQ request_queues for non-existent devices, and some robot + * test kernels always enable the lockdep option. It has been observed that more than half + * an hour is taken during SCSI MQ probe with a per-fq lock class. + */ +void blk_mq_hctx_set_fq_lock_class(struct blk_mq_hw_ctx *hctx, + struct lock_class_key *key) +{ + lockdep_set_class(&hctx->fq->mq_flush_lock, key); +} +EXPORT_SYMBOL_GPL(blk_mq_hctx_set_fq_lock_class); diff --git a/block/blk-ia-ranges.c b/block/blk-ia-ranges.c new file mode 100644 index 000000000000..c9eb4241e048 --- /dev/null +++ b/block/blk-ia-ranges.c @@ -0,0 +1,318 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block device concurrent positioning ranges. + * + * Copyright (C) 2021 Western Digital Corporation or its Affiliates.
+ */ +#include <linux/kernel.h> +#include <linux/blkdev.h> +#include <linux/slab.h> +#include <linux/init.h> + +#include "blk.h" + +static ssize_t +blk_ia_range_sector_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->sector); +} + +static ssize_t +blk_ia_range_nr_sectors_show(struct blk_independent_access_range *iar, + char *buf) +{ + return sprintf(buf, "%llu\n", iar->nr_sectors); +} + +struct blk_ia_range_sysfs_entry { + struct attribute attr; + ssize_t (*show)(struct blk_independent_access_range *iar, char *buf); +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_sector_entry = { + .attr = { .name = "sector", .mode = 0444 }, + .show = blk_ia_range_sector_show, +}; + +static struct blk_ia_range_sysfs_entry blk_ia_range_nr_sectors_entry = { + .attr = { .name = "nr_sectors", .mode = 0444 }, + .show = blk_ia_range_nr_sectors_show, +}; + +static struct attribute *blk_ia_range_attrs[] = { + &blk_ia_range_sector_entry.attr, + &blk_ia_range_nr_sectors_entry.attr, + NULL, +}; +ATTRIBUTE_GROUPS(blk_ia_range); + +static ssize_t blk_ia_range_sysfs_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct blk_ia_range_sysfs_entry *entry = + container_of(attr, struct blk_ia_range_sysfs_entry, attr); + struct blk_independent_access_range *iar = + container_of(kobj, struct blk_independent_access_range, kobj); + + return entry->show(iar, buf); +} + +static const struct sysfs_ops blk_ia_range_sysfs_ops = { + .show = blk_ia_range_sysfs_show, +}; + +/* + * Independent access range entries are not freed individually, but all together + * with struct blk_independent_access_ranges and its array of ranges. Since + * kobject_add() takes a reference on the parent kobject contained in + * struct blk_independent_access_ranges, the array of independent access range + * entries cannot be freed until kobject_del() is called for all entries.
+ * So we do not need to do anything here, but still need this no-op release + * operation to avoid complaints from the kobject code. + */ +static void blk_ia_range_sysfs_nop_release(struct kobject *kobj) +{ +} + +static const struct kobj_type blk_ia_range_ktype = { + .sysfs_ops = &blk_ia_range_sysfs_ops, + .default_groups = blk_ia_range_groups, + .release = blk_ia_range_sysfs_nop_release, +}; + +/* + * This will be executed only after all independent access range entries are + * removed with kobject_del(), at which point, it is safe to free everything, + * including the array of ranges. + */ +static void blk_ia_ranges_sysfs_release(struct kobject *kobj) +{ + struct blk_independent_access_ranges *iars = + container_of(kobj, struct blk_independent_access_ranges, kobj); + + kfree(iars); +} + +static const struct kobj_type blk_ia_ranges_ktype = { + .release = blk_ia_ranges_sysfs_release, +}; + +/** + * disk_register_independent_access_ranges - register with sysfs a set of + * independent access ranges + * @disk: Target disk + * + * Register with sysfs a set of independent access ranges for @disk. + */ +int disk_register_independent_access_ranges(struct gendisk *disk) +{ + struct blk_independent_access_ranges *iars = disk->ia_ranges; + struct request_queue *q = disk->queue; + int i, ret; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + if (!iars) + return 0; + + /* + * At this point, iars is the new set of sector access ranges that needs + * to be registered with sysfs. 
+ */ + WARN_ON(iars->sysfs_registered); + ret = kobject_init_and_add(&iars->kobj, &blk_ia_ranges_ktype, + &disk->queue_kobj, "%s", + "independent_access_ranges"); + if (ret) { + disk->ia_ranges = NULL; + kobject_put(&iars->kobj); + return ret; + } + + for (i = 0; i < iars->nr_ia_ranges; i++) { + ret = kobject_init_and_add(&iars->ia_range[i].kobj, + &blk_ia_range_ktype, &iars->kobj, + "%d", i); + if (ret) { + while (--i >= 0) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + return ret; + } + } + + iars->sysfs_registered = true; + + return 0; +} + +void disk_unregister_independent_access_ranges(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct blk_independent_access_ranges *iars = disk->ia_ranges; + int i; + + lockdep_assert_held(&q->sysfs_dir_lock); + lockdep_assert_held(&q->sysfs_lock); + + if (!iars) + return; + + if (iars->sysfs_registered) { + for (i = 0; i < iars->nr_ia_ranges; i++) + kobject_del(&iars->ia_range[i].kobj); + kobject_del(&iars->kobj); + kobject_put(&iars->kobj); + } else { + kfree(iars); + } + + disk->ia_ranges = NULL; +} + +static struct blk_independent_access_range * +disk_find_ia_range(struct blk_independent_access_ranges *iars, + sector_t sector) +{ + struct blk_independent_access_range *iar; + int i; + + for (i = 0; i < iars->nr_ia_ranges; i++) { + iar = &iars->ia_range[i]; + if (sector >= iar->sector && + sector < iar->sector + iar->nr_sectors) + return iar; + } + + return NULL; +} + +static bool disk_check_ia_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct blk_independent_access_range *iar, *tmp; + sector_t capacity = get_capacity(disk); + sector_t sector = 0; + int i; + + if (WARN_ON_ONCE(!iars->nr_ia_ranges)) + return false; + + /* + * While sorting the ranges in increasing LBA order, check that the + * ranges do not overlap, that there are no sector holes and that all + * sectors belong to one range. 
+ */ + for (i = 0; i < iars->nr_ia_ranges; i++) { + tmp = disk_find_ia_range(iars, sector); + if (!tmp || tmp->sector != sector) { + pr_warn("Invalid non-contiguous independent access ranges\n"); + return false; + } + + iar = &iars->ia_range[i]; + if (tmp != iar) { + swap(iar->sector, tmp->sector); + swap(iar->nr_sectors, tmp->nr_sectors); + } + + sector += iar->nr_sectors; + } + + if (sector != capacity) { + pr_warn("Independent access ranges do not match disk capacity\n"); + return false; + } + + return true; +} + +static bool disk_ia_ranges_changed(struct gendisk *disk, + struct blk_independent_access_ranges *new) +{ + struct blk_independent_access_ranges *old = disk->ia_ranges; + int i; + + if (!old) + return true; + + if (old->nr_ia_ranges != new->nr_ia_ranges) + return true; + + for (i = 0; i < old->nr_ia_ranges; i++) { + if (new->ia_range[i].sector != old->ia_range[i].sector || + new->ia_range[i].nr_sectors != old->ia_range[i].nr_sectors) + return true; + } + + return false; +} + +/** + * disk_alloc_independent_access_ranges - Allocate an independent access ranges + * data structure + * @disk: target disk + * @nr_ia_ranges: Number of independent access ranges + * + * Allocate a struct blk_independent_access_ranges structure with @nr_ia_ranges + * access range descriptors. + */ +struct blk_independent_access_ranges * +disk_alloc_independent_access_ranges(struct gendisk *disk, int nr_ia_ranges) +{ + struct blk_independent_access_ranges *iars; + + iars = kzalloc_node(struct_size(iars, ia_range, nr_ia_ranges), + GFP_KERNEL, disk->queue->node); + if (iars) + iars->nr_ia_ranges = nr_ia_ranges; + return iars; +} +EXPORT_SYMBOL_GPL(disk_alloc_independent_access_ranges); + +/** + * disk_set_independent_access_ranges - Set a disk independent access ranges + * @disk: target disk + * @iars: independent access ranges structure + * + * Set the independent access ranges information of the request queue + * of @disk to @iars. 
If @iars is NULL, the independent access ranges + * structure already set is cleared. If there are no differences between + * @iars and the independent access ranges structure already set, @iars + * is freed. + */ +void disk_set_independent_access_ranges(struct gendisk *disk, + struct blk_independent_access_ranges *iars) +{ + struct request_queue *q = disk->queue; + + mutex_lock(&q->sysfs_dir_lock); + mutex_lock(&q->sysfs_lock); + if (iars && !disk_check_ia_ranges(disk, iars)) { + kfree(iars); + iars = NULL; + } + if (iars && !disk_ia_ranges_changed(disk, iars)) { + kfree(iars); + goto unlock; + } + + /* + * This may be called for a registered queue. E.g. during a device + * revalidation. If that is the case, we need to unregister the old + * set of independent access ranges and register the new set. If the + * queue is not registered, registration of the device request queue + * will register the independent access ranges. + */ + disk_unregister_independent_access_ranges(disk); + disk->ia_ranges = iars; + if (blk_queue_registered(q)) + disk_register_independent_access_ranges(disk); +unlock: + mutex_unlock(&q->sysfs_lock); + mutex_unlock(&q->sysfs_dir_lock); +} +EXPORT_SYMBOL_GPL(disk_set_independent_access_ranges); diff --git a/block/blk-integrity.c b/block/blk-integrity.c index c03705cbb9c9..d4e9b4556d14 100644 --- a/block/blk-integrity.c +++ b/block/blk-integrity.c @@ -6,7 +6,7 @@ * Written by: Martin K. 
Petersen <martin.petersen@oracle.com> */ -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/backing-dev.h> #include <linux/mempool.h> #include <linux/bio.h> @@ -183,7 +183,6 @@ bool blk_integrity_merge_rq(struct request_queue *q, struct request *req, return true; } -EXPORT_SYMBOL(blk_integrity_merge_rq); bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, struct bio *bio) @@ -212,63 +211,45 @@ bool blk_integrity_merge_bio(struct request_queue *q, struct request *req, return true; } -EXPORT_SYMBOL(blk_integrity_merge_bio); -struct integrity_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_integrity *, char *); - ssize_t (*store)(struct blk_integrity *, const char *, size_t); -}; - -static ssize_t integrity_attr_show(struct kobject *kobj, struct attribute *attr, - char *page) +static inline struct blk_integrity *dev_to_bi(struct device *dev) { - struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->queue->integrity; - struct integrity_sysfs_entry *entry = - container_of(attr, struct integrity_sysfs_entry, attr); - - return entry->show(bi, page); + return &dev_to_disk(dev)->queue->integrity; } -static ssize_t integrity_attr_store(struct kobject *kobj, - struct attribute *attr, const char *page, - size_t count) +static ssize_t format_show(struct device *dev, struct device_attribute *attr, + char *page) { - struct gendisk *disk = container_of(kobj, struct gendisk, integrity_kobj); - struct blk_integrity *bi = &disk->queue->integrity; - struct integrity_sysfs_entry *entry = - container_of(attr, struct integrity_sysfs_entry, attr); - ssize_t ret = 0; + struct blk_integrity *bi = dev_to_bi(dev); - if (entry->store) - ret = entry->store(bi, page, count); - - return ret; -} - -static ssize_t integrity_format_show(struct blk_integrity *bi, char *page) -{ if (bi->profile && bi->profile->name) - return sprintf(page, "%s\n", bi->profile->name); - else - 
return sprintf(page, "none\n"); + return sysfs_emit(page, "%s\n", bi->profile->name); + return sysfs_emit(page, "none\n"); } -static ssize_t integrity_tag_size_show(struct blk_integrity *bi, char *page) +static ssize_t tag_size_show(struct device *dev, struct device_attribute *attr, + char *page) { - return sprintf(page, "%u\n", bi->tag_size); + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%u\n", bi->tag_size); } -static ssize_t integrity_interval_show(struct blk_integrity *bi, char *page) +static ssize_t protection_interval_bytes_show(struct device *dev, + struct device_attribute *attr, + char *page) { - return sprintf(page, "%u\n", - bi->interval_exp ? 1 << bi->interval_exp : 0); + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%u\n", + bi->interval_exp ? 1 << bi->interval_exp : 0); } -static ssize_t integrity_verify_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t read_verify_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t count) { + struct blk_integrity *bi = dev_to_bi(dev); char *p = (char *) page; unsigned long val = simple_strtoul(p, &p, 10); @@ -280,14 +261,20 @@ static ssize_t integrity_verify_store(struct blk_integrity *bi, return count; } -static ssize_t integrity_verify_show(struct blk_integrity *bi, char *page) +static ssize_t read_verify_show(struct device *dev, + struct device_attribute *attr, char *page) { - return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_VERIFY) != 0); + struct blk_integrity *bi = dev_to_bi(dev); + + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_VERIFY)); } -static ssize_t integrity_generate_store(struct blk_integrity *bi, - const char *page, size_t count) +static ssize_t write_generate_store(struct device *dev, + struct device_attribute *attr, + const char *page, size_t count) { + struct blk_integrity *bi = dev_to_bi(dev); + char *p = (char *) page; unsigned long val = simple_strtoul(p, 
&p, 10); @@ -299,68 +286,44 @@ static ssize_t integrity_generate_store(struct blk_integrity *bi, return count; } -static ssize_t integrity_generate_show(struct blk_integrity *bi, char *page) +static ssize_t write_generate_show(struct device *dev, + struct device_attribute *attr, char *page) { - return sprintf(page, "%d\n", (bi->flags & BLK_INTEGRITY_GENERATE) != 0); -} + struct blk_integrity *bi = dev_to_bi(dev); -static ssize_t integrity_device_show(struct blk_integrity *bi, char *page) -{ - return sprintf(page, "%u\n", - (bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE) != 0); + return sysfs_emit(page, "%d\n", !!(bi->flags & BLK_INTEGRITY_GENERATE)); } -static struct integrity_sysfs_entry integrity_format_entry = { - .attr = { .name = "format", .mode = 0444 }, - .show = integrity_format_show, -}; - -static struct integrity_sysfs_entry integrity_tag_size_entry = { - .attr = { .name = "tag_size", .mode = 0444 }, - .show = integrity_tag_size_show, -}; - -static struct integrity_sysfs_entry integrity_interval_entry = { - .attr = { .name = "protection_interval_bytes", .mode = 0444 }, - .show = integrity_interval_show, -}; - -static struct integrity_sysfs_entry integrity_verify_entry = { - .attr = { .name = "read_verify", .mode = 0644 }, - .show = integrity_verify_show, - .store = integrity_verify_store, -}; +static ssize_t device_is_integrity_capable_show(struct device *dev, + struct device_attribute *attr, + char *page) +{ + struct blk_integrity *bi = dev_to_bi(dev); -static struct integrity_sysfs_entry integrity_generate_entry = { - .attr = { .name = "write_generate", .mode = 0644 }, - .show = integrity_generate_show, - .store = integrity_generate_store, -}; + return sysfs_emit(page, "%u\n", + !!(bi->flags & BLK_INTEGRITY_DEVICE_CAPABLE)); +} -static struct integrity_sysfs_entry integrity_device_entry = { - .attr = { .name = "device_is_integrity_capable", .mode = 0444 }, - .show = integrity_device_show, -}; +static DEVICE_ATTR_RO(format); +static DEVICE_ATTR_RO(tag_size); 
+static DEVICE_ATTR_RO(protection_interval_bytes); +static DEVICE_ATTR_RW(read_verify); +static DEVICE_ATTR_RW(write_generate); +static DEVICE_ATTR_RO(device_is_integrity_capable); static struct attribute *integrity_attrs[] = { - &integrity_format_entry.attr, - &integrity_tag_size_entry.attr, - &integrity_interval_entry.attr, - &integrity_verify_entry.attr, - &integrity_generate_entry.attr, - &integrity_device_entry.attr, - NULL, -}; -ATTRIBUTE_GROUPS(integrity); - -static const struct sysfs_ops integrity_ops = { - .show = &integrity_attr_show, - .store = &integrity_attr_store, + &dev_attr_format.attr, + &dev_attr_tag_size.attr, + &dev_attr_protection_interval_bytes.attr, + &dev_attr_read_verify.attr, + &dev_attr_write_generate.attr, + &dev_attr_device_is_integrity_capable.attr, + NULL }; -static struct kobj_type integrity_ktype = { - .default_groups = integrity_groups, - .sysfs_ops = &integrity_ops, +const struct attribute_group blk_integrity_attr_group = { + .name = "integrity", + .attrs = integrity_attrs, }; static blk_status_t blk_integrity_nop_fn(struct blk_integrity_iter *iter) @@ -408,12 +371,12 @@ void blk_integrity_register(struct gendisk *disk, struct blk_integrity *template bi->tuple_size = template->tuple_size; bi->tag_size = template->tag_size; - disk->queue->backing_dev_info->capabilities |= BDI_CAP_STABLE_WRITES; + blk_queue_flag_set(QUEUE_FLAG_STABLE_WRITES, disk->queue); #ifdef CONFIG_BLK_INLINE_ENCRYPTION - if (disk->queue->ksm) { + if (disk->queue->crypto_profile) { pr_warn("blk-integrity: Integrity and hardware inline encryption are not supported together. 
Disabling hardware inline encryption.\n"); - blk_ksm_unregister(disk->queue); + disk->queue->crypto_profile = NULL; } #endif } @@ -428,23 +391,14 @@ EXPORT_SYMBOL(blk_integrity_register); */ void blk_integrity_unregister(struct gendisk *disk) { - disk->queue->backing_dev_info->capabilities &= ~BDI_CAP_STABLE_WRITES; - memset(&disk->queue->integrity, 0, sizeof(struct blk_integrity)); -} -EXPORT_SYMBOL(blk_integrity_unregister); + struct blk_integrity *bi = &disk->queue->integrity; -void blk_integrity_add(struct gendisk *disk) -{ - if (kobject_init_and_add(&disk->integrity_kobj, &integrity_ktype, - &disk_to_dev(disk)->kobj, "%s", "integrity")) + if (!bi->profile) return; - kobject_uevent(&disk->integrity_kobj, KOBJ_ADD); -} - -void blk_integrity_del(struct gendisk *disk) -{ - kobject_uevent(&disk->integrity_kobj, KOBJ_REMOVE); - kobject_del(&disk->integrity_kobj); - kobject_put(&disk->integrity_kobj); + /* ensure all bios are off the integrity workqueue */ + blk_flush_integrity(); + blk_queue_flag_clear(QUEUE_FLAG_STABLE_WRITES, disk->queue); + memset(bi, 0, sizeof(*bi)); } +EXPORT_SYMBOL(blk_integrity_unregister); diff --git a/block/blk-ioc.c b/block/blk-ioc.c index 9df50fb507ca..25dd4db11121 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -8,22 +8,25 @@ #include <linux/bio.h> #include <linux/blkdev.h> #include <linux/slab.h> +#include <linux/security.h> #include <linux/sched/task.h> #include "blk.h" +#include "blk-mq-sched.h" /* * For io context allocations */ static struct kmem_cache *iocontext_cachep; +#ifdef CONFIG_BLK_ICQ /** * get_io_context - increment reference count to io_context * @ioc: io_context to get * * Increment reference count to @ioc. 
*/ -void get_io_context(struct io_context *ioc) +static void get_io_context(struct io_context *ioc) { BUG_ON(atomic_long_read(&ioc->refcount) <= 0); atomic_long_inc(&ioc->refcount); @@ -53,6 +56,16 @@ static void ioc_exit_icq(struct io_cq *icq) icq->flags |= ICQ_EXITED; } +static void ioc_exit_icqs(struct io_context *ioc) +{ + struct io_cq *icq; + + spin_lock_irq(&ioc->lock); + hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) + ioc_exit_icq(icq); + spin_unlock_irq(&ioc->lock); +} + /* * Release an icq. Called with ioc locked for blk-mq, and with both ioc * and queue locked for legacy. @@ -64,6 +77,10 @@ static void ioc_destroy_icq(struct io_cq *icq) struct elevator_type *et = q->elevator->type; lockdep_assert_held(&ioc->lock); + lockdep_assert_held(&q->queue_lock); + + if (icq->flags & ICQ_DESTROYED) + return; radix_tree_delete(&ioc->icq_tree, icq->q->id); hlist_del_init(&icq->ioc_node); @@ -96,15 +113,7 @@ static void ioc_release_fn(struct work_struct *work) { struct io_context *ioc = container_of(work, struct io_context, release_work); - unsigned long flags; - - /* - * Exiting icq may call into put_io_context() through elevator - * which will trigger lockdep warning. The ioc's are guaranteed to - * be different, use a different locking subclass here. Use - * irqsave variant as there's no spin_lock_irq_nested(). - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + spin_lock_irq(&ioc->lock); while (!hlist_empty(&ioc->icq_list)) { struct io_cq *icq = hlist_entry(ioc->icq_list.first, @@ -115,85 +124,91 @@ static void ioc_release_fn(struct work_struct *work) ioc_destroy_icq(icq); spin_unlock(&q->queue_lock); } else { - spin_unlock_irqrestore(&ioc->lock, flags); - cpu_relax(); - spin_lock_irqsave_nested(&ioc->lock, flags, 1); + /* Make sure q and icq cannot be freed. */ + rcu_read_lock(); + + /* Re-acquire the locks in the correct order. 
*/ + spin_unlock(&ioc->lock); + spin_lock(&q->queue_lock); + spin_lock(&ioc->lock); + + ioc_destroy_icq(icq); + + spin_unlock(&q->queue_lock); + rcu_read_unlock(); } } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&ioc->lock); kmem_cache_free(iocontext_cachep, ioc); } -/** - * put_io_context - put a reference of io_context - * @ioc: io_context to put - * - * Decrement reference count of @ioc and release it if the count reaches - * zero. +/* + * Releasing icqs requires reverse order double locking and we may already be + * holding a queue_lock. Do it asynchronously from a workqueue. */ -void put_io_context(struct io_context *ioc) +static bool ioc_delay_free(struct io_context *ioc) { unsigned long flags; - bool free_ioc = false; - - if (ioc == NULL) - return; - - BUG_ON(atomic_long_read(&ioc->refcount) <= 0); - /* - * Releasing ioc requires reverse order double locking and we may - * already be holding a queue_lock. Do it asynchronously from wq. - */ - if (atomic_long_dec_and_test(&ioc->refcount)) { - spin_lock_irqsave(&ioc->lock, flags); - if (!hlist_empty(&ioc->icq_list)) - queue_work(system_power_efficient_wq, - &ioc->release_work); - else - free_ioc = true; + spin_lock_irqsave(&ioc->lock, flags); + if (!hlist_empty(&ioc->icq_list)) { + queue_work(system_power_efficient_wq, &ioc->release_work); spin_unlock_irqrestore(&ioc->lock, flags); + return true; } - - if (free_ioc) - kmem_cache_free(iocontext_cachep, ioc); + spin_unlock_irqrestore(&ioc->lock, flags); + return false; } /** - * put_io_context_active - put active reference on ioc - * @ioc: ioc of interest + * ioc_clear_queue - break any ioc association with the specified queue + * @q: request_queue being cleared * - * Undo get_io_context_active(). If active reference reaches zero after - * put, @ioc can never issue further IOs and ioscheds are notified. + * Walk @q->icq_list and exit all io_cq's. 
*/ -void put_io_context_active(struct io_context *ioc) +void ioc_clear_queue(struct request_queue *q) { - unsigned long flags; - struct io_cq *icq; - - if (!atomic_dec_and_test(&ioc->active_ref)) { - put_io_context(ioc); - return; - } - - /* - * Need ioc lock to walk icq_list and q lock to exit icq. Perform - * reverse double locking. Read comment in ioc_release_fn() for - * explanation on the nested locking annotation. - */ - spin_lock_irqsave_nested(&ioc->lock, flags, 1); - hlist_for_each_entry(icq, &ioc->icq_list, ioc_node) { - if (icq->flags & ICQ_EXITED) - continue; - - ioc_exit_icq(icq); + spin_lock_irq(&q->queue_lock); + while (!list_empty(&q->icq_list)) { + struct io_cq *icq = + list_first_entry(&q->icq_list, struct io_cq, q_node); + + /* + * Other context won't hold ioc lock to wait for queue_lock, see + * details in ioc_release_fn(). + */ + spin_lock(&icq->ioc->lock); + ioc_destroy_icq(icq); + spin_unlock(&icq->ioc->lock); } - spin_unlock_irqrestore(&ioc->lock, flags); + spin_unlock_irq(&q->queue_lock); +} +#else /* CONFIG_BLK_ICQ */ +static inline void ioc_exit_icqs(struct io_context *ioc) +{ +} +static inline bool ioc_delay_free(struct io_context *ioc) +{ + return false; +} +#endif /* CONFIG_BLK_ICQ */ - put_io_context(ioc); +/** + * put_io_context - put a reference of io_context + * @ioc: io_context to put + * + * Decrement reference count of @ioc and release it if the count reaches + * zero. 
+ */ +void put_io_context(struct io_context *ioc) +{ + BUG_ON(atomic_long_read(&ioc->refcount) <= 0); + if (atomic_long_dec_and_test(&ioc->refcount) && !ioc_delay_free(ioc)) + kmem_cache_free(iocontext_cachep, ioc); } +EXPORT_SYMBOL_GPL(put_io_context); /* Called by the exiting task */ void exit_io_context(struct task_struct *task) @@ -205,132 +220,110 @@ void exit_io_context(struct task_struct *task) task->io_context = NULL; task_unlock(task); - atomic_dec(&ioc->nr_tasks); - put_io_context_active(ioc); -} - -static void __ioc_clear_queue(struct list_head *icq_list) -{ - unsigned long flags; - - rcu_read_lock(); - while (!list_empty(icq_list)) { - struct io_cq *icq = list_entry(icq_list->next, - struct io_cq, q_node); - struct io_context *ioc = icq->ioc; - - spin_lock_irqsave(&ioc->lock, flags); - if (icq->flags & ICQ_DESTROYED) { - spin_unlock_irqrestore(&ioc->lock, flags); - continue; - } - ioc_destroy_icq(icq); - spin_unlock_irqrestore(&ioc->lock, flags); + if (atomic_dec_and_test(&ioc->active_ref)) { + ioc_exit_icqs(ioc); + put_io_context(ioc); } - rcu_read_unlock(); } -/** - * ioc_clear_queue - break any ioc association with the specified queue - * @q: request_queue being cleared - * - * Walk @q->icq_list and exit all io_cq's. 
- */ -void ioc_clear_queue(struct request_queue *q) -{ - LIST_HEAD(icq_list); - - spin_lock_irq(&q->queue_lock); - list_splice_init(&q->icq_list, &icq_list); - spin_unlock_irq(&q->queue_lock); - - __ioc_clear_queue(&icq_list); -} - -int create_task_io_context(struct task_struct *task, gfp_t gfp_flags, int node) +static struct io_context *alloc_io_context(gfp_t gfp_flags, int node) { struct io_context *ioc; - int ret; ioc = kmem_cache_alloc_node(iocontext_cachep, gfp_flags | __GFP_ZERO, node); if (unlikely(!ioc)) - return -ENOMEM; + return NULL; - /* initialize */ atomic_long_set(&ioc->refcount, 1); - atomic_set(&ioc->nr_tasks, 1); atomic_set(&ioc->active_ref, 1); +#ifdef CONFIG_BLK_ICQ spin_lock_init(&ioc->lock); INIT_RADIX_TREE(&ioc->icq_tree, GFP_ATOMIC); INIT_HLIST_HEAD(&ioc->icq_list); INIT_WORK(&ioc->release_work, ioc_release_fn); +#endif + ioc->ioprio = IOPRIO_DEFAULT; - /* - * Try to install. ioc shouldn't be installed if someone else - * already did or @task, which isn't %current, is exiting. Note - * that we need to allow ioc creation on exiting %current as exit - * path may issue IOs from e.g. exit_files(). The exit path is - * responsible for not issuing IO after exit_io_context(). - */ - task_lock(task); - if (!task->io_context && - (task == current || !(task->flags & PF_EXITING))) - task->io_context = ioc; - else - kmem_cache_free(iocontext_cachep, ioc); + return ioc; +} - ret = task->io_context ? 
0 : -EBUSY; +int set_task_ioprio(struct task_struct *task, int ioprio) +{ + int err; + const struct cred *cred = current_cred(), *tcred; - task_unlock(task); + rcu_read_lock(); + tcred = __task_cred(task); + if (!uid_eq(tcred->uid, cred->euid) && + !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { + rcu_read_unlock(); + return -EPERM; + } + rcu_read_unlock(); - return ret; -} + err = security_task_setioprio(task, ioprio); + if (err) + return err; -/** - * get_task_io_context - get io_context of a task - * @task: task of interest - * @gfp_flags: allocation flags, used if allocation is necessary - * @node: allocation node, used if allocation is necessary - * - * Return io_context of @task. If it doesn't exist, it is created with - * @gfp_flags and @node. The returned io_context has its reference count - * incremented. - * - * This function always goes through task_lock() and it's better to use - * %current->io_context + get_io_context() for %current. - */ -struct io_context *get_task_io_context(struct task_struct *task, - gfp_t gfp_flags, int node) -{ - struct io_context *ioc; + task_lock(task); + if (unlikely(!task->io_context)) { + struct io_context *ioc; + + task_unlock(task); - might_sleep_if(gfpflags_allow_blocking(gfp_flags)); + ioc = alloc_io_context(GFP_ATOMIC, NUMA_NO_NODE); + if (!ioc) + return -ENOMEM; - do { task_lock(task); - ioc = task->io_context; - if (likely(ioc)) { - get_io_context(ioc); - task_unlock(task); - return ioc; + if (task->flags & PF_EXITING) { + kmem_cache_free(iocontext_cachep, ioc); + goto out; } - task_unlock(task); - } while (!create_task_io_context(task, gfp_flags, node)); + if (task->io_context) + kmem_cache_free(iocontext_cachep, ioc); + else + task->io_context = ioc; + } + task->io_context->ioprio = ioprio; +out: + task_unlock(task); + return 0; +} +EXPORT_SYMBOL_GPL(set_task_ioprio); + +int __copy_io(unsigned long clone_flags, struct task_struct *tsk) +{ + struct io_context *ioc = current->io_context; - return NULL; + 
/* + * Share io context with parent, if CLONE_IO is set + */ + if (clone_flags & CLONE_IO) { + atomic_inc(&ioc->active_ref); + tsk->io_context = ioc; + } else if (ioprio_valid(ioc->ioprio)) { + tsk->io_context = alloc_io_context(GFP_KERNEL, NUMA_NO_NODE); + if (!tsk->io_context) + return -ENOMEM; + tsk->io_context->ioprio = ioc->ioprio; + } + + return 0; } +#ifdef CONFIG_BLK_ICQ /** * ioc_lookup_icq - lookup io_cq from ioc - * @ioc: the associated io_context * @q: the associated request_queue * * Look up io_cq associated with @ioc - @q pair from @ioc. Must be called * with @q->queue_lock held. */ -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q) +struct io_cq *ioc_lookup_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct io_cq *icq; lockdep_assert_held(&q->queue_lock); @@ -359,9 +352,7 @@ EXPORT_SYMBOL(ioc_lookup_icq); /** * ioc_create_icq - create and link io_cq - * @ioc: io_context of interest * @q: request_queue of interest - * @gfp_mask: allocation mask * * Make sure io_cq linking @ioc and @q exists. If icq doesn't exist, they * will be created using @gfp_mask. @@ -369,19 +360,19 @@ EXPORT_SYMBOL(ioc_lookup_icq); * The caller is responsible for ensuring @ioc won't go away and @q is * alive and will stay alive until this function returns. 
*/ -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask) +static struct io_cq *ioc_create_icq(struct request_queue *q) { + struct io_context *ioc = current->io_context; struct elevator_type *et = q->elevator->type; struct io_cq *icq; /* allocate stuff */ - icq = kmem_cache_alloc_node(et->icq_cache, gfp_mask | __GFP_ZERO, + icq = kmem_cache_alloc_node(et->icq_cache, GFP_ATOMIC | __GFP_ZERO, q->node); if (!icq) return NULL; - if (radix_tree_maybe_preload(gfp_mask) < 0) { + if (radix_tree_maybe_preload(GFP_ATOMIC) < 0) { kmem_cache_free(et->icq_cache, icq); return NULL; } @@ -402,7 +393,7 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, et->ops.init_icq(icq); } else { kmem_cache_free(et->icq_cache, icq); - icq = ioc_lookup_icq(ioc, q); + icq = ioc_lookup_icq(q); if (!icq) printk(KERN_ERR "cfq: icq link failed!\n"); } @@ -413,6 +404,46 @@ struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, return icq; } +struct io_cq *ioc_find_get_icq(struct request_queue *q) +{ + struct io_context *ioc = current->io_context; + struct io_cq *icq = NULL; + + if (unlikely(!ioc)) { + ioc = alloc_io_context(GFP_ATOMIC, q->node); + if (!ioc) + return NULL; + + task_lock(current); + if (current->io_context) { + kmem_cache_free(iocontext_cachep, ioc); + ioc = current->io_context; + } else { + current->io_context = ioc; + } + + get_io_context(ioc); + task_unlock(current); + } else { + get_io_context(ioc); + + spin_lock_irq(&q->queue_lock); + icq = ioc_lookup_icq(q); + spin_unlock_irq(&q->queue_lock); + } + + if (!icq) { + icq = ioc_create_icq(q); + if (!icq) { + put_io_context(ioc); + return NULL; + } + } + return icq; +} +EXPORT_SYMBOL_GPL(ioc_find_get_icq); +#endif /* CONFIG_BLK_ICQ */ + static int __init blk_ioc_init(void) { iocontext_cachep = kmem_cache_create("blkdev_ioc", diff --git a/block/blk-iocost.c b/block/blk-iocost.c index 8ac4aad66ebc..04d44f0bcbc8 100644 --- a/block/blk-iocost.c 
+++ b/block/blk-iocost.c @@ -39,7 +39,7 @@ * On top of that, a size cost proportional to the length of the IO is * added. While simple, this model captures the operational * characteristics of a wide varienty of devices well enough. Default - * paramters for several different classes of devices are provided and the + * parameters for several different classes of devices are provided and the * parameters can be configured from userspace via * /sys/fs/cgroup/io.cost.model. * @@ -68,7 +68,7 @@ * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest, * 12.5% each. The distribution mechanism only cares about these flattened * shares. They're called hweights (hierarchical weights) and always add - * upto 1 (HWEIGHT_WHOLE). + * upto 1 (WEIGHT_ONE). * * A given cgroup's vtime runs slower in inverse proportion to its hweight. * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5) @@ -77,7 +77,7 @@ * * This constitutes the basis of IO capacity distribution. Each cgroup's * vtime is running at a rate determined by its hweight. A cgroup tracks - * the vtime consumed by past IOs and can issue a new IO iff doing so + * the vtime consumed by past IOs and can issue a new IO if doing so * wouldn't outrun the current device vtime. Otherwise, the IO is * suspended until the vtime has progressed enough to cover it. * @@ -111,7 +111,7 @@ * busy signal. * * As devices can have deep queues and be unfair in how the queued commands - * are executed, soley depending on rq wait may not result in satisfactory + * are executed, solely depending on rq wait may not result in satisfactory * control quality. For a better control quality, completion latency QoS * parameters can be configured so that the device is considered saturated * if N'th percentile completion latency rises above the set point. 
@@ -155,7 +155,7 @@ * Instead of debugfs or other clumsy monitoring mechanisms, this * controller uses a drgn based monitoring script - * tools/cgroup/iocost_monitor.py. For details on drgn, please see - * https://github.com/osandov/drgn. The ouput looks like the following. + * https://github.com/osandov/drgn. The output looks like the following. * * sdb RUN per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12% * active weight hweight% inflt% dbt delay usages% @@ -178,10 +178,12 @@ #include <linux/time64.h> #include <linux/parser.h> #include <linux/sched/signal.h> -#include <linux/blk-cgroup.h> +#include <asm/local.h> +#include <asm/local64.h> #include "blk-rq-qos.h" #include "blk-stat.h" #include "blk-wbt.h" +#include "blk-cgroup.h" #ifdef CONFIG_TRACEPOINTS @@ -215,37 +217,24 @@ enum { MAX_PERIOD = USEC_PER_SEC, /* - * A cgroup's vtime can run 50% behind the device vtime, which + * iocg->vtime is targeted at 50% behind the device vtime, which * serves as its IO credit buffer. Surplus weight adjustment is * immediately canceled if the vtime margin runs below 10%. */ - MARGIN_PCT = 50, - INUSE_MARGIN_PCT = 10, + MARGIN_MIN_PCT = 10, + MARGIN_LOW_PCT = 20, + MARGIN_TARGET_PCT = 50, - /* Have some play in waitq timer operations */ - WAITQ_TIMER_MARGIN_PCT = 5, + INUSE_ADJ_STEP_PCT = 25, - /* - * vtime can wrap well within a reasonable uptime when vrate is - * consistently raised. Don't trust recorded cgroup vtime if the - * period counter indicates that it's older than 5mins. - */ - VTIME_VALID_DUR = 300 * USEC_PER_SEC, - - /* - * Remember the past three non-zero usages and use the max for - * surplus calculation. Three slots guarantee that we remember one - * full period usage from the last active stretch even after - * partial deactivation and re-activation periods. Don't start - * giving away weight before collecting two data points to prevent - * hweight adjustments based on one partial activation period. 
- */ - NR_USAGE_SLOTS = 3, - MIN_VALID_USAGES = 2, + /* Have some play in timer operations */ + TIMER_SLACK_PCT = 1, /* 1/64k is granular enough and can easily be handled w/ u32 */ - HWEIGHT_WHOLE = 1 << 16, + WEIGHT_ONE = 1 << 16, +}; +enum { /* * As vtime is used to calculate the cost of each IO, it needs to * be fairly high precision. For example, it should be able to @@ -269,25 +258,48 @@ enum { VRATE_MIN = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION, VRATE_CLAMP_ADJ_PCT = 4, + /* switch iff the conditions are met for longer than this */ + AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, +}; + +enum { /* if IOs end up waiting for requests, issue less */ RQ_WAIT_BUSY_PCT = 5, /* unbusy hysterisis */ UNBUSY_THR_PCT = 75, - /* don't let cmds which take a very long time pin lagging for too long */ - MAX_LAGGING_PERIODS = 10, - /* - * If usage% * 1.25 + 2% is lower than hweight% by more than 3%, - * donate the surplus. + * The effect of delay is indirect and non-linear and a huge amount of + * future debt can accumulate abruptly while unthrottled. Linearly scale + * up delay as debt is going up and then let it decay exponentially. + * This gives us quick ramp ups while delay is accumulating and long + * tails which can help reducing the frequency of debt explosions on + * unthrottle. The parameters are experimentally determined. + * + * The delay mechanism provides adequate protection and behavior in many + * cases. However, this is far from ideal and falls shorts on both + * fronts. The debtors are often throttled too harshly costing a + * significant level of fairness and possibly total work while the + * protection against their impacts on the system can be choppy and + * unreliable. + * + * The shortcoming primarily stems from the fact that, unlike for page + * cache, the kernel doesn't have well-defined back-pressure propagation + * mechanism and policies for anonymous memory. Fully addressing this + * issue will likely require substantial improvements in the area. 
*/ - SURPLUS_SCALE_PCT = 125, /* * 125% */ - SURPLUS_SCALE_ABS = HWEIGHT_WHOLE / 50, /* + 2% */ - SURPLUS_MIN_ADJ_DELTA = HWEIGHT_WHOLE / 33, /* 3% */ + MIN_DELAY_THR_PCT = 500, + MAX_DELAY_THR_PCT = 25000, + MIN_DELAY = 250, + MAX_DELAY = 250 * USEC_PER_MSEC, - /* switch iff the conditions are met for longer than this */ - AUTOP_CYCLE_NSEC = 10LLU * NSEC_PER_SEC, + /* halve debts if avg usage over 100ms is under 50% */ + DFGV_USAGE_PCT = 50, + DFGV_PERIOD = 100 * USEC_PER_MSEC, + + /* don't let cmds which take a very long time pin lagging for too long */ + MAX_LAGGING_PERIODS = 10, /* * Count IO size in 4k pages. The 12bit shift helps keeping @@ -362,8 +374,6 @@ enum { AUTOP_SSD_FAST, }; -struct ioc_gq; - struct ioc_params { u32 qos[NR_QOS_PARAMS]; u64 i_lcoefs[NR_I_LCOEFS]; @@ -372,9 +382,15 @@ struct ioc_params { u32 too_slow_vrate_pct; }; +struct ioc_margins { + s64 min; + s64 low; + s64 target; +}; + struct ioc_missed { - u32 nr_met; - u32 nr_missed; + local_t nr_met; + local_t nr_missed; u32 last_met; u32 last_missed; }; @@ -382,7 +398,7 @@ struct ioc_missed { struct ioc_pcpu_stat { struct ioc_missed missed[2]; - u64 rq_wait_ns; + local64_t rq_wait_ns; u64 last_rq_wait_ns; }; @@ -393,8 +409,9 @@ struct ioc { bool enabled; struct ioc_params params; + struct ioc_margins margins; u32 period_us; - u32 margin_us; + u32 timer_slack_ns; u64 vrate_min; u64 vrate_max; @@ -405,18 +422,24 @@ struct ioc { enum ioc_running running; atomic64_t vtime_rate; + u64 vtime_base_rate; + s64 vtime_err; - seqcount_t period_seqcount; - u32 period_at; /* wallclock starttime */ + seqcount_spinlock_t period_seqcount; + u64 period_at; /* wallclock starttime */ u64 period_at_vtime; /* vtime starttime */ atomic64_t cur_period; /* inc'd each period */ int busy_level; /* saturation history */ - u64 inuse_margin_vtime; bool weights_updated; atomic_t hweight_gen; /* for lazy hweights */ + /* debt forgivness */ + u64 dfgv_period_at; + u64 dfgv_period_rem; + u64 dfgv_usage_us_sum; + u64 
autop_too_fast_at; u64 autop_too_slow_at; int autop_idx; @@ -424,6 +447,17 @@ struct ioc { bool user_cost_model:1; }; +struct iocg_pcpu_stat { + local64_t abs_vusage; +}; + +struct iocg_stat { + u64 usage_us; + u64 wait_us; + u64 indebt_us; + u64 indelay_us; +}; + /* per device-cgroup pair */ struct ioc_gq { struct blkg_policy_data pd; @@ -443,32 +477,37 @@ struct ioc_gq { * * `last_inuse` remembers `inuse` while an iocg is idle to persist * surplus adjustments. + * + * `inuse` may be adjusted dynamically during period. `saved_*` are used + * to determine and track adjustments. */ u32 cfg_weight; u32 weight; u32 active; u32 inuse; + u32 last_inuse; + s64 saved_margin; sector_t cursor; /* to detect randio */ /* * `vtime` is this iocg's vtime cursor which progresses as IOs are * issued. If lagging behind device vtime, the delta represents - * the currently available IO budget. If runnning ahead, the + * the currently available IO budget. If running ahead, the * overage. * * `vtime_done` is the same but progressed on completion rather * than issue. The delta behind `vtime` represents the cost of * currently in-flight IOs. - * - * `last_vtime` is used to remember `vtime` at the end of the last - * period to calculate utilization. */ atomic64_t vtime; atomic64_t done_vtime; u64 abs_vdebt; - u64 last_vtime; + + /* current delay in effect and when it started */ + u64 delay; + u64 delay_at; /* * The period this iocg was last active in. 
Used for deactivation @@ -477,21 +516,34 @@ struct ioc_gq { atomic64_t active_period; struct list_head active_list; - /* see __propagate_active_weight() and current_hweight() for details */ + /* see __propagate_weights() and current_hweight() for details */ u64 child_active_sum; u64 child_inuse_sum; + u64 child_adjusted_sum; int hweight_gen; u32 hweight_active; u32 hweight_inuse; - bool has_surplus; + u32 hweight_donating; + u32 hweight_after_donation; + + struct list_head walk_list; + struct list_head surplus_list; struct wait_queue_head waitq; struct hrtimer waitq_timer; - struct hrtimer delay_timer; - /* usage is recorded as fractions of HWEIGHT_WHOLE */ - int usage_idx; - u32 usages[NR_USAGE_SLOTS]; + /* timestamp at the latest activation */ + u64 activated_at; + + /* statistics */ + struct iocg_pcpu_stat __percpu *pcpu_stat; + struct iocg_stat stat; + struct iocg_stat last_stat; + u64 last_stat_abs_vusage; + u64 usage_delta_us; + u64 wait_since; + u64 indebt_since; + u64 indelay_since; /* this iocg's depth in the hierarchy and ancestors including self */ int level; @@ -506,9 +558,8 @@ struct ioc_cgrp { struct ioc_now { u64 now_ns; - u32 now; + u64 now; u64 vnow; - u64 vrate; }; struct iocg_wait { @@ -616,17 +667,13 @@ static struct ioc *q_to_ioc(struct request_queue *q) return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST)); } -static const char *q_name(struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)) - return kobject_name(q->kobj.parent); - else - return "<unknown>"; -} - static const char __maybe_unused *ioc_name(struct ioc *ioc) { - return q_name(ioc->rqos.q); + struct gendisk *disk = ioc->rqos.disk; + + if (!disk) + return "<unknown>"; + return disk->disk_name; } static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd) @@ -656,7 +703,7 @@ static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg) */ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse); + 
return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse); } /* @@ -664,18 +711,56 @@ static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse) */ static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) { - return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE); + return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); } -static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost) +static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, + u64 abs_cost, u64 cost) { + struct iocg_pcpu_stat *gcs; + bio->bi_iocost_cost = cost; atomic64_add(cost, &iocg->vtime); + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags) +{ + if (lock_ioc) { + spin_lock_irqsave(&iocg->ioc->lock, *flags); + spin_lock(&iocg->waitq.lock); + } else { + spin_lock_irqsave(&iocg->waitq.lock, *flags); + } +} + +static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags) +{ + if (unlock_ioc) { + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&iocg->ioc->lock, *flags); + } else { + spin_unlock_irqrestore(&iocg->waitq.lock, *flags); + } } #define CREATE_TRACE_POINTS #include <trace/events/iocost.h> +static void ioc_refresh_margins(struct ioc *ioc) +{ + struct ioc_margins *margins = &ioc->margins; + u32 period_us = ioc->period_us; + u64 vrate = ioc->vtime_base_rate; + + margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; + margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; + margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; +} + /* latency Qos params changed, update period_us and all the dependent params */ static void ioc_refresh_period_us(struct ioc *ioc) { @@ -709,12 +794,17 @@ static void ioc_refresh_period_us(struct ioc *ioc) /* calculate dependent params */ ioc->period_us = period_us; - ioc->margin_us = period_us * MARGIN_PCT / 100; - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( 
- period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100); + ioc->timer_slack_ns = div64_u64( + (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT, + 100); + ioc_refresh_margins(ioc); } -static int ioc_autop_idx(struct ioc *ioc) +/* + * ioc->rqos.disk isn't initialized when this function is called from + * the init path. + */ +static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; @@ -722,11 +812,11 @@ static int ioc_autop_idx(struct ioc *ioc) u64 now_ns; /* rotational? */ - if (!blk_queue_nonrot(ioc->rqos.q)) + if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ - if (blk_queue_depth(ioc->rqos.q) == 1) + if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ @@ -738,8 +828,7 @@ static int ioc_autop_idx(struct ioc *ioc) return idx; /* step up/down based on the vrate */ - vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100, - VTIME_PER_USEC); + vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); now_ns = ktime_get_ns(); if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { @@ -783,9 +872,14 @@ static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops, *page = *seqio = *randio = 0; - if (bps) - *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, - DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE)); + if (bps) { + u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE); + + if (bps_pages) + *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages); + else + *page = 1; + } if (seqiops) { v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops); @@ -811,21 +905,28 @@ static void ioc_refresh_lcoefs(struct ioc *ioc) &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } -static bool ioc_refresh_params(struct ioc *ioc, bool force) +/* + * struct gendisk is required as an argument because ioc->rqos.disk + * is not properly initialized when called from the init path. 
+ */ +static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, + struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); - idx = ioc_autop_idx(ioc); + idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) return false; - if (idx != ioc->autop_idx) + if (idx != ioc->autop_idx) { atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); + ioc->vtime_base_rate = VTIME_PER_USEC; + } ioc->autop_idx = idx; ioc->autop_too_fast_at = 0; @@ -841,20 +942,111 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * VTIME_PER_USEC, MILLION); - ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] * - VTIME_PER_USEC, MILLION); + ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * + VTIME_PER_USEC, MILLION); return true; } +static bool ioc_refresh_params(struct ioc *ioc, bool force) +{ + return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); +} + +/* + * When an iocg accumulates too much vtime or gets deactivated, we throw away + * some vtime, which lowers the overall device utilization. As the exact amount + * which is being thrown away is known, we can compensate by accelerating the + * vrate accordingly so that the extra vtime generated in the current period + * matches what got lost. + */ +static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now) +{ + s64 pleft = ioc->period_at + ioc->period_us - now->now; + s64 vperiod = ioc->period_us * ioc->vtime_base_rate; + s64 vcomp, vcomp_min, vcomp_max; + + lockdep_assert_held(&ioc->lock); + + /* we need some time left in this period */ + if (pleft <= 0) + goto done; + + /* + * Calculate how much vrate should be adjusted to offset the error. + * Limit the amount of adjustment and deduct the adjusted amount from + * the error. 
+ */ + vcomp = -div64_s64(ioc->vtime_err, pleft); + vcomp_min = -(ioc->vtime_base_rate >> 1); + vcomp_max = ioc->vtime_base_rate; + vcomp = clamp(vcomp, vcomp_min, vcomp_max); + + ioc->vtime_err += vcomp * pleft; + + atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); +done: + /* bound how much error can accumulate */ + ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); +} + +static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct, + int nr_lagging, int nr_shortages, + int prev_busy_level, u32 *missed_ppm) +{ + u64 vrate = ioc->vtime_base_rate; + u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; + + if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { + if (ioc->busy_level != prev_busy_level || nr_lagging) + trace_iocost_ioc_vrate_adj(ioc, vrate, + missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + return; + } + + /* + * If vrate is out of bounds, apply clamp gradually as the + * bounds can change abruptly. Otherwise, apply busy_level + * based adjustment. 
+ */ + if (vrate < vrate_min) { + vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100); + vrate = min(vrate, vrate_min); + } else if (vrate > vrate_max) { + vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); + vrate = max(vrate, vrate_max); + } else { + int idx = min_t(int, abs(ioc->busy_level), + ARRAY_SIZE(vrate_adj_pct) - 1); + u32 adj_pct = vrate_adj_pct[idx]; + + if (ioc->busy_level > 0) + adj_pct = 100 - adj_pct; + else + adj_pct = 100 + adj_pct; + + vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), + vrate_min, vrate_max); + } + + trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, + nr_lagging, nr_shortages); + + ioc->vtime_base_rate = vrate; + ioc_refresh_margins(ioc); +} + /* take a snapshot of the current [v]time and vrate */ static void ioc_now(struct ioc *ioc, struct ioc_now *now) { unsigned seq; + u64 vrate; now->now_ns = ktime_get(); now->now = ktime_to_us(now->now_ns); - now->vrate = atomic64_read(&ioc->vtime_rate); + vrate = atomic64_read(&ioc->vtime_rate); /* * The current vtime is @@ -867,13 +1059,12 @@ static void ioc_now(struct ioc *ioc, struct ioc_now *now) do { seq = read_seqcount_begin(&ioc->period_seqcount); now->vnow = ioc->period_at_vtime + - (now->now - ioc->period_at) * now->vrate; + (now->now - ioc->period_at) * vrate; } while (read_seqcount_retry(&ioc->period_seqcount, seq)); } static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) { - lockdep_assert_held(&ioc->lock); WARN_ON_ONCE(ioc->running != IOC_RUNNING); write_seqcount_begin(&ioc->period_seqcount); @@ -887,16 +1078,35 @@ static void ioc_start_period(struct ioc *ioc, struct ioc_now *now) /* * Update @iocg's `active` and `inuse` to @active and @inuse, update level - * weight sums and propagate upwards accordingly. + * weight sums and propagate upwards accordingly. If @save, the current margin + * is saved to be used as reference for later inuse in-period adjustments. 
*/ -static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; int lvl; lockdep_assert_held(&ioc->lock); - inuse = min(active, inuse); + /* + * For an active leaf node, its inuse shouldn't be zero or exceed + * @active. An active internal node's inuse is solely determined by the + * inuse to active ratio of its children regardless of @inuse. + */ + if (list_empty(&iocg->active_list) && iocg->child_active_sum) { + inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, + iocg->child_active_sum); + } else { + inuse = clamp_t(u32, inuse, 1, active); + } + + iocg->last_inuse = iocg->inuse; + if (save) + iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); + + if (active == iocg->active && inuse == iocg->inuse) + return; for (lvl = iocg->level - 1; lvl >= 0; lvl--) { struct ioc_gq *parent = iocg->ancestors[lvl]; @@ -906,13 +1116,13 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse /* update the level sums */ parent->child_active_sum += (s32)(active - child->active); parent->child_inuse_sum += (s32)(inuse - child->inuse); - /* apply the udpates */ + /* apply the updates */ child->active = active; child->inuse = inuse; /* * The delta between inuse and active sums indicates that - * that much of weight is being given away. Parent's inuse + * much of weight is being given away. Parent's inuse * and active should reflect the ratio. 
*/ if (parent->child_active_sum) { @@ -934,7 +1144,7 @@ static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse ioc->weights_updated = true; } -static void commit_active_weights(struct ioc *ioc) +static void commit_weights(struct ioc *ioc) { lockdep_assert_held(&ioc->lock); @@ -946,10 +1156,11 @@ static void commit_active_weights(struct ioc *ioc) } } -static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse) +static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse, + bool save, struct ioc_now *now) { - __propagate_active_weight(iocg, active, inuse); - commit_active_weights(iocg->ioc); + __propagate_weights(iocg, active, inuse, save, now); + commit_weights(iocg->ioc); } static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep) @@ -965,9 +1176,9 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep goto out; /* - * Paired with wmb in commit_active_weights(). If we saw the - * updated hweight_gen, all the weight updates from - * __propagate_active_weight() are visible too. + * Paired with wmb in commit_weights(). If we saw the updated + * hweight_gen, all the weight updates from __propagate_weights() are + * visible too. * * We can race with weight updates during calculation and get it * wrong. 
However, hweight_gen would have changed and a future @@ -976,12 +1187,12 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep */ smp_rmb(); - hwa = hwi = HWEIGHT_WHOLE; + hwa = hwi = WEIGHT_ONE; for (lvl = 0; lvl <= iocg->level - 1; lvl++) { struct ioc_gq *parent = iocg->ancestors[lvl]; struct ioc_gq *child = iocg->ancestors[lvl + 1]; - u32 active_sum = READ_ONCE(parent->child_active_sum); - u32 inuse_sum = READ_ONCE(parent->child_inuse_sum); + u64 active_sum = READ_ONCE(parent->child_active_sum); + u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); u32 active = READ_ONCE(child->active); u32 inuse = READ_ONCE(child->inuse); @@ -989,11 +1200,11 @@ static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep if (!active_sum || !inuse_sum) continue; - active_sum = max(active, active_sum); - hwa = hwa * active / active_sum; /* max 16bits * 10000 */ + active_sum = max_t(u64, active, active_sum); + hwa = div64_u64((u64)hwa * active, active_sum); - inuse_sum = max(inuse, inuse_sum); - hwi = hwi * inuse / inuse_sum; /* max 16bits * 10000 */ + inuse_sum = max_t(u64, inuse, inuse_sum); + hwi = div64_u64((u64)hwi * inuse, inuse_sum); } iocg->hweight_active = max_t(u32, hwa, 1); @@ -1006,7 +1217,33 @@ out: *hw_inusep = iocg->hweight_inuse; } -static void weight_updated(struct ioc_gq *iocg) +/* + * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the + * other weights stay unchanged. 
+ */ +static u32 current_hweight_max(struct ioc_gq *iocg) +{ + u32 hwm = WEIGHT_ONE; + u32 inuse = iocg->active; + u64 child_inuse_sum; + int lvl; + + lockdep_assert_held(&iocg->ioc->lock); + + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + struct ioc_gq *parent = iocg->ancestors[lvl]; + struct ioc_gq *child = iocg->ancestors[lvl + 1]; + + child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; + hwm = div64_u64((u64)hwm * inuse, child_inuse_sum); + inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, + parent->child_active_sum); + } + + return max_t(u32, hwm, 1); +} + +static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct blkcg_gq *blkg = iocg_to_blkg(iocg); @@ -1017,16 +1254,15 @@ static void weight_updated(struct ioc_gq *iocg) weight = iocg->cfg_weight ?: iocc->dfl_weight; if (weight != iocg->weight && iocg->active) - propagate_active_weight(iocg, weight, - DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight)); + propagate_weights(iocg, weight, iocg->inuse, true, now); iocg->weight = weight; } static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) { struct ioc *ioc = iocg->ioc; - u64 last_period, cur_period, max_period_delta; - u64 vtime, vmargin, vmin; + u64 __maybe_unused last_period, cur_period; + u64 vtime, vtarget; int i; /* @@ -1065,22 +1301,15 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) goto fail_unlock; /* - * vtime may wrap when vrate is raised substantially due to - * underestimated IO costs. Look at the period and ignore its - * vtime if the iocg has been idle for too long. Also, cap the - * budget it can start with to the margin. + * Always start with the target budget. On deactivation, we throw away + * anything above it. 
*/ - max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us); + vtarget = now->vnow - ioc->margins.target; vtime = atomic64_read(&iocg->vtime); - vmargin = ioc->margin_us * now->vrate; - vmin = now->vnow - vmargin; - if (last_period + max_period_delta < cur_period || - time_before64(vtime, vmin)) { - atomic64_add(vmin - vtime, &iocg->vtime); - atomic64_add(vmin - vtime, &iocg->done_vtime); - vtime = vmin; - } + atomic64_add(vtarget - vtime, &iocg->vtime); + atomic64_add(vtarget - vtime, &iocg->done_vtime); + vtime = vtarget; /* * Activate, propagate weight and start period timer if not @@ -1089,16 +1318,19 @@ static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now) */ iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; list_add(&iocg->active_list, &ioc->active_iocgs); - propagate_active_weight(iocg, iocg->weight, - iocg->last_inuse ?: iocg->weight); + + propagate_weights(iocg, iocg->weight, + iocg->last_inuse ?: iocg->weight, true, now); TRACE_IOCG_PATH(iocg_activate, iocg, now, last_period, cur_period, vtime); - iocg->last_vtime = vtime; + iocg->activated_at = now->now; if (ioc->running == IOC_IDLE) { ioc->running = IOC_RUNNING; + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; ioc_start_period(ioc, now); } @@ -1111,11 +1343,122 @@ fail_unlock: return false; } +static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct blkcg_gq *blkg = iocg_to_blkg(iocg); + u64 tdelta, delay, new_delay; + s64 vover, vover_pct; + u32 hwa; + + lockdep_assert_held(&iocg->waitq.lock); + + /* + * If the delay is set by another CPU, we may be in the past. No need to + * change anything if so. This avoids decay calculation underflow. 
+ */ + if (time_before64(now->now, iocg->delay_at)) + return false; + + /* calculate the current delay in effect - 1/2 every second */ + tdelta = now->now - iocg->delay_at; + if (iocg->delay) + delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC); + else + delay = 0; + + /* calculate the new delay from the debt amount */ + current_hweight(iocg, &hwa, NULL); + vover = atomic64_read(&iocg->vtime) + + abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; + vover_pct = div64_s64(100 * vover, + ioc->period_us * ioc->vtime_base_rate); + + if (vover_pct <= MIN_DELAY_THR_PCT) + new_delay = 0; + else if (vover_pct >= MAX_DELAY_THR_PCT) + new_delay = MAX_DELAY; + else + new_delay = MIN_DELAY + + div_u64((MAX_DELAY - MIN_DELAY) * + (vover_pct - MIN_DELAY_THR_PCT), + MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); + + /* pick the higher one and apply */ + if (new_delay > delay) { + iocg->delay = new_delay; + iocg->delay_at = now->now; + delay = new_delay; + } + + if (delay >= MIN_DELAY) { + if (!iocg->indelay_since) + iocg->indelay_since = now->now; + blkcg_set_delay(blkg, delay * NSEC_PER_USEC); + return true; + } else { + if (iocg->indelay_since) { + iocg->stat.indelay_us += now->now - iocg->indelay_since; + iocg->indelay_since = 0; + } + iocg->delay = 0; + blkcg_clear_delay(blkg); + return false; + } +} + +static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost, + struct ioc_now *now) +{ + struct iocg_pcpu_stat *gcs; + + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + WARN_ON_ONCE(list_empty(&iocg->active_list)); + + /* + * Once in debt, debt handling owns inuse. @iocg stays at the minimum + * inuse donating all of it share to others until its debt is paid off. 
+ */ + if (!iocg->abs_vdebt && abs_cost) { + iocg->indebt_since = now->now; + propagate_weights(iocg, iocg->active, 0, false, now); + } + + iocg->abs_vdebt += abs_cost; + + gcs = get_cpu_ptr(iocg->pcpu_stat); + local64_add(abs_cost, &gcs->abs_vusage); + put_cpu_ptr(gcs); +} + +static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay, + struct ioc_now *now) +{ + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + + /* make sure that nobody messed with @iocg */ + WARN_ON_ONCE(list_empty(&iocg->active_list)); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); + + /* if debt is paid in full, restore inuse */ + if (!iocg->abs_vdebt) { + iocg->stat.indebt_us += now->now - iocg->indebt_since; + iocg->indebt_since = 0; + + propagate_weights(iocg, iocg->active, iocg->last_inuse, + false, now); + } +} + static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, int flags, void *key) { struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait); - struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key; + struct iocg_wake_ctx *ctx = key; u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); ctx->vbudget -= cost; @@ -1123,146 +1466,121 @@ static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode, if (ctx->vbudget < 0) return -1; - iocg_commit_bio(ctx->iocg, wait->bio, cost); + iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); + wait->committed = true; /* * autoremove_wake_function() removes the wait entry only when it - * actually changed the task state. We want the wait always - * removed. Remove explicitly and use default_wake_function(). + * actually changed the task state. We want the wait always removed. + * Remove explicitly and use default_wake_function(). Note that the + * order of operations is important as finish_wait() tests whether + * @wq_entry is removed without grabbing the lock. 
*/ - list_del_init(&wq_entry->entry); - wait->committed = true; - default_wake_function(wq_entry, mode, flags, key); + list_del_init_careful(&wq_entry->entry); return 0; } -static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now) +/* + * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters + * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in + * addition to iocg->waitq.lock. + */ +static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt, + struct ioc_now *now) { struct ioc *ioc = iocg->ioc; struct iocg_wake_ctx ctx = { .iocg = iocg }; - u64 margin_ns = (u64)(ioc->period_us * - WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC; - u64 vdebt, vshortage, expires, oexpires; + u64 vshortage, expires, oexpires; s64 vbudget; - u32 hw_inuse; + u32 hwa; lockdep_assert_held(&iocg->waitq.lock); - current_hweight(iocg, NULL, &hw_inuse); + current_hweight(iocg, &hwa, NULL); vbudget = now->vnow - atomic64_read(&iocg->vtime); /* pay off debt */ - vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - if (vdebt && vbudget > 0) { - u64 delta = min_t(u64, vbudget, vdebt); - u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse), - iocg->abs_vdebt); + if (pay_debt && iocg->abs_vdebt && vbudget > 0) { + u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa); + u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); + u64 vpay = abs_cost_to_cost(abs_vpay, hwa); + + lockdep_assert_held(&ioc->lock); - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->abs_vdebt -= abs_delta; + atomic64_add(vpay, &iocg->vtime); + atomic64_add(vpay, &iocg->done_vtime); + iocg_pay_debt(iocg, abs_vpay, now); + vbudget -= vpay; } + if (iocg->abs_vdebt || iocg->delay) + iocg_kick_delay(iocg, now); + /* - * Wake up the ones which are due and see how much vtime we'll need - * for the next one. + * Debt can still be outstanding if we haven't paid all yet or the + * caller raced and called without @pay_debt. 
Shouldn't wake up waiters + * under debt. Make sure @vbudget reflects the outstanding amount and is + * not positive. */ - ctx.hw_inuse = hw_inuse; - ctx.vbudget = vbudget - vdebt; + if (iocg->abs_vdebt) { + s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); + vbudget = min_t(s64, 0, vbudget - vdebt); + } + + /* + * Wake up the ones which are due and see how much vtime we'll need for + * the next one. As paying off debt restores hw_inuse, it must be read + * after the above debt payment. + */ + ctx.vbudget = vbudget; + current_hweight(iocg, NULL, &ctx.hw_inuse); + __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); - if (!waitqueue_active(&iocg->waitq)) + + if (!waitqueue_active(&iocg->waitq)) { + if (iocg->wait_since) { + iocg->stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = 0; + } return; + } + + if (!iocg->wait_since) + iocg->wait_since = now->now; + if (WARN_ON_ONCE(ctx.vbudget >= 0)) return; - /* determine next wakeup, add a quarter margin to guarantee chunking */ + /* determine next wakeup, add a timer margin to guarantee chunking */ vshortage = -ctx.vbudget; expires = now->now_ns + - DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC; - expires += margin_ns / 4; + DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * + NSEC_PER_USEC; + expires += ioc->timer_slack_ns; /* if already active and close enough, don't bother */ oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); if (hrtimer_is_queued(&iocg->waitq_timer) && - abs(oexpires - expires) <= margin_ns / 4) + abs(oexpires - expires) <= ioc->timer_slack_ns) return; hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); + ioc->timer_slack_ns, HRTIMER_MODE_ABS); } static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer) { struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer); + bool pay_debt = READ_ONCE(iocg->abs_vdebt); struct ioc_now now; unsigned long flags; ioc_now(iocg->ioc, &now); - 
spin_lock_irqsave(&iocg->waitq.lock, flags); - iocg_kick_waitq(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); - - return HRTIMER_NORESTART; -} - -static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now) -{ - struct ioc *ioc = iocg->ioc; - struct blkcg_gq *blkg = iocg_to_blkg(iocg); - u64 vtime = atomic64_read(&iocg->vtime); - u64 vmargin = ioc->margin_us * now->vrate; - u64 margin_ns = ioc->margin_us * NSEC_PER_USEC; - u64 delta_ns, expires, oexpires; - u32 hw_inuse; - - lockdep_assert_held(&iocg->waitq.lock); - - /* debt-adjust vtime */ - current_hweight(iocg, NULL, &hw_inuse); - vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse); - - /* - * Clear or maintain depending on the overage. Non-zero vdebt is what - * guarantees that @iocg is online and future iocg_kick_delay() will - * clear use_delay. Don't leave it on when there's no vdebt. - */ - if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) { - blkcg_clear_delay(blkg); - return false; - } - if (!atomic_read(&blkg->use_delay) && - time_before_eq64(vtime, now->vnow + vmargin)) - return false; - - /* use delay */ - delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow, - now->vrate) * NSEC_PER_USEC; - blkcg_set_delay(blkg, delta_ns); - expires = now->now_ns + delta_ns; - - /* if already active and close enough, don't bother */ - oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer)); - if (hrtimer_is_queued(&iocg->delay_timer) && - abs(oexpires - expires) <= margin_ns / 4) - return true; - - hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires), - margin_ns / 4, HRTIMER_MODE_ABS); - return true; -} - -static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer) -{ - struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer); - struct ioc_now now; - unsigned long flags; - - spin_lock_irqsave(&iocg->waitq.lock, flags); - ioc_now(iocg->ioc, &now); - iocg_kick_delay(iocg, &now); - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + 
iocg_lock(iocg, pay_debt, &flags); + iocg_kick_waitq(iocg, pay_debt, &now); + iocg_unlock(iocg, pay_debt, &flags); return HRTIMER_NORESTART; } @@ -1279,8 +1597,8 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p u64 this_rq_wait_ns; for (rw = READ; rw <= WRITE; rw++) { - u32 this_met = READ_ONCE(stat->missed[rw].nr_met); - u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed); + u32 this_met = local_read(&stat->missed[rw].nr_met); + u32 this_missed = local_read(&stat->missed[rw].nr_missed); nr_met[rw] += this_met - stat->missed[rw].last_met; nr_missed[rw] += this_missed - stat->missed[rw].last_missed; @@ -1288,7 +1606,7 @@ static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p stat->missed[rw].last_missed = this_missed; } - this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns); + this_rq_wait_ns = local64_read(&stat->rq_wait_ns); rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; stat->last_rq_wait_ns = this_rq_wait_ns; } @@ -1323,78 +1641,634 @@ static bool iocg_is_idle(struct ioc_gq *iocg) return true; } -/* returns usage with margin added if surplus is large enough */ -static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse) +/* + * Call this function on the target leaf @iocg's to build pre-order traversal + * list of all the ancestors in @inner_walk. The inner nodes are linked through + * ->walk_list and the caller is responsible for dissolving the list after use. 
+ */ +static void iocg_build_inner_walk(struct ioc_gq *iocg, + struct list_head *inner_walk) { - /* add margin */ - usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100); - usage += SURPLUS_SCALE_ABS; + int lvl; - /* don't bother if the surplus is too small */ - if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse) - return 0; + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); - return usage; + /* find the first ancestor which hasn't been visited yet */ + for (lvl = iocg->level - 1; lvl >= 0; lvl--) { + if (!list_empty(&iocg->ancestors[lvl]->walk_list)) + break; + } + + /* walk down and visit the inner nodes to get pre-order traversal */ + while (++lvl <= iocg->level - 1) { + struct ioc_gq *inner = iocg->ancestors[lvl]; + + /* record traversal order */ + list_add_tail(&inner->walk_list, inner_walk); + } } -static void ioc_timer_fn(struct timer_list *timer) +/* propagate the deltas to the parent */ +static void iocg_flush_stat_upward(struct ioc_gq *iocg) { - struct ioc *ioc = container_of(timer, struct ioc, timer); + if (iocg->level > 0) { + struct iocg_stat *parent_stat = + &iocg->ancestors[iocg->level - 1]->stat; + + parent_stat->usage_us += + iocg->stat.usage_us - iocg->last_stat.usage_us; + parent_stat->wait_us += + iocg->stat.wait_us - iocg->last_stat.wait_us; + parent_stat->indebt_us += + iocg->stat.indebt_us - iocg->last_stat.indebt_us; + parent_stat->indelay_us += + iocg->stat.indelay_us - iocg->last_stat.indelay_us; + } + + iocg->last_stat = iocg->stat; +} + +/* collect per-cpu counters and propagate the deltas to the parent */ +static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 abs_vusage = 0; + u64 vusage_delta; + int cpu; + + lockdep_assert_held(&iocg->ioc->lock); + + /* collect per-cpu counters */ + for_each_possible_cpu(cpu) { + abs_vusage += local64_read( + per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); + } + vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; + iocg->last_stat_abs_vusage = 
abs_vusage; + + iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); + iocg->stat.usage_us += iocg->usage_delta_us; + + iocg_flush_stat_upward(iocg); +} + +/* get stat counters ready for reading on all active iocgs */ +static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now) +{ + LIST_HEAD(inner_walk); struct ioc_gq *iocg, *tiocg; - struct ioc_now now; - int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0; - u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; - u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; - u32 missed_ppm[2], rq_wait_pct; - u64 period_vtime; - int prev_busy_level, i; - /* how were the latencies during the period? */ - ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + /* flush leaves and build inner node walk list */ + list_for_each_entry(iocg, target_iocgs, active_list) { + iocg_flush_stat_leaf(iocg, now); + iocg_build_inner_walk(iocg, &inner_walk); + } - /* take care of active iocgs */ - spin_lock_irq(&ioc->lock); + /* keep flushing upwards by walking the inner list backwards */ + list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) { + iocg_flush_stat_upward(iocg); + list_del_init(&iocg->walk_list); + } +} - ioc_now(ioc, &now); +/* + * Determine what @iocg's hweight_inuse should be after donating unused + * capacity. @hwm is the upper bound and used to signal no donation. This + * function also throws away @iocg's excess budget. 
+ */ +static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm, + u32 usage, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess, delta, target, new_hwi; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return 1; + + /* see whether minimum margin requirement is met */ + if (waitqueue_active(&iocg->waitq) || + time_after64(vtime, now->vnow - ioc->margins.min)) + return hwm; + + /* throw away excess above target */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + atomic64_add(excess, &iocg->vtime); + atomic64_add(excess, &iocg->done_vtime); + vtime += excess; + ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); + } - period_vtime = now.vnow - ioc->period_at_vtime; - if (WARN_ON_ONCE(!period_vtime)) { - spin_unlock_irq(&ioc->lock); + /* + * Let's say the distance between iocg's and device's vtimes as a + * fraction of period duration is delta. Assuming that the iocg will + * consume the usage determined above, we want to determine new_hwi so + * that delta equals MARGIN_TARGET at the end of the next period. + * + * We need to execute usage worth of IOs while spending the sum of the + * new budget (1 - MARGIN_TARGET) and the leftover from the last period + * (delta): + * + * usage = (1 - MARGIN_TARGET + delta) * new_hwi + * + * Therefore, the new_hwi is: + * + * new_hwi = usage / (1 - MARGIN_TARGET + delta) + */ + delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), + now->vnow - ioc->period_at_vtime); + target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100; + new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); + + return clamp_t(s64, new_hwi, 1, hwm); +} + +/* + * For work-conservation, an iocg which isn't using all of its share should + * donate the leftover to other iocgs. There are two ways to achieve this - 1. + * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight. 
+ * + * #1 is mathematically simpler but has the drawback of requiring synchronous + * global hweight_inuse updates when idle iocgs get activated or inuse weights + * change due to donation snapbacks as it has the possibility of grossly + * overshooting what's allowed by the model and vrate. + * + * #2 is inherently safe with local operations. The donating iocg can easily + * snap back to higher weights when needed without worrying about impacts on + * other nodes as the impacts will be inherently correct. This also makes idle + * iocg activations safe. The only effect activations have is decreasing + * hweight_inuse of others, the right solution to which is for those iocgs to + * snap back to higher weights. + * + * So, we go with #2. The challenge is calculating how each donating iocg's + * inuse should be adjusted to achieve the target donation amounts. This is done + * using Andy's method described in the following pdf. + * + * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo + * + * Given the weights and target after-donation hweight_inuse values, Andy's + * method determines what the proportional distribution should look like at each + * sibling level to maintain the relative relationship between all non-donating + * pairs. To roughly summarize, it divides the tree into donating and + * non-donating parts, calculates the global donation rate which is used to + * determine the target hweight_inuse for each node, and then derives per-level + * proportions. + * + * The following pdf shows that the global distribution calculated this way can be + * achieved by scaling inuse weights of donating leaves and propagating the + * adjustments upwards proportionally. + * + * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE + * + * Combining the above two, we can determine how each leaf iocg's inuse should + * be adjusted to achieve the target donation. 
+ * + * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN + * + * The inline comments use symbols from the last pdf. + * + * b is the sum of the absolute budgets in the subtree. 1 for the root node. + * f is the sum of the absolute budgets of non-donating nodes in the subtree. + * t is the sum of the absolute budgets of donating nodes in the subtree. + * w is the weight of the node. w = w_f + w_t + * w_f is the non-donating portion of w. w_f = w * f / b + * w_t is the donating portion of w. w_t = w * t / b + * s is the sum of all sibling weights. s = Sum(w) for siblings + * s_f and s_t are the non-donating and donating portions of s. + * + * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g. + * w_pt is the donating portion of the parent's weight and w'_pt the same value + * after adjustments. Subscript r denotes the root node's values. + */ +static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now) +{ + LIST_HEAD(over_hwa); + LIST_HEAD(inner_walk); + struct ioc_gq *iocg, *tiocg, *root_iocg; + u32 after_sum, over_sum, over_target, gamma; + + /* + * It's pretty unlikely but possible for the total sum of + * hweight_after_donation's to be higher than WEIGHT_ONE, which will + * confuse the following calculations. If such condition is detected, + * scale down everyone over its full share equally to keep the sum below + * WEIGHT_ONE. + */ + after_sum = 0; + over_sum = 0; + list_for_each_entry(iocg, surpluses, surplus_list) { + u32 hwa; + + current_hweight(iocg, &hwa, NULL); + after_sum += iocg->hweight_after_donation; + + if (iocg->hweight_after_donation > hwa) { + over_sum += iocg->hweight_after_donation; + list_add(&iocg->walk_list, &over_hwa); + } + } + + if (after_sum >= WEIGHT_ONE) { + /* + * The delta should be deducted from the over_sum, calculate + * target over_sum value. 
+ */ + u32 over_delta = after_sum - (WEIGHT_ONE - 1); + WARN_ON_ONCE(over_sum <= over_delta); + over_target = over_sum - over_delta; + } else { + over_target = 0; + } + + list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) { + if (over_target) + iocg->hweight_after_donation = + div_u64((u64)iocg->hweight_after_donation * + over_target, over_sum); + list_del_init(&iocg->walk_list); + } + + /* + * Build pre-order inner node walk list and prepare for donation + * adjustment calculations. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + iocg_build_inner_walk(iocg, &inner_walk); + } + + root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list); + WARN_ON_ONCE(root_iocg->level > 0); + + list_for_each_entry(iocg, &inner_walk, walk_list) { + iocg->child_adjusted_sum = 0; + iocg->hweight_donating = 0; + iocg->hweight_after_donation = 0; + } + + /* + * Propagate the donating budget (b_t) and after donation budget (b'_t) + * up the hierarchy. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + + list_for_each_entry_reverse(iocg, &inner_walk, walk_list) { + if (iocg->level > 0) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + parent->hweight_donating += iocg->hweight_donating; + parent->hweight_after_donation += iocg->hweight_after_donation; + } + } + + /* + * Calculate inner hwa's (b) and make sure the donation values are + * within the accepted ranges as we're doing low res calculations with + * roundups. 
+ */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + if (iocg->level) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + + iocg->hweight_active = DIV64_U64_ROUND_UP( + (u64)parent->hweight_active * iocg->active, + parent->child_active_sum); + + } + + iocg->hweight_donating = min(iocg->hweight_donating, + iocg->hweight_active); + iocg->hweight_after_donation = min(iocg->hweight_after_donation, + iocg->hweight_donating - 1); + if (WARN_ON_ONCE(iocg->hweight_active <= 1 || + iocg->hweight_donating <= 1 || + iocg->hweight_after_donation == 0)) { + pr_warn("iocg: invalid donation weights in "); + pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); + pr_cont(": active=%u donating=%u after=%u\n", + iocg->hweight_active, iocg->hweight_donating, + iocg->hweight_after_donation); + } + } + + /* + * Calculate the global donation rate (gamma) - the rate to adjust + * non-donating budgets by. + * + * No need to use 64bit multiplication here as the first operand is + * guaranteed to be smaller than WEIGHT_ONE (1<<16). + * + * We know that there are beneficiary nodes and the sum of the donating + * hweights can't be whole; however, due to the round-ups during hweight + * calculations, root_iocg->hweight_donating might still end up equal to + * or greater than whole. Limit the range when calculating the divider. + * + * gamma = (1 - t_r') / (1 - t_r) + */ + gamma = DIV_ROUND_UP( + (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, + WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); + + /* + * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner + * nodes. 
+ */ + list_for_each_entry(iocg, &inner_walk, walk_list) { + struct ioc_gq *parent; + u32 inuse, wpt, wptp; + u64 st, sf; + + if (iocg->level == 0) { + /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */ + iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( + iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), + WEIGHT_ONE - iocg->hweight_after_donation); + continue; + } + + parent = iocg->ancestors[iocg->level - 1]; + + /* b' = gamma * b_f + b_t' */ + iocg->hweight_inuse = DIV64_U64_ROUND_UP( + (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), + WEIGHT_ONE) + iocg->hweight_after_donation; + + /* w' = s' * b' / b'_p */ + inuse = DIV64_U64_ROUND_UP( + (u64)parent->child_adjusted_sum * iocg->hweight_inuse, + parent->hweight_inuse); + + /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */ + st = DIV64_U64_ROUND_UP( + iocg->child_active_sum * iocg->hweight_donating, + iocg->hweight_active); + sf = iocg->child_active_sum - st; + wpt = DIV64_U64_ROUND_UP( + (u64)iocg->active * iocg->hweight_donating, + iocg->hweight_active); + wptp = DIV64_U64_ROUND_UP( + (u64)inuse * iocg->hweight_after_donation, + iocg->hweight_inuse); + + iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); + } + + /* + * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and + * we can finally determine leaf adjustments. + */ + list_for_each_entry(iocg, surpluses, surplus_list) { + struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; + u32 inuse; + + /* + * In-debt iocgs participated in the donation calculation with + * the minimum target hweight_inuse. Configuring inuse + * accordingly would work fine but debt handling expects + * @iocg->inuse stay at the minimum and we don't wanna + * interfere. 
+ */ + if (iocg->abs_vdebt) { + WARN_ON_ONCE(iocg->inuse > 1); + continue; + } + + /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */ + inuse = DIV64_U64_ROUND_UP( + parent->child_adjusted_sum * iocg->hweight_after_donation, + parent->hweight_inuse); + + TRACE_IOCG_PATH(inuse_transfer, iocg, now, + iocg->inuse, inuse, + iocg->hweight_inuse, + iocg->hweight_after_donation); + + __propagate_weights(iocg, iocg->active, inuse, true, now); + } + + /* walk list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list) + list_del_init(&iocg->walk_list); +} + +/* + * A low weight iocg can amass a large amount of debt, for example, when + * anonymous memory gets reclaimed aggressively. If the system has a lot of + * memory paired with a slow IO device, the debt can span multiple seconds or + * more. If there are no other subsequent IO issuers, the in-debt iocg may end + * up blocked paying its debt while the IO device is idle. + * + * The following protects against such cases. If the device has been + * sufficiently idle for a while, the debts are halved and delays are + * recalculated. + */ +static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors, + struct ioc_now *now) +{ + struct ioc_gq *iocg; + u64 dur, usage_pct, nr_cycles; + + /* if no debtor, reset the cycle */ + if (!nr_debtors) { + ioc->dfgv_period_at = now->now; + ioc->dfgv_period_rem = 0; + ioc->dfgv_usage_us_sum = 0; return; } /* - * Waiters determine the sleep durations based on the vrate they - * saw at the time of sleep. If vrate has increased, some waiters - * could be sleeping for too long. Wake up tardy waiters which - * should have woken up in the last period and expire idle iocgs. + * Debtors can pass through a lot of writes choking the device and we + * don't want to be forgiving debts while the device is struggling from + * write bursts. If we're missing latency targets, consider the device + * fully utilized. 
+ */ + if (ioc->busy_level > 0) + usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); + + ioc->dfgv_usage_us_sum += usage_us_sum; + if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) + return; + + /* + * At least DFGV_PERIOD has passed since the last period. Calculate the + * average usage and reset the period counters. */ + dur = now->now - ioc->dfgv_period_at; + usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); + + ioc->dfgv_period_at = now->now; + ioc->dfgv_usage_us_sum = 0; + + /* if was too busy, reset everything */ + if (usage_pct > DFGV_USAGE_PCT) { + ioc->dfgv_period_rem = 0; + return; + } + + /* + * Usage is lower than threshold. Let's forgive some debts. Debt + * forgiveness runs off of the usual ioc timer but its period usually + * doesn't match ioc's. Compensate the difference by performing the + * reduction as many times as would fit in the duration since the last + * run and carrying over the left-over duration in @ioc->dfgv_period_rem + * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive + * reductions is doubled. + */ + nr_cycles = dur + ioc->dfgv_period_rem; + ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); + + list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { + u64 __maybe_unused old_debt, __maybe_unused old_delay; + + if (!iocg->abs_vdebt && !iocg->delay) + continue; + + spin_lock(&iocg->waitq.lock); + + old_debt = iocg->abs_vdebt; + old_delay = iocg->delay; + + if (iocg->abs_vdebt) + iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1; + if (iocg->delay) + iocg->delay = iocg->delay >> nr_cycles ?: 1; + + iocg_kick_waitq(iocg, true, now); + + TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct, + old_debt, iocg->abs_vdebt, + old_delay, iocg->delay); + + spin_unlock(&iocg->waitq.lock); + } +} + +/* + * Check the active iocgs' state to avoid oversleeping and deactivate + * idle iocgs. 
+ * + * Since waiters determine the sleep durations based on the vrate + * they saw at the time of sleep, if vrate has increased, some + * waiters could be sleeping for too long. Wake up tardy waiters + * which should have woken up in the last period and expire idle + * iocgs. + */ +static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now) +{ + int nr_debtors = 0; + struct ioc_gq *iocg, *tiocg; + list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { - if (!waitqueue_active(&iocg->waitq) && iocg->abs_vdebt && - !iocg_is_idle(iocg)) + if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && + !iocg->delay && !iocg_is_idle(iocg)) continue; spin_lock(&iocg->waitq.lock); - if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) { + /* flush wait and indebt stat deltas */ + if (iocg->wait_since) { + iocg->stat.wait_us += now->now - iocg->wait_since; + iocg->wait_since = now->now; + } + if (iocg->indebt_since) { + iocg->stat.indebt_us += + now->now - iocg->indebt_since; + iocg->indebt_since = now->now; + } + if (iocg->indelay_since) { + iocg->stat.indelay_us += + now->now - iocg->indelay_since; + iocg->indelay_since = now->now; + } + + if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || + iocg->delay) { /* might be oversleeping vtime / hweight changes, kick */ - iocg_kick_waitq(iocg, &now); - iocg_kick_delay(iocg, &now); + iocg_kick_waitq(iocg, true, now); + if (iocg->abs_vdebt || iocg->delay) + nr_debtors++; } else if (iocg_is_idle(iocg)) { /* no waiter and idle, deactivate */ - iocg->last_inuse = iocg->inuse; - __propagate_active_weight(iocg, 0, 0); + u64 vtime = atomic64_read(&iocg->vtime); + s64 excess; + + /* + * @iocg has been inactive for a full duration and will + * have a high budget. Account anything above target as + * error and throw away. On reactivation, it'll start + * with the target budget. 
+ */ + excess = now->vnow - vtime - ioc->margins.target; + if (excess > 0) { + u32 old_hwi; + + current_hweight(iocg, NULL, &old_hwi); + ioc->vtime_err -= div64_u64(excess * old_hwi, + WEIGHT_ONE); + } + + TRACE_IOCG_PATH(iocg_idle, iocg, now, + atomic64_read(&iocg->active_period), + atomic64_read(&ioc->cur_period), vtime); + __propagate_weights(iocg, 0, 0, false, now); list_del_init(&iocg->active_list); } spin_unlock(&iocg->waitq.lock); } - commit_active_weights(ioc); - /* calc usages and see whether some weights need to be moved around */ + commit_weights(ioc); + return nr_debtors; +} + +static void ioc_timer_fn(struct timer_list *timer) +{ + struct ioc *ioc = container_of(timer, struct ioc, timer); + struct ioc_gq *iocg, *tiocg; + struct ioc_now now; + LIST_HEAD(surpluses); + int nr_debtors, nr_shortages = 0, nr_lagging = 0; + u64 usage_us_sum = 0; + u32 ppm_rthr; + u32 ppm_wthr; + u32 missed_ppm[2], rq_wait_pct; + u64 period_vtime; + int prev_busy_level; + + /* how were the latencies during the period? */ + ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct); + + /* take care of active iocgs */ + spin_lock_irq(&ioc->lock); + + ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; + ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; + ioc_now(ioc, &now); + + period_vtime = now.vnow - ioc->period_at_vtime; + if (WARN_ON_ONCE(!period_vtime)) { + spin_unlock_irq(&ioc->lock); + return; + } + + nr_debtors = ioc_check_iocgs(ioc, &now); + + /* + * Wait and indebt stat are flushed above and the donation calculation + * below needs updated usage stat. Let's bring stat up-to-date. 
+ */ + iocg_flush_stat(&ioc->active_iocgs, &now); + + /* calc usage and see whether some weights need to be moved around */ list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u64 vdone, vtime, vusage, vmargin, vmin; - u32 hw_active, hw_inuse, usage; + u64 vdone, vtime, usage_us; + u32 hw_active, hw_inuse; /* * Collect unused and wind vtime closer to vnow to prevent @@ -1418,116 +2292,92 @@ static void ioc_timer_fn(struct timer_list *timer) time_before64(vdone, now.vnow - period_vtime)) nr_lagging++; - if (waitqueue_active(&iocg->waitq)) - vusage = now.vnow - iocg->last_vtime; - else if (time_before64(iocg->last_vtime, vtime)) - vusage = vtime - iocg->last_vtime; - else - vusage = 0; - - iocg->last_vtime += vusage; /* - * Factor in in-flight vtime into vusage to avoid - * high-latency completions appearing as idle. This should - * be done after the above ->last_time adjustment. + * Determine absolute usage factoring in in-flight IOs to avoid + * high-latency completions appearing as idle. 
*/ - vusage = max(vusage, vtime - vdone); - - /* calculate hweight based usage ratio and record */ - if (vusage) { - usage = DIV64_U64_ROUND_UP(vusage * hw_inuse, - period_vtime); - iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS; - iocg->usages[iocg->usage_idx] = usage; - } else { - usage = 0; - } + usage_us = iocg->usage_delta_us; + usage_us_sum += usage_us; /* see whether there's surplus vtime */ - vmargin = ioc->margin_us * now.vrate; - vmin = now.vnow - vmargin; - - iocg->has_surplus = false; - - if (!waitqueue_active(&iocg->waitq) && - time_before64(vtime, vmin)) { - u64 delta = vmin - vtime; - - /* throw away surplus vtime */ - atomic64_add(delta, &iocg->vtime); - atomic64_add(delta, &iocg->done_vtime); - iocg->last_vtime += delta; - /* if usage is sufficiently low, maybe it can donate */ - if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) { - iocg->has_surplus = true; - nr_surpluses++; - } - } else if (hw_inuse < hw_active) { - u32 new_hwi, new_inuse; - - /* was donating but might need to take back some */ - if (waitqueue_active(&iocg->waitq)) { - new_hwi = hw_active; - } else { - new_hwi = max(hw_inuse, - usage * SURPLUS_SCALE_PCT / 100 + - SURPLUS_SCALE_ABS); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + if (hw_inuse < hw_active || + (!waitqueue_active(&iocg->waitq) && + time_before64(vtime, now.vnow - ioc->margins.low))) { + u32 hwa, old_hwi, hwm, new_hwi, usage; + u64 usage_dur; + + if (vdone != vtime) { + u64 inflight_us = DIV64_U64_ROUND_UP( + cost_to_abs_cost(vtime - vdone, hw_inuse), + ioc->vtime_base_rate); + + usage_us = max(usage_us, inflight_us); } - new_inuse = div64_u64((u64)iocg->inuse * new_hwi, - hw_inuse); - new_inuse = clamp_t(u32, new_inuse, 1, iocg->active); + /* convert to hweight based usage ratio */ + if (time_after64(iocg->activated_at, ioc->period_at)) + usage_dur = max_t(u64, now.now - iocg->activated_at, 1); + else + usage_dur = max_t(u64, now.now - ioc->period_at, 1); + + usage = clamp_t(u32, + 
DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE, + usage_dur), + 1, WEIGHT_ONE); - if (new_inuse > iocg->inuse) { - TRACE_IOCG_PATH(inuse_takeback, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, - new_inuse); + /* + * Already donating or accumulated enough to start. + * Determine the donation amount. + */ + current_hweight(iocg, &hwa, &old_hwi); + hwm = current_hweight_max(iocg); + new_hwi = hweight_after_donation(iocg, old_hwi, hwm, + usage, &now); + /* + * Donation calculation assumes hweight_after_donation + * to be positive, a condition that a donor w/ hwa < 2 + * can't meet. Don't bother with donation if hwa is + * below 2. It's not gonna make a meaningful difference + * anyway. + */ + if (new_hwi < hwm && hwa >= 2) { + iocg->hweight_donating = hwa; + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); + } else if (!iocg->abs_vdebt) { + /* + * @iocg doesn't have enough to donate. Reset + * its inuse to active. + * + * Don't reset debtors as their inuse's are + * owned by debt handling. This shouldn't affect + * donation calculation in any meaningful way + * as @iocg doesn't have a meaningful amount of + * share anyway. 
+ */ + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); + + __propagate_weights(iocg, iocg->active, + iocg->active, true, &now); + nr_shortages++; } } else { - /* genuninely out of vtime */ + /* genuinely short on vtime */ nr_shortages++; } } - if (!nr_shortages || !nr_surpluses) - goto skip_surplus_transfers; + if (!list_empty(&surpluses) && nr_shortages) + transfer_surpluses(&surpluses, &now); - /* there are both shortages and surpluses, transfer surpluses */ - list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { - u32 usage, hw_active, hw_inuse, new_hwi, new_inuse; - int nr_valid = 0; + commit_weights(ioc); - if (!iocg->has_surplus) - continue; - - /* base the decision on max historical usage */ - for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) { - if (iocg->usages[i]) { - usage = max(usage, iocg->usages[i]); - nr_valid++; - } - } - if (nr_valid < MIN_VALID_USAGES) - continue; - - current_hweight(iocg, &hw_active, &hw_inuse); - new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse); - if (!new_hwi) - continue; - - new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi, - hw_inuse); - if (new_inuse < iocg->inuse) { - TRACE_IOCG_PATH(inuse_giveaway, iocg, &now, - iocg->inuse, new_inuse, - hw_inuse, new_hwi); - __propagate_active_weight(iocg, iocg->weight, new_inuse); - } - } -skip_surplus_transfers: - commit_active_weights(ioc); + /* surplus list should be dissolved after use */ + list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list) + list_del_init(&iocg->surplus_list); /* * If q is getting clogged or we're missing too much, we're issuing @@ -1555,11 +2405,9 @@ skip_surplus_transfers: /* * If there are IOs spanning multiple periods, wait - * them out before pushing the device harder. If - * there are surpluses, let redistribution work it - * out first. + * them out before pushing the device harder. 
*/ - if (!nr_lagging && !nr_surpluses) + if (!nr_lagging) ioc->busy_level--; } else { /* @@ -1577,56 +2425,13 @@ skip_surplus_transfers: ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); - if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) { - u64 vrate = atomic64_read(&ioc->vtime_rate); - u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; - - /* rq_wait signal is always reliable, ignore user vrate_min */ - if (rq_wait_pct > RQ_WAIT_BUSY_PCT) - vrate_min = VRATE_MIN; - - /* - * If vrate is out of bounds, apply clamp gradually as the - * bounds can change abruptly. Otherwise, apply busy_level - * based adjustment. - */ - if (vrate < vrate_min) { - vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), - 100); - vrate = min(vrate, vrate_min); - } else if (vrate > vrate_max) { - vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), - 100); - vrate = max(vrate, vrate_max); - } else { - int idx = min_t(int, abs(ioc->busy_level), - ARRAY_SIZE(vrate_adj_pct) - 1); - u32 adj_pct = vrate_adj_pct[idx]; - - if (ioc->busy_level > 0) - adj_pct = 100 - adj_pct; - else - adj_pct = 100 + adj_pct; - - vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100), - vrate_min, vrate_max); - } - - trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct, - nr_lagging, nr_shortages, - nr_surpluses); - - atomic64_set(&ioc->vtime_rate, vrate); - ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP( - ioc->period_us * vrate * INUSE_MARGIN_PCT, 100); - } else if (ioc->busy_level != prev_busy_level || nr_lagging) { - trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate), - missed_ppm, rq_wait_pct, nr_lagging, - nr_shortages, nr_surpluses); - } + ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages, + prev_busy_level, missed_ppm); ioc_refresh_params(ioc, false); + ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now); + /* * This period is done. Move onto the next one. If nothing's * going on with the device, stop the timer. 
@@ -1638,13 +2443,77 @@ skip_surplus_transfers: ioc_start_period(ioc, &now); } else { ioc->busy_level = 0; + ioc->vtime_err = 0; ioc->running = IOC_IDLE; } + + ioc_refresh_vrate(ioc, &now); } spin_unlock_irq(&ioc->lock); } +static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime, + u64 abs_cost, struct ioc_now *now) +{ + struct ioc *ioc = iocg->ioc; + struct ioc_margins *margins = &ioc->margins; + u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; + u32 hwi, adj_step; + s64 margin; + u64 cost, new_inuse; + unsigned long flags; + + current_hweight(iocg, NULL, &hwi); + old_hwi = hwi; + cost = abs_cost_to_cost(abs_cost, hwi); + margin = now->vnow - vtime - cost; + + /* debt handling owns inuse for debtors */ + if (iocg->abs_vdebt) + return cost; + + /* + * We only increase inuse during period and do so if the margin has + * deteriorated since the previous adjustment. + */ + if (margin >= iocg->saved_margin || margin >= margins->low || + iocg->inuse == iocg->active) + return cost; + + spin_lock_irqsave(&ioc->lock, flags); + + /* we own inuse only when @iocg is in the normal active state */ + if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { + spin_unlock_irqrestore(&ioc->lock, flags); + return cost; + } + + /* + * Bump up inuse till @abs_cost fits in the existing budget. + * adj_step must be determined after acquiring ioc->lock - we might + * have raced and lost to another thread for activation and could + * be reading 0 iocg->active before ioc->lock which will lead to + * infinite loop. 
+ */ + new_inuse = iocg->inuse; + adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); + do { + new_inuse = new_inuse + adj_step; + propagate_weights(iocg, iocg->active, new_inuse, true, now); + current_hweight(iocg, NULL, &hwi); + cost = abs_cost_to_cost(abs_cost, hwi); + } while (time_after64(vtime + cost, now->vnow) && + iocg->inuse != iocg->active); + + spin_unlock_irqrestore(&ioc->lock, flags); + + TRACE_IOCG_PATH(inuse_adjust, iocg, now, + old_inuse, iocg->inuse, old_hwi, hwi); + + return cost; +} + static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, bool is_merge, u64 *costp) { @@ -1654,6 +2523,10 @@ static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg, u64 seek_pages = 0; u64 cost = 0; + /* Can't calculate cost for empty bio */ + if (!bio->bi_iter.bi_size) + goto out; + switch (bio_op(bio)) { case REQ_OP_READ: coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; @@ -1726,15 +2599,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) struct ioc_gq *iocg = blkg_to_iocg(blkg); struct ioc_now now; struct iocg_wait wait; - u32 hw_active, hw_inuse; u64 abs_cost, cost, vtime; + bool use_debt, ioc_locked; + unsigned long flags; - /* bypass IOs if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) - return; - - /* always activate so that even 0 cost IOs get protected to some level */ - if (!iocg_activate(iocg, &now)) + /* bypass IOs if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; /* calculate the absolute vtime cost */ @@ -1742,22 +2612,12 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) if (!abs_cost) return; - iocg->cursor = bio_end_sector(bio); + if (!iocg_activate(iocg, &now)) + return; + iocg->cursor = bio_end_sector(bio); vtime = atomic64_read(&iocg->vtime); - current_hweight(iocg, &hw_active, &hw_inuse); - - if (hw_inuse < hw_active && - time_after_eq64(vtime + ioc->inuse_margin_vtime, 
now.vnow)) { - TRACE_IOCG_PATH(inuse_reset, iocg, &now, - iocg->inuse, iocg->weight, hw_inuse, hw_active); - spin_lock_irq(&ioc->lock); - propagate_active_weight(iocg, iocg->weight, iocg->weight); - spin_unlock_irq(&ioc->lock); - current_hweight(iocg, &hw_active, &hw_inuse); - } - - cost = abs_cost_to_cost(abs_cost, hw_inuse); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* * If no one's waiting and within budget, issue right away. The @@ -1766,21 +2626,32 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) */ if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && time_before_eq64(vtime + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } /* - * We activated above but w/o any synchronization. Deactivation is - * synchronized with waitq.lock and we won't get deactivated as long - * as we're waiting or has debt, so we're good if we're activated - * here. In the unlikely case that we aren't, just issue the IO. + * We're over budget. This can be handled in two ways. IOs which may + * cause priority inversions are punted to @ioc->aux_iocg and charged as + * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling + * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine + * whether debt handling is needed and acquire locks accordingly. */ - spin_lock_irq(&iocg->waitq.lock); + use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current); + ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); +retry_lock: + iocg_lock(iocg, ioc_locked, &flags); + /* + * @iocg must stay activated for debt and waitq handling. Deactivation + * is synchronized against both ioc->lock and waitq.lock and we won't + * get deactivated as long as we're waiting or has debt, so we're good + * if we're activated here. In the unlikely cases that we aren't, just + * issue the IO. 
+ */ if (unlikely(list_empty(&iocg->active_list))) { - spin_unlock_irq(&iocg->waitq.lock); - iocg_commit_bio(iocg, bio, cost); + iocg_unlock(iocg, ioc_locked, &flags); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1801,15 +2672,26 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) * clear them and leave @iocg inactive w/ dangling use_delay heavily * penalizing the cgroup and its descendants. */ - if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) { - iocg->abs_vdebt += abs_cost; + if (use_debt) { + iocg_incur_debt(iocg, abs_cost, &now); if (iocg_kick_delay(iocg, &now)) - blkcg_schedule_throttle(rqos->q, + blkcg_schedule_throttle(rqos->disk, (bio->bi_opf & REQ_SWAP) == REQ_SWAP); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); return; } + /* guarantee that iocgs w/ waiters have maximum inuse */ + if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { + if (!ioc_locked) { + iocg_unlock(iocg, false, &flags); + ioc_locked = true; + goto retry_lock; + } + propagate_weights(iocg, iocg->active, iocg->active, true, + &now); + } + /* * Append self to the waitq and schedule the wakeup timer if we're * the first waiter. 
The timer duration is calculated based on the @@ -1830,9 +2712,9 @@ static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio) wait.committed = false; /* will be set true by waker */ __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); - iocg_kick_waitq(iocg, &now); + iocg_kick_waitq(iocg, ioc_locked, &now); - spin_unlock_irq(&iocg->waitq.lock); + iocg_unlock(iocg, ioc_locked, &flags); while (true) { set_current_state(TASK_UNINTERRUPTIBLE); @@ -1849,15 +2731,14 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, struct bio *bio) { struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); - struct ioc *ioc = iocg->ioc; + struct ioc *ioc = rqos_to_ioc(rqos); sector_t bio_end = bio_end_sector(bio); struct ioc_now now; - u32 hw_inuse; - u64 abs_cost, cost; + u64 vtime, abs_cost, cost; unsigned long flags; - /* bypass if disabled or for root cgroup */ - if (!ioc->enabled || !iocg->level) + /* bypass if disabled, still initializing, or for root cgroup */ + if (!ioc->enabled || !iocg || !iocg->level) return; abs_cost = calc_vtime_cost(bio, iocg, true); @@ -1865,8 +2746,9 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, return; ioc_now(ioc, &now); - current_hweight(iocg, NULL, &hw_inuse); - cost = abs_cost_to_cost(abs_cost, hw_inuse); + + vtime = atomic64_read(&iocg->vtime); + cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); /* update cursor if backmerging into the request at the cursor */ if (blk_rq_pos(rq) < bio_end && @@ -1879,7 +2761,7 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, */ if (rq->bio && rq->bio->bi_iocost_cost && time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); return; } @@ -1888,14 +2770,20 @@ static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq, * be for the vast majority of cases. See debt handling in * ioc_rqos_throttle() for details. 
*/ - spin_lock_irqsave(&iocg->waitq.lock, flags); + spin_lock_irqsave(&ioc->lock, flags); + spin_lock(&iocg->waitq.lock); + if (likely(!list_empty(&iocg->active_list))) { - iocg->abs_vdebt += abs_cost; - iocg_kick_delay(iocg, &now); + iocg_incur_debt(iocg, abs_cost, &now); + if (iocg_kick_delay(iocg, &now)) + blkcg_schedule_throttle(rqos->disk, + (bio->bi_opf & REQ_SWAP) == REQ_SWAP); } else { - iocg_commit_bio(iocg, bio, cost); + iocg_commit_bio(iocg, bio, abs_cost, cost); } - spin_unlock_irqrestore(&iocg->waitq.lock, flags); + + spin_unlock(&iocg->waitq.lock); + spin_unlock_irqrestore(&ioc->lock, flags); } static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) @@ -1909,13 +2797,14 @@ static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio) static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) { struct ioc *ioc = rqos_to_ioc(rqos); + struct ioc_pcpu_stat *ccs; u64 on_q_ns, rq_wait_ns, size_nsec; int pidx, rw; if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) return; - switch (req_op(rq) & REQ_OP_MASK) { + switch (req_op(rq)) { case REQ_OP_READ: pidx = QOS_RLAT; rw = READ; @@ -1932,13 +2821,17 @@ static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq) rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC); + ccs = get_cpu_ptr(ioc->pcpu_stat); + if (on_q_ns <= size_nsec || on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met); + local_inc(&ccs->missed[rw].nr_met); else - this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed); + local_inc(&ccs->missed[rw].nr_missed); - this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns); + local64_add(rq_wait_ns, &ccs->rq_wait_ns); + + put_cpu_ptr(ccs); } static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos) @@ -1954,18 +2847,18 @@ static void ioc_rqos_exit(struct rq_qos *rqos) { struct ioc *ioc = rqos_to_ioc(rqos); - 
blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); spin_lock_irq(&ioc->lock); ioc->running = IOC_STOP; spin_unlock_irq(&ioc->lock); - del_timer_sync(&ioc->timer); + timer_shutdown_sync(&ioc->timer); free_percpu(ioc->pcpu_stat); kfree(ioc); } -static struct rq_qos_ops ioc_rqos_ops = { +static const struct rq_qos_ops ioc_rqos_ops = { .throttle = ioc_rqos_throttle, .merge = ioc_rqos_merge, .done_bio = ioc_rqos_done_bio, @@ -1974,11 +2867,10 @@ static struct rq_qos_ops ioc_rqos_ops = { .exit = ioc_rqos_exit, }; -static int blk_iocost_init(struct request_queue *q) +static int blk_iocost_init(struct gendisk *disk) { struct ioc *ioc; - struct rq_qos *rqos; - int ret; + int i, cpu, ret; ioc = kzalloc(sizeof(*ioc), GFP_KERNEL); if (!ioc) @@ -1990,36 +2882,54 @@ static int blk_iocost_init(struct request_queue *q) return -ENOMEM; } - rqos = &ioc->rqos; - rqos->id = RQ_QOS_COST; - rqos->ops = &ioc_rqos_ops; - rqos->q = q; + for_each_possible_cpu(cpu) { + struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); + + for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { + local_set(&ccs->missed[i].nr_met, 0); + local_set(&ccs->missed[i].nr_missed, 0); + } + local64_set(&ccs->rq_wait_ns, 0); + } spin_lock_init(&ioc->lock); timer_setup(&ioc->timer, ioc_timer_fn, 0); INIT_LIST_HEAD(&ioc->active_iocgs); ioc->running = IOC_IDLE; + ioc->vtime_base_rate = VTIME_PER_USEC; atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); - seqcount_init(&ioc->period_seqcount); + seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); ioc->period_at = ktime_to_us(ktime_get()); atomic64_set(&ioc->cur_period, 0); atomic_set(&ioc->hweight_gen, 0); spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; - ioc_refresh_params(ioc, true); + ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); - rq_qos_add(q, rqos); - ret = blkcg_activate_policy(q, &blkcg_policy_iocost); - if (ret) { - rq_qos_del(q, rqos); - 
free_percpu(ioc->pcpu_stat); - kfree(ioc); - return ret; - } + /* + * rqos must be added before activation to allow ioc_pd_init() to + * lookup the ioc from q. This means that the rqos methods may get + * called before policy activation completion, can't assume that the + * target bio has an iocg associated and need to test for NULL iocg. + */ + ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); + if (ret) + goto err_free_ioc; + + ret = blkcg_activate_policy(disk, &blkcg_policy_iocost); + if (ret) + goto err_del_qos; return 0; + +err_del_qos: + rq_qos_del(&ioc->rqos); +err_free_ioc: + free_percpu(ioc->pcpu_stat); + kfree(ioc); + return ret; } static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) @@ -2030,7 +2940,7 @@ static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp) if (!iocc) return NULL; - iocc->dfl_weight = CGROUP_WEIGHT_DFL; + iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; return &iocc->cpd; } @@ -2039,17 +2949,23 @@ static void ioc_cpd_free(struct blkcg_policy_data *cpd) kfree(container_of(cpd, struct ioc_cgrp, cpd)); } -static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { int levels = blkcg->css.cgroup->level + 1; struct ioc_gq *iocg; - iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]), - gfp, q->node); + iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, + disk->node_id); if (!iocg) return NULL; + iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); + if (!iocg->pcpu_stat) { + kfree(iocg); + return NULL; + } + return &iocg->pd; } @@ -2069,14 +2985,14 @@ static void ioc_pd_init(struct blkg_policy_data *pd) atomic64_set(&iocg->done_vtime, now.vnow); atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); INIT_LIST_HEAD(&iocg->active_list); - iocg->hweight_active = HWEIGHT_WHOLE; - iocg->hweight_inuse = HWEIGHT_WHOLE; + 
INIT_LIST_HEAD(&iocg->walk_list); + INIT_LIST_HEAD(&iocg->surplus_list); + iocg->hweight_active = WEIGHT_ONE; + iocg->hweight_inuse = WEIGHT_ONE; init_waitqueue_head(&iocg->waitq); hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); iocg->waitq_timer.function = iocg_waitq_timer_fn; - hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); - iocg->delay_timer.function = iocg_delay_timer_fn; iocg->level = blkg->blkcg->css.cgroup->level; @@ -2086,7 +3002,7 @@ static void ioc_pd_init(struct blkg_policy_data *pd) } spin_lock_irqsave(&ioc->lock, flags); - weight_updated(iocg); + weight_updated(iocg, &now); spin_unlock_irqrestore(&ioc->lock, flags); } @@ -2094,21 +3010,54 @@ static void ioc_pd_free(struct blkg_policy_data *pd) { struct ioc_gq *iocg = pd_to_iocg(pd); struct ioc *ioc = iocg->ioc; + unsigned long flags; if (ioc) { - spin_lock(&ioc->lock); + spin_lock_irqsave(&ioc->lock, flags); + if (!list_empty(&iocg->active_list)) { - propagate_active_weight(iocg, 0, 0); + struct ioc_now now; + + ioc_now(ioc, &now); + propagate_weights(iocg, 0, 0, false, &now); list_del_init(&iocg->active_list); } - spin_unlock(&ioc->lock); + + WARN_ON_ONCE(!list_empty(&iocg->walk_list)); + WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); + + spin_unlock_irqrestore(&ioc->lock, flags); hrtimer_cancel(&iocg->waitq_timer); - hrtimer_cancel(&iocg->delay_timer); } + free_percpu(iocg->pcpu_stat); kfree(iocg); } +static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) +{ + struct ioc_gq *iocg = pd_to_iocg(pd); + struct ioc *ioc = iocg->ioc; + + if (!ioc->enabled) + return; + + if (iocg->level == 0) { + unsigned vp10k = DIV64_U64_ROUND_CLOSEST( + ioc->vtime_base_rate * 10000, + VTIME_PER_USEC); + seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); + } + + seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); + + if (blkcg_debug_stats) + seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", + iocg->last_stat.wait_us, + 
iocg->last_stat.indebt_us, + iocg->last_stat.indelay_us); +} + static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, int off) { @@ -2116,7 +3065,7 @@ static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd, struct ioc_gq *iocg = pd_to_iocg(pd); if (dname && iocg->cfg_weight) - seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight); + seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); return 0; } @@ -2126,7 +3075,7 @@ static int ioc_weight_show(struct seq_file *sf, void *v) struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); - seq_printf(sf, "default %u\n", iocc->dfl_weight); + seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill, &blkcg_policy_iocost, seq_cft(sf)->private, false); return 0; @@ -2138,6 +3087,7 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, struct blkcg *blkcg = css_to_blkcg(of_css(of)); struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg); struct blkg_conf_ctx ctx; + struct ioc_now now; struct ioc_gq *iocg; u32 v; int ret; @@ -2151,25 +3101,28 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX) return -EINVAL; - spin_lock(&blkcg->lock); - iocc->dfl_weight = v; + spin_lock_irq(&blkcg->lock); + iocc->dfl_weight = v * WEIGHT_ONE; hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { struct ioc_gq *iocg = blkg_to_iocg(blkg); if (iocg) { - spin_lock_irq(&iocg->ioc->lock); - weight_updated(iocg); - spin_unlock_irq(&iocg->ioc->lock); + spin_lock(&iocg->ioc->lock); + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); + spin_unlock(&iocg->ioc->lock); } } - spin_unlock(&blkcg->lock); + spin_unlock_irq(&blkcg->lock); return nbytes; } - ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx); if (ret) - return 
ret; + goto err; iocg = blkg_to_iocg(ctx.blkg); @@ -2183,16 +3136,19 @@ static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf, } spin_lock(&iocg->ioc->lock); - iocg->cfg_weight = v; - weight_updated(iocg); + iocg->cfg_weight = v * WEIGHT_ONE; + ioc_now(iocg->ioc, &now); + weight_updated(iocg, &now); spin_unlock(&iocg->ioc->lock); - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return nbytes; einval: - blkg_conf_finish(&ctx); - return -EINVAL; + ret = -EINVAL; +err: + blkg_conf_exit(&ctx); + return ret; } static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, @@ -2204,6 +3160,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, if (!dname) return 0; + spin_lock_irq(&ioc->lock); seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n", dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", ioc->params.qos[QOS_RPPM] / 10000, @@ -2216,6 +3173,7 @@ static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd, ioc->params.qos[QOS_MIN] % 10000 / 100, ioc->params.qos[QOS_MAX] / 10000, ioc->params.qos[QOS_MAX] % 10000 / 100); + spin_unlock_irq(&ioc->lock); return 0; } @@ -2247,32 +3205,44 @@ static const match_table_t qos_tokens = { static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { + struct blkg_conf_ctx ctx; struct gendisk *disk; struct ioc *ioc; u32 qos[NR_QOS_PARAMS]; bool enable, user; - char *p; + char *body, *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + blkg_conf_init(&ctx, input); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + disk = ctx.bdev->bd_disk; + if (!queue_is_mq(disk->queue)) { + ret = -EOPNOTSUPP; + goto err; + } ioc = q_to_ioc(disk->queue); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(disk); if (ret) goto err; ioc = q_to_ioc(disk->queue); } + 
blk_mq_freeze_queue(disk->queue); + blk_mq_quiesce_queue(disk->queue); + spin_lock_irq(&ioc->lock); memcpy(qos, ioc->params.qos, sizeof(qos)); enable = ioc->enabled; user = ioc->user_qos_params; - spin_unlock_irq(&ioc->lock); - while ((p = strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -2283,7 +3253,8 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, switch (match_token(p, qos_ctrl_tokens, args)) { case QOS_ENABLE: - match_u64(&args[0], &v); + if (match_u64(&args[0], &v)) + goto einval; enable = v; continue; case QOS_CTRL: @@ -2337,14 +3308,13 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, if (qos[QOS_MIN] > qos[QOS_MAX]) goto einval; - spin_lock_irq(&ioc->lock); - - if (enable) { - blk_stat_enable_accounting(ioc->rqos.q); - blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + if (enable && !ioc->enabled) { + blk_stat_enable_accounting(disk->queue); + blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = true; - } else { - blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q); + } else if (!enable && ioc->enabled) { + blk_stat_disable_accounting(disk->queue); + blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); ioc->enabled = false; } @@ -2358,12 +3328,25 @@ static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + if (enable) + wbt_disable_default(disk); + else + wbt_enable_default(disk); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + + blkg_conf_exit(&ctx); return nbytes; einval: + spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(disk->queue); + blk_mq_unfreeze_queue(disk->queue); + ret = -EINVAL; err: - put_disk_and_module(disk); + blkg_conf_exit(&ctx); return ret; } @@ -2377,12 +3360,14 @@ static u64 ioc_cost_model_prfill(struct seq_file *sf, if 
(!dname) return 0; + spin_lock_irq(&ioc->lock); seq_printf(sf, "%s ctrl=%s model=linear " "rbps=%llu rseqiops=%llu rrandiops=%llu " "wbps=%llu wseqiops=%llu wrandiops=%llu\n", dname, ioc->user_cost_model ? "user" : "auto", u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS], u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]); + spin_unlock_irq(&ioc->lock); return 0; } @@ -2414,31 +3399,43 @@ static const match_table_t i_lcoef_tokens = { static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, size_t nbytes, loff_t off) { - struct gendisk *disk; + struct blkg_conf_ctx ctx; + struct request_queue *q; struct ioc *ioc; u64 u[NR_I_LCOEFS]; bool user; - char *p; + char *body, *p; int ret; - disk = blkcg_conf_get_disk(&input); - if (IS_ERR(disk)) - return PTR_ERR(disk); + blkg_conf_init(&ctx, input); - ioc = q_to_ioc(disk->queue); + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto err; + + body = ctx.body; + q = bdev_get_queue(ctx.bdev); + if (!queue_is_mq(q)) { + ret = -EOPNOTSUPP; + goto err; + } + + ioc = q_to_ioc(q); if (!ioc) { - ret = blk_iocost_init(disk->queue); + ret = blk_iocost_init(ctx.bdev->bd_disk); if (ret) goto err; - ioc = q_to_ioc(disk->queue); + ioc = q_to_ioc(q); } + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + spin_lock_irq(&ioc->lock); memcpy(u, ioc->params.i_lcoefs, sizeof(u)); user = ioc->user_cost_model; - spin_unlock_irq(&ioc->lock); - while ((p = strsep(&input, " \t\n"))) { + while ((p = strsep(&body, " \t\n"))) { substring_t args[MAX_OPT_ARGS]; char buf[32]; int tok; @@ -2473,7 +3470,6 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, user = true; } - spin_lock_irq(&ioc->lock); if (user) { memcpy(ioc->params.i_lcoefs, u, sizeof(u)); ioc->user_cost_model = true; @@ -2483,13 +3479,21 @@ static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input, ioc_refresh_params(ioc, true); spin_unlock_irq(&ioc->lock); - put_disk_and_module(disk); + 
blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + + blkg_conf_exit(&ctx); return nbytes; einval: + spin_unlock_irq(&ioc->lock); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); + ret = -EINVAL; err: - put_disk_and_module(disk); + blkg_conf_exit(&ctx); return ret; } @@ -2522,6 +3526,7 @@ static struct blkcg_policy blkcg_policy_iocost = { .pd_alloc_fn = ioc_pd_alloc, .pd_init_fn = ioc_pd_init, .pd_free_fn = ioc_pd_free, + .pd_stat_fn = ioc_pd_stat, }; static int __init ioc_init(void) @@ -2531,7 +3536,7 @@ static int __init ioc_init(void) static void __exit ioc_exit(void) { - return blkcg_policy_unregister(&blkcg_policy_iocost); + blkcg_policy_unregister(&blkcg_policy_iocost); } module_init(ioc_init); diff --git a/block/blk-iolatency.c b/block/blk-iolatency.c index c128d50cb410..c1a6aba1d59e 100644 --- a/block/blk-iolatency.c +++ b/block/blk-iolatency.c @@ -76,6 +76,7 @@ #include <linux/blk-mq.h> #include "blk-rq-qos.h" #include "blk-stat.h" +#include "blk-cgroup.h" #include "blk.h" #define DEFAULT_SCALE_COOKIE 1000000U @@ -86,7 +87,17 @@ struct iolatency_grp; struct blk_iolatency { struct rq_qos rqos; struct timer_list timer; - atomic_t enabled; + + /* + * ->enabled is the master enable switch gating the throttling logic and + * inflight tracking. The number of cgroups which have iolat enabled is + * tracked in ->enable_cnt, and ->enable is flipped on/off accordingly + * from ->enable_work with the request_queue frozen. For details, See + * blkiolatency_enable_work_fn(). 
+ */ + bool enabled; + atomic_t enable_cnt; + struct work_struct enable_work; }; static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) @@ -94,11 +105,6 @@ static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos) return container_of(rqos, struct blk_iolatency, rqos); } -static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat) -{ - return atomic_read(&blkiolat->enabled) > 0; -} - struct child_latency_info { spinlock_t lock; @@ -135,7 +141,7 @@ struct iolatency_grp { struct latency_stat __percpu *stats; struct latency_stat cur_stat; struct blk_iolatency *blkiolat; - struct rq_depth rq_depth; + unsigned int max_depth; struct rq_wait rq_wait; atomic64_t window_start; atomic_t scale_cookie; @@ -274,7 +280,7 @@ static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data) static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data) { struct iolatency_grp *iolat = private_data; - return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth); + return rq_wait_inc_below(rqw, iolat->max_depth); } static void __blkcg_iolatency_throttle(struct rq_qos *rqos, @@ -286,7 +292,7 @@ static void __blkcg_iolatency_throttle(struct rq_qos *rqos, unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay); if (use_delay) - blkcg_schedule_throttle(rqos->q, use_memdelay); + blkcg_schedule_throttle(rqos->disk, use_memdelay); /* * To avoid priority inversions we want to just take a slot if we are @@ -324,7 +330,7 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, struct child_latency_info *lat_info, bool up) { - unsigned long qd = blkiolat->rqos.q->nr_requests; + unsigned long qd = blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); unsigned long old = atomic_read(&lat_info->scale_cookie); unsigned long max_scale = qd << 1; @@ -358,15 +364,17 @@ static void scale_cookie_change(struct blk_iolatency *blkiolat, } /* - * Change the queue depth of the iolatency_grp. 
We add/subtract 1/16th of the + * Change the queue depth of the iolatency_grp. We add 1/16th of the * queue depth at a time so we don't get wild swings and hopefully dial in to - * fairer distribution of the overall queue depth. + * fairer distribution of the overall queue depth. We halve the queue depth + * at a time so we can scale down queue depth quickly from default unlimited + * to target. */ static void scale_change(struct iolatency_grp *iolat, bool up) { - unsigned long qd = iolat->blkiolat->rqos.q->nr_requests; + unsigned long qd = iolat->blkiolat->rqos.disk->queue->nr_requests; unsigned long scale = scale_amount(qd, up); - unsigned long old = iolat->rq_depth.max_depth; + unsigned long old = iolat->max_depth; if (old > qd) old = qd; @@ -378,12 +386,12 @@ static void scale_change(struct iolatency_grp *iolat, bool up) if (old < qd) { old += scale; old = min(old, qd); - iolat->rq_depth.max_depth = old; + iolat->max_depth = old; wake_up_all(&iolat->rq_wait.wait); } } else { old >>= 1; - iolat->rq_depth.max_depth = max(old, 1UL); + iolat->max_depth = max(old, 1UL); } } @@ -395,12 +403,8 @@ static void check_scale_change(struct iolatency_grp *iolat) unsigned int cur_cookie; unsigned int our_cookie = atomic_read(&iolat->scale_cookie); u64 scale_lat; - unsigned int old; int direction = 0; - if (lat_to_blkg(iolat)->parent == NULL) - return; - parent = blkg_to_lat(lat_to_blkg(iolat)->parent); if (!parent) return; @@ -416,11 +420,10 @@ static void check_scale_change(struct iolatency_grp *iolat) else return; - old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie); - - /* Somebody beat us to the punch, just bail. */ - if (old != our_cookie) + if (!atomic_try_cmpxchg(&iolat->scale_cookie, &our_cookie, cur_cookie)) { + /* Somebody beat us to the punch, just bail. */ return; + } if (direction < 0 && iolat->min_lat_nsec) { u64 samples_thresh; @@ -441,7 +444,7 @@ static void check_scale_change(struct iolatency_grp *iolat) } /* We're as low as we can go. 
*/ - if (iolat->rq_depth.max_depth == 1 && direction < 0) { + if (iolat->max_depth == 1 && direction < 0) { blkcg_use_delay(lat_to_blkg(iolat)); return; } @@ -449,7 +452,7 @@ static void check_scale_change(struct iolatency_grp *iolat) /* We're back to the default cookie, unthrottle all the things. */ if (cur_cookie == DEFAULT_SCALE_COOKIE) { blkcg_clear_delay(lat_to_blkg(iolat)); - iolat->rq_depth.max_depth = UINT_MAX; + iolat->max_depth = UINT_MAX; wake_up_all(&iolat->rq_wait.wait); return; } @@ -463,7 +466,7 @@ static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio) struct blkcg_gq *blkg = bio->bi_blkg; bool issue_as_root = bio_issue_as_root_blkg(bio); - if (!blk_iolatency_enabled(blkiolat)) + if (!blkiolat->enabled) return; while (blkg && blkg->parent) { @@ -504,7 +507,7 @@ static void iolatency_record_time(struct iolatency_grp *iolat, * We don't want to count issue_as_root bio's in the cgroups latency * statistics as it could skew the numbers downwards. */ - if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) { + if (unlikely(issue_as_root && iolat->max_depth != UINT_MAX)) { u64 sub = iolat->min_lat_nsec; if (req_time < sub) blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time); @@ -591,23 +594,22 @@ static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) struct rq_wait *rqw; struct iolatency_grp *iolat; u64 window_start; - u64 now = ktime_to_ns(ktime_get()); + u64 now; bool issue_as_root = bio_issue_as_root_blkg(bio); - bool enabled = false; int inflight = 0; blkg = bio->bi_blkg; - if (!blkg || !bio_flagged(bio, BIO_TRACKED)) + if (!blkg || !bio_flagged(bio, BIO_QOS_THROTTLED)) return; iolat = blkg_to_lat(bio->bi_blkg); if (!iolat) return; - enabled = blk_iolatency_enabled(iolat->blkiolat); - if (!enabled) + if (!iolat->blkiolat->enabled) return; + now = ktime_to_ns(ktime_get()); while (blkg && blkg->parent) { iolat = blkg_to_lat(blkg); if (!iolat) { @@ -628,8 +630,8 @@ static void 
blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio) window_start = atomic64_read(&iolat->window_start); if (now > window_start && (now - window_start) >= iolat->cur_win_nsec) { - if (atomic64_cmpxchg(&iolat->window_start, - window_start, now) == window_start) + if (atomic64_try_cmpxchg(&iolat->window_start, + &window_start, now)) iolatency_check_latencies(iolat, now); } } @@ -642,12 +644,13 @@ static void blkcg_iolatency_exit(struct rq_qos *rqos) { struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); - del_timer_sync(&blkiolat->timer); - blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency); + timer_shutdown_sync(&blkiolat->timer); + flush_work(&blkiolat->enable_work); + blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iolatency); kfree(blkiolat); } -static struct rq_qos_ops blkcg_iolatency_ops = { +static const struct rq_qos_ops blkcg_iolatency_ops = { .throttle = blkcg_iolatency_throttle, .done_bio = blkcg_iolatency_done_bio, .exit = blkcg_iolatency_exit, @@ -662,7 +665,7 @@ static void blkiolatency_timer_fn(struct timer_list *t) rcu_read_lock(); blkg_for_each_descendant_pre(blkg, pos_css, - blkiolat->rqos.q->root_blkg) { + blkiolat->rqos.disk->queue->root_blkg) { struct iolatency_grp *iolat; struct child_latency_info *lat_info; unsigned long flags; @@ -714,42 +717,77 @@ next: rcu_read_unlock(); } -int blk_iolatency_init(struct request_queue *q) +/** + * blkiolatency_enable_work_fn - Enable or disable iolatency on the device + * @work: enable_work of the blk_iolatency of interest + * + * iolatency needs to keep track of the number of in-flight IOs per cgroup. This + * is relatively expensive as it involves walking up the hierarchy twice for + * every IO. Thus, if iolatency is not enabled in any cgroup for the device, we + * want to disable the in-flight tracking. + * + * We have to make sure that the counting is balanced - we don't want to leak + * the in-flight counts by disabling accounting in the completion path while IOs + * are in flight. 
This is achieved by ensuring that no IO is in flight by + * freezing the queue while flipping ->enabled. As this requires a sleepable + * context, ->enabled flipping is punted to this work function. + */ +static void blkiolatency_enable_work_fn(struct work_struct *work) +{ + struct blk_iolatency *blkiolat = container_of(work, struct blk_iolatency, + enable_work); + bool enabled; + + /* + * There can only be one instance of this function running for @blkiolat + * and it's guaranteed to be executed at least once after the latest + * ->enabled_cnt modification. Acting on the latest ->enable_cnt is + * sufficient. + * + * Also, we know @blkiolat is safe to access as ->enable_work is flushed + * in blkcg_iolatency_exit(). + */ + enabled = atomic_read(&blkiolat->enable_cnt); + if (enabled != blkiolat->enabled) { + blk_mq_freeze_queue(blkiolat->rqos.disk->queue); + blkiolat->enabled = enabled; + blk_mq_unfreeze_queue(blkiolat->rqos.disk->queue); + } +} + +static int blk_iolatency_init(struct gendisk *disk) { struct blk_iolatency *blkiolat; - struct rq_qos *rqos; int ret; blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL); if (!blkiolat) return -ENOMEM; - rqos = &blkiolat->rqos; - rqos->id = RQ_QOS_LATENCY; - rqos->ops = &blkcg_iolatency_ops; - rqos->q = q; - - rq_qos_add(q, rqos); - - ret = blkcg_activate_policy(q, &blkcg_policy_iolatency); - if (ret) { - rq_qos_del(q, rqos); - kfree(blkiolat); - return ret; - } + ret = rq_qos_add(&blkiolat->rqos, disk, RQ_QOS_LATENCY, + &blkcg_iolatency_ops); + if (ret) + goto err_free; + ret = blkcg_activate_policy(disk, &blkcg_policy_iolatency); + if (ret) + goto err_qos_del; timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0); + INIT_WORK(&blkiolat->enable_work, blkiolatency_enable_work_fn); return 0; + +err_qos_del: + rq_qos_del(&blkiolat->rqos); +err_free: + kfree(blkiolat); + return ret; } -/* - * return 1 for enabling iolatency, return -1 for disabling iolatency, otherwise - * return 0. 
- */ -static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) +static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) { struct iolatency_grp *iolat = blkg_to_lat(blkg); + struct blk_iolatency *blkiolat = iolat->blkiolat; u64 oldval = iolat->min_lat_nsec; iolat->min_lat_nsec = val; @@ -757,13 +795,15 @@ static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val) iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec, BLKIOLATENCY_MAX_WIN_SIZE); - if (!oldval && val) - return 1; + if (!oldval && val) { + if (atomic_inc_return(&blkiolat->enable_cnt) == 1) + schedule_work(&blkiolat->enable_work); + } if (oldval && !val) { blkcg_clear_delay(blkg); - return -1; + if (atomic_dec_return(&blkiolat->enable_cnt) == 0) + schedule_work(&blkiolat->enable_work); } - return 0; } static void iolatency_clear_scaling(struct blkcg_gq *blkg) @@ -795,11 +835,26 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, u64 lat_val = 0; u64 oldval; int ret; - int enable = 0; - ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_open_bdev(&ctx); + if (ret) + goto out; + + /* + * blk_iolatency_init() may fail after rq_qos_add() succeeds which can + * confuse iolat_rq_qos() test. Make the test and init atomic. 
+ */ + lockdep_assert_held(&ctx.bdev->bd_queue->rq_qos_mutex); + if (!iolat_rq_qos(ctx.bdev->bd_queue)) + ret = blk_iolatency_init(ctx.bdev->bd_disk); + if (ret) + goto out; + + ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, &ctx); if (ret) - return ret; + goto out; iolat = blkg_to_lat(ctx.blkg); p = ctx.body; @@ -830,37 +885,12 @@ static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf, blkg = ctx.blkg; oldval = iolat->min_lat_nsec; - enable = iolatency_set_min_lat_nsec(blkg, lat_val); - if (enable) { - WARN_ON_ONCE(!blk_get_queue(blkg->q)); - blkg_get(blkg); - } - - if (oldval != iolat->min_lat_nsec) { + iolatency_set_min_lat_nsec(blkg, lat_val); + if (oldval != iolat->min_lat_nsec) iolatency_clear_scaling(blkg); - } - ret = 0; out: - blkg_conf_finish(&ctx); - if (ret == 0 && enable) { - struct iolatency_grp *tmp = blkg_to_lat(blkg); - struct blk_iolatency *blkiolat = tmp->blkiolat; - - blk_mq_freeze_queue(blkg->q); - - if (enable == 1) - atomic_inc(&blkiolat->enabled); - else if (enable == -1) - atomic_dec(&blkiolat->enabled); - else - WARN_ON_ONCE(1); - - blk_mq_unfreeze_queue(blkg->q); - - blkg_put(blkg); - blk_put_queue(blkg->q); - } + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -885,8 +915,7 @@ static int iolatency_print_limit(struct seq_file *sf, void *v) return 0; } -static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, - size_t size) +static void iolatency_ssd_stat(struct iolatency_grp *iolat, struct seq_file *s) { struct latency_stat stat; int cpu; @@ -900,47 +929,45 @@ static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf, } preempt_enable(); - if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " missed=%llu total=%llu depth=max", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total); - return scnprintf(buf, size, " missed=%llu total=%llu depth=%u", - (unsigned long long)stat.ps.missed, - (unsigned long long)stat.ps.total, - 
iolat->rq_depth.max_depth); + if (iolat->max_depth == UINT_MAX) + seq_printf(s, " missed=%llu total=%llu depth=max", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total); + else + seq_printf(s, " missed=%llu total=%llu depth=%u", + (unsigned long long)stat.ps.missed, + (unsigned long long)stat.ps.total, + iolat->max_depth); } -static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf, - size_t size) +static void iolatency_pd_stat(struct blkg_policy_data *pd, struct seq_file *s) { struct iolatency_grp *iolat = pd_to_lat(pd); unsigned long long avg_lat; unsigned long long cur_win; if (!blkcg_debug_stats) - return 0; + return; if (iolat->ssd) - return iolatency_ssd_stat(iolat, buf, size); + return iolatency_ssd_stat(iolat, s); avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC); cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC); - if (iolat->rq_depth.max_depth == UINT_MAX) - return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu", - avg_lat, cur_win); - - return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu", - iolat->rq_depth.max_depth, avg_lat, cur_win); + if (iolat->max_depth == UINT_MAX) + seq_printf(s, " depth=max avg_lat=%llu win=%llu", + avg_lat, cur_win); + else + seq_printf(s, " depth=%u avg_lat=%llu win=%llu", + iolat->max_depth, avg_lat, cur_win); } - -static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *iolatency_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct iolatency_grp *iolat; - iolat = kzalloc_node(sizeof(*iolat), gfp, q->node); + iolat = kzalloc_node(sizeof(*iolat), gfp, disk->node_id); if (!iolat) return NULL; iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat), @@ -956,7 +983,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct rq_qos *rqos = 
blkcg_rq_qos(blkg->q); + struct rq_qos *rqos = iolat_rq_qos(blkg->q); struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos); u64 now = ktime_to_ns(ktime_get()); int cpu; @@ -975,9 +1002,7 @@ static void iolatency_pd_init(struct blkg_policy_data *pd) latency_stat_init(iolat, &iolat->cur_stat); rq_wait_init(&iolat->rq_wait); spin_lock_init(&iolat->child_lat.lock); - iolat->rq_depth.queue_depth = blkg->q->nr_requests; - iolat->rq_depth.max_depth = UINT_MAX; - iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth; + iolat->max_depth = UINT_MAX; iolat->blkiolat = blkiolat; iolat->cur_win_nsec = 100 * NSEC_PER_MSEC; atomic64_set(&iolat->window_start, now); @@ -1001,14 +1026,8 @@ static void iolatency_pd_offline(struct blkg_policy_data *pd) { struct iolatency_grp *iolat = pd_to_lat(pd); struct blkcg_gq *blkg = lat_to_blkg(iolat); - struct blk_iolatency *blkiolat = iolat->blkiolat; - int ret; - ret = iolatency_set_min_lat_nsec(blkg, 0); - if (ret == 1) - atomic_inc(&blkiolat->enabled); - if (ret == -1) - atomic_dec(&blkiolat->enabled); + iolatency_set_min_lat_nsec(blkg, 0); iolatency_clear_scaling(blkg); } @@ -1045,7 +1064,7 @@ static int __init iolatency_init(void) static void __exit iolatency_exit(void) { - return blkcg_policy_unregister(&blkcg_policy_iolatency); + blkcg_policy_unregister(&blkcg_policy_iolatency); } module_init(iolatency_init); diff --git a/block/blk-ioprio.c b/block/blk-ioprio.c new file mode 100644 index 000000000000..4051fada01f1 --- /dev/null +++ b/block/blk-ioprio.c @@ -0,0 +1,243 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Block rq-qos policy for assigning an I/O priority class to requests. + * + * Using an rq-qos policy for assigning I/O priority class has two advantages + * over using the ioprio_set() system call: + * + * - This policy is cgroup based so it has all the advantages of cgroups. 
+ * - While ioprio_set() does not affect page cache writeback I/O, this rq-qos + * controller affects page cache writeback I/O for filesystems that support + * assiociating a cgroup with writeback I/O. See also + * Documentation/admin-guide/cgroup-v2.rst. + */ + +#include <linux/blk-mq.h> +#include <linux/blk_types.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include "blk-cgroup.h" +#include "blk-ioprio.h" +#include "blk-rq-qos.h" + +/** + * enum prio_policy - I/O priority class policy. + * @POLICY_NO_CHANGE: (default) do not modify the I/O priority class. + * @POLICY_PROMOTE_TO_RT: modify no-IOPRIO_CLASS_RT to IOPRIO_CLASS_RT. + * @POLICY_RESTRICT_TO_BE: modify IOPRIO_CLASS_NONE and IOPRIO_CLASS_RT into + * IOPRIO_CLASS_BE. + * @POLICY_ALL_TO_IDLE: change the I/O priority class into IOPRIO_CLASS_IDLE. + * @POLICY_NONE_TO_RT: an alias for POLICY_PROMOTE_TO_RT. + * + * See also <linux/ioprio.h>. + */ +enum prio_policy { + POLICY_NO_CHANGE = 0, + POLICY_PROMOTE_TO_RT = 1, + POLICY_RESTRICT_TO_BE = 2, + POLICY_ALL_TO_IDLE = 3, + POLICY_NONE_TO_RT = 4, +}; + +static const char *policy_name[] = { + [POLICY_NO_CHANGE] = "no-change", + [POLICY_PROMOTE_TO_RT] = "promote-to-rt", + [POLICY_RESTRICT_TO_BE] = "restrict-to-be", + [POLICY_ALL_TO_IDLE] = "idle", + [POLICY_NONE_TO_RT] = "none-to-rt", +}; + +static struct blkcg_policy ioprio_policy; + +/** + * struct ioprio_blkg - Per (cgroup, request queue) data. + * @pd: blkg_policy_data structure. + */ +struct ioprio_blkg { + struct blkg_policy_data pd; +}; + +/** + * struct ioprio_blkcg - Per cgroup data. + * @cpd: blkcg_policy_data structure. + * @prio_policy: One of the IOPRIO_CLASS_* values. See also <linux/ioprio.h>. + */ +struct ioprio_blkcg { + struct blkcg_policy_data cpd; + enum prio_policy prio_policy; +}; + +static inline struct ioprio_blkg *pd_to_ioprio(struct blkg_policy_data *pd) +{ + return pd ? 
container_of(pd, struct ioprio_blkg, pd) : NULL; +} + +static struct ioprio_blkcg *blkcg_to_ioprio_blkcg(struct blkcg *blkcg) +{ + return container_of(blkcg_to_cpd(blkcg, &ioprio_policy), + struct ioprio_blkcg, cpd); +} + +static struct ioprio_blkcg * +ioprio_blkcg_from_css(struct cgroup_subsys_state *css) +{ + return blkcg_to_ioprio_blkcg(css_to_blkcg(css)); +} + +static struct ioprio_blkcg *ioprio_blkcg_from_bio(struct bio *bio) +{ + struct blkg_policy_data *pd = blkg_to_pd(bio->bi_blkg, &ioprio_policy); + + if (!pd) + return NULL; + + return blkcg_to_ioprio_blkcg(pd->blkg->blkcg); +} + +static int ioprio_show_prio_policy(struct seq_file *sf, void *v) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(seq_css(sf)); + + seq_printf(sf, "%s\n", policy_name[blkcg->prio_policy]); + return 0; +} + +static ssize_t ioprio_set_prio_policy(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_css(of_css(of)); + int ret; + + if (off != 0) + return -EIO; + /* kernfs_fop_write_iter() terminates 'buf' with '\0'. 
*/ + ret = sysfs_match_string(policy_name, buf); + if (ret < 0) + return ret; + blkcg->prio_policy = ret; + return nbytes; +} + +static struct blkg_policy_data * +ioprio_alloc_pd(struct gendisk *disk, struct blkcg *blkcg, gfp_t gfp) +{ + struct ioprio_blkg *ioprio_blkg; + + ioprio_blkg = kzalloc(sizeof(*ioprio_blkg), gfp); + if (!ioprio_blkg) + return NULL; + + return &ioprio_blkg->pd; +} + +static void ioprio_free_pd(struct blkg_policy_data *pd) +{ + struct ioprio_blkg *ioprio_blkg = pd_to_ioprio(pd); + + kfree(ioprio_blkg); +} + +static struct blkcg_policy_data *ioprio_alloc_cpd(gfp_t gfp) +{ + struct ioprio_blkcg *blkcg; + + blkcg = kzalloc(sizeof(*blkcg), gfp); + if (!blkcg) + return NULL; + blkcg->prio_policy = POLICY_NO_CHANGE; + return &blkcg->cpd; +} + +static void ioprio_free_cpd(struct blkcg_policy_data *cpd) +{ + struct ioprio_blkcg *blkcg = container_of(cpd, typeof(*blkcg), cpd); + + kfree(blkcg); +} + +#define IOPRIO_ATTRS \ + { \ + .name = "prio.class", \ + .seq_show = ioprio_show_prio_policy, \ + .write = ioprio_set_prio_policy, \ + }, \ + { } /* sentinel */ + +/* cgroup v2 attributes */ +static struct cftype ioprio_files[] = { + IOPRIO_ATTRS +}; + +/* cgroup v1 attributes */ +static struct cftype ioprio_legacy_files[] = { + IOPRIO_ATTRS +}; + +static struct blkcg_policy ioprio_policy = { + .dfl_cftypes = ioprio_files, + .legacy_cftypes = ioprio_legacy_files, + + .cpd_alloc_fn = ioprio_alloc_cpd, + .cpd_free_fn = ioprio_free_cpd, + + .pd_alloc_fn = ioprio_alloc_pd, + .pd_free_fn = ioprio_free_pd, +}; + +void blkcg_set_ioprio(struct bio *bio) +{ + struct ioprio_blkcg *blkcg = ioprio_blkcg_from_bio(bio); + u16 prio; + + if (!blkcg || blkcg->prio_policy == POLICY_NO_CHANGE) + return; + + if (blkcg->prio_policy == POLICY_PROMOTE_TO_RT || + blkcg->prio_policy == POLICY_NONE_TO_RT) { + /* + * For RT threads, the default priority level is 4 because + * task_nice is 0. 
By promoting non-RT io-priority to RT-class + * and default level 4, those requests that are already + * RT-class but need a higher io-priority can use ioprio_set() + * to achieve this. + */ + if (IOPRIO_PRIO_CLASS(bio->bi_ioprio) != IOPRIO_CLASS_RT) + bio->bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 4); + return; + } + + /* + * Except for IOPRIO_CLASS_NONE, higher I/O priority numbers + * correspond to a lower priority. Hence, the max_t() below selects + * the lower priority of bi_ioprio and the cgroup I/O priority class. + * If the bio I/O priority equals IOPRIO_CLASS_NONE, the cgroup I/O + * priority is assigned to the bio. + */ + prio = max_t(u16, bio->bi_ioprio, + IOPRIO_PRIO_VALUE(blkcg->prio_policy, 0)); + if (prio > bio->bi_ioprio) + bio->bi_ioprio = prio; +} + +void blk_ioprio_exit(struct gendisk *disk) +{ + blkcg_deactivate_policy(disk, &ioprio_policy); +} + +int blk_ioprio_init(struct gendisk *disk) +{ + return blkcg_activate_policy(disk, &ioprio_policy); +} + +static int __init ioprio_init(void) +{ + return blkcg_policy_register(&ioprio_policy); +} + +static void __exit ioprio_exit(void) +{ + blkcg_policy_unregister(&ioprio_policy); +} + +module_init(ioprio_init); +module_exit(ioprio_exit); diff --git a/block/blk-ioprio.h b/block/blk-ioprio.h new file mode 100644 index 000000000000..b6afb8e80de0 --- /dev/null +++ b/block/blk-ioprio.h @@ -0,0 +1,28 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef _BLK_IOPRIO_H_ +#define _BLK_IOPRIO_H_ + +#include <linux/kconfig.h> + +struct request_queue; +struct bio; + +#ifdef CONFIG_BLK_CGROUP_IOPRIO +int blk_ioprio_init(struct gendisk *disk); +void blk_ioprio_exit(struct gendisk *disk); +void blkcg_set_ioprio(struct bio *bio); +#else +static inline int blk_ioprio_init(struct gendisk *disk) +{ + return 0; +} +static inline void blk_ioprio_exit(struct gendisk *disk) +{ +} +static inline void blkcg_set_ioprio(struct bio *bio) +{ +} +#endif + +#endif /* _BLK_IOPRIO_H_ */ diff --git a/block/blk-lib.c 
b/block/blk-lib.c index 5f2c429d4378..e59c3069e835 100644 --- a/block/blk-lib.c +++ b/block/blk-lib.c @@ -10,41 +10,47 @@ #include "blk.h" -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp) +static sector_t bio_discard_limit(struct block_device *bdev, sector_t sector) { - struct bio *new = bio_alloc(gfp, nr_pages); - - if (bio) { - bio_chain(bio, new); - submit_bio(bio); - } - - return new; + unsigned int discard_granularity = bdev_discard_granularity(bdev); + sector_t granularity_aligned_sector; + + if (bdev_is_partition(bdev)) + sector += bdev->bd_start_sect; + + granularity_aligned_sector = + round_up(sector, discard_granularity >> SECTOR_SHIFT); + + /* + * Make sure subsequent bios start aligned to the discard granularity if + * it needs to be split. + */ + if (granularity_aligned_sector != sector) + return granularity_aligned_sector - sector; + + /* + * Align the bio size to the discard granularity to make splitting the bio + * at discard granularity boundaries easier in the driver if needed. 
+ */ + return round_down(UINT_MAX, discard_granularity) >> SECTOR_SHIFT; } int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, int flags, - struct bio **biop) + sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; - unsigned int op; sector_t bs_mask; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; + if (!bdev_max_discard_sectors(bdev)) + return -EOPNOTSUPP; - if (flags & BLKDEV_DISCARD_SECURE) { - if (!blk_queue_secure_erase(q)) - return -EOPNOTSUPP; - op = REQ_OP_SECURE_ERASE; - } else { - if (!blk_queue_discard(q)) - return -EOPNOTSUPP; - op = REQ_OP_DISCARD; + /* In case the discard granularity isn't set by buggy device driver */ + if (WARN_ON_ONCE(!bdev_discard_granularity(bdev))) { + pr_err_ratelimited("%pg: Error: discard_granularity is 0.\n", + bdev); + return -EOPNOTSUPP; } bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; @@ -55,16 +61,11 @@ int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return -EINVAL; while (nr_sects) { - sector_t req_sects = min_t(sector_t, nr_sects, - bio_allowed_max_sectors(q)); - - WARN_ON_ONCE((req_sects << 9) > UINT_MAX); + sector_t req_sects = + min(nr_sects, bio_discard_limit(bdev, sector)); - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_DISCARD, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, op, 0); - bio->bi_iter.bi_size = req_sects << 9; sector += req_sects; nr_sects -= req_sects; @@ -89,21 +90,19 @@ EXPORT_SYMBOL(__blkdev_issue_discard); * @sector: start sector * @nr_sects: number of sectors to discard * @gfp_mask: memory allocation flags (for bio_alloc) - * @flags: BLKDEV_DISCARD_* flags to control behaviour * * Description: * Issue a discard request for the sectors in question. 
*/ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, unsigned long flags) + sector_t nr_sects, gfp_t gfp_mask) { struct bio *bio = NULL; struct blk_plug plug; int ret; blk_start_plug(&plug); - ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, flags, - &bio); + ret = __blkdev_issue_discard(bdev, sector, nr_sects, gfp_mask, &bio); if (!ret && bio) { ret = submit_bio_wait(bio); if (ret == -EOPNOTSUPP) @@ -116,109 +115,12 @@ int blkdev_issue_discard(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL(blkdev_issue_discard); -/** - * __blkdev_issue_write_same - generate number of bios with same page - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data to write - * @biop: pointer to anchor bio - * - * Description: - * Generate and issue number of bios(REQ_OP_WRITE_SAME) with same page. - */ -static int __blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, struct page *page, - struct bio **biop) -{ - struct request_queue *q = bdev_get_queue(bdev); - unsigned int max_write_same_sectors; - struct bio *bio = *biop; - sector_t bs_mask; - - if (!q) - return -ENXIO; - - if (bdev_read_only(bdev)) - return -EPERM; - - bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; - if ((sector | nr_sects) & bs_mask) - return -EINVAL; - - if (!bdev_write_same(bdev)) - return -EOPNOTSUPP; - - /* Ensure that max_write_same_sectors doesn't overflow bi_size */ - max_write_same_sectors = bio_allowed_max_sectors(q); - - while (nr_sects) { - bio = blk_next_bio(bio, 1, gfp_mask); - bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio->bi_vcnt = 1; - bio->bi_io_vec->bv_page = page; - bio->bi_io_vec->bv_offset = 0; - bio->bi_io_vec->bv_len = bdev_logical_block_size(bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE_SAME, 0); - - if (nr_sects > 
max_write_same_sectors) { - bio->bi_iter.bi_size = max_write_same_sectors << 9; - nr_sects -= max_write_same_sectors; - sector += max_write_same_sectors; - } else { - bio->bi_iter.bi_size = nr_sects << 9; - nr_sects = 0; - } - cond_resched(); - } - - *biop = bio; - return 0; -} - -/** - * blkdev_issue_write_same - queue a write same operation - * @bdev: target blockdev - * @sector: start sector - * @nr_sects: number of sectors to write - * @gfp_mask: memory allocation flags (for bio_alloc) - * @page: page containing data - * - * Description: - * Issue a write same request for the sectors in question. - */ -int blkdev_issue_write_same(struct block_device *bdev, sector_t sector, - sector_t nr_sects, gfp_t gfp_mask, - struct page *page) -{ - struct bio *bio = NULL; - struct blk_plug plug; - int ret; - - blk_start_plug(&plug); - ret = __blkdev_issue_write_same(bdev, sector, nr_sects, gfp_mask, page, - &bio); - if (ret == 0 && bio) { - ret = submit_bio_wait(bio); - bio_put(bio); - } - blk_finish_plug(&plug); - return ret; -} -EXPORT_SYMBOL(blkdev_issue_write_same); - static int __blkdev_issue_write_zeroes(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop, unsigned flags) { struct bio *bio = *biop; unsigned int max_write_zeroes_sectors; - struct request_queue *q = bdev_get_queue(bdev); - - if (!q) - return -ENXIO; if (bdev_read_only(bdev)) return -EPERM; @@ -230,10 +132,8 @@ static int __blkdev_issue_write_zeroes(struct block_device *bdev, return -EOPNOTSUPP; while (nr_sects) { - bio = blk_next_bio(bio, 0, gfp_mask); + bio = blk_next_bio(bio, bdev, 0, REQ_OP_WRITE_ZEROES, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio->bi_opf = REQ_OP_WRITE_ZEROES; if (flags & BLKDEV_ZERO_NOUNMAP) bio->bi_opf |= REQ_NOUNMAP; @@ -262,30 +162,24 @@ static unsigned int __blkdev_sectors_to_bio_pages(sector_t nr_sects) { sector_t pages = DIV_ROUND_UP_SECTOR_T(nr_sects, PAGE_SIZE / 512); - return min(pages, 
(sector_t)BIO_MAX_PAGES); + return min(pages, (sector_t)BIO_MAX_VECS); } static int __blkdev_issue_zero_pages(struct block_device *bdev, sector_t sector, sector_t nr_sects, gfp_t gfp_mask, struct bio **biop) { - struct request_queue *q = bdev_get_queue(bdev); struct bio *bio = *biop; int bi_size = 0; unsigned int sz; - if (!q) - return -ENXIO; - if (bdev_read_only(bdev)) return -EPERM; while (nr_sects != 0) { - bio = blk_next_bio(bio, __blkdev_sectors_to_bio_pages(nr_sects), - gfp_mask); + bio = blk_next_bio(bio, bdev, __blkdev_sectors_to_bio_pages(nr_sects), + REQ_OP_WRITE, gfp_mask); bio->bi_iter.bi_sector = sector; - bio_set_dev(bio, bdev); - bio_set_op_attrs(bio, REQ_OP_WRITE, 0); while (nr_sects != 0) { sz = min((sector_t) PAGE_SIZE, nr_sects << 9); @@ -405,3 +299,47 @@ retry: return ret; } EXPORT_SYMBOL(blkdev_issue_zeroout); + +int blkdev_issue_secure_erase(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp) +{ + sector_t bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + unsigned int max_sectors = bdev_max_secure_erase_sectors(bdev); + struct bio *bio = NULL; + struct blk_plug plug; + int ret = 0; + + /* make sure that "len << SECTOR_SHIFT" doesn't overflow */ + if (max_sectors > UINT_MAX >> SECTOR_SHIFT) + max_sectors = UINT_MAX >> SECTOR_SHIFT; + max_sectors &= ~bs_mask; + + if (max_sectors == 0) + return -EOPNOTSUPP; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + if (bdev_read_only(bdev)) + return -EPERM; + + blk_start_plug(&plug); + for (;;) { + unsigned int len = min_t(sector_t, nr_sects, max_sectors); + + bio = blk_next_bio(bio, bdev, 0, REQ_OP_SECURE_ERASE, gfp); + bio->bi_iter.bi_sector = sector; + bio->bi_iter.bi_size = len << SECTOR_SHIFT; + + sector += len; + nr_sects -= len; + if (!nr_sects) { + ret = submit_bio_wait(bio); + bio_put(bio); + break; + } + cond_resched(); + } + blk_finish_plug(&plug); + + return ret; +} +EXPORT_SYMBOL(blkdev_issue_secure_erase); diff --git a/block/blk-map.c b/block/blk-map.c 
index 6e804892d5ec..71210cdb3442 100644 --- a/block/blk-map.c +++ b/block/blk-map.c @@ -12,7 +12,8 @@ #include "blk.h" struct bio_map_data { - int is_our_pages; + bool is_our_pages : 1; + bool is_null_mapped : 1; struct iov_iter iter; struct iovec iov[]; }; @@ -28,9 +29,11 @@ static struct bio_map_data *bio_alloc_map_data(struct iov_iter *data, bmd = kmalloc(struct_size(bmd, iov, data->nr_segs), gfp_mask); if (!bmd) return NULL; - memcpy(bmd->iov, data->iov, sizeof(struct iovec) * data->nr_segs); bmd->iter = *data; - bmd->iter.iov = bmd->iov; + if (iter_is_iovec(data)) { + memcpy(bmd->iov, iter_iov(data), sizeof(struct iovec) * data->nr_segs); + bmd->iter.__iov = bmd->iov; + } return bmd; } @@ -108,7 +111,7 @@ static int bio_uncopy_user(struct bio *bio) struct bio_map_data *bmd = bio->bi_private; int ret = 0; - if (!bio_flagged(bio, BIO_NULL_MAPPED)) { + if (!bmd->is_null_mapped) { /* * if we're in a workqueue, the request is orphaned, so * don't copy into a random user address space, just free @@ -122,24 +125,11 @@ static int bio_uncopy_user(struct bio *bio) bio_free_pages(bio); } kfree(bmd); - bio_put(bio); return ret; } -/** - * bio_copy_user_iov - copy user data to bio - * @q: destination block queue - * @map_data: pointer to the rq_map_data holding pages (if necessary) - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Prepares and returns a bio for indirect user io, bouncing data - * to/from kernel pages as necessary. Must be paired with - * call bio_uncopy_user() on io completion. 
- */ -static struct bio *bio_copy_user_iov(struct request_queue *q, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask) +static int bio_copy_user_iov(struct request *rq, struct rq_map_data *map_data, + struct iov_iter *iter, gfp_t gfp_mask) { struct bio_map_data *bmd; struct page *page; @@ -151,28 +141,26 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, bmd = bio_alloc_map_data(iter, gfp_mask); if (!bmd) - return ERR_PTR(-ENOMEM); + return -ENOMEM; /* * We need to do a deep copy of the iov_iter including the iovecs. * The caller provided iov might point to an on-stack or otherwise * shortlived one. */ - bmd->is_our_pages = map_data ? 0 : 1; + bmd->is_our_pages = !map_data; + bmd->is_null_mapped = (map_data && map_data->null_mapped); - nr_pages = DIV_ROUND_UP(offset + len, PAGE_SIZE); - if (nr_pages > BIO_MAX_PAGES) - nr_pages = BIO_MAX_PAGES; + nr_pages = bio_max_segs(DIV_ROUND_UP(offset + len, PAGE_SIZE)); ret = -ENOMEM; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) goto out_bmd; - - ret = 0; + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, req_op(rq)); if (map_data) { - nr_pages = 1 << map_data->page_order; + nr_pages = 1U << map_data->page_order; i = map_data->offset / PAGE_SIZE; } while (len) { @@ -186,7 +174,7 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, if (map_data) { if (i == map_data->nr_entries * nr_pages) { ret = -ENOMEM; - break; + goto cleanup; } page = map_data->pages[i / nr_pages]; @@ -194,14 +182,14 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, i++; } else { - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | gfp_mask); if (!page) { ret = -ENOMEM; - break; + goto cleanup; } } - if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { + if (bio_add_pc_page(rq->q, bio, page, bytes, offset) < bytes) { if (!map_data) __free_page(page); break; @@ -211,21 +199,25 @@ static struct bio 
*bio_copy_user_iov(struct request_queue *q, offset = 0; } - if (ret) - goto cleanup; - if (map_data) map_data->offset += bio->bi_iter.bi_size; /* * success */ - if ((iov_iter_rw(iter) == WRITE && - (!map_data || !map_data->null_mapped)) || - (map_data && map_data->from_user)) { + if (iov_iter_rw(iter) == WRITE && + (!map_data || !map_data->null_mapped)) { ret = bio_copy_from_iter(bio, iter); if (ret) goto cleanup; + } else if (map_data && map_data->from_user) { + struct iov_iter iter2 = *iter; + + /* This is the copy-in part of SG_DXFER_TO_FROM_DEV. */ + iter2.data_source = ITER_SOURCE; + ret = bio_copy_from_iter(bio, &iter2); + if (ret) + goto cleanup; } else { if (bmd->is_our_pages) zero_fill_bio(bio); @@ -233,49 +225,84 @@ static struct bio *bio_copy_user_iov(struct request_queue *q, } bio->bi_private = bmd; - if (map_data && map_data->null_mapped) - bio_set_flag(bio, BIO_NULL_MAPPED); - return bio; + + ret = blk_rq_append_bio(rq, bio); + if (ret) + goto cleanup; + return 0; cleanup: if (!map_data) bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); out_bmd: kfree(bmd); - return ERR_PTR(ret); + return ret; } -/** - * bio_map_user_iov - map user iovec into bio - * @q: the struct request_queue for the bio - * @iter: iovec iterator - * @gfp_mask: memory allocation flags - * - * Map the user space address into a bio suitable for io to a block - * device. Returns an error pointer in case of error. 
- */ -static struct bio *bio_map_user_iov(struct request_queue *q, - struct iov_iter *iter, gfp_t gfp_mask) +static void blk_mq_map_bio_put(struct bio *bio) +{ + if (bio->bi_opf & REQ_ALLOC_CACHE) { + bio_put(bio); + } else { + bio_uninit(bio); + kfree(bio); + } +} + +static struct bio *blk_rq_map_bio_alloc(struct request *rq, + unsigned int nr_vecs, gfp_t gfp_mask) { - unsigned int max_sectors = queue_max_hw_sectors(q); - int j; + struct bio *bio; + + if (rq->cmd_flags & REQ_ALLOC_CACHE && (nr_vecs <= BIO_INLINE_VECS)) { + bio = bio_alloc_bioset(NULL, nr_vecs, rq->cmd_flags, gfp_mask, + &fs_bio_set); + if (!bio) + return NULL; + } else { + bio = bio_kmalloc(nr_vecs, gfp_mask); + if (!bio) + return NULL; + bio_init(bio, NULL, bio->bi_inline_vecs, nr_vecs, req_op(rq)); + } + return bio; +} + +static int bio_map_user_iov(struct request *rq, struct iov_iter *iter, + gfp_t gfp_mask) +{ + iov_iter_extraction_t extraction_flags = 0; + unsigned int max_sectors = queue_max_hw_sectors(rq->q); + unsigned int nr_vecs = iov_iter_npages(iter, BIO_MAX_VECS); struct bio *bio; int ret; + int j; if (!iov_iter_count(iter)) - return ERR_PTR(-EINVAL); + return -EINVAL; - bio = bio_kmalloc(gfp_mask, iov_iter_npages(iter, BIO_MAX_PAGES)); - if (!bio) - return ERR_PTR(-ENOMEM); + bio = blk_rq_map_bio_alloc(rq, nr_vecs, gfp_mask); + if (bio == NULL) + return -ENOMEM; + + if (blk_queue_pci_p2pdma(rq->q)) + extraction_flags |= ITER_ALLOW_P2PDMA; + if (iov_iter_extract_will_pin(iter)) + bio_set_flag(bio, BIO_PAGE_PINNED); while (iov_iter_count(iter)) { - struct page **pages; + struct page *stack_pages[UIO_FASTIOV]; + struct page **pages = stack_pages; ssize_t bytes; - size_t offs, added = 0; + size_t offs; int npages; - bytes = iov_iter_get_pages_alloc(iter, &pages, LONG_MAX, &offs); + if (nr_vecs > ARRAY_SIZE(stack_pages)) + pages = NULL; + + bytes = iov_iter_extract_pages(iter, &pages, LONG_MAX, + nr_vecs, extraction_flags, &offs); if (unlikely(bytes <= 0)) { ret = bytes ? 
bytes : -EFAULT; goto out_unmap; @@ -283,10 +310,9 @@ static struct bio *bio_map_user_iov(struct request_queue *q, npages = DIV_ROUND_UP(offs + bytes, PAGE_SIZE); - if (unlikely(offs & queue_dma_alignment(q))) { - ret = -EINVAL; + if (unlikely(offs & queue_dma_alignment(rq->q))) j = 0; - } else { + else { for (j = 0; j < npages; j++) { struct page *page = pages[j]; unsigned int n = PAGE_SIZE - offs; @@ -295,66 +321,44 @@ static struct bio *bio_map_user_iov(struct request_queue *q, if (n > bytes) n = bytes; - if (!bio_add_hw_page(q, bio, page, n, offs, - max_sectors, &same_page)) { - if (same_page) - put_page(page); + if (!bio_add_hw_page(rq->q, bio, page, n, offs, + max_sectors, &same_page)) break; - } - added += n; + if (same_page) + bio_release_page(bio, page); bytes -= n; offs = 0; } - iov_iter_advance(iter, added); } /* * release the pages we didn't map into the bio, if any */ while (j < npages) - put_page(pages[j++]); - kvfree(pages); + bio_release_page(bio, pages[j++]); + if (pages != stack_pages) + kvfree(pages); /* couldn't stuff something into bio? */ - if (bytes) + if (bytes) { + iov_iter_revert(iter, bytes); break; + } } - bio_set_flag(bio, BIO_USER_MAPPED); - - /* - * subtle -- if bio_map_user_iov() ended up bouncing a bio, - * it would normally disappear when its bi_end_io is run. - * however, we need it for the unmap, so grab an extra - * reference to it - */ - bio_get(bio); - return bio; + ret = blk_rq_append_bio(rq, bio); + if (ret) + goto out_unmap; + return 0; out_unmap: bio_release_pages(bio, false); - bio_put(bio); - return ERR_PTR(ret); -} - -/** - * bio_unmap_user - unmap a bio - * @bio: the bio being unmapped - * - * Unmap a bio previously mapped by bio_map_user_iov(). Must be called from - * process context. - * - * bio_unmap_user() may sleep. 
- */ -static void bio_unmap_user(struct bio *bio) -{ - bio_release_pages(bio, bio_data_dir(bio) == READ); - bio_put(bio); - bio_put(bio); + blk_mq_map_bio_put(bio); + return ret; } static void bio_invalidate_vmalloc_pages(struct bio *bio) { -#ifdef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +#ifdef ARCH_IMPLEMENTS_FLUSH_KERNEL_VMAP_RANGE if (bio->bi_private && !op_is_write(bio_op(bio))) { unsigned long i, len = 0; @@ -368,7 +372,8 @@ static void bio_invalidate_vmalloc_pages(struct bio *bio) static void bio_map_kern_endio(struct bio *bio) { bio_invalidate_vmalloc_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } /** @@ -393,9 +398,10 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, int offset, i; struct bio *bio; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); if (is_vmalloc) { flush_kernel_vmap_range(data, len); @@ -419,7 +425,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, if (bio_add_pc_page(q, bio, page, bytes, offset) < bytes) { /* we don't support partial mappings */ - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-EINVAL); } @@ -435,7 +442,8 @@ static struct bio *bio_map_kern(struct request_queue *q, void *data, static void bio_copy_kern_endio(struct bio *bio) { bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); } static void bio_copy_kern_endio_read(struct bio *bio) @@ -445,7 +453,7 @@ static void bio_copy_kern_endio_read(struct bio *bio) struct bvec_iter_all iter_all; bio_for_each_segment_all(bvec, bio, iter_all) { - memcpy(p, page_address(bvec->bv_page), bvec->bv_len); + memcpy_from_bvec(p, bvec); p += bvec->bv_len; } @@ -480,9 +488,10 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, return ERR_PTR(-EINVAL); nr_pages = end - start; - bio = bio_kmalloc(gfp_mask, nr_pages); + bio = bio_kmalloc(nr_pages, gfp_mask); if (!bio) 
return ERR_PTR(-ENOMEM); + bio_init(bio, NULL, bio->bi_inline_vecs, nr_pages, 0); while (len) { struct page *page; @@ -491,7 +500,7 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, if (bytes > len) bytes = len; - page = alloc_page(q->bounce_gfp | gfp_mask); + page = alloc_page(GFP_NOIO | __GFP_ZERO | gfp_mask); if (!page) goto cleanup; @@ -516,7 +525,8 @@ static struct bio *bio_copy_kern(struct request_queue *q, void *data, cleanup: bio_free_pages(bio); - bio_put(bio); + bio_uninit(bio); + kfree(bio); return ERR_PTR(-ENOMEM); } @@ -524,86 +534,84 @@ cleanup: * Append a bio to a passthrough request. Only works if the bio can be merged * into the request based on the driver constraints. */ -int blk_rq_append_bio(struct request *rq, struct bio **bio) +int blk_rq_append_bio(struct request *rq, struct bio *bio) { - struct bio *orig_bio = *bio; struct bvec_iter iter; struct bio_vec bv; unsigned int nr_segs = 0; - blk_queue_bounce(rq->q, bio); - - bio_for_each_bvec(bv, *bio, iter) + bio_for_each_bvec(bv, bio, iter) nr_segs++; if (!rq->bio) { - blk_rq_bio_prep(rq, *bio, nr_segs); + blk_rq_bio_prep(rq, bio, nr_segs); } else { - if (!ll_back_merge_fn(rq, *bio, nr_segs)) { - if (orig_bio != *bio) { - bio_put(*bio); - *bio = orig_bio; - } + if (!ll_back_merge_fn(rq, bio, nr_segs)) return -EINVAL; - } - - rq->biotail->bi_next = *bio; - rq->biotail = *bio; - rq->__data_len += (*bio)->bi_iter.bi_size; - bio_crypt_free_ctx(*bio); + rq->biotail->bi_next = bio; + rq->biotail = bio; + rq->__data_len += (bio)->bi_iter.bi_size; + bio_crypt_free_ctx(bio); } return 0; } EXPORT_SYMBOL(blk_rq_append_bio); -static int __blk_rq_unmap_user(struct bio *bio) -{ - int ret = 0; - - if (bio) { - if (bio_flagged(bio, BIO_USER_MAPPED)) - bio_unmap_user(bio); - else - ret = bio_uncopy_user(bio); - } - - return ret; -} - -static int __blk_rq_map_user_iov(struct request *rq, - struct rq_map_data *map_data, struct iov_iter *iter, - gfp_t gfp_mask, bool copy) +/* Prepare bio for 
passthrough IO given ITER_BVEC iter */ +static int blk_rq_map_user_bvec(struct request *rq, const struct iov_iter *iter) { struct request_queue *q = rq->q; - struct bio *bio, *orig_bio; - int ret; + size_t nr_iter = iov_iter_count(iter); + size_t nr_segs = iter->nr_segs; + struct bio_vec *bvecs, *bvprvp = NULL; + const struct queue_limits *lim = &q->limits; + unsigned int nsegs = 0, bytes = 0; + struct bio *bio; + size_t i; - if (copy) - bio = bio_copy_user_iov(q, map_data, iter, gfp_mask); - else - bio = bio_map_user_iov(q, iter, gfp_mask); + if (!nr_iter || (nr_iter >> SECTOR_SHIFT) > queue_max_hw_sectors(q)) + return -EINVAL; + if (nr_segs > queue_max_segments(q)) + return -EINVAL; - if (IS_ERR(bio)) - return PTR_ERR(bio); + /* no iovecs to alloc, as we already have a BVEC iterator */ + bio = blk_rq_map_bio_alloc(rq, 0, GFP_KERNEL); + if (bio == NULL) + return -ENOMEM; - bio->bi_opf &= ~REQ_OP_MASK; - bio->bi_opf |= req_op(rq); + bio_iov_bvec_set(bio, (struct iov_iter *)iter); + blk_rq_bio_prep(rq, bio, nr_segs); - orig_bio = bio; + /* loop to perform a bunch of sanity checks */ + bvecs = (struct bio_vec *)iter->bvec; + for (i = 0; i < nr_segs; i++) { + struct bio_vec *bv = &bvecs[i]; - /* - * We link the bounce buffer in and could have to traverse it - * later so we have to get a ref to prevent it from being freed - */ - ret = blk_rq_append_bio(rq, &bio); - if (ret) { - __blk_rq_unmap_user(orig_bio); - return ret; + /* + * If the queue doesn't support SG gaps and adding this + * offset would create a gap, fallback to copy. 
+ */ + if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv->bv_offset)) { + blk_mq_map_bio_put(bio); + return -EREMOTEIO; + } + /* check full condition */ + if (nsegs >= nr_segs || bytes > UINT_MAX - bv->bv_len) + goto put_bio; + if (bytes + bv->bv_len > nr_iter) + goto put_bio; + if (bv->bv_offset + bv->bv_len > PAGE_SIZE) + goto put_bio; + + nsegs++; + bytes += bv->bv_len; + bvprvp = bv; } - bio_get(bio); - return 0; +put_bio: + blk_mq_map_bio_put(bio); + return -EINVAL; } /** @@ -620,36 +628,46 @@ static int __blk_rq_map_user_iov(struct request *rq, * * A matching blk_rq_unmap_user() must be issued at the end of I/O, while * still in process context. - * - * Note: The mapped bio may need to be bounced through blk_queue_bounce() - * before being submitted to the device, as pages mapped may be out of - * reach. It's the callers responsibility to make sure this happens. The - * original bio must be passed back in to blk_rq_unmap_user() for proper - * unmapping. */ int blk_rq_map_user_iov(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, const struct iov_iter *iter, gfp_t gfp_mask) { - bool copy = false; + bool copy = false, map_bvec = false; unsigned long align = q->dma_pad_mask | queue_dma_alignment(q); struct bio *bio = NULL; struct iov_iter i; int ret = -EINVAL; - if (!iter_is_iovec(iter)) - goto fail; - if (map_data) copy = true; + else if (blk_queue_may_bounce(q)) + copy = true; else if (iov_iter_alignment(iter) & align) copy = true; + else if (iov_iter_is_bvec(iter)) + map_bvec = true; + else if (!user_backed_iter(iter)) + copy = true; else if (queue_virt_boundary(q)) copy = queue_virt_boundary(q) & iov_iter_gap_alignment(iter); + if (map_bvec) { + ret = blk_rq_map_user_bvec(rq, iter); + if (!ret) + return 0; + if (ret != -EREMOTEIO) + goto fail; + /* fall back to copying the data on limits mismatches */ + copy = true; + } + i = *iter; do { - ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy); + if (copy) + ret = 
bio_copy_user_iov(rq, map_data, &i, gfp_mask); + else + ret = bio_map_user_iov(rq, &i, gfp_mask); if (ret) goto unmap_rq; if (!bio) @@ -670,9 +688,8 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, struct rq_map_data *map_data, void __user *ubuf, unsigned long len, gfp_t gfp_mask) { - struct iovec iov; struct iov_iter i; - int ret = import_single_range(rq_data_dir(rq), ubuf, len, &iov, &i); + int ret = import_ubuf(rq_data_dir(rq), ubuf, len, &i); if (unlikely(ret < 0)) return ret; @@ -681,6 +698,42 @@ int blk_rq_map_user(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(blk_rq_map_user); +int blk_rq_map_user_io(struct request *req, struct rq_map_data *map_data, + void __user *ubuf, unsigned long buf_len, gfp_t gfp_mask, + bool vec, int iov_count, bool check_iter_count, int rw) +{ + int ret = 0; + + if (vec) { + struct iovec fast_iov[UIO_FASTIOV]; + struct iovec *iov = fast_iov; + struct iov_iter iter; + + ret = import_iovec(rw, ubuf, iov_count ? iov_count : buf_len, + UIO_FASTIOV, &iov, &iter); + if (ret < 0) + return ret; + + if (iov_count) { + /* SG_IO howto says that the shorter of the two wins */ + iov_iter_truncate(&iter, buf_len); + if (check_iter_count && !iov_iter_count(&iter)) { + kfree(iov); + return -EINVAL; + } + } + + ret = blk_rq_map_user_iov(req->q, req, map_data, &iter, + gfp_mask); + kfree(iov); + } else if (buf_len) { + ret = blk_rq_map_user(req->q, req, map_data, ubuf, buf_len, + gfp_mask); + } + return ret; +} +EXPORT_SYMBOL(blk_rq_map_user_io); + /** * blk_rq_unmap_user - unmap a request with user data * @bio: start of bio list @@ -692,21 +745,21 @@ EXPORT_SYMBOL(blk_rq_map_user); */ int blk_rq_unmap_user(struct bio *bio) { - struct bio *mapped_bio; + struct bio *next_bio; int ret = 0, ret2; while (bio) { - mapped_bio = bio; - if (unlikely(bio_flagged(bio, BIO_BOUNCED))) - mapped_bio = bio->bi_private; - - ret2 = __blk_rq_unmap_user(mapped_bio); - if (ret2 && !ret) - ret = ret2; + if (bio->bi_private) { + ret2 = 
bio_uncopy_user(bio); + if (ret2 && !ret) + ret = ret2; + } else { + bio_release_pages(bio, bio_data_dir(bio) == READ); + } - mapped_bio = bio; + next_bio = bio; bio = bio->bi_next; - bio_put(mapped_bio); + blk_mq_map_bio_put(next_bio); } return ret; @@ -731,7 +784,7 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, { int reading = rq_data_dir(rq) == READ; unsigned long addr = (unsigned long) kbuf; - struct bio *bio, *orig_bio; + struct bio *bio; int ret; if (len > (queue_max_hw_sectors(q) << 9)) @@ -739,7 +792,8 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, if (!len || !kbuf) return -EINVAL; - if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf)) + if (!blk_rq_aligned(q, addr, len) || object_is_on_stack(kbuf) || + blk_queue_may_bounce(q)) bio = bio_copy_kern(q, kbuf, len, gfp_mask, reading); else bio = bio_map_kern(q, kbuf, len, gfp_mask); @@ -750,14 +804,11 @@ int blk_rq_map_kern(struct request_queue *q, struct request *rq, void *kbuf, bio->bi_opf &= ~REQ_OP_MASK; bio->bi_opf |= req_op(rq); - orig_bio = bio; - ret = blk_rq_append_bio(rq, &bio); + ret = blk_rq_append_bio(rq, bio); if (unlikely(ret)) { - /* request is too big */ - bio_put(orig_bio); - return ret; + bio_uninit(bio); + kfree(bio); } - - return 0; + return ret; } EXPORT_SYMBOL(blk_rq_map_kern); diff --git a/block/blk-merge.c b/block/blk-merge.c index f0b0bae075a0..2d470cf2173e 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -6,11 +6,48 @@ #include <linux/module.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/scatterlist.h> +#include <linux/part_stat.h> +#include <linux/blk-cgroup.h> #include <trace/events/block.h> #include "blk.h" +#include "blk-mq-sched.h" +#include "blk-rq-qos.h" +#include "blk-throttle.h" + +static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv) +{ + *bv = mp_bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter); +} + +static inline 
void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv) +{ + struct bvec_iter iter = bio->bi_iter; + int idx; + + bio_get_first_bvec(bio, bv); + if (bv->bv_len == bio->bi_iter.bi_size) + return; /* this bio only has a single bvec */ + + bio_advance_iter(bio, &iter, iter.bi_size); + + if (!iter.bi_bvec_done) + idx = iter.bi_idx - 1; + else /* in the middle of bvec */ + idx = iter.bi_idx; + + *bv = bio->bi_io_vec[idx]; + + /* + * iter.bi_bvec_done records actual length of the last bvec + * if this bio ends in the middle of one io vector + */ + if (iter.bi_bvec_done) + bv->bv_len = iter.bi_bvec_done; +} static inline bool bio_will_gap(struct request_queue *q, struct request *prev_rq, struct bio *prev, struct bio *next) @@ -45,7 +82,7 @@ static inline bool bio_will_gap(struct request_queue *q, bio_get_first_bvec(next, &nb); if (biovec_phys_mergeable(q, &pb, &nb)) return false; - return __bvec_gap_to_prev(q, &pb, nb.bv_offset); + return __bvec_gap_to_prev(&q->limits, &pb, nb.bv_offset); } static inline bool req_gap_back_merge(struct request *req, struct bio *bio) @@ -58,29 +95,33 @@ static inline bool req_gap_front_merge(struct request *req, struct bio *bio) return bio_will_gap(req->q, NULL, bio, req->bio); } -static struct bio *blk_bio_discard_split(struct request_queue *q, - struct bio *bio, - struct bio_set *bs, - unsigned *nsegs) +/* + * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size + * is defined as 'unsigned int', meantime it has to be aligned to with the + * logical block size, which is the minimum accepted unit by hardware. 
+ */ +static unsigned int bio_allowed_max_sectors(const struct queue_limits *lim) +{ + return round_down(UINT_MAX, lim->logical_block_size) >> SECTOR_SHIFT; +} + +static struct bio *bio_split_discard(struct bio *bio, + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { unsigned int max_discard_sectors, granularity; - int alignment; sector_t tmp; unsigned split_sectors; *nsegs = 1; - /* Zero-sector (unknown) and one-sector granularities are the same. */ - granularity = max(q->limits.discard_granularity >> 9, 1U); + granularity = max(lim->discard_granularity >> 9, 1U); - max_discard_sectors = min(q->limits.max_discard_sectors, - bio_allowed_max_sectors(q)); + max_discard_sectors = + min(lim->max_discard_sectors, bio_allowed_max_sectors(lim)); max_discard_sectors -= max_discard_sectors % granularity; - - if (unlikely(!max_discard_sectors)) { - /* XXX: warn */ + if (unlikely(!max_discard_sectors)) return NULL; - } if (bio_sectors(bio) <= max_discard_sectors) return NULL; @@ -91,9 +132,8 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, * If the next starting sector would be misaligned, stop the discard at * the previous aligned sector. 
*/ - alignment = (q->limits.discard_alignment >> 9) % granularity; - - tmp = bio->bi_iter.bi_sector + split_sectors - alignment; + tmp = bio->bi_iter.bi_sector + split_sectors - + ((lim->discard_alignment >> 9) % granularity); tmp = sector_div(tmp, granularity); if (split_sectors > tmp) @@ -102,34 +142,16 @@ static struct bio *blk_bio_discard_split(struct request_queue *q, return bio_split(bio, split_sectors, GFP_NOIO, bs); } -static struct bio *blk_bio_write_zeroes_split(struct request_queue *q, - struct bio *bio, struct bio_set *bs, unsigned *nsegs) +static struct bio *bio_split_write_zeroes(struct bio *bio, + const struct queue_limits *lim, + unsigned *nsegs, struct bio_set *bs) { *nsegs = 0; - - if (!q->limits.max_write_zeroes_sectors) - return NULL; - - if (bio_sectors(bio) <= q->limits.max_write_zeroes_sectors) - return NULL; - - return bio_split(bio, q->limits.max_write_zeroes_sectors, GFP_NOIO, bs); -} - -static struct bio *blk_bio_write_same_split(struct request_queue *q, - struct bio *bio, - struct bio_set *bs, - unsigned *nsegs) -{ - *nsegs = 1; - - if (!q->limits.max_write_same_sectors) + if (!lim->max_write_zeroes_sectors) return NULL; - - if (bio_sectors(bio) <= q->limits.max_write_same_sectors) + if (bio_sectors(bio) <= lim->max_write_zeroes_sectors) return NULL; - - return bio_split(bio, q->limits.max_write_same_sectors, GFP_NOIO, bs); + return bio_split(bio, lim->max_write_zeroes_sectors, GFP_NOIO, bs); } /* @@ -140,51 +162,60 @@ static struct bio *blk_bio_write_same_split(struct request_queue *q, * requests that are submitted to a block device if the start of a bio is not * aligned to a physical block boundary. 
*/ -static inline unsigned get_max_io_size(struct request_queue *q, - struct bio *bio) +static inline unsigned get_max_io_size(struct bio *bio, + const struct queue_limits *lim) { - unsigned sectors = blk_max_size_offset(q, bio->bi_iter.bi_sector); - unsigned max_sectors = sectors; - unsigned pbs = queue_physical_block_size(q) >> SECTOR_SHIFT; - unsigned lbs = queue_logical_block_size(q) >> SECTOR_SHIFT; - unsigned start_offset = bio->bi_iter.bi_sector & (pbs - 1); - - max_sectors += start_offset; - max_sectors &= ~(pbs - 1); - if (max_sectors > start_offset) - return max_sectors - start_offset; + unsigned pbs = lim->physical_block_size >> SECTOR_SHIFT; + unsigned lbs = lim->logical_block_size >> SECTOR_SHIFT; + unsigned max_sectors = lim->max_sectors, start, end; + + if (lim->chunk_sectors) { + max_sectors = min(max_sectors, + blk_chunk_sectors_left(bio->bi_iter.bi_sector, + lim->chunk_sectors)); + } - return sectors & (lbs - 1); + start = bio->bi_iter.bi_sector & (pbs - 1); + end = (start + max_sectors) & ~(pbs - 1); + if (end > start) + return end - start; + return max_sectors & ~(lbs - 1); } -static inline unsigned get_max_segment_size(const struct request_queue *q, - struct page *start_page, - unsigned long offset) +/** + * get_max_segment_size() - maximum number of bytes to add as a single segment + * @lim: Request queue limits. + * @start_page: See below. + * @offset: Offset from @start_page where to add a segment. + * + * Returns the maximum number of bytes that can be added as a single segment. + */ +static inline unsigned get_max_segment_size(const struct queue_limits *lim, + struct page *start_page, unsigned long offset) { - unsigned long mask = queue_segment_boundary(q); + unsigned long mask = lim->seg_boundary_mask; offset = mask & (page_to_phys(start_page) + offset); /* - * overflow may be triggered in case of zero page physical address - * on 32bit arch, use queue's max segment size when that happens. 
+ * Prevent an overflow if mask = ULONG_MAX and offset = 0 by adding 1 + * after having calculated the minimum. */ - return min_not_zero(mask - offset + 1, - (unsigned long)queue_max_segment_size(q)); + return min(mask - offset, (unsigned long)lim->max_segment_size - 1) + 1; } /** * bvec_split_segs - verify whether or not a bvec should be split in the middle - * @q: [in] request queue associated with the bio associated with @bv + * @lim: [in] queue limits to split based on * @bv: [in] bvec to examine * @nsegs: [in,out] Number of segments in the bio being built. Incremented * by the number of segments from @bv that may be appended to that * bio without exceeding @max_segs - * @sectors: [in,out] Number of sectors in the bio being built. Incremented - * by the number of sectors from @bv that may be appended to that - * bio without exceeding @max_sectors + * @bytes: [in,out] Number of bytes in the bio being built. Incremented + * by the number of bytes from @bv that may be appended to that + * bio without exceeding @max_bytes * @max_segs: [in] upper bound for *@nsegs - * @max_sectors: [in] upper bound for *@sectors + * @max_bytes: [in] upper bound for *@bytes * * When splitting a bio, it can happen that a bvec is encountered that is too * big to fit in a single segment and hence that it has to be split in the @@ -193,18 +224,17 @@ static inline unsigned get_max_segment_size(const struct request_queue *q, * *@nsegs segments and *@sectors sectors would make that bio unacceptable for * the block driver. 
*/ -static bool bvec_split_segs(const struct request_queue *q, - const struct bio_vec *bv, unsigned *nsegs, - unsigned *sectors, unsigned max_segs, - unsigned max_sectors) +static bool bvec_split_segs(const struct queue_limits *lim, + const struct bio_vec *bv, unsigned *nsegs, unsigned *bytes, + unsigned max_segs, unsigned max_bytes) { - unsigned max_len = (min(max_sectors, UINT_MAX >> 9) - *sectors) << 9; + unsigned max_len = min(max_bytes, UINT_MAX) - *bytes; unsigned len = min(bv->bv_len, max_len); unsigned total_len = 0; unsigned seg_size = 0; while (len && *nsegs < max_segs) { - seg_size = get_max_segment_size(q, bv->bv_page, + seg_size = get_max_segment_size(lim, bv->bv_page, bv->bv_offset + total_len); seg_size = min(seg_size, len); @@ -212,27 +242,28 @@ static bool bvec_split_segs(const struct request_queue *q, total_len += seg_size; len -= seg_size; - if ((bv->bv_offset + total_len) & queue_virt_boundary(q)) + if ((bv->bv_offset + total_len) & lim->virt_boundary_mask) break; } - *sectors += total_len >> 9; + *bytes += total_len; /* tell the caller to split the bvec if it is too big to fit */ return len > 0 || bv->bv_len > max_len; } /** - * blk_bio_segment_split - split a bio in two bios - * @q: [in] request queue pointer + * bio_split_rw - split a bio in two bios * @bio: [in] bio to be split - * @bs: [in] bio set to allocate the clone from + * @lim: [in] queue limits to split based on * @segs: [out] number of segments in the bio with the first half of the sectors + * @bs: [in] bio set to allocate the clone from + * @max_bytes: [in] maximum number of bytes per bio * * Clone @bio, update the bi_iter of the clone to represent the first sectors * of @bio and update @bio->bi_iter to represent the remaining sectors. The * following is guaranteed for the cloned bio: - * - That it has at most get_max_io_size(@q, @bio) sectors. + * - That it has at most @max_bytes worth of data * - That it has at most queue_max_segments(@q) segments. 
* * Except for discard requests the cloned bio will point at the bi_io_vec of @@ -241,33 +272,30 @@ static bool bvec_split_segs(const struct request_queue *q, * responsible for ensuring that @bs is only destroyed after processing of the * split bio has finished. */ -static struct bio *blk_bio_segment_split(struct request_queue *q, - struct bio *bio, - struct bio_set *bs, - unsigned *segs) +struct bio *bio_split_rw(struct bio *bio, const struct queue_limits *lim, + unsigned *segs, struct bio_set *bs, unsigned max_bytes) { struct bio_vec bv, bvprv, *bvprvp = NULL; struct bvec_iter iter; - unsigned nsegs = 0, sectors = 0; - const unsigned max_sectors = get_max_io_size(q, bio); - const unsigned max_segs = queue_max_segments(q); + unsigned nsegs = 0, bytes = 0; bio_for_each_bvec(bv, bio, iter) { /* * If the queue doesn't support SG gaps and adding this * offset would create a gap, disallow it. */ - if (bvprvp && bvec_gap_to_prev(q, bvprvp, bv.bv_offset)) + if (bvprvp && bvec_gap_to_prev(lim, bvprvp, bv.bv_offset)) goto split; - if (nsegs < max_segs && - sectors + (bv.bv_len >> 9) <= max_sectors && + if (nsegs < lim->max_segments && + bytes + bv.bv_len <= max_bytes && bv.bv_offset + bv.bv_len <= PAGE_SIZE) { nsegs++; - sectors += bv.bv_len >> 9; - } else if (bvec_split_segs(q, &bv, &nsegs, §ors, max_segs, - max_sectors)) { - goto split; + bytes += bv.bv_len; + } else { + if (bvec_split_segs(lim, &bv, &nsegs, &bytes, + lim->max_segments, max_bytes)) + goto split; } bvprv = bv; @@ -277,95 +305,110 @@ static struct bio *blk_bio_segment_split(struct request_queue *q, *segs = nsegs; return NULL; split: + /* + * We can't sanely support splitting for a REQ_NOWAIT bio. End it + * with EAGAIN if splitting is required and return an error pointer. 
+ */ + if (bio->bi_opf & REQ_NOWAIT) { + bio->bi_status = BLK_STS_AGAIN; + bio_endio(bio); + return ERR_PTR(-EAGAIN); + } + *segs = nsegs; - return bio_split(bio, sectors, GFP_NOIO, bs); + + /* + * Individual bvecs might not be logical block aligned. Round down the + * split size so that each bio is properly block size aligned, even if + * we do not use the full hardware limits. + */ + bytes = ALIGN_DOWN(bytes, lim->logical_block_size); + + /* + * Bio splitting may cause subtle trouble such as hang when doing sync + * iopoll in direct IO routine. Given performance gain of iopoll for + * big IO can be trival, disable iopoll when split needed. + */ + bio_clear_polled(bio); + return bio_split(bio, bytes >> SECTOR_SHIFT, GFP_NOIO, bs); } +EXPORT_SYMBOL_GPL(bio_split_rw); /** - * __blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer - * @bio: [in, out] bio to be split - * @nr_segs: [out] number of segments in the first bio + * __bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * @lim: queue limits to split based on + * @nr_segs: returns the number of segments in the returned bio * - * Split a bio into two bios, chain the two bios, submit the second half and - * store a pointer to the first half in *@bio. If the second bio is still too - * big it will be split by a recursive call to this function. Since this - * function may allocate a new bio from @q->bio_split, it is the responsibility - * of the caller to ensure that @q is only released after processing of the - * split bio has finished. + * Check if @bio needs splitting based on the queue limits, and if so split off + * a bio fitting the limits from the beginning of @bio and return it. @bio is + * shortened to the remainder and re-submitted. + * + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. 
*/ -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs) +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs) { - struct bio *split = NULL; + struct bio_set *bs = &bio->bi_bdev->bd_disk->bio_split; + struct bio *split; - switch (bio_op(*bio)) { + switch (bio_op(bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: - split = blk_bio_discard_split(q, *bio, &q->bio_split, nr_segs); + split = bio_split_discard(bio, lim, nr_segs, bs); break; case REQ_OP_WRITE_ZEROES: - split = blk_bio_write_zeroes_split(q, *bio, &q->bio_split, - nr_segs); - break; - case REQ_OP_WRITE_SAME: - split = blk_bio_write_same_split(q, *bio, &q->bio_split, - nr_segs); + split = bio_split_write_zeroes(bio, lim, nr_segs, bs); break; default: - /* - * All drivers must accept single-segments bios that are <= - * PAGE_SIZE. This is a quick and dirty check that relies on - * the fact that bi_io_vec[0] is always valid if a bio has data. - * The check might lead to occasional false negatives when bios - * are cloned, but compared to the performance impact of cloned - * bios themselves the loop below doesn't matter anyway. 
- */ - if (!q->limits.chunk_sectors && - (*bio)->bi_vcnt == 1 && - ((*bio)->bi_io_vec[0].bv_len + - (*bio)->bi_io_vec[0].bv_offset) <= PAGE_SIZE) { - *nr_segs = 1; - break; - } - split = blk_bio_segment_split(q, *bio, &q->bio_split, nr_segs); + split = bio_split_rw(bio, lim, nr_segs, bs, + get_max_io_size(bio, lim) << SECTOR_SHIFT); + if (IS_ERR(split)) + return NULL; break; } if (split) { - /* there isn't chance to merge the splitted bio */ + /* there isn't chance to merge the split bio */ split->bi_opf |= REQ_NOMERGE; - bio_chain(split, *bio); - trace_block_split(q, split, (*bio)->bi_iter.bi_sector); - generic_make_request(*bio); - *bio = split; + blkcg_bio_issue_init(split); + bio_chain(split, bio); + trace_block_split(split, bio->bi_iter.bi_sector); + submit_bio_noacct(bio); + return split; } + return bio; } /** - * blk_queue_split - split a bio and submit the second half - * @q: [in] request queue pointer - * @bio: [in, out] bio to be split + * bio_split_to_limits - split a bio to fit the queue limits + * @bio: bio to be split + * + * Check if @bio needs splitting based on the queue limits of @bio->bi_bdev, and + * if so split off a bio fitting the limits from the beginning of @bio and + * return it. @bio is shortened to the remainder and re-submitted. * - * Split a bio into two bios, chains the two bios, submit the second half and - * store a pointer to the first half in *@bio. Since this function may allocate - * a new bio from @q->bio_split, it is the responsibility of the caller to - * ensure that @q is only released after processing of the split bio has - * finished. + * The split bio is allocated from @q->bio_split, which is provided by the + * block layer. 
*/ -void blk_queue_split(struct request_queue *q, struct bio **bio) +struct bio *bio_split_to_limits(struct bio *bio) { + const struct queue_limits *lim = &bdev_get_queue(bio->bi_bdev)->limits; unsigned int nr_segs; - __blk_queue_split(q, bio, &nr_segs); + if (bio_may_exceed_limits(bio, lim)) + return __bio_split_to_limits(bio, lim, &nr_segs); + return bio; } -EXPORT_SYMBOL(blk_queue_split); +EXPORT_SYMBOL(bio_split_to_limits); unsigned int blk_recalc_rq_segments(struct request *rq) { unsigned int nr_phys_segs = 0; - unsigned int nr_sectors = 0; + unsigned int bytes = 0; struct req_iterator iter; struct bio_vec bv; @@ -375,14 +418,22 @@ unsigned int blk_recalc_rq_segments(struct request *rq) switch (bio_op(rq->bio)) { case REQ_OP_DISCARD: case REQ_OP_SECURE_ERASE: + if (queue_max_discard_segments(rq->q) > 1) { + struct bio *bio = rq->bio; + + for_each_bio(bio) + nr_phys_segs++; + return nr_phys_segs; + } + return 1; case REQ_OP_WRITE_ZEROES: return 0; - case REQ_OP_WRITE_SAME: - return 1; + default: + break; } rq_for_each_bvec(bv, rq, iter) - bvec_split_segs(rq->q, &bv, &nr_phys_segs, &nr_sectors, + bvec_split_segs(&rq->q->limits, &bv, &nr_phys_segs, &bytes, UINT_MAX, UINT_MAX); return nr_phys_segs; } @@ -413,8 +464,8 @@ static unsigned blk_bvec_map_sg(struct request_queue *q, while (nbytes > 0) { unsigned offset = bvec->bv_offset + total; - unsigned len = min(get_max_segment_size(q, bvec->bv_page, - offset), nbytes); + unsigned len = min(get_max_segment_size(&q->limits, + bvec->bv_page, offset), nbytes); struct page *page = bvec->bv_page; /* @@ -473,7 +524,7 @@ static int __blk_bios_map_sg(struct request_queue *q, struct bio *bio, struct scatterlist *sglist, struct scatterlist **sg) { - struct bio_vec uninitialized_var(bvec), bvprv = { NULL }; + struct bio_vec bvec, bvprv = { NULL }; struct bvec_iter iter; int nsegs = 0; bool new_bio = false; @@ -516,8 +567,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, if (rq->rq_flags & 
RQF_SPECIAL_PAYLOAD) nsegs = __blk_bvec_map_sg(rq->special_vec, sglist, last_sg); - else if (rq->bio && bio_op(rq->bio) == REQ_OP_WRITE_SAME) - nsegs = __blk_bvec_map_sg(bio_iovec(rq->bio), sglist, last_sg); else if (rq->bio) nsegs = __blk_bios_map_sg(q, rq->bio, sglist, last_sg); @@ -534,15 +583,40 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); +static inline unsigned int blk_rq_get_max_sectors(struct request *rq, + sector_t offset) +{ + struct request_queue *q = rq->q; + unsigned int max_sectors; + + if (blk_rq_is_passthrough(rq)) + return q->limits.max_hw_sectors; + + max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + if (!q->limits.chunk_sectors || + req_op(rq) == REQ_OP_DISCARD || + req_op(rq) == REQ_OP_SECURE_ERASE) + return max_sectors; + return min(max_sectors, + blk_chunk_sectors_left(offset, q->limits.chunk_sectors)); +} + static inline int ll_new_hw_segment(struct request *req, struct bio *bio, unsigned int nr_phys_segs) { - if (req->nr_phys_segments + nr_phys_segs > queue_max_segments(req->q)) + if (!blk_cgroup_mergeable(req, bio)) goto no_merge; if (blk_integrity_merge_bio(req->q, req, bio) == false) goto no_merge; + /* discard request merge won't add new segment */ + if (req_op(req) == REQ_OP_DISCARD) + return 1; + + if (req->nr_phys_segments + nr_phys_segs > blk_rq_get_max_segments(req)) + goto no_merge; + /* * This will form the start of a new hw segment. Bump both * counters. 
@@ -573,7 +647,8 @@ int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) return ll_new_hw_segment(req, bio, nr_segs); } -int ll_front_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs) +static int ll_front_merge_fn(struct request *req, struct bio *bio, + unsigned int nr_segs) { if (req_gap_front_merge(req, bio)) return 0; @@ -625,7 +700,10 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, return 0; total_phys_segments = req->nr_phys_segments + next->nr_phys_segments; - if (total_phys_segments > queue_max_segments(q)) + if (total_phys_segments > blk_rq_get_max_segments(req)) + return 0; + + if (!blk_cgroup_mergeable(req, next->bio)) return 0; if (blk_integrity_merge_rq(q, req, next) == false) @@ -650,7 +728,7 @@ static int ll_merge_requests_fn(struct request_queue *q, struct request *req, */ void blk_rq_set_mixed_merge(struct request *rq) { - unsigned int ff = rq->cmd_flags & REQ_FAILFAST_MASK; + blk_opf_t ff = rq->cmd_flags & REQ_FAILFAST_MASK; struct bio *bio; if (rq->rq_flags & RQF_MIXED_MERGE) @@ -669,31 +747,40 @@ void blk_rq_set_mixed_merge(struct request *rq) rq->rq_flags |= RQF_MIXED_MERGE; } -static void blk_account_io_merge_request(struct request *req) +static inline blk_opf_t bio_failfast(const struct bio *bio) { - if (blk_do_io_stat(req)) { - part_stat_lock(); - part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); - part_stat_unlock(); + if (bio->bi_opf & REQ_RAHEAD) + return REQ_FAILFAST_MASK; - hd_struct_put(req->part); - } + return bio->bi_opf & REQ_FAILFAST_MASK; } /* - * Two cases of handling DISCARD merge: - * If max_discard_segments > 1, the driver takes every bio - * as a range and send them to controller together. The ranges - * needn't to be contiguous. - * Otherwise, the bios/requests will be handled as same as - * others which should be contiguous. 
+ * After we are marked as MIXED_MERGE, any new RA bio has to be updated + * as failfast, and request's failfast has to be updated in case of + * front merge. */ -static inline bool blk_discard_mergable(struct request *req) +static inline void blk_update_mixed_merge(struct request *req, + struct bio *bio, bool front_merge) { - if (req_op(req) == REQ_OP_DISCARD && - queue_max_discard_segments(req->q) > 1) - return true; - return false; + if (req->rq_flags & RQF_MIXED_MERGE) { + if (bio->bi_opf & REQ_RAHEAD) + bio->bi_opf |= REQ_FAILFAST_MASK; + + if (front_merge) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= bio->bi_opf & REQ_FAILFAST_MASK; + } + } +} + +static void blk_account_io_merge_request(struct request *req) +{ + if (blk_do_io_stat(req)) { + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); + } } static enum elv_merge blk_try_req_merge(struct request *req, @@ -720,19 +807,7 @@ static struct request *attempt_merge(struct request_queue *q, if (req_op(req) != req_op(next)) return NULL; - if (rq_data_dir(req) != rq_data_dir(next) - || req->rq_disk != next->rq_disk) - return NULL; - - if (req_op(req) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(req->bio, next->bio)) - return NULL; - - /* - * Don't allow merge of different write hints, or for a hint with - * non-hint IO. 
- */ - if (req->write_hint != next->write_hint) + if (rq_data_dir(req) != rq_data_dir(next)) return NULL; if (req->ioprio != next->ioprio) @@ -788,11 +863,15 @@ static struct request *attempt_merge(struct request_queue *q, if (!blk_discard_mergable(req)) elv_merge_requests(q, req, next); + blk_crypto_rq_put_keyslot(next); + /* * 'next' is going away, so update stats accordingly */ blk_account_io_merge_request(next); + trace_block_rq_merge(next); + /* * ownership of bio passed from next to req, return 'next' for * the caller to free @@ -801,7 +880,8 @@ static struct request *attempt_merge(struct request_queue *q, return next; } -struct request *attempt_back_merge(struct request_queue *q, struct request *rq) +static struct request *attempt_back_merge(struct request_queue *q, + struct request *rq) { struct request *next = elv_latter_request(q, rq); @@ -811,7 +891,8 @@ struct request *attempt_back_merge(struct request_queue *q, struct request *rq) return NULL; } -struct request *attempt_front_merge(struct request_queue *q, struct request *rq) +static struct request *attempt_front_merge(struct request_queue *q, + struct request *rq) { struct request *prev = elv_former_request(q, rq); @@ -821,18 +902,15 @@ struct request *attempt_front_merge(struct request_queue *q, struct request *rq) return NULL; } -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, - struct request *next) +/* + * Try to merge 'next' into 'rq'. Return true if the merge happened, false + * otherwise. The caller is responsible for freeing 'next' if the merge + * happened. 
+ */ +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, + struct request *next) { - struct request *free; - - free = attempt_merge(q, rq, next); - if (free) { - blk_put_request(free); - return 1; - } - - return 0; + return attempt_merge(q, rq, next); } bool blk_rq_merge_ok(struct request *rq, struct bio *bio) @@ -847,8 +925,8 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (bio_data_dir(bio) != rq_data_dir(rq)) return false; - /* must be same device */ - if (rq->rq_disk != bio->bi_disk) + /* don't merge across cgroup boundaries */ + if (!blk_cgroup_mergeable(rq, bio)) return false; /* only merge integrity protected bio into ditto rq */ @@ -859,18 +937,6 @@ bool blk_rq_merge_ok(struct request *rq, struct bio *bio) if (!bio_crypt_rq_ctx_compatible(rq, bio)) return false; - /* must be using the same buffer */ - if (req_op(rq) == REQ_OP_WRITE_SAME && - !blk_write_same_mergeable(rq->bio, bio)) - return false; - - /* - * Don't allow merge of different write hints, or for a hint with - * non-hint IO. 
- */ - if (rq->write_hint != bio->bi_write_hint) - return false; - if (rq->ioprio != bio_prio(bio)) return false; @@ -887,3 +953,234 @@ enum elv_merge blk_try_merge(struct request *rq, struct bio *bio) return ELEVATOR_FRONT_MERGE; return ELEVATOR_NO_MERGE; } + +static void blk_account_io_merge_bio(struct request *req) +{ + if (!blk_do_io_stat(req)) + return; + + part_stat_lock(); + part_stat_inc(req->part, merges[op_stat_group(req_op(req))]); + part_stat_unlock(); +} + +enum bio_merge_status { + BIO_MERGE_OK, + BIO_MERGE_NONE, + BIO_MERGE_FAILED, +}; + +static enum bio_merge_status bio_attempt_back_merge(struct request *req, + struct bio *bio, unsigned int nr_segs) +{ + const blk_opf_t ff = bio_failfast(bio); + + if (!ll_back_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_backmerge(bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + blk_update_mixed_merge(req, bio, false); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_free_ctx(bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +static enum bio_merge_status bio_attempt_front_merge(struct request *req, + struct bio *bio, unsigned int nr_segs) +{ + const blk_opf_t ff = bio_failfast(bio); + + if (!ll_front_merge_fn(req, bio, nr_segs)) + return BIO_MERGE_FAILED; + + trace_block_bio_frontmerge(bio); + rq_qos_merge(req->q, req, bio); + + if ((req->cmd_flags & REQ_FAILFAST_MASK) != ff) + blk_rq_set_mixed_merge(req); + + blk_update_mixed_merge(req, bio, true); + + bio->bi_next = req->bio; + req->bio = bio; + + req->__sector = bio->bi_iter.bi_sector; + req->__data_len += bio->bi_iter.bi_size; + + bio_crypt_do_front_merge(req, bio); + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +} + +static enum bio_merge_status bio_attempt_discard_merge(struct request_queue *q, + struct request *req, struct bio *bio) +{ + unsigned short segments 
= blk_rq_nr_discard_segments(req); + + if (segments >= queue_max_discard_segments(q)) + goto no_merge; + if (blk_rq_sectors(req) + bio_sectors(bio) > + blk_rq_get_max_sectors(req, blk_rq_pos(req))) + goto no_merge; + + rq_qos_merge(q, req, bio); + + req->biotail->bi_next = bio; + req->biotail = bio; + req->__data_len += bio->bi_iter.bi_size; + req->nr_phys_segments = segments + 1; + + blk_account_io_merge_bio(req); + return BIO_MERGE_OK; +no_merge: + req_set_nomerge(q, req); + return BIO_MERGE_FAILED; +} + +static enum bio_merge_status blk_attempt_bio_merge(struct request_queue *q, + struct request *rq, + struct bio *bio, + unsigned int nr_segs, + bool sched_allow_merge) +{ + if (!blk_rq_merge_ok(rq, bio)) + return BIO_MERGE_NONE; + + switch (blk_try_merge(rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_back_merge(rq, bio, nr_segs); + break; + case ELEVATOR_FRONT_MERGE: + if (!sched_allow_merge || blk_mq_sched_allow_merge(q, rq, bio)) + return bio_attempt_front_merge(rq, bio, nr_segs); + break; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio); + default: + return BIO_MERGE_NONE; + } + + return BIO_MERGE_FAILED; +} + +/** + * blk_attempt_plug_merge - try to merge with %current's plugged list + * @q: request_queue new bio is being queued at + * @bio: new bio being queued + * @nr_segs: number of segments in @bio + * from the passed in @q already in the plug list + * + * Determine whether @bio being queued on @q can be merged with the previous + * request on %current's plugged list. Returns %true if merge was successful, + * otherwise %false. + * + * Plugging coalesces IOs from the same issuer for the same purpose without + * going through @q->queue_lock. As such it's more of an issuing mechanism + * than scheduling, and the request, while may have elvpriv data, is not + * added on the elevator at this point. 
In addition, we don't have + * reliable access to the elevator outside queue lock. Only check basic + * merging parameters without querying the elevator. + * + * Caller must ensure !blk_queue_nomerges(q) beforehand. + */ +bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) +{ + struct blk_plug *plug; + struct request *rq; + + plug = blk_mq_plug(bio); + if (!plug || rq_list_empty(plug->mq_list)) + return false; + + rq_list_for_each(&plug->mq_list, rq) { + if (rq->q == q) { + if (blk_attempt_bio_merge(q, rq, bio, nr_segs, false) == + BIO_MERGE_OK) + return true; + break; + } + + /* + * Only keep iterating plug list for merges if we have multiple + * queues + */ + if (!plug->multiple_queues) + break; + } + return false; +} + +/* + * Iterate list of requests and see if we can merge this bio with any + * of them. + */ +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs) +{ + struct request *rq; + int checked = 8; + + list_for_each_entry_reverse(rq, list, queuelist) { + if (!checked--) + break; + + switch (blk_attempt_bio_merge(q, rq, bio, nr_segs, true)) { + case BIO_MERGE_NONE: + continue; + case BIO_MERGE_OK: + return true; + case BIO_MERGE_FAILED: + return false; + } + + } + + return false; +} +EXPORT_SYMBOL_GPL(blk_bio_list_merge); + +bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs, struct request **merged_request) +{ + struct request *rq; + + switch (elv_merge(q, &rq, bio)) { + case ELEVATOR_BACK_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_back_merge(rq, bio, nr_segs) != BIO_MERGE_OK) + return false; + *merged_request = attempt_back_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); + return true; + case ELEVATOR_FRONT_MERGE: + if (!blk_mq_sched_allow_merge(q, rq, bio)) + return false; + if (bio_attempt_front_merge(rq, bio, nr_segs) != 
BIO_MERGE_OK) + return false; + *merged_request = attempt_front_merge(q, rq); + if (!*merged_request) + elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); + return true; + case ELEVATOR_DISCARD_MERGE: + return bio_attempt_discard_merge(q, rq, bio) == BIO_MERGE_OK; + default: + return false; + } +} +EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); diff --git a/block/blk-mq-cpumap.c b/block/blk-mq-cpumap.c index 0157f2b3485a..9638b25fd521 100644 --- a/block/blk-mq-cpumap.c +++ b/block/blk-mq-cpumap.c @@ -10,68 +10,28 @@ #include <linux/mm.h> #include <linux/smp.h> #include <linux/cpu.h> +#include <linux/group_cpus.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" -static int queue_index(struct blk_mq_queue_map *qmap, - unsigned int nr_queues, const int q) +void blk_mq_map_queues(struct blk_mq_queue_map *qmap) { - return qmap->queue_offset + (q % nr_queues); -} - -static int get_first_sibling(unsigned int cpu) -{ - unsigned int ret; - - ret = cpumask_first(topology_sibling_cpumask(cpu)); - if (ret < nr_cpu_ids) - return ret; - - return cpu; -} - -int blk_mq_map_queues(struct blk_mq_queue_map *qmap) -{ - unsigned int *map = qmap->mq_map; - unsigned int nr_queues = qmap->nr_queues; - unsigned int cpu, first_sibling, q = 0; - - for_each_possible_cpu(cpu) - map[cpu] = -1; - - /* - * Spread queues among present CPUs first for minimizing - * count of dead queues which are mapped by all un-present CPUs - */ - for_each_present_cpu(cpu) { - if (q >= nr_queues) - break; - map[cpu] = queue_index(qmap, nr_queues, q++); + const struct cpumask *masks; + unsigned int queue, cpu; + + masks = group_cpus_evenly(qmap->nr_queues); + if (!masks) { + for_each_possible_cpu(cpu) + qmap->mq_map[cpu] = qmap->queue_offset; + return; } - for_each_possible_cpu(cpu) { - if (map[cpu] != -1) - continue; - /* - * First do sequential mapping between CPUs and queues. 
- * In case we still have CPUs to map, and we have some number of - * threads per cores then map sibling threads to the same queue - * for performance optimizations. - */ - if (q < nr_queues) { - map[cpu] = queue_index(qmap, nr_queues, q++); - } else { - first_sibling = get_first_sibling(cpu); - if (first_sibling == cpu) - map[cpu] = queue_index(qmap, nr_queues, q++); - else - map[cpu] = map[first_sibling]; - } + for (queue = 0; queue < qmap->nr_queues; queue++) { + for_each_cpu(cpu, &masks[queue]) + qmap->mq_map[cpu] = qmap->queue_offset + queue; } - - return 0; + kfree(masks); } EXPORT_SYMBOL_GPL(blk_mq_map_queues); @@ -89,7 +49,7 @@ int blk_mq_hw_queue_to_node(struct blk_mq_queue_map *qmap, unsigned int index) for_each_possible_cpu(i) { if (index == qmap->mq_map[i]) - return local_memory_node(cpu_to_node(i)); + return cpu_to_node(i); } return NUMA_NO_NODE; diff --git a/block/blk-mq-debugfs-zoned.c b/block/blk-mq-debugfs-zoned.c index 038cb627c868..a77b099c34b7 100644 --- a/block/blk-mq-debugfs-zoned.c +++ b/block/blk-mq-debugfs-zoned.c @@ -11,11 +11,11 @@ int queue_zone_wlock_show(void *data, struct seq_file *m) struct request_queue *q = data; unsigned int i; - if (!q->seq_zones_wlock) + if (!q->disk->seq_zones_wlock) return 0; - for (i = 0; i < q->nr_zones; i++) - if (test_bit(i, q->seq_zones_wlock)) + for (i = 0; i < q->disk->nr_zones; i++) + if (test_bit(i, q->disk->seq_zones_wlock)) seq_printf(m, "%u\n", i); return 0; diff --git a/block/blk-mq-debugfs.c b/block/blk-mq-debugfs.c index e0b2bc131bf5..94668e72ab09 100644 --- a/block/blk-mq-debugfs.c +++ b/block/blk-mq-debugfs.c @@ -7,37 +7,14 @@ #include <linux/blkdev.h> #include <linux/debugfs.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" +#include "blk-mq-sched.h" #include "blk-rq-qos.h" -static void print_stat(struct seq_file *m, struct blk_rq_stat *stat) -{ - if (stat->nr_samples) { - seq_printf(m, "samples=%d, mean=%llu, min=%llu, 
max=%llu", - stat->nr_samples, stat->mean, stat->min, stat->max); - } else { - seq_puts(m, "samples=0"); - } -} - static int queue_poll_stat_show(void *data, struct seq_file *m) { - struct request_queue *q = data; - int bucket; - - for (bucket = 0; bucket < (BLK_MQ_POLL_STATS_BKTS / 2); bucket++) { - seq_printf(m, "read (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket]); - seq_puts(m, "\n"); - - seq_printf(m, "write (%d Bytes): ", 1 << (9 + bucket)); - print_stat(m, &q->poll_stat[2 * bucket + 1]); - seq_puts(m, "\n"); - } return 0; } @@ -109,25 +86,26 @@ static const char *const blk_queue_flag_name[] = { QUEUE_FLAG_NAME(FAIL_IO), QUEUE_FLAG_NAME(NONROT), QUEUE_FLAG_NAME(IO_STAT), - QUEUE_FLAG_NAME(DISCARD), QUEUE_FLAG_NAME(NOXMERGES), QUEUE_FLAG_NAME(ADD_RANDOM), - QUEUE_FLAG_NAME(SECERASE), + QUEUE_FLAG_NAME(SYNCHRONOUS), QUEUE_FLAG_NAME(SAME_FORCE), - QUEUE_FLAG_NAME(DEAD), QUEUE_FLAG_NAME(INIT_DONE), + QUEUE_FLAG_NAME(STABLE_WRITES), QUEUE_FLAG_NAME(POLL), QUEUE_FLAG_NAME(WC), QUEUE_FLAG_NAME(FUA), QUEUE_FLAG_NAME(DAX), QUEUE_FLAG_NAME(STATS), - QUEUE_FLAG_NAME(POLL_STATS), QUEUE_FLAG_NAME(REGISTERED), - QUEUE_FLAG_NAME(SCSI_PASSTHROUGH), QUEUE_FLAG_NAME(QUIESCED), QUEUE_FLAG_NAME(PCI_P2PDMA), QUEUE_FLAG_NAME(ZONE_RESETALL), QUEUE_FLAG_NAME(RQ_ALLOC_TIME), + QUEUE_FLAG_NAME(HCTX_ACTIVE), + QUEUE_FLAG_NAME(NOWAIT), + QUEUE_FLAG_NAME(SQ_SCHED), + QUEUE_FLAG_NAME(SKIP_TAGSET_QUIESCE), }; #undef QUEUE_FLAG_NAME @@ -148,11 +126,10 @@ static ssize_t queue_state_write(void *data, const char __user *buf, char opbuf[16] = { }, *op; /* - * The "state" attribute is removed after blk_cleanup_queue() has called - * blk_mq_free_queue(). Return if QUEUE_FLAG_DEAD has been set to avoid - * triggering a use-after-free. + * The "state" attribute is removed when the queue is removed. Don't + * allow setting the state on a dying queue to avoid a use-after-free. 
*/ - if (blk_queue_dead(q)) + if (blk_queue_dying(q)) return -ENOENT; if (count >= sizeof(opbuf)) { @@ -178,35 +155,11 @@ inval: return count; } -static int queue_write_hint_show(void *data, struct seq_file *m) -{ - struct request_queue *q = data; - int i; - - for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) - seq_printf(m, "hint%d: %llu\n", i, q->write_hints[i]); - - return 0; -} - -static ssize_t queue_write_hint_store(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct request_queue *q = data; - int i; - - for (i = 0; i < BLK_MAX_WRITE_HINTS; i++) - q->write_hints[i] = 0; - - return count; -} - static const struct blk_mq_debugfs_attr blk_mq_debugfs_queue_attrs[] = { { "poll_stat", 0400, queue_poll_stat_show }, { "requeue_list", 0400, .seq_ops = &queue_requeue_list_seq_ops }, { "pm_only", 0600, queue_pm_only_show, NULL }, { "state", 0600, queue_state_show, queue_state_write }, - { "write_hints", 0600, queue_write_hint_show, queue_write_hint_store }, { "zone_wlock", 0400, queue_zone_wlock_show, NULL }, { }, }; @@ -240,10 +193,11 @@ static const char *const alloc_policy_name[] = { #define HCTX_FLAG_NAME(name) [ilog2(BLK_MQ_F_##name)] = #name static const char *const hctx_flag_name[] = { HCTX_FLAG_NAME(SHOULD_MERGE), - HCTX_FLAG_NAME(TAG_SHARED), + HCTX_FLAG_NAME(TAG_QUEUE_SHARED), HCTX_FLAG_NAME(BLOCKING), HCTX_FLAG_NAME(NO_SCHED), HCTX_FLAG_NAME(STACKING), + HCTX_FLAG_NAME(TAG_HCTX_SHARED), }; #undef HCTX_FLAG_NAME @@ -283,31 +237,28 @@ static const char *const cmd_flag_name[] = { CMD_FLAG_NAME(BACKGROUND), CMD_FLAG_NAME(NOWAIT), CMD_FLAG_NAME(NOUNMAP), - CMD_FLAG_NAME(HIPRI), + CMD_FLAG_NAME(POLLED), }; #undef CMD_FLAG_NAME #define RQF_NAME(name) [ilog2((__force u32)RQF_##name)] = #name static const char *const rqf_name[] = { - RQF_NAME(SORTED), RQF_NAME(STARTED), - RQF_NAME(SOFTBARRIER), RQF_NAME(FLUSH_SEQ), RQF_NAME(MIXED_MERGE), - RQF_NAME(MQ_INFLIGHT), RQF_NAME(DONTPREP), - RQF_NAME(PREEMPT), + RQF_NAME(SCHED_TAGS), + RQF_NAME(USE_SCHED), 
RQF_NAME(FAILED), RQF_NAME(QUIET), - RQF_NAME(ELVPRIV), RQF_NAME(IO_STAT), - RQF_NAME(ALLOCED), RQF_NAME(PM), RQF_NAME(HASHED), RQF_NAME(STATS), RQF_NAME(SPECIAL_PAYLOAD), RQF_NAME(ZONE_WRITE_LOCKED), - RQF_NAME(MQ_POLL_SLEPT), + RQF_NAME(TIMED_OUT), + RQF_NAME(RESV), }; #undef RQF_NAME @@ -328,7 +279,7 @@ static const char *blk_mq_rq_state_name(enum mq_rq_state rq_state) int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) { const struct blk_mq_ops *const mq_ops = rq->q->mq_ops; - const unsigned int op = req_op(rq); + const enum req_op op = req_op(rq); const char *op_str = blk_op_str(op); seq_printf(m, "%p {.op=", rq); @@ -337,8 +288,8 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq) else seq_printf(m, "%s", op_str); seq_puts(m, ", .cmd_flags="); - blk_flags_show(m, rq->cmd_flags & ~REQ_OP_MASK, cmd_flag_name, - ARRAY_SIZE(cmd_flag_name)); + blk_flags_show(m, (__force unsigned int)(rq->cmd_flags & ~REQ_OP_MASK), + cmd_flag_name, ARRAY_SIZE(cmd_flag_name)); seq_puts(m, ", .rq_flags="); blk_flags_show(m, (__force unsigned int)rq->rq_flags, rqf_name, ARRAY_SIZE(rqf_name)); @@ -399,13 +350,12 @@ struct show_busy_params { * e.g. due to a concurrent blk_mq_finish_request() call. Returns true to * keep iterating requests. 
*/ -static bool hctx_show_busy_rq(struct request *rq, void *data, bool reserved) +static bool hctx_show_busy_rq(struct request *rq, void *data) { const struct show_busy_params *params = data; if (rq->mq_hctx == params->hctx) - __blk_mq_debugfs_rq_show(params->m, - list_entry_rq(&rq->queuelist)); + __blk_mq_debugfs_rq_show(params->m, rq); return true; } @@ -450,7 +400,7 @@ static void blk_mq_debugfs_tags_show(struct seq_file *m, seq_printf(m, "nr_tags=%u\n", tags->nr_tags); seq_printf(m, "nr_reserved_tags=%u\n", tags->nr_reserved_tags); seq_printf(m, "active_queues=%d\n", - atomic_read(&tags->active_queues)); + READ_ONCE(tags->active_queues)); seq_puts(m, "\nbitmap_tags:\n"); sbitmap_queue_show(&tags->bitmap_tags, m); @@ -529,92 +479,11 @@ out: return res; } -static int hctx_io_poll_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "considered=%lu\n", hctx->poll_considered); - seq_printf(m, "invoked=%lu\n", hctx->poll_invoked); - seq_printf(m, "success=%lu\n", hctx->poll_success); - return 0; -} - -static ssize_t hctx_io_poll_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->poll_considered = hctx->poll_invoked = hctx->poll_success = 0; - return count; -} - -static int hctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - seq_printf(m, "%8u\t%lu\n", 0U, hctx->dispatched[0]); - - for (i = 1; i < BLK_MQ_MAX_DISPATCH_ORDER - 1; i++) { - unsigned int d = 1U << (i - 1); - - seq_printf(m, "%8u\t%lu\n", d, hctx->dispatched[i]); - } - - seq_printf(m, "%8u+\t%lu\n", 1U << (i - 1), hctx->dispatched[i]); - return 0; -} - -static ssize_t hctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - int i; - - for (i = 0; i < BLK_MQ_MAX_DISPATCH_ORDER; i++) - hctx->dispatched[i] = 0; - return count; -} - -static int hctx_queued_show(void *data, 
struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->queued); - return 0; -} - -static ssize_t hctx_queued_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->queued = 0; - return count; -} - -static int hctx_run_show(void *data, struct seq_file *m) -{ - struct blk_mq_hw_ctx *hctx = data; - - seq_printf(m, "%lu\n", hctx->run); - return 0; -} - -static ssize_t hctx_run_write(void *data, const char __user *buf, size_t count, - loff_t *ppos) -{ - struct blk_mq_hw_ctx *hctx = data; - - hctx->run = 0; - return count; -} - static int hctx_active_show(void *data, struct seq_file *m) { struct blk_mq_hw_ctx *hctx = data; - seq_printf(m, "%d\n", atomic_read(&hctx->nr_active)); + seq_printf(m, "%d\n", __blk_mq_active_requests(hctx)); return 0; } @@ -663,57 +532,6 @@ CTX_RQ_SEQ_OPS(default, HCTX_TYPE_DEFAULT); CTX_RQ_SEQ_OPS(read, HCTX_TYPE_READ); CTX_RQ_SEQ_OPS(poll, HCTX_TYPE_POLL); -static int ctx_dispatched_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_dispatched[1], ctx->rq_dispatched[0]); - return 0; -} - -static ssize_t ctx_dispatched_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_dispatched[0] = ctx->rq_dispatched[1] = 0; - return count; -} - -static int ctx_merged_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu\n", ctx->rq_merged); - return 0; -} - -static ssize_t ctx_merged_write(void *data, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_merged = 0; - return count; -} - -static int ctx_completed_show(void *data, struct seq_file *m) -{ - struct blk_mq_ctx *ctx = data; - - seq_printf(m, "%lu %lu\n", ctx->rq_completed[1], ctx->rq_completed[0]); - return 0; -} - -static ssize_t ctx_completed_write(void *data, const char 
__user *buf, - size_t count, loff_t *ppos) -{ - struct blk_mq_ctx *ctx = data; - - ctx->rq_completed[0] = ctx->rq_completed[1] = 0; - return count; -} - static int blk_mq_debugfs_show(struct seq_file *m, void *v) { const struct blk_mq_debugfs_attr *attr = m->private; @@ -789,10 +607,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_hctx_attrs[] = { {"tags_bitmap", 0400, hctx_tags_bitmap_show}, {"sched_tags", 0400, hctx_sched_tags_show}, {"sched_tags_bitmap", 0400, hctx_sched_tags_bitmap_show}, - {"io_poll", 0600, hctx_io_poll_show, hctx_io_poll_write}, - {"dispatched", 0600, hctx_dispatched_show, hctx_dispatched_write}, - {"queued", 0600, hctx_queued_show, hctx_queued_write}, - {"run", 0600, hctx_run_show, hctx_run_write}, {"active", 0400, hctx_active_show}, {"dispatch_busy", 0400, hctx_dispatch_busy_show}, {"type", 0400, hctx_type_show}, @@ -803,9 +617,6 @@ static const struct blk_mq_debugfs_attr blk_mq_debugfs_ctx_attrs[] = { {"default_rq_list", 0400, .seq_ops = &ctx_default_rq_list_seq_ops}, {"read_rq_list", 0400, .seq_ops = &ctx_read_rq_list_seq_ops}, {"poll_rq_list", 0400, .seq_ops = &ctx_poll_rq_list_seq_ops}, - {"dispatched", 0600, ctx_dispatched_show, ctx_dispatched_write}, - {"merged", 0600, ctx_merged_show, ctx_merged_write}, - {"completed", 0600, ctx_completed_show, ctx_completed_write}, {}, }; @@ -825,10 +636,7 @@ static void debugfs_create_files(struct dentry *parent, void *data, void blk_mq_debugfs_register(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; - - q->debugfs_dir = debugfs_create_dir(kobject_name(q->kobj.parent), - blk_debugfs_root); + unsigned long i; debugfs_create_files(q->debugfs_dir, q, blk_mq_debugfs_queue_attrs); @@ -858,13 +666,6 @@ void blk_mq_debugfs_register(struct request_queue *q) } } -void blk_mq_debugfs_unregister(struct request_queue *q) -{ - debugfs_remove_recursive(q->debugfs_dir); - q->sched_debugfs_dir = NULL; - q->debugfs_dir = NULL; -} - static void blk_mq_debugfs_register_ctx(struct 
blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx) { @@ -884,6 +685,9 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, char name[20]; int i; + if (!q->debugfs_dir) + return; + snprintf(name, sizeof(name), "hctx%u", hctx->queue_num); hctx->debugfs_dir = debugfs_create_dir(name, q->debugfs_dir); @@ -895,6 +699,8 @@ void blk_mq_debugfs_register_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) { + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->debugfs_dir); hctx->sched_debugfs_dir = NULL; hctx->debugfs_dir = NULL; @@ -903,7 +709,7 @@ void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx) void blk_mq_debugfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_register_hctx(q, hctx); @@ -912,7 +718,7 @@ void blk_mq_debugfs_register_hctxs(struct request_queue *q) void blk_mq_debugfs_unregister_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_debugfs_unregister_hctx(hctx); @@ -922,6 +728,8 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + /* * If the parent directory has not been created yet, return, we will be * called again later on and the directory/files will be created then. 
@@ -939,21 +747,42 @@ void blk_mq_debugfs_register_sched(struct request_queue *q) void blk_mq_debugfs_unregister_sched(struct request_queue *q) { + lockdep_assert_held(&q->debugfs_mutex); + debugfs_remove_recursive(q->sched_debugfs_dir); q->sched_debugfs_dir = NULL; } +static const char *rq_qos_id_to_name(enum rq_qos_id id) +{ + switch (id) { + case RQ_QOS_WBT: + return "wbt"; + case RQ_QOS_LATENCY: + return "latency"; + case RQ_QOS_COST: + return "cost"; + } + return "unknown"; +} + void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { + lockdep_assert_held(&rqos->disk->queue->debugfs_mutex); + + if (!rqos->disk->queue->debugfs_dir) + return; debugfs_remove_recursive(rqos->debugfs_dir); rqos->debugfs_dir = NULL; } void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) { - struct request_queue *q = rqos->q; + struct request_queue *q = rqos->disk->queue; const char *dir_name = rq_qos_id_to_name(rqos->id); + lockdep_assert_held(&q->debugfs_mutex); + if (rqos->debugfs_dir || !rqos->ops->debugfs_attrs) return; @@ -961,23 +790,25 @@ void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) q->rqos_debugfs_dir = debugfs_create_dir("rqos", q->debugfs_dir); - rqos->debugfs_dir = debugfs_create_dir(dir_name, - rqos->q->rqos_debugfs_dir); - + rqos->debugfs_dir = debugfs_create_dir(dir_name, q->rqos_debugfs_dir); debugfs_create_files(rqos->debugfs_dir, rqos, rqos->ops->debugfs_attrs); } -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) -{ - debugfs_remove_recursive(q->rqos_debugfs_dir); - q->rqos_debugfs_dir = NULL; -} - void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { struct elevator_type *e = q->elevator->type; + lockdep_assert_held(&q->debugfs_mutex); + + /* + * If the parent debugfs directory has not been created yet, return; + * We will be called again later on with appropriate parent debugfs + * directory from blk_register_queue() + */ + if (!hctx->debugfs_dir) + return; + if (!e->hctx_debugfs_attrs) 
return; @@ -989,6 +820,10 @@ void blk_mq_debugfs_register_sched_hctx(struct request_queue *q, void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx) { + lockdep_assert_held(&hctx->queue->debugfs_mutex); + + if (!hctx->queue->debugfs_dir) + return; debugfs_remove_recursive(hctx->sched_debugfs_dir); hctx->sched_debugfs_dir = NULL; } diff --git a/block/blk-mq-debugfs.h b/block/blk-mq-debugfs.h index a68aa6041a10..9c7d4b6117d4 100644 --- a/block/blk-mq-debugfs.h +++ b/block/blk-mq-debugfs.h @@ -6,6 +6,8 @@ #include <linux/seq_file.h> +struct blk_mq_hw_ctx; + struct blk_mq_debugfs_attr { const char *name; umode_t mode; @@ -19,7 +21,6 @@ int __blk_mq_debugfs_rq_show(struct seq_file *m, struct request *rq); int blk_mq_debugfs_rq_show(struct seq_file *m, void *v); void blk_mq_debugfs_register(struct request_queue *q); -void blk_mq_debugfs_unregister(struct request_queue *q); void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_unregister_hctx(struct blk_mq_hw_ctx *hctx); @@ -34,16 +35,11 @@ void blk_mq_debugfs_unregister_sched_hctx(struct blk_mq_hw_ctx *hctx); void blk_mq_debugfs_register_rqos(struct rq_qos *rqos); void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos); -void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q); #else static inline void blk_mq_debugfs_register(struct request_queue *q) { } -static inline void blk_mq_debugfs_unregister(struct request_queue *q) -{ -} - static inline void blk_mq_debugfs_register_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx) { @@ -85,10 +81,6 @@ static inline void blk_mq_debugfs_register_rqos(struct rq_qos *rqos) static inline void blk_mq_debugfs_unregister_rqos(struct rq_qos *rqos) { } - -static inline void blk_mq_debugfs_unregister_queue_rqos(struct request_queue *q) -{ -} #endif #ifdef CONFIG_BLK_DEBUG_FS_ZONED diff --git a/block/blk-mq-pci.c b/block/blk-mq-pci.c index b595a94c4d16..d47b5c73c9eb 100644 --- a/block/blk-mq-pci.c +++ 
b/block/blk-mq-pci.c @@ -4,7 +4,6 @@ */ #include <linux/kobject.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> #include <linux/blk-mq-pci.h> #include <linux/pci.h> #include <linux/module.h> @@ -23,8 +22,8 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, - int offset) +void blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, + int offset) { const struct cpumask *mask; unsigned int queue, cpu; @@ -38,11 +37,10 @@ int blk_mq_pci_map_queues(struct blk_mq_queue_map *qmap, struct pci_dev *pdev, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; fallback: WARN_ON_ONCE(qmap->nr_queues > 1); blk_mq_clear_mq_map(qmap); - return 0; } EXPORT_SYMBOL_GPL(blk_mq_pci_map_queues); diff --git a/block/blk-mq-rdma.c b/block/blk-mq-rdma.c deleted file mode 100644 index 14f968e58b8f..000000000000 --- a/block/blk-mq-rdma.c +++ /dev/null @@ -1,44 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (c) 2017 Sagi Grimberg. - */ -#include <linux/blk-mq.h> -#include <linux/blk-mq-rdma.h> -#include <rdma/ib_verbs.h> - -/** - * blk_mq_rdma_map_queues - provide a default queue mapping for rdma device - * @map: CPU to hardware queue map. - * @dev: rdma device to provide a mapping for. - * @first_vec: first interrupt vectors to use for queues (usually 0) - * - * This function assumes the rdma device @dev has at least as many available - * interrupt vetors as @set has queues. It will then query it's affinity mask - * and built queue mapping that maps a queue to the CPUs that have irq affinity - * for the corresponding vector. - * - * In case either the driver passed a @dev with less vectors than - * @set->nr_hw_queues, or @dev does not provide an affinity mask for a - * vector, we fallback to the naive mapping. 
- */ -int blk_mq_rdma_map_queues(struct blk_mq_queue_map *map, - struct ib_device *dev, int first_vec) -{ - const struct cpumask *mask; - unsigned int queue, cpu; - - for (queue = 0; queue < map->nr_queues; queue++) { - mask = ib_get_vector_affinity(dev, first_vec + queue); - if (!mask) - goto fallback; - - for_each_cpu(cpu, mask) - map->mq_map[cpu] = map->queue_offset + queue; - } - - return 0; - -fallback: - return blk_mq_map_queues(map); -} -EXPORT_SYMBOL_GPL(blk_mq_rdma_map_queues); diff --git a/block/blk-mq-sched.c b/block/blk-mq-sched.c index fdcc2c1dd178..451a2c1f1f32 100644 --- a/block/blk-mq-sched.c +++ b/block/blk-mq-sched.c @@ -6,7 +6,7 @@ */ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/blk-mq.h> +#include <linux/list_sort.h> #include <trace/events/block.h> @@ -14,70 +14,64 @@ #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" -#include "blk-mq-tag.h" #include "blk-wbt.h" -void blk_mq_sched_free_hctx_data(struct request_queue *q, - void (*exit)(struct blk_mq_hw_ctx *)) +/* + * Mark a hardware queue as needing a restart. + */ +void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) { - struct blk_mq_hw_ctx *hctx; - int i; + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + return; - queue_for_each_hw_ctx(q, hctx, i) { - if (exit && hctx->sched_data) - exit(hctx); - kfree(hctx->sched_data); - hctx->sched_data = NULL; - } + set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); } -EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data); +EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); -void blk_mq_sched_assign_ioc(struct request *rq) +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - struct request_queue *q = rq->q; - struct io_context *ioc; - struct io_cq *icq; + clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); /* - * May not have an IO context if it's a passthrough request + * Order clearing SCHED_RESTART and list_empty_careful(&hctx->dispatch) + * in blk_mq_run_hw_queue(). 
Its pair is the barrier in + * blk_mq_dispatch_rq_list(). So dispatch code won't see SCHED_RESTART, + * meantime new request added to hctx->dispatch is missed to check in + * blk_mq_run_hw_queue(). */ - ioc = current->io_context; - if (!ioc) - return; + smp_mb(); - spin_lock_irq(&q->queue_lock); - icq = ioc_lookup_icq(ioc, q); - spin_unlock_irq(&q->queue_lock); - - if (!icq) { - icq = ioc_create_icq(ioc, q, GFP_ATOMIC); - if (!icq) - return; - } - get_io_context(icq->ioc); - rq->elv.icq = icq; + blk_mq_run_hw_queue(hctx, true); } -/* - * Mark a hardware queue as needing a restart. For shared queues, maintain - * a count of how many hardware queues are marked for restart. - */ -void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx) +static int sched_rq_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { - if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; + struct request *rqa = container_of(a, struct request, queuelist); + struct request *rqb = container_of(b, struct request, queuelist); - set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + return rqa->mq_hctx > rqb->mq_hctx; } -EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) +static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list) { - if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) - return; - clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state); + struct blk_mq_hw_ctx *hctx = + list_first_entry(rq_list, struct request, queuelist)->mq_hctx; + struct request *rq; + LIST_HEAD(hctx_list); + unsigned int count = 0; - blk_mq_run_hw_queue(hctx, true); + list_for_each_entry(rq, rq_list, queuelist) { + if (rq->mq_hctx != hctx) { + list_cut_before(&hctx_list, rq_list, &rq->queuelist); + goto dispatch; + } + count++; + } + list_splice_tail_init(rq_list, &hctx_list); + +dispatch: + return blk_mq_dispatch_rq_list(hctx, &hctx_list, count); } #define BLK_MQ_BUDGET_DELAY 3 /* ms units */ @@ -85,35 +79,45 @@ void 
blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to * be run again. This is necessary to avoid starving flushes. */ -static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct elevator_queue *e = q->elevator; + bool multi_hctxs = false, run_queue = false; + bool dispatched = false, busy = false; + unsigned int max_dispatch; LIST_HEAD(rq_list); - int ret = 0; + int count = 0; + + if (hctx->dispatch_busy) + max_dispatch = 1; + else + max_dispatch = hctx->queue->nr_requests; do { struct request *rq; + int budget_token; if (e->type->ops.has_work && !e->type->ops.has_work(hctx)) break; if (!list_empty_careful(&hctx->dispatch)) { - ret = -EAGAIN; + busy = true; break; } - if (!blk_mq_get_dispatch_budget(hctx)) + budget_token = blk_mq_get_dispatch_budget(q); + if (budget_token < 0) break; rq = e->type->ops.dispatch_request(hctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -121,17 +125,70 @@ static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) * no guarantee anyone will kick the queue. Kick it * ourselves. */ - blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + run_queue = true; break; } + blk_mq_set_rq_budget_token(rq, budget_token); + /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() * in blk_mq_dispatch_rq_list(). 
*/ - list_add(&rq->queuelist, &rq_list); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + list_add_tail(&rq->queuelist, &rq_list); + count++; + if (rq->mq_hctx != hctx) + multi_hctxs = true; + + /* + * If we cannot get tag for the request, stop dequeueing + * requests from the IO scheduler. We are unlikely to be able + * to submit them anyway and it creates false impression for + * scheduling heuristics that the device can take more IO. + */ + if (!blk_mq_get_driver_tag(rq)) + break; + } while (count < max_dispatch); + + if (!count) { + if (run_queue) + blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY); + } else if (multi_hctxs) { + /* + * Requests from different hctx may be dequeued from some + * schedulers, such as bfq and deadline. + * + * Sort the requests in the list according to their hctx, + * dispatch batching requests from same hctx at a time. + */ + list_sort(NULL, &rq_list, sched_rq_cmp); + do { + dispatched |= blk_mq_dispatch_hctx_list(&rq_list); + } while (!list_empty(&rq_list)); + } else { + dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count); + } + + if (busy) + return -EAGAIN; + return !!dispatched; +} + +static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx) +{ + unsigned long end = jiffies + HZ; + int ret; + + do { + ret = __blk_mq_do_dispatch_sched(hctx); + if (ret != 1) + break; + if (need_resched() || time_is_before_jiffies(end)) { + blk_mq_delay_run_hw_queue(hctx, 0); + break; + } + } while (1); return ret; } @@ -150,10 +207,10 @@ static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx, /* * Only SCSI implements .get_budget and .put_budget, and SCSI restarts * its queue by itself in its completion handler, so we don't need to - * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE. + * restart queue if .get_budget() fails to get the budget. * * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has to - * to be run again. This is necessary to avoid starving flushes. 
+ * be run again. This is necessary to avoid starving flushes. */ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) { @@ -161,9 +218,10 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) LIST_HEAD(rq_list); struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from); int ret = 0; + struct request *rq; do { - struct request *rq; + int budget_token; if (!list_empty_careful(&hctx->dispatch)) { ret = -EAGAIN; @@ -173,12 +231,13 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) if (!sbitmap_any_bit_set(&hctx->ctx_map)) break; - if (!blk_mq_get_dispatch_budget(hctx)) + budget_token = blk_mq_get_dispatch_budget(q); + if (budget_token < 0) break; rq = blk_mq_dequeue_from_ctx(hctx, ctx); if (!rq) { - blk_mq_put_dispatch_budget(hctx); + blk_mq_put_dispatch_budget(q, budget_token); /* * We're releasing without dispatching. Holding the * budget could have blocked any "hctx"s with the @@ -190,6 +249,8 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) break; } + blk_mq_set_rq_budget_token(rq, budget_token); + /* * Now this rq owns the budget which has to be released * if this rq won't be queued to driver via .queue_rq() @@ -200,7 +261,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) /* round robin for fair dispatch */ ctx = blk_mq_next_ctx(hctx, rq->mq_ctx); - } while (blk_mq_dispatch_rq_list(q, &rq_list, true)); + } while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1)); WRITE_ONCE(hctx->dispatch_from, ctx); return ret; @@ -208,10 +269,7 @@ static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx) static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) { - struct request_queue *q = hctx->queue; - struct elevator_queue *e = q->elevator; - const bool has_sched_dispatch = e && e->type->ops.dispatch_request; - int ret = 0; + bool need_dispatch = false; LIST_HEAD(rq_list); /* @@ -240,23 +298,22 @@ static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) */ if 
(!list_empty(&rq_list)) { blk_mq_sched_mark_restart_hctx(hctx); - if (blk_mq_dispatch_rq_list(q, &rq_list, false)) { - if (has_sched_dispatch) - ret = blk_mq_do_dispatch_sched(hctx); - else - ret = blk_mq_do_dispatch_ctx(hctx); - } - } else if (has_sched_dispatch) { - ret = blk_mq_do_dispatch_sched(hctx); - } else if (hctx->dispatch_busy) { - /* dequeue request one by one from sw queue if queue is busy */ - ret = blk_mq_do_dispatch_ctx(hctx); + if (!blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) + return 0; + need_dispatch = true; } else { - blk_mq_flush_busy_ctxs(hctx, &rq_list); - blk_mq_dispatch_rq_list(q, &rq_list, false); + need_dispatch = hctx->dispatch_busy; } - return ret; + if (hctx->queue->elevator) + return blk_mq_do_dispatch_sched(hctx); + + /* dequeue request one by one from sw queue if queue is busy */ + if (need_dispatch) + return blk_mq_do_dispatch_ctx(hctx); + blk_mq_flush_busy_ctxs(hctx, &rq_list); + blk_mq_dispatch_rq_list(hctx, &rq_list, 0); + return 0; } void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) @@ -267,8 +324,6 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q))) return; - hctx->run++; - /* * A return of -EAGAIN is an indication that hctx->dispatch is not * empty and we must run again in order to avoid starving flushes. 
@@ -279,355 +334,168 @@ void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx) } } -bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **merged_request) -{ - struct request *rq; - - switch (elv_merge(q, &rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (!blk_mq_sched_allow_merge(q, rq, bio)) - return false; - if (!bio_attempt_back_merge(rq, bio, nr_segs)) - return false; - *merged_request = attempt_back_merge(q, rq); - if (!*merged_request) - elv_merged_request(q, rq, ELEVATOR_BACK_MERGE); - return true; - case ELEVATOR_FRONT_MERGE: - if (!blk_mq_sched_allow_merge(q, rq, bio)) - return false; - if (!bio_attempt_front_merge(rq, bio, nr_segs)) - return false; - *merged_request = attempt_front_merge(q, rq); - if (!*merged_request) - elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE); - return true; - case ELEVATOR_DISCARD_MERGE: - return bio_attempt_discard_merge(q, rq, bio); - default: - return false; - } -} -EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge); - -/* - * Iterate list of requests and see if we can merge this bio with any - * of them. 
- */ -bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list, - struct bio *bio, unsigned int nr_segs) -{ - struct request *rq; - int checked = 8; - - list_for_each_entry_reverse(rq, list, queuelist) { - bool merged = false; - - if (!checked--) - break; - - if (!blk_rq_merge_ok(rq, bio)) - continue; - - switch (blk_try_merge(rq, bio)) { - case ELEVATOR_BACK_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_back_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_FRONT_MERGE: - if (blk_mq_sched_allow_merge(q, rq, bio)) - merged = bio_attempt_front_merge(rq, bio, - nr_segs); - break; - case ELEVATOR_DISCARD_MERGE: - merged = bio_attempt_discard_merge(q, rq, bio); - break; - default: - continue; - } - - return merged; - } - - return false; -} -EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge); - -/* - * Reverse check our software queue for entries that we could potentially - * merge with. Currently includes a hand-wavy stop count of 8, to not spend - * too much time checking for merges. 
- */ -static bool blk_mq_attempt_merge(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, struct bio *bio, - unsigned int nr_segs) -{ - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) { - ctx->rq_merged++; - return true; - } - - return false; -} - -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); - struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); + struct blk_mq_ctx *ctx; + struct blk_mq_hw_ctx *hctx; bool ret = false; enum hctx_type type; - if (e && e->type->ops.bio_merge) - return e->type->ops.bio_merge(hctx, bio, nr_segs); + if (e && e->type->ops.bio_merge) { + ret = e->type->ops.bio_merge(q, bio, nr_segs); + goto out_put; + } + ctx = blk_mq_get_ctx(q); + hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); type = hctx->type; - if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) && - !list_empty_careful(&ctx->rq_lists[type])) { - /* default per sw-queue merge */ - spin_lock(&ctx->lock); - ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs); - spin_unlock(&ctx->lock); - } + if (!(hctx->flags & BLK_MQ_F_SHOULD_MERGE) || + list_empty_careful(&ctx->rq_lists[type])) + goto out_put; + + /* default per sw-queue merge */ + spin_lock(&ctx->lock); + /* + * Reverse check our software queue for entries that we could + * potentially merge with. Currently includes a hand-wavy stop + * count of 8, to not spend too much time checking for merges. 
+ */ + if (blk_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) + ret = true; + spin_unlock(&ctx->lock); +out_put: return ret; } -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq) +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { - return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq); + return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq, free); } EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge); -void blk_mq_sched_request_inserted(struct request *rq) +static int blk_mq_sched_alloc_map_and_rqs(struct request_queue *q, + struct blk_mq_hw_ctx *hctx, + unsigned int hctx_idx) { - trace_block_rq_insert(rq->q, rq); -} -EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted); - -static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx, - bool has_sched, - struct request *rq) -{ - /* - * dispatch flush and passthrough rq directly - * - * passthrough request has to be added to hctx->dispatch directly. - * For some reason, device may be in one situation which can't - * handle FS request, so STS_RESOURCE is always returned and the - * FS request will be added to hctx->dispatch. However passthrough - * request may be required at that time for fixing the problem. If - * passthrough request is added to scheduler queue, there isn't any - * chance to dispatch it given we prioritize requests in hctx->dispatch. 
- */ - if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq)) - return true; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + hctx->sched_tags = q->sched_shared_tags; + return 0; + } - if (has_sched) - rq->rq_flags |= RQF_SORTED; + hctx->sched_tags = blk_mq_alloc_map_and_rqs(q->tag_set, hctx_idx, + q->nr_requests); - return false; + if (!hctx->sched_tags) + return -ENOMEM; + return 0; } -void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async) +static void blk_mq_exit_sched_shared_tags(struct request_queue *queue) { - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - /* flush rq in flush machinery need to be dispatched directly */ - if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) { - blk_insert_flush(rq); - goto run; - } - - WARN_ON(e && (rq->tag != -1)); - - if (blk_mq_sched_bypass_insert(hctx, !!e, rq)) { - /* - * Firstly normal IO request is inserted to scheduler queue or - * sw queue, meantime we add flush request to dispatch queue( - * hctx->dispatch) directly and there is at most one in-flight - * flush request for each hw queue, so it doesn't matter to add - * flush request to tail or front of the dispatch queue. - * - * Secondly in case of NCQ, flush request belongs to non-NCQ - * command, and queueing it will fail when there is any - * in-flight normal IO request(NCQ command). When adding flush - * rq to the front of hctx->dispatch, it is easier to introduce - * extra time to flush rq's latency because of S_SCHED_RESTART - * compared with adding to the tail of dispatch queue, then - * chance of flush merge is increased, and less flush requests - * will be issued to controller. It is observed that ~10% time - * is saved in blktests block/004 on disk attached to AHCI/NCQ - * drive when adding flush rq to the front of hctx->dispatch. 
- * - * Simply queue flush rq to the front of hctx->dispatch so that - * intensive flush workloads can benefit in case of NCQ HW. - */ - at_head = (rq->rq_flags & RQF_FLUSH_SEQ) ? true : at_head; - blk_mq_request_bypass_insert(rq, at_head, false); - goto run; - } - - if (e && e->type->ops.insert_requests) { - LIST_HEAD(list); - - list_add(&rq->queuelist, &list); - e->type->ops.insert_requests(hctx, &list, at_head); - } else { - spin_lock(&ctx->lock); - __blk_mq_insert_request(hctx, rq, at_head); - spin_unlock(&ctx->lock); - } - -run: - if (run_queue) - blk_mq_run_hw_queue(hctx, async); + blk_mq_free_rq_map(queue->sched_shared_tags); + queue->sched_shared_tags = NULL; } -void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct list_head *list, bool run_queue_async) +/* called in queue's release handler, tagset has gone away */ +static void blk_mq_sched_tags_teardown(struct request_queue *q, unsigned int flags) { - struct elevator_queue *e; - struct request_queue *q = hctx->queue; - - /* - * blk_mq_sched_insert_requests() is called from flush plug - * context only, and hold one usage counter to prevent queue - * from being released. - */ - percpu_ref_get(&q->q_usage_counter); + struct blk_mq_hw_ctx *hctx; + unsigned long i; - e = hctx->queue->elevator; - if (e && e->type->ops.insert_requests) - e->type->ops.insert_requests(hctx, list, false); - else { - /* - * try to issue requests directly if the hw queue isn't - * busy in case of 'none' scheduler, and this way may save - * us one extra enqueue & dequeue to sw queue. 
- */ - if (!hctx->dispatch_busy && !e && !run_queue_async) { - blk_mq_try_issue_list_directly(hctx, list); - if (list_empty(list)) - goto out; + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) { + if (!blk_mq_is_shared_tags(flags)) + blk_mq_free_rq_map(hctx->sched_tags); + hctx->sched_tags = NULL; } - blk_mq_insert_requests(hctx, ctx, list); } - blk_mq_run_hw_queue(hctx, run_queue_async); - out: - percpu_ref_put(&q->q_usage_counter); + if (blk_mq_is_shared_tags(flags)) + blk_mq_exit_sched_shared_tags(q); } -static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) +static int blk_mq_init_sched_shared_tags(struct request_queue *queue) { - if (hctx->sched_tags) { - blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx); - blk_mq_free_rq_map(hctx->sched_tags); - hctx->sched_tags = NULL; - } -} + struct blk_mq_tag_set *set = queue->tag_set; -static int blk_mq_sched_alloc_tags(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, - unsigned int hctx_idx) -{ - struct blk_mq_tag_set *set = q->tag_set; - int ret; - - hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests, - set->reserved_tags); - if (!hctx->sched_tags) + /* + * Set initial depth at max so that we don't need to reallocate for + * updating nr_requests. 
+ */ + queue->sched_shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + MAX_SCHED_RQ); + if (!queue->sched_shared_tags) return -ENOMEM; - ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests); - if (ret) - blk_mq_sched_free_tags(set, hctx, hctx_idx); + blk_mq_tag_update_sched_shared_tags(queue); - return ret; -} - -/* called in queue's release handler, tagset has gone away */ -static void blk_mq_sched_tags_teardown(struct request_queue *q) -{ - struct blk_mq_hw_ctx *hctx; - int i; - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) { - blk_mq_free_rq_map(hctx->sched_tags); - hctx->sched_tags = NULL; - } - } + return 0; } +/* caller must have a reference to @e, will grab another one if successful */ int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e) { + unsigned int flags = q->tag_set->flags; struct blk_mq_hw_ctx *hctx; struct elevator_queue *eq; - unsigned int i; + unsigned long i; int ret; - if (!e) { - q->elevator = NULL; - q->nr_requests = q->tag_set->queue_depth; - return 0; - } - /* * Default to double of smaller one between hw queue_depth and 128, * since we don't split into sync/async like the old code did. * Additionally, this is a per-hw queue depth. 
*/ q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth, - BLKDEV_MAX_RQ); + BLKDEV_DEFAULT_RQ); + + if (blk_mq_is_shared_tags(flags)) { + ret = blk_mq_init_sched_shared_tags(q); + if (ret) + return ret; + } queue_for_each_hw_ctx(q, hctx, i) { - ret = blk_mq_sched_alloc_tags(q, hctx, i); + ret = blk_mq_sched_alloc_map_and_rqs(q, hctx, i); if (ret) - goto err; + goto err_free_map_and_rqs; } ret = e->ops.init_sched(q, e); if (ret) - goto err; + goto err_free_map_and_rqs; + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched(q); + mutex_unlock(&q->debugfs_mutex); queue_for_each_hw_ctx(q, hctx, i) { if (e->ops.init_hctx) { ret = e->ops.init_hctx(hctx, i); if (ret) { eq = q->elevator; - blk_mq_sched_free_requests(q); + blk_mq_sched_free_rqs(q); blk_mq_exit_sched(q, eq); kobject_put(&eq->kobj); return ret; } } + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_register_sched_hctx(q, hctx); + mutex_unlock(&q->debugfs_mutex); } return 0; -err: - blk_mq_sched_free_requests(q); - blk_mq_sched_tags_teardown(q); +err_free_map_and_rqs: + blk_mq_sched_free_rqs(q); + blk_mq_sched_tags_teardown(q, flags); + q->elevator = NULL; return ret; } @@ -636,32 +504,47 @@ err: * called in either blk_queue_cleanup or elevator_switch, tagset * is required for freeing requests */ -void blk_mq_sched_free_requests(struct request_queue *q) +void blk_mq_sched_free_rqs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->sched_tags) - blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i); + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + blk_mq_free_rqs(q->tag_set, q->sched_shared_tags, + BLK_MQ_NO_HCTX_IDX); + } else { + queue_for_each_hw_ctx(q, hctx, i) { + if (hctx->sched_tags) + blk_mq_free_rqs(q->tag_set, + hctx->sched_tags, i); + } } } void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; + unsigned int flags = 0; 
queue_for_each_hw_ctx(q, hctx, i) { + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched_hctx(hctx); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_hctx && hctx->sched_data) { e->type->ops.exit_hctx(hctx, i); hctx->sched_data = NULL; } + flags = hctx->flags; } + + mutex_lock(&q->debugfs_mutex); blk_mq_debugfs_unregister_sched(q); + mutex_unlock(&q->debugfs_mutex); + if (e->type->ops.exit_sched) e->type->ops.exit_sched(e); - blk_mq_sched_tags_teardown(q); + blk_mq_sched_tags_teardown(q, flags); q->elevator = NULL; } diff --git a/block/blk-mq-sched.h b/block/blk-mq-sched.h index 126021fc3a11..1326526bb733 100644 --- a/block/blk-mq-sched.h +++ b/block/blk-mq-sched.h @@ -2,72 +2,69 @@ #ifndef BLK_MQ_SCHED_H #define BLK_MQ_SCHED_H +#include "elevator.h" #include "blk-mq.h" -#include "blk-mq-tag.h" -void blk_mq_sched_free_hctx_data(struct request_queue *q, - void (*exit)(struct blk_mq_hw_ctx *)); +#define MAX_SCHED_RQ (16 * BLKDEV_DEFAULT_RQ) -void blk_mq_sched_assign_ioc(struct request *rq); - -void blk_mq_sched_request_inserted(struct request *rq); bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs, struct request **merged_request); -bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, +bool blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs); -bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq); +bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free); void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx); -void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); - -void blk_mq_sched_insert_request(struct request *rq, bool at_head, - bool run_queue, bool async); -void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, - struct blk_mq_ctx *ctx, - struct list_head *list, bool run_queue_async); +void __blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx); void 
blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx); int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e); void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e); -void blk_mq_sched_free_requests(struct request_queue *q); +void blk_mq_sched_free_rqs(struct request_queue *q); -static inline bool -blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs) +static inline void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx) { - if (blk_queue_nomerges(q) || !bio_mergeable(bio)) - return false; + if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) + __blk_mq_sched_restart(hctx); +} - return __blk_mq_sched_bio_merge(q, bio, nr_segs); +static inline bool bio_mergeable(struct bio *bio) +{ + return !(bio->bi_opf & REQ_NOMERGE_FLAGS); } static inline bool blk_mq_sched_allow_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - struct elevator_queue *e = q->elevator; - - if (e && e->type->ops.allow_merge) - return e->type->ops.allow_merge(q, rq, bio); + if (rq->rq_flags & RQF_USE_SCHED) { + struct elevator_queue *e = q->elevator; + if (e->type->ops.allow_merge) + return e->type->ops.allow_merge(q, rq, bio); + } return true; } static inline void blk_mq_sched_completed_request(struct request *rq, u64 now) { - struct elevator_queue *e = rq->q->elevator; + if (rq->rq_flags & RQF_USE_SCHED) { + struct elevator_queue *e = rq->q->elevator; - if (e && e->type->ops.completed_request) - e->type->ops.completed_request(rq, now); + if (e->type->ops.completed_request) + e->type->ops.completed_request(rq, now); + } } static inline void blk_mq_sched_requeue_request(struct request *rq) { - struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; + if (rq->rq_flags & RQF_USE_SCHED) { + struct request_queue *q = rq->q; + struct elevator_queue *e = q->elevator; - if (e && e->type->ops.requeue_request) - e->type->ops.requeue_request(rq); + if (e->type->ops.requeue_request) + 
e->type->ops.requeue_request(rq); + } } static inline bool blk_mq_sched_has_work(struct blk_mq_hw_ctx *hctx) diff --git a/block/blk-mq-sysfs.c b/block/blk-mq-sysfs.c index 062229395a50..156e9bb07abf 100644 --- a/block/blk-mq-sysfs.c +++ b/block/blk-mq-sysfs.c @@ -10,10 +10,8 @@ #include <linux/workqueue.h> #include <linux/smp.h> -#include <linux/blk-mq.h> #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" static void blk_mq_sysfs_release(struct kobject *kobj) { @@ -36,10 +34,6 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) struct blk_mq_hw_ctx *hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - cancel_delayed_work_sync(&hctx->run_work); - - if (hctx->flags & BLK_MQ_F_BLOCKING) - cleanup_srcu_struct(hctx->srcu); blk_free_flush_queue(hctx->fq); sbitmap_free(&hctx->ctx_map); free_cpumask_var(hctx->cpumask); @@ -47,60 +41,11 @@ static void blk_mq_hw_sysfs_release(struct kobject *kobj) kfree(hctx); } -struct blk_mq_ctx_sysfs_entry { - struct attribute attr; - ssize_t (*show)(struct blk_mq_ctx *, char *); - ssize_t (*store)(struct blk_mq_ctx *, const char *, size_t); -}; - struct blk_mq_hw_ctx_sysfs_entry { struct attribute attr; ssize_t (*show)(struct blk_mq_hw_ctx *, char *); - ssize_t (*store)(struct blk_mq_hw_ctx *, const char *, size_t); }; -static ssize_t blk_mq_sysfs_show(struct kobject *kobj, struct attribute *attr, - char *page) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->show) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->show(ctx, page); - mutex_unlock(&q->sysfs_lock); - return res; -} - -static ssize_t blk_mq_sysfs_store(struct kobject *kobj, struct attribute *attr, - const char *page, size_t length) -{ - struct blk_mq_ctx_sysfs_entry *entry; - struct blk_mq_ctx *ctx; - struct request_queue *q; 
- ssize_t res; - - entry = container_of(attr, struct blk_mq_ctx_sysfs_entry, attr); - ctx = container_of(kobj, struct blk_mq_ctx, kobj); - q = ctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(ctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, struct attribute *attr, char *page) { @@ -122,28 +67,6 @@ static ssize_t blk_mq_hw_sysfs_show(struct kobject *kobj, return res; } -static ssize_t blk_mq_hw_sysfs_store(struct kobject *kobj, - struct attribute *attr, const char *page, - size_t length) -{ - struct blk_mq_hw_ctx_sysfs_entry *entry; - struct blk_mq_hw_ctx *hctx; - struct request_queue *q; - ssize_t res; - - entry = container_of(attr, struct blk_mq_hw_ctx_sysfs_entry, attr); - hctx = container_of(kobj, struct blk_mq_hw_ctx, kobj); - q = hctx->queue; - - if (!entry->store) - return -EIO; - - mutex_lock(&q->sysfs_lock); - res = entry->store(hctx, page, length); - mutex_unlock(&q->sysfs_lock); - return res; -} - static ssize_t blk_mq_hw_sysfs_nr_tags_show(struct blk_mq_hw_ctx *hctx, char *page) { @@ -200,27 +123,19 @@ static struct attribute *default_hw_ctx_attrs[] = { }; ATTRIBUTE_GROUPS(default_hw_ctx); -static const struct sysfs_ops blk_mq_sysfs_ops = { - .show = blk_mq_sysfs_show, - .store = blk_mq_sysfs_store, -}; - static const struct sysfs_ops blk_mq_hw_sysfs_ops = { .show = blk_mq_hw_sysfs_show, - .store = blk_mq_hw_sysfs_store, }; -static struct kobj_type blk_mq_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, +static const struct kobj_type blk_mq_ktype = { .release = blk_mq_sysfs_release, }; -static struct kobj_type blk_mq_ctx_ktype = { - .sysfs_ops = &blk_mq_sysfs_ops, +static const struct kobj_type blk_mq_ctx_ktype = { .release = blk_mq_ctx_sysfs_release, }; -static struct kobj_type blk_mq_hw_ktype = { +static const struct kobj_type blk_mq_hw_ktype = { .sysfs_ops = &blk_mq_hw_sysfs_ops, .default_groups = default_hw_ctx_groups, 
.release = blk_mq_hw_sysfs_release, @@ -244,7 +159,7 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) { struct request_queue *q = hctx->queue; struct blk_mq_ctx *ctx; - int i, ret; + int i, j, ret; if (!hctx->nr_ctx) return 0; @@ -256,29 +171,19 @@ static int blk_mq_register_hctx(struct blk_mq_hw_ctx *hctx) hctx_for_each_ctx(hctx, ctx, i) { ret = kobject_add(&ctx->kobj, &hctx->kobj, "cpu%u", ctx->cpu); if (ret) - break; + goto out; } + return 0; +out: + hctx_for_each_ctx(hctx, ctx, j) { + if (j < i) + kobject_del(&ctx->kobj); + } + kobject_del(&hctx->kobj); return ret; } -void blk_mq_unregister_dev(struct device *dev, struct request_queue *q) -{ - struct blk_mq_hw_ctx *hctx; - int i; - - lockdep_assert_held(&q->sysfs_dir_lock); - - queue_for_each_hw_ctx(q, hctx, i) - blk_mq_unregister_hctx(hctx); - - kobject_uevent(q->mq_kobj, KOBJ_REMOVE); - kobject_del(q->mq_kobj); - kobject_put(&dev->kobj); - - q->mq_sysfs_init_done = false; -} - void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx) { kobject_init(&hctx->kobj, &blk_mq_hw_ktype); @@ -311,15 +216,16 @@ void blk_mq_sysfs_init(struct request_queue *q) } } -int __blk_mq_register_dev(struct device *dev, struct request_queue *q) +int blk_mq_sysfs_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; - int ret, i; + unsigned long i, j; + int ret; - WARN_ON_ONCE(!q->kobj.parent); lockdep_assert_held(&q->sysfs_dir_lock); - ret = kobject_add(q->mq_kobj, kobject_get(&dev->kobj), "%s", "mq"); + ret = kobject_add(q->mq_kobj, &disk_to_dev(disk)->kobj, "mq"); if (ret < 0) goto out; @@ -337,19 +243,37 @@ out: return ret; unreg: - while (--i >= 0) - blk_mq_unregister_hctx(q->queue_hw_ctx[i]); + queue_for_each_hw_ctx(q, hctx, j) { + if (j < i) + blk_mq_unregister_hctx(hctx); + } kobject_uevent(q->mq_kobj, KOBJ_REMOVE); kobject_del(q->mq_kobj); - kobject_put(&dev->kobj); return ret; } -void blk_mq_sysfs_unregister(struct request_queue *q) +void 
blk_mq_sysfs_unregister(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; + + lockdep_assert_held(&q->sysfs_dir_lock); + + queue_for_each_hw_ctx(q, hctx, i) + blk_mq_unregister_hctx(hctx); + + kobject_uevent(q->mq_kobj, KOBJ_REMOVE); + kobject_del(q->mq_kobj); + + q->mq_sysfs_init_done = false; +} + +void blk_mq_sysfs_unregister_hctxs(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) @@ -362,10 +286,11 @@ unlock: mutex_unlock(&q->sysfs_dir_lock); } -int blk_mq_sysfs_register(struct request_queue *q) +int blk_mq_sysfs_register_hctxs(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i, ret = 0; + unsigned long i; + int ret = 0; mutex_lock(&q->sysfs_dir_lock); if (!q->mq_sysfs_init_done) diff --git a/block/blk-mq-tag.c b/block/blk-mq-tag.c index ae722f8b13fb..cc57e2dd9a0b 100644 --- a/block/blk-mq-tag.c +++ b/block/blk-mq-tag.c @@ -9,11 +9,25 @@ #include <linux/kernel.h> #include <linux/module.h> -#include <linux/blk-mq.h> #include <linux/delay.h> #include "blk.h" #include "blk-mq.h" -#include "blk-mq-tag.h" +#include "blk-mq-sched.h" + +/* + * Recalculate wakeup batch when tag is shared by hctx. + */ +static void blk_mq_update_wake_batch(struct blk_mq_tags *tags, + unsigned int users) +{ + if (!users) + return; + + sbitmap_queue_recalculate_wake_batch(&tags->bitmap_tags, + users); + sbitmap_queue_recalculate_wake_batch(&tags->breserved_tags, + users); +} /* * If a previously inactive queue goes active, bump the active user count. @@ -21,13 +35,32 @@ * to get tag when first time, the other shared-tag users could reserve * budget for it. 
*/ -bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +void __blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) { - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) && - !test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - atomic_inc(&hctx->tags->active_queues); + unsigned int users; + struct blk_mq_tags *tags = hctx->tags; - return true; + /* + * calling test_bit() prior to test_and_set_bit() is intentional, + * it avoids dirtying the cacheline if the queue is already active. + */ + if (blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; + + if (test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags) || + test_and_set_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + return; + } else { + if (test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state) || + test_and_set_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + } + + spin_lock_irq(&tags->lock); + users = tags->active_queues + 1; + WRITE_ONCE(tags->active_queues, users); + blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); } /* @@ -47,58 +80,56 @@ void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool include_reserve) void __blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) { struct blk_mq_tags *tags = hctx->tags; + unsigned int users; - if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return; - - atomic_dec(&tags->active_queues); - - blk_mq_tag_wakeup_all(tags, false); -} + if (blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; -/* - * For shared tag users, we track the number of currently active users - * and attempt to provide a fair share of the tag depth for each of them. 
- */ -static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, - struct sbitmap_queue *bt) -{ - unsigned int depth, users; - - if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return true; - if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) - return true; + if (!test_and_clear_bit(QUEUE_FLAG_HCTX_ACTIVE, + &q->queue_flags)) + return; + } else { + if (!test_and_clear_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return; + } - /* - * Don't try dividing an ant - */ - if (bt->sb.depth == 1) - return true; + spin_lock_irq(&tags->lock); + users = tags->active_queues - 1; + WRITE_ONCE(tags->active_queues, users); + blk_mq_update_wake_batch(tags, users); + spin_unlock_irq(&tags->lock); - users = atomic_read(&hctx->tags->active_queues); - if (!users) - return true; - - /* - * Allow at least some tags - */ - depth = max((bt->sb.depth + users - 1) / users, 4U); - return atomic_read(&hctx->nr_active) < depth; + blk_mq_tag_wakeup_all(tags, false); } static int __blk_mq_get_tag(struct blk_mq_alloc_data *data, struct sbitmap_queue *bt) { - if (!(data->flags & BLK_MQ_REQ_INTERNAL) && - !hctx_may_queue(data->hctx, bt)) + if (!data->q->elevator && !(data->flags & BLK_MQ_REQ_RESERVED) && + !hctx_may_queue(data->hctx, bt)) return BLK_MQ_NO_TAG; + if (data->shallow_depth) - return __sbitmap_queue_get_shallow(bt, data->shallow_depth); + return sbitmap_queue_get_shallow(bt, data->shallow_depth); else return __sbitmap_queue_get(bt); } +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset) +{ + struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct sbitmap_queue *bt = &tags->bitmap_tags; + unsigned long ret; + + if (data->shallow_depth ||data->flags & BLK_MQ_REQ_RESERVED || + data->hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + return 0; + ret = __sbitmap_queue_get_batch(bt, nr_tags, offset); + *offset += tags->nr_reserved_tags; + return ret; +} + unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) { struct blk_mq_tags *tags = 
blk_mq_tags_from_data(data); @@ -172,7 +203,7 @@ unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data) * other allocations on previous queue won't be starved. */ if (bt != bt_prev) - sbitmap_queue_wake_up(bt_prev); + sbitmap_queue_wake_up(bt_prev, 1); ws = bt_wait_ptr(bt, data->hctx); } while (1); @@ -191,33 +222,6 @@ found_tag: return tag + tag_offset; } -bool __blk_mq_get_driver_tag(struct request *rq) -{ - struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; - unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; - bool shared = blk_mq_tag_busy(rq->mq_hctx); - int tag; - - if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { - bt = &rq->mq_hctx->tags->breserved_tags; - tag_offset = 0; - } - - if (!hctx_may_queue(rq->mq_hctx, bt)) - return false; - tag = __sbitmap_queue_get(bt); - if (tag == BLK_MQ_NO_TAG) - return false; - - rq->tag = tag + tag_offset; - if (shared) { - rq->rq_flags |= RQF_MQ_INFLIGHT; - atomic_inc(&rq->mq_hctx->nr_active); - } - rq->mq_hctx->tags->rqs[rq->tag] = rq; - return true; -} - void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, unsigned int tag) { @@ -227,42 +231,73 @@ void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, BUG_ON(real_tag >= tags->nr_tags); sbitmap_queue_clear(&tags->bitmap_tags, real_tag, ctx->cpu); } else { - BUG_ON(tag >= tags->nr_reserved_tags); sbitmap_queue_clear(&tags->breserved_tags, tag, ctx->cpu); } } +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags) +{ + sbitmap_queue_clear_batch(&tags->bitmap_tags, tags->nr_reserved_tags, + tag_array, nr_tags); +} + struct bt_iter_data { struct blk_mq_hw_ctx *hctx; - busy_iter_fn *fn; + struct request_queue *q; + busy_tag_iter_fn *fn; void *data; bool reserved; }; +static struct request *blk_mq_find_and_get_req(struct blk_mq_tags *tags, + unsigned int bitnr) +{ + struct request *rq; + unsigned long flags; + + spin_lock_irqsave(&tags->lock, flags); + rq = tags->rqs[bitnr]; + 
if (!rq || rq->tag != bitnr || !req_ref_inc_not_zero(rq)) + rq = NULL; + spin_unlock_irqrestore(&tags->lock, flags); + return rq; +} + static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_iter_data *iter_data = data; struct blk_mq_hw_ctx *hctx = iter_data->hctx; - struct blk_mq_tags *tags = hctx->tags; - bool reserved = iter_data->reserved; + struct request_queue *q = iter_data->q; + struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_tags *tags; struct request *rq; + bool ret = true; - if (!reserved) - bitnr += tags->nr_reserved_tags; - rq = tags->rqs[bitnr]; + if (blk_mq_is_shared_tags(set->flags)) + tags = set->shared_tags; + else + tags = hctx->tags; + if (!iter_data->reserved) + bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (rq && rq->q == hctx->queue) - return iter_data->fn(hctx, rq, iter_data->data, reserved); - return true; + rq = blk_mq_find_and_get_req(tags, bitnr); + if (!rq) + return true; + + if (rq->q == q && (!hctx || rq->mq_hctx == hctx)) + ret = iter_data->fn(rq, iter_data->data); + blk_mq_put_rq_ref(rq); + return ret; } /** * bt_for_each - iterate over the requests associated with a hardware queue * @hctx: Hardware queue to examine. + * @q: Request queue to examine. * @bt: sbitmap to examine. This is either the breserved_tags member * or the bitmap_tags member of struct blk_mq_tags. * @fn: Pointer to the function that will be called for each request @@ -274,14 +309,16 @@ static bool bt_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) * @reserved: Indicates whether @bt is the breserved_tags member or the * bitmap_tags member of struct blk_mq_tags. 
*/ -static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct sbitmap_queue *bt, - busy_iter_fn *fn, void *data, bool reserved) +static void bt_for_each(struct blk_mq_hw_ctx *hctx, struct request_queue *q, + struct sbitmap_queue *bt, busy_tag_iter_fn *fn, + void *data, bool reserved) { struct bt_iter_data iter_data = { .hctx = hctx, .fn = fn, .data = data, .reserved = reserved, + .q = q, }; sbitmap_for_each_set(&bt->sb, bt_iter, &iter_data); @@ -302,26 +339,30 @@ static bool bt_tags_iter(struct sbitmap *bitmap, unsigned int bitnr, void *data) { struct bt_tags_iter_data *iter_data = data; struct blk_mq_tags *tags = iter_data->tags; - bool reserved = iter_data->flags & BT_TAG_ITER_RESERVED; struct request *rq; + bool ret = true; + bool iter_static_rqs = !!(iter_data->flags & BT_TAG_ITER_STATIC_RQS); - if (!reserved) + if (!(iter_data->flags & BT_TAG_ITER_RESERVED)) bitnr += tags->nr_reserved_tags; /* * We can hit rq == NULL here, because the tagging functions * test and set the bit before assigning ->rqs[]. */ - if (iter_data->flags & BT_TAG_ITER_STATIC_RQS) + if (iter_static_rqs) rq = tags->static_rqs[bitnr]; else - rq = tags->rqs[bitnr]; + rq = blk_mq_find_and_get_req(tags, bitnr); if (!rq) return true; - if ((iter_data->flags & BT_TAG_ITER_STARTED) && - !blk_mq_request_started(rq)) - return true; - return iter_data->fn(rq, iter_data->data, reserved); + + if (!(iter_data->flags & BT_TAG_ITER_STARTED) || + blk_mq_request_started(rq)) + ret = iter_data->fn(rq, iter_data->data); + if (!iter_static_rqs) + blk_mq_put_rq_ref(rq); + return ret; } /** @@ -388,13 +429,19 @@ void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, * indicates whether or not @rq is a reserved request. Return * true to continue iterating tags, false to stop. * @priv: Will be passed as second argument to @fn. + * + * We grab one request reference before calling @fn and release it after + * @fn returns. 
*/ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, busy_tag_iter_fn *fn, void *priv) { - int i; + unsigned int flags = tagset->flags; + int i, nr_tags; + + nr_tags = blk_mq_is_shared_tags(flags) ? 1 : tagset->nr_hw_queues; - for (i = 0; i < tagset->nr_hw_queues; i++) { + for (i = 0; i < nr_tags; i++) { if (tagset->tags && tagset->tags[i]) __blk_mq_all_tag_iter(tagset->tags[i], fn, priv, BT_TAG_ITER_STARTED); @@ -402,8 +449,7 @@ void blk_mq_tagset_busy_iter(struct blk_mq_tag_set *tagset, } EXPORT_SYMBOL(blk_mq_tagset_busy_iter); -static bool blk_mq_tagset_count_completed_rqs(struct request *rq, - void *data, bool reserved) +static bool blk_mq_tagset_count_completed_rqs(struct request *rq, void *data) { unsigned *count = data; @@ -413,8 +459,8 @@ static bool blk_mq_tagset_count_completed_rqs(struct request *rq, } /** - * blk_mq_tagset_wait_completed_request - wait until all completed req's - * complete funtion is run + * blk_mq_tagset_wait_completed_request - Wait until all scheduled request + * completions have finished. * @tagset: Tag set to drain completed request * * Note: This function has to be run after all IO queues are shutdown @@ -447,35 +493,45 @@ EXPORT_SYMBOL(blk_mq_tagset_wait_completed_request); * called for all requests on all queues that share that tag set and not only * for requests associated with @q. */ -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, void *priv) { - struct blk_mq_hw_ctx *hctx; - int i; - /* - * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and queue_hw_ctx + * __blk_mq_update_nr_hw_queues() updates nr_hw_queues and hctx_table * while the queue is frozen. So we can use q_usage_counter to avoid - * racing with it. __blk_mq_update_nr_hw_queues() uses - * synchronize_rcu() to ensure this function left the critical section - * below. + * racing with it. 
*/ if (!percpu_ref_tryget(&q->q_usage_counter)) return; - queue_for_each_hw_ctx(q, hctx, i) { - struct blk_mq_tags *tags = hctx->tags; - - /* - * If no software queues are currently mapped to this - * hardware queue, there's nothing to check - */ - if (!blk_mq_hw_queue_mapped(hctx)) - continue; + if (blk_mq_is_shared_tags(q->tag_set->flags)) { + struct blk_mq_tags *tags = q->tag_set->shared_tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; if (tags->nr_reserved_tags) - bt_for_each(hctx, &tags->breserved_tags, fn, priv, true); - bt_for_each(hctx, &tags->bitmap_tags, fn, priv, false); + bt_for_each(NULL, q, bresv, fn, priv, true); + bt_for_each(NULL, q, btags, fn, priv, false); + } else { + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + queue_for_each_hw_ctx(q, hctx, i) { + struct blk_mq_tags *tags = hctx->tags; + struct sbitmap_queue *bresv = &tags->breserved_tags; + struct sbitmap_queue *btags = &tags->bitmap_tags; + + /* + * If no software queues are currently mapped to this + * hardware queue, there's nothing to check + */ + if (!blk_mq_hw_queue_mapped(hctx)) + continue; + + if (tags->nr_reserved_tags) + bt_for_each(hctx, q, bresv, fn, priv, true); + bt_for_each(hctx, q, btags, fn, priv, false); + } } blk_queue_exit(q); } @@ -487,24 +543,24 @@ static int bt_alloc(struct sbitmap_queue *bt, unsigned int depth, node); } -static struct blk_mq_tags *blk_mq_init_bitmap_tags(struct blk_mq_tags *tags, - int node, int alloc_policy) +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, + unsigned int queue_depth, unsigned int reserved, + int node, int alloc_policy) { - unsigned int depth = tags->nr_tags - tags->nr_reserved_tags; + unsigned int depth = queue_depth - reserved; bool round_robin = alloc_policy == BLK_TAG_ALLOC_RR; - if (bt_alloc(&tags->bitmap_tags, depth, round_robin, node)) - goto free_tags; - if (bt_alloc(&tags->breserved_tags, 
tags->nr_reserved_tags, round_robin, - node)) + if (bt_alloc(bitmap_tags, depth, round_robin, node)) + return -ENOMEM; + if (bt_alloc(breserved_tags, reserved, round_robin, node)) goto free_bitmap_tags; - return tags; + return 0; + free_bitmap_tags: - sbitmap_queue_free(&tags->bitmap_tags); -free_tags: - kfree(tags); - return NULL; + sbitmap_queue_free(bitmap_tags); + return -ENOMEM; } struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, @@ -524,8 +580,15 @@ struct blk_mq_tags *blk_mq_init_tags(unsigned int total_tags, tags->nr_tags = total_tags; tags->nr_reserved_tags = reserved_tags; + spin_lock_init(&tags->lock); - return blk_mq_init_bitmap_tags(tags, node, alloc_policy); + if (blk_mq_init_bitmaps(&tags->bitmap_tags, &tags->breserved_tags, + total_tags, reserved_tags, node, + alloc_policy) < 0) { + kfree(tags); + return NULL; + } + return tags; } void blk_mq_free_tags(struct blk_mq_tags *tags) @@ -551,7 +614,6 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, if (tdepth > tags->nr_tags) { struct blk_mq_tag_set *set = hctx->queue->tag_set; struct blk_mq_tags *new; - bool ret; if (!can_grow) return -EINVAL; @@ -560,21 +622,21 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, * We need some sort of upper limit, set it high enough that * no valid use cases should require more. */ - if (tdepth > 16 * BLKDEV_MAX_RQ) + if (tdepth > MAX_SCHED_RQ) return -EINVAL; - new = blk_mq_alloc_rq_map(set, hctx->queue_num, tdepth, - tags->nr_reserved_tags); + /* + * Only the sbitmap needs resizing since we allocated the max + * initially. 
+ */ + if (blk_mq_is_shared_tags(set->flags)) + return 0; + + new = blk_mq_alloc_map_and_rqs(set, hctx->queue_num, tdepth); if (!new) return -ENOMEM; - ret = blk_mq_alloc_rqs(set, new, hctx->queue_num, tdepth); - if (ret) { - blk_mq_free_rq_map(new); - return -ENOMEM; - } - blk_mq_free_rqs(set, *tagsptr, hctx->queue_num); - blk_mq_free_rq_map(*tagsptr); + blk_mq_free_map_and_rqs(set, *tagsptr, hctx->queue_num); *tagsptr = new; } else { /* @@ -588,6 +650,19 @@ int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, return 0; } +void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, unsigned int size) +{ + struct blk_mq_tags *tags = set->shared_tags; + + sbitmap_queue_resize(&tags->bitmap_tags, size - set->reserved_tags); +} + +void blk_mq_tag_update_sched_shared_tags(struct request_queue *q) +{ + sbitmap_queue_resize(&q->sched_shared_tags->bitmap_tags, + q->nr_requests - q->tag_set->reserved_tags); +} + /** * blk_mq_unique_tag() - return a tag that is unique queue-wide * @rq: request for which to compute a unique tag diff --git a/block/blk-mq-tag.h b/block/blk-mq-tag.h deleted file mode 100644 index 2e4ef51cdb32..000000000000 --- a/block/blk-mq-tag.h +++ /dev/null @@ -1,99 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef INT_BLK_MQ_TAG_H -#define INT_BLK_MQ_TAG_H - -#include "blk-mq.h" - -/* - * Tag address space map. 
- */ -struct blk_mq_tags { - unsigned int nr_tags; - unsigned int nr_reserved_tags; - - atomic_t active_queues; - - struct sbitmap_queue bitmap_tags; - struct sbitmap_queue breserved_tags; - - struct request **rqs; - struct request **static_rqs; - struct list_head page_list; -}; - - -extern struct blk_mq_tags *blk_mq_init_tags(unsigned int nr_tags, unsigned int reserved_tags, int node, int alloc_policy); -extern void blk_mq_free_tags(struct blk_mq_tags *tags); - -extern unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); -extern void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, - unsigned int tag); -extern int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, - struct blk_mq_tags **tags, - unsigned int depth, bool can_grow); -extern void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); -void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_iter_fn *fn, - void *priv); -void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, - void *priv); - -static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, - struct blk_mq_hw_ctx *hctx) -{ - if (!hctx) - return &bt->ws[0]; - return sbq_wait_ptr(bt, &hctx->wait_index); -} - -enum { - BLK_MQ_NO_TAG = -1U, - BLK_MQ_TAG_MIN = 1, - BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, -}; - -bool __blk_mq_get_driver_tag(struct request *rq); -static inline bool blk_mq_get_driver_tag(struct request *rq) -{ - if (rq->tag != BLK_MQ_NO_TAG) - return true; - return __blk_mq_get_driver_tag(rq); -} - -extern bool __blk_mq_tag_busy(struct blk_mq_hw_ctx *); -extern void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); - -static inline bool blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) -{ - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return false; - - return __blk_mq_tag_busy(hctx); -} - -static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) -{ - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) - return; - - __blk_mq_tag_idle(hctx); -} - -/* - * This helper should only be used for flush 
request to share tag - * with the request cloned from, and both the two requests can't be - * in flight at the same time. The caller has to make sure the tag - * can't be freed. - */ -static inline void blk_mq_tag_set_rq(struct blk_mq_hw_ctx *hctx, - unsigned int tag, struct request *rq) -{ - hctx->tags->rqs[tag] = rq; -} - -static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, - unsigned int tag) -{ - return tag < tags->nr_reserved_tags; -} - -#endif diff --git a/block/blk-mq-virtio.c b/block/blk-mq-virtio.c index 7b8a42c35102..68d0945c0b08 100644 --- a/block/blk-mq-virtio.c +++ b/block/blk-mq-virtio.c @@ -3,7 +3,6 @@ * Copyright (c) 2016 Christoph Hellwig. */ #include <linux/device.h> -#include <linux/blk-mq.h> #include <linux/blk-mq-virtio.h> #include <linux/virtio_config.h> #include <linux/module.h> @@ -21,7 +20,7 @@ * that maps a queue to the CPUs that have irq affinity for the corresponding * vector. */ -int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, +void blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, struct virtio_device *vdev, int first_vec) { const struct cpumask *mask; @@ -39,8 +38,9 @@ int blk_mq_virtio_map_queues(struct blk_mq_queue_map *qmap, qmap->mq_map[cpu] = qmap->queue_offset + queue; } - return 0; + return; + fallback: - return blk_mq_map_queues(qmap); + blk_mq_map_queues(qmap); } EXPORT_SYMBOL_GPL(blk_mq_virtio_map_queues); diff --git a/block/blk-mq.c b/block/blk-mq.c index 4e0d173beaa3..2dc01551e27c 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -10,14 +10,15 @@ #include <linux/backing-dev.h> #include <linux/bio.h> #include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/kmemleak.h> #include <linux/mm.h> #include <linux/init.h> #include <linux/slab.h> #include <linux/workqueue.h> #include <linux/smp.h> +#include <linux/interrupt.h> #include <linux/llist.h> -#include <linux/list_sort.h> #include <linux/cpu.h> #include <linux/cache.h> #include <linux/sched/sysctl.h> @@ -27,39 +28,29 @@ 
#include <linux/crash_dump.h> #include <linux/prefetch.h> #include <linux/blk-crypto.h> +#include <linux/part_stat.h> #include <trace/events/block.h> -#include <linux/blk-mq.h> #include <linux/t10-pi.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" #include "blk-pm.h" #include "blk-stat.h" #include "blk-mq-sched.h" #include "blk-rq-qos.h" -static void blk_mq_poll_stats_start(struct request_queue *q); -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); +static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); +static DEFINE_PER_CPU(call_single_data_t, blk_cpu_csd); -static int blk_mq_poll_stats_bkt(const struct request *rq) -{ - int ddir, sectors, bucket; - - ddir = rq_data_dir(rq); - sectors = blk_rq_stats_sectors(rq); - - bucket = ddir + 2 * ilog2(sectors); - - if (bucket < 0) - return -1; - else if (bucket >= BLK_MQ_POLL_STATS_BKTS) - return ddir + BLK_MQ_POLL_STATS_BKTS - 2; - - return bucket; -} +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags); +static void blk_mq_request_bypass_insert(struct request *rq, + blk_insert_t flags); +static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, + struct list_head *list); +static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags); /* * Check if any of the ctx, dispatch list or elevator @@ -93,23 +84,24 @@ static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, } struct mq_inflight { - struct hd_struct *part; + struct block_device *part; unsigned int inflight[2]; }; -static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, - bool reserved) +static bool blk_mq_check_inflight(struct request *rq, void *priv) { struct mq_inflight *mi = priv; - if (rq->part == mi->part) + if (rq->part && blk_do_io_stat(rq) && + (!mi->part->bd_partno || rq->part == mi->part) && + blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) 
mi->inflight[rq_data_dir(rq)]++; return true; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part) { struct mq_inflight mi = { .part = part }; @@ -118,8 +110,8 @@ unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part) return mi.inflight[0] + mi.inflight[1]; } -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]) +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]) { struct mq_inflight mi = { .part = part }; @@ -184,9 +176,11 @@ void blk_mq_freeze_queue(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); -void blk_mq_unfreeze_queue(struct request_queue *q) +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) { mutex_lock(&q->mq_freeze_lock); + if (force_atomic) + q->q_usage_counter.data->force_atomic = true; q->mq_freeze_depth--; WARN_ON_ONCE(q->mq_freeze_depth < 0); if (!q->mq_freeze_depth) { @@ -195,6 +189,11 @@ void blk_mq_unfreeze_queue(struct request_queue *q) } mutex_unlock(&q->mq_freeze_lock); } + +void blk_mq_unfreeze_queue(struct request_queue *q) +{ + __blk_mq_unfreeze_queue(q, false); +} EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); /* @@ -203,11 +202,34 @@ EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); */ void blk_mq_quiesce_queue_nowait(struct request_queue *q) { - blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + + spin_lock_irqsave(&q->queue_lock, flags); + if (!q->quiesce_depth++) + blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); + spin_unlock_irqrestore(&q->queue_lock, flags); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); /** + * blk_mq_wait_quiesce_done() - wait until in-progress quiesce is done + * @set: tag_set to wait on + * + * Note: it is driver's responsibility for making sure that quiesce has + * been started on or more of the request_queues of the tag_set. 
This + * function only waits for the quiesce on those request_queues that had + * the quiesce flag set using blk_mq_quiesce_queue_nowait. + */ +void blk_mq_wait_quiesce_done(struct blk_mq_tag_set *set) +{ + if (set->flags & BLK_MQ_F_BLOCKING) + synchronize_srcu(set->srcu); + else + synchronize_rcu(); +} +EXPORT_SYMBOL_GPL(blk_mq_wait_quiesce_done); + +/** * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished * @q: request queue. * @@ -218,20 +240,10 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); */ void blk_mq_quiesce_queue(struct request_queue *q) { - struct blk_mq_hw_ctx *hctx; - unsigned int i; - bool rcu = false; - blk_mq_quiesce_queue_nowait(q); - - queue_for_each_hw_ctx(q, hctx, i) { - if (hctx->flags & BLK_MQ_F_BLOCKING) - synchronize_srcu(hctx->srcu); - else - rcu = true; - } - if (rcu) - synchronize_rcu(); + /* nothing to wait for non-mq queues */ + if (queue_is_mq(q)) + blk_mq_wait_quiesce_done(q->tag_set); } EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); @@ -244,114 +256,189 @@ EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); */ void blk_mq_unquiesce_queue(struct request_queue *q) { - blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + unsigned long flags; + bool run_queue = false; + + spin_lock_irqsave(&q->queue_lock, flags); + if (WARN_ON_ONCE(q->quiesce_depth <= 0)) { + ; + } else if (!--q->quiesce_depth) { + blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); + run_queue = true; + } + spin_unlock_irqrestore(&q->queue_lock, flags); /* dispatch requests which are inserted during quiescing */ - blk_mq_run_hw_queues(q, true); + if (run_queue) + blk_mq_run_hw_queues(q, true); } EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); +void blk_mq_quiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_quiesce_queue_nowait(q); + } + blk_mq_wait_quiesce_done(set); + mutex_unlock(&set->tag_list_lock); +} 
+EXPORT_SYMBOL_GPL(blk_mq_quiesce_tagset); + +void blk_mq_unquiesce_tagset(struct blk_mq_tag_set *set) +{ + struct request_queue *q; + + mutex_lock(&set->tag_list_lock); + list_for_each_entry(q, &set->tag_list, tag_set_list) { + if (!blk_queue_skip_tagset_quiesce(q)) + blk_mq_unquiesce_queue(q); + } + mutex_unlock(&set->tag_list_lock); +} +EXPORT_SYMBOL_GPL(blk_mq_unquiesce_tagset); + void blk_mq_wake_waiters(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_wakeup_all(hctx->tags, true); } -/* - * Only need start/end time stamping if we have iostat or - * blk stats enabled, or using an IO scheduler. - */ -static inline bool blk_mq_need_time_stamp(struct request *rq) +void blk_rq_init(struct request_queue *q, struct request *rq) +{ + memset(rq, 0, sizeof(*rq)); + + INIT_LIST_HEAD(&rq->queuelist); + rq->q = q; + rq->__sector = (sector_t) -1; + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + rq->tag = BLK_MQ_NO_TAG; + rq->internal_tag = BLK_MQ_NO_TAG; + rq->start_time_ns = ktime_get_ns(); + rq->part = NULL; + blk_crypto_rq_set_defaults(rq); +} +EXPORT_SYMBOL(blk_rq_init); + +/* Set start and alloc time when the allocated request is actually used */ +static inline void blk_mq_rq_time_init(struct request *rq, u64 alloc_time_ns) { - return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; + if (blk_mq_need_time_stamp(rq)) + rq->start_time_ns = ktime_get_ns(); + else + rq->start_time_ns = 0; + +#ifdef CONFIG_BLK_RQ_ALLOC_TIME + if (blk_queue_rq_alloc_time(rq->q)) + rq->alloc_time_ns = alloc_time_ns ?: rq->start_time_ns; + else + rq->alloc_time_ns = 0; +#endif } static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, - unsigned int tag, u64 alloc_time_ns) + struct blk_mq_tags *tags, unsigned int tag) { - struct blk_mq_tags *tags = blk_mq_tags_from_data(data); + struct blk_mq_ctx *ctx = data->ctx; + struct 
blk_mq_hw_ctx *hctx = data->hctx; + struct request_queue *q = data->q; struct request *rq = tags->static_rqs[tag]; - req_flags_t rq_flags = 0; - if (data->flags & BLK_MQ_REQ_INTERNAL) { + rq->q = q; + rq->mq_ctx = ctx; + rq->mq_hctx = hctx; + rq->cmd_flags = data->cmd_flags; + + if (data->flags & BLK_MQ_REQ_PM) + data->rq_flags |= RQF_PM; + if (blk_queue_io_stat(q)) + data->rq_flags |= RQF_IO_STAT; + rq->rq_flags = data->rq_flags; + + if (data->rq_flags & RQF_SCHED_TAGS) { rq->tag = BLK_MQ_NO_TAG; rq->internal_tag = tag; } else { - if (data->hctx->flags & BLK_MQ_F_TAG_SHARED) { - rq_flags = RQF_MQ_INFLIGHT; - atomic_inc(&data->hctx->nr_active); - } rq->tag = tag; rq->internal_tag = BLK_MQ_NO_TAG; - data->hctx->tags->rqs[rq->tag] = rq; } + rq->timeout = 0; - /* csd/requeue_work/fifo_time is initialized before use */ - rq->q = data->q; - rq->mq_ctx = data->ctx; - rq->mq_hctx = data->hctx; - rq->rq_flags = rq_flags; - rq->cmd_flags = data->cmd_flags; - if (data->flags & BLK_MQ_REQ_PREEMPT) - rq->rq_flags |= RQF_PREEMPT; - if (blk_queue_io_stat(data->q)) - rq->rq_flags |= RQF_IO_STAT; - INIT_LIST_HEAD(&rq->queuelist); - INIT_HLIST_NODE(&rq->hash); - RB_CLEAR_NODE(&rq->rb_node); - rq->rq_disk = NULL; rq->part = NULL; -#ifdef CONFIG_BLK_RQ_ALLOC_TIME - rq->alloc_time_ns = alloc_time_ns; -#endif - if (blk_mq_need_time_stamp(rq)) - rq->start_time_ns = ktime_get_ns(); - else - rq->start_time_ns = 0; rq->io_start_time_ns = 0; rq->stats_sectors = 0; rq->nr_phys_segments = 0; #if defined(CONFIG_BLK_DEV_INTEGRITY) rq->nr_integrity_segments = 0; #endif - blk_crypto_rq_set_defaults(rq); - /* tag was already set */ - WRITE_ONCE(rq->deadline, 0); - - rq->timeout = 0; - rq->end_io = NULL; rq->end_io_data = NULL; - data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; - refcount_set(&rq->ref, 1); + blk_crypto_rq_set_defaults(rq); + INIT_LIST_HEAD(&rq->queuelist); + /* tag was already set */ + WRITE_ONCE(rq->deadline, 0); + req_ref_set(rq, 1); - if (!op_is_flush(data->cmd_flags)) { 
+ if (rq->rq_flags & RQF_USE_SCHED) { struct elevator_queue *e = data->q->elevator; - rq->elv.icq = NULL; - if (e && e->type->ops.prepare_request) { - if (e->type->icq_cache) - blk_mq_sched_assign_ioc(rq); + INIT_HLIST_NODE(&rq->hash); + RB_CLEAR_NODE(&rq->rb_node); + if (e->type->ops.prepare_request) e->type->ops.prepare_request(rq); - rq->rq_flags |= RQF_ELVPRIV; - } } - data->hctx->queued++; return rq; } -static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) +static inline struct request * +__blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data) +{ + unsigned int tag, tag_offset; + struct blk_mq_tags *tags; + struct request *rq; + unsigned long tag_mask; + int i, nr = 0; + + tag_mask = blk_mq_get_tags(data, data->nr_tags, &tag_offset); + if (unlikely(!tag_mask)) + return NULL; + + tags = blk_mq_tags_from_data(data); + for (i = 0; tag_mask; i++) { + if (!(tag_mask & (1UL << i))) + continue; + tag = tag_offset + i; + prefetch(tags->static_rqs[tag]); + tag_mask &= ~(1UL << i); + rq = blk_mq_rq_ctx_init(data, tags, tag); + rq_list_add(data->cached_rq, rq); + nr++; + } + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_add_active_requests(data->hctx, nr); + /* caller already holds a reference, add for remainder */ + percpu_ref_get_many(&data->q->q_usage_counter, nr - 1); + data->nr_tags -= nr; + + return rq_list_pop(data->cached_rq); +} + +static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) { struct request_queue *q = data->q; - struct elevator_queue *e = q->elevator; u64 alloc_time_ns = 0; + struct request *rq; unsigned int tag; /* alloc_time includes depth and tag waits */ @@ -361,26 +448,50 @@ static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) if (data->cmd_flags & REQ_NOWAIT) data->flags |= BLK_MQ_REQ_NOWAIT; - if (e) { - data->flags |= BLK_MQ_REQ_INTERNAL; + if (q->elevator) { + /* + * All requests use scheduler tags when an I/O scheduler is + * enabled for the queue. 
+ */ + data->rq_flags |= RQF_SCHED_TAGS; /* - * Flush requests are special and go directly to the - * dispatch list. Don't include reserved tags in the - * limiting, as it isn't useful. + * Flush/passthrough requests are special and go directly to the + * dispatch list. */ - if (!op_is_flush(data->cmd_flags) && - e->type->ops.limit_depth && - !(data->flags & BLK_MQ_REQ_RESERVED)) - e->type->ops.limit_depth(data->cmd_flags, data); + if ((data->cmd_flags & REQ_OP_MASK) != REQ_OP_FLUSH && + !blk_op_is_passthrough(data->cmd_flags)) { + struct elevator_mq_ops *ops = &q->elevator->type->ops; + + WARN_ON_ONCE(data->flags & BLK_MQ_REQ_RESERVED); + + data->rq_flags |= RQF_USE_SCHED; + if (ops->limit_depth) + ops->limit_depth(data->cmd_flags, data); + } } retry: data->ctx = blk_mq_get_ctx(q); data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); - if (!(data->flags & BLK_MQ_REQ_INTERNAL)) + if (!(data->rq_flags & RQF_SCHED_TAGS)) blk_mq_tag_busy(data->hctx); + if (data->flags & BLK_MQ_REQ_RESERVED) + data->rq_flags |= RQF_RESV; + + /* + * Try batched alloc if we want more than 1 tag. + */ + if (data->nr_tags > 1) { + rq = __blk_mq_alloc_requests_batch(data); + if (rq) { + blk_mq_rq_time_init(rq, alloc_time_ns); + return rq; + } + data->nr_tags = 1; + } + /* * Waiting allocations only fail because of an inactive hctx. In that * case just retry the hctx assignment and tag allocation as CPU hotplug @@ -390,36 +501,106 @@ retry: if (tag == BLK_MQ_NO_TAG) { if (data->flags & BLK_MQ_REQ_NOWAIT) return NULL; - /* - * Give up the CPU and sleep for a random short time to ensure - * that thread using a realtime scheduling class are migrated - * off the the CPU, and thus off the hctx that is going away. + * Give up the CPU and sleep for a random short time to + * ensure that thread using a realtime scheduling class + * are migrated off the CPU, and thus off the hctx that + * is going away. 
*/ msleep(3); goto retry; } - return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); + + if (!(data->rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data->hctx); + rq = blk_mq_rq_ctx_init(data, blk_mq_tags_from_data(data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); + return rq; } -struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, - blk_mq_req_flags_t flags) +static struct request *blk_mq_rq_cache_fill(struct request_queue *q, + struct blk_plug *plug, + blk_opf_t opf, + blk_mq_req_flags_t flags) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, - .cmd_flags = op, + .cmd_flags = opf, + .nr_tags = plug->nr_ios, + .cached_rq = &plug->cached_rq, }; struct request *rq; - int ret; - ret = blk_queue_enter(q, flags); - if (ret) - return ERR_PTR(ret); + if (blk_queue_enter(q, flags)) + return NULL; - rq = __blk_mq_alloc_request(&data); - if (!rq) - goto out_queue_exit; + plug->nr_ios = 1; + + rq = __blk_mq_alloc_requests(&data); + if (unlikely(!rq)) + blk_queue_exit(q); + return rq; +} + +static struct request *blk_mq_alloc_cached_request(struct request_queue *q, + blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct blk_plug *plug = current->plug; + struct request *rq; + + if (!plug) + return NULL; + + if (rq_list_empty(plug->cached_rq)) { + if (plug->nr_ios == 1) + return NULL; + rq = blk_mq_rq_cache_fill(q, plug, opf, flags); + if (!rq) + return NULL; + } else { + rq = rq_list_peek(&plug->cached_rq); + if (!rq || rq->q != q) + return NULL; + + if (blk_mq_get_hctx_type(opf) != rq->mq_hctx->type) + return NULL; + if (op_is_flush(rq->cmd_flags) != op_is_flush(opf)) + return NULL; + + plug->cached_rq = rq_list_next(rq); + blk_mq_rq_time_init(rq, 0); + } + + rq->cmd_flags = opf; + INIT_LIST_HEAD(&rq->queuelist); + return rq; +} + +struct request *blk_mq_alloc_request(struct request_queue *q, blk_opf_t opf, + blk_mq_req_flags_t flags) +{ + struct request *rq; + + rq = blk_mq_alloc_cached_request(q, opf, flags); + if (!rq) { + 
struct blk_mq_alloc_data data = { + .q = q, + .flags = flags, + .cmd_flags = opf, + .nr_tags = 1, + }; + int ret; + + ret = blk_queue_enter(q, flags); + if (ret) + return ERR_PTR(ret); + + rq = __blk_mq_alloc_requests(&data); + if (!rq) + goto out_queue_exit; + } rq->__data_len = 0; rq->__sector = (sector_t) -1; rq->bio = rq->biotail = NULL; @@ -431,14 +612,16 @@ out_queue_exit: EXPORT_SYMBOL(blk_mq_alloc_request); struct request *blk_mq_alloc_request_hctx(struct request_queue *q, - unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) + blk_opf_t opf, blk_mq_req_flags_t flags, unsigned int hctx_idx) { struct blk_mq_alloc_data data = { .q = q, .flags = flags, - .cmd_flags = op, + .cmd_flags = opf, + .nr_tags = 1, }; u64 alloc_time_ns = 0; + struct request *rq; unsigned int cpu; unsigned int tag; int ret; @@ -453,7 +636,8 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * allocator for this for the rare use case of a command tied to * a specific queue. */ - if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) + if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT)) || + WARN_ON_ONCE(!(flags & BLK_MQ_REQ_RESERVED))) return ERR_PTR(-EINVAL); if (hctx_idx >= q->nr_hw_queues) @@ -468,22 +652,34 @@ struct request *blk_mq_alloc_request_hctx(struct request_queue *q, * If not tell the caller that it should skip this queue. 
*/ ret = -EXDEV; - data.hctx = q->queue_hw_ctx[hctx_idx]; + data.hctx = xa_load(&q->hctx_table, hctx_idx); if (!blk_mq_hw_queue_mapped(data.hctx)) goto out_queue_exit; cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); + if (cpu >= nr_cpu_ids) + goto out_queue_exit; data.ctx = __blk_mq_get_ctx(q, cpu); if (q->elevator) - data.flags |= BLK_MQ_REQ_INTERNAL; + data.rq_flags |= RQF_SCHED_TAGS; else blk_mq_tag_busy(data.hctx); + if (flags & BLK_MQ_REQ_RESERVED) + data.rq_flags |= RQF_RESV; + ret = -EWOULDBLOCK; tag = blk_mq_get_tag(&data); if (tag == BLK_MQ_NO_TAG) goto out_queue_exit; - return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); + if (!(data.rq_flags & RQF_SCHED_TAGS)) + blk_mq_inc_active_requests(data.hctx); + rq = blk_mq_rq_ctx_init(&data, blk_mq_tags_from_data(&data), tag); + blk_mq_rq_time_init(rq, alloc_time_ns); + rq->__data_len = 0; + rq->__sector = (sector_t) -1; + rq->bio = rq->biotail = NULL; + return rq; out_queue_exit: blk_queue_exit(q); @@ -491,6 +687,21 @@ out_queue_exit: } EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); +static void blk_mq_finish_request(struct request *rq) +{ + struct request_queue *q = rq->q; + + if (rq->rq_flags & RQF_USE_SCHED) { + q->elevator->type->ops.finish_request(rq); + /* + * For postflush request that may need to be + * completed twice, we should clear this flag + * to avoid double finish_request() on the rq. 
+ */ + rq->rq_flags &= ~RQF_USE_SCHED; + } +} + static void __blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; @@ -501,8 +712,11 @@ static void __blk_mq_free_request(struct request *rq) blk_crypto_free_request(rq); blk_pm_mark_last_busy(rq); rq->mq_hctx = NULL; - if (rq->tag != BLK_MQ_NO_TAG) + + if (rq->tag != BLK_MQ_NO_TAG) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, ctx, rq->tag); + } if (sched_tag != BLK_MQ_NO_TAG) blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); blk_mq_sched_restart(hctx); @@ -512,54 +726,329 @@ static void __blk_mq_free_request(struct request *rq) void blk_mq_free_request(struct request *rq) { struct request_queue *q = rq->q; - struct elevator_queue *e = q->elevator; - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - - if (rq->rq_flags & RQF_ELVPRIV) { - if (e && e->type->ops.finish_request) - e->type->ops.finish_request(rq); - if (rq->elv.icq) { - put_io_context(rq->elv.icq->ioc); - rq->elv.icq = NULL; - } - } - ctx->rq_completed[rq_is_sync(rq)]++; - if (rq->rq_flags & RQF_MQ_INFLIGHT) - atomic_dec(&hctx->nr_active); + blk_mq_finish_request(rq); if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) - laptop_io_completion(q->backing_dev_info); + laptop_io_completion(q->disk->bdi); rq_qos_done(q, rq); WRITE_ONCE(rq->state, MQ_RQ_IDLE); - if (refcount_dec_and_test(&rq->ref)) + if (req_ref_put_and_test(rq)) __blk_mq_free_request(rq); } EXPORT_SYMBOL_GPL(blk_mq_free_request); -inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +void blk_mq_free_plug_rqs(struct blk_plug *plug) { - u64 now = 0; + struct request *rq; - if (blk_mq_need_time_stamp(rq)) - now = ktime_get_ns(); + while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) + blk_mq_free_request(rq); +} - if (rq->rq_flags & RQF_STATS) { - blk_mq_poll_stats_start(rq->q); - blk_stat_add(rq, now); +void blk_dump_rq_flags(struct request *rq, char *msg) +{ + printk(KERN_INFO "%s: dev %s: 
flags=%llx\n", msg, + rq->q->disk ? rq->q->disk->disk_name : "?", + (__force unsigned long long) rq->cmd_flags); + + printk(KERN_INFO " sector %llu, nr/cnr %u/%u\n", + (unsigned long long)blk_rq_pos(rq), + blk_rq_sectors(rq), blk_rq_cur_sectors(rq)); + printk(KERN_INFO " bio %p, biotail %p, len %u\n", + rq->bio, rq->biotail, blk_rq_bytes(rq)); +} +EXPORT_SYMBOL(blk_dump_rq_flags); + +static void req_bio_endio(struct request *rq, struct bio *bio, + unsigned int nbytes, blk_status_t error) +{ + if (unlikely(error)) { + bio->bi_status = error; + } else if (req_op(rq) == REQ_OP_ZONE_APPEND) { + /* + * Partial zone append completions cannot be supported as the + * BIO fragments may end up not being written sequentially. + * For such case, force the completed nbytes to be equal to + * the BIO size so that bio_advance() sets the BIO remaining + * size to 0 and we end up calling bio_endio() before returning. + */ + if (bio->bi_iter.bi_size != nbytes) { + bio->bi_status = BLK_STS_IOERR; + nbytes = bio->bi_iter.bi_size; + } else { + bio->bi_iter.bi_sector = rq->__sector; + } + } + + bio_advance(bio, nbytes); + + if (unlikely(rq->rq_flags & RQF_QUIET)) + bio_set_flag(bio, BIO_QUIET); + /* don't actually finish bio if it's part of flush sequence */ + if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) + bio_endio(bio); +} + +static void blk_account_io_completion(struct request *req, unsigned int bytes) +{ + if (req->part && blk_do_io_stat(req)) { + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + part_stat_add(req->part, sectors[sgrp], bytes >> 9); + part_stat_unlock(); + } +} + +static void blk_print_req_error(struct request *req, blk_status_t status) +{ + printk_ratelimited(KERN_ERR + "%s error, dev %s, sector %llu op 0x%x:(%s) flags 0x%x " + "phys_seg %u prio class %u\n", + blk_status_to_str(status), + req->q->disk ? 
req->q->disk->disk_name : "?", + blk_rq_pos(req), (__force u32)req_op(req), + blk_op_str(req_op(req)), + (__force u32)(req->cmd_flags & ~REQ_OP_MASK), + req->nr_phys_segments, + IOPRIO_PRIO_CLASS(req->ioprio)); +} + +/* + * Fully end IO on a request. Does not support partial completions, or + * errors. + */ +static void blk_complete_request(struct request *req) +{ + const bool is_flush = (req->rq_flags & RQF_FLUSH_SEQ) != 0; + int total_bytes = blk_rq_bytes(req); + struct bio *bio = req->bio; + + trace_block_rq_complete(req, BLK_STS_OK, total_bytes); + + if (!bio) + return; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ) + req->q->integrity.profile->complete_fn(req, total_bytes); +#endif + + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + blk_crypto_rq_put_keyslot(req); + + blk_account_io_completion(req, total_bytes); + + do { + struct bio *next = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + + if (req_op(req) == REQ_OP_ZONE_APPEND) + bio->bi_iter.bi_sector = req->__sector; + + if (!is_flush) + bio_endio(bio); + bio = next; + } while (bio); + + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. + */ + if (!req->end_io) { + req->bio = NULL; + req->__data_len = 0; + } +} + +/** + * blk_update_request - Complete multiple bytes without completing the request + * @req: the request being processed + * @error: block status code + * @nr_bytes: number of bytes to complete for @req + * + * Description: + * Ends I/O on a number of bytes attached to @req, but doesn't complete + * the request structure even if @req doesn't have leftover. + * If @req has leftover, sets it up for the next range of segments. 
+ * + * Passing the result of blk_rq_bytes() as @nr_bytes guarantees + * %false return from this function. + * + * Note: + * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function + * except in the consistency check at the end of this function. + * + * Return: + * %false - this request doesn't have any more data + * %true - this request has more data + **/ +bool blk_update_request(struct request *req, blk_status_t error, + unsigned int nr_bytes) +{ + int total_bytes; + + trace_block_rq_complete(req, error, nr_bytes); + + if (!req->bio) + return false; + +#ifdef CONFIG_BLK_DEV_INTEGRITY + if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && + error == BLK_STS_OK) + req->q->integrity.profile->complete_fn(req, nr_bytes); +#endif + + /* + * Upper layers may call blk_crypto_evict_key() anytime after the last + * bio_endio(). Therefore, the keyslot must be released before that. + */ + if (blk_crypto_rq_has_keyslot(req) && nr_bytes >= blk_rq_bytes(req)) + __blk_crypto_rq_put_keyslot(req); + + if (unlikely(error && !blk_rq_is_passthrough(req) && + !(req->rq_flags & RQF_QUIET)) && + !test_bit(GD_DEAD, &req->q->disk->state)) { + blk_print_req_error(req, error); + trace_block_rq_error(req, error, nr_bytes); + } + + blk_account_io_completion(req, nr_bytes); + + total_bytes = 0; + while (req->bio) { + struct bio *bio = req->bio; + unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); + + if (bio_bytes == bio->bi_iter.bi_size) + req->bio = bio->bi_next; + + /* Completion has already been traced */ + bio_clear_flag(bio, BIO_TRACE_COMPLETION); + req_bio_endio(req, bio, bio_bytes, error); + + total_bytes += bio_bytes; + nr_bytes -= bio_bytes; + + if (!nr_bytes) + break; + } + + /* + * completely done + */ + if (!req->bio) { + /* + * Reset counters so that the request stacking driver + * can find how many bytes remain in the request + * later. 
+ */ + req->__data_len = 0; + return false; + } + + req->__data_len -= total_bytes; + + /* update sector only for requests with clear definition of sector */ + if (!blk_rq_is_passthrough(req)) + req->__sector += total_bytes >> 9; + + /* mixed attributes always follow the first bio */ + if (req->rq_flags & RQF_MIXED_MERGE) { + req->cmd_flags &= ~REQ_FAILFAST_MASK; + req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; + } + + if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { + /* + * If total number of sectors is less than the first segment + * size, something has gone terribly wrong. + */ + if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { + blk_dump_rq_flags(req, "request botched"); + req->__data_len = blk_rq_cur_bytes(req); + } + + /* recalculate the number of segments */ + req->nr_phys_segments = blk_recalc_rq_segments(req); + } + + return true; +} +EXPORT_SYMBOL_GPL(blk_update_request); + +static inline void blk_account_io_done(struct request *req, u64 now) +{ + trace_block_io_done(req); + + /* + * Account IO completion. flush_rq isn't accounted as a + * normal IO on queueing nor completion. Accounting the + * containing request is enough. + */ + if (blk_do_io_stat(req) && req->part && + !(req->rq_flags & RQF_FLUSH_SEQ)) { + const int sgrp = op_stat_group(req_op(req)); + + part_stat_lock(); + update_io_ticks(req->part, jiffies, true); + part_stat_inc(req->part, ios[sgrp]); + part_stat_add(req->part, nsecs[sgrp], now - req->start_time_ns); + part_stat_unlock(); + } +} + +static inline void blk_account_io_start(struct request *req) +{ + trace_block_io_start(req); + + if (blk_do_io_stat(req)) { + /* + * All non-passthrough requests are created from a bio with one + * exception: when a flush command that is part of a flush sequence + * generated by the state machine in blk-flush.c is cloned onto the + * lower device by dm-multipath we can get here without a bio. 
+ */ + if (req->bio) + req->part = req->bio->bi_bdev; + else + req->part = req->q->disk->part0; + + part_stat_lock(); + update_io_ticks(req->part, jiffies, false); + part_stat_unlock(); } +} - if (rq->internal_tag != BLK_MQ_NO_TAG) - blk_mq_sched_completed_request(rq, now); +static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) +{ + if (rq->rq_flags & RQF_STATS) + blk_stat_add(rq, now); + blk_mq_sched_completed_request(rq, now); blk_account_io_done(rq, now); +} + +inline void __blk_mq_end_request(struct request *rq, blk_status_t error) +{ + if (blk_mq_need_time_stamp(rq)) + __blk_mq_end_request_acct(rq, ktime_get_ns()); + + blk_mq_finish_request(rq); if (rq->end_io) { rq_qos_done(rq->q, rq); - rq->end_io(rq, error); + if (rq->end_io(rq, error) == RQ_END_IO_FREE) + blk_mq_free_request(rq); } else { blk_mq_free_request(rq); } @@ -574,106 +1063,178 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); -static void __blk_mq_complete_request_remote(void *data) +#define TAG_COMP_BATCH 32 + +static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, + int *tag_array, int nr_tags) { - struct request *rq = data; - struct request_queue *q = rq->q; + struct request_queue *q = hctx->queue; + + blk_mq_sub_active_requests(hctx, nr_tags); - q->mq_ops->complete(rq); + blk_mq_put_tags(hctx->tags, tag_array, nr_tags); + percpu_ref_put_many(&q->q_usage_counter, nr_tags); } -/** - * blk_mq_force_complete_rq() - Force complete the request, bypassing any error - * injection that could drop the completion. - * @rq: Request to be force completed - * - * Drivers should use blk_mq_complete_request() to complete requests in their - * normal IO path. For timeout error recovery, drivers may call this forced - * completion routine after they've reclaimed timed out requests to bypass - * potentially subsequent fake timeouts. 
- */ -void blk_mq_force_complete_rq(struct request *rq) +void blk_mq_end_request_batch(struct io_comp_batch *iob) { - struct blk_mq_ctx *ctx = rq->mq_ctx; - struct request_queue *q = rq->q; - bool shared = false; - int cpu; + int tags[TAG_COMP_BATCH], nr_tags = 0; + struct blk_mq_hw_ctx *cur_hctx = NULL; + struct request *rq; + u64 now = 0; - WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); - /* - * Most of single queue controllers, there is only one irq vector - * for handling IO completion, and the only irq's affinity is set - * as all possible CPUs. On most of ARCHs, this affinity means the - * irq is handled on one specific CPU. - * - * So complete IO reqeust in softirq context in case of single queue - * for not degrading IO performance by irqsoff latency. - */ - if (q->nr_hw_queues == 1) { - __blk_complete_request(rq); - return; + if (iob->need_ts) + now = ktime_get_ns(); + + while ((rq = rq_list_pop(&iob->req_list)) != NULL) { + prefetch(rq->bio); + prefetch(rq->rq_next); + + blk_complete_request(rq); + if (iob->need_ts) + __blk_mq_end_request_acct(rq, now); + + blk_mq_finish_request(rq); + + rq_qos_done(rq->q, rq); + + /* + * If end_io handler returns NONE, then it still has + * ownership of the request. 
+ */ + if (rq->end_io && rq->end_io(rq, 0) == RQ_END_IO_NONE) + continue; + + WRITE_ONCE(rq->state, MQ_RQ_IDLE); + if (!req_ref_put_and_test(rq)) + continue; + + blk_crypto_free_request(rq); + blk_pm_mark_last_busy(rq); + + if (nr_tags == TAG_COMP_BATCH || cur_hctx != rq->mq_hctx) { + if (cur_hctx) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); + nr_tags = 0; + cur_hctx = rq->mq_hctx; + } + tags[nr_tags++] = rq->tag; } + if (nr_tags) + blk_mq_flush_tag_batch(cur_hctx, tags, nr_tags); +} +EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); + +static void blk_complete_reqs(struct llist_head *list) +{ + struct llist_node *entry = llist_reverse_order(llist_del_all(list)); + struct request *rq, *next; + + llist_for_each_entry_safe(rq, next, entry, ipi_list) + rq->q->mq_ops->complete(rq); +} + +static __latent_entropy void blk_done_softirq(struct softirq_action *h) +{ + blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); +} + +static int blk_softirq_cpu_dead(unsigned int cpu) +{ + blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); + return 0; +} + +static void __blk_mq_complete_request_remote(void *data) +{ + __raise_softirq_irqoff(BLOCK_SOFTIRQ); +} + +static inline bool blk_mq_complete_need_ipi(struct request *rq) +{ + int cpu = raw_smp_processor_id(); + + if (!IS_ENABLED(CONFIG_SMP) || + !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) + return false; /* - * For a polled request, always complete locallly, it's pointless - * to redirect the completion. + * With force threaded interrupts enabled, raising softirq from an SMP + * function call will always result in waking the ksoftirqd thread. + * This is probably worse than completing the request on a different + * cache domain. 
*/ - if ((rq->cmd_flags & REQ_HIPRI) || - !test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags)) { - q->mq_ops->complete(rq); - return; - } + if (force_irqthreads()) + return false; - cpu = get_cpu(); - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ctx->cpu); + /* same CPU or cache domain? Complete locally */ + if (cpu == rq->mq_ctx->cpu || + (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && + cpus_share_cache(cpu, rq->mq_ctx->cpu))) + return false; - if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { - rq->csd.func = __blk_mq_complete_request_remote; - rq->csd.info = rq; - rq->csd.flags = 0; - smp_call_function_single_async(ctx->cpu, &rq->csd); - } else { - q->mq_ops->complete(rq); - } - put_cpu(); + /* don't try to IPI to an offline CPU */ + return cpu_online(rq->mq_ctx->cpu); } -EXPORT_SYMBOL_GPL(blk_mq_force_complete_rq); -static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) - __releases(hctx->srcu) +static void blk_mq_complete_send_ipi(struct request *rq) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) - rcu_read_unlock(); - else - srcu_read_unlock(hctx->srcu, srcu_idx); + unsigned int cpu; + + cpu = rq->mq_ctx->cpu; + if (llist_add(&rq->ipi_list, &per_cpu(blk_cpu_done, cpu))) + smp_call_function_single_async(cpu, &per_cpu(blk_cpu_csd, cpu)); +} + +static void blk_mq_raise_softirq(struct request *rq) +{ + struct llist_head *list; + + preempt_disable(); + list = this_cpu_ptr(&blk_cpu_done); + if (llist_add(&rq->ipi_list, list)) + raise_softirq(BLOCK_SOFTIRQ); + preempt_enable(); } -static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) - __acquires(hctx->srcu) +bool blk_mq_complete_request_remote(struct request *rq) { - if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { - /* shut up gcc false positive */ - *srcu_idx = 0; - rcu_read_lock(); - } else - *srcu_idx = srcu_read_lock(hctx->srcu); + WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); + + /* + * For request which hctx has only one ctx mapping, + * or a 
polled request, always complete locally, + * it's pointless to redirect the completion. + */ + if ((rq->mq_hctx->nr_ctx == 1 && + rq->mq_ctx->cpu == raw_smp_processor_id()) || + rq->cmd_flags & REQ_POLLED) + return false; + + if (blk_mq_complete_need_ipi(rq)) { + blk_mq_complete_send_ipi(rq); + return true; + } + + if (rq->q->nr_hw_queues == 1) { + blk_mq_raise_softirq(rq); + return true; + } + return false; } +EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); /** * blk_mq_complete_request - end I/O on a request * @rq: the request being processed * * Description: - * Ends all I/O on a request. It does not handle partial completions. - * The actual completion happens out-of-order, through a IPI handler. + * Complete a request by scheduling the ->complete_rq operation. **/ -bool blk_mq_complete_request(struct request *rq) +void blk_mq_complete_request(struct request *rq) { - if (unlikely(blk_should_fake_timeout(rq->q))) - return false; - blk_mq_force_complete_rq(rq); - return true; + if (!blk_mq_complete_request_remote(rq)) + rq->q->mq_ops->complete(rq); } EXPORT_SYMBOL(blk_mq_complete_request); @@ -689,9 +1250,10 @@ void blk_mq_start_request(struct request *rq) { struct request_queue *q = rq->q; - trace_block_rq_issue(q, rq); + trace_block_rq_issue(rq); - if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { + if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags) && + !blk_rq_is_passthrough(rq)) { rq->io_start_time_ns = ktime_get_ns(); rq->stats_sectors = blk_rq_sectors(rq); rq->rq_flags |= RQF_STATS; @@ -702,21 +1264,179 @@ void blk_mq_start_request(struct request *rq) blk_add_timer(rq); WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); + rq->mq_hctx->tags->rqs[rq->tag] = rq; #ifdef CONFIG_BLK_DEV_INTEGRITY if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) q->integrity.profile->prepare_fn(rq); #endif + if (rq->bio && rq->bio->bi_opf & REQ_POLLED) + WRITE_ONCE(rq->bio->bi_cookie, rq->mq_hctx->queue_num); } EXPORT_SYMBOL(blk_mq_start_request); +/* + * Allow 2x 
BLK_MAX_REQUEST_COUNT requests on plug queue for multiple + * queues. This is important for md arrays to benefit from merging + * requests. + */ +static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) +{ + if (plug->multiple_queues) + return BLK_MAX_REQUEST_COUNT * 2; + return BLK_MAX_REQUEST_COUNT; +} + +static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +{ + struct request *last = rq_list_peek(&plug->mq_list); + + if (!plug->rq_count) { + trace_block_plug(rq->q); + } else if (plug->rq_count >= blk_plug_max_rq_count(plug) || + (!blk_queue_nomerges(rq->q) && + blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { + blk_mq_flush_plug_list(plug, false); + last = NULL; + trace_block_plug(rq->q); + } + + if (!plug->multiple_queues && last && last->q != rq->q) + plug->multiple_queues = true; + /* + * Any request allocated from sched tags can't be issued to + * ->queue_rqs() directly + */ + if (!plug->has_elevator && (rq->rq_flags & RQF_SCHED_TAGS)) + plug->has_elevator = true; + rq->rq_next = NULL; + rq_list_add(&plug->mq_list, rq); + plug->rq_count++; +} + +/** + * blk_execute_rq_nowait - insert a request to I/O scheduler for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution. Don't wait for completion. + * + * Note: + * This function will invoke @done directly if the queue is dead. + */ +void blk_execute_rq_nowait(struct request *rq, bool at_head) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + blk_account_io_start(rq); + + /* + * As plugging can be enabled for passthrough requests on a zoned + * device, directly accessing the plug instead of using blk_mq_plug() + * should not have any consequences. 
+ */ + if (current->plug && !at_head) { + blk_add_rq_to_plug(current->plug, rq); + return; + } + + blk_mq_insert_request(rq, at_head ? BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); +} +EXPORT_SYMBOL_GPL(blk_execute_rq_nowait); + +struct blk_rq_wait { + struct completion done; + blk_status_t ret; +}; + +static enum rq_end_io_ret blk_end_sync_rq(struct request *rq, blk_status_t ret) +{ + struct blk_rq_wait *wait = rq->end_io_data; + + wait->ret = ret; + complete(&wait->done); + return RQ_END_IO_NONE; +} + +bool blk_rq_is_poll(struct request *rq) +{ + if (!rq->mq_hctx) + return false; + if (rq->mq_hctx->type != HCTX_TYPE_POLL) + return false; + return true; +} +EXPORT_SYMBOL_GPL(blk_rq_is_poll); + +static void blk_rq_poll_completion(struct request *rq, struct completion *wait) +{ + do { + blk_hctx_poll(rq->q, rq->mq_hctx, NULL, 0); + cond_resched(); + } while (!completion_done(wait)); +} + +/** + * blk_execute_rq - insert a request into queue for execution + * @rq: request to insert + * @at_head: insert request at head or tail of queue + * + * Description: + * Insert a fully prepared request at the back of the I/O scheduler queue + * for execution and wait for completion. + * Return: The blk_status_t result provided to blk_mq_end_request(). + */ +blk_status_t blk_execute_rq(struct request *rq, bool at_head) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + struct blk_rq_wait wait = { + .done = COMPLETION_INITIALIZER_ONSTACK(wait.done), + }; + + WARN_ON(irqs_disabled()); + WARN_ON(!blk_rq_is_passthrough(rq)); + + rq->end_io_data = &wait; + rq->end_io = blk_end_sync_rq; + + blk_account_io_start(rq); + blk_mq_insert_request(rq, at_head ? 
BLK_MQ_INSERT_AT_HEAD : 0); + blk_mq_run_hw_queue(hctx, false); + + if (blk_rq_is_poll(rq)) { + blk_rq_poll_completion(rq, &wait.done); + } else { + /* + * Prevent hang_check timer from firing at us during very long + * I/O + */ + unsigned long hang_check = sysctl_hung_task_timeout_secs; + + if (hang_check) + while (!wait_for_completion_io_timeout(&wait.done, + hang_check * (HZ/2))) + ; + else + wait_for_completion_io(&wait.done); + } + + return wait.ret; +} +EXPORT_SYMBOL(blk_execute_rq); + static void __blk_mq_requeue_request(struct request *rq) { struct request_queue *q = rq->q; blk_mq_put_driver_tag(rq); - trace_block_rq_requeue(q, rq); + trace_block_rq_requeue(rq); rq_qos_requeue(q, rq); if (blk_mq_request_started(rq)) { @@ -727,13 +1447,20 @@ static void __blk_mq_requeue_request(struct request *rq) void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) { + struct request_queue *q = rq->q; + unsigned long flags; + __blk_mq_requeue_request(rq); /* this request will be re-inserted to io scheduler queue */ blk_mq_sched_requeue_request(rq); - BUG_ON(!list_empty(&rq->queuelist)); - blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); + spin_lock_irqsave(&q->requeue_lock, flags); + list_add_tail(&rq->queuelist, &q->requeue_list); + spin_unlock_irqrestore(&q->requeue_lock, flags); + + if (kick_requeue_list) + blk_mq_kick_requeue_list(q); } EXPORT_SYMBOL(blk_mq_requeue_request); @@ -742,63 +1469,40 @@ static void blk_mq_requeue_work(struct work_struct *work) struct request_queue *q = container_of(work, struct request_queue, requeue_work.work); LIST_HEAD(rq_list); - struct request *rq, *next; + LIST_HEAD(flush_list); + struct request *rq; spin_lock_irq(&q->requeue_lock); list_splice_init(&q->requeue_list, &rq_list); + list_splice_init(&q->flush_list, &flush_list); spin_unlock_irq(&q->requeue_lock); - list_for_each_entry_safe(rq, next, &rq_list, queuelist) { - if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) - continue; - - rq->rq_flags &= 
~RQF_SOFTBARRIER; - list_del_init(&rq->queuelist); + while (!list_empty(&rq_list)) { + rq = list_entry(rq_list.next, struct request, queuelist); /* - * If RQF_DONTPREP, rq has contained some driver specific - * data, so insert it to hctx dispatch list to avoid any - * merge. + * If RQF_DONTPREP ist set, the request has been started by the + * driver already and might have driver-specific data allocated + * already. Insert it into the hctx dispatch list to avoid + * block layer merges for the request. */ - if (rq->rq_flags & RQF_DONTPREP) - blk_mq_request_bypass_insert(rq, false, false); - else - blk_mq_sched_insert_request(rq, true, false, false); + if (rq->rq_flags & RQF_DONTPREP) { + list_del_init(&rq->queuelist); + blk_mq_request_bypass_insert(rq, 0); + } else { + list_del_init(&rq->queuelist); + blk_mq_insert_request(rq, BLK_MQ_INSERT_AT_HEAD); + } } - while (!list_empty(&rq_list)) { - rq = list_entry(rq_list.next, struct request, queuelist); + while (!list_empty(&flush_list)) { + rq = list_entry(flush_list.next, struct request, queuelist); list_del_init(&rq->queuelist); - blk_mq_sched_insert_request(rq, false, false, false); + blk_mq_insert_request(rq, 0); } blk_mq_run_hw_queues(q, false); } -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list) -{ - struct request_queue *q = rq->q; - unsigned long flags; - - /* - * We abuse this flag that is otherwise used by the I/O scheduler to - * request head insertion from the workqueue. 
- */ - BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); - - spin_lock_irqsave(&q->requeue_lock, flags); - if (at_head) { - rq->rq_flags |= RQF_SOFTBARRIER; - list_add(&rq->queuelist, &q->requeue_list); - } else { - list_add_tail(&rq->queuelist, &q->requeue_list); - } - spin_unlock_irqrestore(&q->requeue_lock, flags); - - if (kick_requeue_list) - blk_mq_kick_requeue_list(q); -} - void blk_mq_kick_requeue_list(struct request_queue *q) { kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); @@ -813,25 +1517,26 @@ void blk_mq_delay_kick_requeue_list(struct request_queue *q, } EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); -struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) +static bool blk_is_flush_data_rq(struct request *rq) { - if (tag < tags->nr_tags) { - prefetch(tags->rqs[tag]); - return tags->rqs[tag]; - } - - return NULL; + return (rq->rq_flags & RQF_FLUSH_SEQ) && !is_flush_rq(rq); } -EXPORT_SYMBOL(blk_mq_tag_to_rq); -static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, - void *priv, bool reserved) +static bool blk_mq_rq_inflight(struct request *rq, void *priv) { /* - * If we find a request that isn't idle and the queue matches, - * we know the queue is busy. Return false to stop the iteration. + * If we find a request that isn't idle we know the queue is busy + * as it's checked in the iter. + * Return false to stop the iteration. 
+ * + * In case of queue quiesce, if one flush data request is completed, + * don't count it as inflight given the flush sequence is suspended, + * and the original flush data request is invisible to driver, just + * like other pending requests because of quiesce */ - if (blk_mq_request_started(rq) && rq->q == hctx->queue) { + if (blk_mq_request_started(rq) && !(blk_queue_quiesced(rq->q) && + blk_is_flush_data_rq(rq) && + blk_mq_request_completed(rq))) { bool *busy = priv; *busy = true; @@ -850,13 +1555,13 @@ bool blk_mq_queue_inflight(struct request_queue *q) } EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); -static void blk_mq_rq_timed_out(struct request *req, bool reserved) +static void blk_mq_rq_timed_out(struct request *req) { req->rq_flags |= RQF_TIMED_OUT; if (req->q->mq_ops->timeout) { enum blk_eh_timer_return ret; - ret = req->q->mq_ops->timeout(req, reserved); + ret = req->q->mq_ops->timeout(req); if (ret == BLK_EH_DONE) return; WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); @@ -865,7 +1570,13 @@ static void blk_mq_rq_timed_out(struct request *req, bool reserved) blk_add_timer(req); } -static bool blk_mq_req_expired(struct request *rq, unsigned long *next) +struct blk_expired_data { + bool has_timedout_rq; + unsigned long next; + unsigned long timeout_start; +}; + +static bool blk_mq_req_expired(struct request *rq, struct blk_expired_data *expired) { unsigned long deadline; @@ -875,54 +1586,50 @@ static bool blk_mq_req_expired(struct request *rq, unsigned long *next) return false; deadline = READ_ONCE(rq->deadline); - if (time_after_eq(jiffies, deadline)) + if (time_after_eq(expired->timeout_start, deadline)) return true; - if (*next == 0) - *next = deadline; - else if (time_after(*next, deadline)) - *next = deadline; + if (expired->next == 0) + expired->next = deadline; + else if (time_after(expired->next, deadline)) + expired->next = deadline; return false; } -static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, - struct request *rq, void *priv, bool 
reserved) +void blk_mq_put_rq_ref(struct request *rq) { - unsigned long *next = priv; - - /* - * Just do a quick check if it is expired before locking the request in - * so we're not unnecessarilly synchronizing across CPUs. - */ - if (!blk_mq_req_expired(rq, next)) - return true; + if (is_flush_rq(rq)) { + if (rq->end_io(rq, 0) == RQ_END_IO_FREE) + blk_mq_free_request(rq); + } else if (req_ref_put_and_test(rq)) { + __blk_mq_free_request(rq); + } +} - /* - * We have reason to believe the request may be expired. Take a - * reference on the request to lock this request lifetime into its - * currently allocated context to prevent it from being reallocated in - * the event the completion by-passes this timeout handler. - * - * If the reference was already released, then the driver beat the - * timeout handler to posting a natural completion. - */ - if (!refcount_inc_not_zero(&rq->ref)) - return true; +static bool blk_mq_check_expired(struct request *rq, void *priv) +{ + struct blk_expired_data *expired = priv; /* - * The request is now locked and cannot be reallocated underneath the - * timeout handler's processing. Re-verify this exact request is truly - * expired; if it is not expired, then the request was completed and - * reallocated as a new request. + * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot + * be reallocated underneath the timeout handler's processing, then + * the expire check is reliable. If the request is not expired, then + * it was completed and reallocated as a new request after returning + * from blk_mq_check_expired(). 
*/ - if (blk_mq_req_expired(rq, next)) - blk_mq_rq_timed_out(rq, reserved); + if (blk_mq_req_expired(rq, expired)) { + expired->has_timedout_rq = true; + return false; + } + return true; +} - if (is_flush_rq(rq, hctx)) - rq->end_io(rq, 0); - else if (refcount_dec_and_test(&rq->ref)) - __blk_mq_free_request(rq); +static bool blk_mq_handle_expired(struct request *rq, void *priv) +{ + struct blk_expired_data *expired = priv; + if (blk_mq_req_expired(rq, expired)) + blk_mq_rq_timed_out(rq); return true; } @@ -930,9 +1637,11 @@ static void blk_mq_timeout_work(struct work_struct *work) { struct request_queue *q = container_of(work, struct request_queue, timeout_work); - unsigned long next = 0; + struct blk_expired_data expired = { + .timeout_start = jiffies, + }; struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; /* A deadlock might occur if a request is stuck requiring a * timeout at the same time a queue freeze is waiting @@ -950,10 +1659,23 @@ static void blk_mq_timeout_work(struct work_struct *work) if (!percpu_ref_tryget(&q->q_usage_counter)) return; - blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); + /* check if there is any timed-out request */ + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &expired); + if (expired.has_timedout_rq) { + /* + * Before walking tags, we must ensure any submit started + * before the current time has finished. Since the submit + * uses srcu or rcu, wait for a synchronization point to + * ensure all running submits have finished + */ + blk_mq_wait_quiesce_done(q->tag_set); + + expired.next = 0; + blk_mq_queue_tag_busy_iter(q, blk_mq_handle_expired, &expired); + } - if (next != 0) { - mod_timer(&q->timeout, next); + if (expired.next != 0) { + mod_timer(&q->timeout, expired.next); } else { /* * Request timeouts are handled as a forward rolling timer. 
If @@ -1044,12 +1766,29 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, return data.rq; } -static inline unsigned int queued_to_index(unsigned int queued) +bool __blk_mq_alloc_driver_tag(struct request *rq) { - if (!queued) - return 0; + struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; + unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; + int tag; + + blk_mq_tag_busy(rq->mq_hctx); + + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { + bt = &rq->mq_hctx->tags->breserved_tags; + tag_offset = 0; + } else { + if (!hctx_may_queue(rq->mq_hctx, bt)) + return false; + } + + tag = __sbitmap_queue_get(bt); + if (tag == BLK_MQ_NO_TAG) + return false; - return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); + rq->tag = tag + tag_offset; + blk_mq_inc_active_requests(rq->mq_hctx); + return true; } static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, @@ -1082,12 +1821,13 @@ static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, struct request *rq) { - struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; + struct sbitmap_queue *sbq; struct wait_queue_head *wq; wait_queue_entry_t *wait; bool ret; - if (!(hctx->flags & BLK_MQ_F_TAG_SHARED)) { + if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && + !(blk_mq_is_shared_tags(hctx->flags))) { blk_mq_sched_mark_restart_hctx(hctx); /* @@ -1105,6 +1845,10 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, if (!list_empty_careful(&wait->entry)) return false; + if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) + sbq = &hctx->tags->breserved_tags; + else + sbq = &hctx->tags->bitmap_tags; wq = &bt_wait_ptr(sbq, hctx)->wait; spin_lock_irq(&wq->lock); @@ -1120,6 +1864,22 @@ static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, __add_wait_queue(wq, wait); /* + * Add one explicit barrier since blk_mq_get_driver_tag() may + * not imply 
barrier in case of failure. + * + * Order adding us to wait queue and allocating driver tag. + * + * The pair is the one implied in sbitmap_queue_wake_up() which + * orders clearing sbitmap tag bits and waitqueue_active() in + * __sbitmap_queue_wake_up(), since waitqueue_active() is lockless + * + * Otherwise, re-order of adding wait queue and getting driver tag + * may cause __sbitmap_queue_wake_up() to wake up nothing because + * the waitqueue_active() may not observe us in wait queue. + */ + smp_mb(); + + /* * It's possible that a tag was freed in the window between the * allocation failure and adding the hardware queue to the wait * queue. @@ -1156,9 +1916,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) { unsigned int ewma; - if (hctx->queue->elevator) - return; - ewma = hctx->dispatch_busy; if (!ewma && !busy) @@ -1177,16 +1934,6 @@ static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) static void blk_mq_handle_dev_resource(struct request *rq, struct list_head *list) { - struct request *next = - list_first_entry_or_null(list, struct request, queuelist); - - /* - * If an I/O scheduler has been configured and we got a driver tag for - * the next request already, free it. 
- */ - if (next) - blk_mq_put_driver_tag(next); - list_add(&rq->queuelist, list); __blk_mq_requeue_request(rq); } @@ -1204,105 +1951,155 @@ static void blk_mq_handle_zone_resource(struct request *rq, __blk_mq_requeue_request(rq); } +enum prep_dispatch { + PREP_DISPATCH_OK, + PREP_DISPATCH_NO_TAG, + PREP_DISPATCH_NO_BUDGET, +}; + +static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, + bool need_budget) +{ + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; + int budget_token = -1; + + if (need_budget) { + budget_token = blk_mq_get_dispatch_budget(rq->q); + if (budget_token < 0) { + blk_mq_put_driver_tag(rq); + return PREP_DISPATCH_NO_BUDGET; + } + blk_mq_set_rq_budget_token(rq, budget_token); + } + + if (!blk_mq_get_driver_tag(rq)) { + /* + * The initial allocation attempt failed, so we need to + * rerun the hardware queue when a tag is freed. The + * waitqueue takes care of that. If the queue is run + * before we add this entry back on the dispatch list, + * we'll re-run it below. + */ + if (!blk_mq_mark_tag_wait(hctx, rq)) { + /* + * All budgets not got from this function will be put + * together during handling partial dispatch + */ + if (need_budget) + blk_mq_put_dispatch_budget(rq->q, budget_token); + return PREP_DISPATCH_NO_TAG; + } + } + + return PREP_DISPATCH_OK; +} + +/* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ +static void blk_mq_release_budgets(struct request_queue *q, + struct list_head *list) +{ + struct request *rq; + + list_for_each_entry(rq, list, queuelist) { + int budget_token = blk_mq_get_rq_budget_token(rq); + + if (budget_token >= 0) + blk_mq_put_dispatch_budget(q, budget_token); + } +} + +/* + * blk_mq_commit_rqs will notify driver using bd->last that there is no + * more requests. 
(See comment in struct blk_mq_ops for commit_rqs for + * details) + * Attention, we should explicitly call this in unusual cases: + * 1) did not queue everything initially scheduled to queue + * 2) the last attempt to queue a request failed + */ +static void blk_mq_commit_rqs(struct blk_mq_hw_ctx *hctx, int queued, + bool from_schedule) +{ + if (hctx->queue->mq_ops->commit_rqs && queued) { + trace_block_unplug(hctx->queue, queued, !from_schedule); + hctx->queue->mq_ops->commit_rqs(hctx); + } +} + /* * Returns true if we did some work AND can potentially do more. */ -bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, - bool got_budget) +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, + unsigned int nr_budgets) { - struct blk_mq_hw_ctx *hctx; - struct request *rq, *nxt; - bool no_tag = false; - int errors, queued; + enum prep_dispatch prep; + struct request_queue *q = hctx->queue; + struct request *rq; + int queued; blk_status_t ret = BLK_STS_OK; - bool no_budget_avail = false; LIST_HEAD(zone_list); + bool needs_resource = false; if (list_empty(list)) return false; - WARN_ON(!list_is_singular(list) && got_budget); - /* * Now process all the entries, sending them to the driver. */ - errors = queued = 0; + queued = 0; do { struct blk_mq_queue_data bd; rq = list_first_entry(list, struct request, queuelist); - hctx = rq->mq_hctx; - if (!got_budget && !blk_mq_get_dispatch_budget(hctx)) { - blk_mq_put_driver_tag(rq); - no_budget_avail = true; + WARN_ON_ONCE(hctx != rq->mq_hctx); + prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); + if (prep != PREP_DISPATCH_OK) break; - } - - if (!blk_mq_get_driver_tag(rq)) { - /* - * The initial allocation attempt failed, so we need to - * rerun the hardware queue when a tag is freed. The - * waitqueue takes care of that. If the queue is run - * before we add this entry back on the dispatch list, - * we'll re-run it below. 
- */ - if (!blk_mq_mark_tag_wait(hctx, rq)) { - blk_mq_put_dispatch_budget(hctx); - /* - * For non-shared tags, the RESTART check - * will suffice. - */ - if (hctx->flags & BLK_MQ_F_TAG_SHARED) - no_tag = true; - break; - } - } list_del_init(&rq->queuelist); bd.rq = rq; + bd.last = list_empty(list); /* - * Flag last if we have no more requests, or if we have more - * but can't assign a driver tag to it. + * once the request is queued to lld, no need to cover the + * budget any more */ - if (list_empty(list)) - bd.last = true; - else { - nxt = list_first_entry(list, struct request, queuelist); - bd.last = !blk_mq_get_driver_tag(nxt); - } - + if (nr_budgets) + nr_budgets--; ret = q->mq_ops->queue_rq(hctx, &bd); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) { - blk_mq_handle_dev_resource(rq, list); + switch (ret) { + case BLK_STS_OK: + queued++; break; - } else if (ret == BLK_STS_ZONE_RESOURCE) { + case BLK_STS_RESOURCE: + needs_resource = true; + fallthrough; + case BLK_STS_DEV_RESOURCE: + blk_mq_handle_dev_resource(rq, list); + goto out; + case BLK_STS_ZONE_RESOURCE: /* * Move the request to zone_list and keep going through * the dispatch list to find more requests the drive can * accept. */ blk_mq_handle_zone_resource(rq, &zone_list); - if (list_empty(list)) - break; - continue; - } - - if (unlikely(ret != BLK_STS_OK)) { - errors++; - blk_mq_end_request(rq, BLK_STS_IOERR); - continue; + needs_resource = true; + break; + default: + blk_mq_end_request(rq, ret); } - - queued++; } while (!list_empty(list)); - +out: if (!list_empty(&zone_list)) list_splice_tail_init(&zone_list, list); - hctx->dispatched[queued_to_index(queued)]++; + /* If we didn't flush the entire list, we could have told the driver + * there was more coming, but that turned out to be a lie. + */ + if (!list_empty(list) || ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); /* * Any items that need requeuing? 
Stuff them into hctx->dispatch, @@ -1310,20 +2107,28 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, */ if (!list_empty(list)) { bool needs_restart; + /* For non-shared tags, the RESTART check will suffice */ + bool no_tag = prep == PREP_DISPATCH_NO_TAG && + ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) || + blk_mq_is_shared_tags(hctx->flags)); - /* - * If we didn't flush the entire list, we could have told - * the driver there was more coming, but that turned out to - * be a lie. - */ - if (q->mq_ops->commit_rqs && queued) - q->mq_ops->commit_rqs(hctx); + if (nr_budgets) + blk_mq_release_budgets(q, list); spin_lock(&hctx->lock); list_splice_tail_init(list, &hctx->dispatch); spin_unlock(&hctx->lock); /* + * Order adding requests to hctx->dispatch and checking + * SCHED_RESTART flag. The pair of this smp_mb() is the one + * in blk_mq_sched_restart(). Avoid restart code path to + * miss the new added requests to hctx->dispatch, meantime + * SCHED_RESTART is observed here. + */ + smp_mb(); + + /* * If SCHED_RESTART was set by the caller of this function and * it is no longer set that means that it was cleared by another * thread and hence that a queue rerun is needed. @@ -1346,77 +2151,24 @@ bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list, * If driver returns BLK_STS_RESOURCE and SCHED_RESTART * bit is set, run queue after a delay to avoid IO stalls * that could otherwise occur if the queue is idle. We'll do - * similar if we couldn't get budget and SCHED_RESTART is set. + * similar if we couldn't get budget or couldn't lock a zone + * and SCHED_RESTART is set. 
*/ needs_restart = blk_mq_sched_needs_restart(hctx); + if (prep == PREP_DISPATCH_NO_BUDGET) + needs_resource = true; if (!needs_restart || (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) blk_mq_run_hw_queue(hctx, true); - else if (needs_restart && (ret == BLK_STS_RESOURCE || - no_budget_avail)) + else if (needs_resource) blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); blk_mq_update_dispatch_busy(hctx, true); return false; - } else - blk_mq_update_dispatch_busy(hctx, false); - - /* - * If the host/device is unable to accept more work, inform the - * caller of that. - */ - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - return false; - - return (queued + errors) != 0; -} - -/** - * __blk_mq_run_hw_queue - Run a hardware queue. - * @hctx: Pointer to the hardware queue to run. - * - * Send pending requests to the hardware. - */ -static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) -{ - int srcu_idx; - - /* - * We should be running this queue from one of the CPUs that - * are mapped to it. - * - * There are at least two related races now between setting - * hctx->next_cpu from blk_mq_hctx_next_cpu() and running - * __blk_mq_run_hw_queue(): - * - * - hctx->next_cpu is found offline in blk_mq_hctx_next_cpu(), - * but later it becomes online, then this warning is harmless - * at all - * - * - hctx->next_cpu is found online in blk_mq_hctx_next_cpu(), - * but later it becomes offline, then the warning can't be - * triggered, and we depend on blk-mq timeout handler to - * handle dispatched requests to this hctx - */ - if (!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && - cpu_online(hctx->next_cpu)) { - printk(KERN_WARNING "run queue from wrong CPU %d, hctx %s\n", - raw_smp_processor_id(), - cpumask_empty(hctx->cpumask) ? "inactive": "active"); - dump_stack(); } - /* - * We can't run the queue inline with ints disabled. Ensure that - * we catch bad users of this early. 
- */ - WARN_ON_ONCE(in_interrupt()); - - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); - - hctx_lock(hctx, &srcu_idx); - blk_mq_sched_dispatch_requests(hctx); - hctx_unlock(hctx, srcu_idx); + blk_mq_update_dispatch_busy(hctx, false); + return true; } static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) @@ -1475,46 +2227,19 @@ select_cpu: } /** - * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. + * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. * @hctx: Pointer to the hardware queue to run. - * @async: If we want to run the queue asynchronously. - * @msecs: Microseconds of delay to wait before running the queue. + * @msecs: Milliseconds of delay to wait before running the queue. * - * If !@async, try to run the queue now. Else, run the queue asynchronously and - * with a delay of @msecs. + * Run a hardware queue asynchronously with a delay of @msecs. */ -static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, - unsigned long msecs) +void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) { if (unlikely(blk_mq_hctx_stopped(hctx))) return; - - if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); - if (cpumask_test_cpu(cpu, hctx->cpumask)) { - __blk_mq_run_hw_queue(hctx); - put_cpu(); - return; - } - - put_cpu(); - } - kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, msecs_to_jiffies(msecs)); } - -/** - * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. - * @hctx: Pointer to the hardware queue to run. - * @msecs: Microseconds of delay to wait before running the queue. - * - * Run a hardware queue asynchronously with a delay of @msecs. 
- */ -void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) -{ - __blk_mq_delay_run_hw_queue(hctx, true, msecs); -} EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); /** @@ -1528,10 +2253,16 @@ EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); */ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) { - int srcu_idx; bool need_run; /* + * We can't run the queue inline with interrupts disabled. + */ + WARN_ON_ONCE(!async && in_interrupt()); + + might_sleep_if(!async && hctx->flags & BLK_MQ_F_BLOCKING); + + /* * When queue is quiesced, we may be switching io scheduler, or * updating nr_hw_queues, or other things, and we can't run queue * any more, even __blk_mq_hctx_has_pending() can't be called safely. @@ -1539,31 +2270,68 @@ void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) * And queue will be rerun in blk_mq_unquiesce_queue() if it is * quiesced. */ - hctx_lock(hctx, &srcu_idx); - need_run = !blk_queue_quiesced(hctx->queue) && - blk_mq_hctx_has_pending(hctx); - hctx_unlock(hctx, srcu_idx); + __blk_mq_run_dispatch_ops(hctx->queue, false, + need_run = !blk_queue_quiesced(hctx->queue) && + blk_mq_hctx_has_pending(hctx)); + + if (!need_run) + return; + + if (async || !cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)) { + blk_mq_delay_run_hw_queue(hctx, 0); + return; + } - if (need_run) - __blk_mq_delay_run_hw_queue(hctx, async, 0); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } EXPORT_SYMBOL(blk_mq_run_hw_queue); +/* + * Return prefered queue to dispatch from (if any) for non-mq aware IO + * scheduler. 
+ */ +static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) +{ + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + /* + * If the IO scheduler does not respect hardware queues when + * dispatching, we just don't bother with multiple HW queues and + * dispatch from hctx for the current CPU since running multiple queues + * just causes lock contention inside the scheduler and pointless cache + * bouncing. + */ + struct blk_mq_hw_ctx *hctx = ctx->hctxs[HCTX_TYPE_DEFAULT]; + + if (!blk_mq_hctx_stopped(hctx)) + return hctx; + return NULL; +} + /** - * blk_mq_run_hw_queue - Run all hardware queues in a request queue. + * blk_mq_run_hw_queues - Run all hardware queues in a request queue. * @q: Pointer to the request queue to run. * @async: If we want to run the queue asynchronously. */ void blk_mq_run_hw_queues(struct request_queue *q, bool async) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_hw_ctx *hctx, *sq_hctx; + unsigned long i; + sq_hctx = NULL; + if (blk_queue_sq_sched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_run_hw_queue(hctx, async); + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_run_hw_queue(hctx, async); } } EXPORT_SYMBOL(blk_mq_run_hw_queues); @@ -1571,42 +2339,39 @@ EXPORT_SYMBOL(blk_mq_run_hw_queues); /** * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. * @q: Pointer to the request queue to run. - * @msecs: Microseconds of delay to wait before running the queues. + * @msecs: Milliseconds of delay to wait before running the queues. 
*/ void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) { - struct blk_mq_hw_ctx *hctx; - int i; + struct blk_mq_hw_ctx *hctx, *sq_hctx; + unsigned long i; + sq_hctx = NULL; + if (blk_queue_sq_sched(q)) + sq_hctx = blk_mq_get_sq_hctx(q); queue_for_each_hw_ctx(q, hctx, i) { if (blk_mq_hctx_stopped(hctx)) continue; - - blk_mq_delay_run_hw_queue(hctx, msecs); + /* + * If there is already a run_work pending, leave the + * pending delay untouched. Otherwise, a hctx can stall + * if another hctx is re-delaying the other's work + * before the work executes. + */ + if (delayed_work_pending(&hctx->run_work)) + continue; + /* + * Dispatch from this hctx either if there's no hctx preferred + * by IO scheduler or if it has requests that bypass the + * scheduler. + */ + if (!sq_hctx || sq_hctx == hctx || + !list_empty_careful(&hctx->dispatch)) + blk_mq_delay_run_hw_queue(hctx, msecs); } } EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); -/** - * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped - * @q: request queue. - * - * The caller is responsible for serializing this function against - * blk_mq_{start,stop}_hw_queue(). 
- */ -bool blk_mq_queue_stopped(struct request_queue *q) -{ - struct blk_mq_hw_ctx *hctx; - int i; - - queue_for_each_hw_ctx(q, hctx, i) - if (blk_mq_hctx_stopped(hctx)) - return true; - - return false; -} -EXPORT_SYMBOL(blk_mq_queue_stopped); - /* * This function is often used for pausing .queue_rq() by driver when * there isn't enough resource or some conditions aren't satisfied, and @@ -1636,7 +2401,7 @@ EXPORT_SYMBOL(blk_mq_stop_hw_queue); void blk_mq_stop_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_stop_hw_queue(hctx); @@ -1647,14 +2412,14 @@ void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) { clear_bit(BLK_MQ_S_STOPPED, &hctx->state); - blk_mq_run_hw_queue(hctx, false); + blk_mq_run_hw_queue(hctx, hctx->flags & BLK_MQ_F_BLOCKING); } EXPORT_SYMBOL(blk_mq_start_hw_queue); void blk_mq_start_hw_queues(struct request_queue *q) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) blk_mq_start_hw_queue(hctx); @@ -1674,179 +2439,170 @@ EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) - blk_mq_start_stopped_hw_queue(hctx, async); + blk_mq_start_stopped_hw_queue(hctx, async || + (hctx->flags & BLK_MQ_F_BLOCKING)); } EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); static void blk_mq_run_work_fn(struct work_struct *work) { - struct blk_mq_hw_ctx *hctx; - - hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); - - /* - * If we are stopped, don't run the queue. 
- */ - if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) - return; - - __blk_mq_run_hw_queue(hctx); -} - -static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, - struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - enum hctx_type type = hctx->type; - - lockdep_assert_held(&ctx->lock); - - trace_block_rq_insert(hctx->queue, rq); - - if (at_head) - list_add(&rq->queuelist, &ctx->rq_lists[type]); - else - list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); -} - -void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) -{ - struct blk_mq_ctx *ctx = rq->mq_ctx; - - lockdep_assert_held(&ctx->lock); + struct blk_mq_hw_ctx *hctx = + container_of(work, struct blk_mq_hw_ctx, run_work.work); - __blk_mq_insert_req_list(hctx, rq, at_head); - blk_mq_hctx_mark_pending(hctx, ctx); + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_sched_dispatch_requests(hctx)); } /** * blk_mq_request_bypass_insert - Insert a request at dispatch list. * @rq: Pointer to request to be inserted. - * @run_queue: If we should run the hardware queue after inserting the request. + * @flags: BLK_MQ_INSERT_* * * Should only be used carefully, when the caller knows we want to * bypass a potential IO scheduler on the target device. 
*/ -void blk_mq_request_bypass_insert(struct request *rq, bool at_head, - bool run_queue) +static void blk_mq_request_bypass_insert(struct request *rq, blk_insert_t flags) { struct blk_mq_hw_ctx *hctx = rq->mq_hctx; spin_lock(&hctx->lock); - if (at_head) + if (flags & BLK_MQ_INSERT_AT_HEAD) list_add(&rq->queuelist, &hctx->dispatch); else list_add_tail(&rq->queuelist, &hctx->dispatch); spin_unlock(&hctx->lock); - - if (run_queue) - blk_mq_run_hw_queue(hctx, false); } -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct list_head *list) - +static void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, + struct blk_mq_ctx *ctx, struct list_head *list, + bool run_queue_async) { struct request *rq; enum hctx_type type = hctx->type; /* + * Try to issue requests directly if the hw queue isn't busy to save an + * extra enqueue & dequeue to the sw queue. + */ + if (!hctx->dispatch_busy && !run_queue_async) { + blk_mq_run_dispatch_ops(hctx->queue, + blk_mq_try_issue_list_directly(hctx, list)); + if (list_empty(list)) + goto out; + } + + /* * preemption doesn't flush plug list, so it's possible ctx->cpu is * offline now */ list_for_each_entry(rq, list, queuelist) { BUG_ON(rq->mq_ctx != ctx); - trace_block_rq_insert(hctx->queue, rq); + trace_block_rq_insert(rq); + if (rq->cmd_flags & REQ_NOWAIT) + run_queue_async = true; } spin_lock(&ctx->lock); list_splice_tail_init(list, &ctx->rq_lists[type]); blk_mq_hctx_mark_pending(hctx, ctx); spin_unlock(&ctx->lock); +out: + blk_mq_run_hw_queue(hctx, run_queue_async); } -static int plug_rq_cmp(void *priv, struct list_head *a, struct list_head *b) -{ - struct request *rqa = container_of(a, struct request, queuelist); - struct request *rqb = container_of(b, struct request, queuelist); - - if (rqa->mq_ctx != rqb->mq_ctx) - return rqa->mq_ctx > rqb->mq_ctx; - if (rqa->mq_hctx != rqb->mq_hctx) - return rqa->mq_hctx > rqb->mq_hctx; - - return blk_rq_pos(rqa) > blk_rq_pos(rqb); -} - -void 
blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +static void blk_mq_insert_request(struct request *rq, blk_insert_t flags) { - LIST_HEAD(list); - - if (list_empty(&plug->mq_list)) - return; - list_splice_init(&plug->mq_list, &list); + struct request_queue *q = rq->q; + struct blk_mq_ctx *ctx = rq->mq_ctx; + struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - if (plug->rq_count > 2 && plug->multiple_queues) - list_sort(NULL, &list, plug_rq_cmp); + if (blk_rq_is_passthrough(rq)) { + /* + * Passthrough request have to be added to hctx->dispatch + * directly. The device may be in a situation where it can't + * handle FS request, and always returns BLK_STS_RESOURCE for + * them, which gets them added to hctx->dispatch. + * + * If a passthrough request is required to unblock the queues, + * and it is added to the scheduler queue, there is no chance to + * dispatch it given we prioritize requests in hctx->dispatch. + */ + blk_mq_request_bypass_insert(rq, flags); + } else if (req_op(rq) == REQ_OP_FLUSH) { + /* + * Firstly normal IO request is inserted to scheduler queue or + * sw queue, meantime we add flush request to dispatch queue( + * hctx->dispatch) directly and there is at most one in-flight + * flush request for each hw queue, so it doesn't matter to add + * flush request to tail or front of the dispatch queue. + * + * Secondly in case of NCQ, flush request belongs to non-NCQ + * command, and queueing it will fail when there is any + * in-flight normal IO request(NCQ command). When adding flush + * rq to the front of hctx->dispatch, it is easier to introduce + * extra time to flush rq's latency because of S_SCHED_RESTART + * compared with adding to the tail of dispatch queue, then + * chance of flush merge is increased, and less flush requests + * will be issued to controller. It is observed that ~10% time + * is saved in blktests block/004 on disk attached to AHCI/NCQ + * drive when adding flush rq to the front of hctx->dispatch. 
+ * + * Simply queue flush rq to the front of hctx->dispatch so that + * intensive flush workloads can benefit in case of NCQ HW. + */ + blk_mq_request_bypass_insert(rq, BLK_MQ_INSERT_AT_HEAD); + } else if (q->elevator) { + LIST_HEAD(list); - plug->rq_count = 0; + WARN_ON_ONCE(rq->tag != BLK_MQ_NO_TAG); - do { - struct list_head rq_list; - struct request *rq, *head_rq = list_entry_rq(list.next); - struct list_head *pos = &head_rq->queuelist; /* skip first */ - struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; - struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; - unsigned int depth = 1; - - list_for_each_continue(pos, &list) { - rq = list_entry_rq(pos); - BUG_ON(!rq->q); - if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) - break; - depth++; - } + list_add(&rq->queuelist, &list); + q->elevator->type->ops.insert_requests(hctx, &list, flags); + } else { + trace_block_rq_insert(rq); - list_cut_before(&rq_list, &list, pos); - trace_block_unplug(head_rq->q, depth, !from_schedule); - blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, - from_schedule); - } while(!list_empty(&list)); + spin_lock(&ctx->lock); + if (flags & BLK_MQ_INSERT_AT_HEAD) + list_add(&rq->queuelist, &ctx->rq_lists[hctx->type]); + else + list_add_tail(&rq->queuelist, + &ctx->rq_lists[hctx->type]); + blk_mq_hctx_mark_pending(hctx, ctx); + spin_unlock(&ctx->lock); + } } static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, unsigned int nr_segs) { + int err; + if (bio->bi_opf & REQ_RAHEAD) rq->cmd_flags |= REQ_FAILFAST_MASK; rq->__sector = bio->bi_iter.bi_sector; - rq->write_hint = bio->bi_write_hint; blk_rq_bio_prep(rq, bio, nr_segs); - blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); + + /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. 
*/ + err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); + WARN_ON_ONCE(err); blk_account_io_start(rq); } static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, bool last) + struct request *rq, bool last) { struct request_queue *q = rq->q; struct blk_mq_queue_data bd = { .rq = rq, .last = last, }; - blk_qc_t new_cookie; blk_status_t ret; - new_cookie = request_to_qc_t(hctx, rq); - /* * For OK queue, we are done. For error, caller may kill it. * Any other error (busy), just add it to our list as we @@ -1856,7 +2612,6 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, switch (ret) { case BLK_STS_OK: blk_mq_update_dispatch_busy(hctx, false); - *cookie = new_cookie; break; case BLK_STS_RESOURCE: case BLK_STS_DEV_RESOURCE: @@ -1865,59 +2620,31 @@ static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, break; default: blk_mq_update_dispatch_busy(hctx, false); - *cookie = BLK_QC_T_NONE; break; } return ret; } -static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, - blk_qc_t *cookie, - bool bypass_insert, bool last) +static bool blk_mq_get_budget_and_tag(struct request *rq) { - struct request_queue *q = rq->q; - bool run_queue = true; - - /* - * RCU or SRCU read lock is needed before checking quiesced flag. - * - * When queue is stopped or quiesced, ignore 'bypass_insert' from - * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, - * and avoid driver to try to dispatch again. 
- */ - if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { - run_queue = false; - bypass_insert = false; - goto insert; - } - - if (q->elevator && !bypass_insert) - goto insert; - - if (!blk_mq_get_dispatch_budget(hctx)) - goto insert; + int budget_token; + budget_token = blk_mq_get_dispatch_budget(rq->q); + if (budget_token < 0) + return false; + blk_mq_set_rq_budget_token(rq, budget_token); if (!blk_mq_get_driver_tag(rq)) { - blk_mq_put_dispatch_budget(hctx); - goto insert; + blk_mq_put_dispatch_budget(rq->q, budget_token); + return false; } - - return __blk_mq_issue_directly(hctx, rq, cookie, last); -insert: - if (bypass_insert) - return BLK_STS_RESOURCE; - - blk_mq_request_bypass_insert(rq, false, run_queue); - return BLK_STS_OK; + return true; } /** * blk_mq_try_issue_directly - Try to send a request directly to device driver. * @hctx: Pointer of the associated hardware queue. * @rq: Pointer to request to be sent. - * @cookie: Request queue cookie. * * If the device has enough resources to accept a new request now, send the * request directly to device driver. Else, insert at hctx->dispatch queue, so @@ -1925,88 +2652,299 @@ insert: * queue have higher priority. 
*/ static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, - struct request *rq, blk_qc_t *cookie) + struct request *rq) { blk_status_t ret; - int srcu_idx; - might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return; + } - hctx_lock(hctx, &srcu_idx); + if ((rq->rq_flags & RQF_USE_SCHED) || !blk_mq_get_budget_and_tag(rq)) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, rq->cmd_flags & REQ_NOWAIT); + return; + } - ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); - if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) - blk_mq_request_bypass_insert(rq, false, true); - else if (ret != BLK_STS_OK) + ret = __blk_mq_issue_directly(hctx, rq, true); + switch (ret) { + case BLK_STS_OK: + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); + break; + default: blk_mq_end_request(rq, ret); - - hctx_unlock(hctx, srcu_idx); + break; + } } -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) +static blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) { - blk_status_t ret; - int srcu_idx; - blk_qc_t unused_cookie; struct blk_mq_hw_ctx *hctx = rq->mq_hctx; - hctx_lock(hctx, &srcu_idx); - ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); - hctx_unlock(hctx, srcu_idx); + if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(rq->q)) { + blk_mq_insert_request(rq, 0); + return BLK_STS_OK; + } - return ret; + if (!blk_mq_get_budget_and_tag(rq)) + return BLK_STS_RESOURCE; + return __blk_mq_issue_directly(hctx, rq, last); } -void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, +static void blk_mq_plug_issue_direct(struct blk_plug *plug) +{ + struct blk_mq_hw_ctx *hctx = NULL; + struct request *rq; + int queued = 0; + blk_status_t ret = BLK_STS_OK; + + while ((rq = 
rq_list_pop(&plug->mq_list))) { + bool last = rq_list_empty(plug->mq_list); + + if (hctx != rq->mq_hctx) { + if (hctx) { + blk_mq_commit_rqs(hctx, queued, false); + queued = 0; + } + hctx = rq->mq_hctx; + } + + ret = blk_mq_request_issue_directly(rq, last); + switch (ret) { + case BLK_STS_OK: + queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, 0); + blk_mq_run_hw_queue(hctx, false); + goto out; + default: + blk_mq_end_request(rq, ret); + break; + } + } + +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); +} + +static void __blk_mq_flush_plug_list(struct request_queue *q, + struct blk_plug *plug) +{ + if (blk_queue_quiesced(q)) + return; + q->mq_ops->queue_rqs(&plug->mq_list); +} + +static void blk_mq_dispatch_plug_list(struct blk_plug *plug, bool from_sched) +{ + struct blk_mq_hw_ctx *this_hctx = NULL; + struct blk_mq_ctx *this_ctx = NULL; + struct request *requeue_list = NULL; + struct request **requeue_lastp = &requeue_list; + unsigned int depth = 0; + bool is_passthrough = false; + LIST_HEAD(list); + + do { + struct request *rq = rq_list_pop(&plug->mq_list); + + if (!this_hctx) { + this_hctx = rq->mq_hctx; + this_ctx = rq->mq_ctx; + is_passthrough = blk_rq_is_passthrough(rq); + } else if (this_hctx != rq->mq_hctx || this_ctx != rq->mq_ctx || + is_passthrough != blk_rq_is_passthrough(rq)) { + rq_list_add_tail(&requeue_lastp, rq); + continue; + } + list_add(&rq->queuelist, &list); + depth++; + } while (!rq_list_empty(plug->mq_list)); + + plug->mq_list = requeue_list; + trace_block_unplug(this_hctx->queue, depth, !from_sched); + + percpu_ref_get(&this_hctx->queue->q_usage_counter); + /* passthrough requests should never be issued to the I/O scheduler */ + if (is_passthrough) { + spin_lock(&this_hctx->lock); + list_splice_tail_init(&list, &this_hctx->dispatch); + spin_unlock(&this_hctx->lock); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else if (this_hctx->queue->elevator) { + 
this_hctx->queue->elevator->type->ops.insert_requests(this_hctx, + &list, 0); + blk_mq_run_hw_queue(this_hctx, from_sched); + } else { + blk_mq_insert_requests(this_hctx, this_ctx, &list, from_sched); + } + percpu_ref_put(&this_hctx->queue->q_usage_counter); +} + +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) +{ + struct request *rq; + + /* + * We may have been called recursively midway through handling + * plug->mq_list via a schedule() in the driver's queue_rq() callback. + * To avoid mq_list changing under our feet, clear rq_count early and + * bail out specifically if rq_count is 0 rather than checking + * whether the mq_list is empty. + */ + if (plug->rq_count == 0) + return; + plug->rq_count = 0; + + if (!plug->multiple_queues && !plug->has_elevator && !from_schedule) { + struct request_queue *q; + + rq = rq_list_peek(&plug->mq_list); + q = rq->q; + + /* + * Peek first request and see if we have a ->queue_rqs() hook. + * If we do, we can dispatch the whole plug list in one go. We + * already know at this point that all requests belong to the + * same queue, caller must ensure that's the case. 
+ */ + if (q->mq_ops->queue_rqs) { + blk_mq_run_dispatch_ops(q, + __blk_mq_flush_plug_list(q, plug)); + if (rq_list_empty(plug->mq_list)) + return; + } + + blk_mq_run_dispatch_ops(q, + blk_mq_plug_issue_direct(plug)); + if (rq_list_empty(plug->mq_list)) + return; + } + + do { + blk_mq_dispatch_plug_list(plug, from_schedule); + } while (!rq_list_empty(plug->mq_list)); +} + +static void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, struct list_head *list) { int queued = 0; + blk_status_t ret = BLK_STS_OK; while (!list_empty(list)) { - blk_status_t ret; struct request *rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); ret = blk_mq_request_issue_directly(rq, list_empty(list)); - if (ret != BLK_STS_OK) { - if (ret == BLK_STS_RESOURCE || - ret == BLK_STS_DEV_RESOURCE) { - blk_mq_request_bypass_insert(rq, false, - list_empty(list)); - break; - } - blk_mq_end_request(rq, ret); - } else + switch (ret) { + case BLK_STS_OK: queued++; + break; + case BLK_STS_RESOURCE: + case BLK_STS_DEV_RESOURCE: + blk_mq_request_bypass_insert(rq, 0); + if (list_empty(list)) + blk_mq_run_hw_queue(hctx, false); + goto out; + default: + blk_mq_end_request(rq, ret); + break; + } } - /* - * If we didn't flush the entire list, we could have told - * the driver there was more coming, but that turned out to - * be a lie. 
- */ - if (!list_empty(list) && hctx->queue->mq_ops->commit_rqs && queued) - hctx->queue->mq_ops->commit_rqs(hctx); +out: + if (ret != BLK_STS_OK) + blk_mq_commit_rqs(hctx, queued, false); } -static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) +static bool blk_mq_attempt_bio_merge(struct request_queue *q, + struct bio *bio, unsigned int nr_segs) { - list_add_tail(&rq->queuelist, &plug->mq_list); - plug->rq_count++; - if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { - struct request *tmp; + if (!blk_queue_nomerges(q) && bio_mergeable(bio)) { + if (blk_attempt_plug_merge(q, bio, nr_segs)) + return true; + if (blk_mq_sched_bio_merge(q, bio, nr_segs)) + return true; + } + return false; +} + +static struct request *blk_mq_get_new_requests(struct request_queue *q, + struct blk_plug *plug, + struct bio *bio, + unsigned int nsegs) +{ + struct blk_mq_alloc_data data = { + .q = q, + .nr_tags = 1, + .cmd_flags = bio->bi_opf, + }; + struct request *rq; + + if (blk_mq_attempt_bio_merge(q, bio, nsegs)) + return NULL; + + rq_qos_throttle(q, bio); - tmp = list_first_entry(&plug->mq_list, struct request, - queuelist); - if (tmp->q != rq->q) - plug->multiple_queues = true; + if (plug) { + data.nr_tags = plug->nr_ios; + plug->nr_ios = 1; + data.cached_rq = &plug->cached_rq; } + + rq = __blk_mq_alloc_requests(&data); + if (rq) + return rq; + rq_qos_cleanup(q, bio); + if (bio->bi_opf & REQ_NOWAIT) + bio_wouldblock_error(bio); + return NULL; +} + +/* + * Check if we can use the passed on request for submitting the passed in bio, + * and remove it from the request list if it can be used. 
+ */ +static bool blk_mq_use_cached_rq(struct request *rq, struct blk_plug *plug, + struct bio *bio) +{ + enum hctx_type type = blk_mq_get_hctx_type(bio->bi_opf); + enum hctx_type hctx_type = rq->mq_hctx->type; + + WARN_ON_ONCE(rq_list_peek(&plug->cached_rq) != rq); + + if (type != hctx_type && + !(type == HCTX_TYPE_READ && hctx_type == HCTX_TYPE_DEFAULT)) + return false; + if (op_is_flush(rq->cmd_flags) != op_is_flush(bio->bi_opf)) + return false; + + /* + * If any qos ->throttle() end up blocking, we will have flushed the + * plug and hence killed the cached_rq list as well. Pop this entry + * before we throttle. + */ + plug->cached_rq = rq_list_next(rq); + rq_qos_throttle(rq->q, bio); + + blk_mq_rq_time_init(rq, 0); + rq->cmd_flags = bio->bi_opf; + INIT_LIST_HEAD(&rq->queuelist); + return true; } /** - * blk_mq_make_request - Create and send a request to block device. - * @q: Request queue pointer. + * blk_mq_submit_bio - Create and send a request to block device. * @bio: Bio pointer. * * Builds up a request structure from @q and @bio and send to the device. The @@ -2017,143 +2955,323 @@ static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) * * It will not queue the request if there is an error with the bio, or at the * request creation. - * - * Returns: Request queue cookie. 
*/ -blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) +void blk_mq_submit_bio(struct bio *bio) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blk_plug *plug = blk_mq_plug(bio); const int is_sync = op_is_sync(bio->bi_opf); - const int is_flush_fua = op_is_flush(bio->bi_opf); - struct blk_mq_alloc_data data = { - .q = q, - }; - struct request *rq; - struct blk_plug *plug; - struct request *same_queue_rq = NULL; - unsigned int nr_segs; - blk_qc_t cookie; + struct blk_mq_hw_ctx *hctx; + struct request *rq = NULL; + unsigned int nr_segs = 1; blk_status_t ret; - blk_queue_bounce(q, &bio); - __blk_queue_split(q, &bio, &nr_segs); - - if (!bio_integrity_prep(bio)) - goto queue_exit; + bio = blk_queue_bounce(bio, q); - if (!is_flush_fua && !blk_queue_nomerges(q) && - blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) - goto queue_exit; - - if (blk_mq_sched_bio_merge(q, bio, nr_segs)) - goto queue_exit; - - rq_qos_throttle(q, bio); + if (plug) { + rq = rq_list_peek(&plug->cached_rq); + if (rq && rq->q != q) + rq = NULL; + } + if (rq) { + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + return; + } + if (!bio_integrity_prep(bio)) + return; + if (blk_mq_attempt_bio_merge(q, bio, nr_segs)) + return; + if (blk_mq_use_cached_rq(rq, plug, bio)) + goto done; + percpu_ref_get(&q->q_usage_counter); + } else { + if (unlikely(bio_queue_enter(bio))) + return; + if (unlikely(bio_may_exceed_limits(bio, &q->limits))) { + bio = __bio_split_to_limits(bio, &q->limits, &nr_segs); + if (!bio) + goto fail; + } + if (!bio_integrity_prep(bio)) + goto fail; + } - data.cmd_flags = bio->bi_opf; - rq = __blk_mq_alloc_request(&data); + rq = blk_mq_get_new_requests(q, plug, bio, nr_segs); if (unlikely(!rq)) { - rq_qos_cleanup(q, bio); - if (bio->bi_opf & REQ_NOWAIT) - bio_wouldblock_error(bio); - goto queue_exit; +fail: + blk_queue_exit(q); + return; } - trace_block_getrq(q, 
bio, bio->bi_opf); +done: + trace_block_getrq(bio); rq_qos_track(q, rq, bio); - cookie = request_to_qc_t(data.hctx, rq); - blk_mq_bio_to_request(rq, bio, nr_segs); - ret = blk_crypto_init_request(rq); + ret = blk_crypto_rq_get_keyslot(rq); if (ret != BLK_STS_OK) { bio->bi_status = ret; bio_endio(bio); blk_mq_free_request(rq); - return BLK_QC_T_NONE; + return; } - plug = blk_mq_plug(q, bio); - if (unlikely(is_flush_fua)) { - /* Bypass scheduler for flush requests */ - blk_insert_flush(rq); - blk_mq_run_hw_queue(data.hctx, true); - } else if (plug && (q->nr_hw_queues == 1 || q->mq_ops->commit_rqs || - !blk_queue_nonrot(q))) { - /* - * Use plugging if we have a ->commit_rqs() hook as well, as - * we know the driver uses bd->last in a smart fashion. - * - * Use normal plugging if this disk is slow HDD, as sequential - * IO may benefit a lot from plug merging. - */ - unsigned int request_count = plug->rq_count; - struct request *last = NULL; + if (op_is_flush(bio->bi_opf) && blk_insert_flush(rq)) + return; - if (!request_count) - trace_block_plug(q); - else - last = list_entry_rq(plug->mq_list.prev); + if (plug) { + blk_add_rq_to_plug(plug, rq); + return; + } - if (request_count >= BLK_MAX_REQUEST_COUNT || (last && - blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { - blk_flush_plug_list(plug, false); - trace_block_plug(q); - } + hctx = rq->mq_hctx; + if ((rq->rq_flags & RQF_USE_SCHED) || + (hctx->dispatch_busy && (q->nr_hw_queues == 1 || !is_sync))) { + blk_mq_insert_request(rq, 0); + blk_mq_run_hw_queue(hctx, true); + } else { + blk_mq_run_dispatch_ops(q, blk_mq_try_issue_directly(hctx, rq)); + } +} - blk_add_rq_to_plug(plug, rq); - } else if (q->elevator) { - /* Insert the request at the IO scheduler queue */ - blk_mq_sched_insert_request(rq, false, true, true); - } else if (plug && !blk_queue_nomerges(q)) { +#ifdef CONFIG_BLK_MQ_STACKING +/** + * blk_insert_cloned_request - Helper for stacking drivers to submit a request + * @rq: the request being queued + */ 
+blk_status_t blk_insert_cloned_request(struct request *rq) +{ + struct request_queue *q = rq->q; + unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_segments = blk_rq_get_max_segments(rq); + blk_status_t ret; + + if (blk_rq_sectors(rq) > max_sectors) { /* - * We do limited plugging. If the bio can be merged, do that. - * Otherwise the existing request in the plug list will be - * issued. So the plug list will have one request at most - * The plug list might get flushed before this. If that happens, - * the plug list is empty, and same_queue_rq is invalid. + * SCSI device does not have a good way to return if + * Write Same/Zero is actually supported. If a device rejects + * a non-read/write command (discard, write same,etc.) the + * low-level device driver will set the relevant queue limit to + * 0 to prevent blk-lib from issuing more of the offending + * operations. Commands queued prior to the queue limit being + * reset need to be completed with BLK_STS_NOTSUPP to avoid I/O + * errors being propagated to upper layers. */ - if (list_empty(&plug->mq_list)) - same_queue_rq = NULL; - if (same_queue_rq) { - list_del_init(&same_queue_rq->queuelist); - plug->rq_count--; + if (max_sectors == 0) + return BLK_STS_NOTSUPP; + + printk(KERN_ERR "%s: over max size limit. (%u > %u)\n", + __func__, blk_rq_sectors(rq), max_sectors); + return BLK_STS_IOERR; + } + + /* + * The queue settings related to segment counting may differ from the + * original queue. + */ + rq->nr_phys_segments = blk_recalc_rq_segments(rq); + if (rq->nr_phys_segments > max_segments) { + printk(KERN_ERR "%s: over max segments limit. 
(%u > %u)\n", + __func__, rq->nr_phys_segments, max_segments); + return BLK_STS_IOERR; + } + + if (q->disk && should_fail_request(q->disk->part0, blk_rq_bytes(rq))) + return BLK_STS_IOERR; + + ret = blk_crypto_rq_get_keyslot(rq); + if (ret != BLK_STS_OK) + return ret; + + blk_account_io_start(rq); + + /* + * Since we have a scheduler attached on the top device, + * bypass a potential scheduler on the bottom device for + * insert. + */ + blk_mq_run_dispatch_ops(q, + ret = blk_mq_request_issue_directly(rq, true)); + if (ret) + blk_account_io_done(rq, ktime_get_ns()); + return ret; +} +EXPORT_SYMBOL_GPL(blk_insert_cloned_request); + +/** + * blk_rq_unprep_clone - Helper function to free all bios in a cloned request + * @rq: the clone request to be cleaned up + * + * Description: + * Free all bios in @rq for a cloned request. + */ +void blk_rq_unprep_clone(struct request *rq) +{ + struct bio *bio; + + while ((bio = rq->bio) != NULL) { + rq->bio = bio->bi_next; + + bio_put(bio); + } +} +EXPORT_SYMBOL_GPL(blk_rq_unprep_clone); + +/** + * blk_rq_prep_clone - Helper function to setup clone request + * @rq: the request to be setup + * @rq_src: original request to be cloned + * @bs: bio_set that bios for clone are allocated from + * @gfp_mask: memory allocation mask for bio + * @bio_ctr: setup function to be called for each clone bio. + * Returns %0 for success, non %0 for failure. + * @data: private data to be passed to @bio_ctr + * + * Description: + * Clones bios in @rq_src to @rq, and copies attributes of @rq_src to @rq. + * Also, pages which the original bios are pointing to are not copied + * and the cloned bios just point same pages. + * So cloned bios must be completed before original bios, which means + * the caller must complete @rq before @rq_src. 
+ */ +int blk_rq_prep_clone(struct request *rq, struct request *rq_src, + struct bio_set *bs, gfp_t gfp_mask, + int (*bio_ctr)(struct bio *, struct bio *, void *), + void *data) +{ + struct bio *bio, *bio_src; + + if (!bs) + bs = &fs_bio_set; + + __rq_for_each_bio(bio_src, rq_src) { + bio = bio_alloc_clone(rq->q->disk->part0, bio_src, gfp_mask, + bs); + if (!bio) + goto free_and_out; + + if (bio_ctr && bio_ctr(bio, bio_src, data)) + goto free_and_out; + + if (rq->bio) { + rq->biotail->bi_next = bio; + rq->biotail = bio; + } else { + rq->bio = rq->biotail = bio; } - blk_add_rq_to_plug(plug, rq); - trace_block_plug(q); + bio = NULL; + } + + /* Copy attributes of the original request to the clone request. */ + rq->__sector = blk_rq_pos(rq_src); + rq->__data_len = blk_rq_bytes(rq_src); + if (rq_src->rq_flags & RQF_SPECIAL_PAYLOAD) { + rq->rq_flags |= RQF_SPECIAL_PAYLOAD; + rq->special_vec = rq_src->special_vec; + } + rq->nr_phys_segments = rq_src->nr_phys_segments; + rq->ioprio = rq_src->ioprio; + + if (rq->bio && blk_crypto_rq_bio_prep(rq, rq->bio, gfp_mask) < 0) + goto free_and_out; + + return 0; + +free_and_out: + if (bio) + bio_put(bio); + blk_rq_unprep_clone(rq); - if (same_queue_rq) { - data.hctx = same_queue_rq->mq_hctx; - trace_block_unplug(q, 1, true); - blk_mq_try_issue_directly(data.hctx, same_queue_rq, - &cookie); + return -ENOMEM; +} +EXPORT_SYMBOL_GPL(blk_rq_prep_clone); +#endif /* CONFIG_BLK_MQ_STACKING */ + +/* + * Steal bios from a request and add them to a bio list. + * The request must not have been partially completed before. 
+ */ +void blk_steal_bios(struct bio_list *list, struct request *rq) +{ + if (rq->bio) { + if (list->tail) + list->tail->bi_next = rq->bio; + else + list->head = rq->bio; + list->tail = rq->biotail; + + rq->bio = NULL; + rq->biotail = NULL; + } + + rq->__data_len = 0; +} +EXPORT_SYMBOL_GPL(blk_steal_bios); + +static size_t order_to_size(unsigned int order) +{ + return (size_t)PAGE_SIZE << order; +} + +/* called before freeing request pool in @tags */ +static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, + struct blk_mq_tags *tags) +{ + struct page *page; + unsigned long flags; + + /* + * There is no need to clear mapping if driver tags is not initialized + * or the mapping belongs to the driver tags. + */ + if (!drv_tags || drv_tags == tags) + return; + + list_for_each_entry(page, &tags->page_list, lru) { + unsigned long start = (unsigned long)page_address(page); + unsigned long end = start + order_to_size(page->private); + int i; + + for (i = 0; i < drv_tags->nr_tags; i++) { + struct request *rq = drv_tags->rqs[i]; + unsigned long rq_addr = (unsigned long)rq; + + if (rq_addr >= start && rq_addr < end) { + WARN_ON_ONCE(req_ref_read(rq) != 0); + cmpxchg(&drv_tags->rqs[i], rq, NULL); + } } - } else if ((q->nr_hw_queues > 1 && is_sync) || - !data.hctx->dispatch_busy) { - /* - * There is no scheduler and we can try to send directly - * to the hardware. - */ - blk_mq_try_issue_directly(data.hctx, rq, &cookie); - } else { - /* Default case. */ - blk_mq_sched_insert_request(rq, false, true, true); } - return cookie; -queue_exit: - blk_queue_exit(q); - return BLK_QC_T_NONE; + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. 
+ */ + spin_lock_irqsave(&drv_tags->lock, flags); + spin_unlock_irqrestore(&drv_tags->lock, flags); } -EXPORT_SYMBOL_GPL(blk_mq_make_request); /* only for request based dm */ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx) { + struct blk_mq_tags *drv_tags; struct page *page; - if (tags->rqs && set->ops->exit_request) { + if (list_empty(&tags->page_list)) + return; + + if (blk_mq_is_shared_tags(set->flags)) + drv_tags = set->shared_tags; + else + drv_tags = set->tags[hctx_idx]; + + if (tags->static_rqs && set->ops->exit_request) { int i; for (i = 0; i < tags->nr_tags; i++) { @@ -2166,6 +3284,8 @@ void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, } } + blk_mq_clear_rq_mapping(drv_tags, tags); + while (!list_empty(&tags->page_list)) { page = list_first_entry(&tags->page_list, struct page, lru); list_del_init(&page->lru); @@ -2188,15 +3308,41 @@ void blk_mq_free_rq_map(struct blk_mq_tags *tags) blk_mq_free_tags(tags); } -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags) +static enum hctx_type hctx_idx_to_type(struct blk_mq_tag_set *set, + unsigned int hctx_idx) { + int i; + + for (i = 0; i < set->nr_maps; i++) { + unsigned int start = set->map[i].queue_offset; + unsigned int end = start + set->map[i].nr_queues; + + if (hctx_idx >= start && hctx_idx < end) + break; + } + + if (i >= set->nr_maps) + i = HCTX_TYPE_DEFAULT; + + return i; +} + +static int blk_mq_get_hctx_node(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + enum hctx_type type = hctx_idx_to_type(set, hctx_idx); + + return blk_mq_hw_queue_to_node(&set->map[type], hctx_idx); +} + +static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int nr_tags, + unsigned int reserved_tags) +{ + int node = blk_mq_get_hctx_node(set, hctx_idx); struct blk_mq_tags *tags; - int node; - node = 
blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2208,26 +3354,22 @@ struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->rqs) { - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->rqs) + goto err_free_tags; tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, node); - if (!tags->static_rqs) { - kfree(tags->rqs); - blk_mq_free_tags(tags); - return NULL; - } + if (!tags->static_rqs) + goto err_free_rqs; return tags; -} -static size_t order_to_size(unsigned int order) -{ - return (size_t)PAGE_SIZE << order; +err_free_rqs: + kfree(tags->rqs); +err_free_tags: + blk_mq_free_tags(tags); + return NULL; } static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, @@ -2245,14 +3387,14 @@ static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, return 0; } -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth) +static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx, unsigned int depth) { unsigned int i, j, entries_per_page, max_order = 4; + int node = blk_mq_get_hctx_node(set, hctx_idx); size_t rq_size, left; - int node; - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); if (node == NUMA_NO_NODE) node = set->numa_node; @@ -2327,7 +3469,7 @@ struct rq_iter_data { bool has_rq; }; -static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) +static bool blk_mq_has_request(struct request *rq, void *data) { struct rq_iter_data *iter_data = data; @@ -2352,7 +3494,7 @@ static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, struct blk_mq_hw_ctx *hctx) { 
- if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) + if (cpumask_first_and(hctx->cpumask, cpu_online_mask) != cpu) return false; if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) return false; @@ -2448,22 +3590,58 @@ static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) &hctx->cpuhp_dead); } +/* + * Before freeing hw queue, clearing the flush request reference in + * tags->rqs[] for avoiding potential UAF. + */ +static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, + unsigned int queue_depth, struct request *flush_rq) +{ + int i; + unsigned long flags; + + /* The hw queue may not be mapped yet */ + if (!tags) + return; + + WARN_ON_ONCE(req_ref_read(flush_rq) != 0); + + for (i = 0; i < queue_depth; i++) + cmpxchg(&tags->rqs[i], flush_rq, NULL); + + /* + * Wait until all pending iteration is done. + * + * Request reference is cleared and it is guaranteed to be observed + * after the ->lock is released. + */ + spin_lock_irqsave(&tags->lock, flags); + spin_unlock_irqrestore(&tags->lock, flags); +} + /* hctx->ctxs will be freed in queue's release handler */ static void blk_mq_exit_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { + struct request *flush_rq = hctx->fq->flush_rq; + if (blk_mq_hw_queue_mapped(hctx)) blk_mq_tag_idle(hctx); + if (blk_queue_init_done(q)) + blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], + set->queue_depth, flush_rq); if (set->ops->exit_request) - set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); + set->ops->exit_request(set, flush_rq, hctx_idx); if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); blk_mq_remove_cpuhp(hctx); + xa_erase(&q->hctx_table, hctx_idx); + spin_lock(&q->unused_hctx_lock); list_add(&hctx->hctx_list, &q->unused_hctx_list); spin_unlock(&q->unused_hctx_lock); @@ -2473,30 +3651,15 @@ static void blk_mq_exit_hw_queues(struct request_queue *q, struct blk_mq_tag_set *set, int nr_queue) { 
struct blk_mq_hw_ctx *hctx; - unsigned int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { if (i == nr_queue) break; - blk_mq_debugfs_unregister_hctx(hctx); blk_mq_exit_hctx(q, set, hctx, i); } } -static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) -{ - int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); - - BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), - __alignof__(struct blk_mq_hw_ctx)) != - sizeof(struct blk_mq_hw_ctx)); - - if (tag_set->flags & BLK_MQ_F_BLOCKING) - hw_ctx_size += sizeof(struct srcu_struct); - - return hw_ctx_size; -} - static int blk_mq_init_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) @@ -2517,8 +3680,15 @@ static int blk_mq_init_hctx(struct request_queue *q, if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, hctx->numa_node)) goto exit_hctx; + + if (xa_insert(&q->hctx_table, hctx_idx, hctx, GFP_KERNEL)) + goto exit_flush_rq; + return 0; + exit_flush_rq: + if (set->ops->exit_request) + set->ops->exit_request(set, hctx->fq->flush_rq, hctx_idx); exit_hctx: if (set->ops->exit_hctx) set->ops->exit_hctx(hctx, hctx_idx); @@ -2534,7 +3704,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, struct blk_mq_hw_ctx *hctx; gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; - hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); + hctx = kzalloc_node(sizeof(struct blk_mq_hw_ctx), gfp, node); if (!hctx) goto fail_alloc_hctx; @@ -2550,7 +3720,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, spin_lock_init(&hctx->lock); INIT_LIST_HEAD(&hctx->dispatch); hctx->queue = q; - hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; + hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; INIT_LIST_HEAD(&hctx->hctx_list); @@ -2564,7 +3734,7 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, goto free_cpumask; if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), - gfp, node)) + gfp, node, false, 
false)) goto free_ctxs; hctx->nr_ctx = 0; @@ -2576,8 +3746,6 @@ blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, if (!hctx->fq) goto free_bitmap; - if (hctx->flags & BLK_MQ_F_BLOCKING) - init_srcu_struct(hctx->srcu); blk_mq_hctx_kobj_init(hctx); return hctx; @@ -2619,44 +3787,69 @@ static void blk_mq_init_cpu_queues(struct request_queue *q, for (j = 0; j < set->nr_maps; j++) { hctx = blk_mq_map_queue_type(q, j, i); if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) - hctx->numa_node = local_memory_node(cpu_to_node(i)); + hctx->numa_node = cpu_to_node(i); } } } -static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, - int hctx_idx) +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, + unsigned int depth) { - int ret = 0; + struct blk_mq_tags *tags; + int ret; - set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, - set->queue_depth, set->reserved_tags); - if (!set->tags[hctx_idx]) - return false; + tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); + if (!tags) + return NULL; + + ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); + if (ret) { + blk_mq_free_rq_map(tags); + return NULL; + } + + return tags; +} + +static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + int hctx_idx) +{ + if (blk_mq_is_shared_tags(set->flags)) { + set->tags[hctx_idx] = set->shared_tags; - ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, - set->queue_depth); - if (!ret) return true; + } - blk_mq_free_rq_map(set->tags[hctx_idx]); - set->tags[hctx_idx] = NULL; - return false; + set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, + set->queue_depth); + + return set->tags[hctx_idx]; } -static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, - unsigned int hctx_idx) +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx) { - if (set->tags && set->tags[hctx_idx]) { - 
blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); - blk_mq_free_rq_map(set->tags[hctx_idx]); - set->tags[hctx_idx] = NULL; + if (tags) { + blk_mq_free_rqs(set, tags, hctx_idx); + blk_mq_free_rq_map(tags); } } +static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx) +{ + if (!blk_mq_is_shared_tags(set->flags)) + blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); + + set->tags[hctx_idx] = NULL; +} + static void blk_mq_map_swqueue(struct request_queue *q) { - unsigned int i, j, hctx_idx; + unsigned int j, hctx_idx; + unsigned long i; struct blk_mq_hw_ctx *hctx; struct blk_mq_ctx *ctx; struct blk_mq_tag_set *set = q->tag_set; @@ -2684,7 +3877,7 @@ static void blk_mq_map_swqueue(struct request_queue *q) hctx_idx = set->map[j].mq_map[i]; /* unmapped hw queue can be remapped after CPU topo changed */ if (!set->tags[hctx_idx] && - !__blk_mq_alloc_map_and_request(set, hctx_idx)) { + !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { /* * If tags initialization fail for some hctx, * that hctx won't be brought online. 
In this @@ -2731,8 +3924,8 @@ static void blk_mq_map_swqueue(struct request_queue *q) * fallback in case of a new remap fails * allocation */ - if (i && set->tags[i]) - blk_mq_free_map_and_requests(set, i); + if (i) + __blk_mq_free_map_and_rqs(set, i); hctx->tags = NULL; continue; @@ -2763,18 +3956,20 @@ static void blk_mq_map_swqueue(struct request_queue *q) static void queue_set_hctx_shared(struct request_queue *q, bool shared) { struct blk_mq_hw_ctx *hctx; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) { - if (shared) - hctx->flags |= BLK_MQ_F_TAG_SHARED; - else - hctx->flags &= ~BLK_MQ_F_TAG_SHARED; + if (shared) { + hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; + } else { + blk_mq_tag_idle(hctx); + hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; + } } } -static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, - bool shared) +static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, + bool shared) { struct request_queue *q; @@ -2792,12 +3987,12 @@ static void blk_mq_del_queue_tag_set(struct request_queue *q) struct blk_mq_tag_set *set = q->tag_set; mutex_lock(&set->tag_list_lock); - list_del_rcu(&q->tag_set_list); + list_del(&q->tag_set_list); if (list_is_singular(&set->tag_list)) { /* just transitioned to unshared */ - set->flags &= ~BLK_MQ_F_TAG_SHARED; + set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, false); + blk_mq_update_tag_set_shared(set, false); } mutex_unlock(&set->tag_list_lock); INIT_LIST_HEAD(&q->tag_set_list); @@ -2812,14 +4007,14 @@ static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, * Check to see if we're transitioning to shared (from 1 to 2 queues). 
*/ if (!list_empty(&set->tag_list) && - !(set->flags & BLK_MQ_F_TAG_SHARED)) { - set->flags |= BLK_MQ_F_TAG_SHARED; + !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { + set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; /* update existing queue */ - blk_mq_update_tag_set_depth(set, true); + blk_mq_update_tag_set_shared(set, true); } - if (set->flags & BLK_MQ_F_TAG_SHARED) + if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) queue_set_hctx_shared(q, true); - list_add_tail_rcu(&q->tag_set_list, &set->tag_list); + list_add_tail(&q->tag_set_list, &set->tag_list); mutex_unlock(&set->tag_list_lock); } @@ -2861,7 +4056,7 @@ static int blk_mq_alloc_ctxs(struct request_queue *q) void blk_mq_release(struct request_queue *q) { struct blk_mq_hw_ctx *hctx, *next; - int i; + unsigned long i; queue_for_each_hw_ctx(q, hctx, i) WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); @@ -2872,7 +4067,7 @@ void blk_mq_release(struct request_queue *q) kobject_put(&hctx->kobj); } - kfree(q->queue_hw_ctx); + xa_destroy(&q->hctx_table); /* * release .mq_kobj and sw queue's kobject now because @@ -2881,27 +4076,23 @@ void blk_mq_release(struct request_queue *q) blk_mq_sysfs_deinit(q); } -struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, +static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, void *queuedata) { - struct request_queue *uninit_q, *q; + struct request_queue *q; + int ret; - uninit_q = __blk_alloc_queue(set->numa_node); - if (!uninit_q) + q = blk_alloc_queue(set->numa_node); + if (!q) return ERR_PTR(-ENOMEM); - uninit_q->queuedata = queuedata; - - /* - * Initialize the queue without an elevator. device_add_disk() will do - * the initialization. 
- */ - q = blk_mq_init_allocated_queue(set, uninit_q, false); - if (IS_ERR(q)) - blk_cleanup_queue(uninit_q); - + q->queuedata = queuedata; + ret = blk_mq_init_allocated_queue(set, q); + if (ret) { + blk_put_queue(q); + return ERR_PTR(ret); + } return q; } -EXPORT_SYMBOL_GPL(blk_mq_init_queue_data); struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) { @@ -2909,39 +4100,67 @@ struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) } EXPORT_SYMBOL(blk_mq_init_queue); -/* - * Helper for setting up a queue with mq ops, given queue depth, and - * the passed in mq ops flags. +/** + * blk_mq_destroy_queue - shutdown a request queue + * @q: request queue to shutdown + * + * This shuts down a request queue allocated by blk_mq_init_queue(). All future + * requests will be failed with -ENODEV. The caller is responsible for dropping + * the reference from blk_mq_init_queue() by calling blk_put_queue(). + * + * Context: can sleep */ -struct request_queue *blk_mq_init_sq_queue(struct blk_mq_tag_set *set, - const struct blk_mq_ops *ops, - unsigned int queue_depth, - unsigned int set_flags) +void blk_mq_destroy_queue(struct request_queue *q) { - struct request_queue *q; - int ret; + WARN_ON_ONCE(!queue_is_mq(q)); + WARN_ON_ONCE(blk_queue_registered(q)); - memset(set, 0, sizeof(*set)); - set->ops = ops; - set->nr_hw_queues = 1; - set->nr_maps = 1; - set->queue_depth = queue_depth; - set->numa_node = NUMA_NO_NODE; - set->flags = set_flags; + might_sleep(); - ret = blk_mq_alloc_tag_set(set); - if (ret) - return ERR_PTR(ret); + blk_queue_flag_set(QUEUE_FLAG_DYING, q); + blk_queue_start_drain(q); + blk_mq_freeze_queue_wait(q); + + blk_sync_queue(q); + blk_mq_cancel_work_sync(q); + blk_mq_exit_queue(q); +} +EXPORT_SYMBOL(blk_mq_destroy_queue); + +struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, + struct lock_class_key *lkclass) +{ + struct request_queue *q; + struct gendisk *disk; + + q = blk_mq_init_queue_data(set, 
queuedata); + if (IS_ERR(q)) + return ERR_CAST(q); - q = blk_mq_init_queue(set); - if (IS_ERR(q)) { - blk_mq_free_tag_set(set); - return q; + disk = __alloc_disk_node(q, set->numa_node, lkclass); + if (!disk) { + blk_mq_destroy_queue(q); + blk_put_queue(q); + return ERR_PTR(-ENOMEM); } + set_bit(GD_OWNS_QUEUE, &disk->state); + return disk; +} +EXPORT_SYMBOL(__blk_mq_alloc_disk); - return q; +struct gendisk *blk_mq_alloc_disk_for_queue(struct request_queue *q, + struct lock_class_key *lkclass) +{ + struct gendisk *disk; + + if (!blk_get_queue(q)) + return NULL; + disk = __alloc_disk_node(q, NUMA_NO_NODE, lkclass); + if (!disk) + blk_put_queue(q); + return disk; } -EXPORT_SYMBOL(blk_mq_init_sq_queue); +EXPORT_SYMBOL(blk_mq_alloc_disk_for_queue); static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( struct blk_mq_tag_set *set, struct request_queue *q, @@ -2980,52 +4199,28 @@ static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, struct request_queue *q) { - int i, j, end; - struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; - - if (q->nr_hw_queues < set->nr_hw_queues) { - struct blk_mq_hw_ctx **new_hctxs; - - new_hctxs = kcalloc_node(set->nr_hw_queues, - sizeof(*new_hctxs), GFP_KERNEL, - set->numa_node); - if (!new_hctxs) - return; - if (hctxs) - memcpy(new_hctxs, hctxs, q->nr_hw_queues * - sizeof(*hctxs)); - q->queue_hw_ctx = new_hctxs; - kfree(hctxs); - hctxs = new_hctxs; - } + struct blk_mq_hw_ctx *hctx; + unsigned long i, j; /* protect against switching io scheduler */ mutex_lock(&q->sysfs_lock); for (i = 0; i < set->nr_hw_queues; i++) { - int node; - struct blk_mq_hw_ctx *hctx; + int old_node; + int node = blk_mq_get_hctx_node(set, i); + struct blk_mq_hw_ctx *old_hctx = xa_load(&q->hctx_table, i); - node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); - /* - * If the hw queue has been mapped to another numa node, - * we need to realloc the hctx. 
If allocation fails, fallback - * to use the previous one. - */ - if (hctxs[i] && (hctxs[i]->numa_node == node)) - continue; + if (old_hctx) { + old_node = old_hctx->numa_node; + blk_mq_exit_hctx(q, set, old_hctx, i); + } - hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); - if (hctx) { - if (hctxs[i]) - blk_mq_exit_hctx(q, set, hctxs[i], i); - hctxs[i] = hctx; - } else { - if (hctxs[i]) - pr_warn("Allocate new hctx on node %d fails,\ - fallback to previous one on node %d\n", - node, hctxs[i]->numa_node); - else + if (!blk_mq_alloc_and_init_hctx(set, q, i, node)) { + if (!old_hctx) break; + pr_warn("Allocate new hctx on node %d fails, fallback to previous one on node %d\n", + node, old_node); + hctx = blk_mq_alloc_and_init_hctx(set, q, i, old_node); + WARN_ON_ONCE(!hctx); } } /* @@ -3034,41 +4229,35 @@ static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, */ if (i != set->nr_hw_queues) { j = q->nr_hw_queues; - end = i; } else { j = i; - end = q->nr_hw_queues; q->nr_hw_queues = set->nr_hw_queues; } - for (; j < end; j++) { - struct blk_mq_hw_ctx *hctx = hctxs[j]; - - if (hctx) { - if (hctx->tags) - blk_mq_free_map_and_requests(set, j); - blk_mq_exit_hctx(q, set, hctx, j); - hctxs[j] = NULL; - } - } + xa_for_each_start(&q->hctx_table, j, hctx, j) + blk_mq_exit_hctx(q, set, hctx, j); mutex_unlock(&q->sysfs_lock); } -struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, - struct request_queue *q, - bool elevator_init) +static void blk_mq_update_poll_flag(struct request_queue *q) +{ + struct blk_mq_tag_set *set = q->tag_set; + + if (set->nr_maps > HCTX_TYPE_POLL && + set->map[HCTX_TYPE_POLL].nr_queues) + blk_queue_flag_set(QUEUE_FLAG_POLL, q); + else + blk_queue_flag_clear(QUEUE_FLAG_POLL, q); +} + +int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, + struct request_queue *q) { /* mark the queue as mq asap */ q->mq_ops = set->ops; - q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, - blk_mq_poll_stats_bkt, - 
BLK_MQ_POLL_STATS_BKTS, q); - if (!q->poll_cb) - goto err_exit; - if (blk_mq_alloc_ctxs(q)) - goto err_poll; + goto err_exit; /* init q->mq_kobj and sw queues' kobjects */ blk_mq_sysfs_init(q); @@ -3076,6 +4265,8 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, INIT_LIST_HEAD(&q->unused_hctx_list); spin_lock_init(&q->unused_hctx_lock); + xa_init(&q->hctx_table); + blk_mq_realloc_hw_ctxs(set, q); if (!q->nr_hw_queues) goto err_hctxs; @@ -3086,67 +4277,67 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, q->tag_set = set; q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; - if (set->nr_maps > HCTX_TYPE_POLL && - set->map[HCTX_TYPE_POLL].nr_queues) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - - q->sg_reserved_size = INT_MAX; + blk_mq_update_poll_flag(q); INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); + INIT_LIST_HEAD(&q->flush_list); INIT_LIST_HEAD(&q->requeue_list); spin_lock_init(&q->requeue_lock); q->nr_requests = set->queue_depth; - /* - * Default to classic polling - */ - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - blk_mq_init_cpu_queues(q, set->nr_hw_queues); blk_mq_add_queue_tag_set(set, q); blk_mq_map_swqueue(q); - - if (elevator_init) - elevator_init_mq(q); - - return q; + return 0; err_hctxs: - kfree(q->queue_hw_ctx); - q->nr_hw_queues = 0; - blk_mq_sysfs_deinit(q); -err_poll: - blk_stat_free_callback(q->poll_cb); - q->poll_cb = NULL; + blk_mq_release(q); err_exit: q->mq_ops = NULL; - return ERR_PTR(-ENOMEM); + return -ENOMEM; } EXPORT_SYMBOL(blk_mq_init_allocated_queue); /* tags can _not_ be used after returning from blk_mq_exit_queue */ void blk_mq_exit_queue(struct request_queue *q) { - struct blk_mq_tag_set *set = q->tag_set; + struct blk_mq_tag_set *set = q->tag_set; - blk_mq_del_queue_tag_set(q); + /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); + /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. 
*/ + blk_mq_del_queue_tag_set(q); } static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) { int i; - for (i = 0; i < set->nr_hw_queues; i++) - if (!__blk_mq_alloc_map_and_request(set, i)) + if (blk_mq_is_shared_tags(set->flags)) { + set->shared_tags = blk_mq_alloc_map_and_rqs(set, + BLK_MQ_NO_HCTX_IDX, + set->queue_depth); + if (!set->shared_tags) + return -ENOMEM; + } + + for (i = 0; i < set->nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) goto out_unwind; + cond_resched(); + } return 0; out_unwind: while (--i >= 0) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } return -ENOMEM; } @@ -3156,7 +4347,7 @@ out_unwind: * may reduce the depth asked for, if memory is tight. set->queue_depth * will be updated to reflect the allocated depth. */ -static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) +static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) { unsigned int depth; int err; @@ -3186,7 +4377,7 @@ static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) return 0; } -static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) +static void blk_mq_update_queue_map(struct blk_mq_tag_set *set) { /* * blk_mq_map_queues() and multiple .map_queues() implementations @@ -3216,20 +4407,21 @@ static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) for (i = 0; i < set->nr_maps; i++) blk_mq_clear_mq_map(&set->map[i]); - return set->ops->map_queues(set); + set->ops->map_queues(set); } else { BUG_ON(set->nr_maps > 1); - return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); + blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); } } static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, - int cur_nr_hw_queues, int new_nr_hw_queues) + int new_nr_hw_queues) { struct blk_mq_tags **new_tags; + int i; - if (cur_nr_hw_queues >= new_nr_hw_queues) - return 0; + 
if (set->nr_hw_queues >= new_nr_hw_queues) + goto done; new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), GFP_KERNEL, set->numa_node); @@ -3237,12 +4429,22 @@ static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, return -ENOMEM; if (set->tags) - memcpy(new_tags, set->tags, cur_nr_hw_queues * + memcpy(new_tags, set->tags, set->nr_hw_queues * sizeof(*set->tags)); kfree(set->tags); set->tags = new_tags; - set->nr_hw_queues = new_nr_hw_queues; + for (i = set->nr_hw_queues; i < new_nr_hw_queues; i++) { + if (!__blk_mq_alloc_map_and_rqs(set, i)) { + while (--i >= set->nr_hw_queues) + __blk_mq_free_map_and_rqs(set, i); + return -ENOMEM; + } + cond_resched(); + } + +done: + set->nr_hw_queues = new_nr_hw_queues; return 0; } @@ -3299,10 +4501,22 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) set->nr_hw_queues = nr_cpu_ids; - if (blk_mq_realloc_tag_set_tags(set, 0, set->nr_hw_queues) < 0) - return -ENOMEM; + if (set->flags & BLK_MQ_F_BLOCKING) { + set->srcu = kmalloc(sizeof(*set->srcu), GFP_KERNEL); + if (!set->srcu) + return -ENOMEM; + ret = init_srcu_struct(set->srcu); + if (ret) + goto out_free_srcu; + } ret = -ENOMEM; + set->tags = kcalloc_node(set->nr_hw_queues, + sizeof(struct blk_mq_tags *), GFP_KERNEL, + set->numa_node); + if (!set->tags) + goto out_cleanup_srcu; + for (i = 0; i < set->nr_maps; i++) { set->map[i].mq_map = kcalloc_node(nr_cpu_ids, sizeof(set->map[i].mq_map[0]), @@ -3312,11 +4526,9 @@ int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) set->map[i].nr_queues = is_kdump_kernel() ? 
1 : set->nr_hw_queues; } - ret = blk_mq_update_queue_map(set); - if (ret) - goto out_free_mq_map; + blk_mq_update_queue_map(set); - ret = blk_mq_alloc_map_and_requests(set); + ret = blk_mq_alloc_set_map_and_rqs(set); if (ret) goto out_free_mq_map; @@ -3332,16 +4544,43 @@ out_free_mq_map: } kfree(set->tags); set->tags = NULL; +out_cleanup_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + cleanup_srcu_struct(set->srcu); +out_free_srcu: + if (set->flags & BLK_MQ_F_BLOCKING) + kfree(set->srcu); return ret; } EXPORT_SYMBOL(blk_mq_alloc_tag_set); +/* allocate and initialize a tagset for a simple single-queue device */ +int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, + const struct blk_mq_ops *ops, unsigned int queue_depth, + unsigned int set_flags) +{ + memset(set, 0, sizeof(*set)); + set->ops = ops; + set->nr_hw_queues = 1; + set->nr_maps = 1; + set->queue_depth = queue_depth; + set->numa_node = NUMA_NO_NODE; + set->flags = set_flags; + return blk_mq_alloc_tag_set(set); +} +EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); + void blk_mq_free_tag_set(struct blk_mq_tag_set *set) { int i, j; for (i = 0; i < set->nr_hw_queues; i++) - blk_mq_free_map_and_requests(set, i); + __blk_mq_free_map_and_rqs(set, i); + + if (blk_mq_is_shared_tags(set->flags)) { + blk_mq_free_map_and_rqs(set, set->shared_tags, + BLK_MQ_NO_HCTX_IDX); + } for (j = 0; j < set->nr_maps; j++) { kfree(set->map[j].mq_map); @@ -3350,6 +4589,10 @@ void blk_mq_free_tag_set(struct blk_mq_tag_set *set) kfree(set->tags); set->tags = NULL; + if (set->flags & BLK_MQ_F_BLOCKING) { + cleanup_srcu_struct(set->srcu); + kfree(set->srcu); + } } EXPORT_SYMBOL(blk_mq_free_tag_set); @@ -3357,7 +4600,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) { struct blk_mq_tag_set *set = q->tag_set; struct blk_mq_hw_ctx *hctx; - int i, ret; + int ret; + unsigned long i; if (!set) return -EINVAL; @@ -3376,21 +4620,27 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) * If we're using an 
MQ scheduler, just update the scheduler * queue depth. This is similar to what the old code would do. */ - if (!hctx->sched_tags) { - ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, - false); - } else { + if (hctx->sched_tags) { ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, - nr, true); + nr, true); + } else { + ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, + false); } if (ret) break; if (q->elevator && q->elevator->type->ops.depth_updated) q->elevator->type->ops.depth_updated(hctx); } - - if (!ret) + if (!ret) { q->nr_requests = nr; + if (blk_mq_is_shared_tags(set->flags)) { + if (q->elevator) + blk_mq_tag_update_sched_shared_tags(q); + else + blk_mq_tag_resize_shared_tags(set, nr); + } + } blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); @@ -3418,53 +4668,61 @@ static bool blk_mq_elv_switch_none(struct list_head *head, { struct blk_mq_qe_pair *qe; - if (!q->elevator) - return true; - qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); if (!qe) return false; + /* q->elevator needs protection from ->sysfs_lock */ + mutex_lock(&q->sysfs_lock); + + /* the check has to be done with holding sysfs_lock */ + if (!q->elevator) { + kfree(qe); + goto unlock; + } + INIT_LIST_HEAD(&qe->node); qe->q = q; qe->type = q->elevator->type; + /* keep a reference to the elevator module as we'll switch back */ + __elevator_get(qe->type); list_add(&qe->node, head); - - mutex_lock(&q->sysfs_lock); - /* - * After elevator_switch_mq, the previous elevator_queue will be - * released by elevator_release. The reference of the io scheduler - * module get by elevator_get will also be put. So we need to get - * a reference of the io scheduler module here to prevent it to be - * removed. 
- */ - __module_get(qe->type->elevator_owner); - elevator_switch_mq(q, NULL); + elevator_disable(q); +unlock: mutex_unlock(&q->sysfs_lock); return true; } -static void blk_mq_elv_switch_back(struct list_head *head, - struct request_queue *q) +static struct blk_mq_qe_pair *blk_lookup_qe_pair(struct list_head *head, + struct request_queue *q) { struct blk_mq_qe_pair *qe; - struct elevator_type *t = NULL; list_for_each_entry(qe, head, node) - if (qe->q == q) { - t = qe->type; - break; - } + if (qe->q == q) + return qe; - if (!t) - return; + return NULL; +} + +static void blk_mq_elv_switch_back(struct list_head *head, + struct request_queue *q) +{ + struct blk_mq_qe_pair *qe; + struct elevator_type *t; + qe = blk_lookup_qe_pair(head, q); + if (!qe) + return; + t = qe->type; list_del(&qe->node); kfree(qe); mutex_lock(&q->sysfs_lock); - elevator_switch_mq(q, t); + elevator_switch(q, t); + /* drop the reference acquired in blk_mq_elv_switch_none */ + elevator_put(t); mutex_unlock(&q->sysfs_lock); } @@ -3473,7 +4731,8 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, { struct request_queue *q; LIST_HEAD(head); - int prev_nr_hw_queues; + int prev_nr_hw_queues = set->nr_hw_queues; + int i; lockdep_assert_held(&set->tag_list_lock); @@ -3497,24 +4756,26 @@ static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_debugfs_unregister_hctxs(q); - blk_mq_sysfs_unregister(q); + blk_mq_sysfs_unregister_hctxs(q); } - prev_nr_hw_queues = set->nr_hw_queues; - if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < - 0) + if (blk_mq_realloc_tag_set_tags(set, nr_hw_queues) < 0) goto reregister; - set->nr_hw_queues = nr_hw_queues; fallback: blk_mq_update_queue_map(set); list_for_each_entry(q, &set->tag_list, tag_set_list) { blk_mq_realloc_hw_ctxs(set, q); + blk_mq_update_poll_flag(q); if (q->nr_hw_queues != set->nr_hw_queues) { + int i = prev_nr_hw_queues; + pr_warn("Increasing 
nr_hw_queues to %d fails, fallback to %d\n", nr_hw_queues, prev_nr_hw_queues); + for (; i < set->nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); + set->nr_hw_queues = prev_nr_hw_queues; - blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); goto fallback; } blk_mq_map_swqueue(q); @@ -3522,7 +4783,7 @@ fallback: reregister: list_for_each_entry(q, &set->tag_list, tag_set_list) { - blk_mq_sysfs_register(q); + blk_mq_sysfs_register_hctxs(q); blk_mq_debugfs_register_hctxs(q); } @@ -3532,6 +4793,10 @@ switch_back: list_for_each_entry(q, &set->tag_list, tag_set_list) blk_mq_unfreeze_queue(q); + + /* Free the excess tags when nr_hw_queues shrink. */ + for (i = set->nr_hw_queues; i < prev_nr_hw_queues; i++) + __blk_mq_free_map_and_rqs(set, i); } void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) @@ -3542,215 +4807,58 @@ void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) } EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); -/* Enable polling stats and return whether they were already enabled. */ -static bool blk_poll_stats_enable(struct request_queue *q) -{ - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) - return true; - blk_stat_add_callback(q, q->poll_cb); - return false; -} - -static void blk_mq_poll_stats_start(struct request_queue *q) -{ - /* - * We don't arm the callback if polling stats are not enabled or the - * callback is already active. 
- */ - if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || - blk_stat_is_active(q->poll_cb)) - return; - - blk_stat_activate_msecs(q->poll_cb, 100); -} - -static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) -{ - struct request_queue *q = cb->data; - int bucket; - - for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { - if (cb->stat[bucket].nr_samples) - q->poll_stat[bucket] = cb->stat[bucket]; - } -} - -static unsigned long blk_mq_poll_nsecs(struct request_queue *q, - struct request *rq) -{ - unsigned long ret = 0; - int bucket; - - /* - * If stats collection isn't on, don't sleep but turn it on for - * future users - */ - if (!blk_poll_stats_enable(q)) - return 0; - - /* - * As an optimistic guess, use half of the mean service time - * for this type of request. We can (and should) make this smarter. - * For instance, if the completion latencies are tight, we can - * get closer than just half the mean. This is especially - * important on devices where the completion latencies are longer - * than ~10 usec. We do use the stats for the relevant IO size - * if available which does lead to better estimates. - */ - bucket = blk_mq_poll_stats_bkt(rq); - if (bucket < 0) - return ret; - - if (q->poll_stat[bucket].nr_samples) - ret = (q->poll_stat[bucket].mean + 1) / 2; - - return ret; -} - -static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, - struct request *rq) +static int blk_hctx_poll(struct request_queue *q, struct blk_mq_hw_ctx *hctx, + struct io_comp_batch *iob, unsigned int flags) { - struct hrtimer_sleeper hs; - enum hrtimer_mode mode; - unsigned int nsecs; - ktime_t kt; - - if (rq->rq_flags & RQF_MQ_POLL_SLEPT) - return false; - - /* - * If we get here, hybrid polling is enabled. 
Hence poll_nsec can be: - * - * 0: use half of prev avg - * >0: use this specific value - */ - if (q->poll_nsec > 0) - nsecs = q->poll_nsec; - else - nsecs = blk_mq_poll_nsecs(q, rq); - - if (!nsecs) - return false; - - rq->rq_flags |= RQF_MQ_POLL_SLEPT; + long state = get_current_state(); + int ret; - /* - * This will be replaced with the stats tracking code, using - * 'avg_completion_time / 2' as the pre-sleep target. - */ - kt = nsecs; + do { + ret = q->mq_ops->poll(hctx, iob); + if (ret > 0) { + __set_current_state(TASK_RUNNING); + return ret; + } - mode = HRTIMER_MODE_REL; - hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); - hrtimer_set_expires(&hs.timer, kt); + if (signal_pending_state(state, current)) + __set_current_state(TASK_RUNNING); + if (task_is_running(current)) + return 1; - do { - if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) + if (ret < 0 || (flags & BLK_POLL_ONESHOT)) break; - set_current_state(TASK_UNINTERRUPTIBLE); - hrtimer_sleeper_start_expires(&hs, mode); - if (hs.task) - io_schedule(); - hrtimer_cancel(&hs.timer); - mode = HRTIMER_MODE_ABS; - } while (hs.task && !signal_pending(current)); + cpu_relax(); + } while (!need_resched()); __set_current_state(TASK_RUNNING); - destroy_hrtimer_on_stack(&hs.timer); - return true; + return 0; } -static bool blk_mq_poll_hybrid(struct request_queue *q, - struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, + struct io_comp_batch *iob, unsigned int flags) { - struct request *rq; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - return false; + struct blk_mq_hw_ctx *hctx = xa_load(&q->hctx_table, cookie); - if (!blk_qc_t_is_internal(cookie)) - rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); - else { - rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); - /* - * With scheduling, if the request has completed, we'll - * get a NULL return here, as we clear the sched tag when - * that happens. 
The request still remains valid, like always, - * so we should be safe with just the NULL check. - */ - if (!rq) - return false; - } - - return blk_mq_poll_hybrid_sleep(q, rq); + return blk_hctx_poll(q, hctx, iob, flags); } -/** - * blk_poll - poll for IO completions - * @q: the queue - * @cookie: cookie passed back at IO submission time - * @spin: whether to spin for completions - * - * Description: - * Poll for completions on the passed in queue. Returns number of - * completed entries found. If @spin is true, then blk_poll will continue - * looping until at least one completion is found, unless the task is - * otherwise marked running (or we need to reschedule). - */ -int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) +int blk_rq_poll(struct request *rq, struct io_comp_batch *iob, + unsigned int poll_flags) { - struct blk_mq_hw_ctx *hctx; - long state; + struct request_queue *q = rq->q; + int ret; - if (!blk_qc_t_valid(cookie) || - !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) + if (!blk_rq_is_poll(rq)) + return 0; + if (!percpu_ref_tryget(&q->q_usage_counter)) return 0; - if (current->plug) - blk_flush_plug_list(current->plug, false); - - hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; - - /* - * If we sleep, have the caller restart the poll loop to reset - * the state. Like for the other success return cases, the - * caller is responsible for checking if the IO completed. If - * the IO isn't complete, we'll get called again and will go - * straight to the busy poll loop. 
- */ - if (blk_mq_poll_hybrid(q, hctx, cookie)) - return 1; - - hctx->poll_considered++; - - state = current->state; - do { - int ret; - - hctx->poll_invoked++; - - ret = q->mq_ops->poll(hctx); - if (ret > 0) { - hctx->poll_success++; - __set_current_state(TASK_RUNNING); - return ret; - } - - if (signal_pending_state(state, current)) - __set_current_state(TASK_RUNNING); - - if (current->state == TASK_RUNNING) - return 1; - if (ret < 0 || !spin) - break; - cpu_relax(); - } while (!need_resched()); + ret = blk_hctx_poll(q, rq->mq_hctx, iob, poll_flags); + blk_queue_exit(q); - __set_current_state(TASK_RUNNING); - return 0; + return ret; } -EXPORT_SYMBOL_GPL(blk_poll); +EXPORT_SYMBOL_GPL(blk_rq_poll); unsigned int blk_mq_rq_cpu(struct request *rq) { @@ -3758,8 +4866,31 @@ unsigned int blk_mq_rq_cpu(struct request *rq) } EXPORT_SYMBOL(blk_mq_rq_cpu); +void blk_mq_cancel_work_sync(struct request_queue *q) +{ + struct blk_mq_hw_ctx *hctx; + unsigned long i; + + cancel_delayed_work_sync(&q->requeue_work); + + queue_for_each_hw_ctx(q, hctx, i) + cancel_delayed_work_sync(&hctx->run_work); +} + static int __init blk_mq_init(void) { + int i; + + for_each_possible_cpu(i) + init_llist_head(&per_cpu(blk_cpu_done, i)); + for_each_possible_cpu(i) + INIT_CSD(&per_cpu(blk_cpu_csd, i), + __blk_mq_complete_request_remote, NULL); + open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); + + cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, + "block/softirq:dead", NULL, + blk_softirq_cpu_dead); cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, blk_mq_hctx_notify_dead); cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", diff --git a/block/blk-mq.h b/block/blk-mq.h index b3ce0f3a2ad2..f75a9ecfebde 100644 --- a/block/blk-mq.h +++ b/block/blk-mq.h @@ -2,8 +2,8 @@ #ifndef INT_BLK_MQ_H #define INT_BLK_MQ_H +#include <linux/blk-mq.h> #include "blk-stat.h" -#include "blk-mq-tag.h" struct blk_mq_tag_set; @@ -25,27 +25,32 @@ struct blk_mq_ctx { unsigned short 
index_hw[HCTX_MAX_TYPES]; struct blk_mq_hw_ctx *hctxs[HCTX_MAX_TYPES]; - /* incremented at dispatch time */ - unsigned long rq_dispatched[2]; - unsigned long rq_merged; - - /* incremented at completion time */ - unsigned long ____cacheline_aligned_in_smp rq_completed[2]; - struct request_queue *queue; struct blk_mq_ctxs *ctxs; struct kobject kobj; } ____cacheline_aligned_in_smp; +enum { + BLK_MQ_NO_TAG = -1U, + BLK_MQ_TAG_MIN = 1, + BLK_MQ_TAG_MAX = BLK_MQ_NO_TAG - 1, +}; + +typedef unsigned int __bitwise blk_insert_t; +#define BLK_MQ_INSERT_AT_HEAD ((__force blk_insert_t)0x01) + +void blk_mq_submit_bio(struct bio *bio); +int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, + unsigned int flags); void blk_mq_exit_queue(struct request_queue *q); int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr); void blk_mq_wake_waiters(struct request_queue *q); -bool blk_mq_dispatch_rq_list(struct request_queue *, struct list_head *, bool); -void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, - bool kick_requeue_list); +bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *, + unsigned int); void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list); struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *start); +void blk_mq_put_rq_ref(struct request *rq); /* * Internal helpers for allocating/freeing the request map @@ -53,27 +58,11 @@ struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, unsigned int hctx_idx); void blk_mq_free_rq_map(struct blk_mq_tags *tags); -struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, - unsigned int hctx_idx, - unsigned int nr_tags, - unsigned int reserved_tags); -int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, - unsigned int hctx_idx, unsigned int depth); - -/* - * Internal helpers 
for request insertion into sw queues - */ -void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head); -void blk_mq_request_bypass_insert(struct request *rq, bool at_head, - bool run_queue); -void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, - struct list_head *list); - -/* Used by blk_insert_cloned_request() to issue request directly */ -blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last); -void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, - struct list_head *list); +struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, + unsigned int hctx_idx, unsigned int depth); +void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, + struct blk_mq_tags *tags, + unsigned int hctx_idx); /* * CPU -> queue mappings @@ -90,30 +79,34 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue_type(struct request_queue * enum hctx_type type, unsigned int cpu) { - return q->queue_hw_ctx[q->tag_set->map[type].mq_map[cpu]]; + return xa_load(&q->hctx_table, q->tag_set->map[type].mq_map[cpu]); } -/* - * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue - * @q: request queue - * @flags: request command flags - * @cpu: cpu ctx - */ -static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, - unsigned int flags, - struct blk_mq_ctx *ctx) +static inline enum hctx_type blk_mq_get_hctx_type(blk_opf_t opf) { enum hctx_type type = HCTX_TYPE_DEFAULT; /* - * The caller ensure that if REQ_HIPRI, poll must be enabled. + * The caller ensure that if REQ_POLLED, poll must be enabled. 
*/ - if (flags & REQ_HIPRI) + if (opf & REQ_POLLED) type = HCTX_TYPE_POLL; - else if ((flags & REQ_OP_MASK) == REQ_OP_READ) + else if ((opf & REQ_OP_MASK) == REQ_OP_READ) type = HCTX_TYPE_READ; - - return ctx->hctxs[type]; + return type; +} + +/* + * blk_mq_map_queue() - map (cmd_flags,type) to hardware queue + * @q: request queue + * @opf: operation type (REQ_OP_*) and flags (e.g. REQ_POLLED). + * @ctx: software queue cpu ctx + */ +static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, + blk_opf_t opf, + struct blk_mq_ctx *ctx) +{ + return ctx->hctxs[blk_mq_get_hctx_type(opf)]; } /* @@ -121,10 +114,15 @@ static inline struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, */ extern void blk_mq_sysfs_init(struct request_queue *q); extern void blk_mq_sysfs_deinit(struct request_queue *q); -extern int __blk_mq_register_dev(struct device *dev, struct request_queue *q); -extern int blk_mq_sysfs_register(struct request_queue *q); -extern void blk_mq_sysfs_unregister(struct request_queue *q); +int blk_mq_sysfs_register(struct gendisk *disk); +void blk_mq_sysfs_unregister(struct gendisk *disk); +int blk_mq_sysfs_register_hctxs(struct request_queue *q); +void blk_mq_sysfs_unregister_hctxs(struct request_queue *q); extern void blk_mq_hctx_kobj_init(struct blk_mq_hw_ctx *hctx); +void blk_mq_free_plug_rqs(struct blk_plug *plug); +void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule); + +void blk_mq_cancel_work_sync(struct request_queue *q); void blk_mq_release(struct request_queue *q); @@ -150,18 +148,81 @@ struct blk_mq_alloc_data { struct request_queue *q; blk_mq_req_flags_t flags; unsigned int shallow_depth; - unsigned int cmd_flags; + blk_opf_t cmd_flags; + req_flags_t rq_flags; + + /* allocate multiple requests/tags in one go */ + unsigned int nr_tags; + struct request **cached_rq; /* input & output parameter */ struct blk_mq_ctx *ctx; struct blk_mq_hw_ctx *hctx; }; +struct blk_mq_tags *blk_mq_init_tags(unsigned int 
nr_tags, + unsigned int reserved_tags, int node, int alloc_policy); +void blk_mq_free_tags(struct blk_mq_tags *tags); +int blk_mq_init_bitmaps(struct sbitmap_queue *bitmap_tags, + struct sbitmap_queue *breserved_tags, unsigned int queue_depth, + unsigned int reserved, int node, int alloc_policy); + +unsigned int blk_mq_get_tag(struct blk_mq_alloc_data *data); +unsigned long blk_mq_get_tags(struct blk_mq_alloc_data *data, int nr_tags, + unsigned int *offset); +void blk_mq_put_tag(struct blk_mq_tags *tags, struct blk_mq_ctx *ctx, + unsigned int tag); +void blk_mq_put_tags(struct blk_mq_tags *tags, int *tag_array, int nr_tags); +int blk_mq_tag_update_depth(struct blk_mq_hw_ctx *hctx, + struct blk_mq_tags **tags, unsigned int depth, bool can_grow); +void blk_mq_tag_resize_shared_tags(struct blk_mq_tag_set *set, + unsigned int size); +void blk_mq_tag_update_sched_shared_tags(struct request_queue *q); + +void blk_mq_tag_wakeup_all(struct blk_mq_tags *tags, bool); +void blk_mq_queue_tag_busy_iter(struct request_queue *q, busy_tag_iter_fn *fn, + void *priv); +void blk_mq_all_tag_iter(struct blk_mq_tags *tags, busy_tag_iter_fn *fn, + void *priv); + +static inline struct sbq_wait_state *bt_wait_ptr(struct sbitmap_queue *bt, + struct blk_mq_hw_ctx *hctx) +{ + if (!hctx) + return &bt->ws[0]; + return sbq_wait_ptr(bt, &hctx->wait_index); +} + +void __blk_mq_tag_busy(struct blk_mq_hw_ctx *); +void __blk_mq_tag_idle(struct blk_mq_hw_ctx *); + +static inline void blk_mq_tag_busy(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_busy(hctx); +} + +static inline void blk_mq_tag_idle(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_tag_idle(hctx); +} + +static inline bool blk_mq_tag_is_reserved(struct blk_mq_tags *tags, + unsigned int tag) +{ + return tag < tags->nr_reserved_tags; +} + +static inline bool blk_mq_is_shared_tags(unsigned int flags) +{ + return flags & BLK_MQ_F_TAG_HCTX_SHARED; +} + static 
inline struct blk_mq_tags *blk_mq_tags_from_data(struct blk_mq_alloc_data *data) { - if (data->flags & BLK_MQ_REQ_INTERNAL) + if (data->rq_flags & RQF_SCHED_TAGS) return data->hctx->sched_tags; - return data->hctx->tags; } @@ -175,37 +236,107 @@ static inline bool blk_mq_hw_queue_mapped(struct blk_mq_hw_ctx *hctx) return hctx->nr_ctx && hctx->tags; } -unsigned int blk_mq_in_flight(struct request_queue *q, struct hd_struct *part); -void blk_mq_in_flight_rw(struct request_queue *q, struct hd_struct *part, - unsigned int inflight[2]); +unsigned int blk_mq_in_flight(struct request_queue *q, + struct block_device *part); +void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, + unsigned int inflight[2]); -static inline void blk_mq_put_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline void blk_mq_put_dispatch_budget(struct request_queue *q, + int budget_token) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->put_budget) - q->mq_ops->put_budget(hctx); + q->mq_ops->put_budget(q, budget_token); } -static inline bool blk_mq_get_dispatch_budget(struct blk_mq_hw_ctx *hctx) +static inline int blk_mq_get_dispatch_budget(struct request_queue *q) { - struct request_queue *q = hctx->queue; - if (q->mq_ops->get_budget) - return q->mq_ops->get_budget(hctx); - return true; + return q->mq_ops->get_budget(q); + return 0; +} + +static inline void blk_mq_set_rq_budget_token(struct request *rq, int token) +{ + if (token < 0) + return; + + if (rq->q->mq_ops->set_rq_budget_token) + rq->q->mq_ops->set_rq_budget_token(rq, token); +} + +static inline int blk_mq_get_rq_budget_token(struct request *rq) +{ + if (rq->q->mq_ops->get_rq_budget_token) + return rq->q->mq_ops->get_rq_budget_token(rq); + return -1; +} + +static inline void __blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_add(val, &hctx->queue->nr_active_requests_shared_tags); + else + atomic_add(val, &hctx->nr_active); } 
+static inline void __blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_add_active_requests(hctx, 1); +} + +static inline void __blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (blk_mq_is_shared_tags(hctx->flags)) + atomic_sub(val, &hctx->queue->nr_active_requests_shared_tags); + else + atomic_sub(val, &hctx->nr_active); +} + +static inline void __blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + __blk_mq_sub_active_requests(hctx, 1); +} + +static inline void blk_mq_add_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_add_active_requests(hctx, val); +} + +static inline void blk_mq_inc_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_inc_active_requests(hctx); +} + +static inline void blk_mq_sub_active_requests(struct blk_mq_hw_ctx *hctx, + int val) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_sub_active_requests(hctx, val); +} + +static inline void blk_mq_dec_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) + __blk_mq_dec_active_requests(hctx); +} + +static inline int __blk_mq_active_requests(struct blk_mq_hw_ctx *hctx) +{ + if (blk_mq_is_shared_tags(hctx->flags)) + return atomic_read(&hctx->queue->nr_active_requests_shared_tags); + return atomic_read(&hctx->nr_active); +} static inline void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, struct request *rq) { + blk_mq_dec_active_requests(hctx); blk_mq_put_tag(hctx->tags, rq->mq_ctx, rq->tag); rq->tag = BLK_MQ_NO_TAG; - - if (rq->rq_flags & RQF_MQ_INFLIGHT) { - rq->rq_flags &= ~RQF_MQ_INFLIGHT; - atomic_dec(&hctx->nr_active); - } } static inline void blk_mq_put_driver_tag(struct request *rq) @@ -216,6 +347,16 @@ static inline void blk_mq_put_driver_tag(struct request *rq) __blk_mq_put_driver_tag(rq->mq_hctx, rq); } +bool __blk_mq_alloc_driver_tag(struct request *rq); + +static 
inline bool blk_mq_get_driver_tag(struct request *rq) +{ + if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_alloc_driver_tag(rq)) + return false; + + return true; +} + static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) { int cpu; @@ -226,7 +367,6 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) /* * blk_mq_plug() - Get caller context plug - * @q: request queue * @bio : the bio being submitted by the caller context * * Plugging, by design, may delay the insertion of BIOs into the elevator in @@ -237,23 +377,94 @@ static inline void blk_mq_clear_mq_map(struct blk_mq_queue_map *qmap) * order. While this is not a problem with regular block devices, this ordering * change can cause write BIO failures with zoned block devices as these * require sequential write patterns to zones. Prevent this from happening by - * ignoring the plug state of a BIO issuing context if the target request queue - * is for a zoned block device and the BIO to plug is a write operation. + * ignoring the plug state of a BIO issuing context if it is for a zoned block + * device and the BIO to plug is a write operation. * * Return current->plug if the bio can be plugged and NULL otherwise */ -static inline struct blk_plug *blk_mq_plug(struct request_queue *q, - struct bio *bio) +static inline struct blk_plug *blk_mq_plug( struct bio *bio) { + /* Zoned block device write operation case: do not plug the BIO */ + if (IS_ENABLED(CONFIG_BLK_DEV_ZONED) && + bdev_op_is_zoned_write(bio->bi_bdev, bio_op(bio))) + return NULL; + /* * For regular block devices or read operations, use the context plug * which may be NULL if blk_start_plug() was not executed. 
*/ - if (!blk_queue_is_zoned(q) || !op_is_write(bio_op(bio))) - return current->plug; + return current->plug; +} - /* Zoned block device write operation case: do not plug the BIO */ - return NULL; +/* Free all requests on the list */ +static inline void blk_mq_free_requests(struct list_head *list) +{ + while (!list_empty(list)) { + struct request *rq = list_entry_rq(list->next); + + list_del_init(&rq->queuelist); + blk_mq_free_request(rq); + } +} + +/* + * For shared tag users, we track the number of currently active users + * and attempt to provide a fair share of the tag depth for each of them. + */ +static inline bool hctx_may_queue(struct blk_mq_hw_ctx *hctx, + struct sbitmap_queue *bt) +{ + unsigned int depth, users; + + if (!hctx || !(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) + return true; + + /* + * Don't try dividing an ant + */ + if (bt->sb.depth == 1) + return true; + + if (blk_mq_is_shared_tags(hctx->flags)) { + struct request_queue *q = hctx->queue; + + if (!test_bit(QUEUE_FLAG_HCTX_ACTIVE, &q->queue_flags)) + return true; + } else { + if (!test_bit(BLK_MQ_S_TAG_ACTIVE, &hctx->state)) + return true; + } + + users = READ_ONCE(hctx->tags->active_queues); + if (!users) + return true; + + /* + * Allow at least some tags + */ + depth = max((bt->sb.depth + users - 1) / users, 4U); + return __blk_mq_active_requests(hctx) < depth; } +/* run the code block in @dispatch_ops with rcu/srcu read lock held */ +#define __blk_mq_run_dispatch_ops(q, check_sleep, dispatch_ops) \ +do { \ + if ((q)->tag_set->flags & BLK_MQ_F_BLOCKING) { \ + struct blk_mq_tag_set *__tag_set = (q)->tag_set; \ + int srcu_idx; \ + \ + might_sleep_if(check_sleep); \ + srcu_idx = srcu_read_lock(__tag_set->srcu); \ + (dispatch_ops); \ + srcu_read_unlock(__tag_set->srcu, srcu_idx); \ + } else { \ + rcu_read_lock(); \ + (dispatch_ops); \ + rcu_read_unlock(); \ + } \ +} while (0) + +#define blk_mq_run_dispatch_ops(q, dispatch_ops) \ + __blk_mq_run_dispatch_ops(q, true, dispatch_ops) \ + #endif diff 
--git a/block/blk-pm.c b/block/blk-pm.c index 1adc1cd748b4..42e842074715 100644 --- a/block/blk-pm.c +++ b/block/blk-pm.c @@ -1,11 +1,9 @@ // SPDX-License-Identifier: GPL-2.0 -#include <linux/blk-mq.h> #include <linux/blk-pm.h> #include <linux/blkdev.h> #include <linux/pm_runtime.h> #include "blk-mq.h" -#include "blk-mq-tag.h" /** * blk_pm_runtime_init - Block layer runtime PM initialization routine @@ -67,6 +65,10 @@ int blk_pre_runtime_suspend(struct request_queue *q) WARN_ON_ONCE(q->rpm_status != RPM_ACTIVE); + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_SUSPENDING; + spin_unlock_irq(&q->queue_lock); + /* * Increase the pm_only counter before checking whether any * non-PM blk_queue_enter() calls are in progress to avoid that any @@ -89,15 +91,14 @@ int blk_pre_runtime_suspend(struct request_queue *q) /* Switch q_usage_counter back to per-cpu mode. */ blk_mq_unfreeze_queue(q); - spin_lock_irq(&q->queue_lock); - if (ret < 0) + if (ret < 0) { + spin_lock_irq(&q->queue_lock); + q->rpm_status = RPM_ACTIVE; pm_runtime_mark_last_busy(q->dev); - else - q->rpm_status = RPM_SUSPENDING; - spin_unlock_irq(&q->queue_lock); + spin_unlock_irq(&q->queue_lock); - if (ret) blk_clear_pm_only(q); + } return ret; } @@ -160,59 +161,31 @@ EXPORT_SYMBOL(blk_pre_runtime_resume); /** * blk_post_runtime_resume - Post runtime resume processing * @q: the queue of the device - * @err: return value of the device's runtime_resume function * * Description: - * Update the queue's runtime status according to the return value of the - * device's runtime_resume function. If it is successfully resumed, process - * the requests that are queued into the device's queue when it is resuming - * and then mark last busy and initiate autosuspend for it. + * Restart the queue of a runtime suspended device. It does this regardless + * of whether the device's runtime-resume succeeded; even if it failed the + * driver or error handler will need to communicate with the device. 
* * This function should be called near the end of the device's - * runtime_resume callback. + * runtime_resume callback to correct queue runtime PM status and re-enable + * peeking requests from the queue. */ -void blk_post_runtime_resume(struct request_queue *q, int err) +void blk_post_runtime_resume(struct request_queue *q) { + int old_status; + if (!q->dev) return; spin_lock_irq(&q->queue_lock); - if (!err) { - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - } else { - q->rpm_status = RPM_SUSPENDED; - } + old_status = q->rpm_status; + q->rpm_status = RPM_ACTIVE; + pm_runtime_mark_last_busy(q->dev); + pm_request_autosuspend(q->dev); spin_unlock_irq(&q->queue_lock); - if (!err) + if (old_status != RPM_ACTIVE) blk_clear_pm_only(q); } EXPORT_SYMBOL(blk_post_runtime_resume); - -/** - * blk_set_runtime_active - Force runtime status of the queue to be active - * @q: the queue of the device - * - * If the device is left runtime suspended during system suspend the resume - * hook typically resumes the device and corrects runtime status - * accordingly. However, that does not affect the queue runtime PM status - * which is still "suspended". This prevents processing requests from the - * queue. - * - * This function can be used in driver's resume hook to correct queue - * runtime PM status and re-enable peeking requests from the queue. It - * should be called before first request is added to the queue. 
- */ -void blk_set_runtime_active(struct request_queue *q) -{ - if (q->dev) { - spin_lock_irq(&q->queue_lock); - q->rpm_status = RPM_ACTIVE; - pm_runtime_mark_last_busy(q->dev); - pm_request_autosuspend(q->dev); - spin_unlock_irq(&q->queue_lock); - } -} -EXPORT_SYMBOL(blk_set_runtime_active); diff --git a/block/blk-pm.h b/block/blk-pm.h index ea5507d23e75..8a5a0d4b357f 100644 --- a/block/blk-pm.h +++ b/block/blk-pm.h @@ -6,11 +6,14 @@ #include <linux/pm_runtime.h> #ifdef CONFIG_PM -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { - if (q->dev && (q->rpm_status == RPM_SUSPENDED || - q->rpm_status == RPM_SUSPENDING)) - pm_request_resume(q->dev); + if (!q->dev || !blk_queue_pm_only(q)) + return 1; /* Nothing to do */ + if (pm && q->rpm_status != RPM_SUSPENDED) + return 1; /* Request allowed */ + pm_request_resume(q->dev); + return 0; } static inline void blk_pm_mark_last_busy(struct request *rq) @@ -18,52 +21,15 @@ static inline void blk_pm_mark_last_busy(struct request *rq) if (rq->q->dev && !(rq->rq_flags & RQF_PM)) pm_runtime_mark_last_busy(rq->q->dev); } - -static inline void blk_pm_requeue_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - rq->q->nr_pending--; -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ - lockdep_assert_held(&q->queue_lock); - - if (q->dev && !(rq->rq_flags & RQF_PM)) - q->nr_pending++; -} - -static inline void blk_pm_put_request(struct request *rq) -{ - lockdep_assert_held(&rq->q->queue_lock); - - if (rq->q->dev && !(rq->rq_flags & RQF_PM)) - --rq->q->nr_pending; -} #else -static inline void blk_pm_request_resume(struct request_queue *q) +static inline int blk_pm_resume_queue(const bool pm, struct request_queue *q) { + return 1; } static inline void blk_pm_mark_last_busy(struct request *rq) { } - -static inline void 
blk_pm_requeue_request(struct request *rq) -{ -} - -static inline void blk_pm_add_request(struct request_queue *q, - struct request *rq) -{ -} - -static inline void blk_pm_put_request(struct request *rq) -{ -} #endif #endif /* _BLOCK_BLK_PM_H_ */ diff --git a/block/blk-rq-qos.c b/block/blk-rq-qos.c index 656460636ad3..dd7310c94713 100644 --- a/block/blk-rq-qos.c +++ b/block/blk-rq-qos.c @@ -10,16 +10,10 @@ static bool atomic_inc_below(atomic_t *v, unsigned int below) { unsigned int cur = atomic_read(v); - for (;;) { - unsigned int old; - + do { if (cur >= below) return false; - old = atomic_cmpxchg(v, cur, cur + 1); - if (old == cur) - break; - cur = old; - } + } while (!atomic_try_cmpxchg(v, &cur, cur + 1)); return true; } @@ -266,8 +260,8 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, if (!has_sleeper && acquire_inflight_cb(rqw, private_data)) return; - prepare_to_wait_exclusive(&rqw->wait, &data.wq, TASK_UNINTERRUPTIBLE); - has_sleeper = !wq_has_single_sleeper(&rqw->wait); + has_sleeper = !prepare_to_wait_exclusive(&rqw->wait, &data.wq, + TASK_UNINTERRUPTIBLE); do { /* The memory barrier in set_task_state saves us here. */ if (data.got_token) @@ -276,7 +270,7 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, finish_wait(&rqw->wait, &data.wq); /* - * We raced with wbt_wake_function() getting a token, + * We raced with rq_qos_wake_function() getting a token, * which means we now have two. Put our local token * and wake anyone else potentially waiting for one. 
*/ @@ -294,11 +288,68 @@ void rq_qos_wait(struct rq_wait *rqw, void *private_data, void rq_qos_exit(struct request_queue *q) { - blk_mq_debugfs_unregister_queue_rqos(q); - + mutex_lock(&q->rq_qos_mutex); while (q->rq_qos) { struct rq_qos *rqos = q->rq_qos; q->rq_qos = rqos->next; rqos->ops->exit(rqos); } + mutex_unlock(&q->rq_qos_mutex); +} + +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops) +{ + struct request_queue *q = disk->queue; + + lockdep_assert_held(&q->rq_qos_mutex); + + rqos->disk = disk; + rqos->id = id; + rqos->ops = ops; + + /* + * No IO can be in-flight when adding rqos, so freeze queue, which + * is fine since we only support rq_qos for blk-mq queue. + */ + blk_mq_freeze_queue(q); + + if (rq_qos_id(q, rqos->id)) + goto ebusy; + rqos->next = q->rq_qos; + q->rq_qos = rqos; + + blk_mq_unfreeze_queue(q); + + if (rqos->ops->debugfs_attrs) { + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_register_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); + } + + return 0; +ebusy: + blk_mq_unfreeze_queue(q); + return -EBUSY; +} + +void rq_qos_del(struct rq_qos *rqos) +{ + struct request_queue *q = rqos->disk->queue; + struct rq_qos **cur; + + lockdep_assert_held(&q->rq_qos_mutex); + + blk_mq_freeze_queue(q); + for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { + if (*cur == rqos) { + *cur = rqos->next; + break; + } + } + blk_mq_unfreeze_queue(q); + + mutex_lock(&q->debugfs_mutex); + blk_mq_debugfs_unregister_rqos(rqos); + mutex_unlock(&q->debugfs_mutex); } diff --git a/block/blk-rq-qos.h b/block/blk-rq-qos.h index 2bc43e94f4c4..37245c97ee61 100644 --- a/block/blk-rq-qos.h +++ b/block/blk-rq-qos.h @@ -7,6 +7,7 @@ #include <linux/blk_types.h> #include <linux/atomic.h> #include <linux/wait.h> +#include <linux/blk-mq.h> #include "blk-mq-debugfs.h" @@ -24,8 +25,8 @@ struct rq_wait { }; struct rq_qos { - struct rq_qos_ops *ops; - struct request_queue *q; + const struct rq_qos_ops *ops; + struct gendisk *disk; 
enum rq_qos_id id; struct rq_qos *next; #ifdef CONFIG_BLK_DEBUG_FS @@ -73,52 +74,20 @@ static inline struct rq_qos *wbt_rq_qos(struct request_queue *q) return rq_qos_id(q, RQ_QOS_WBT); } -static inline struct rq_qos *blkcg_rq_qos(struct request_queue *q) +static inline struct rq_qos *iolat_rq_qos(struct request_queue *q) { return rq_qos_id(q, RQ_QOS_LATENCY); } -static inline const char *rq_qos_id_to_name(enum rq_qos_id id) -{ - switch (id) { - case RQ_QOS_WBT: - return "wbt"; - case RQ_QOS_LATENCY: - return "latency"; - case RQ_QOS_COST: - return "cost"; - } - return "unknown"; -} - static inline void rq_wait_init(struct rq_wait *rq_wait) { atomic_set(&rq_wait->inflight, 0); init_waitqueue_head(&rq_wait->wait); } -static inline void rq_qos_add(struct request_queue *q, struct rq_qos *rqos) -{ - rqos->next = q->rq_qos; - q->rq_qos = rqos; - - if (rqos->ops->debugfs_attrs) - blk_mq_debugfs_register_rqos(rqos); -} - -static inline void rq_qos_del(struct request_queue *q, struct rq_qos *rqos) -{ - struct rq_qos **cur; - - for (cur = &q->rq_qos; *cur; cur = &(*cur)->next) { - if (*cur == rqos) { - *cur = rqos->next; - break; - } - } - - blk_mq_debugfs_unregister_rqos(rqos); -} +int rq_qos_add(struct rq_qos *rqos, struct gendisk *disk, enum rq_qos_id id, + const struct rq_qos_ops *ops); +void rq_qos_del(struct rq_qos *rqos); typedef bool (acquire_inflight_cb_t)(struct rq_wait *rqw, void *private_data); typedef void (cleanup_cb_t)(struct rq_wait *rqw, void *private_data); @@ -149,7 +118,7 @@ static inline void rq_qos_cleanup(struct request_queue *q, struct bio *bio) static inline void rq_qos_done(struct request_queue *q, struct request *rq) { - if (q->rq_qos) + if (q->rq_qos && !blk_rq_is_passthrough(rq)) __rq_qos_done(q->rq_qos, rq); } @@ -165,21 +134,22 @@ static inline void rq_qos_requeue(struct request_queue *q, struct request *rq) __rq_qos_requeue(q->rq_qos, rq); } -static inline void rq_qos_done_bio(struct request_queue *q, struct bio *bio) +static inline void 
rq_qos_done_bio(struct bio *bio) { - if (q->rq_qos) - __rq_qos_done_bio(q->rq_qos, bio); + if (bio->bi_bdev && (bio_flagged(bio, BIO_QOS_THROTTLED) || + bio_flagged(bio, BIO_QOS_MERGED))) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + if (q->rq_qos) + __rq_qos_done_bio(q->rq_qos, bio); + } } static inline void rq_qos_throttle(struct request_queue *q, struct bio *bio) { - /* - * BIO_TRACKED lets controllers know that a bio went through the - * normal rq_qos path. - */ - bio_set_flag(bio, BIO_TRACKED); - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_THROTTLED); __rq_qos_throttle(q->rq_qos, bio); + } } static inline void rq_qos_track(struct request_queue *q, struct request *rq, @@ -192,8 +162,10 @@ static inline void rq_qos_track(struct request_queue *q, struct request *rq, static inline void rq_qos_merge(struct request_queue *q, struct request *rq, struct bio *bio) { - if (q->rq_qos) + if (q->rq_qos) { + bio_set_flag(bio, BIO_QOS_MERGED); __rq_qos_merge(q->rq_qos, rq, bio); + } } static inline void rq_qos_queue_depth_changed(struct request_queue *q) diff --git a/block/blk-settings.c b/block/blk-settings.c index 9a2c23cd9700..06ea91e51b8b 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -7,7 +7,8 @@ #include <linux/init.h> #include <linux/bio.h> #include <linux/blkdev.h> -#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ +#include <linux/pagemap.h> +#include <linux/backing-dev-defs.h> #include <linux/gcd.h> #include <linux/lcm.h> #include <linux/jiffies.h> @@ -15,13 +16,9 @@ #include <linux/dma-mapping.h> #include "blk.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" -unsigned long blk_max_low_pfn; -EXPORT_SYMBOL(blk_max_low_pfn); - -unsigned long blk_max_pfn; - void blk_queue_rq_timeout(struct request_queue *q, unsigned int timeout) { q->rq_timeout = timeout; @@ -44,24 +41,25 @@ void blk_set_default_limits(struct queue_limits *lim) lim->virt_boundary_mask = 0; lim->max_segment_size = BLK_MAX_SEGMENT_SIZE; 
lim->max_sectors = lim->max_hw_sectors = BLK_SAFE_MAX_SECTORS; - lim->max_dev_sectors = 0; + lim->max_user_sectors = lim->max_dev_sectors = 0; lim->chunk_sectors = 0; - lim->max_write_same_sectors = 0; lim->max_write_zeroes_sectors = 0; lim->max_zone_append_sectors = 0; lim->max_discard_sectors = 0; lim->max_hw_discard_sectors = 0; - lim->discard_granularity = 0; + lim->max_secure_erase_sectors = 0; + lim->discard_granularity = 512; lim->discard_alignment = 0; lim->discard_misaligned = 0; lim->logical_block_size = lim->physical_block_size = lim->io_min = 512; - lim->bounce_pfn = (unsigned long)(BLK_BOUNCE_ANY >> PAGE_SHIFT); + lim->bounce = BLK_BOUNCE_NONE; lim->alignment_offset = 0; lim->io_opt = 0; lim->misaligned = 0; - lim->zoned = BLK_ZONED_NONE; + lim->zoned = false; + lim->zone_write_granularity = 0; + lim->dma_alignment = 511; } -EXPORT_SYMBOL(blk_set_default_limits); /** * blk_set_stacking_limits - set default limits for stacking devices @@ -82,7 +80,6 @@ void blk_set_stacking_limits(struct queue_limits *lim) lim->max_segment_size = UINT_MAX; lim->max_sectors = UINT_MAX; lim->max_dev_sectors = UINT_MAX; - lim->max_write_same_sectors = UINT_MAX; lim->max_write_zeroes_sectors = UINT_MAX; lim->max_zone_append_sectors = UINT_MAX; } @@ -91,39 +88,16 @@ EXPORT_SYMBOL(blk_set_stacking_limits); /** * blk_queue_bounce_limit - set bounce buffer limit for queue * @q: the request queue for the device - * @max_addr: the maximum address the device can handle + * @bounce: bounce limit to enforce * * Description: - * Different hardware can have different requirements as to what pages - * it can do I/O directly to. A low level driver can call - * blk_queue_bounce_limit to have lower memory pages allocated as bounce - * buffers for doing I/O to pages residing above @max_addr. + * Force bouncing for ISA DMA ranges or highmem. + * + * DEPRECATED, don't use in new code. 
**/ -void blk_queue_bounce_limit(struct request_queue *q, u64 max_addr) +void blk_queue_bounce_limit(struct request_queue *q, enum blk_bounce bounce) { - unsigned long b_pfn = max_addr >> PAGE_SHIFT; - int dma = 0; - - q->bounce_gfp = GFP_NOIO; -#if BITS_PER_LONG == 64 - /* - * Assume anything <= 4GB can be handled by IOMMU. Actually - * some IOMMUs can handle everything, but I don't know of a - * way to test this here. - */ - if (b_pfn < (min_t(u64, 0xffffffffUL, BLK_BOUNCE_HIGH) >> PAGE_SHIFT)) - dma = 1; - q->limits.bounce_pfn = max(max_low_pfn, b_pfn); -#else - if (b_pfn < blk_max_low_pfn) - dma = 1; - q->limits.bounce_pfn = b_pfn; -#endif - if (dma) { - init_emergency_isa_pool(); - q->bounce_gfp = GFP_NOIO | GFP_DMA; - q->limits.bounce_pfn = b_pfn; - } + q->limits.bounce = bounce; } EXPORT_SYMBOL(blk_queue_bounce_limit); @@ -153,15 +127,27 @@ void blk_queue_max_hw_sectors(struct request_queue *q, unsigned int max_hw_secto if ((max_hw_sectors << 9) < PAGE_SIZE) { max_hw_sectors = 1 << (PAGE_SHIFT - 9); - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_hw_sectors); + pr_info("%s: set to minimum %u\n", __func__, max_hw_sectors); } + max_hw_sectors = round_down(max_hw_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_hw_sectors = max_hw_sectors; + max_sectors = min_not_zero(max_hw_sectors, limits->max_dev_sectors); - max_sectors = min_t(unsigned int, max_sectors, BLK_DEF_MAX_SECTORS); + + if (limits->max_user_sectors) + max_sectors = min(max_sectors, limits->max_user_sectors); + else + max_sectors = min(max_sectors, BLK_DEF_MAX_SECTORS_CAP); + + max_sectors = round_down(max_sectors, + limits->logical_block_size >> SECTOR_SHIFT); limits->max_sectors = max_sectors; - q->backing_dev_info->io_pages = max_sectors >> (PAGE_SHIFT - 9); + + if (!q->disk) + return; + q->disk->bdi->io_pages = max_sectors >> (PAGE_SHIFT - 9); } EXPORT_SYMBOL(blk_queue_max_hw_sectors); @@ -172,15 +158,13 @@ EXPORT_SYMBOL(blk_queue_max_hw_sectors); * * 
Description: * If a driver doesn't want IOs to cross a given chunk size, it can set - * this limit and prevent merging across chunks. Note that the chunk size - * must currently be a power-of-2 in sectors. Also note that the block - * layer must accept a page worth of data at any offset. So if the - * crossing of chunks is a hard limitation in the driver, it must still be - * prepared to split single page bios. + * this limit and prevent merging across chunks. Note that the block layer + * must accept a page worth of data at any offset. So if the crossing of + * chunks is a hard limitation in the driver, it must still be prepared + * to split single page bios. **/ void blk_queue_chunk_sectors(struct request_queue *q, unsigned int chunk_sectors) { - BUG_ON(!is_power_of_2(chunk_sectors)); q->limits.chunk_sectors = chunk_sectors; } EXPORT_SYMBOL(blk_queue_chunk_sectors); @@ -199,16 +183,16 @@ void blk_queue_max_discard_sectors(struct request_queue *q, EXPORT_SYMBOL(blk_queue_max_discard_sectors); /** - * blk_queue_max_write_same_sectors - set max sectors for a single write same + * blk_queue_max_secure_erase_sectors - set max sectors for a secure erase * @q: the request queue for the device - * @max_write_same_sectors: maximum number of sectors to write per command + * @max_sectors: maximum number of sectors to secure_erase **/ -void blk_queue_max_write_same_sectors(struct request_queue *q, - unsigned int max_write_same_sectors) +void blk_queue_max_secure_erase_sectors(struct request_queue *q, + unsigned int max_sectors) { - q->limits.max_write_same_sectors = max_write_same_sectors; + q->limits.max_secure_erase_sectors = max_sectors; } -EXPORT_SYMBOL(blk_queue_max_write_same_sectors); +EXPORT_SYMBOL(blk_queue_max_secure_erase_sectors); /** * blk_queue_max_write_zeroes_sectors - set max sectors for a single @@ -263,8 +247,7 @@ void blk_queue_max_segments(struct request_queue *q, unsigned short max_segments { if (!max_segments) { max_segments = 1; - printk(KERN_INFO 
"%s: set to minimum %d\n", - __func__, max_segments); + pr_info("%s: set to minimum %u\n", __func__, max_segments); } q->limits.max_segments = max_segments; @@ -300,8 +283,7 @@ void blk_queue_max_segment_size(struct request_queue *q, unsigned int max_size) { if (max_size < PAGE_SIZE) { max_size = PAGE_SIZE; - printk(KERN_INFO "%s: set to minimum %d\n", - __func__, max_size); + pr_info("%s: set to minimum %u\n", __func__, max_size); } /* see blk_queue_virt_boundary() for the explanation */ @@ -323,13 +305,23 @@ EXPORT_SYMBOL(blk_queue_max_segment_size); **/ void blk_queue_logical_block_size(struct request_queue *q, unsigned int size) { - q->limits.logical_block_size = size; + struct queue_limits *limits = &q->limits; - if (q->limits.physical_block_size < size) - q->limits.physical_block_size = size; + limits->logical_block_size = size; - if (q->limits.io_min < q->limits.physical_block_size) - q->limits.io_min = q->limits.physical_block_size; + if (limits->discard_granularity < limits->logical_block_size) + limits->discard_granularity = limits->logical_block_size; + + if (limits->physical_block_size < size) + limits->physical_block_size = size; + + if (limits->io_min < limits->physical_block_size) + limits->io_min = limits->physical_block_size; + + limits->max_hw_sectors = + round_down(limits->max_hw_sectors, size >> SECTOR_SHIFT); + limits->max_sectors = + round_down(limits->max_sectors, size >> SECTOR_SHIFT); } EXPORT_SYMBOL(blk_queue_logical_block_size); @@ -350,12 +342,37 @@ void blk_queue_physical_block_size(struct request_queue *q, unsigned int size) if (q->limits.physical_block_size < q->limits.logical_block_size) q->limits.physical_block_size = q->limits.logical_block_size; + if (q->limits.discard_granularity < q->limits.physical_block_size) + q->limits.discard_granularity = q->limits.physical_block_size; + if (q->limits.io_min < q->limits.physical_block_size) q->limits.io_min = q->limits.physical_block_size; } EXPORT_SYMBOL(blk_queue_physical_block_size); 
/** + * blk_queue_zone_write_granularity - set zone write granularity for the queue + * @q: the request queue for the zoned device + * @size: the zone write granularity size, in bytes + * + * Description: + * This should be set to the lowest possible size allowing to write in + * sequential zones of a zoned block device. + */ +void blk_queue_zone_write_granularity(struct request_queue *q, + unsigned int size) +{ + if (WARN_ON_ONCE(!blk_queue_is_zoned(q))) + return; + + q->limits.zone_write_granularity = size; + + if (q->limits.zone_write_granularity < q->limits.logical_block_size) + q->limits.zone_write_granularity = q->limits.logical_block_size; +} +EXPORT_SYMBOL_GPL(blk_queue_zone_write_granularity); + +/** * blk_queue_alignment_offset - set physical block alignment offset * @q: the request queue for the device * @offset: alignment offset in bytes @@ -374,6 +391,20 @@ void blk_queue_alignment_offset(struct request_queue *q, unsigned int offset) } EXPORT_SYMBOL(blk_queue_alignment_offset); +void disk_update_readahead(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + + /* + * For read-ahead of large files to be effective, we need to read ahead + * at least twice the optimal I/O size. 
+ */ + disk->bdi->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); + disk->bdi->io_pages = queue_max_sectors(q) >> (PAGE_SHIFT - 9); +} +EXPORT_SYMBOL_GPL(disk_update_readahead); + /** * blk_limits_io_min - set minimum request size for a device * @limits: the queue limits @@ -452,19 +483,54 @@ EXPORT_SYMBOL(blk_limits_io_opt); void blk_queue_io_opt(struct request_queue *q, unsigned int opt) { blk_limits_io_opt(&q->limits, opt); + if (!q->disk) + return; + q->disk->bdi->ra_pages = + max(queue_io_opt(q) * 2 / PAGE_SIZE, VM_READAHEAD_PAGES); } EXPORT_SYMBOL(blk_queue_io_opt); -/** - * blk_queue_stack_limits - inherit underlying queue limits for stacked drivers - * @t: the stacking driver (top) - * @b: the underlying device (bottom) - **/ -void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b) +static int queue_limit_alignment_offset(const struct queue_limits *lim, + sector_t sector) +{ + unsigned int granularity = max(lim->physical_block_size, lim->io_min); + unsigned int alignment = sector_div(sector, granularity >> SECTOR_SHIFT) + << SECTOR_SHIFT; + + return (granularity + lim->alignment_offset - alignment) % granularity; +} + +static unsigned int queue_limit_discard_alignment( + const struct queue_limits *lim, sector_t sector) +{ + unsigned int alignment, granularity, offset; + + if (!lim->max_discard_sectors) + return 0; + + /* Why are these in bytes, not sectors? */ + alignment = lim->discard_alignment >> SECTOR_SHIFT; + granularity = lim->discard_granularity >> SECTOR_SHIFT; + if (!granularity) + return 0; + + /* Offset of the partition start in 'granularity' sectors */ + offset = sector_div(sector, granularity); + + /* And why do we do this modulus *again* in blkdev_issue_discard()? 
*/ + offset = (granularity + alignment - offset) % granularity; + + /* Turn it back into bytes, gaah */ + return offset << SECTOR_SHIFT; +} + +static unsigned int blk_round_down_sectors(unsigned int sectors, unsigned int lbs) { - blk_stack_limits(&t->limits, &b->limits, 0); + sectors = round_down(sectors, lbs >> SECTOR_SHIFT); + if (sectors < PAGE_SIZE >> SECTOR_SHIFT) + sectors = PAGE_SIZE >> SECTOR_SHIFT; + return sectors; } -EXPORT_SYMBOL(blk_queue_stack_limits); /** * blk_stack_limits - adjust queue_limits for stacked devices @@ -495,13 +561,11 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); t->max_dev_sectors = min_not_zero(t->max_dev_sectors, b->max_dev_sectors); - t->max_write_same_sectors = min(t->max_write_same_sectors, - b->max_write_same_sectors); t->max_write_zeroes_sectors = min(t->max_write_zeroes_sectors, b->max_write_zeroes_sectors); t->max_zone_append_sectors = min(t->max_zone_append_sectors, b->max_zone_append_sectors); - t->bounce_pfn = min_not_zero(t->bounce_pfn, b->bounce_pfn); + t->bounce = max(t->bounce, b->bounce); t->seg_boundary_mask = min_not_zero(t->seg_boundary_mask, b->seg_boundary_mask); @@ -545,6 +609,11 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->io_min = max(t->io_min, b->io_min); t->io_opt = lcm_not_zero(t->io_opt, b->io_opt); + t->dma_alignment = max(t->dma_alignment, b->dma_alignment); + + /* Set non-power-of-2 compatible chunk_sectors boundary */ + if (b->chunk_sectors) + t->chunk_sectors = gcd(t->chunk_sectors, b->chunk_sectors); /* Physical block size a multiple of the logical block size? */ if (t->physical_block_size & (t->logical_block_size - 1)) { @@ -567,6 +636,13 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + /* chunk_sectors a multiple of the physical block size? 
*/ + if ((t->chunk_sectors << 9) & (t->physical_block_size - 1)) { + t->chunk_sectors = 0; + t->misaligned = 1; + ret = -1; + } + t->raid_partial_stripes_expensive = max(t->raid_partial_stripes_expensive, b->raid_partial_stripes_expensive); @@ -581,6 +657,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, ret = -1; } + t->max_sectors = blk_round_down_sectors(t->max_sectors, t->logical_block_size); + t->max_hw_sectors = blk_round_down_sectors(t->max_hw_sectors, t->logical_block_size); + t->max_dev_sectors = blk_round_down_sectors(t->max_dev_sectors, t->logical_block_size); + /* Discard alignment and granularity */ if (b->discard_granularity) { alignment = queue_limit_discard_alignment(b, start); @@ -604,38 +684,16 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->discard_alignment = lcm_not_zero(t->discard_alignment, alignment) % t->discard_granularity; } - - if (b->chunk_sectors) - t->chunk_sectors = min_not_zero(t->chunk_sectors, - b->chunk_sectors); - + t->max_secure_erase_sectors = min_not_zero(t->max_secure_erase_sectors, + b->max_secure_erase_sectors); + t->zone_write_granularity = max(t->zone_write_granularity, + b->zone_write_granularity); + t->zoned = max(t->zoned, b->zoned); return ret; } EXPORT_SYMBOL(blk_stack_limits); /** - * bdev_stack_limits - adjust queue limits for stacked drivers - * @t: the stacking driver limits (top device) - * @bdev: the component block_device (bottom) - * @start: first data sector within component device - * - * Description: - * Merges queue limits for a top device and a block_device. Returns - * 0 if alignment didn't change. Returns -1 if adding the bottom - * device caused misalignment. 
- */ -int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, - sector_t start) -{ - struct request_queue *bq = bdev_get_queue(bdev); - - start += get_start_sect(bdev); - - return blk_stack_limits(t, &bq->limits, start); -} -EXPORT_SYMBOL(bdev_stack_limits); - -/** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) * @bdev: the underlying block device (bottom) @@ -650,18 +708,12 @@ void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, { struct request_queue *t = disk->queue; - if (bdev_stack_limits(&t->limits, bdev, offset >> 9) < 0) { - char top[BDEVNAME_SIZE], bottom[BDEVNAME_SIZE]; - - disk_name(disk, 0, top); - bdevname(bdev, bottom); - - printk(KERN_NOTICE "%s: Warning: Device %s is misaligned\n", - top, bottom); - } + if (blk_stack_limits(&t->limits, &bdev_get_queue(bdev)->limits, + get_start_sect(bdev) + (offset >> 9)) < 0) + pr_notice("%s: Warning: Device %pg is misaligned\n", + disk->disk_name, bdev); - t->backing_dev_info->io_pages = - t->limits.max_sectors >> (PAGE_SHIFT - 9); + disk_update_readahead(disk); } EXPORT_SYMBOL(disk_stack_limits); @@ -691,8 +743,7 @@ void blk_queue_segment_boundary(struct request_queue *q, unsigned long mask) { if (mask < PAGE_SIZE - 1) { mask = PAGE_SIZE - 1; - printk(KERN_INFO "%s: set to minimum %lx\n", - __func__, mask); + pr_info("%s: set to minimum %lx\n", __func__, mask); } q->limits.seg_boundary_mask = mask; @@ -731,7 +782,7 @@ EXPORT_SYMBOL(blk_queue_virt_boundary); **/ void blk_queue_dma_alignment(struct request_queue *q, int mask) { - q->dma_alignment = mask; + q->limits.dma_alignment = mask; } EXPORT_SYMBOL(blk_queue_dma_alignment); @@ -753,8 +804,8 @@ void blk_queue_update_dma_alignment(struct request_queue *q, int mask) { BUG_ON(mask > PAGE_SIZE); - if (mask > q->dma_alignment) - q->dma_alignment = mask; + if (mask > q->limits.dma_alignment) + q->limits.dma_alignment = mask; } EXPORT_SYMBOL(blk_queue_update_dma_alignment); @@ -781,16 
+832,17 @@ EXPORT_SYMBOL(blk_set_queue_depth); */ void blk_queue_write_cache(struct request_queue *q, bool wc, bool fua) { - if (wc) + if (wc) { + blk_queue_flag_set(QUEUE_FLAG_HW_WC, q); blk_queue_flag_set(QUEUE_FLAG_WC, q); - else + } else { + blk_queue_flag_clear(QUEUE_FLAG_HW_WC, q); blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } if (fua) blk_queue_flag_set(QUEUE_FLAG_FUA, q); else blk_queue_flag_clear(QUEUE_FLAG_FUA, q); - - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); } EXPORT_SYMBOL_GPL(blk_queue_write_cache); @@ -832,10 +884,45 @@ bool blk_queue_can_use_dma_map_merging(struct request_queue *q, } EXPORT_SYMBOL_GPL(blk_queue_can_use_dma_map_merging); -static int __init blk_settings_init(void) +/** + * disk_set_zoned - inidicate a zoned device + * @disk: gendisk to configure + */ +void disk_set_zoned(struct gendisk *disk) { - blk_max_low_pfn = max_low_pfn - 1; - blk_max_pfn = max_pfn - 1; - return 0; + struct request_queue *q = disk->queue; + + WARN_ON_ONCE(!IS_ENABLED(CONFIG_BLK_DEV_ZONED)); + + /* + * Set the zone write granularity to the device logical block + * size by default. The driver can change this value if needed. 
+ */ + q->limits.zoned = true; + blk_queue_zone_write_granularity(q, queue_logical_block_size(q)); +} +EXPORT_SYMBOL_GPL(disk_set_zoned); + +int bdev_alignment_offset(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (q->limits.misaligned) + return -1; + if (bdev_is_partition(bdev)) + return queue_limit_alignment_offset(&q->limits, + bdev->bd_start_sect); + return q->limits.alignment_offset; +} +EXPORT_SYMBOL_GPL(bdev_alignment_offset); + +unsigned int bdev_discard_alignment(struct block_device *bdev) +{ + struct request_queue *q = bdev_get_queue(bdev); + + if (bdev_is_partition(bdev)) + return queue_limit_discard_alignment(&q->limits, + bdev->bd_start_sect); + return q->limits.discard_alignment; } -subsys_initcall(blk_settings_init); +EXPORT_SYMBOL_GPL(bdev_discard_alignment); diff --git a/block/blk-softirq.c b/block/blk-softirq.c deleted file mode 100644 index 6e7ec87d49fa..000000000000 --- a/block/blk-softirq.c +++ /dev/null @@ -1,156 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Functions related to softirq rq completions - */ -#include <linux/kernel.h> -#include <linux/module.h> -#include <linux/init.h> -#include <linux/bio.h> -#include <linux/blkdev.h> -#include <linux/interrupt.h> -#include <linux/cpu.h> -#include <linux/sched.h> -#include <linux/sched/topology.h> - -#include "blk.h" - -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); - -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. 
- */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) -{ - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; - - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); - rq->q->mq_ops->complete(rq); - } -} - -#ifdef CONFIG_SMP -static void trigger_softirq(void *data) -{ - struct request *rq = data; - struct list_head *list; - - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); -} - -/* - * Setup and invoke a run of 'trigger_softirq' on the given cpu. - */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - if (cpu_online(cpu)) { - call_single_data_t *data = &rq->csd; - - data->func = trigger_softirq; - data->info = rq; - data->flags = 0; - - smp_call_function_single_async(cpu, data); - return 0; - } - - return 1; -} -#else /* CONFIG_SMP */ -static int raise_blk_irq(int cpu, struct request *rq) -{ - return 1; -} -#endif - -static int blk_softirq_cpu_dead(unsigned int cpu) -{ - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - - return 0; -} - -void __blk_complete_request(struct request *req) -{ - struct request_queue *q = req->q; - int cpu, ccpu = req->mq_ctx->cpu; - unsigned long flags; - bool shared = false; - - BUG_ON(!q->mq_ops->complete); - - local_irq_save(flags); - cpu = smp_processor_id(); - - /* - * Select completion CPU - */ - if (test_bit(QUEUE_FLAG_SAME_COMP, &q->queue_flags) && ccpu != -1) { - if (!test_bit(QUEUE_FLAG_SAME_FORCE, &q->queue_flags)) - shared = cpus_share_cache(cpu, ccpu); - 
} else - ccpu = cpu; - - /* - * If current CPU and requested CPU share a cache, run the softirq on - * the current CPU. One might concern this is just like - * QUEUE_FLAG_SAME_FORCE, but actually not. blk_complete_request() is - * running in interrupt handler, and currently I/O controller doesn't - * support multiple interrupts, so current CPU is unique actually. This - * avoids IPI sending from current CPU to the first CPU of a group. - */ - if (ccpu == cpu || shared) { - struct list_head *list; -do_local: - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&req->ipi_list, list); - - /* - * if the list only contains our just added request, - * signal a raise of the softirq. If there are already - * entries there, someone already raised the irq but it - * hasn't run yet. - */ - if (list->next == &req->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - } else if (raise_blk_irq(ccpu, req)) - goto do_local; - - local_irq_restore(flags); -} - -static __init int blk_softirq_init(void) -{ - int i; - - for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); - - open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); - cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, - "block/softirq:dead", NULL, - blk_softirq_cpu_dead); - return 0; -} -subsys_initcall(blk_softirq_init); diff --git a/block/blk-stat.c b/block/blk-stat.c index 7da302ff88d0..7ff76ae6c76a 100644 --- a/block/blk-stat.c +++ b/block/blk-stat.c @@ -6,7 +6,6 @@ */ #include <linux/kernel.h> #include <linux/rculist.h> -#include <linux/blk-mq.h> #include "blk-stat.h" #include "blk-mq.h" @@ -15,7 +14,7 @@ struct blk_queue_stats { struct list_head callbacks; spinlock_t lock; - bool enable_accounting; + int accounting; }; void blk_rq_stat_init(struct blk_rq_stat *stat) @@ -58,7 +57,8 @@ void blk_stat_add(struct request *rq, u64 now) value = (now >= rq->io_start_time_ns) ? 
now - rq->io_start_time_ns : 0; - blk_throtl_stat_add(rq, value); + if (req_op(rq) == REQ_OP_READ || req_op(rq) == REQ_OP_WRITE) + blk_throtl_stat_add(rq, value); rcu_read_lock(); cpu = get_cpu(); @@ -137,6 +137,7 @@ void blk_stat_add_callback(struct request_queue *q, struct blk_stat_callback *cb) { unsigned int bucket; + unsigned long flags; int cpu; for_each_possible_cpu(cpu) { @@ -147,20 +148,22 @@ void blk_stat_add_callback(struct request_queue *q, blk_rq_stat_init(&cpu_stat[bucket]); } - spin_lock(&q->stats->lock); + spin_lock_irqsave(&q->stats->lock, flags); list_add_tail_rcu(&cb->list, &q->stats->callbacks); blk_queue_flag_set(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + spin_unlock_irqrestore(&q->stats->lock, flags); } void blk_stat_remove_callback(struct request_queue *q, struct blk_stat_callback *cb) { - spin_lock(&q->stats->lock); + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); list_del_rcu(&cb->list); - if (list_empty(&q->stats->callbacks) && !q->stats->enable_accounting) + if (list_empty(&q->stats->callbacks) && !q->stats->accounting) blk_queue_flag_clear(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + spin_unlock_irqrestore(&q->stats->lock, flags); del_timer_sync(&cb->timer); } @@ -181,12 +184,25 @@ void blk_stat_free_callback(struct blk_stat_callback *cb) call_rcu(&cb->rcu, blk_stat_free_callback_rcu); } +void blk_stat_disable_accounting(struct request_queue *q) +{ + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, flags); + if (!--q->stats->accounting && list_empty(&q->stats->callbacks)) + blk_queue_flag_clear(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); +} +EXPORT_SYMBOL_GPL(blk_stat_disable_accounting); + void blk_stat_enable_accounting(struct request_queue *q) { - spin_lock(&q->stats->lock); - q->stats->enable_accounting = true; - blk_queue_flag_set(QUEUE_FLAG_STATS, q); - spin_unlock(&q->stats->lock); + unsigned long flags; + + spin_lock_irqsave(&q->stats->lock, 
flags); + if (!q->stats->accounting++ && list_empty(&q->stats->callbacks)) + blk_queue_flag_set(QUEUE_FLAG_STATS, q); + spin_unlock_irqrestore(&q->stats->lock, flags); } EXPORT_SYMBOL_GPL(blk_stat_enable_accounting); @@ -200,7 +216,7 @@ struct blk_queue_stats *blk_alloc_queue_stats(void) INIT_LIST_HEAD(&stats->callbacks); spin_lock_init(&stats->lock); - stats->enable_accounting = false; + stats->accounting = 0; return stats; } diff --git a/block/blk-stat.h b/block/blk-stat.h index 17b47a86eefb..17e1eb4ec7e2 100644 --- a/block/blk-stat.h +++ b/block/blk-stat.h @@ -64,11 +64,13 @@ struct blk_stat_callback { struct blk_queue_stats *blk_alloc_queue_stats(void); void blk_free_queue_stats(struct blk_queue_stats *); +bool blk_stats_alloc_enable(struct request_queue *q); void blk_stat_add(struct request *rq, u64 now); /* record time/size info in request but not add a callback */ void blk_stat_enable_accounting(struct request_queue *q); +void blk_stat_disable_accounting(struct request_queue *q); /** * blk_stat_alloc_callback() - Allocate a block statistics callback. 
diff --git a/block/blk-sysfs.c b/block/blk-sysfs.c index 02643e149d5e..6b2429cad81a 100644 --- a/block/blk-sysfs.c +++ b/block/blk-sysfs.c @@ -9,13 +9,16 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/blktrace_api.h> -#include <linux/blk-mq.h> -#include <linux/blk-cgroup.h> +#include <linux/debugfs.h> #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" +#include "blk-mq-sched.h" +#include "blk-rq-qos.h" #include "blk-wbt.h" +#include "blk-cgroup.h" +#include "blk-throttle.h" struct queue_sysfs_entry { struct attribute attr; @@ -44,22 +47,9 @@ queue_var_store(unsigned long *var, const char *page, size_t count) return count; } -static ssize_t queue_var_store64(s64 *var, const char *page) -{ - int err; - s64 v; - - err = kstrtos64(page, 10, &v); - if (err < 0) - return err; - - *var = v; - return 0; -} - static ssize_t queue_requests_show(struct request_queue *q, char *page) { - return queue_var_show(q->nr_requests, (page)); + return queue_var_show(q->nr_requests, page); } static ssize_t @@ -87,23 +77,26 @@ queue_requests_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_ra_show(struct request_queue *q, char *page) { - unsigned long ra_kb = q->backing_dev_info->ra_pages << - (PAGE_SHIFT - 10); + unsigned long ra_kb; - return queue_var_show(ra_kb, (page)); + if (!q->disk) + return -EINVAL; + ra_kb = q->disk->bdi->ra_pages << (PAGE_SHIFT - 10); + return queue_var_show(ra_kb, page); } static ssize_t queue_ra_store(struct request_queue *q, const char *page, size_t count) { unsigned long ra_kb; - ssize_t ret = queue_var_store(&ra_kb, page, count); + ssize_t ret; + if (!q->disk) + return -EINVAL; + ret = queue_var_store(&ra_kb, page, count); if (ret < 0) return ret; - - q->backing_dev_info->ra_pages = ra_kb >> (PAGE_SHIFT - 10); - + q->disk->bdi->ra_pages = ra_kb >> (PAGE_SHIFT - 10); return ret; } @@ -111,28 +104,28 @@ static ssize_t queue_max_sectors_show(struct request_queue *q, char *page) { int 
max_sectors_kb = queue_max_sectors(q) >> 1; - return queue_var_show(max_sectors_kb, (page)); + return queue_var_show(max_sectors_kb, page); } static ssize_t queue_max_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segments(q), (page)); + return queue_var_show(queue_max_segments(q), page); } static ssize_t queue_max_discard_segments_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_discard_segments(q), (page)); + return queue_var_show(queue_max_discard_segments(q), page); } static ssize_t queue_max_integrity_segments_show(struct request_queue *q, char *page) { - return queue_var_show(q->limits.max_integrity_segments, (page)); + return queue_var_show(q->limits.max_integrity_segments, page); } static ssize_t queue_max_segment_size_show(struct request_queue *q, char *page) { - return queue_var_show(queue_max_segment_size(q), (page)); + return queue_var_show(queue_max_segment_size(q), page); } static ssize_t queue_logical_block_size_show(struct request_queue *q, char *page) @@ -208,8 +201,7 @@ static ssize_t queue_discard_zeroes_data_show(struct request_queue *q, char *pag static ssize_t queue_write_same_max_show(struct request_queue *q, char *page) { - return sprintf(page, "%llu\n", - (unsigned long long)q->limits.max_write_same_sectors << 9); + return queue_var_show(0, page); } static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) @@ -218,6 +210,12 @@ static ssize_t queue_write_zeroes_max_show(struct request_queue *q, char *page) (unsigned long long)q->limits.max_write_zeroes_sectors << 9); } +static ssize_t queue_zone_write_granularity_show(struct request_queue *q, + char *page) +{ + return queue_var_show(queue_zone_write_granularity(q), page); +} + static ssize_t queue_zone_append_max_show(struct request_queue *q, char *page) { unsigned long long max_sectors = q->limits.max_zone_append_sectors; @@ -228,23 +226,33 @@ static ssize_t queue_zone_append_max_show(struct 
request_queue *q, char *page) static ssize_t queue_max_sectors_store(struct request_queue *q, const char *page, size_t count) { - unsigned long max_sectors_kb, + unsigned long var; + unsigned int max_sectors_kb, max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1, page_kb = 1 << (PAGE_SHIFT - 10); - ssize_t ret = queue_var_store(&max_sectors_kb, page, count); + ssize_t ret = queue_var_store(&var, page, count); if (ret < 0) return ret; - max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, (unsigned long) + max_sectors_kb = (unsigned int)var; + max_hw_sectors_kb = min_not_zero(max_hw_sectors_kb, q->limits.max_dev_sectors >> 1); - - if (max_sectors_kb > max_hw_sectors_kb || max_sectors_kb < page_kb) - return -EINVAL; + if (max_sectors_kb == 0) { + q->limits.max_user_sectors = 0; + max_sectors_kb = min(max_hw_sectors_kb, + BLK_DEF_MAX_SECTORS_CAP >> 1); + } else { + if (max_sectors_kb > max_hw_sectors_kb || + max_sectors_kb < page_kb) + return -EINVAL; + q->limits.max_user_sectors = max_sectors_kb << 1; + } spin_lock_irq(&q->queue_lock); q->limits.max_sectors = max_sectors_kb << 1; - q->backing_dev_info->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); + if (q->disk) + q->disk->bdi->io_pages = max_sectors_kb >> (PAGE_SHIFT - 10); spin_unlock_irq(&q->queue_lock); return ret; @@ -254,19 +262,29 @@ static ssize_t queue_max_hw_sectors_show(struct request_queue *q, char *page) { int max_hw_sectors_kb = queue_max_hw_sectors(q) >> 1; - return queue_var_show(max_hw_sectors_kb, (page)); + return queue_var_show(max_hw_sectors_kb, page); +} + +static ssize_t queue_virt_boundary_mask_show(struct request_queue *q, char *page) +{ + return queue_var_show(q->limits.virt_boundary_mask, page); +} + +static ssize_t queue_dma_alignment_show(struct request_queue *q, char *page) +{ + return queue_var_show(queue_dma_alignment(q), page); } #define QUEUE_SYSFS_BIT_FNS(name, flag, neg) \ static ssize_t \ -queue_show_##name(struct request_queue *q, char *page) \ +queue_##name##_show(struct 
request_queue *q, char *page) \ { \ int bit; \ bit = test_bit(QUEUE_FLAG_##flag, &q->queue_flags); \ return queue_var_show(neg ? !bit : bit, page); \ } \ static ssize_t \ -queue_store_##name(struct request_queue *q, const char *page, size_t count) \ +queue_##name##_store(struct request_queue *q, const char *page, size_t count) \ { \ unsigned long val; \ ssize_t ret; \ @@ -286,23 +304,29 @@ queue_store_##name(struct request_queue *q, const char *page, size_t count) \ QUEUE_SYSFS_BIT_FNS(nonrot, NONROT, 1); QUEUE_SYSFS_BIT_FNS(random, ADD_RANDOM, 0); QUEUE_SYSFS_BIT_FNS(iostats, IO_STAT, 0); +QUEUE_SYSFS_BIT_FNS(stable_writes, STABLE_WRITES, 0); #undef QUEUE_SYSFS_BIT_FNS static ssize_t queue_zoned_show(struct request_queue *q, char *page) { - switch (blk_queue_zoned_model(q)) { - case BLK_ZONED_HA: - return sprintf(page, "host-aware\n"); - case BLK_ZONED_HM: + if (blk_queue_is_zoned(q)) return sprintf(page, "host-managed\n"); - default: - return sprintf(page, "none\n"); - } + return sprintf(page, "none\n"); } static ssize_t queue_nr_zones_show(struct request_queue *q, char *page) { - return queue_var_show(blk_queue_nr_zones(q), page); + return queue_var_show(disk_nr_zones(q->disk), page); +} + +static ssize_t queue_max_open_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(bdev_max_open_zones(q->disk->part0), page); +} + +static ssize_t queue_max_active_zones_show(struct request_queue *q, char *page) +{ + return queue_var_show(bdev_max_active_zones(q->disk->part0), page); } static ssize_t queue_nomerges_show(struct request_queue *q, char *page) @@ -365,35 +389,12 @@ queue_rq_affinity_store(struct request_queue *q, const char *page, size_t count) static ssize_t queue_poll_delay_show(struct request_queue *q, char *page) { - int val; - - if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) - val = BLK_MQ_POLL_CLASSIC; - else - val = q->poll_nsec / 1000; - - return sprintf(page, "%d\n", val); + return sprintf(page, "%d\n", -1); } static ssize_t 
queue_poll_delay_store(struct request_queue *q, const char *page, size_t count) { - int err, val; - - if (!q->mq_ops || !q->mq_ops->poll) - return -EINVAL; - - err = kstrtoint(page, 10, &val); - if (err < 0) - return err; - - if (val == BLK_MQ_POLL_CLASSIC) - q->poll_nsec = BLK_MQ_POLL_CLASSIC; - else if (val >= 0) - q->poll_nsec = val * 1000; - else - return -EINVAL; - return count; } @@ -405,23 +406,11 @@ static ssize_t queue_poll_show(struct request_queue *q, char *page) static ssize_t queue_poll_store(struct request_queue *q, const char *page, size_t count) { - unsigned long poll_on; - ssize_t ret; - - if (!q->tag_set || q->tag_set->nr_maps <= HCTX_TYPE_POLL || - !q->tag_set->map[HCTX_TYPE_POLL].nr_queues) + if (!test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return -EINVAL; - - ret = queue_var_store(&poll_on, page, count); - if (ret < 0) - return ret; - - if (poll_on) - blk_queue_flag_set(QUEUE_FLAG_POLL, q); - else - blk_queue_flag_clear(QUEUE_FLAG_POLL, q); - - return ret; + pr_info_ratelimited("writes to the poll attribute are ignored.\n"); + pr_info_ratelimited("please use driver specific parameters instead.\n"); + return count; } static ssize_t queue_io_timeout_show(struct request_queue *q, char *page) @@ -444,11 +433,133 @@ static ssize_t queue_io_timeout_store(struct request_queue *q, const char *page, return count; } +static ssize_t queue_wc_show(struct request_queue *q, char *page) +{ + if (test_bit(QUEUE_FLAG_WC, &q->queue_flags)) + return sprintf(page, "write back\n"); + + return sprintf(page, "write through\n"); +} + +static ssize_t queue_wc_store(struct request_queue *q, const char *page, + size_t count) +{ + if (!strncmp(page, "write back", 10)) { + if (!test_bit(QUEUE_FLAG_HW_WC, &q->queue_flags)) + return -EINVAL; + blk_queue_flag_set(QUEUE_FLAG_WC, q); + } else if (!strncmp(page, "write through", 13) || + !strncmp(page, "none", 4)) { + blk_queue_flag_clear(QUEUE_FLAG_WC, q); + } else { + return -EINVAL; + } + + return count; +} + +static ssize_t 
queue_fua_show(struct request_queue *q, char *page) +{ + return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); +} + +static ssize_t queue_dax_show(struct request_queue *q, char *page) +{ + return queue_var_show(blk_queue_dax(q), page); +} + +#define QUEUE_RO_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0444 }, \ + .show = _prefix##_show, \ +}; + +#define QUEUE_RW_ENTRY(_prefix, _name) \ +static struct queue_sysfs_entry _prefix##_entry = { \ + .attr = { .name = _name, .mode = 0644 }, \ + .show = _prefix##_show, \ + .store = _prefix##_store, \ +}; + +QUEUE_RW_ENTRY(queue_requests, "nr_requests"); +QUEUE_RW_ENTRY(queue_ra, "read_ahead_kb"); +QUEUE_RW_ENTRY(queue_max_sectors, "max_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_hw_sectors, "max_hw_sectors_kb"); +QUEUE_RO_ENTRY(queue_max_segments, "max_segments"); +QUEUE_RO_ENTRY(queue_max_integrity_segments, "max_integrity_segments"); +QUEUE_RO_ENTRY(queue_max_segment_size, "max_segment_size"); +QUEUE_RW_ENTRY(elv_iosched, "scheduler"); + +QUEUE_RO_ENTRY(queue_logical_block_size, "logical_block_size"); +QUEUE_RO_ENTRY(queue_physical_block_size, "physical_block_size"); +QUEUE_RO_ENTRY(queue_chunk_sectors, "chunk_sectors"); +QUEUE_RO_ENTRY(queue_io_min, "minimum_io_size"); +QUEUE_RO_ENTRY(queue_io_opt, "optimal_io_size"); + +QUEUE_RO_ENTRY(queue_max_discard_segments, "max_discard_segments"); +QUEUE_RO_ENTRY(queue_discard_granularity, "discard_granularity"); +QUEUE_RO_ENTRY(queue_discard_max_hw, "discard_max_hw_bytes"); +QUEUE_RW_ENTRY(queue_discard_max, "discard_max_bytes"); +QUEUE_RO_ENTRY(queue_discard_zeroes_data, "discard_zeroes_data"); + +QUEUE_RO_ENTRY(queue_write_same_max, "write_same_max_bytes"); +QUEUE_RO_ENTRY(queue_write_zeroes_max, "write_zeroes_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_append_max, "zone_append_max_bytes"); +QUEUE_RO_ENTRY(queue_zone_write_granularity, "zone_write_granularity"); + +QUEUE_RO_ENTRY(queue_zoned, 
"zoned"); +QUEUE_RO_ENTRY(queue_nr_zones, "nr_zones"); +QUEUE_RO_ENTRY(queue_max_open_zones, "max_open_zones"); +QUEUE_RO_ENTRY(queue_max_active_zones, "max_active_zones"); + +QUEUE_RW_ENTRY(queue_nomerges, "nomerges"); +QUEUE_RW_ENTRY(queue_rq_affinity, "rq_affinity"); +QUEUE_RW_ENTRY(queue_poll, "io_poll"); +QUEUE_RW_ENTRY(queue_poll_delay, "io_poll_delay"); +QUEUE_RW_ENTRY(queue_wc, "write_cache"); +QUEUE_RO_ENTRY(queue_fua, "fua"); +QUEUE_RO_ENTRY(queue_dax, "dax"); +QUEUE_RW_ENTRY(queue_io_timeout, "io_timeout"); +QUEUE_RO_ENTRY(queue_virt_boundary_mask, "virt_boundary_mask"); +QUEUE_RO_ENTRY(queue_dma_alignment, "dma_alignment"); + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW +QUEUE_RW_ENTRY(blk_throtl_sample_time, "throttle_sample_time"); +#endif + +/* legacy alias for logical_block_size: */ +static struct queue_sysfs_entry queue_hw_sector_size_entry = { + .attr = {.name = "hw_sector_size", .mode = 0444 }, + .show = queue_logical_block_size_show, +}; + +QUEUE_RW_ENTRY(queue_nonrot, "rotational"); +QUEUE_RW_ENTRY(queue_iostats, "iostats"); +QUEUE_RW_ENTRY(queue_random, "add_random"); +QUEUE_RW_ENTRY(queue_stable_writes, "stable_writes"); + +#ifdef CONFIG_BLK_WBT +static ssize_t queue_var_store64(s64 *var, const char *page) +{ + int err; + s64 v; + + err = kstrtos64(page, 10, &v); + if (err < 0) + return err; + + *var = v; + return 0; +} + static ssize_t queue_wb_lat_show(struct request_queue *q, char *page) { if (!wbt_rq_qos(q)) return -EINVAL; + if (wbt_disabled(q)) + return sprintf(page, "0\n"); + return sprintf(page, "%llu\n", div_u64(wbt_get_min_lat(q), 1000)); } @@ -467,7 +578,7 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, rqos = wbt_rq_qos(q); if (!rqos) { - ret = wbt_init(q); + ret = wbt_init(q->disk); if (ret) return ret; } @@ -496,251 +607,11 @@ static ssize_t queue_wb_lat_store(struct request_queue *q, const char *page, return count; } -static ssize_t queue_wc_show(struct request_queue *q, char *page) -{ - if 
(test_bit(QUEUE_FLAG_WC, &q->queue_flags)) - return sprintf(page, "write back\n"); - - return sprintf(page, "write through\n"); -} - -static ssize_t queue_wc_store(struct request_queue *q, const char *page, - size_t count) -{ - int set = -1; - - if (!strncmp(page, "write back", 10)) - set = 1; - else if (!strncmp(page, "write through", 13) || - !strncmp(page, "none", 4)) - set = 0; - - if (set == -1) - return -EINVAL; - - if (set) - blk_queue_flag_set(QUEUE_FLAG_WC, q); - else - blk_queue_flag_clear(QUEUE_FLAG_WC, q); - - return count; -} - -static ssize_t queue_fua_show(struct request_queue *q, char *page) -{ - return sprintf(page, "%u\n", test_bit(QUEUE_FLAG_FUA, &q->queue_flags)); -} - -static ssize_t queue_dax_show(struct request_queue *q, char *page) -{ - return queue_var_show(blk_queue_dax(q), page); -} - -static struct queue_sysfs_entry queue_requests_entry = { - .attr = {.name = "nr_requests", .mode = 0644 }, - .show = queue_requests_show, - .store = queue_requests_store, -}; - -static struct queue_sysfs_entry queue_ra_entry = { - .attr = {.name = "read_ahead_kb", .mode = 0644 }, - .show = queue_ra_show, - .store = queue_ra_store, -}; - -static struct queue_sysfs_entry queue_max_sectors_entry = { - .attr = {.name = "max_sectors_kb", .mode = 0644 }, - .show = queue_max_sectors_show, - .store = queue_max_sectors_store, -}; - -static struct queue_sysfs_entry queue_max_hw_sectors_entry = { - .attr = {.name = "max_hw_sectors_kb", .mode = 0444 }, - .show = queue_max_hw_sectors_show, -}; - -static struct queue_sysfs_entry queue_max_segments_entry = { - .attr = {.name = "max_segments", .mode = 0444 }, - .show = queue_max_segments_show, -}; - -static struct queue_sysfs_entry queue_max_discard_segments_entry = { - .attr = {.name = "max_discard_segments", .mode = 0444 }, - .show = queue_max_discard_segments_show, -}; - -static struct queue_sysfs_entry queue_max_integrity_segments_entry = { - .attr = {.name = "max_integrity_segments", .mode = 0444 }, - .show = 
queue_max_integrity_segments_show, -}; - -static struct queue_sysfs_entry queue_max_segment_size_entry = { - .attr = {.name = "max_segment_size", .mode = 0444 }, - .show = queue_max_segment_size_show, -}; - -static struct queue_sysfs_entry queue_iosched_entry = { - .attr = {.name = "scheduler", .mode = 0644 }, - .show = elv_iosched_show, - .store = elv_iosched_store, -}; - -static struct queue_sysfs_entry queue_hw_sector_size_entry = { - .attr = {.name = "hw_sector_size", .mode = 0444 }, - .show = queue_logical_block_size_show, -}; - -static struct queue_sysfs_entry queue_logical_block_size_entry = { - .attr = {.name = "logical_block_size", .mode = 0444 }, - .show = queue_logical_block_size_show, -}; - -static struct queue_sysfs_entry queue_physical_block_size_entry = { - .attr = {.name = "physical_block_size", .mode = 0444 }, - .show = queue_physical_block_size_show, -}; - -static struct queue_sysfs_entry queue_chunk_sectors_entry = { - .attr = {.name = "chunk_sectors", .mode = 0444 }, - .show = queue_chunk_sectors_show, -}; - -static struct queue_sysfs_entry queue_io_min_entry = { - .attr = {.name = "minimum_io_size", .mode = 0444 }, - .show = queue_io_min_show, -}; - -static struct queue_sysfs_entry queue_io_opt_entry = { - .attr = {.name = "optimal_io_size", .mode = 0444 }, - .show = queue_io_opt_show, -}; - -static struct queue_sysfs_entry queue_discard_granularity_entry = { - .attr = {.name = "discard_granularity", .mode = 0444 }, - .show = queue_discard_granularity_show, -}; - -static struct queue_sysfs_entry queue_discard_max_hw_entry = { - .attr = {.name = "discard_max_hw_bytes", .mode = 0444 }, - .show = queue_discard_max_hw_show, -}; - -static struct queue_sysfs_entry queue_discard_max_entry = { - .attr = {.name = "discard_max_bytes", .mode = 0644 }, - .show = queue_discard_max_show, - .store = queue_discard_max_store, -}; - -static struct queue_sysfs_entry queue_discard_zeroes_data_entry = { - .attr = {.name = "discard_zeroes_data", .mode = 0444 }, - 
.show = queue_discard_zeroes_data_show, -}; - -static struct queue_sysfs_entry queue_write_same_max_entry = { - .attr = {.name = "write_same_max_bytes", .mode = 0444 }, - .show = queue_write_same_max_show, -}; - -static struct queue_sysfs_entry queue_write_zeroes_max_entry = { - .attr = {.name = "write_zeroes_max_bytes", .mode = 0444 }, - .show = queue_write_zeroes_max_show, -}; - -static struct queue_sysfs_entry queue_zone_append_max_entry = { - .attr = {.name = "zone_append_max_bytes", .mode = 0444 }, - .show = queue_zone_append_max_show, -}; - -static struct queue_sysfs_entry queue_nonrot_entry = { - .attr = {.name = "rotational", .mode = 0644 }, - .show = queue_show_nonrot, - .store = queue_store_nonrot, -}; - -static struct queue_sysfs_entry queue_zoned_entry = { - .attr = {.name = "zoned", .mode = 0444 }, - .show = queue_zoned_show, -}; - -static struct queue_sysfs_entry queue_nr_zones_entry = { - .attr = {.name = "nr_zones", .mode = 0444 }, - .show = queue_nr_zones_show, -}; - -static struct queue_sysfs_entry queue_nomerges_entry = { - .attr = {.name = "nomerges", .mode = 0644 }, - .show = queue_nomerges_show, - .store = queue_nomerges_store, -}; - -static struct queue_sysfs_entry queue_rq_affinity_entry = { - .attr = {.name = "rq_affinity", .mode = 0644 }, - .show = queue_rq_affinity_show, - .store = queue_rq_affinity_store, -}; - -static struct queue_sysfs_entry queue_iostats_entry = { - .attr = {.name = "iostats", .mode = 0644 }, - .show = queue_show_iostats, - .store = queue_store_iostats, -}; - -static struct queue_sysfs_entry queue_random_entry = { - .attr = {.name = "add_random", .mode = 0644 }, - .show = queue_show_random, - .store = queue_store_random, -}; - -static struct queue_sysfs_entry queue_poll_entry = { - .attr = {.name = "io_poll", .mode = 0644 }, - .show = queue_poll_show, - .store = queue_poll_store, -}; - -static struct queue_sysfs_entry queue_poll_delay_entry = { - .attr = {.name = "io_poll_delay", .mode = 0644 }, - .show = 
queue_poll_delay_show, - .store = queue_poll_delay_store, -}; - -static struct queue_sysfs_entry queue_wc_entry = { - .attr = {.name = "write_cache", .mode = 0644 }, - .show = queue_wc_show, - .store = queue_wc_store, -}; - -static struct queue_sysfs_entry queue_fua_entry = { - .attr = {.name = "fua", .mode = 0444 }, - .show = queue_fua_show, -}; - -static struct queue_sysfs_entry queue_dax_entry = { - .attr = {.name = "dax", .mode = 0444 }, - .show = queue_dax_show, -}; - -static struct queue_sysfs_entry queue_io_timeout_entry = { - .attr = {.name = "io_timeout", .mode = 0644 }, - .show = queue_io_timeout_show, - .store = queue_io_timeout_store, -}; - -static struct queue_sysfs_entry queue_wb_lat_entry = { - .attr = {.name = "wbt_lat_usec", .mode = 0644 }, - .show = queue_wb_lat_show, - .store = queue_wb_lat_store, -}; - -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW -static struct queue_sysfs_entry throtl_sample_time_entry = { - .attr = {.name = "throttle_sample_time", .mode = 0644 }, - .show = blk_throtl_sample_time_show, - .store = blk_throtl_sample_time_store, -}; +QUEUE_RW_ENTRY(queue_wb_lat, "wbt_lat_usec"); #endif +/* Common attributes for bio-based and request-based queues. 
*/ static struct attribute *queue_attrs[] = { - &queue_requests_entry.attr, &queue_ra_entry.attr, &queue_max_hw_sectors_entry.attr, &queue_max_sectors_entry.attr, @@ -748,7 +619,6 @@ static struct attribute *queue_attrs[] = { &queue_max_discard_segments_entry.attr, &queue_max_integrity_segments_entry.attr, &queue_max_segment_size_entry.attr, - &queue_iosched_entry.attr, &queue_hw_sector_size_entry.attr, &queue_logical_block_size_entry.attr, &queue_physical_block_size_entry.attr, @@ -762,22 +632,37 @@ static struct attribute *queue_attrs[] = { &queue_write_same_max_entry.attr, &queue_write_zeroes_max_entry.attr, &queue_zone_append_max_entry.attr, + &queue_zone_write_granularity_entry.attr, &queue_nonrot_entry.attr, &queue_zoned_entry.attr, &queue_nr_zones_entry.attr, + &queue_max_open_zones_entry.attr, + &queue_max_active_zones_entry.attr, &queue_nomerges_entry.attr, - &queue_rq_affinity_entry.attr, &queue_iostats_entry.attr, + &queue_stable_writes_entry.attr, &queue_random_entry.attr, &queue_poll_entry.attr, &queue_wc_entry.attr, &queue_fua_entry.attr, &queue_dax_entry.attr, - &queue_wb_lat_entry.attr, &queue_poll_delay_entry.attr, - &queue_io_timeout_entry.attr, #ifdef CONFIG_BLK_DEV_THROTTLING_LOW - &throtl_sample_time_entry.attr, + &blk_throtl_sample_time_entry.attr, +#endif + &queue_virt_boundary_mask_entry.attr, + &queue_dma_alignment_entry.attr, + NULL, +}; + +/* Request-based queue attributes that are not relevant for bio-based queues. 
*/ +static struct attribute *blk_mq_queue_attrs[] = { + &queue_requests_entry.attr, + &elv_iosched_entry.attr, + &queue_rq_affinity_entry.attr, + &queue_io_timeout_entry.attr, +#ifdef CONFIG_BLK_WBT + &queue_wb_lat_entry.attr, #endif NULL, }; @@ -785,12 +670,28 @@ static struct attribute *queue_attrs[] = { static umode_t queue_attr_visible(struct kobject *kobj, struct attribute *attr, int n) { - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; + + if ((attr == &queue_max_open_zones_entry.attr || + attr == &queue_max_active_zones_entry.attr) && + !blk_queue_is_zoned(q)) + return 0; + + return attr->mode; +} + +static umode_t blk_mq_queue_attr_visible(struct kobject *kobj, + struct attribute *attr, int n) +{ + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; - if (attr == &queue_io_timeout_entry.attr && - (!q->mq_ops || !q->mq_ops->timeout)) - return 0; + if (!queue_is_mq(q)) + return 0; + + if (attr == &queue_io_timeout_entry.attr && !q->mq_ops->timeout) + return 0; return attr->mode; } @@ -800,6 +701,10 @@ static struct attribute_group queue_attr_group = { .is_visible = queue_attr_visible, }; +static struct attribute_group blk_mq_queue_attr_group = { + .attrs = blk_mq_queue_attrs, + .is_visible = blk_mq_queue_attr_visible, +}; #define to_queue(atr) container_of((atr), struct queue_sysfs_entry, attr) @@ -807,8 +712,8 @@ static ssize_t queue_attr_show(struct kobject *kobj, struct attribute *attr, char *page) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->show) @@ -824,135 +729,106 @@ queue_attr_store(struct kobject *kobj, struct attribute *attr, 
const char *page, size_t length) { struct queue_sysfs_entry *entry = to_queue(attr); - struct request_queue *q; + struct gendisk *disk = container_of(kobj, struct gendisk, queue_kobj); + struct request_queue *q = disk->queue; ssize_t res; if (!entry->store) return -EIO; - q = container_of(kobj, struct request_queue, kobj); mutex_lock(&q->sysfs_lock); res = entry->store(q, page, length); mutex_unlock(&q->sysfs_lock); return res; } -static void blk_free_queue_rcu(struct rcu_head *rcu_head) -{ - struct request_queue *q = container_of(rcu_head, struct request_queue, - rcu_head); - kmem_cache_free(blk_requestq_cachep, q); -} - -/* Unconfigure the I/O scheduler and dissociate from the cgroup controller. */ -static void blk_exit_queue(struct request_queue *q) -{ - /* - * Since the I/O scheduler exit code may access cgroup information, - * perform I/O scheduler exit before disassociating from the block - * cgroup controller. - */ - if (q->elevator) { - ioc_clear_queue(q); - __elevator_exit(q, q->elevator); - q->elevator = NULL; - } +static const struct sysfs_ops queue_sysfs_ops = { + .show = queue_attr_show, + .store = queue_attr_store, +}; - /* - * Remove all references to @q from the block cgroup controller before - * restoring @q->queue_lock to avoid that restoring this pointer causes - * e.g. blkcg_print_blkgs() to crash. - */ - blkcg_exit_queue(q); +static const struct attribute_group *blk_queue_attr_groups[] = { + &queue_attr_group, + &blk_mq_queue_attr_group, + NULL +}; - /* - * Since the cgroup code may dereference the @q->backing_dev_info - * pointer, only decrease its reference count after having removed the - * association with the block cgroup controller. 
- */ - bdi_put(q->backing_dev_info); +static void blk_queue_release(struct kobject *kobj) +{ + /* nothing to do here, all data is associated with the parent gendisk */ } +static const struct kobj_type blk_queue_ktype = { + .default_groups = blk_queue_attr_groups, + .sysfs_ops = &queue_sysfs_ops, + .release = blk_queue_release, +}; -/** - * __blk_release_queue - release a request queue - * @work: pointer to the release_work member of the request queue to be released - * - * Description: - * This function is called when a block device is being unregistered. The - * process of releasing a request queue starts with blk_cleanup_queue, which - * set the appropriate flags and then calls blk_put_queue, that decrements - * the reference counter of the request queue. Once the reference counter - * of the request queue reaches zero, blk_release_queue is called to release - * all allocated resources of the request queue. - */ -static void __blk_release_queue(struct work_struct *work) +static void blk_debugfs_remove(struct gendisk *disk) { - struct request_queue *q = container_of(work, typeof(*q), release_work); - - if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags)) - blk_stat_remove_callback(q, q->poll_cb); - blk_stat_free_callback(q->poll_cb); - - blk_free_queue_stats(q->stats); - - if (queue_is_mq(q)) - cancel_delayed_work_sync(&q->requeue_work); - - blk_exit_queue(q); - - blk_queue_free_zone_bitmaps(q); - - if (queue_is_mq(q)) - blk_mq_release(q); + struct request_queue *q = disk->queue; + mutex_lock(&q->debugfs_mutex); blk_trace_shutdown(q); - - if (queue_is_mq(q)) - blk_mq_debugfs_unregister(q); - - bioset_exit(&q->bio_split); - - ida_simple_remove(&blk_queue_ida, q->id); - call_rcu(&q->rcu_head, blk_free_queue_rcu); -} - -static void blk_release_queue(struct kobject *kobj) -{ - struct request_queue *q = - container_of(kobj, struct request_queue, kobj); - - INIT_WORK(&q->release_work, __blk_release_queue); - schedule_work(&q->release_work); + 
debugfs_remove_recursive(q->debugfs_dir); + q->debugfs_dir = NULL; + q->sched_debugfs_dir = NULL; + q->rqos_debugfs_dir = NULL; + mutex_unlock(&q->debugfs_mutex); } -static const struct sysfs_ops queue_sysfs_ops = { - .show = queue_attr_show, - .store = queue_attr_store, -}; - -struct kobj_type blk_queue_ktype = { - .sysfs_ops = &queue_sysfs_ops, - .release = blk_release_queue, -}; - /** * blk_register_queue - register a block layer queue with sysfs * @disk: Disk of which the request queue should be registered with sysfs. */ int blk_register_queue(struct gendisk *disk) { - int ret; - struct device *dev = disk_to_dev(disk); struct request_queue *q = disk->queue; - bool has_elevator = false; + int ret; - if (WARN_ON(!q)) - return -ENXIO; + mutex_lock(&q->sysfs_dir_lock); + kobject_init(&disk->queue_kobj, &blk_queue_ktype); + ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue"); + if (ret < 0) + goto out_put_queue_kobj; - WARN_ONCE(blk_queue_registered(q), - "%s is registering an already registered queue\n", - kobject_name(&dev->kobj)); + if (queue_is_mq(q)) { + ret = blk_mq_sysfs_register(disk); + if (ret) + goto out_put_queue_kobj; + } + mutex_lock(&q->sysfs_lock); + + mutex_lock(&q->debugfs_mutex); + q->debugfs_dir = debugfs_create_dir(disk->disk_name, blk_debugfs_root); + if (queue_is_mq(q)) + blk_mq_debugfs_register(q); + mutex_unlock(&q->debugfs_mutex); + + ret = disk_register_independent_access_ranges(disk); + if (ret) + goto out_debugfs_remove; + + if (q->elevator) { + ret = elv_register_queue(q, false); + if (ret) + goto out_unregister_ia_ranges; + } + + ret = blk_crypto_sysfs_register(disk); + if (ret) + goto out_elv_unregister; + + blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); + wbt_enable_default(disk); + blk_throtl_register(disk); + + /* Now everything is ready and send out KOBJ_ADD uevent */ + kobject_uevent(&disk->queue_kobj, KOBJ_ADD); + if (q->elevator) + kobject_uevent(&q->elevator->kobj, KOBJ_ADD); + mutex_unlock(&q->sysfs_lock); 
+ mutex_unlock(&q->sysfs_dir_lock); /* * SCSI probing may synchronously create and destroy a lot of @@ -968,61 +844,20 @@ int blk_register_queue(struct gendisk *disk) percpu_ref_switch_to_percpu(&q->q_usage_counter); } - ret = blk_trace_init_sysfs(dev); - if (ret) - return ret; - - mutex_lock(&q->sysfs_dir_lock); - - ret = kobject_add(&q->kobj, kobject_get(&dev->kobj), "%s", "queue"); - if (ret < 0) { - blk_trace_remove_sysfs(dev); - goto unlock; - } - - ret = sysfs_create_group(&q->kobj, &queue_attr_group); - if (ret) { - blk_trace_remove_sysfs(dev); - kobject_del(&q->kobj); - kobject_put(&dev->kobj); - goto unlock; - } - - if (queue_is_mq(q)) { - __blk_mq_register_dev(dev, q); - blk_mq_debugfs_register(q); - } - - mutex_lock(&q->sysfs_lock); - if (q->elevator) { - ret = elv_register_queue(q, false); - if (ret) { - mutex_unlock(&q->sysfs_lock); - mutex_unlock(&q->sysfs_dir_lock); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(dev); - kobject_put(&dev->kobj); - return ret; - } - has_elevator = true; - } - - blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q); - wbt_enable_default(q); - blk_throtl_register_queue(q); + return ret; - /* Now everything is ready and send out KOBJ_ADD uevent */ - kobject_uevent(&q->kobj, KOBJ_ADD); - if (has_elevator) - kobject_uevent(&q->elevator->kobj, KOBJ_ADD); +out_elv_unregister: + elv_unregister_queue(q); +out_unregister_ia_ranges: + disk_unregister_independent_access_ranges(disk); +out_debugfs_remove: + blk_debugfs_remove(disk); mutex_unlock(&q->sysfs_lock); - - ret = 0; -unlock: +out_put_queue_kobj: + kobject_put(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); return ret; } -EXPORT_SYMBOL_GPL(blk_register_queue); /** * blk_unregister_queue - counterpart of blk_register_queue() @@ -1057,17 +892,18 @@ void blk_unregister_queue(struct gendisk *disk) * structures that can be modified through sysfs. 
*/ if (queue_is_mq(q)) - blk_mq_unregister_dev(disk_to_dev(disk), q); - - kobject_uevent(&q->kobj, KOBJ_REMOVE); - kobject_del(&q->kobj); - blk_trace_remove_sysfs(disk_to_dev(disk)); + blk_mq_sysfs_unregister(disk); + blk_crypto_sysfs_unregister(disk); mutex_lock(&q->sysfs_lock); - if (q->elevator) - elv_unregister_queue(q); + elv_unregister_queue(q); + disk_unregister_independent_access_ranges(disk); mutex_unlock(&q->sysfs_lock); + + /* Now that we've deleted all child objects, we can delete the queue. */ + kobject_uevent(&disk->queue_kobj, KOBJ_REMOVE); + kobject_del(&disk->queue_kobj); mutex_unlock(&q->sysfs_dir_lock); - kobject_put(&disk_to_dev(disk)->kobj); + blk_debugfs_remove(disk); } diff --git a/block/blk-throttle.c b/block/blk-throttle.c index 209fdd8939fb..16f5766620a4 100644 --- a/block/blk-throttle.c +++ b/block/blk-throttle.c @@ -10,15 +10,16 @@ #include <linux/blkdev.h> #include <linux/bio.h> #include <linux/blktrace_api.h> -#include <linux/blk-cgroup.h> #include "blk.h" #include "blk-cgroup-rwstat.h" +#include "blk-stat.h" +#include "blk-throttle.h" /* Max dispatch from a group in 1 round */ -static int throtl_grp_quantum = 8; +#define THROTL_GRP_QUANTUM 8 /* Total max dispatch from all groups in one round */ -static int throtl_quantum = 32; +#define THROTL_QUANTUM 32 /* Throttling is performed over a slice and after that slice is renewed */ #define DFL_THROTL_SLICE_HD (HZ / 10) @@ -37,151 +38,11 @@ static int throtl_quantum = 32; */ #define LATENCY_FILTERED_HD (1000L) /* 1ms */ -static struct blkcg_policy blkcg_policy_throtl; - /* A workqueue to queue throttle related work */ static struct workqueue_struct *kthrotld_workqueue; -/* - * To implement hierarchical throttling, throtl_grps form a tree and bios - * are dispatched upwards level by level until they reach the top and get - * issued. 
When dispatching bios from the children and local group at each - * level, if the bios are dispatched into a single bio_list, there's a risk - * of a local or child group which can queue many bios at once filling up - * the list starving others. - * - * To avoid such starvation, dispatched bios are queued separately - * according to where they came from. When they are again dispatched to - * the parent, they're popped in round-robin order so that no single source - * hogs the dispatch window. - * - * throtl_qnode is used to keep the queued bios separated by their sources. - * Bios are queued to throtl_qnode which in turn is queued to - * throtl_service_queue and then dispatched in round-robin order. - * - * It's also used to track the reference counts on blkg's. A qnode always - * belongs to a throtl_grp and gets queued on itself or the parent, so - * incrementing the reference of the associated throtl_grp when a qnode is - * queued and decrementing when dequeued is enough to keep the whole blkg - * tree pinned while bios are in flight. - */ -struct throtl_qnode { - struct list_head node; /* service_queue->queued[] */ - struct bio_list bios; /* queued bios */ - struct throtl_grp *tg; /* tg this qnode belongs to */ -}; - -struct throtl_service_queue { - struct throtl_service_queue *parent_sq; /* the parent service_queue */ - - /* - * Bios queued directly to this service_queue or dispatched from - * children throtl_grp's. - */ - struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ - unsigned int nr_queued[2]; /* number of queued bios */ - - /* - * RB tree of active children throtl_grp's, which are sorted by - * their ->disptime. 
- */ - struct rb_root_cached pending_tree; /* RB tree of active tgs */ - unsigned int nr_pending; /* # queued in the tree */ - unsigned long first_pending_disptime; /* disptime of the first tg */ - struct timer_list pending_timer; /* fires on first_pending_disptime */ -}; - -enum tg_state_flags { - THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ - THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ -}; - #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) -enum { - LIMIT_LOW, - LIMIT_MAX, - LIMIT_CNT, -}; - -struct throtl_grp { - /* must be the first member */ - struct blkg_policy_data pd; - - /* active throtl group service_queue member */ - struct rb_node rb_node; - - /* throtl_data this group belongs to */ - struct throtl_data *td; - - /* this group's service queue */ - struct throtl_service_queue service_queue; - - /* - * qnode_on_self is used when bios are directly queued to this - * throtl_grp so that local bios compete fairly with bios - * dispatched from children. qnode_on_parent is used when bios are - * dispatched from this throtl_grp into its parent and will compete - * with the sibling qnode_on_parents and the parent's - * qnode_on_self. - */ - struct throtl_qnode qnode_on_self[2]; - struct throtl_qnode qnode_on_parent[2]; - - /* - * Dispatch time in jiffies. This is the estimated time when group - * will unthrottle and is ready to dispatch more bio. It is used as - * key to sort active groups in service tree. - */ - unsigned long disptime; - - unsigned int flags; - - /* are there any throtl rules between this group and td? 
*/ - bool has_rules[2]; - - /* internally used bytes per second rate limits */ - uint64_t bps[2][LIMIT_CNT]; - /* user configured bps limits */ - uint64_t bps_conf[2][LIMIT_CNT]; - - /* internally used IOPS limits */ - unsigned int iops[2][LIMIT_CNT]; - /* user configured IOPS limits */ - unsigned int iops_conf[2][LIMIT_CNT]; - - /* Number of bytes disptached in current slice */ - uint64_t bytes_disp[2]; - /* Number of bio's dispatched in current slice */ - unsigned int io_disp[2]; - - unsigned long last_low_overflow_time[2]; - - uint64_t last_bytes_disp[2]; - unsigned int last_io_disp[2]; - - unsigned long last_check_time; - - unsigned long latency_target; /* us */ - unsigned long latency_target_conf; /* us */ - /* When did we start a new slice */ - unsigned long slice_start[2]; - unsigned long slice_end[2]; - - unsigned long last_finish_time; /* ns / 1024 */ - unsigned long checked_last_finish_time; /* ns / 1024 */ - unsigned long avg_idletime; /* ns / 1024 */ - unsigned long idletime_threshold; /* us */ - unsigned long idletime_threshold_conf; /* us */ - - unsigned int bio_cnt; /* total bios */ - unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ - unsigned long bio_cnt_reset_time; - - struct blkg_rwstat stat_bytes; - struct blkg_rwstat stat_ios; -}; - /* We measure latency for request size from <= 4k to >= 1M */ #define LATENCY_BUCKET_SIZE 9 @@ -228,16 +89,6 @@ struct throtl_data static void throtl_pending_timer_fn(struct timer_list *t); -static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) -{ - return pd ? 
container_of(pd, struct throtl_grp, pd) : NULL; -} - -static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) -{ - return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); -} - static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) { return pd_to_blkg(&tg->pd); @@ -278,7 +129,7 @@ static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) /* * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to * make the IO dispatch more smooth. - * Scale up: linearly scale up according to lapsed time since upgrade. For + * Scale up: linearly scale up according to elapsed time since upgrade. For * every throtl_slice, the limit scales up 1/2 .low limit till the * limit hits .max limit * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit @@ -376,7 +227,7 @@ static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) break; \ if ((__tg)) { \ blk_add_cgroup_trace_msg(__td->queue, \ - tg_to_blkg(__tg)->blkcg, "throtl " fmt, ##args);\ + &tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\ } else { \ blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ } \ @@ -423,12 +274,13 @@ static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, */ static struct bio *throtl_peek_queued(struct list_head *queued) { - struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; + qn = list_first_entry(queued, struct throtl_qnode, node); bio = bio_list_peek(&qn->bios); WARN_ON_ONCE(!bio); return bio; @@ -451,12 +303,13 @@ static struct bio *throtl_peek_queued(struct list_head *queued) static struct bio *throtl_pop_queued(struct list_head *queued, struct throtl_grp **tg_to_put) { - struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); + struct throtl_qnode *qn; struct bio *bio; if (list_empty(queued)) return NULL; + qn = list_first_entry(queued, struct throtl_qnode, node); 
bio = bio_list_pop(&qn->bios); WARN_ON_ONCE(!bio); @@ -476,20 +329,19 @@ static struct bio *throtl_pop_queued(struct list_head *queued, /* init a service_queue, assumes the caller zeroed it */ static void throtl_service_queue_init(struct throtl_service_queue *sq) { - INIT_LIST_HEAD(&sq->queued[0]); - INIT_LIST_HEAD(&sq->queued[1]); + INIT_LIST_HEAD(&sq->queued[READ]); + INIT_LIST_HEAD(&sq->queued[WRITE]); sq->pending_tree = RB_ROOT_CACHED; timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); } -static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, - struct request_queue *q, - struct blkcg *blkcg) +static struct blkg_policy_data *throtl_pd_alloc(struct gendisk *disk, + struct blkcg *blkcg, gfp_t gfp) { struct throtl_grp *tg; int rw; - tg = kzalloc_node(sizeof(*tg), gfp, q->node); + tg = kzalloc_node(sizeof(*tg), gfp, disk->node_id); if (!tg) return NULL; @@ -542,8 +394,9 @@ static void throtl_pd_init(struct blkg_policy_data *pd) * If on the default hierarchy, we switch to properly hierarchical * behavior where limits on a given throtl_grp are applied to the * whole subtree rather than just the group itself. e.g. If 16M - * read_bps limit is set on the root group, the whole system can't - * exceed 16M for the device. + * read_bps limit is set on a parent group, summary bps of + * parent group and its subtree groups can't exceed 16M for the + * device. 
* * If not on the default hierarchy, the broken flat hierarchy * behavior is retained where all throtl_grps are treated as if @@ -568,11 +421,16 @@ static void tg_update_has_rules(struct throtl_grp *tg) struct throtl_data *td = tg->td; int rw; - for (rw = READ; rw <= WRITE; rw++) - tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || + for (rw = READ; rw <= WRITE; rw++) { + tg->has_rules_iops[rw] = + (parent_tg && parent_tg->has_rules_iops[rw]) || + (td->limit_valid[td->limit_index] && + tg_iops_limit(tg, rw) != UINT_MAX); + tg->has_rules_bps[rw] = + (parent_tg && parent_tg->has_rules_bps[rw]) || (td->limit_valid[td->limit_index] && - (tg_bps_limit(tg, rw) != U64_MAX || - tg_iops_limit(tg, rw) != UINT_MAX)); + (tg_bps_limit(tg, rw) != U64_MAX)); + } } static void throtl_pd_online(struct blkg_policy_data *pd) @@ -585,6 +443,7 @@ static void throtl_pd_online(struct blkg_policy_data *pd) tg_update_has_rules(tg); } +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void blk_throtl_update_limit_valid(struct throtl_data *td) { struct cgroup_subsys_state *pos_css; @@ -605,6 +464,11 @@ static void blk_throtl_update_limit_valid(struct throtl_data *td) td->limit_valid[LIMIT_LOW] = low_valid; } +#else +static inline void blk_throtl_update_limit_valid(struct throtl_data *td) +{ +} +#endif static void throtl_upgrade_state(struct throtl_data *td); static void throtl_pd_offline(struct blkg_policy_data *pd) @@ -636,9 +500,6 @@ static struct throtl_grp * throtl_rb_first(struct throtl_service_queue *parent_sq) { struct rb_node *n; - /* Service tree is empty */ - if (!parent_sq->nr_pending) - return NULL; n = rb_first_cached(&parent_sq->pending_tree); WARN_ON_ONCE(!n); @@ -652,7 +513,6 @@ static void throtl_rb_erase(struct rb_node *n, { rb_erase_cached(n, &parent_sq->pending_tree); RB_CLEAR_NODE(n); - --parent_sq->nr_pending; } static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) @@ -692,29 +552,25 @@ static void tg_service_queue_add(struct throtl_grp *tg) 
leftmost); } -static void __throtl_enqueue_tg(struct throtl_grp *tg) -{ - tg_service_queue_add(tg); - tg->flags |= THROTL_TG_PENDING; - tg->service_queue.parent_sq->nr_pending++; -} - static void throtl_enqueue_tg(struct throtl_grp *tg) { - if (!(tg->flags & THROTL_TG_PENDING)) - __throtl_enqueue_tg(tg); -} - -static void __throtl_dequeue_tg(struct throtl_grp *tg) -{ - throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); - tg->flags &= ~THROTL_TG_PENDING; + if (!(tg->flags & THROTL_TG_PENDING)) { + tg_service_queue_add(tg); + tg->flags |= THROTL_TG_PENDING; + tg->service_queue.parent_sq->nr_pending++; + } } static void throtl_dequeue_tg(struct throtl_grp *tg) { - if (tg->flags & THROTL_TG_PENDING) - __throtl_dequeue_tg(tg); + if (tg->flags & THROTL_TG_PENDING) { + struct throtl_service_queue *parent_sq = + tg->service_queue.parent_sq; + + throtl_rb_erase(&tg->rb_node, parent_sq); + --parent_sq->nr_pending; + tg->flags &= ~THROTL_TG_PENDING; + } } /* Call with queue lock held */ @@ -779,6 +635,8 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; /* * Previous slice has expired. We must have trimmed it after last @@ -786,7 +644,7 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, * that bandwidth. Do try to make use of that bandwidth while giving * credit. 
*/ - if (time_after_eq(start, tg->slice_start[rw])) + if (time_after(start, tg->slice_start[rw])) tg->slice_start[rw] = start; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; @@ -796,12 +654,18 @@ static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, tg->slice_end[rw], jiffies); } -static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) +static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, + bool clear_carryover) { tg->bytes_disp[rw] = 0; tg->io_disp[rw] = 0; tg->slice_start[rw] = jiffies; tg->slice_end[rw] = jiffies + tg->td->throtl_slice; + if (clear_carryover) { + tg->carryover_bytes[rw] = 0; + tg->carryover_ios[rw] = 0; + } + throtl_log(&tg->service_queue, "[%c] new slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -817,7 +681,7 @@ static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, unsigned long jiffy_end) { - tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); + throtl_set_slice_end(tg, rw, jiffy_end); throtl_log(&tg->service_queue, "[%c] extend slice start=%lu end=%lu jiffies=%lu", rw == READ ? 'R' : 'W', tg->slice_start[rw], @@ -833,11 +697,47 @@ static bool throtl_slice_used(struct throtl_grp *tg, bool rw) return true; } +static unsigned int calculate_io_allowed(u32 iops_limit, + unsigned long jiffy_elapsed) +{ + unsigned int io_allowed; + u64 tmp; + + /* + * jiffy_elapsed should not be a big value as minimum iops can be + * 1 then at max jiffy elapsed should be equivalent of 1 second as we + * will allow dispatch after 1 second and after that slice should + * have been trimmed. 
+ */ + + tmp = (u64)iops_limit * jiffy_elapsed; + do_div(tmp, HZ); + + if (tmp > UINT_MAX) + io_allowed = UINT_MAX; + else + io_allowed = tmp; + + return io_allowed; +} + +static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) +{ + /* + * Can result be wider than 64 bits? + * We check against 62, not 64, due to ilog2 truncation. + */ + if (ilog2(bps_limit) + ilog2(jiffy_elapsed) - ilog2(HZ) > 62) + return U64_MAX; + return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); +} + /* Trim the used slices and adjust slice start accordingly */ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) { - unsigned long nr_slices, time_elapsed, io_trim; - u64 bytes_trim, tmp; + unsigned long time_elapsed; + long long bytes_trim; + int io_trim; BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); @@ -852,97 +752,121 @@ static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) /* * A bio has been dispatched. Also adjust slice_end. It might happen * that initially cgroup limit was very low resulting in high - * slice_end, but later limit was bumped up and bio was dispached + * slice_end, but later limit was bumped up and bio was dispatched * sooner, then we need to reduce slice_end. A high bogus slice_end * is bad because it does not allow new slice to start. 
*/ throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); - time_elapsed = jiffies - tg->slice_start[rw]; - - nr_slices = time_elapsed / tg->td->throtl_slice; - - if (!nr_slices) + time_elapsed = rounddown(jiffies - tg->slice_start[rw], + tg->td->throtl_slice); + if (!time_elapsed) return; - tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; - do_div(tmp, HZ); - bytes_trim = tmp; - io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / - HZ; - - if (!bytes_trim && !io_trim) + bytes_trim = calculate_bytes_allowed(tg_bps_limit(tg, rw), + time_elapsed) + + tg->carryover_bytes[rw]; + io_trim = calculate_io_allowed(tg_iops_limit(tg, rw), time_elapsed) + + tg->carryover_ios[rw]; + if (bytes_trim <= 0 && io_trim <= 0) return; - if (tg->bytes_disp[rw] >= bytes_trim) + tg->carryover_bytes[rw] = 0; + if ((long long)tg->bytes_disp[rw] >= bytes_trim) tg->bytes_disp[rw] -= bytes_trim; else tg->bytes_disp[rw] = 0; - if (tg->io_disp[rw] >= io_trim) + tg->carryover_ios[rw] = 0; + if ((int)tg->io_disp[rw] >= io_trim) tg->io_disp[rw] -= io_trim; else tg->io_disp[rw] = 0; - tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; + tg->slice_start[rw] += time_elapsed; throtl_log(&tg->service_queue, - "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", - rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, - tg->slice_start[rw], tg->slice_end[rw], jiffies); + "[%c] trim slice nr=%lu bytes=%lld io=%d start=%lu end=%lu jiffies=%lu", + rw == READ ? 
'R' : 'W', time_elapsed / tg->td->throtl_slice, + bytes_trim, io_trim, tg->slice_start[rw], tg->slice_end[rw], + jiffies); } -static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) +static void __tg_update_carryover(struct throtl_grp *tg, bool rw) { - bool rw = bio_data_dir(bio); - unsigned int io_allowed; - unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; - u64 tmp; - - jiffy_elapsed = jiffies - tg->slice_start[rw]; - - /* Round up to the next throttle slice, wait time must be nonzero */ - jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); /* - * jiffy_elapsed_rnd should not be a big value as minimum iops can be - * 1 then at max jiffy elapsed should be equivalent of 1 second as we - * will allow dispatch after 1 second and after that slice should - * have been trimmed. + * If config is updated while bios are still throttled, calculate and + * accumulate how many bytes/ios are waited across changes. And + * carryover_bytes/ios will be used to calculate new wait time under new + * configuration. */ + if (bps_limit != U64_MAX) + tg->carryover_bytes[rw] += + calculate_bytes_allowed(bps_limit, jiffy_elapsed) - + tg->bytes_disp[rw]; + if (iops_limit != UINT_MAX) + tg->carryover_ios[rw] += + calculate_io_allowed(iops_limit, jiffy_elapsed) - + tg->io_disp[rw]; +} - tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd; - do_div(tmp, HZ); +static void tg_update_carryover(struct throtl_grp *tg) +{ + if (tg->service_queue.nr_queued[READ]) + __tg_update_carryover(tg, READ); + if (tg->service_queue.nr_queued[WRITE]) + __tg_update_carryover(tg, WRITE); - if (tmp > UINT_MAX) - io_allowed = UINT_MAX; - else - io_allowed = tmp; + /* see comments in struct throtl_grp for meaning of these fields. 
*/ + throtl_log(&tg->service_queue, "%s: %lld %lld %d %d\n", __func__, + tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], + tg->carryover_ios[READ], tg->carryover_ios[WRITE]); +} - if (tg->io_disp[rw] + 1 <= io_allowed) { - if (wait) - *wait = 0; - return true; +static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, + u32 iops_limit) +{ + bool rw = bio_data_dir(bio); + int io_allowed; + unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; + + if (iops_limit == UINT_MAX) { + return 0; } + jiffy_elapsed = jiffies - tg->slice_start[rw]; + + /* Round up to the next throttle slice, wait time must be nonzero */ + jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); + io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + + tg->carryover_ios[rw]; + if (io_allowed > 0 && tg->io_disp[rw] + 1 <= io_allowed) + return 0; + /* Calc approx time to dispatch */ jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; - - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } -static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, - unsigned long *wait) +static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, + u64 bps_limit) { bool rw = bio_data_dir(bio); - u64 bytes_allowed, extra_bytes, tmp; + long long bytes_allowed; + u64 extra_bytes; unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; unsigned int bio_size = throtl_bio_data_size(bio); + /* no need to throttle if this bio's bytes have been accounted */ + if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { + return 0; + } + jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; /* Slice has just started. 
Consider one slice interval */ @@ -950,20 +874,14 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, jiffy_elapsed_rnd = tg->td->throtl_slice; jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); - - tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd; - do_div(tmp, HZ); - bytes_allowed = tmp; - - if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { - if (wait) - *wait = 0; - return true; - } + bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + + tg->carryover_bytes[rw]; + if (bytes_allowed > 0 && tg->bytes_disp[rw] + bio_size <= bytes_allowed) + return 0; /* Calc approx time to dispatch */ extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; - jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw)); + jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); if (!jiffy_wait) jiffy_wait = 1; @@ -973,9 +891,7 @@ static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, * up we did. Add that time also. */ jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); - if (wait) - *wait = jiffy_wait; - return false; + return jiffy_wait; } /* @@ -987,6 +903,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, { bool rw = bio_data_dir(bio); unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; + u64 bps_limit = tg_bps_limit(tg, rw); + u32 iops_limit = tg_iops_limit(tg, rw); /* * Currently whole state machine of group depends on first bio @@ -998,8 +916,8 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, bio != throtl_peek_queued(&tg->service_queue.queued[rw])); /* If tg->bps = -1, then BW is unlimited */ - if (tg_bps_limit(tg, rw) == U64_MAX && - tg_iops_limit(tg, rw) == UINT_MAX) { + if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || + tg->flags & THROTL_TG_CANCELING) { if (wait) *wait = 0; return true; @@ -1013,7 +931,7 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, * slice and it should be extended instead. 
*/ if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) - throtl_start_new_slice(tg, rw); + throtl_start_new_slice(tg, rw, true); else { if (time_before(tg->slice_end[rw], jiffies + tg->td->throtl_slice)) @@ -1021,8 +939,9 @@ static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, jiffies + tg->td->throtl_slice); } - if (tg_with_in_bps_limit(tg, bio, &bps_wait) && - tg_with_in_iops_limit(tg, bio, &iops_wait)) { + bps_wait = tg_within_bps_limit(tg, bio, bps_limit); + iops_wait = tg_within_iops_limit(tg, bio, iops_limit); + if (bps_wait + iops_wait == 0) { if (wait) *wait = 0; return true; @@ -1045,19 +964,13 @@ static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) unsigned int bio_size = throtl_bio_data_size(bio); /* Charge the bio to the group */ - tg->bytes_disp[rw] += bio_size; + if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { + tg->bytes_disp[rw] += bio_size; + tg->last_bytes_disp[rw] += bio_size; + } + tg->io_disp[rw]++; - tg->last_bytes_disp[rw] += bio_size; tg->last_io_disp[rw]++; - - /* - * BIO_THROTTLED is used to prevent the same bio to be throttled - * more than once as a throttled bio will go through blk-throtl the - * second time when it eventually gets issued. Set it when a bio - * is being charged to a tg. - */ - if (!bio_flagged(bio, BIO_THROTTLED)) - bio_set_flag(bio, BIO_THROTTLED); } /** @@ -1082,7 +995,7 @@ static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, * If @tg doesn't currently have any bios queued in the same * direction, queueing @bio can change when @tg should be * dispatched. Mark that @tg was empty. This is automatically - * cleaered on the next tg_update_disptime(). + * cleared on the next tg_update_disptime(). 
*/ if (!sq->nr_queued[rw]) tg->flags |= THROTL_TG_WAS_EMPTY; @@ -1111,9 +1024,9 @@ static void tg_update_disptime(struct throtl_grp *tg) disptime = jiffies + min_wait; /* Update dispatch time */ - throtl_dequeue_tg(tg); + throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); tg->disptime = disptime; - throtl_enqueue_tg(tg); + tg_service_queue_add(tg); /* see throtl_add_bio_tg() */ tg->flags &= ~THROTL_TG_WAS_EMPTY; @@ -1159,6 +1072,7 @@ static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); start_parent_slice_with_credit(tg, parent_tg, rw); } else { + bio_set_flag(bio, BIO_BPS_THROTTLED); throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], &parent_sq->queued[rw]); BUG_ON(tg->td->nr_queued[rw] <= 0); @@ -1175,8 +1089,8 @@ static int throtl_dispatch_tg(struct throtl_grp *tg) { struct throtl_service_queue *sq = &tg->service_queue; unsigned int nr_reads = 0, nr_writes = 0; - unsigned int max_nr_reads = throtl_grp_quantum*3/4; - unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; + unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; + unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; struct bio *bio; /* Try to dispatch 75% READS and 25% WRITES */ @@ -1209,24 +1123,28 @@ static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) unsigned int nr_disp = 0; while (1) { - struct throtl_grp *tg = throtl_rb_first(parent_sq); + struct throtl_grp *tg; struct throtl_service_queue *sq; + if (!parent_sq->nr_pending) + break; + + tg = throtl_rb_first(parent_sq); if (!tg) break; if (time_before(jiffies, tg->disptime)) break; - throtl_dequeue_tg(tg); - nr_disp += throtl_dispatch_tg(tg); sq = &tg->service_queue; - if (sq->nr_queued[0] || sq->nr_queued[1]) + if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) tg_update_disptime(tg); + else + throtl_dequeue_tg(tg); - if (nr_disp >= throtl_quantum) + if (nr_disp >= THROTL_QUANTUM) break; } @@ -1255,12 +1173,22 @@ static 
void throtl_pending_timer_fn(struct timer_list *t) struct throtl_service_queue *sq = from_timer(sq, t, pending_timer); struct throtl_grp *tg = sq_to_tg(sq); struct throtl_data *td = sq_to_td(sq); - struct request_queue *q = td->queue; struct throtl_service_queue *parent_sq; + struct request_queue *q; bool dispatched; int ret; + /* throtl_data may be gone, so figure out request queue by blkg */ + if (tg) + q = tg->pd.blkg->q; + else + q = td->queue; + spin_lock_irq(&q->queue_lock); + + if (!q->root_blkg) + goto out_unlock; + if (throtl_can_upgrade(td, NULL)) throtl_upgrade_state(td); @@ -1303,7 +1231,7 @@ again: } } } else { - /* reached the top-level, queue issueing */ + /* reached the top-level, queue issuing */ queue_work(kthrotld_workqueue, &td->dispatch_work); } out_unlock: @@ -1314,8 +1242,8 @@ out_unlock: * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work * @work: work item being executed * - * This function is queued for execution when bio's reach the bio_lists[] - * of throtl_data->service_queue. Those bio's are ready and issued by this + * This function is queued for execution when bios reach the bio_lists[] + * of throtl_data->service_queue. Those bios are ready and issued by this * function. */ static void blk_throtl_dispatch_work_fn(struct work_struct *work) @@ -1339,8 +1267,8 @@ static void blk_throtl_dispatch_work_fn(struct work_struct *work) if (!bio_list_empty(&bio_list_on_stack)) { blk_start_plug(&plug); - while((bio = bio_list_pop(&bio_list_on_stack))) - generic_make_request(bio); + while ((bio = bio_list_pop(&bio_list_on_stack))) + submit_bio_noacct_nocheck(bio); blk_finish_plug(&plug); } } @@ -1392,6 +1320,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); + rcu_read_lock(); /* * Update has_rules[] flags for the updated tg's subtree. 
A tg is * considered to have rules if either the tg itself or any of its @@ -1419,6 +1348,7 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) this_tg->latency_target = max(this_tg->latency_target, parent_tg->latency_target); } + rcu_read_unlock(); /* * We're already holding queue_lock and know @tg is valid. Let's @@ -1428,8 +1358,8 @@ static void tg_conf_updated(struct throtl_grp *tg, bool global) * that a group's limit are dropped suddenly and we don't want to * account recently dispatched IO with new low rate. */ - throtl_start_new_slice(tg, 0); - throtl_start_new_slice(tg, 1); + throtl_start_new_slice(tg, READ, false); + throtl_start_new_slice(tg, WRITE, false); if (tg->flags & THROTL_TG_PENDING) { tg_update_disptime(tg); @@ -1446,9 +1376,11 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, int ret; u64 v; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; ret = -EINVAL; if (sscanf(ctx.body, "%llu", &v) != 1) @@ -1457,6 +1389,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, v = U64_MAX; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); if (is_u64) *(u64 *)((void *)tg + of_cft(of)->private) = v; @@ -1466,7 +1399,7 @@ static ssize_t tg_set_conf(struct kernfs_open_file *of, tg_conf_updated(tg, false); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1638,11 +1571,14 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, int ret; int index = of_cft(of)->private; - ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); + blkg_conf_init(&ctx, buf); + + ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, &ctx); if (ret) - return ret; + goto out_finish; tg = blkg_to_tg(ctx.blkg); + tg_update_carryover(tg); v[0] = tg->bps_conf[READ][index]; v[1] = tg->bps_conf[WRITE][index]; @@ -1674,13 +1610,13 @@ static ssize_t 
tg_set_limit(struct kernfs_open_file *of, goto out_finish; ret = -EINVAL; - if (!strcmp(tok, "rbps")) + if (!strcmp(tok, "rbps") && val > 1) v[0] = val; - else if (!strcmp(tok, "wbps")) + else if (!strcmp(tok, "wbps") && val > 1) v[1] = val; - else if (!strcmp(tok, "riops")) + else if (!strcmp(tok, "riops") && val > 1) v[2] = min_t(u64, val, UINT_MAX); - else if (!strcmp(tok, "wiops")) + else if (!strcmp(tok, "wiops") && val > 1) v[3] = min_t(u64, val, UINT_MAX); else if (off == LIMIT_LOW && !strcmp(tok, "idle")) idle_time = val; @@ -1738,7 +1674,7 @@ static ssize_t tg_set_limit(struct kernfs_open_file *of, tg->td->limit_valid[LIMIT_LOW]); ret = 0; out_finish: - blkg_conf_finish(&ctx); + blkg_conf_exit(&ctx); return ret ?: nbytes; } @@ -1769,7 +1705,7 @@ static void throtl_shutdown_wq(struct request_queue *q) cancel_work_sync(&td->dispatch_work); } -static struct blkcg_policy blkcg_policy_throtl = { +struct blkcg_policy blkcg_policy_throtl = { .dfl_cftypes = throtl_files, .legacy_cftypes = throtl_legacy_files, @@ -1780,6 +1716,52 @@ static struct blkcg_policy blkcg_policy_throtl = { .pd_free_fn = throtl_pd_free, }; +void blk_throtl_cancel_bios(struct gendisk *disk) +{ + struct request_queue *q = disk->queue; + struct cgroup_subsys_state *pos_css; + struct blkcg_gq *blkg; + + spin_lock_irq(&q->queue_lock); + /* + * queue_lock is held, rcu lock is not needed here technically. + * However, rcu lock is still held to emphasize that following + * path need RCU protection and to prevent warning from lockdep. + */ + rcu_read_lock(); + blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) { + struct throtl_grp *tg = blkg_to_tg(blkg); + struct throtl_service_queue *sq = &tg->service_queue; + + /* + * Set the flag to make sure throtl_pending_timer_fn() won't + * stop until all throttled bios are dispatched. 
+ */ + tg->flags |= THROTL_TG_CANCELING; + + /* + * Do not dispatch cgroup without THROTL_TG_PENDING or cgroup + * will be inserted to service queue without THROTL_TG_PENDING + * set in tg_update_disptime below. Then IO dispatched from + * child in tg_dispatch_one_bio will trigger double insertion + * and corrupt the tree. + */ + if (!(tg->flags & THROTL_TG_PENDING)) + continue; + + /* + * Update disptime after setting the above flag to make sure + * throtl_select_dispatch() won't exit without dispatching. + */ + tg_update_disptime(tg); + + throtl_schedule_pending_timer(sq, jiffies + 1); + } + rcu_read_unlock(); + spin_unlock_irq(&q->queue_lock); +} + +#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) { unsigned long rtime = jiffies, wtime = jiffies; @@ -1791,7 +1773,6 @@ static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg) return min(rtime, wtime); } -/* tg should not be an intermediate node */ static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg) { struct throtl_service_queue *parent_sq; @@ -1845,24 +1826,29 @@ static bool throtl_tg_is_idle(struct throtl_grp *tg) return ret; } -static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw) { struct throtl_service_queue *sq = &tg->service_queue; - bool read_limit, write_limit; + bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW]; /* - * if cgroup reaches low limit (if low limit is 0, the cgroup always - * reaches), it's ok to upgrade to next limit + * if low limit is zero, low limit is always reached. + * if low limit is non-zero, we can check if there is any request + * is queued to determine if low limit is reached as we throttle + * request according to limit. 
*/ - read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW]; - write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]; - if (!read_limit && !write_limit) - return true; - if (read_limit && sq->nr_queued[READ] && - (!write_limit || sq->nr_queued[WRITE])) - return true; - if (write_limit && sq->nr_queued[WRITE] && - (!read_limit || sq->nr_queued[READ])) + return !limit || sq->nr_queued[rw]; +} + +static bool throtl_tg_can_upgrade(struct throtl_grp *tg) +{ + /* + * cgroup reaches low limit when low limit of READ and WRITE are + * both reached, it's ok to upgrade to next limit if cgroup reaches + * low limit + */ + if (throtl_low_limit_reached(tg, READ) && + throtl_low_limit_reached(tg, WRITE)) return true; if (time_after_eq(jiffies, @@ -1957,7 +1943,7 @@ static void throtl_upgrade_state(struct throtl_data *td) queue_work(kthrotld_workqueue, &td->dispatch_work); } -static void throtl_downgrade_state(struct throtl_data *td, int new) +static void throtl_downgrade_state(struct throtl_data *td) { td->scale /= 2; @@ -1967,7 +1953,7 @@ static void throtl_downgrade_state(struct throtl_data *td, int new) return; } - td->limit_index = new; + td->limit_index = LIMIT_LOW; td->low_downgrade_time = jiffies; } @@ -1980,8 +1966,7 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) * If cgroup is below low limit, consider downgrade and throttle other * cgroups */ - if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) && - time_after_eq(now, tg_last_low_overflow_time(tg) + + if (time_after_eq(now, tg_last_low_overflow_time(tg) + td->throtl_slice) && (!throtl_tg_is_idle(tg) || !list_empty(&tg_to_blkg(tg)->blkcg->css.children))) @@ -1991,6 +1976,11 @@ static bool throtl_tg_can_downgrade(struct throtl_grp *tg) static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg) { + struct throtl_data *td = tg->td; + + if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice)) + return false; + while (true) { if 
(!throtl_tg_can_downgrade(tg)) return false; @@ -2054,7 +2044,7 @@ static void throtl_downgrade_check(struct throtl_grp *tg) * cgroups */ if (throtl_hierarchy_can_downgrade(tg)) - throtl_downgrade_state(tg->td, LIMIT_LOW); + throtl_downgrade_state(tg->td); tg->last_bytes_disp[READ] = 0; tg->last_bytes_disp[WRITE] = 0; @@ -2064,10 +2054,14 @@ static void throtl_downgrade_check(struct throtl_grp *tg) static void blk_throtl_update_idletime(struct throtl_grp *tg) { - unsigned long now = ktime_get_ns() >> 10; + unsigned long now; unsigned long last_finish_time = tg->last_finish_time; - if (now <= last_finish_time || last_finish_time == 0 || + if (last_finish_time == 0) + return; + + now = ktime_get_ns() >> 10; + if (now <= last_finish_time || last_finish_time == tg->checked_last_finish_time) return; @@ -2075,7 +2069,6 @@ static void blk_throtl_update_idletime(struct throtl_grp *tg) tg->checked_last_finish_time = last_finish_time; } -#ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_update_latency_buckets(struct throtl_data *td) { struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE]; @@ -2083,7 +2076,7 @@ static void throtl_update_latency_buckets(struct throtl_data *td) unsigned long last_latency[2] = { 0 }; unsigned long latency[2]; - if (!blk_queue_nonrot(td->queue)) + if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW]) return; if (time_before(jiffies, td->last_calculate_time + HZ)) return; @@ -2156,32 +2149,42 @@ static void throtl_update_latency_buckets(struct throtl_data *td) static inline void throtl_update_latency_buckets(struct throtl_data *td) { } + +static void blk_throtl_update_idletime(struct throtl_grp *tg) +{ +} + +static void throtl_downgrade_check(struct throtl_grp *tg) +{ +} + +static void throtl_upgrade_check(struct throtl_grp *tg) +{ +} + +static bool throtl_can_upgrade(struct throtl_data *td, + struct throtl_grp *this_tg) +{ + return false; +} + +static void throtl_upgrade_state(struct throtl_data *td) +{ +} #endif -bool 
blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, - struct bio *bio) +bool __blk_throtl_bio(struct bio *bio) { + struct request_queue *q = bdev_get_queue(bio->bi_bdev); + struct blkcg_gq *blkg = bio->bi_blkg; struct throtl_qnode *qn = NULL; - struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); + struct throtl_grp *tg = blkg_to_tg(blkg); struct throtl_service_queue *sq; bool rw = bio_data_dir(bio); bool throttled = false; struct throtl_data *td = tg->td; - WARN_ON_ONCE(!rcu_read_lock_held()); - - /* see throtl_charge_bio() */ - if (bio_flagged(bio, BIO_THROTTLED)) - goto out; - - if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { - blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, - bio->bi_iter.bi_size); - blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); - } - - if (!tg->has_rules[rw]) - goto out; + rcu_read_lock(); spin_lock_irq(&q->queue_lock); @@ -2229,14 +2232,16 @@ again: /* * @bio passed through this layer without being throttled. - * Climb up the ladder. If we''re already at the top, it + * Climb up the ladder. If we're already at the top, it * can be executed directly. 
*/ qn = &tg->qnode_on_parent[rw]; sq = sq->parent_sq; tg = sq_to_tg(sq); - if (!tg) + if (!tg) { + bio_set_flag(bio, BIO_BPS_THROTTLED); goto out_unlock; + } } /* out-of-limit, queue to @tg */ @@ -2265,21 +2270,21 @@ again: } out_unlock: - spin_unlock_irq(&q->queue_lock); -out: - bio_set_flag(bio, BIO_THROTTLED); - #ifdef CONFIG_BLK_DEV_THROTTLING_LOW if (throttled || !td->track_bio_latency) bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY; #endif + spin_unlock_irq(&q->queue_lock); + + rcu_read_unlock(); return throttled; } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW static void throtl_track_latency(struct throtl_data *td, sector_t size, - int op, unsigned long time) + enum req_op op, unsigned long time) { + const bool rw = op_is_write(op); struct latency_bucket *latency; int index; @@ -2290,10 +2295,10 @@ static void throtl_track_latency(struct throtl_data *td, sector_t size, index = request_bucket_index(size); - latency = get_cpu_ptr(td->latency_buckets[op]); + latency = get_cpu_ptr(td->latency_buckets[rw]); latency[index].total_latency += time; latency[index].samples++; - put_cpu_ptr(td->latency_buckets[op]); + put_cpu_ptr(td->latency_buckets[rw]); } void blk_throtl_stat_add(struct request *rq, u64 time_ns) @@ -2319,6 +2324,8 @@ void blk_throtl_bio_endio(struct bio *bio) if (!blkg) return; tg = blkg_to_tg(blkg); + if (!tg->td->limit_valid[LIMIT_LOW]) + return; finish_time_ns = ktime_get_ns(); tg->last_finish_time = finish_time_ns >> 10; @@ -2358,8 +2365,9 @@ void blk_throtl_bio_endio(struct bio *bio) } #endif -int blk_throtl_init(struct request_queue *q) +int blk_throtl_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int ret; @@ -2392,7 +2400,7 @@ int blk_throtl_init(struct request_queue *q) td->low_downgrade_time = jiffies; /* activate policy */ - ret = blkcg_activate_policy(q, &blkcg_policy_throtl); + ret = blkcg_activate_policy(disk, &blkcg_policy_throtl); if (ret) { free_percpu(td->latency_buckets[READ]); 
free_percpu(td->latency_buckets[WRITE]); @@ -2401,18 +2409,22 @@ int blk_throtl_init(struct request_queue *q) return ret; } -void blk_throtl_exit(struct request_queue *q) +void blk_throtl_exit(struct gendisk *disk) { + struct request_queue *q = disk->queue; + BUG_ON(!q->td); + del_timer_sync(&q->td->service_queue.pending_timer); throtl_shutdown_wq(q); - blkcg_deactivate_policy(q, &blkcg_policy_throtl); + blkcg_deactivate_policy(disk, &blkcg_policy_throtl); free_percpu(q->td->latency_buckets[READ]); free_percpu(q->td->latency_buckets[WRITE]); kfree(q->td); } -void blk_throtl_register_queue(struct request_queue *q) +void blk_throtl_register(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct throtl_data *td; int i; @@ -2433,11 +2445,12 @@ void blk_throtl_register_queue(struct request_queue *q) #ifndef CONFIG_BLK_DEV_THROTTLING_LOW /* if no low limit, use previous default */ td->throtl_slice = DFL_THROTL_SLICE_HD; -#endif +#else td->track_bio_latency = !queue_is_mq(q); if (!td->track_bio_latency) blk_stat_enable_accounting(q); +#endif } #ifdef CONFIG_BLK_DEV_THROTTLING_LOW diff --git a/block/blk-throttle.h b/block/blk-throttle.h new file mode 100644 index 000000000000..bffbc9cfc8ab --- /dev/null +++ b/block/blk-throttle.h @@ -0,0 +1,217 @@ +#ifndef BLK_THROTTLE_H +#define BLK_THROTTLE_H + +#include "blk-cgroup-rwstat.h" + +/* + * To implement hierarchical throttling, throtl_grps form a tree and bios + * are dispatched upwards level by level until they reach the top and get + * issued. When dispatching bios from the children and local group at each + * level, if the bios are dispatched into a single bio_list, there's a risk + * of a local or child group which can queue many bios at once filling up + * the list starving others. + * + * To avoid such starvation, dispatched bios are queued separately + * according to where they came from. 
When they are again dispatched to + * the parent, they're popped in round-robin order so that no single source + * hogs the dispatch window. + * + * throtl_qnode is used to keep the queued bios separated by their sources. + * Bios are queued to throtl_qnode which in turn is queued to + * throtl_service_queue and then dispatched in round-robin order. + * + * It's also used to track the reference counts on blkg's. A qnode always + * belongs to a throtl_grp and gets queued on itself or the parent, so + * incrementing the reference of the associated throtl_grp when a qnode is + * queued and decrementing when dequeued is enough to keep the whole blkg + * tree pinned while bios are in flight. + */ +struct throtl_qnode { + struct list_head node; /* service_queue->queued[] */ + struct bio_list bios; /* queued bios */ + struct throtl_grp *tg; /* tg this qnode belongs to */ +}; + +struct throtl_service_queue { + struct throtl_service_queue *parent_sq; /* the parent service_queue */ + + /* + * Bios queued directly to this service_queue or dispatched from + * children throtl_grp's. + */ + struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ + unsigned int nr_queued[2]; /* number of queued bios */ + + /* + * RB tree of active children throtl_grp's, which are sorted by + * their ->disptime. 
+ */ + struct rb_root_cached pending_tree; /* RB tree of active tgs */ + unsigned int nr_pending; /* # queued in the tree */ + unsigned long first_pending_disptime; /* disptime of the first tg */ + struct timer_list pending_timer; /* fires on first_pending_disptime */ +}; + +enum tg_state_flags { + THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ + THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ + THROTL_TG_CANCELING = 1 << 2, /* starts to cancel bio */ +}; + +enum { + LIMIT_LOW, + LIMIT_MAX, + LIMIT_CNT, +}; + +struct throtl_grp { + /* must be the first member */ + struct blkg_policy_data pd; + + /* active throtl group service_queue member */ + struct rb_node rb_node; + + /* throtl_data this group belongs to */ + struct throtl_data *td; + + /* this group's service queue */ + struct throtl_service_queue service_queue; + + /* + * qnode_on_self is used when bios are directly queued to this + * throtl_grp so that local bios compete fairly with bios + * dispatched from children. qnode_on_parent is used when bios are + * dispatched from this throtl_grp into its parent and will compete + * with the sibling qnode_on_parents and the parent's + * qnode_on_self. + */ + struct throtl_qnode qnode_on_self[2]; + struct throtl_qnode qnode_on_parent[2]; + + /* + * Dispatch time in jiffies. This is the estimated time when group + * will unthrottle and is ready to dispatch more bio. It is used as + * key to sort active groups in service tree. + */ + unsigned long disptime; + + unsigned int flags; + + /* are there any throtl rules between this group and td? 
*/ + bool has_rules_bps[2]; + bool has_rules_iops[2]; + + /* internally used bytes per second rate limits */ + uint64_t bps[2][LIMIT_CNT]; + /* user configured bps limits */ + uint64_t bps_conf[2][LIMIT_CNT]; + + /* internally used IOPS limits */ + unsigned int iops[2][LIMIT_CNT]; + /* user configured IOPS limits */ + unsigned int iops_conf[2][LIMIT_CNT]; + + /* Number of bytes dispatched in current slice */ + uint64_t bytes_disp[2]; + /* Number of bio's dispatched in current slice */ + unsigned int io_disp[2]; + + unsigned long last_low_overflow_time[2]; + + uint64_t last_bytes_disp[2]; + unsigned int last_io_disp[2]; + + /* + * The following two fields are updated when new configuration is + * submitted while some bios are still throttled, they record how many + * bytes/ios are waited already in previous configuration, and they will + * be used to calculate wait time under new configuration. + */ + long long carryover_bytes[2]; + int carryover_ios[2]; + + unsigned long last_check_time; + + unsigned long latency_target; /* us */ + unsigned long latency_target_conf; /* us */ + /* When did we start a new slice */ + unsigned long slice_start[2]; + unsigned long slice_end[2]; + + unsigned long last_finish_time; /* ns / 1024 */ + unsigned long checked_last_finish_time; /* ns / 1024 */ + unsigned long avg_idletime; /* ns / 1024 */ + unsigned long idletime_threshold; /* us */ + unsigned long idletime_threshold_conf; /* us */ + + unsigned int bio_cnt; /* total bios */ + unsigned int bad_bio_cnt; /* bios exceeding latency threshold */ + unsigned long bio_cnt_reset_time; + + struct blkg_rwstat stat_bytes; + struct blkg_rwstat stat_ios; +}; + +extern struct blkcg_policy blkcg_policy_throtl; + +static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) +{ + return pd ? 
container_of(pd, struct throtl_grp, pd) : NULL; +} + +static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) +{ + return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); +} + +/* + * Internal throttling interface + */ +#ifndef CONFIG_BLK_DEV_THROTTLING +static inline int blk_throtl_init(struct gendisk *disk) { return 0; } +static inline void blk_throtl_exit(struct gendisk *disk) { } +static inline void blk_throtl_register(struct gendisk *disk) { } +static inline bool blk_throtl_bio(struct bio *bio) { return false; } +static inline void blk_throtl_cancel_bios(struct gendisk *disk) { } +#else /* CONFIG_BLK_DEV_THROTTLING */ +int blk_throtl_init(struct gendisk *disk); +void blk_throtl_exit(struct gendisk *disk); +void blk_throtl_register(struct gendisk *disk); +bool __blk_throtl_bio(struct bio *bio); +void blk_throtl_cancel_bios(struct gendisk *disk); + +static inline bool blk_should_throtl(struct bio *bio) +{ + struct throtl_grp *tg = blkg_to_tg(bio->bi_blkg); + int rw = bio_data_dir(bio); + + if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) { + if (!bio_flagged(bio, BIO_CGROUP_ACCT)) { + bio_set_flag(bio, BIO_CGROUP_ACCT); + blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf, + bio->bi_iter.bi_size); + } + blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1); + } + + /* iops limit is always counted */ + if (tg->has_rules_iops[rw]) + return true; + + if (tg->has_rules_bps[rw] && !bio_flagged(bio, BIO_BPS_THROTTLED)) + return true; + + return false; +} + +static inline bool blk_throtl_bio(struct bio *bio) +{ + + if (!blk_should_throtl(bio)) + return false; + + return __blk_throtl_bio(bio); +} +#endif /* CONFIG_BLK_DEV_THROTTLING */ + +#endif diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 8aa68fae96ad..1b8de0417fc1 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -20,13 +20,11 @@ static int __init setup_fail_io_timeout(char *str) } __setup("fail_io_timeout=", setup_fail_io_timeout); -int blk_should_fake_timeout(struct request_queue *q) +bool 
__blk_should_fake_timeout(struct request_queue *q) { - if (!test_bit(QUEUE_FLAG_FAIL_IO, &q->queue_flags)) - return 0; - return should_fail(&fail_io_timeout, 1); } +EXPORT_SYMBOL_GPL(__blk_should_fake_timeout); static int __init fail_io_timeout_debugfs(void) { @@ -70,7 +68,7 @@ ssize_t part_timeout_store(struct device *dev, struct device_attribute *attr, #endif /* CONFIG_FAIL_IO_TIMEOUT */ /** - * blk_abort_request -- Request request recovery for the specified command + * blk_abort_request - Request recovery for the specified command * @req: pointer to the request of interest * * This function requests that the block layer start recovery for the @@ -90,11 +88,29 @@ void blk_abort_request(struct request *req) } EXPORT_SYMBOL_GPL(blk_abort_request); +static unsigned long blk_timeout_mask __read_mostly; + +static int __init blk_timeout_init(void) +{ + blk_timeout_mask = roundup_pow_of_two(HZ) - 1; + return 0; +} + +late_initcall(blk_timeout_init); + +/* + * Just a rough estimate, we don't care about specific values for timeouts. + */ +static inline unsigned long blk_round_jiffies(unsigned long j) +{ + return (j + blk_timeout_mask) + 1; +} + unsigned long blk_rq_timeout(unsigned long timeout) { unsigned long maxt; - maxt = round_jiffies_up(jiffies + BLK_MAX_TIMEOUT); + maxt = blk_round_jiffies(jiffies + BLK_MAX_TIMEOUT); if (time_after(timeout, maxt)) timeout = maxt; @@ -131,7 +147,7 @@ void blk_add_timer(struct request *req) * than an existing one, modify the timer. Round up to next nearest * second. 
*/ - expiry = blk_rq_timeout(round_jiffies_up(expiry)); + expiry = blk_rq_timeout(blk_round_jiffies(expiry)); if (!timer_pending(&q->timeout) || time_before(expiry, q->timeout.expires)) { diff --git a/block/blk-wbt.c b/block/blk-wbt.c index 0fa615eefd52..0c0e270a8265 100644 --- a/block/blk-wbt.c +++ b/block/blk-wbt.c @@ -25,12 +25,78 @@ #include <linux/backing-dev.h> #include <linux/swap.h> +#include "blk-stat.h" #include "blk-wbt.h" #include "blk-rq-qos.h" +#include "elevator.h" #define CREATE_TRACE_POINTS #include <trace/events/wbt.h> +enum wbt_flags { + WBT_TRACKED = 1, /* write, tracked for throttling */ + WBT_READ = 2, /* read */ + WBT_KSWAPD = 4, /* write, from kswapd */ + WBT_DISCARD = 8, /* discard */ + + WBT_NR_BITS = 4, /* number of bits */ +}; + +enum { + WBT_RWQ_BG = 0, + WBT_RWQ_KSWAPD, + WBT_RWQ_DISCARD, + WBT_NUM_RWQ, +}; + +/* + * If current state is WBT_STATE_ON/OFF_DEFAULT, it can be covered to any other + * state, if current state is WBT_STATE_ON/OFF_MANUAL, it can only be covered + * to WBT_STATE_OFF/ON_MANUAL. + */ +enum { + WBT_STATE_ON_DEFAULT = 1, /* on by default */ + WBT_STATE_ON_MANUAL = 2, /* on manually by sysfs */ + WBT_STATE_OFF_DEFAULT = 3, /* off by default */ + WBT_STATE_OFF_MANUAL = 4, /* off manually by sysfs */ +}; + +struct rq_wb { + /* + * Settings that govern how we throttle + */ + unsigned int wb_background; /* background writeback */ + unsigned int wb_normal; /* normal writeback */ + + short enable_state; /* WBT_STATE_* */ + + /* + * Number of consecutive periods where we don't have enough + * information to make a firm scale up/down decision. 
+ */ + unsigned int unknown_cnt; + + u64 win_nsec; /* default window size */ + u64 cur_win_nsec; /* current window size */ + + struct blk_stat_callback *cb; + + u64 sync_issue; + void *sync_cookie; + + unsigned long last_issue; /* last non-throttled issue */ + unsigned long last_comp; /* last non-throttled comp */ + unsigned long min_lat_nsec; + struct rq_qos rqos; + struct rq_wait rq_wait[WBT_NUM_RWQ]; + struct rq_depth rq_depth; +}; + +static inline struct rq_wb *RQWB(struct rq_qos *rqos) +{ + return container_of(rqos, struct rq_wb, rqos); +} + static inline void wbt_clear_state(struct request *rq) { rq->wbt_flags = 0; @@ -77,7 +143,8 @@ enum { static inline bool rwb_enabled(struct rq_wb *rwb) { - return rwb && rwb->wb_normal != 0; + return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT && + rwb->enable_state != WBT_STATE_OFF_MANUAL; } static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) @@ -96,9 +163,9 @@ static void wb_timestamp(struct rq_wb *rwb, unsigned long *var) */ static bool wb_recent_wait(struct rq_wb *rwb) { - struct bdi_writeback *wb = &rwb->rqos.q->backing_dev_info->wb; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; - return time_before(jiffies, wb->dirty_sleep + HZ); + return time_before(jiffies, bdi->last_bdp_sleep + HZ); } static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, @@ -132,22 +199,14 @@ static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw, inflight = atomic_dec_return(&rqw->inflight); /* - * wbt got disabled with IO in flight. Wake up any potential - * waiters, we don't have to do more than that. - */ - if (unlikely(!rwb_enabled(rwb))) { - rwb_wake_all(rwb); - return; - } - - /* * For discards, our limit is always the background. For writes, if * the device does write back caching, drop further down before we * wake people up. 
*/ if (wb_acct & WBT_DISCARD) limit = rwb->wb_background; - else if (rwb->wc && !wb_recent_wait(rwb)) + else if (test_bit(QUEUE_FLAG_WC, &rwb->rqos.disk->queue->queue_flags) && + !wb_recent_wait(rwb)) limit = 0; else limit = rwb->wb_normal; @@ -224,6 +283,16 @@ static u64 rwb_sync_issue_lat(struct rq_wb *rwb) return now - issue; } +static inline unsigned int wbt_inflight(struct rq_wb *rwb) +{ + unsigned int i, ret = 0; + + for (i = 0; i < WBT_NUM_RWQ; i++) + ret += atomic_read(&rwb->rq_wait[i].inflight); + + return ret; +} + enum { LAT_OK = 1, LAT_UNKNOWN, @@ -233,7 +302,7 @@ enum { static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; u64 thislat; @@ -286,7 +355,7 @@ static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat) static void rwb_trace_step(struct rq_wb *rwb, const char *msg) { - struct backing_dev_info *bdi = rwb->rqos.q->backing_dev_info; + struct backing_dev_info *bdi = rwb->rqos.disk->bdi; struct rq_depth *rqd = &rwb->rq_depth; trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec, @@ -356,10 +425,12 @@ static void wb_timer_fn(struct blk_stat_callback *cb) unsigned int inflight = wbt_inflight(rwb); int status; + if (!rwb->rqos.disk) + return; + status = latency_exceeded(rwb, cb->stat); - trace_wbt_timer(rwb->rqos.q->backing_dev_info, status, rqd->scale_step, - inflight); + trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight); /* * If we exceeded the latency target, step down. 
If we did not, @@ -418,6 +489,13 @@ static void wbt_update_limits(struct rq_wb *rwb) rwb_wake_all(rwb); } +bool wbt_disabled(struct request_queue *q) +{ + struct rq_qos *rqos = wbt_rq_qos(q); + + return !rqos || !rwb_enabled(RQWB(rqos)); +} + u64 wbt_get_min_lat(struct request_queue *q) { struct rq_qos *rqos = wbt_rq_qos(q); @@ -431,8 +509,13 @@ void wbt_set_min_lat(struct request_queue *q, u64 val) struct rq_qos *rqos = wbt_rq_qos(q); if (!rqos) return; + RQWB(rqos)->min_lat_nsec = val; - RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + if (val) + RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL; + else + RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL; + wbt_update_limits(RQWB(rqos)); } @@ -447,18 +530,11 @@ static bool close_io(struct rq_wb *rwb) #define REQ_HIPRIO (REQ_SYNC | REQ_META | REQ_PRIO) -static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) +static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf) { unsigned int limit; - /* - * If we got disabled, just return UINT_MAX. This ensures that - * we'll properly inc a new IO, and dec+wakeup at the end. - */ - if (!rwb_enabled(rwb)) - return UINT_MAX; - - if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD) + if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD) return rwb->wb_background; /* @@ -469,9 +545,9 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) * the idle limit, or go to normal if we haven't had competing * IO for a bit. */ - if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) + if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd()) limit = rwb->rq_depth.max_depth; - else if ((rw & REQ_BACKGROUND) || close_io(rwb)) { + else if ((opf & REQ_BACKGROUND) || close_io(rwb)) { /* * If less than 100ms since we completed unrelated IO, * limit us to half the depth for background writeback. 
@@ -486,13 +562,13 @@ static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw) struct wbt_wait_data { struct rq_wb *rwb; enum wbt_flags wb_acct; - unsigned long rw; + blk_opf_t opf; }; static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data) { struct wbt_wait_data *data = private_data; - return rq_wait_inc_below(rqw, get_limit(data->rwb, data->rw)); + return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf)); } static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) @@ -506,19 +582,19 @@ static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data) * the timer to kick off queuing again. */ static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct, - unsigned long rw) + blk_opf_t opf) { struct rq_wait *rqw = get_rq_wait(rwb, wb_acct); struct wbt_wait_data data = { .rwb = rwb, .wb_acct = wb_acct, - .rw = rw, + .opf = opf, }; rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb); } -static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) +static inline bool wbt_should_throttle(struct bio *bio) { switch (bio_op(bio)) { case REQ_OP_WRITE: @@ -528,7 +604,7 @@ static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio) if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE)) return false; - /* fallthrough */ + fallthrough; case REQ_OP_DISCARD: return true; default: @@ -545,7 +621,7 @@ static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio) if (bio_op(bio) == REQ_OP_READ) { flags = WBT_READ; - } else if (wbt_should_throttle(rwb, bio)) { + } else if (wbt_should_throttle(bio)) { if (current_is_kswapd()) flags |= WBT_KSWAPD; if (bio_op(bio) == REQ_OP_DISCARD) @@ -563,7 +639,6 @@ static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio) } /* - * Returns true if the IO request should be accounted, false if not. * May sleep, if we have exceeded the writeback limits. Caller can pass * in an irq held spinlock, if it holds one when calling this function. 
* If we do sleep, we'll release and re-grab it. @@ -623,29 +698,33 @@ static void wbt_requeue(struct rq_qos *rqos, struct request *rq) } } -void wbt_set_write_cache(struct request_queue *q, bool write_cache_on) -{ - struct rq_qos *rqos = wbt_rq_qos(q); - if (rqos) - RQWB(rqos)->wc = write_cache_on; -} - /* * Enable wbt if defaults are configured that way */ -void wbt_enable_default(struct request_queue *q) +void wbt_enable_default(struct gendisk *disk) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct request_queue *q = disk->queue; + struct rq_qos *rqos; + bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ); + + if (q->elevator && + test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags)) + enable = false; + /* Throttling already enabled? */ - if (rqos) + rqos = wbt_rq_qos(q); + if (rqos) { + if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT) + RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT; return; + } /* Queue not registered? Maybe shutting down... */ if (!blk_queue_registered(q)) return; - if (queue_is_mq(q) && IS_ENABLED(CONFIG_BLK_WBT_MQ)) - wbt_init(q); + if (queue_is_mq(q) && enable) + wbt_init(disk); } EXPORT_SYMBOL_GPL(wbt_enable_default); @@ -663,7 +742,7 @@ u64 wbt_default_latency_nsec(struct request_queue *q) static int wbt_data_dir(const struct request *rq) { - const int op = req_op(rq); + const enum req_op op = req_op(rq); if (op == REQ_OP_READ) return READ; @@ -676,16 +755,15 @@ static int wbt_data_dir(const struct request *rq) static void wbt_queue_depth_changed(struct rq_qos *rqos) { - RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->q); + RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue); wbt_update_limits(RQWB(rqos)); } static void wbt_exit(struct rq_qos *rqos) { struct rq_wb *rwb = RQWB(rqos); - struct request_queue *q = rqos->q; - blk_stat_remove_callback(q, rwb->cb); + blk_stat_remove_callback(rqos->disk->queue, rwb->cb); blk_stat_free_callback(rwb->cb); kfree(rwb); } @@ -693,16 +771,16 @@ static void 
wbt_exit(struct rq_qos *rqos) /* * Disable wbt, if enabled by default. */ -void wbt_disable_default(struct request_queue *q) +void wbt_disable_default(struct gendisk *disk) { - struct rq_qos *rqos = wbt_rq_qos(q); + struct rq_qos *rqos = wbt_rq_qos(disk->queue); struct rq_wb *rwb; if (!rqos) return; rwb = RQWB(rqos); if (rwb->enable_state == WBT_STATE_ON_DEFAULT) { blk_stat_deactivate(rwb->cb); - rwb->wb_normal = 0; + rwb->enable_state = WBT_STATE_OFF_DEFAULT; } } EXPORT_SYMBOL_GPL(wbt_disable_default); @@ -795,7 +873,7 @@ static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = { }; #endif -static struct rq_qos_ops wbt_rqos_ops = { +static const struct rq_qos_ops wbt_rqos_ops = { .throttle = wbt_wait, .issue = wbt_issue, .track = wbt_track, @@ -809,10 +887,12 @@ static struct rq_qos_ops wbt_rqos_ops = { #endif }; -int wbt_init(struct request_queue *q) +int wbt_init(struct gendisk *disk) { + struct request_queue *q = disk->queue; struct rq_wb *rwb; int i; + int ret; rwb = kzalloc(sizeof(*rwb), GFP_KERNEL); if (!rwb) @@ -827,26 +907,30 @@ int wbt_init(struct request_queue *q) for (i = 0; i < WBT_NUM_RWQ; i++) rq_wait_init(&rwb->rq_wait[i]); - rwb->rqos.id = RQ_QOS_WBT; - rwb->rqos.ops = &wbt_rqos_ops; - rwb->rqos.q = q; rwb->last_comp = rwb->last_issue = jiffies; rwb->win_nsec = RWB_WINDOW_NSEC; rwb->enable_state = WBT_STATE_ON_DEFAULT; - rwb->wc = 1; rwb->rq_depth.default_depth = RWB_DEF_DEPTH; + rwb->min_lat_nsec = wbt_default_latency_nsec(q); + rwb->rq_depth.queue_depth = blk_queue_depth(q); wbt_update_limits(rwb); /* * Assign rwb and add the stats callback. 
*/ - rq_qos_add(q, &rwb->rqos); + mutex_lock(&q->rq_qos_mutex); + ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops); + mutex_unlock(&q->rq_qos_mutex); + if (ret) + goto err_free; + blk_stat_add_callback(q, rwb->cb); - rwb->min_lat_nsec = wbt_default_latency_nsec(q); + return 0; - wbt_queue_depth_changed(&rwb->rqos); - wbt_set_write_cache(q, test_bit(QUEUE_FLAG_WC, &q->queue_flags)); +err_free: + blk_stat_free_callback(rwb->cb); + kfree(rwb); + return ret; - return 0; } diff --git a/block/blk-wbt.h b/block/blk-wbt.h index 16bdc85b8df9..e5fc653b9b76 100644 --- a/block/blk-wbt.h +++ b/block/blk-wbt.h @@ -2,130 +2,25 @@ #ifndef WB_THROTTLE_H #define WB_THROTTLE_H -#include <linux/kernel.h> -#include <linux/atomic.h> -#include <linux/wait.h> -#include <linux/timer.h> -#include <linux/ktime.h> - -#include "blk-stat.h" -#include "blk-rq-qos.h" - -enum wbt_flags { - WBT_TRACKED = 1, /* write, tracked for throttling */ - WBT_READ = 2, /* read */ - WBT_KSWAPD = 4, /* write, from kswapd */ - WBT_DISCARD = 8, /* discard */ - - WBT_NR_BITS = 4, /* number of bits */ -}; - -enum { - WBT_RWQ_BG = 0, - WBT_RWQ_KSWAPD, - WBT_RWQ_DISCARD, - WBT_NUM_RWQ, -}; - -/* - * Enable states. Either off, or on by default (done at init time), - * or on through manual setup in sysfs. - */ -enum { - WBT_STATE_ON_DEFAULT = 1, - WBT_STATE_ON_MANUAL = 2, -}; - -struct rq_wb { - /* - * Settings that govern how we throttle - */ - unsigned int wb_background; /* background writeback */ - unsigned int wb_normal; /* normal writeback */ - - short enable_state; /* WBT_STATE_* */ - - /* - * Number of consecutive periods where we don't have enough - * information to make a firm scale up/down decision. 
- */ - unsigned int unknown_cnt; - - u64 win_nsec; /* default window size */ - u64 cur_win_nsec; /* current window size */ - - struct blk_stat_callback *cb; - - u64 sync_issue; - void *sync_cookie; - - unsigned int wc; - - unsigned long last_issue; /* last non-throttled issue */ - unsigned long last_comp; /* last non-throttled comp */ - unsigned long min_lat_nsec; - struct rq_qos rqos; - struct rq_wait rq_wait[WBT_NUM_RWQ]; - struct rq_depth rq_depth; -}; - -static inline struct rq_wb *RQWB(struct rq_qos *rqos) -{ - return container_of(rqos, struct rq_wb, rqos); -} - -static inline unsigned int wbt_inflight(struct rq_wb *rwb) -{ - unsigned int i, ret = 0; - - for (i = 0; i < WBT_NUM_RWQ; i++) - ret += atomic_read(&rwb->rq_wait[i].inflight); - - return ret; -} - - #ifdef CONFIG_BLK_WBT -int wbt_init(struct request_queue *); -void wbt_disable_default(struct request_queue *); -void wbt_enable_default(struct request_queue *); +int wbt_init(struct gendisk *disk); +void wbt_disable_default(struct gendisk *disk); +void wbt_enable_default(struct gendisk *disk); u64 wbt_get_min_lat(struct request_queue *q); void wbt_set_min_lat(struct request_queue *q, u64 val); - -void wbt_set_write_cache(struct request_queue *, bool); +bool wbt_disabled(struct request_queue *); u64 wbt_default_latency_nsec(struct request_queue *); #else -static inline void wbt_track(struct request *rq, enum wbt_flags flags) -{ -} -static inline int wbt_init(struct request_queue *q) -{ - return -EINVAL; -} -static inline void wbt_disable_default(struct request_queue *q) -{ -} -static inline void wbt_enable_default(struct request_queue *q) -{ -} -static inline void wbt_set_write_cache(struct request_queue *q, bool wc) -{ -} -static inline u64 wbt_get_min_lat(struct request_queue *q) -{ - return 0; -} -static inline void wbt_set_min_lat(struct request_queue *q, u64 val) +static inline void wbt_disable_default(struct gendisk *disk) { } -static inline u64 wbt_default_latency_nsec(struct request_queue *q) 
+static inline void wbt_enable_default(struct gendisk *disk) { - return 0; } #endif /* CONFIG_BLK_WBT */ diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 23831fa8701d..d343e5756a9c 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -52,33 +52,15 @@ const char *blk_zone_cond_str(enum blk_zone_cond zone_cond) } EXPORT_SYMBOL_GPL(blk_zone_cond_str); -static inline sector_t blk_zone_start(struct request_queue *q, - sector_t sector) -{ - sector_t zone_mask = blk_queue_zone_sectors(q) - 1; - - return sector & ~zone_mask; -} - /* * Return true if a request is a write requests that needs zone write locking. */ bool blk_req_needs_zone_write_lock(struct request *rq) { - if (!rq->q->seq_zones_wlock) + if (!rq->q->disk->seq_zones_wlock) return false; - if (blk_rq_is_passthrough(rq)) - return false; - - switch (req_op(rq)) { - case REQ_OP_WRITE_ZEROES: - case REQ_OP_WRITE_SAME: - case REQ_OP_WRITE: - return blk_rq_zone_is_seq(rq); - default: - return false; - } + return blk_rq_is_seq_zoned_write(rq); } EXPORT_SYMBOL_GPL(blk_req_needs_zone_write_lock); @@ -86,7 +68,7 @@ bool blk_req_zone_write_trylock(struct request *rq) { unsigned int zno = blk_rq_zone_no(rq); - if (test_and_set_bit(zno, rq->q->seq_zones_wlock)) + if (test_and_set_bit(zno, rq->q->disk->seq_zones_wlock)) return false; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); @@ -99,7 +81,7 @@ EXPORT_SYMBOL_GPL(blk_req_zone_write_trylock); void __blk_req_zone_write_lock(struct request *rq) { if (WARN_ON_ONCE(test_and_set_bit(blk_rq_zone_no(rq), - rq->q->seq_zones_wlock))) + rq->q->disk->seq_zones_wlock))) return; WARN_ON_ONCE(rq->rq_flags & RQF_ZONE_WRITE_LOCKED); @@ -110,28 +92,29 @@ EXPORT_SYMBOL_GPL(__blk_req_zone_write_lock); void __blk_req_zone_write_unlock(struct request *rq) { rq->rq_flags &= ~RQF_ZONE_WRITE_LOCKED; - if (rq->q->seq_zones_wlock) + if (rq->q->disk->seq_zones_wlock) WARN_ON_ONCE(!test_and_clear_bit(blk_rq_zone_no(rq), - rq->q->seq_zones_wlock)); + rq->q->disk->seq_zones_wlock)); } 
EXPORT_SYMBOL_GPL(__blk_req_zone_write_unlock); /** - * blkdev_nr_zones - Get number of zones - * @disk: Target gendisk + * bdev_nr_zones - Get number of zones + * @bdev: Target device * * Return the total number of zones of a zoned block device. For a block * device without zone capabilities, the number of zones is always 0. */ -unsigned int blkdev_nr_zones(struct gendisk *disk) +unsigned int bdev_nr_zones(struct block_device *bdev) { - sector_t zone_sectors = blk_queue_zone_sectors(disk->queue); + sector_t zone_sectors = bdev_zone_sectors(bdev); - if (!blk_queue_is_zoned(disk->queue)) + if (!bdev_is_zoned(bdev)) return 0; - return (get_capacity(disk) + zone_sectors - 1) >> ilog2(zone_sectors); + return (bdev_nr_sectors(bdev) + zone_sectors - 1) >> + ilog2(zone_sectors); } -EXPORT_SYMBOL_GPL(blkdev_nr_zones); +EXPORT_SYMBOL_GPL(bdev_nr_zones); /** * blkdev_report_zones - Get zones information @@ -158,8 +141,7 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, struct gendisk *disk = bdev->bd_disk; sector_t capacity = get_capacity(disk); - if (!blk_queue_is_zoned(bdev_get_queue(bdev)) || - WARN_ON_ONCE(!disk->fops->report_zones)) + if (!bdev_is_zoned(bdev) || WARN_ON_ONCE(!disk->fops->report_zones)) return -EOPNOTSUPP; if (!nr_zones || sector >= capacity) @@ -169,18 +151,84 @@ int blkdev_report_zones(struct block_device *bdev, sector_t sector, } EXPORT_SYMBOL_GPL(blkdev_report_zones); -static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, - sector_t sector, - sector_t nr_sectors) +static inline unsigned long *blk_alloc_zone_bitmap(int node, + unsigned int nr_zones) { - if (!blk_queue_zone_resetall(bdev_get_queue(bdev))) - return false; + return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), + GFP_NOIO, node); +} +static int blk_zone_need_reset_cb(struct blk_zone *zone, unsigned int idx, + void *data) +{ /* - * REQ_OP_ZONE_RESET_ALL can be executed only if the number of sectors - * of the applicable zone range is 
the entire disk. + * For an all-zones reset, ignore conventional, empty, read-only + * and offline zones. */ - return !sector && nr_sectors == get_capacity(bdev->bd_disk); + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_EMPTY: + case BLK_ZONE_COND_READONLY: + case BLK_ZONE_COND_OFFLINE: + return 0; + default: + set_bit(idx, (unsigned long *)data); + return 0; + } +} + +static int blkdev_zone_reset_all_emulated(struct block_device *bdev, + gfp_t gfp_mask) +{ + struct gendisk *disk = bdev->bd_disk; + sector_t capacity = bdev_nr_sectors(bdev); + sector_t zone_sectors = bdev_zone_sectors(bdev); + unsigned long *need_reset; + struct bio *bio = NULL; + sector_t sector = 0; + int ret; + + need_reset = blk_alloc_zone_bitmap(disk->queue->node, disk->nr_zones); + if (!need_reset) + return -ENOMEM; + + ret = disk->fops->report_zones(disk, 0, disk->nr_zones, + blk_zone_need_reset_cb, need_reset); + if (ret < 0) + goto out_free_need_reset; + + ret = 0; + while (sector < capacity) { + if (!test_bit(disk_zone_no(disk, sector), need_reset)) { + sector += zone_sectors; + continue; + } + + bio = blk_next_bio(bio, bdev, 0, REQ_OP_ZONE_RESET | REQ_SYNC, + gfp_mask); + bio->bi_iter.bi_sector = sector; + sector += zone_sectors; + + /* This may take a while, so be nice to others */ + cond_resched(); + } + + if (bio) { + ret = submit_bio_wait(bio); + bio_put(bio); + } + +out_free_need_reset: + kfree(need_reset); + return ret; +} + +static int blkdev_zone_reset_all(struct block_device *bdev, gfp_t gfp_mask) +{ + struct bio bio; + + bio_init(&bio, bdev, NULL, 0, REQ_OP_ZONE_RESET_ALL | REQ_SYNC); + return submit_bio_wait(&bio); } /** @@ -199,18 +247,17 @@ static inline bool blkdev_allow_reset_all_zones(struct block_device *bdev, * The operation to execute on each zone can be a zone reset, open, close * or finish request. 
*/ -int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, - sector_t sector, sector_t nr_sectors, - gfp_t gfp_mask) +int blkdev_zone_mgmt(struct block_device *bdev, enum req_op op, + sector_t sector, sector_t nr_sectors, gfp_t gfp_mask) { struct request_queue *q = bdev_get_queue(bdev); - sector_t zone_sectors = blk_queue_zone_sectors(q); - sector_t capacity = get_capacity(bdev->bd_disk); + sector_t zone_sectors = bdev_zone_sectors(bdev); + sector_t capacity = bdev_nr_sectors(bdev); sector_t end_sector = sector + nr_sectors; struct bio *bio = NULL; - int ret; + int ret = 0; - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -EOPNOTSUPP; if (bdev_read_only(bdev)) @@ -224,27 +271,26 @@ int blkdev_zone_mgmt(struct block_device *bdev, enum req_opf op, return -EINVAL; /* Check alignment (handle eventual smaller last zone) */ - if (sector & (zone_sectors - 1)) + if (!bdev_is_zone_start(bdev, sector)) return -EINVAL; - if ((nr_sectors & (zone_sectors - 1)) && end_sector != capacity) + if (!bdev_is_zone_start(bdev, nr_sectors) && end_sector != capacity) return -EINVAL; - while (sector < end_sector) { - bio = blk_next_bio(bio, 0, gfp_mask); - bio_set_dev(bio, bdev); - - /* - * Special case for the zone reset operation that reset all - * zones, this is useful for applications like mkfs. - */ - if (op == REQ_OP_ZONE_RESET && - blkdev_allow_reset_all_zones(bdev, sector, nr_sectors)) { - bio->bi_opf = REQ_OP_ZONE_RESET_ALL; - break; - } + /* + * In the case of a zone reset operation over all zones, + * REQ_OP_ZONE_RESET_ALL can be used with devices supporting this + * command. For other devices, we emulate this command behavior by + * identifying the zones needing a reset. 
+ */ + if (op == REQ_OP_ZONE_RESET && sector == 0 && nr_sectors == capacity) { + if (!blk_queue_zone_resetall(q)) + return blkdev_zone_reset_all_emulated(bdev, gfp_mask); + return blkdev_zone_reset_all(bdev, gfp_mask); + } - bio->bi_opf = op | REQ_SYNC; + while (sector < end_sector) { + bio = blk_next_bio(bio, bdev, 0, op | REQ_SYNC, gfp_mask); bio->bi_iter.bi_sector = sector; sector += zone_sectors; @@ -277,28 +323,20 @@ static int blkdev_copy_zone_to_user(struct blk_zone *zone, unsigned int idx, * BLKREPORTZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg) { void __user *argp = (void __user *)arg; struct zone_report_args args; - struct request_queue *q; struct blk_zone_report rep; int ret; if (!argp) return -EINVAL; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&rep, argp, sizeof(struct blk_zone_report))) return -EFAULT; @@ -312,37 +350,47 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, return ret; rep.nr_zones = ret; + rep.flags = BLK_ZONE_REP_CAPACITY; if (copy_to_user(argp, &rep, sizeof(struct blk_zone_report))) return -EFAULT; return 0; } +static int blkdev_truncate_zone_range(struct block_device *bdev, + blk_mode_t mode, const struct blk_zone_range *zrange) +{ + loff_t start, end; + + if (zrange->sector + zrange->nr_sectors <= zrange->sector || + zrange->sector + zrange->nr_sectors > get_capacity(bdev->bd_disk)) + /* Out of range */ + return -EINVAL; + + start = zrange->sector << SECTOR_SHIFT; + end = ((zrange->sector + zrange->nr_sectors) << SECTOR_SHIFT) - 1; + + return truncate_bdev_range(bdev, mode, start, end); +} + /* * BLKRESETZONE, BLKOPENZONE, BLKCLOSEZONE and 
BLKFINISHZONE ioctl processing. * Called from blkdev_ioctl. */ -int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; - struct request_queue *q; struct blk_zone_range zrange; - enum req_opf op; + enum req_op op; + int ret; if (!argp) return -EINVAL; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - - if (!blk_queue_is_zoned(q)) + if (!bdev_is_zoned(bdev)) return -ENOTTY; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(&zrange, argp, sizeof(struct blk_zone_range))) @@ -351,6 +399,12 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, switch (cmd) { case BLKRESETZONE: op = REQ_OP_ZONE_RESET; + + /* Invalidate the page cache, including dirty pages. */ + filemap_invalidate_lock(bdev->bd_inode->i_mapping); + ret = blkdev_truncate_zone_range(bdev, mode, &zrange); + if (ret) + goto fail; break; case BLKOPENZONE: op = REQ_OP_ZONE_OPEN; @@ -365,23 +419,22 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, return -ENOTTY; } - return blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, - GFP_KERNEL); -} + ret = blkdev_zone_mgmt(bdev, op, zrange.sector, zrange.nr_sectors, + GFP_KERNEL); -static inline unsigned long *blk_alloc_zone_bitmap(int node, - unsigned int nr_zones) -{ - return kcalloc_node(BITS_TO_LONGS(nr_zones), sizeof(unsigned long), - GFP_NOIO, node); +fail: + if (cmd == BLKRESETZONE) + filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + + return ret; } -void blk_queue_free_zone_bitmaps(struct request_queue *q) +void disk_free_zone_bitmaps(struct gendisk *disk) { - kfree(q->conv_zones_bitmap); - q->conv_zones_bitmap = NULL; - kfree(q->seq_zones_wlock); - q->seq_zones_wlock = NULL; + kfree(disk->conv_zones_bitmap); + disk->conv_zones_bitmap = NULL; + 
kfree(disk->seq_zones_wlock); + disk->seq_zones_wlock = NULL; } struct blk_revalidate_zone_args { @@ -389,7 +442,6 @@ struct blk_revalidate_zone_args { unsigned long *conv_zones_bitmap; unsigned long *seq_zones_wlock; unsigned int nr_zones; - sector_t zone_sectors; sector_t sector; }; @@ -403,38 +455,34 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, struct gendisk *disk = args->disk; struct request_queue *q = disk->queue; sector_t capacity = get_capacity(disk); + sector_t zone_sectors = q->limits.chunk_sectors; + + /* Check for bad zones and holes in the zone report */ + if (zone->start != args->sector) { + pr_warn("%s: Zone gap at sectors %llu..%llu\n", + disk->disk_name, args->sector, zone->start); + return -ENODEV; + } + + if (zone->start >= capacity || !zone->len) { + pr_warn("%s: Invalid zone start %llu, length %llu\n", + disk->disk_name, zone->start, zone->len); + return -ENODEV; + } /* * All zones must have the same size, with the exception on an eventual * smaller last zone. 
*/ - if (zone->start == 0) { - if (zone->len == 0 || !is_power_of_2(zone->len)) { - pr_warn("%s: Invalid zoned device with non power of two zone size (%llu)\n", - disk->disk_name, zone->len); - return -ENODEV; - } - - args->zone_sectors = zone->len; - args->nr_zones = (capacity + zone->len - 1) >> ilog2(zone->len); - } else if (zone->start + args->zone_sectors < capacity) { - if (zone->len != args->zone_sectors) { + if (zone->start + zone->len < capacity) { + if (zone->len != zone_sectors) { pr_warn("%s: Invalid zoned device with non constant zone size\n", disk->disk_name); return -ENODEV; } - } else { - if (zone->len > args->zone_sectors) { - pr_warn("%s: Invalid zoned device with larger last zone size\n", - disk->disk_name); - return -ENODEV; - } - } - - /* Check for holes in the zone report */ - if (zone->start != args->sector) { - pr_warn("%s: Zone gap at sectors %llu..%llu\n", - disk->disk_name, args->sector, zone->start); + } else if (zone->len > zone_sectors) { + pr_warn("%s: Invalid zoned device with larger last zone size\n", + disk->disk_name); return -ENODEV; } @@ -450,7 +498,6 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, set_bit(idx, args->conv_zones_bitmap); break; case BLK_ZONE_TYPE_SEQWRITE_REQ: - case BLK_ZONE_TYPE_SEQWRITE_PREF: if (!args->seq_zones_wlock) { args->seq_zones_wlock = blk_alloc_zone_bitmap(q->node, args->nr_zones); @@ -458,6 +505,7 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, return -ENOMEM; } break; + case BLK_ZONE_TYPE_SEQWRITE_PREF: default: pr_warn("%s: Invalid zone type 0x%x at sectors %llu\n", disk->disk_name, (int)zone->type, zone->start); @@ -473,11 +521,13 @@ static int blk_revalidate_zone_cb(struct blk_zone *zone, unsigned int idx, * @disk: Target disk * @update_driver_data: Callback to update driver data on the frozen disk * - * Helper function for low-level device drivers to (re) allocate and initialize - * a disk request queue zone bitmaps. 
This functions should normally be called - * within the disk ->revalidate method for blk-mq based drivers. For BIO based - * drivers only q->nr_zones needs to be updated so that the sysfs exposed value - * is correct. + * Helper function for low-level device drivers to check and (re) allocate and + * initialize a disk request queue zone bitmaps. This functions should normally + * be called within the disk ->revalidate method for blk-mq based drivers. + * Before calling this function, the device driver must already have set the + * device zone size (chunk_sector limit) and the max zone append limit. + * For BIO based drivers, this function cannot be used. BIO based device drivers + * only need to set disk->nr_zones so that the sysfs exposed value is correct. * If the @update_driver_data callback function is not NULL, the callback is * executed with the device request queue frozen after all zones have been * checked. @@ -486,9 +536,9 @@ int blk_revalidate_disk_zones(struct gendisk *disk, void (*update_driver_data)(struct gendisk *disk)) { struct request_queue *q = disk->queue; - struct blk_revalidate_zone_args args = { - .disk = disk, - }; + sector_t zone_sectors = q->limits.chunk_sectors; + sector_t capacity = get_capacity(disk); + struct blk_revalidate_zone_args args = { }; unsigned int noio_flag; int ret; @@ -497,32 +547,66 @@ int blk_revalidate_disk_zones(struct gendisk *disk, if (WARN_ON_ONCE(!queue_is_mq(q))) return -EIO; + if (!capacity) + return -ENODEV; + + /* + * Checks that the device driver indicated a valid zone size and that + * the max zone append limit is set. 
+ */ + if (!zone_sectors || !is_power_of_2(zone_sectors)) { + pr_warn("%s: Invalid non power of two zone size (%llu)\n", + disk->disk_name, zone_sectors); + return -ENODEV; + } + + if (!q->limits.max_zone_append_sectors) { + pr_warn("%s: Invalid 0 maximum zone append limit\n", + disk->disk_name); + return -ENODEV; + } + /* * Ensure that all memory allocations in this context are done as if * GFP_NOIO was specified. */ + args.disk = disk; + args.nr_zones = (capacity + zone_sectors - 1) >> ilog2(zone_sectors); noio_flag = memalloc_noio_save(); ret = disk->fops->report_zones(disk, 0, UINT_MAX, blk_revalidate_zone_cb, &args); + if (!ret) { + pr_warn("%s: No zones reported\n", disk->disk_name); + ret = -ENODEV; + } memalloc_noio_restore(noio_flag); /* + * If zones where reported, make sure that the entire disk capacity + * has been checked. + */ + if (ret > 0 && args.sector != capacity) { + pr_warn("%s: Missing zones from sector %llu\n", + disk->disk_name, args.sector); + ret = -ENODEV; + } + + /* * Install the new bitmaps and update nr_zones only once the queue is * stopped and all I/Os are completed (i.e. a scheduler is not * referencing the bitmaps). 
*/ blk_mq_freeze_queue(q); - if (ret >= 0) { - blk_queue_chunk_sectors(q, args.zone_sectors); - q->nr_zones = args.nr_zones; - swap(q->seq_zones_wlock, args.seq_zones_wlock); - swap(q->conv_zones_bitmap, args.conv_zones_bitmap); + if (ret > 0) { + disk->nr_zones = args.nr_zones; + swap(disk->seq_zones_wlock, args.seq_zones_wlock); + swap(disk->conv_zones_bitmap, args.conv_zones_bitmap); if (update_driver_data) update_driver_data(disk); ret = 0; } else { pr_warn("%s: failed to revalidate zones\n", disk->disk_name); - blk_queue_free_zone_bitmaps(q); + disk_free_zone_bitmaps(disk); } blk_mq_unfreeze_queue(q); diff --git a/block/blk.h b/block/blk.h index b5d1f0fc6547..1ef920f72e0f 100644 --- a/block/blk.h +++ b/block/blk.h @@ -2,66 +2,82 @@ #ifndef BLK_INTERNAL_H #define BLK_INTERNAL_H -#include <linux/idr.h> -#include <linux/blk-mq.h> -#include <linux/part_stat.h> #include <linux/blk-crypto.h> +#include <linux/memblock.h> /* for max_pfn/max_low_pfn */ #include <xen/xen.h> #include "blk-crypto-internal.h" -#include "blk-mq.h" -#include "blk-mq-sched.h" + +struct elevator_type; /* Max future timer expiry for timeouts */ #define BLK_MAX_TIMEOUT (5 * HZ) -#ifdef CONFIG_DEBUG_FS extern struct dentry *blk_debugfs_root; -#endif struct blk_flush_queue { + spinlock_t mq_flush_lock; unsigned int flush_pending_idx:1; unsigned int flush_running_idx:1; blk_status_t rq_status; unsigned long flush_pending_since; struct list_head flush_queue[2]; - struct list_head flush_data_in_flight; + unsigned long flush_data_in_flight; struct request *flush_rq; +}; + +bool is_flush_rq(struct request *req); + +struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, + gfp_t flags); +void blk_free_flush_queue(struct blk_flush_queue *q); + +void blk_freeze_queue(struct request_queue *q); +void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic); +void blk_queue_start_drain(struct request_queue *q); +int __bio_queue_enter(struct request_queue *q, struct bio *bio); +void 
submit_bio_noacct_nocheck(struct bio *bio); + +static inline bool blk_try_enter_queue(struct request_queue *q, bool pm) +{ + rcu_read_lock(); + if (!percpu_ref_tryget_live_rcu(&q->q_usage_counter)) + goto fail; /* - * flush_rq shares tag with this rq, both can't be active - * at the same time + * The code that increments the pm_only counter must ensure that the + * counter is globally visible before the queue is unfrozen. */ - struct request *orig_rq; - struct lock_class_key key; - spinlock_t mq_flush_lock; -}; + if (blk_queue_pm_only(q) && + (!pm || queue_rpm_status(q) == RPM_SUSPENDED)) + goto fail_put; -extern struct kmem_cache *blk_requestq_cachep; -extern struct kobj_type blk_queue_ktype; -extern struct ida blk_queue_ida; + rcu_read_unlock(); + return true; -static inline struct blk_flush_queue * -blk_get_flush_queue(struct request_queue *q, struct blk_mq_ctx *ctx) -{ - return blk_mq_map_queue(q, REQ_OP_FLUSH, ctx)->fq; +fail_put: + blk_queue_exit(q); +fail: + rcu_read_unlock(); + return false; } -static inline void __blk_get_queue(struct request_queue *q) +static inline int bio_queue_enter(struct bio *bio) { - kobject_get(&q->kobj); -} + struct request_queue *q = bdev_get_queue(bio->bi_bdev); -static inline bool -is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) -{ - return hctx->fq->flush_rq == req; + if (blk_try_enter_queue(q, false)) + return 0; + return __bio_queue_enter(q, bio); } -struct blk_flush_queue *blk_alloc_flush_queue(int node, int cmd_size, - gfp_t flags); -void blk_free_flush_queue(struct blk_flush_queue *q); +#define BIO_INLINE_VECS 4 +struct bio_vec *bvec_alloc(mempool_t *pool, unsigned short *nr_vecs, + gfp_t gfp_mask); +void bvec_free(mempool_t *pool, struct bio_vec *bv, unsigned short nr_vecs); -void blk_freeze_queue(struct request_queue *q); +bool bvec_try_merge_hw_page(struct request_queue *q, struct bio_vec *bv, + struct page *page, unsigned len, unsigned offset, + bool *same_page); static inline bool 
biovec_phys_mergeable(struct request_queue *q, struct bio_vec *vec1, struct bio_vec *vec2) @@ -70,6 +86,13 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, phys_addr_t addr1 = page_to_phys(vec1->bv_page) + vec1->bv_offset; phys_addr_t addr2 = page_to_phys(vec2->bv_page) + vec2->bv_offset; + /* + * Merging adjacent physical pages may not work correctly under KMSAN + * if their metadata pages aren't adjacent. Just disable merging. + */ + if (IS_ENABLED(CONFIG_KMSAN)) + return false; + if (addr1 + vec1->bv_len != addr2) return false; if (xen_domain() && !xen_biovec_phys_mergeable(vec1, vec2->bv_page)) @@ -79,35 +102,81 @@ static inline bool biovec_phys_mergeable(struct request_queue *q, return true; } -static inline bool __bvec_gap_to_prev(struct request_queue *q, +static inline bool __bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { - return (offset & queue_virt_boundary(q)) || - ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q)); + return (offset & lim->virt_boundary_mask) || + ((bprv->bv_offset + bprv->bv_len) & lim->virt_boundary_mask); } /* * Check if adding a bio_vec after bprv with offset would create a gap in * the SG list. Most drivers don't care about this, but some do. 
*/ -static inline bool bvec_gap_to_prev(struct request_queue *q, +static inline bool bvec_gap_to_prev(const struct queue_limits *lim, struct bio_vec *bprv, unsigned int offset) { - if (!queue_virt_boundary(q)) + if (!lim->virt_boundary_mask) + return false; + return __bvec_gap_to_prev(lim, bprv, offset); +} + +static inline bool rq_mergeable(struct request *rq) +{ + if (blk_rq_is_passthrough(rq)) + return false; + + if (req_op(rq) == REQ_OP_FLUSH) + return false; + + if (req_op(rq) == REQ_OP_WRITE_ZEROES) + return false; + + if (req_op(rq) == REQ_OP_ZONE_APPEND) + return false; + + if (rq->cmd_flags & REQ_NOMERGE_FLAGS) return false; - return __bvec_gap_to_prev(q, bprv, offset); + if (rq->rq_flags & RQF_NOMERGE_FLAGS) + return false; + + return true; +} + +/* + * There are two different ways to handle DISCARD merges: + * 1) If max_discard_segments > 1, the driver treats every bio as a range and + * send the bios to controller together. The ranges don't need to be + * contiguous. + * 2) Otherwise, the request will be normal read/write requests. The ranges + * need to be contiguous. 
+ */ +static inline bool blk_discard_mergable(struct request *req) +{ + if (req_op(req) == REQ_OP_DISCARD && + queue_max_discard_segments(req->q) > 1) + return true; + return false; } -static inline void blk_rq_bio_prep(struct request *rq, struct bio *bio, - unsigned int nr_segs) +static inline unsigned int blk_rq_get_max_segments(struct request *rq) { - rq->nr_phys_segments = nr_segs; - rq->__data_len = bio->bi_iter.bi_size; - rq->bio = rq->biotail = bio; - rq->ioprio = bio_prio(bio); + if (req_op(rq) == REQ_OP_DISCARD) + return queue_max_discard_segments(rq->q); + return queue_max_segments(rq->q); +} + +static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, + enum req_op op) +{ + if (unlikely(op == REQ_OP_DISCARD || op == REQ_OP_SECURE_ERASE)) + return min(q->limits.max_discard_sectors, + UINT_MAX >> SECTOR_SHIFT); + + if (unlikely(op == REQ_OP_WRITE_ZEROES)) + return q->limits.max_write_zeroes_sectors; - if (bio->bi_disk) - rq->rq_disk = bio->bi_disk; + return q->limits.max_sectors; } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -121,13 +190,19 @@ static inline bool bio_integrity_endio(struct bio *bio) return true; } +bool blk_integrity_merge_rq(struct request_queue *, struct request *, + struct request *); +bool blk_integrity_merge_bio(struct request_queue *, struct request *, + struct bio *); + static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { struct bio_integrity_payload *bip = bio_integrity(req->bio); struct bio_integrity_payload *bip_next = bio_integrity(next); - return bvec_gap_to_prev(req->q, &bip->bip_vec[bip->bip_vcnt - 1], + return bvec_gap_to_prev(&req->q->limits, + &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } @@ -137,13 +212,23 @@ static inline bool integrity_req_gap_front_merge(struct request *req, struct bio_integrity_payload *bip = bio_integrity(bio); struct bio_integrity_payload *bip_next = bio_integrity(req->bio); - return bvec_gap_to_prev(req->q, 
&bip->bip_vec[bip->bip_vcnt - 1], + return bvec_gap_to_prev(&req->q->limits, + &bip->bip_vec[bip->bip_vcnt - 1], bip_next->bip_vec[0].bv_offset); } -void blk_integrity_add(struct gendisk *); -void blk_integrity_del(struct gendisk *); +extern const struct attribute_group blk_integrity_attr_group; #else /* CONFIG_BLK_DEV_INTEGRITY */ +static inline bool blk_integrity_merge_rq(struct request_queue *rq, + struct request *r1, struct request *r2) +{ + return true; +} +static inline bool blk_integrity_merge_bio(struct request_queue *rq, + struct request *r, struct bio *b) +{ + return true; +} static inline bool integrity_req_gap_back_merge(struct request *req, struct bio *next) { @@ -165,54 +250,35 @@ static inline bool bio_integrity_endio(struct bio *bio) static inline void bio_integrity_free(struct bio *bio) { } -static inline void blk_integrity_add(struct gendisk *disk) -{ -} -static inline void blk_integrity_del(struct gendisk *disk) -{ -} #endif /* CONFIG_BLK_DEV_INTEGRITY */ unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); -bool bio_attempt_front_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_back_merge(struct request *req, struct bio *bio, - unsigned int nr_segs); -bool bio_attempt_discard_merge(struct request_queue *q, struct request *req, - struct bio *bio); bool blk_attempt_plug_merge(struct request_queue *q, struct bio *bio, - unsigned int nr_segs, struct request **same_queue_rq); + unsigned int nr_segs); +bool blk_bio_list_merge(struct request_queue *q, struct list_head *list, + struct bio *bio, unsigned int nr_segs); -void blk_account_io_start(struct request *req); -void blk_account_io_done(struct request *req, u64 now); +/* + * Plug flush limits + */ +#define BLK_MAX_REQUEST_COUNT 32 +#define BLK_PLUG_FLUSH_SIZE (128 * 1024) /* * Internal elevator interface */ #define ELV_ON_HASH(rq) ((rq)->rq_flags & RQF_HASHED) -void blk_insert_flush(struct request *rq); +bool 
blk_insert_flush(struct request *rq); -void elevator_init_mq(struct request_queue *q); -int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e); -void __elevator_exit(struct request_queue *, struct elevator_queue *); +int elevator_switch(struct request_queue *q, struct elevator_type *new_e); +void elevator_disable(struct request_queue *q); +void elevator_exit(struct request_queue *q); int elv_register_queue(struct request_queue *q, bool uevent); void elv_unregister_queue(struct request_queue *q); -static inline void elevator_exit(struct request_queue *q, - struct elevator_queue *e) -{ - lockdep_assert_held(&q->sysfs_lock); - - blk_mq_sched_free_requests(q); - __elevator_exit(q, e); -} - -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno); - ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, @@ -223,34 +289,47 @@ ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf); ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count); - -#ifdef CONFIG_FAIL_IO_TIMEOUT -int blk_should_fake_timeout(struct request_queue *); ssize_t part_timeout_show(struct device *, struct device_attribute *, char *); ssize_t part_timeout_store(struct device *, struct device_attribute *, const char *, size_t); -#else -static inline int blk_should_fake_timeout(struct request_queue *q) + +static inline bool bio_may_exceed_limits(struct bio *bio, + const struct queue_limits *lim) { - return 0; + switch (bio_op(bio)) { + case REQ_OP_DISCARD: + case REQ_OP_SECURE_ERASE: + case REQ_OP_WRITE_ZEROES: + return true; /* non-trivial splitting decisions */ + default: + break; + } + + /* + * All drivers must accept single-segments bios that are <= PAGE_SIZE. + * This is a quick and dirty check that relies on the fact that + * bi_io_vec[0] is always valid if a bio has data. 
The check might + * lead to occasional false negatives when bios are cloned, but compared + * to the performance impact of cloned bios themselves the loop below + * doesn't matter anyway. + */ + return lim->chunk_sectors || bio->bi_vcnt != 1 || + bio->bi_io_vec->bv_len + bio->bi_io_vec->bv_offset > PAGE_SIZE; } -#endif -void __blk_queue_split(struct request_queue *q, struct bio **bio, - unsigned int *nr_segs); +struct bio *__bio_split_to_limits(struct bio *bio, + const struct queue_limits *lim, + unsigned int *nr_segs); int ll_back_merge_fn(struct request *req, struct bio *bio, unsigned int nr_segs); -int ll_front_merge_fn(struct request *req, struct bio *bio, - unsigned int nr_segs); -struct request *attempt_back_merge(struct request_queue *q, struct request *rq); -struct request *attempt_front_merge(struct request_queue *q, struct request *rq); -int blk_attempt_req_merge(struct request_queue *q, struct request *rq, +bool blk_attempt_req_merge(struct request_queue *q, struct request *rq, struct request *next); unsigned int blk_recalc_rq_segments(struct request *rq); void blk_rq_set_mixed_merge(struct request *rq); bool blk_rq_merge_ok(struct request *rq, struct bio *bio); enum elv_merge blk_try_merge(struct request *rq, struct bio *bio); +void blk_set_default_limits(struct queue_limits *lim); int blk_dev_init(void); /* @@ -261,9 +340,11 @@ int blk_dev_init(void); */ static inline bool blk_do_io_stat(struct request *rq) { - return rq->rq_disk && (rq->rq_flags & RQF_IO_STAT); + return (rq->rq_flags & RQF_IO_STAT) && !blk_rq_is_passthrough(rq); } +void update_io_ticks(struct block_device *part, unsigned long now, bool end); + static inline void req_set_nomerge(struct request_queue *q, struct request *req) { req->cmd_flags |= REQ_NOMERGE; @@ -272,38 +353,18 @@ static inline void req_set_nomerge(struct request_queue *q, struct request *req) } /* - * The max size one bio can handle is UINT_MAX becasue bvec_iter.bi_size - * is defined as 'unsigned int', meantime it has 
to aligned to with logical - * block size which is the minimum accepted unit by hardware. - */ -static inline unsigned int bio_allowed_max_sectors(struct request_queue *q) -{ - return round_down(UINT_MAX, queue_logical_block_size(q)) >> 9; -} - -/* * Internal io_context interface */ -void get_io_context(struct io_context *ioc); -struct io_cq *ioc_lookup_icq(struct io_context *ioc, struct request_queue *q); -struct io_cq *ioc_create_icq(struct io_context *ioc, struct request_queue *q, - gfp_t gfp_mask); +struct io_cq *ioc_find_get_icq(struct request_queue *q); +struct io_cq *ioc_lookup_icq(struct request_queue *q); +#ifdef CONFIG_BLK_ICQ void ioc_clear_queue(struct request_queue *q); +#else +static inline void ioc_clear_queue(struct request_queue *q) +{ +} +#endif /* CONFIG_BLK_ICQ */ -int create_task_io_context(struct task_struct *task, gfp_t gfp_mask, int node); - -/* - * Internal throttling interface - */ -#ifdef CONFIG_BLK_DEV_THROTTLING -extern int blk_throtl_init(struct request_queue *q); -extern void blk_throtl_exit(struct request_queue *q); -extern void blk_throtl_register_queue(struct request_queue *q); -#else /* CONFIG_BLK_DEV_THROTTLING */ -static inline int blk_throtl_init(struct request_queue *q) { return 0; } -static inline void blk_throtl_exit(struct request_queue *q) { } -static inline void blk_throtl_register_queue(struct request_queue *q) { } -#endif /* CONFIG_BLK_DEV_THROTTLING */ #ifdef CONFIG_BLK_DEV_THROTTLING_LOW extern ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page); extern ssize_t blk_throtl_sample_time_store(struct request_queue *q, @@ -315,129 +376,144 @@ static inline void blk_throtl_bio_endio(struct bio *bio) { } static inline void blk_throtl_stat_add(struct request *rq, u64 time) { } #endif -#ifdef CONFIG_BOUNCE -extern int init_emergency_isa_pool(void); -extern void blk_queue_bounce(struct request_queue *q, struct bio **bio); -#else -static inline int init_emergency_isa_pool(void) +struct bio 
*__blk_queue_bounce(struct bio *bio, struct request_queue *q); + +static inline bool blk_queue_may_bounce(struct request_queue *q) { - return 0; + return IS_ENABLED(CONFIG_BOUNCE) && + q->limits.bounce == BLK_BOUNCE_HIGH && + max_low_pfn >= max_pfn; } -static inline void blk_queue_bounce(struct request_queue *q, struct bio **bio) + +static inline struct bio *blk_queue_bounce(struct bio *bio, + struct request_queue *q) { + if (unlikely(blk_queue_may_bounce(q) && bio_has_data(bio))) + return __blk_queue_bounce(bio, q); + return bio; } -#endif /* CONFIG_BOUNCE */ - -#ifdef CONFIG_BLK_CGROUP_IOLATENCY -extern int blk_iolatency_init(struct request_queue *q); -#else -static inline int blk_iolatency_init(struct request_queue *q) { return 0; } -#endif - -struct bio *blk_next_bio(struct bio *bio, unsigned int nr_pages, gfp_t gfp); #ifdef CONFIG_BLK_DEV_ZONED -void blk_queue_free_zone_bitmaps(struct request_queue *q); -#else -static inline void blk_queue_free_zone_bitmaps(struct request_queue *q) {} -#endif +void disk_free_zone_bitmaps(struct gendisk *disk); +int blkdev_report_zones_ioctl(struct block_device *bdev, unsigned int cmd, + unsigned long arg); +int blkdev_zone_mgmt_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long arg); +#else /* CONFIG_BLK_DEV_ZONED */ +static inline void disk_free_zone_bitmaps(struct gendisk *disk) {} +static inline int blkdev_report_zones_ioctl(struct block_device *bdev, + unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +static inline int blkdev_zone_mgmt_ioctl(struct block_device *bdev, + blk_mode_t mode, unsigned int cmd, unsigned long arg) +{ + return -ENOTTY; +} +#endif /* CONFIG_BLK_DEV_ZONED */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector); +struct block_device *bdev_alloc(struct gendisk *disk, u8 partno); +void bdev_add(struct block_device *bdev, dev_t dev); -int blk_alloc_devt(struct hd_struct *part, dev_t *devt); -void blk_free_devt(dev_t devt); -void 
blk_invalidate_devt(dev_t devt); -char *disk_name(struct gendisk *hd, int partno, char *buf); +int blk_alloc_ext_minor(void); +void blk_free_ext_minor(unsigned int minor); #define ADDPART_FLAG_NONE 0 #define ADDPART_FLAG_RAID 1 #define ADDPART_FLAG_WHOLEDISK 2 -void delete_partition(struct gendisk *disk, struct hd_struct *part); -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); -int bdev_del_partition(struct block_device *bdev, int partno); -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length); -int disk_expand_part_tbl(struct gendisk *disk, int target); -int hd_ref_init(struct hd_struct *part); +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); +int bdev_del_partition(struct gendisk *disk, int partno); +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length); +void drop_partition(struct block_device *part); -/* no need to get/put refcount of part0 */ -static inline int hd_struct_try_get(struct hd_struct *part) -{ - if (part->partno) - return percpu_ref_tryget_live(&part->ref); - return 1; -} +void bdev_set_nr_sectors(struct block_device *bdev, sector_t sectors); + +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass); + +int bio_add_hw_page(struct request_queue *q, struct bio *bio, + struct page *page, unsigned int len, unsigned int offset, + unsigned int max_sectors, bool *same_page); -static inline void hd_struct_put(struct hd_struct *part) +/* + * Clean up a page appropriately, where the page may be pinned, may have a + * ref taken on it or neither. 
+ */ +static inline void bio_release_page(struct bio *bio, struct page *page) { - if (part->partno) - percpu_ref_put(&part->ref); + if (bio_flagged(bio, BIO_PAGE_PINNED)) + unpin_user_page(page); } -static inline void hd_free_part(struct hd_struct *part) +struct request_queue *blk_alloc_queue(int node_id); + +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode); + +int disk_alloc_events(struct gendisk *disk); +void disk_add_events(struct gendisk *disk); +void disk_del_events(struct gendisk *disk); +void disk_release_events(struct gendisk *disk); +void disk_block_events(struct gendisk *disk); +void disk_unblock_events(struct gendisk *disk); +void disk_flush_events(struct gendisk *disk, unsigned int mask); +extern struct device_attribute dev_attr_events; +extern struct device_attribute dev_attr_events_async; +extern struct device_attribute dev_attr_events_poll_msecs; + +extern struct attribute_group blk_trace_attr_group; + +blk_mode_t file_to_blk_mode(struct file *file); +int truncate_bdev_range(struct block_device *bdev, blk_mode_t mode, + loff_t lstart, loff_t lend); +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); +long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg); + +extern const struct address_space_operations def_blk_aops; + +int disk_register_independent_access_ranges(struct gendisk *disk); +void disk_unregister_independent_access_ranges(struct gendisk *disk); + +#ifdef CONFIG_FAIL_MAKE_REQUEST +bool should_fail_request(struct block_device *part, unsigned int bytes); +#else /* CONFIG_FAIL_MAKE_REQUEST */ +static inline bool should_fail_request(struct block_device *part, + unsigned int bytes) { - free_percpu(part->dkstats); - kfree(part->info); - percpu_ref_exit(&part->ref); + return false; } +#endif /* CONFIG_FAIL_MAKE_REQUEST */ /* - * Any access of part->nr_sects which is not protected by partition - * bd_mutex or gendisk bdev bd_mutex, should be done using this - * accessor function. 
+ * Optimized request reference counting. Ideally we'd make timeouts be more + * clever, as that's the only reason we need references at all... But until + * this happens, this is faster than using refcount_t. Also see: * - * Code written along the lines of i_size_read() and i_size_write(). - * CONFIG_PREEMPTION case optimizes the case of UP kernel with preemption - * on. + * abc54d634334 ("io_uring: switch to atomic_t for io_kiocb reference count") */ -static inline sector_t part_nr_sects_read(struct hd_struct *part) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - sector_t nr_sects; - unsigned seq; - do { - seq = read_seqcount_begin(&part->nr_sects_seq); - nr_sects = part->nr_sects; - } while (read_seqcount_retry(&part->nr_sects_seq, seq)); - return nr_sects; -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - sector_t nr_sects; - - preempt_disable(); - nr_sects = part->nr_sects; - preempt_enable(); - return nr_sects; -#else - return part->nr_sects; -#endif +#define req_ref_zero_or_close_to_overflow(req) \ + ((unsigned int) atomic_read(&(req->ref)) + 127u <= 127u) + +static inline bool req_ref_inc_not_zero(struct request *req) +{ + return atomic_inc_not_zero(&req->ref); } -/* - * Should be called with mutex lock held (typically bd_mutex) of partition - * to provide mutual exlusion among writers otherwise seqcount might be - * left in wrong state leaving the readers spinning infinitely. 
- */ -static inline void part_nr_sects_write(struct hd_struct *part, sector_t size) -{ -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - preempt_disable(); - write_seqcount_begin(&part->nr_sects_seq); - part->nr_sects = size; - write_seqcount_end(&part->nr_sects_seq); - preempt_enable(); -#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPTION) - preempt_disable(); - part->nr_sects = size; - preempt_enable(); -#else - part->nr_sects = size; -#endif +static inline bool req_ref_put_and_test(struct request *req) +{ + WARN_ON_ONCE(req_ref_zero_or_close_to_overflow(req)); + return atomic_dec_and_test(&req->ref); } -struct request_queue *__blk_alloc_queue(int node_id); +static inline void req_ref_set(struct request *req, int value) +{ + atomic_set(&req->ref, value); +} -int bio_add_hw_page(struct request_queue *q, struct bio *bio, - struct page *page, unsigned int len, unsigned int offset, - unsigned int max_sectors, bool *same_page); +static inline int req_ref_read(struct request *req) +{ + return atomic_read(&req->ref); +} #endif /* BLK_INTERNAL_H */ diff --git a/block/bounce.c b/block/bounce.c index c3aaed070124..7cfcb242f9a1 100644 --- a/block/bounce.c +++ b/block/bounce.c @@ -18,18 +18,18 @@ #include <linux/init.h> #include <linux/hash.h> #include <linux/highmem.h> -#include <linux/memblock.h> #include <linux/printk.h> #include <asm/tlbflush.h> #include <trace/events/block.h> #include "blk.h" +#include "blk-cgroup.h" #define POOL_SIZE 64 #define ISA_POOL_SIZE 16 static struct bio_set bounce_bio_set, bounce_bio_split; -static mempool_t page_pool, isa_page_pool; +static mempool_t page_pool; static void init_bounce_bioset(void) { @@ -49,11 +49,11 @@ static void init_bounce_bioset(void) bounce_bs_setup = true; } -#if defined(CONFIG_HIGHMEM) static __init int init_emergency_pool(void) { int ret; -#if defined(CONFIG_HIGHMEM) && !defined(CONFIG_MEMORY_HOTPLUG) + +#ifndef CONFIG_MEMORY_HOTPLUG if (max_pfn <= max_low_pfn) return 0; #endif @@ -67,62 +67,6 @@ static __init int 
init_emergency_pool(void) } __initcall(init_emergency_pool); -#endif - -#ifdef CONFIG_HIGHMEM -/* - * highmem version, map in to vec - */ -static void bounce_copy_vec(struct bio_vec *to, unsigned char *vfrom) -{ - unsigned char *vto; - - vto = kmap_atomic(to->bv_page); - memcpy(vto + to->bv_offset, vfrom, to->bv_len); - kunmap_atomic(vto); -} - -#else /* CONFIG_HIGHMEM */ - -#define bounce_copy_vec(to, vfrom) \ - memcpy(page_address((to)->bv_page) + (to)->bv_offset, vfrom, (to)->bv_len) - -#endif /* CONFIG_HIGHMEM */ - -/* - * allocate pages in the DMA region for the ISA pool - */ -static void *mempool_alloc_pages_isa(gfp_t gfp_mask, void *data) -{ - return mempool_alloc_pages(gfp_mask | GFP_DMA, data); -} - -static DEFINE_MUTEX(isa_mutex); - -/* - * gets called "every" time someone init's a queue with BLK_BOUNCE_ISA - * as the max address, so check if the pool has already been created. - */ -int init_emergency_isa_pool(void) -{ - int ret; - - mutex_lock(&isa_mutex); - - if (mempool_initialized(&isa_page_pool)) { - mutex_unlock(&isa_mutex); - return 0; - } - - ret = mempool_init(&isa_page_pool, ISA_POOL_SIZE, mempool_alloc_pages_isa, - mempool_free_pages, (void *) 0); - BUG_ON(ret); - - pr_info("isa pool size: %d pages\n", ISA_POOL_SIZE); - init_bounce_bioset(); - mutex_unlock(&isa_mutex); - return 0; -} /* * Simple bounce buffer support for highmem pages. 
Depending on the @@ -131,7 +75,6 @@ int init_emergency_isa_pool(void) */ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) { - unsigned char *vfrom; struct bio_vec tovec, fromvec; struct bvec_iter iter; /* @@ -149,17 +92,14 @@ static void copy_to_high_bio_irq(struct bio *to, struct bio *from) * been modified by the block layer, so use the original * copy, bounce_copy_vec already uses tovec->bv_len */ - vfrom = page_address(fromvec.bv_page) + - tovec.bv_offset; - - bounce_copy_vec(&tovec, vfrom); - flush_dcache_page(tovec.bv_page); + memcpy_to_bvec(&tovec, page_address(fromvec.bv_page) + + tovec.bv_offset); } bio_advance_iter(from, &from_iter, tovec.bv_len); } } -static void bounce_end_io(struct bio *bio, mempool_t *pool) +static void bounce_end_io(struct bio *bio) { struct bio *bio_orig = bio->bi_private; struct bio_vec *bvec, orig_vec; @@ -173,7 +113,7 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) orig_vec = bio_iter_iovec(bio_orig, orig_iter); if (bvec->bv_page != orig_vec.bv_page) { dec_zone_page_state(bvec->bv_page, NR_BOUNCE); - mempool_free(bvec->bv_page, pool); + mempool_free(bvec->bv_page, &page_pool); } bio_advance_iter(bio_orig, &orig_iter, orig_vec.bv_len); } @@ -185,37 +125,20 @@ static void bounce_end_io(struct bio *bio, mempool_t *pool) static void bounce_end_io_write(struct bio *bio) { - bounce_end_io(bio, &page_pool); -} - -static void bounce_end_io_write_isa(struct bio *bio) -{ - - bounce_end_io(bio, &isa_page_pool); + bounce_end_io(bio); } -static void __bounce_end_io_read(struct bio *bio, mempool_t *pool) +static void bounce_end_io_read(struct bio *bio) { struct bio *bio_orig = bio->bi_private; if (!bio->bi_status) copy_to_high_bio_irq(bio_orig, bio); - bounce_end_io(bio, pool); -} - -static void bounce_end_io_read(struct bio *bio) -{ - __bounce_end_io_read(bio, &page_pool); + bounce_end_io(bio); } -static void bounce_end_io_read_isa(struct bio *bio) -{ - __bounce_end_io_read(bio, &isa_page_pool); -} - -static 
struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) +static struct bio *bounce_clone_bio(struct bio *bio_src) { struct bvec_iter iter; struct bio_vec bv; @@ -230,26 +153,22 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, * - The point of cloning the biovec is to produce a bio with a biovec * the caller can modify: bi_idx and bi_bvec_done should be 0. * - * - The original bio could've had more than BIO_MAX_PAGES biovecs; if + * - The original bio could've had more than BIO_MAX_VECS biovecs; if * we tried to clone the whole thing bio_alloc_bioset() would fail. * But the clone should succeed as long as the number of biovecs we - * actually need to allocate is fewer than BIO_MAX_PAGES. + * actually need to allocate is fewer than BIO_MAX_VECS. * * - Lastly, bi_vcnt should not be looked at or relied upon by code * that does not own the bio - reason being drivers don't use it for * iterating over the biovec anymore, so expecting it to be kept up * to date (i.e. for clones that share the parent biovec) is just - * asking for trouble and would force extra work on - * __bio_clone_fast() anyways. + * asking for trouble and would force extra work. 
*/ - - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); - if (!bio) - return NULL; - bio->bi_disk = bio_src->bi_disk; - bio->bi_opf = bio_src->bi_opf; + bio = bio_alloc_bioset(bio_src->bi_bdev, bio_segments(bio_src), + bio_src->bi_opf, GFP_NOIO, &bounce_bio_set); + if (bio_flagged(bio_src, BIO_REMAPPED)) + bio_set_flag(bio, BIO_REMAPPED); bio->bi_ioprio = bio_src->bi_ioprio; - bio->bi_write_hint = bio_src->bi_write_hint; bio->bi_iter.bi_sector = bio_src->bi_iter.bi_sector; bio->bi_iter.bi_size = bio_src->bi_iter.bi_size; @@ -258,62 +177,61 @@ static struct bio *bounce_clone_bio(struct bio *bio_src, gfp_t gfp_mask, case REQ_OP_SECURE_ERASE: case REQ_OP_WRITE_ZEROES: break; - case REQ_OP_WRITE_SAME: - bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; - break; default: bio_for_each_segment(bv, bio_src, iter) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } - bio_crypt_clone(bio, bio_src, gfp_mask); + if (bio_crypt_clone(bio, bio_src, GFP_NOIO) < 0) + goto err_put; - if (bio_integrity(bio_src)) { - int ret; - - ret = bio_integrity_clone(bio, bio_src, gfp_mask); - if (ret < 0) { - bio_put(bio); - return NULL; - } - } + if (bio_integrity(bio_src) && + bio_integrity_clone(bio, bio_src, GFP_NOIO) < 0) + goto err_put; bio_clone_blkg_association(bio, bio_src); - blkcg_bio_issue_init(bio); return bio; + +err_put: + bio_put(bio); + return NULL; } -static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, - mempool_t *pool) +struct bio *__blk_queue_bounce(struct bio *bio_orig, struct request_queue *q) { struct bio *bio; - int rw = bio_data_dir(*bio_orig); + int rw = bio_data_dir(bio_orig); struct bio_vec *to, from; struct bvec_iter iter; - unsigned i = 0; + unsigned i = 0, bytes = 0; bool bounce = false; - int sectors = 0; - bool passthrough = bio_is_passthrough(*bio_orig); + int sectors; - bio_for_each_segment(from, *bio_orig, iter) { - if (i++ < BIO_MAX_PAGES) - sectors += from.bv_len >> 9; - if (page_to_pfn(from.bv_page) > 
q->limits.bounce_pfn) + bio_for_each_segment(from, bio_orig, iter) { + if (i++ < BIO_MAX_VECS) + bytes += from.bv_len; + if (PageHighMem(from.bv_page)) bounce = true; } if (!bounce) - return; + return bio_orig; - if (!passthrough && sectors < bio_sectors(*bio_orig)) { - bio = bio_split(*bio_orig, sectors, GFP_NOIO, &bounce_bio_split); - bio_chain(bio, *bio_orig); - generic_make_request(*bio_orig); - *bio_orig = bio; + /* + * Individual bvecs might not be logical block aligned. Round down + * the split size so that each bio is properly block size aligned, + * even if we do not use the full hardware limits. + */ + sectors = ALIGN_DOWN(bytes, queue_logical_block_size(q)) >> + SECTOR_SHIFT; + if (sectors < bio_sectors(bio_orig)) { + bio = bio_split(bio_orig, sectors, GFP_NOIO, &bounce_bio_split); + bio_chain(bio, bio_orig); + submit_bio_noacct(bio_orig); + bio_orig = bio; } - bio = bounce_clone_bio(*bio_orig, GFP_NOIO, passthrough ? NULL : - &bounce_bio_set); + bio = bounce_clone_bio(bio_orig); /* * Bvec table can't be updated by bio_for_each_segment_all(), @@ -321,70 +239,30 @@ static void __blk_queue_bounce(struct request_queue *q, struct bio **bio_orig, * because the 'bio' is single-page bvec. 
*/ for (i = 0, to = bio->bi_io_vec; i < bio->bi_vcnt; to++, i++) { - struct page *page = to->bv_page; + struct page *bounce_page; - if (page_to_pfn(page) <= q->limits.bounce_pfn) + if (!PageHighMem(to->bv_page)) continue; - to->bv_page = mempool_alloc(pool, q->bounce_gfp); - inc_zone_page_state(to->bv_page, NR_BOUNCE); + bounce_page = mempool_alloc(&page_pool, GFP_NOIO); + inc_zone_page_state(bounce_page, NR_BOUNCE); if (rw == WRITE) { - char *vto, *vfrom; - - flush_dcache_page(page); - - vto = page_address(to->bv_page) + to->bv_offset; - vfrom = kmap_atomic(page) + to->bv_offset; - memcpy(vto, vfrom, to->bv_len); - kunmap_atomic(vfrom); + flush_dcache_page(to->bv_page); + memcpy_from_bvec(page_address(bounce_page), to); } + to->bv_page = bounce_page; } - trace_block_bio_bounce(q, *bio_orig); + trace_block_bio_bounce(bio_orig); bio->bi_flags |= (1 << BIO_BOUNCED); - if (pool == &page_pool) { + if (rw == READ) + bio->bi_end_io = bounce_end_io_read; + else bio->bi_end_io = bounce_end_io_write; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read; - } else { - bio->bi_end_io = bounce_end_io_write_isa; - if (rw == READ) - bio->bi_end_io = bounce_end_io_read_isa; - } - - bio->bi_private = *bio_orig; - *bio_orig = bio; -} - -void blk_queue_bounce(struct request_queue *q, struct bio **bio_orig) -{ - mempool_t *pool; - - /* - * Data-less bio, nothing to bounce - */ - if (!bio_has_data(*bio_orig)) - return; - /* - * for non-isa bounce case, just check if the bounce pfn is equal - * to or bigger than the highest pfn in the system -- in that case, - * don't waste time iterating over bio segments - */ - if (!(q->bounce_gfp & GFP_DMA)) { - if (q->limits.bounce_pfn >= blk_max_pfn) - return; - pool = &page_pool; - } else { - BUG_ON(!mempool_initialized(&isa_page_pool)); - pool = &isa_page_pool; - } - - /* - * slow path - */ - __blk_queue_bounce(q, bio_orig, pool); + bio->bi_private = bio_orig; + return bio; } diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 
6cbb7926534c..b3acdbdb6e7e 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -6,6 +6,7 @@ * Copyright (C) 2011 Red Hat, Inc. All rights reserved. * Copyright (C) 2011 Mike Christie */ +#include <linux/bsg.h> #include <linux/slab.h> #include <linux/blk-mq.h> #include <linux/delay.h> @@ -19,36 +20,51 @@ struct bsg_set { struct blk_mq_tag_set tag_set; + struct bsg_device *bd; bsg_job_fn *job_fn; bsg_timeout_fn *timeout_fn; }; -static int bsg_transport_check_proto(struct sg_io_v4 *hdr) +static int bsg_transport_sg_io_fn(struct request_queue *q, struct sg_io_v4 *hdr, + bool open_for_write, unsigned int timeout) { + struct bsg_job *job; + struct request *rq; + struct bio *bio; + void *reply; + int ret; + if (hdr->protocol != BSG_PROTOCOL_SCSI || hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_TRANSPORT) return -EINVAL; if (!capable(CAP_SYS_RAWIO)) return -EPERM; - return 0; -} -static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - int ret; + rq = blk_mq_alloc_request(q, hdr->dout_xfer_len ? 
+ REQ_OP_DRV_OUT : REQ_OP_DRV_IN, 0); + if (IS_ERR(rq)) + return PTR_ERR(rq); + rq->timeout = timeout; + + job = blk_mq_rq_to_pdu(rq); + reply = job->reply; + memset(job, 0, sizeof(*job)); + job->reply = reply; + job->reply_len = SCSI_SENSE_BUFFERSIZE; + job->dd_data = job + 1; job->request_len = hdr->request_len; job->request = memdup_user(uptr64(hdr->request), hdr->request_len); - if (IS_ERR(job->request)) - return PTR_ERR(job->request); + if (IS_ERR(job->request)) { + ret = PTR_ERR(job->request); + goto out_free_rq; + } if (hdr->dout_xfer_len && hdr->din_xfer_len) { - job->bidi_rq = blk_get_request(rq->q, REQ_OP_SCSI_IN, 0); + job->bidi_rq = blk_mq_alloc_request(rq->q, REQ_OP_DRV_IN, 0); if (IS_ERR(job->bidi_rq)) { ret = PTR_ERR(job->bidi_rq); - goto out; + goto out_free_job_request; } ret = blk_rq_map_user(rq->q, job->bidi_rq, NULL, @@ -63,20 +79,20 @@ static int bsg_transport_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, job->bidi_bio = NULL; } - return 0; + ret = 0; + if (hdr->dout_xfer_len) { + ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->dout_xferp), + hdr->dout_xfer_len, GFP_KERNEL); + } else if (hdr->din_xfer_len) { + ret = blk_rq_map_user(rq->q, rq, NULL, uptr64(hdr->din_xferp), + hdr->din_xfer_len, GFP_KERNEL); + } -out_free_bidi_rq: - if (job->bidi_rq) - blk_put_request(job->bidi_rq); -out: - kfree(job->request); - return ret; -} + if (ret) + goto out_unmap_bidi_rq; -static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - int ret = 0; + bio = rq->bio; + blk_execute_rq(rq, !(hdr->flags & BSG_FLAG_Q_AT_TAIL)); /* * The assignments below don't make much sense, but are kept for @@ -84,7 +100,7 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) */ hdr->device_status = job->result & 0xff; hdr->transport_status = host_byte(job->result); - hdr->driver_status = driver_byte(job->result); + hdr->driver_status = 0; hdr->info = 0; if 
(hdr->device_status || hdr->transport_status || hdr->driver_status) hdr->info |= SG_INFO_CHECK; @@ -119,28 +135,20 @@ static int bsg_transport_complete_rq(struct request *rq, struct sg_io_v4 *hdr) hdr->din_resid = 0; } - return ret; -} - -static void bsg_transport_free_rq(struct request *rq) -{ - struct bsg_job *job = blk_mq_rq_to_pdu(rq); - - if (job->bidi_rq) { + blk_rq_unmap_user(bio); +out_unmap_bidi_rq: + if (job->bidi_rq) blk_rq_unmap_user(job->bidi_bio); - blk_put_request(job->bidi_rq); - } - +out_free_bidi_rq: + if (job->bidi_rq) + blk_mq_free_request(job->bidi_rq); +out_free_job_request: kfree(job->request); +out_free_rq: + blk_mq_free_request(rq); + return ret; } -static const struct bsg_ops bsg_transport_ops = { - .check_proto = bsg_transport_check_proto, - .fill_hdr = bsg_transport_fill_hdr, - .complete_rq = bsg_transport_complete_rq, - .free_rq = bsg_transport_free_rq, -}; - /** * bsg_teardown_job - routine to teardown a bsg job * @kref: kref inside bsg_job that is to be torn down @@ -181,9 +189,12 @@ EXPORT_SYMBOL_GPL(bsg_job_get); void bsg_job_done(struct bsg_job *job, int result, unsigned int reply_payload_rcv_len) { + struct request *rq = blk_mq_rq_from_pdu(job); + job->result = result; job->reply_payload_rcv_len = reply_payload_rcv_len; - blk_mq_complete_request(blk_mq_rq_from_pdu(job)); + if (likely(!blk_should_fake_timeout(rq->q))) + blk_mq_complete_request(rq); } EXPORT_SYMBOL_GPL(bsg_job_done); @@ -204,7 +215,7 @@ static int bsg_map_buffer(struct bsg_buffer *buf, struct request *req) BUG_ON(!req->nr_phys_segments); - buf->sg_list = kzalloc(sz, GFP_KERNEL); + buf->sg_list = kmalloc(sz, GFP_KERNEL); if (!buf->sg_list) return -ENOMEM; sg_init_table(buf->sg_list, req->nr_phys_segments); @@ -298,18 +309,6 @@ static int bsg_init_rq(struct blk_mq_tag_set *set, struct request *req, return 0; } -/* called right before the request is given to the request_queue user */ -static void bsg_initialize_rq(struct request *req) -{ - struct bsg_job *job = 
blk_mq_rq_to_pdu(req); - void *reply = job->reply; - - memset(job, 0, sizeof(*job)); - job->reply = reply; - job->reply_len = SCSI_SENSE_BUFFERSIZE; - job->dd_data = job + 1; -} - static void bsg_exit_rq(struct blk_mq_tag_set *set, struct request *req, unsigned int hctx_idx) { @@ -324,15 +323,16 @@ void bsg_remove_queue(struct request_queue *q) struct bsg_set *bset = container_of(q->tag_set, struct bsg_set, tag_set); - bsg_unregister_queue(q); - blk_cleanup_queue(q); + bsg_unregister_queue(bset->bd); + blk_mq_destroy_queue(q); + blk_put_queue(q); blk_mq_free_tag_set(&bset->tag_set); kfree(bset); } } EXPORT_SYMBOL_GPL(bsg_remove_queue); -static enum blk_eh_timer_return bsg_timeout(struct request *rq, bool reserved) +static enum blk_eh_timer_return bsg_timeout(struct request *rq) { struct bsg_set *bset = container_of(rq->q->tag_set, struct bsg_set, tag_set); @@ -346,7 +346,6 @@ static const struct blk_mq_ops bsg_mq_ops = { .queue_rq = bsg_queue_rq, .init_request = bsg_init_rq, .exit_request = bsg_exit_rq, - .initialize_rq_fn = bsg_initialize_rq, .complete = bsg_complete, .timeout = bsg_timeout, }; @@ -375,7 +374,7 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, bset->timeout_fn = timeout; set = &bset->tag_set; - set->ops = &bsg_mq_ops, + set->ops = &bsg_mq_ops; set->nr_hw_queues = 1; set->queue_depth = 128; set->numa_node = NUMA_NO_NODE; @@ -393,16 +392,16 @@ struct request_queue *bsg_setup_queue(struct device *dev, const char *name, q->queuedata = dev; blk_queue_rq_timeout(q, BLK_DEFAULT_SG_TIMEOUT); - ret = bsg_register_queue(q, dev, name, &bsg_transport_ops); - if (ret) { - printk(KERN_ERR "%s: bsg interface failed to " - "initialize - register queue\n", dev->kobj.name); + bset->bd = bsg_register_queue(q, dev, name, bsg_transport_sg_io_fn); + if (IS_ERR(bset->bd)) { + ret = PTR_ERR(bset->bd); goto out_cleanup_queue; } return q; out_cleanup_queue: - blk_cleanup_queue(q); + blk_mq_destroy_queue(q); + blk_put_queue(q); out_queue: 
blk_mq_free_tag_set(set); out_tag_set: diff --git a/block/bsg.c b/block/bsg.c index d7bae94b64d9..72157a59b788 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -15,339 +15,99 @@ #include <scsi/scsi.h> #include <scsi/scsi_ioctl.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/scsi_device.h> -#include <scsi/scsi_driver.h> #include <scsi/sg.h> #define BSG_DESCRIPTION "Block layer SCSI generic (bsg) driver" #define BSG_VERSION "0.4" -#define bsg_dbg(bd, fmt, ...) \ - pr_debug("%s: " fmt, (bd)->name, ##__VA_ARGS__) - struct bsg_device { struct request_queue *queue; - spinlock_t lock; - struct hlist_node dev_list; - refcount_t ref_count; - char name[20]; + struct device device; + struct cdev cdev; int max_queue; + unsigned int timeout; + unsigned int reserved_size; + bsg_sg_io_fn *sg_io_fn; }; -#define BSG_DEFAULT_CMDS 64 -#define BSG_MAX_DEVS 32768 - -static DEFINE_MUTEX(bsg_mutex); -static DEFINE_IDR(bsg_minor_idr); - -#define BSG_LIST_ARRAY_SIZE 8 -static struct hlist_head bsg_device_list[BSG_LIST_ARRAY_SIZE]; - -static struct class *bsg_class; -static int bsg_major; - -static inline struct hlist_head *bsg_dev_idx_hash(int index) -{ - return &bsg_device_list[index & (BSG_LIST_ARRAY_SIZE - 1)]; -} - -#define uptr64(val) ((void __user *)(uintptr_t)(val)) - -static int bsg_scsi_check_proto(struct sg_io_v4 *hdr) +static inline struct bsg_device *to_bsg_device(struct inode *inode) { - if (hdr->protocol != BSG_PROTOCOL_SCSI || - hdr->subprotocol != BSG_SUB_PROTOCOL_SCSI_CMD) - return -EINVAL; - return 0; + return container_of(inode->i_cdev, struct bsg_device, cdev); } -static int bsg_scsi_fill_hdr(struct request *rq, struct sg_io_v4 *hdr, - fmode_t mode) -{ - struct scsi_request *sreq = scsi_req(rq); - - if (hdr->dout_xfer_len && hdr->din_xfer_len) { - pr_warn_once("BIDI support in bsg has been removed.\n"); - return -EOPNOTSUPP; - } - - sreq->cmd_len = hdr->request_len; - if (sreq->cmd_len > BLK_MAX_CDB) { - sreq->cmd = kzalloc(sreq->cmd_len, GFP_KERNEL); - if (!sreq->cmd) - 
return -ENOMEM; - } +#define BSG_DEFAULT_CMDS 64 +#define BSG_MAX_DEVS (1 << MINORBITS) - if (copy_from_user(sreq->cmd, uptr64(hdr->request), sreq->cmd_len)) - return -EFAULT; - if (blk_verify_command(sreq->cmd, mode)) - return -EPERM; - return 0; -} +static DEFINE_IDA(bsg_minor_ida); +static const struct class bsg_class; +static int bsg_major; -static int bsg_scsi_complete_rq(struct request *rq, struct sg_io_v4 *hdr) +static unsigned int bsg_timeout(struct bsg_device *bd, struct sg_io_v4 *hdr) { - struct scsi_request *sreq = scsi_req(rq); - int ret = 0; - - /* - * fill in all the output members - */ - hdr->device_status = sreq->result & 0xff; - hdr->transport_status = host_byte(sreq->result); - hdr->driver_status = driver_byte(sreq->result); - hdr->info = 0; - if (hdr->device_status || hdr->transport_status || hdr->driver_status) - hdr->info |= SG_INFO_CHECK; - hdr->response_len = 0; - - if (sreq->sense_len && hdr->response) { - int len = min_t(unsigned int, hdr->max_response_len, - sreq->sense_len); - - if (copy_to_user(uptr64(hdr->response), sreq->sense, len)) - ret = -EFAULT; - else - hdr->response_len = len; - } - - if (rq_data_dir(rq) == READ) - hdr->din_resid = sreq->resid_len; - else - hdr->dout_resid = sreq->resid_len; + unsigned int timeout = BLK_DEFAULT_SG_TIMEOUT; - return ret; -} + if (hdr->timeout) + timeout = msecs_to_jiffies(hdr->timeout); + else if (bd->timeout) + timeout = bd->timeout; -static void bsg_scsi_free_rq(struct request *rq) -{ - scsi_req_free_cmd(scsi_req(rq)); + return max_t(unsigned int, timeout, BLK_MIN_SG_TIMEOUT); } -static const struct bsg_ops bsg_scsi_ops = { - .check_proto = bsg_scsi_check_proto, - .fill_hdr = bsg_scsi_fill_hdr, - .complete_rq = bsg_scsi_complete_rq, - .free_rq = bsg_scsi_free_rq, -}; - -static int bsg_sg_io(struct request_queue *q, fmode_t mode, void __user *uarg) +static int bsg_sg_io(struct bsg_device *bd, bool open_for_write, + void __user *uarg) { - struct request *rq; - struct bio *bio; struct sg_io_v4 
hdr; int ret; if (copy_from_user(&hdr, uarg, sizeof(hdr))) return -EFAULT; - - if (!q->bsg_dev.class_dev) - return -ENXIO; - if (hdr.guard != 'Q') return -EINVAL; - ret = q->bsg_dev.ops->check_proto(&hdr); - if (ret) - return ret; - - rq = blk_get_request(q, hdr.dout_xfer_len ? - REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - - ret = q->bsg_dev.ops->fill_hdr(rq, &hdr, mode); - if (ret) - return ret; - - rq->timeout = msecs_to_jiffies(hdr.timeout); - if (!rq->timeout) - rq->timeout = q->sg_timeout; - if (!rq->timeout) - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - if (rq->timeout < BLK_MIN_SG_TIMEOUT) - rq->timeout = BLK_MIN_SG_TIMEOUT; - - if (hdr.dout_xfer_len) { - ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.dout_xferp), - hdr.dout_xfer_len, GFP_KERNEL); - } else if (hdr.din_xfer_len) { - ret = blk_rq_map_user(q, rq, NULL, uptr64(hdr.din_xferp), - hdr.din_xfer_len, GFP_KERNEL); - } - - if (ret) - goto out_free_rq; - - bio = rq->bio; - - blk_execute_rq(q, NULL, rq, !(hdr.flags & BSG_FLAG_Q_AT_TAIL)); - ret = rq->q->bsg_dev.ops->complete_rq(rq, &hdr); - blk_rq_unmap_user(bio); - -out_free_rq: - rq->q->bsg_dev.ops->free_rq(rq); - blk_put_request(rq); + ret = bd->sg_io_fn(bd->queue, &hdr, open_for_write, + bsg_timeout(bd, &hdr)); if (!ret && copy_to_user(uarg, &hdr, sizeof(hdr))) return -EFAULT; return ret; } -static struct bsg_device *bsg_alloc_device(void) -{ - struct bsg_device *bd; - - bd = kzalloc(sizeof(struct bsg_device), GFP_KERNEL); - if (unlikely(!bd)) - return NULL; - - spin_lock_init(&bd->lock); - bd->max_queue = BSG_DEFAULT_CMDS; - INIT_HLIST_NODE(&bd->dev_list); - return bd; -} - -static int bsg_put_device(struct bsg_device *bd) -{ - struct request_queue *q = bd->queue; - - mutex_lock(&bsg_mutex); - - if (!refcount_dec_and_test(&bd->ref_count)) { - mutex_unlock(&bsg_mutex); - return 0; - } - - hlist_del(&bd->dev_list); - mutex_unlock(&bsg_mutex); - - bsg_dbg(bd, "tearing down\n"); - - /* - * close can always block - */ - 
kfree(bd); - blk_put_queue(q); - return 0; -} - -static struct bsg_device *bsg_add_device(struct inode *inode, - struct request_queue *rq, - struct file *file) -{ - struct bsg_device *bd; - unsigned char buf[32]; - - lockdep_assert_held(&bsg_mutex); - - if (!blk_get_queue(rq)) - return ERR_PTR(-ENXIO); - - bd = bsg_alloc_device(); - if (!bd) { - blk_put_queue(rq); - return ERR_PTR(-ENOMEM); - } - - bd->queue = rq; - - refcount_set(&bd->ref_count, 1); - hlist_add_head(&bd->dev_list, bsg_dev_idx_hash(iminor(inode))); - - strncpy(bd->name, dev_name(rq->bsg_dev.class_dev), sizeof(bd->name) - 1); - bsg_dbg(bd, "bound to <%s>, max queue %d\n", - format_dev_t(buf, inode->i_rdev), bd->max_queue); - - return bd; -} - -static struct bsg_device *__bsg_get_device(int minor, struct request_queue *q) -{ - struct bsg_device *bd; - - lockdep_assert_held(&bsg_mutex); - - hlist_for_each_entry(bd, bsg_dev_idx_hash(minor), dev_list) { - if (bd->queue == q) { - refcount_inc(&bd->ref_count); - goto found; - } - } - bd = NULL; -found: - return bd; -} - -static struct bsg_device *bsg_get_device(struct inode *inode, struct file *file) -{ - struct bsg_device *bd; - struct bsg_class_device *bcd; - - /* - * find the class device - */ - mutex_lock(&bsg_mutex); - bcd = idr_find(&bsg_minor_idr, iminor(inode)); - - if (!bcd) { - bd = ERR_PTR(-ENODEV); - goto out_unlock; - } - - bd = __bsg_get_device(iminor(inode), bcd->queue); - if (!bd) - bd = bsg_add_device(inode, bcd->queue, file); - -out_unlock: - mutex_unlock(&bsg_mutex); - return bd; -} - static int bsg_open(struct inode *inode, struct file *file) { - struct bsg_device *bd; - - bd = bsg_get_device(inode, file); - - if (IS_ERR(bd)) - return PTR_ERR(bd); - - file->private_data = bd; + if (!blk_get_queue(to_bsg_device(inode)->queue)) + return -ENXIO; return 0; } static int bsg_release(struct inode *inode, struct file *file) { - struct bsg_device *bd = file->private_data; - - file->private_data = NULL; - return bsg_put_device(bd); + 
blk_put_queue(to_bsg_device(inode)->queue); + return 0; } static int bsg_get_command_q(struct bsg_device *bd, int __user *uarg) { - return put_user(bd->max_queue, uarg); + return put_user(READ_ONCE(bd->max_queue), uarg); } static int bsg_set_command_q(struct bsg_device *bd, int __user *uarg) { - int queue; + int max_queue; - if (get_user(queue, uarg)) + if (get_user(max_queue, uarg)) return -EFAULT; - if (queue < 1) + if (max_queue < 1) return -EINVAL; - - spin_lock_irq(&bd->lock); - bd->max_queue = queue; - spin_unlock_irq(&bd->lock); + WRITE_ONCE(bd->max_queue, max_queue); return 0; } static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { - struct bsg_device *bd = file->private_data; + struct bsg_device *bd = to_bsg_device(file_inode(file)); + struct request_queue *q = bd->queue; void __user *uarg = (void __user *) arg; + int __user *intp = uarg; + int val; switch (cmd) { /* @@ -362,17 +122,37 @@ static long bsg_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * SCSI/sg ioctls */ case SG_GET_VERSION_NUM: + return put_user(30527, intp); case SCSI_IOCTL_GET_IDLUN: + return put_user(0, intp); case SCSI_IOCTL_GET_BUS_NUMBER: + return put_user(0, intp); case SG_SET_TIMEOUT: + if (get_user(val, intp)) + return -EFAULT; + bd->timeout = clock_t_to_jiffies(val); + return 0; case SG_GET_TIMEOUT: + return jiffies_to_clock_t(bd->timeout); case SG_GET_RESERVED_SIZE: + return put_user(min(bd->reserved_size, queue_max_bytes(q)), + intp); case SG_SET_RESERVED_SIZE: + if (get_user(val, intp)) + return -EFAULT; + if (val < 0) + return -EINVAL; + bd->reserved_size = + min_t(unsigned int, val, queue_max_bytes(q)); + return 0; case SG_EMULATED_HOST: - case SCSI_IOCTL_SEND_COMMAND: - return scsi_cmd_ioctl(bd->queue, NULL, file->f_mode, cmd, uarg); + return put_user(1, intp); case SG_IO: - return bsg_sg_io(bd->queue, file->f_mode, uarg); + return bsg_sg_io(bd, file->f_mode & FMODE_WRITE, uarg); + case SCSI_IOCTL_SEND_COMMAND: + 
pr_warn_ratelimited("%s: calling unsupported SCSI_IOCTL_SEND_COMMAND\n", + current->comm); + return -EINVAL; default: return -ENOTTY; } @@ -387,129 +167,106 @@ static const struct file_operations bsg_fops = { .llseek = default_llseek, }; -void bsg_unregister_queue(struct request_queue *q) +static void bsg_device_release(struct device *dev) { - struct bsg_class_device *bcd = &q->bsg_dev; - - if (!bcd->class_dev) - return; - - mutex_lock(&bsg_mutex); - idr_remove(&bsg_minor_idr, bcd->minor); - if (q->kobj.sd) - sysfs_remove_link(&q->kobj, "bsg"); - device_unregister(bcd->class_dev); - bcd->class_dev = NULL; - mutex_unlock(&bsg_mutex); + struct bsg_device *bd = container_of(dev, struct bsg_device, device); + + ida_free(&bsg_minor_ida, MINOR(bd->device.devt)); + kfree(bd); } -EXPORT_SYMBOL_GPL(bsg_unregister_queue); -int bsg_register_queue(struct request_queue *q, struct device *parent, - const char *name, const struct bsg_ops *ops) +void bsg_unregister_queue(struct bsg_device *bd) { - struct bsg_class_device *bcd; - dev_t dev; - int ret; - struct device *class_dev = NULL; + struct gendisk *disk = bd->queue->disk; - /* - * we need a proper transport to send commands, not a stacked device - */ - if (!queue_is_mq(q)) - return 0; + if (disk && disk->queue_kobj.sd) + sysfs_remove_link(&disk->queue_kobj, "bsg"); + cdev_device_del(&bd->cdev, &bd->device); + put_device(&bd->device); +} +EXPORT_SYMBOL_GPL(bsg_unregister_queue); - bcd = &q->bsg_dev; - memset(bcd, 0, sizeof(*bcd)); +struct bsg_device *bsg_register_queue(struct request_queue *q, + struct device *parent, const char *name, bsg_sg_io_fn *sg_io_fn) +{ + struct bsg_device *bd; + int ret; - mutex_lock(&bsg_mutex); + bd = kzalloc(sizeof(*bd), GFP_KERNEL); + if (!bd) + return ERR_PTR(-ENOMEM); + bd->max_queue = BSG_DEFAULT_CMDS; + bd->reserved_size = INT_MAX; + bd->queue = q; + bd->sg_io_fn = sg_io_fn; - ret = idr_alloc(&bsg_minor_idr, bcd, 0, BSG_MAX_DEVS, GFP_KERNEL); + ret = ida_alloc_max(&bsg_minor_ida, BSG_MAX_DEVS 
- 1, GFP_KERNEL); if (ret < 0) { - if (ret == -ENOSPC) { - printk(KERN_ERR "bsg: too many bsg devices\n"); - ret = -EINVAL; - } - goto unlock; + if (ret == -ENOSPC) + dev_err(parent, "bsg: too many bsg devices\n"); + kfree(bd); + return ERR_PTR(ret); } + bd->device.devt = MKDEV(bsg_major, ret); + bd->device.class = &bsg_class; + bd->device.parent = parent; + bd->device.release = bsg_device_release; + dev_set_name(&bd->device, "%s", name); + device_initialize(&bd->device); + + cdev_init(&bd->cdev, &bsg_fops); + bd->cdev.owner = THIS_MODULE; + ret = cdev_device_add(&bd->cdev, &bd->device); + if (ret) + goto out_put_device; - bcd->minor = ret; - bcd->queue = q; - bcd->ops = ops; - dev = MKDEV(bsg_major, bcd->minor); - class_dev = device_create(bsg_class, parent, dev, NULL, "%s", name); - if (IS_ERR(class_dev)) { - ret = PTR_ERR(class_dev); - goto idr_remove; - } - bcd->class_dev = class_dev; - - if (q->kobj.sd) { - ret = sysfs_create_link(&q->kobj, &bcd->class_dev->kobj, "bsg"); + if (q->disk && q->disk->queue_kobj.sd) { + ret = sysfs_create_link(&q->disk->queue_kobj, &bd->device.kobj, + "bsg"); if (ret) - goto unregister_class_dev; + goto out_device_del; } - mutex_unlock(&bsg_mutex); - return 0; - -unregister_class_dev: - device_unregister(class_dev); -idr_remove: - idr_remove(&bsg_minor_idr, bcd->minor); -unlock: - mutex_unlock(&bsg_mutex); - return ret; -} - -int bsg_scsi_register_queue(struct request_queue *q, struct device *parent) -{ - if (!blk_queue_scsi_passthrough(q)) { - WARN_ONCE(true, "Attempt to register a non-SCSI queue\n"); - return -EINVAL; - } + return bd; - return bsg_register_queue(q, parent, dev_name(parent), &bsg_scsi_ops); +out_device_del: + cdev_device_del(&bd->cdev, &bd->device); +out_put_device: + put_device(&bd->device); + return ERR_PTR(ret); } -EXPORT_SYMBOL_GPL(bsg_scsi_register_queue); - -static struct cdev bsg_cdev; +EXPORT_SYMBOL_GPL(bsg_register_queue); -static char *bsg_devnode(struct device *dev, umode_t *mode) +static char 
*bsg_devnode(const struct device *dev, umode_t *mode) { return kasprintf(GFP_KERNEL, "bsg/%s", dev_name(dev)); } +static const struct class bsg_class = { + .name = "bsg", + .devnode = bsg_devnode, +}; + static int __init bsg_init(void) { - int ret, i; dev_t devid; + int ret; - for (i = 0; i < BSG_LIST_ARRAY_SIZE; i++) - INIT_HLIST_HEAD(&bsg_device_list[i]); - - bsg_class = class_create(THIS_MODULE, "bsg"); - if (IS_ERR(bsg_class)) - return PTR_ERR(bsg_class); - bsg_class->devnode = bsg_devnode; + ret = class_register(&bsg_class); + if (ret) + return ret; ret = alloc_chrdev_region(&devid, 0, BSG_MAX_DEVS, "bsg"); if (ret) goto destroy_bsg_class; - bsg_major = MAJOR(devid); - cdev_init(&bsg_cdev, &bsg_fops); - ret = cdev_add(&bsg_cdev, MKDEV(bsg_major, 0), BSG_MAX_DEVS); - if (ret) - goto unregister_chrdev; - printk(KERN_INFO BSG_DESCRIPTION " version " BSG_VERSION " loaded (major %d)\n", bsg_major); return 0; -unregister_chrdev: - unregister_chrdev_region(MKDEV(bsg_major, 0), BSG_MAX_DEVS); + destroy_bsg_class: - class_destroy(bsg_class); + class_unregister(&bsg_class); return ret; } diff --git a/block/cmdline-parser.c b/block/cmdline-parser.c deleted file mode 100644 index f2a14571882b..000000000000 --- a/block/cmdline-parser.c +++ /dev/null @@ -1,255 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Parse command line, get partition information - * - * Written by Cai Zhiyong <caizhiyong@huawei.com> - * - */ -#include <linux/export.h> -#include <linux/cmdline-parser.h> - -static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) -{ - int ret = 0; - struct cmdline_subpart *new_subpart; - - *subpart = NULL; - - new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); - if (!new_subpart) - return -ENOMEM; - - if (*partdef == '-') { - new_subpart->size = (sector_t)(~0ULL); - partdef++; - } else { - new_subpart->size = (sector_t)memparse(partdef, &partdef); - if (new_subpart->size < (sector_t)PAGE_SIZE) { - pr_warn("cmdline partition size is 
invalid."); - ret = -EINVAL; - goto fail; - } - } - - if (*partdef == '@') { - partdef++; - new_subpart->from = (sector_t)memparse(partdef, &partdef); - } else { - new_subpart->from = (sector_t)(~0ULL); - } - - if (*partdef == '(') { - int length; - char *next = strchr(++partdef, ')'); - - if (!next) { - pr_warn("cmdline partition format is invalid."); - ret = -EINVAL; - goto fail; - } - - length = min_t(int, next - partdef, - sizeof(new_subpart->name) - 1); - strncpy(new_subpart->name, partdef, length); - new_subpart->name[length] = '\0'; - - partdef = ++next; - } else - new_subpart->name[0] = '\0'; - - new_subpart->flags = 0; - - if (!strncmp(partdef, "ro", 2)) { - new_subpart->flags |= PF_RDONLY; - partdef += 2; - } - - if (!strncmp(partdef, "lk", 2)) { - new_subpart->flags |= PF_POWERUP_LOCK; - partdef += 2; - } - - *subpart = new_subpart; - return 0; -fail: - kfree(new_subpart); - return ret; -} - -static void free_subpart(struct cmdline_parts *parts) -{ - struct cmdline_subpart *subpart; - - while (parts->subpart) { - subpart = parts->subpart; - parts->subpart = subpart->next_subpart; - kfree(subpart); - } -} - -static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) -{ - int ret = -EINVAL; - char *next; - int length; - struct cmdline_subpart **next_subpart; - struct cmdline_parts *newparts; - char buf[BDEVNAME_SIZE + 32 + 4]; - - *parts = NULL; - - newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); - if (!newparts) - return -ENOMEM; - - next = strchr(bdevdef, ':'); - if (!next) { - pr_warn("cmdline partition has no block device."); - goto fail; - } - - length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); - strncpy(newparts->name, bdevdef, length); - newparts->name[length] = '\0'; - newparts->nr_subparts = 0; - - next_subpart = &newparts->subpart; - - while (next && *(++next)) { - bdevdef = next; - next = strchr(bdevdef, ','); - - length = (!next) ? 
(sizeof(buf) - 1) : - min_t(int, next - bdevdef, sizeof(buf) - 1); - - strncpy(buf, bdevdef, length); - buf[length] = '\0'; - - ret = parse_subpart(next_subpart, buf); - if (ret) - goto fail; - - newparts->nr_subparts++; - next_subpart = &(*next_subpart)->next_subpart; - } - - if (!newparts->subpart) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - *parts = newparts; - - return 0; -fail: - free_subpart(newparts); - kfree(newparts); - return ret; -} - -void cmdline_parts_free(struct cmdline_parts **parts) -{ - struct cmdline_parts *next_parts; - - while (*parts) { - next_parts = (*parts)->next_parts; - free_subpart(*parts); - kfree(*parts); - *parts = next_parts; - } -} -EXPORT_SYMBOL(cmdline_parts_free); - -int cmdline_parts_parse(struct cmdline_parts **parts, const char *cmdline) -{ - int ret; - char *buf; - char *pbuf; - char *next; - struct cmdline_parts **next_parts; - - *parts = NULL; - - next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); - if (!buf) - return -ENOMEM; - - next_parts = parts; - - while (next && *pbuf) { - next = strchr(pbuf, ';'); - if (next) - *next = '\0'; - - ret = parse_parts(next_parts, pbuf); - if (ret) - goto fail; - - if (next) - pbuf = ++next; - - next_parts = &(*next_parts)->next_parts; - } - - if (!*parts) { - pr_warn("cmdline partition has no valid partition."); - ret = -EINVAL; - goto fail; - } - - ret = 0; -done: - kfree(buf); - return ret; - -fail: - cmdline_parts_free(parts); - goto done; -} -EXPORT_SYMBOL(cmdline_parts_parse); - -struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, - const char *bdev) -{ - while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) - parts = parts->next_parts; - return parts; -} -EXPORT_SYMBOL(cmdline_parts_find); - -/* - * add_part() - * 0 success. - * 1 can not add so many partitions. 
- */ -int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, - int slot, - int (*add_part)(int, struct cmdline_subpart *, void *), - void *param) -{ - sector_t from = 0; - struct cmdline_subpart *subpart; - - for (subpart = parts->subpart; subpart; - subpart = subpart->next_subpart, slot++) { - if (subpart->from == (sector_t)(~0ULL)) - subpart->from = from; - else - from = subpart->from; - - if (from >= disk_size) - break; - - if (subpart->size > (disk_size - from)) - subpart->size = disk_size - from; - - from += subpart->size; - - if (add_part(slot, subpart, param)) - break; - } - - return slot; -} -EXPORT_SYMBOL(cmdline_parts_set); diff --git a/block/disk-events.c b/block/disk-events.c new file mode 100644 index 000000000000..2f697224386a --- /dev/null +++ b/block/disk-events.c @@ -0,0 +1,489 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Disk events - monitor disk events like media change and eject request. + */ +#include <linux/export.h> +#include <linux/moduleparam.h> +#include <linux/blkdev.h> +#include "blk.h" + +struct disk_events { + struct list_head node; /* all disk_event's */ + struct gendisk *disk; /* the associated disk */ + spinlock_t lock; + + struct mutex block_mutex; /* protects blocking */ + int block; /* event blocking depth */ + unsigned int pending; /* events already sent out */ + unsigned int clearing; /* events being cleared */ + + long poll_msecs; /* interval, -1 for default */ + struct delayed_work dwork; +}; + +static const char *disk_events_strs[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", +}; + +static char *disk_uevents[] = { + [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", + [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", +}; + +/* list of all disk_events */ +static DEFINE_MUTEX(disk_events_mutex); +static LIST_HEAD(disk_events); + +/* disable in-kernel polling by default */ +static unsigned long disk_events_dfl_poll_msecs; + 
+static unsigned long disk_events_poll_jiffies(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + long intv_msecs = 0; + + /* + * If device-specific poll interval is set, always use it. If + * the default is being used, poll if the POLL flag is set. + */ + if (ev->poll_msecs >= 0) + intv_msecs = ev->poll_msecs; + else if (disk->event_flags & DISK_EVENT_FLAG_POLL) + intv_msecs = disk_events_dfl_poll_msecs; + + return msecs_to_jiffies(intv_msecs); +} + +/** + * disk_block_events - block and flush disk event checking + * @disk: disk to block events for + * + * On return from this function, it is guaranteed that event checking + * isn't in progress and won't happen until unblocked by + * disk_unblock_events(). Events blocking is counted and the actual + * unblocking happens after the matching number of unblocks are done. + * + * Note that this intentionally does not block event checking from + * disk_clear_events(). + * + * CONTEXT: + * Might sleep. + */ +void disk_block_events(struct gendisk *disk) +{ + struct disk_events *ev = disk->ev; + unsigned long flags; + bool cancel; + + if (!ev) + return; + + /* + * Outer mutex ensures that the first blocker completes canceling + * the event work before further blockers are allowed to finish. 
+ */ + mutex_lock(&ev->block_mutex); + + spin_lock_irqsave(&ev->lock, flags); + cancel = !ev->block++; + spin_unlock_irqrestore(&ev->lock, flags); + + if (cancel) + cancel_delayed_work_sync(&disk->ev->dwork); + + mutex_unlock(&ev->block_mutex); +} + +static void __disk_unblock_events(struct gendisk *disk, bool check_now) +{ + struct disk_events *ev = disk->ev; + unsigned long intv; + unsigned long flags; + + spin_lock_irqsave(&ev->lock, flags); + + if (WARN_ON_ONCE(ev->block <= 0)) + goto out_unlock; + + if (--ev->block) + goto out_unlock; + + intv = disk_events_poll_jiffies(disk); + if (check_now) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + else if (intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); +out_unlock: + spin_unlock_irqrestore(&ev->lock, flags); +} + +/** + * disk_unblock_events - unblock disk event checking + * @disk: disk to unblock events for + * + * Undo disk_block_events(). When the block count reaches zero, it + * starts events polling if configured. + * + * CONTEXT: + * Don't care. Safe to call from irq context. + */ +void disk_unblock_events(struct gendisk *disk) +{ + if (disk->ev) + __disk_unblock_events(disk, false); +} + +/** + * disk_flush_events - schedule immediate event checking and flushing + * @disk: disk to check and flush events for + * @mask: events to flush + * + * Schedule immediate event checking on @disk if not blocked. Events in + * @mask are scheduled to be cleared from the driver. Note that this + * doesn't clear the events from @disk->ev. + * + * CONTEXT: + * If @mask is non-zero must be called with disk->open_mutex held. 
+ */ +void disk_flush_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + + if (!ev) + return; + + spin_lock_irq(&ev->lock); + ev->clearing |= mask; + if (!ev->block) + mod_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, 0); + spin_unlock_irq(&ev->lock); +} + +/* + * Tell userland about new events. Only the events listed in @disk->events are + * reported, and only if DISK_EVENT_FLAG_UEVENT is set. Otherwise, events are + * processed internally but never get reported to userland. + */ +static void disk_event_uevent(struct gendisk *disk, unsigned int events) +{ + char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; + int nr_events = 0, i; + + for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) + if (events & disk->events & (1 << i)) + envp[nr_events++] = disk_uevents[i]; + + if (nr_events) + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); +} + +static void disk_check_events(struct disk_events *ev, + unsigned int *clearing_ptr) +{ + struct gendisk *disk = ev->disk; + unsigned int clearing = *clearing_ptr; + unsigned int events; + unsigned long intv; + + /* check events */ + events = disk->fops->check_events(disk, clearing); + + /* accumulate pending events and schedule next poll if necessary */ + spin_lock_irq(&ev->lock); + + events &= ~ev->pending; + ev->pending |= events; + *clearing_ptr &= ~clearing; + + intv = disk_events_poll_jiffies(disk); + if (!ev->block && intv) + queue_delayed_work(system_freezable_power_efficient_wq, + &ev->dwork, intv); + + spin_unlock_irq(&ev->lock); + + if (events & DISK_EVENT_MEDIA_CHANGE) + inc_diskseq(disk); + + if (disk->event_flags & DISK_EVENT_FLAG_UEVENT) + disk_event_uevent(disk, events); +} + +/** + * disk_clear_events - synchronously check, clear and return pending events + * @disk: disk to fetch and clear events from + * @mask: mask of events to be fetched and cleared + * + * Disk events are synchronously checked and pending events in @mask + * are cleared and 
returned. This ignores the block count. + * + * CONTEXT: + * Might sleep. + */ +static unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) +{ + struct disk_events *ev = disk->ev; + unsigned int pending; + unsigned int clearing = mask; + + if (!ev) + return 0; + + disk_block_events(disk); + + /* + * store the union of mask and ev->clearing on the stack so that the + * race with disk_flush_events does not cause ambiguity (ev->clearing + * can still be modified even if events are blocked). + */ + spin_lock_irq(&ev->lock); + clearing |= ev->clearing; + ev->clearing = 0; + spin_unlock_irq(&ev->lock); + + disk_check_events(ev, &clearing); + /* + * if ev->clearing is not 0, the disk_flush_events got called in the + * middle of this function, so we want to run the workfn without delay. + */ + __disk_unblock_events(disk, ev->clearing ? true : false); + + /* then, fetch and clear pending events */ + spin_lock_irq(&ev->lock); + pending = ev->pending & mask; + ev->pending &= ~mask; + spin_unlock_irq(&ev->lock); + WARN_ON_ONCE(clearing & mask); + + return pending; +} + +/** + * disk_check_media_change - check if a removable media has been changed + * @disk: gendisk to check + * + * Returns %true and marks the disk for a partition rescan whether a removable + * media has been changed, and %false if the media did not change. + */ +bool disk_check_media_change(struct gendisk *disk) +{ + unsigned int events; + + events = disk_clear_events(disk, DISK_EVENT_MEDIA_CHANGE | + DISK_EVENT_EJECT_REQUEST); + if (events & DISK_EVENT_MEDIA_CHANGE) { + set_bit(GD_NEED_PART_SCAN, &disk->state); + return true; + } + return false; +} +EXPORT_SYMBOL(disk_check_media_change); + +/** + * disk_force_media_change - force a media change event + * @disk: the disk which will raise the event + * + * Should be called when the media changes for @disk. Generates a uevent + * and attempts to free all dentries and inodes and invalidates all block + * device page cache entries in that case. 
+ */ +void disk_force_media_change(struct gendisk *disk) +{ + disk_event_uevent(disk, DISK_EVENT_MEDIA_CHANGE); + inc_diskseq(disk); + bdev_mark_dead(disk->part0, true); + set_bit(GD_NEED_PART_SCAN, &disk->state); +} +EXPORT_SYMBOL_GPL(disk_force_media_change); + +/* + * Separate this part out so that a different pointer for clearing_ptr can be + * passed in for disk_clear_events. + */ +static void disk_events_workfn(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct disk_events *ev = container_of(dwork, struct disk_events, dwork); + + disk_check_events(ev, &ev->clearing); +} + +/* + * A disk events enabled device has the following sysfs nodes under + * its /sys/block/X/ directory. + * + * events : list of all supported events + * events_async : list of events which can be detected w/o polling + * (always empty, only for backwards compatibility) + * events_poll_msecs : polling interval, 0: disable, -1: system default + */ +static ssize_t __disk_events_show(unsigned int events, char *buf) +{ + const char *delim = ""; + ssize_t pos = 0; + int i; + + for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) + if (events & (1 << i)) { + pos += sprintf(buf + pos, "%s%s", + delim, disk_events_strs[i]); + delim = " "; + } + if (pos) + pos += sprintf(buf + pos, "\n"); + return pos; +} + +static ssize_t disk_events_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) + return 0; + return __disk_events_show(disk->events, buf); +} + +static ssize_t disk_events_async_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return 0; +} + +static ssize_t disk_events_poll_msecs_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + if (!disk->ev) + return sprintf(buf, "-1\n"); + return sprintf(buf, "%ld\n", disk->ev->poll_msecs); +} + +static 
ssize_t disk_events_poll_msecs_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +{ + struct gendisk *disk = dev_to_disk(dev); + long intv; + + if (!count || !sscanf(buf, "%ld", &intv)) + return -EINVAL; + + if (intv < 0 && intv != -1) + return -EINVAL; + + if (!disk->ev) + return -ENODEV; + + disk_block_events(disk); + disk->ev->poll_msecs = intv; + __disk_unblock_events(disk, true); + return count; +} + +DEVICE_ATTR(events, 0444, disk_events_show, NULL); +DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); +DEVICE_ATTR(events_poll_msecs, 0644, disk_events_poll_msecs_show, + disk_events_poll_msecs_store); + +/* + * The default polling interval can be specified by the kernel + * parameter block.events_dfl_poll_msecs which defaults to 0 + * (disable). This can also be modified runtime by writing to + * /sys/module/block/parameters/events_dfl_poll_msecs. + */ +static int disk_events_set_dfl_poll_msecs(const char *val, + const struct kernel_param *kp) +{ + struct disk_events *ev; + int ret; + + ret = param_set_ulong(val, kp); + if (ret < 0) + return ret; + + mutex_lock(&disk_events_mutex); + list_for_each_entry(ev, &disk_events, node) + disk_flush_events(ev->disk, 0); + mutex_unlock(&disk_events_mutex); + return 0; +} + +static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { + .set = disk_events_set_dfl_poll_msecs, + .get = param_get_ulong, +}; + +#undef MODULE_PARAM_PREFIX +#define MODULE_PARAM_PREFIX "block." + +module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, + &disk_events_dfl_poll_msecs, 0644); + +/* + * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
+ */ +int disk_alloc_events(struct gendisk *disk) +{ + struct disk_events *ev; + + if (!disk->fops->check_events || !disk->events) + return 0; + + ev = kzalloc(sizeof(*ev), GFP_KERNEL); + if (!ev) { + pr_warn("%s: failed to initialize events\n", disk->disk_name); + return -ENOMEM; + } + + INIT_LIST_HEAD(&ev->node); + ev->disk = disk; + spin_lock_init(&ev->lock); + mutex_init(&ev->block_mutex); + ev->block = 1; + ev->poll_msecs = -1; + INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); + + disk->ev = ev; + return 0; +} + +void disk_add_events(struct gendisk *disk) +{ + if (!disk->ev) + return; + + mutex_lock(&disk_events_mutex); + list_add_tail(&disk->ev->node, &disk_events); + mutex_unlock(&disk_events_mutex); + + /* + * Block count is initialized to 1 and the following initial + * unblock kicks it into action. + */ + __disk_unblock_events(disk, true); +} + +void disk_del_events(struct gendisk *disk) +{ + if (disk->ev) { + disk_block_events(disk); + + mutex_lock(&disk_events_mutex); + list_del_init(&disk->ev->node); + mutex_unlock(&disk_events_mutex); + } +} + +void disk_release_events(struct gendisk *disk) +{ + /* the block count should be 1 from disk_del_events() */ + WARN_ON_ONCE(disk->ev && disk->ev->block != 1); + kfree(disk->ev); +} diff --git a/block/early-lookup.c b/block/early-lookup.c new file mode 100644 index 000000000000..3effbd0d35e9 --- /dev/null +++ b/block/early-lookup.c @@ -0,0 +1,316 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Code for looking up block devices in the early boot code before mounting the + * root file system. + */ +#include <linux/blkdev.h> +#include <linux/ctype.h> + +struct uuidcmp { + const char *uuid; + int len; +}; + +/** + * match_dev_by_uuid - callback for finding a partition using its uuid + * @dev: device passed in by the caller + * @data: opaque pointer to the desired struct uuidcmp to match + * + * Returns 1 if the device matches, and 0 otherwise. 
+ */ +static int __init match_dev_by_uuid(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const struct uuidcmp *cmp = data; + + if (!bdev->bd_meta_info || + strncasecmp(cmp->uuid, bdev->bd_meta_info->uuid, cmp->len)) + return 0; + return 1; +} + +/** + * devt_from_partuuid - looks up the dev_t of a partition by its UUID + * @uuid_str: char array containing ascii UUID + * @devt: dev_t result + * + * The function will return the first partition which contains a matching + * UUID value in its partition_meta_info struct. This does not search + * by filesystem UUIDs. + * + * If @uuid_str is followed by a "/PARTNROFF=%d", then the number will be + * extracted and used as an offset from the partition identified by the UUID. + * + * Returns 0 on success or a negative error code on failure. + */ +static int __init devt_from_partuuid(const char *uuid_str, dev_t *devt) +{ + struct uuidcmp cmp; + struct device *dev = NULL; + int offset = 0; + char *slash; + + cmp.uuid = uuid_str; + + slash = strchr(uuid_str, '/'); + /* Check for optional partition number offset attributes. */ + if (slash) { + char c = 0; + + /* Explicitly fail on poor PARTUUID syntax. */ + if (sscanf(slash + 1, "PARTNROFF=%d%c", &offset, &c) != 1) + goto out_invalid; + cmp.len = slash - uuid_str; + } else { + cmp.len = strlen(uuid_str); + } + + if (!cmp.len) + goto out_invalid; + + dev = class_find_device(&block_class, NULL, &cmp, &match_dev_by_uuid); + if (!dev) + return -ENODEV; + + if (offset) { + /* + * Attempt to find the requested partition by adding an offset + * to the partition number found by UUID. 
+ */ + *devt = part_devt(dev_to_disk(dev), + dev_to_bdev(dev)->bd_partno + offset); + } else { + *devt = dev->devt; + } + + put_device(dev); + return 0; + +out_invalid: + pr_err("VFS: PARTUUID= is invalid.\n" + "Expected PARTUUID=<valid-uuid-id>[/PARTNROFF=%%d]\n"); + return -EINVAL; +} + +/** + * match_dev_by_label - callback for finding a partition using its label + * @dev: device passed in by the caller + * @data: opaque pointer to the label to match + * + * Returns 1 if the device matches, and 0 otherwise. + */ +static int __init match_dev_by_label(struct device *dev, const void *data) +{ + struct block_device *bdev = dev_to_bdev(dev); + const char *label = data; + + if (!bdev->bd_meta_info || strcmp(label, bdev->bd_meta_info->volname)) + return 0; + return 1; +} + +static int __init devt_from_partlabel(const char *label, dev_t *devt) +{ + struct device *dev; + + dev = class_find_device(&block_class, NULL, label, &match_dev_by_label); + if (!dev) + return -ENODEV; + *devt = dev->devt; + put_device(dev); + return 0; +} + +static dev_t __init blk_lookup_devt(const char *name, int partno) +{ + dev_t devt = MKDEV(0, 0); + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + + if (strcmp(dev_name(dev), name)) + continue; + + if (partno < disk->minors) { + /* We need to return the right devno, even + * if the partition doesn't exist yet. 
+ */ + devt = MKDEV(MAJOR(dev->devt), + MINOR(dev->devt) + partno); + } else { + devt = part_devt(disk, partno); + if (devt) + break; + } + } + class_dev_iter_exit(&iter); + return devt; +} + +static int __init devt_from_devname(const char *name, dev_t *devt) +{ + int part; + char s[32]; + char *p; + + if (strlen(name) > 31) + return -EINVAL; + strcpy(s, name); + for (p = s; *p; p++) { + if (*p == '/') + *p = '!'; + } + + *devt = blk_lookup_devt(s, 0); + if (*devt) + return 0; + + /* + * Try non-existent, but valid partition, which may only exist after + * opening the device, like partitioned md devices. + */ + while (p > s && isdigit(p[-1])) + p--; + if (p == s || !*p || *p == '0') + return -ENODEV; + + /* try disk name without <part number> */ + part = simple_strtoul(p, NULL, 10); + *p = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + + /* try disk name without p<part number> */ + if (p < s + 2 || !isdigit(p[-2]) || p[-1] != 'p') + return -ENODEV; + p[-1] = '\0'; + *devt = blk_lookup_devt(s, part); + if (*devt) + return 0; + return -ENODEV; +} + +static int __init devt_from_devnum(const char *name, dev_t *devt) +{ + unsigned maj, min, offset; + char *p, dummy; + + if (sscanf(name, "%u:%u%c", &maj, &min, &dummy) == 2 || + sscanf(name, "%u:%u:%u:%c", &maj, &min, &offset, &dummy) == 3) { + *devt = MKDEV(maj, min); + if (maj != MAJOR(*devt) || min != MINOR(*devt)) + return -EINVAL; + } else { + *devt = new_decode_dev(simple_strtoul(name, &p, 16)); + if (*p) + return -EINVAL; + } + + return 0; +} + +/* + * Convert a name into device number. We accept the following variants: + * + * 1) <hex_major><hex_minor> device number in hexadecimal represents itself + * no leading 0x, for example b302. 
+ * 3) /dev/<disk_name> represents the device number of disk + * 4) /dev/<disk_name><decimal> represents the device number + * of partition - device number of disk plus the partition number + * 5) /dev/<disk_name>p<decimal> - same as the above, that form is + * used when disk name of partitioned disk ends on a digit. + * 6) PARTUUID=00112233-4455-6677-8899-AABBCCDDEEFF representing the + * unique id of a partition if the partition table provides it. + * The UUID may be either an EFI/GPT UUID, or refer to an MSDOS + * partition using the format SSSSSSSS-PP, where SSSSSSSS is a zero- + * filled hex representation of the 32-bit "NT disk signature", and PP + * is a zero-filled hex representation of the 1-based partition number. + * 7) PARTUUID=<UUID>/PARTNROFF=<int> to select a partition in relation to + * a partition with a known unique id. + * 8) <major>:<minor> major and minor number of the device separated by + * a colon. + * 9) PARTLABEL=<name> with name being the GPT partition label. + * MSDOS partitions do not support labels! + * + * If name doesn't fall into the categories above, we return (0,0). + * block_class is used to check if something is a disk name. If the disk + * name contains slashes, the device name has them replaced with + * bangs. 
+ */ +int __init early_lookup_bdev(const char *name, dev_t *devt) +{ + if (strncmp(name, "PARTUUID=", 9) == 0) + return devt_from_partuuid(name + 9, devt); + if (strncmp(name, "PARTLABEL=", 10) == 0) + return devt_from_partlabel(name + 10, devt); + if (strncmp(name, "/dev/", 5) == 0) + return devt_from_devname(name + 5, devt); + return devt_from_devnum(name, devt); +} + +static char __init *bdevt_str(dev_t devt, char *buf) +{ + if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { + char tbuf[BDEVT_SIZE]; + snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); + snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); + } else + snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); + + return buf; +} + +/* + * print a full list of all partitions - intended for places where the root + * filesystem can't be mounted and thus to give the victim some idea of what + * went wrong + */ +void __init printk_all_partitions(void) +{ + struct class_dev_iter iter; + struct device *dev; + + class_dev_iter_init(&iter, &block_class, NULL, &disk_type); + while ((dev = class_dev_iter_next(&iter))) { + struct gendisk *disk = dev_to_disk(dev); + struct block_device *part; + char devt_buf[BDEVT_SIZE]; + unsigned long idx; + + /* + * Don't show empty devices or things that have been + * suppressed + */ + if (get_capacity(disk) == 0 || (disk->flags & GENHD_FL_HIDDEN)) + continue; + + /* + * Note, unlike /proc/partitions, I am showing the numbers in + * hex - the same format as the root= option takes. + */ + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + printk("%s%s %10llu %pg %s", + bdev_is_partition(part) ? " " : "", + bdevt_str(part->bd_dev, devt_buf), + bdev_nr_sectors(part) >> 1, part, + part->bd_meta_info ? 
+ part->bd_meta_info->uuid : ""); + if (bdev_is_partition(part)) + printk("\n"); + else if (dev->parent && dev->parent->driver) + printk(" driver: %s\n", + dev->parent->driver->name); + else + printk(" (driver?)\n"); + } + rcu_read_unlock(); + } + class_dev_iter_exit(&iter); +} diff --git a/block/elevator.c b/block/elevator.c index 4eab3d70e880..5ff093cb3cf8 100644 --- a/block/elevator.c +++ b/block/elevator.c @@ -26,7 +26,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> @@ -36,14 +35,15 @@ #include <linux/hash.h> #include <linux/uaccess.h> #include <linux/pm_runtime.h> -#include <linux/blk-cgroup.h> #include <trace/events/block.h> +#include "elevator.h" #include "blk.h" #include "blk-mq-sched.h" #include "blk-pm.h" #include "blk-wbt.h" +#include "blk-cgroup.h" static DEFINE_SPINLOCK(elv_list_lock); static LIST_HEAD(elv_list); @@ -57,7 +57,7 @@ static LIST_HEAD(elv_list); * Query io scheduler to see if the current process issuing bio may be * merged with rq. 
*/ -static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) +static bool elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) { struct request_queue *q = rq->q; struct elevator_queue *e = q->elevator; @@ -65,7 +65,7 @@ static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio) if (e->type->ops.allow_merge) return e->type->ops.allow_merge(q, rq, bio); - return 1; + return true; } /* @@ -83,83 +83,50 @@ bool elv_bio_merge_ok(struct request *rq, struct bio *bio) } EXPORT_SYMBOL(elv_bio_merge_ok); -static inline bool elv_support_features(unsigned int elv_features, - unsigned int required_features) +static inline bool elv_support_features(struct request_queue *q, + const struct elevator_type *e) { - return (required_features & elv_features) == required_features; + return (q->required_elevator_features & e->elevator_features) == + q->required_elevator_features; } /** - * elevator_match - Test an elevator name and features + * elevator_match - Check whether @e's name or alias matches @name * @e: Scheduler to test * @name: Elevator name to test - * @required_features: Features that the elevator must provide * - * Return true is the elevator @e name matches @name and if @e provides all the - * the feratures spcified by @required_features. + * Return true if the elevator @e's name or alias matches @name. 
*/ -static bool elevator_match(const struct elevator_type *e, const char *name, - unsigned int required_features) +static bool elevator_match(const struct elevator_type *e, const char *name) { - if (!elv_support_features(e->elevator_features, required_features)) - return false; - if (!strcmp(e->elevator_name, name)) - return true; - if (e->elevator_alias && !strcmp(e->elevator_alias, name)) - return true; - - return false; + return !strcmp(e->elevator_name, name) || + (e->elevator_alias && !strcmp(e->elevator_alias, name)); } -/** - * elevator_find - Find an elevator - * @name: Name of the elevator to find - * @required_features: Features that the elevator must provide - * - * Return the first registered scheduler with name @name and supporting the - * features @required_features and NULL otherwise. - */ -static struct elevator_type *elevator_find(const char *name, - unsigned int required_features) +static struct elevator_type *__elevator_find(const char *name) { struct elevator_type *e; - list_for_each_entry(e, &elv_list, list) { - if (elevator_match(e, name, required_features)) + list_for_each_entry(e, &elv_list, list) + if (elevator_match(e, name)) return e; - } - return NULL; } -static void elevator_put(struct elevator_type *e) -{ - module_put(e->elevator_owner); -} - -static struct elevator_type *elevator_get(struct request_queue *q, - const char *name, bool try_loading) +static struct elevator_type *elevator_find_get(struct request_queue *q, + const char *name) { struct elevator_type *e; spin_lock(&elv_list_lock); - - e = elevator_find(name, q->required_elevator_features); - if (!e && try_loading) { - spin_unlock(&elv_list_lock); - request_module("%s-iosched", name); - spin_lock(&elv_list_lock); - e = elevator_find(name, q->required_elevator_features); - } - - if (e && !try_module_get(e->elevator_owner)) + e = __elevator_find(name); + if (e && (!elv_support_features(q, e) || !elevator_tryget(e))) e = NULL; - spin_unlock(&elv_list_lock); return e; } -static 
struct kobj_type elv_ktype; +static const struct kobj_type elv_ktype; struct elevator_queue *elevator_alloc(struct request_queue *q, struct elevator_type *e) @@ -170,6 +137,7 @@ struct elevator_queue *elevator_alloc(struct request_queue *q, if (unlikely(!eq)) return NULL; + __elevator_get(e); eq->type = e; kobject_init(&eq->kobj, &elv_ktype); mutex_init(&eq->sysfs_lock); @@ -188,11 +156,15 @@ static void elevator_release(struct kobject *kobj) kfree(e); } -void __elevator_exit(struct request_queue *q, struct elevator_queue *e) +void elevator_exit(struct request_queue *q) { + struct elevator_queue *e = q->elevator; + + ioc_clear_queue(q); + blk_mq_sched_free_rqs(q); + mutex_lock(&e->sysfs_lock); - if (e->type->ops.exit_sched) - blk_mq_exit_sched(q, e); + blk_mq_exit_sched(q, e); mutex_unlock(&e->sysfs_lock); kobject_put(&e->kobj); @@ -337,6 +309,9 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, __rq = elv_rqhash_find(q, bio->bi_iter.bi_sector); if (__rq && elv_bio_merge_ok(__rq, bio)) { *req = __rq; + + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_BACK_MERGE; } @@ -351,9 +326,11 @@ enum elv_merge elv_merge(struct request_queue *q, struct request **req, * we can append 'rq' to an existing request, so we can throw 'rq' away * afterwards. * - * Returns true if we merged, false otherwise + * Returns true if we merged, false otherwise. 'free' will contain all + * requests that need to be freed. */ -bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) +bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq, + struct list_head *free) { struct request *__rq; bool ret; @@ -364,8 +341,10 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) /* * First try one-hit cache. 
*/ - if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) + if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq)) { + list_add(&rq->queuelist, free); return true; + } if (blk_queue_noxmerges(q)) return false; @@ -379,6 +358,7 @@ bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq) if (!__rq || !blk_attempt_req_merge(q, __rq, rq)) break; + list_add(&rq->queuelist, free); /* The merged request could be merged with others, try again */ ret = true; rq = __rq; @@ -475,22 +455,19 @@ static const struct sysfs_ops elv_sysfs_ops = { .store = elv_attr_store, }; -static struct kobj_type elv_ktype = { +static const struct kobj_type elv_ktype = { .sysfs_ops = &elv_sysfs_ops, .release = elevator_release, }; -/* - * elv_register_queue is called from either blk_register_queue or - * elevator_switch, elevator switch is prevented from being happen - * in the two paths, so it is safe to not hold q->sysfs_lock. - */ int elv_register_queue(struct request_queue *q, bool uevent) { struct elevator_queue *e = q->elevator; int error; - error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched"); + lockdep_assert_held(&q->sysfs_lock); + + error = kobject_add(&e->kobj, &q->disk->queue_kobj, "iosched"); if (!error) { struct elv_fs_entry *attr = e->type->elevator_attrs; if (attr) { @@ -503,32 +480,32 @@ int elv_register_queue(struct request_queue *q, bool uevent) if (uevent) kobject_uevent(&e->kobj, KOBJ_ADD); - e->registered = 1; + set_bit(ELEVATOR_FLAG_REGISTERED, &e->flags); } return error; } -/* - * elv_unregister_queue is called from either blk_unregister_queue or - * elevator_switch, elevator switch is prevented from being happen - * in the two paths, so it is safe to not hold q->sysfs_lock. 
- */ void elv_unregister_queue(struct request_queue *q) { - if (q) { - struct elevator_queue *e = q->elevator; + struct elevator_queue *e = q->elevator; + lockdep_assert_held(&q->sysfs_lock); + + if (e && test_and_clear_bit(ELEVATOR_FLAG_REGISTERED, &e->flags)) { kobject_uevent(&e->kobj, KOBJ_REMOVE); kobject_del(&e->kobj); - - e->registered = 0; - /* Re-enable throttling in case elevator disabled it */ - wbt_enable_default(q); } } int elv_register(struct elevator_type *e) { + /* finish request is mandatory */ + if (WARN_ON_ONCE(!e->ops.finish_request)) + return -EINVAL; + /* insert_requests and dispatch_request are mandatory */ + if (WARN_ON_ONCE(!e->ops.insert_requests || !e->ops.dispatch_request)) + return -EINVAL; + /* create icq_cache if requested */ if (e->icq_size) { if (WARN_ON(e->icq_size < sizeof(struct io_cq)) || @@ -545,7 +522,7 @@ int elv_register(struct elevator_type *e) /* register, don't allow duplicate names */ spin_lock(&elv_list_lock); - if (elevator_find(e->elevator_name, 0)) { + if (__elevator_find(e->elevator_name)) { spin_unlock(&elv_list_lock); kmem_cache_destroy(e->icq_cache); return -EBUSY; @@ -578,45 +555,9 @@ void elv_unregister(struct elevator_type *e) } EXPORT_SYMBOL_GPL(elv_unregister); -int elevator_switch_mq(struct request_queue *q, - struct elevator_type *new_e) -{ - int ret; - - lockdep_assert_held(&q->sysfs_lock); - - if (q->elevator) { - if (q->elevator->registered) - elv_unregister_queue(q); - - ioc_clear_queue(q); - elevator_exit(q, q->elevator); - } - - ret = blk_mq_init_sched(q, new_e); - if (ret) - goto out; - - if (new_e) { - ret = elv_register_queue(q, true); - if (ret) { - elevator_exit(q, q->elevator); - goto out; - } - } - - if (new_e) - blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); - else - blk_add_trace_msg(q, "elv switch: none"); - -out: - return ret; -} - static inline bool elv_support_iosched(struct request_queue *q) { - if (!q->mq_ops || + if (!queue_is_mq(q) || (q->tag_set && (q->tag_set->flags & 
BLK_MQ_F_NO_SCHED))) return false; return true; @@ -628,10 +569,14 @@ static inline bool elv_support_iosched(struct request_queue *q) */ static struct elevator_type *elevator_get_default(struct request_queue *q) { - if (q->nr_hw_queues != 1) + if (q->tag_set && q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return NULL; + + if (q->nr_hw_queues != 1 && + !blk_mq_is_shared_tags(q->tag_set->flags)) return NULL; - return elevator_get(q, "mq-deadline", false); + return elevator_find_get(q, "mq-deadline"); } /* @@ -645,14 +590,13 @@ static struct elevator_type *elevator_get_by_features(struct request_queue *q) spin_lock(&elv_list_lock); list_for_each_entry(e, &elv_list, list) { - if (elv_support_features(e->elevator_features, - q->required_elevator_features)) { + if (elv_support_features(q, e)) { found = e; break; } } - if (found && !try_module_get(found->elevator_owner)) + if (found && !elevator_tryget(found)) found = NULL; spin_unlock(&elv_list_lock); @@ -673,7 +617,7 @@ void elevator_init_mq(struct request_queue *q) if (!elv_support_iosched(q)) return; - WARN_ON_ONCE(test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags)); + WARN_ON_ONCE(blk_queue_registered(q)); if (unlikely(q->elevator)) return; @@ -685,127 +629,164 @@ void elevator_init_mq(struct request_queue *q) if (!e) return; + /* + * We are called before adding disk, when there isn't any FS I/O, + * so freezing queue plus canceling dispatch work is enough to + * drain any dispatch activities originated from passthrough + * requests, then no need to quiesce queue which may add long boot + * latency, especially when lots of disks are involved. + */ blk_mq_freeze_queue(q); - blk_mq_quiesce_queue(q); + blk_mq_cancel_work_sync(q); err = blk_mq_init_sched(q, e); - blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); if (err) { pr_warn("\"%s\" elevator initialization failed, " "falling back to \"none\"\n", e->elevator_name); - elevator_put(e); } -} + elevator_put(e); +} /* - * switch to new_e io scheduler. 
be careful not to introduce deadlocks - - * we don't free the old io scheduler, before we have allocated what we - * need for the new one. this way we have a chance of going back to the old - * one, if the new one fails init for some reason. + * Switch to new_e io scheduler. + * + * If switching fails, we are most likely running out of memory and not able + * to restore the old io scheduler, so leaving the io scheduler being none. */ -static int elevator_switch(struct request_queue *q, struct elevator_type *new_e) +int elevator_switch(struct request_queue *q, struct elevator_type *new_e) { - int err; + int ret; lockdep_assert_held(&q->sysfs_lock); blk_mq_freeze_queue(q); blk_mq_quiesce_queue(q); - err = elevator_switch_mq(q, new_e); + if (q->elevator) { + elv_unregister_queue(q); + elevator_exit(q); + } + + ret = blk_mq_init_sched(q, new_e); + if (ret) + goto out_unfreeze; + ret = elv_register_queue(q, true); + if (ret) { + elevator_exit(q); + goto out_unfreeze; + } + blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name); + +out_unfreeze: blk_mq_unquiesce_queue(q); blk_mq_unfreeze_queue(q); - return err; + if (ret) { + pr_warn("elv: switch to \"%s\" failed, falling back to \"none\"\n", + new_e->elevator_name); + } + + return ret; +} + +void elevator_disable(struct request_queue *q) +{ + lockdep_assert_held(&q->sysfs_lock); + + blk_mq_freeze_queue(q); + blk_mq_quiesce_queue(q); + + elv_unregister_queue(q); + elevator_exit(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + q->elevator = NULL; + q->nr_requests = q->tag_set->queue_depth; + blk_add_trace_msg(q, "elv switch: none"); + + blk_mq_unquiesce_queue(q); + blk_mq_unfreeze_queue(q); } /* * Switch this queue to the given IO scheduler. 
*/ -static int __elevator_change(struct request_queue *q, const char *name) +static int elevator_change(struct request_queue *q, const char *elevator_name) { - char elevator_name[ELV_NAME_MAX]; struct elevator_type *e; + int ret; /* Make sure queue is not in the middle of being removed */ if (!blk_queue_registered(q)) return -ENOENT; - /* - * Special case for mq, turn off scheduling - */ - if (!strncmp(name, "none", 4)) { - if (!q->elevator) - return 0; - return elevator_switch(q, NULL); + if (!strncmp(elevator_name, "none", 4)) { + if (q->elevator) + elevator_disable(q); + return 0; } - strlcpy(elevator_name, name, sizeof(elevator_name)); - e = elevator_get(q, strstrip(elevator_name), true); - if (!e) - return -EINVAL; - - if (q->elevator && - elevator_match(q->elevator->type, elevator_name, 0)) { - elevator_put(e); + if (q->elevator && elevator_match(q->elevator->type, elevator_name)) return 0; - } - return elevator_switch(q, e); + e = elevator_find_get(q, elevator_name); + if (!e) { + request_module("%s-iosched", elevator_name); + e = elevator_find_get(q, elevator_name); + if (!e) + return -EINVAL; + } + ret = elevator_switch(q, e); + elevator_put(e); + return ret; } -ssize_t elv_iosched_store(struct request_queue *q, const char *name, +ssize_t elv_iosched_store(struct request_queue *q, const char *buf, size_t count) { + char elevator_name[ELV_NAME_MAX]; int ret; - if (!queue_is_mq(q) || !elv_support_iosched(q)) + if (!elv_support_iosched(q)) return count; - ret = __elevator_change(q, name); + strscpy(elevator_name, buf, sizeof(elevator_name)); + ret = elevator_change(q, strstrip(elevator_name)); if (!ret) return count; - return ret; } ssize_t elv_iosched_show(struct request_queue *q, char *name) { - struct elevator_queue *e = q->elevator; - struct elevator_type *elv = NULL; - struct elevator_type *__e; + struct elevator_queue *eq = q->elevator; + struct elevator_type *cur = NULL, *e; int len = 0; - if (!queue_is_mq(q)) + if (!elv_support_iosched(q)) return 
sprintf(name, "none\n"); - if (!q->elevator) + if (!q->elevator) { len += sprintf(name+len, "[none] "); - else - elv = e->type; + } else { + len += sprintf(name+len, "none "); + cur = eq->type; + } spin_lock(&elv_list_lock); - list_for_each_entry(__e, &elv_list, list) { - if (elv && elevator_match(elv, __e->elevator_name, 0)) { - len += sprintf(name+len, "[%s] ", elv->elevator_name); - continue; - } - if (elv_support_iosched(q) && - elevator_match(__e, __e->elevator_name, - q->required_elevator_features)) - len += sprintf(name+len, "%s ", __e->elevator_name); + list_for_each_entry(e, &elv_list, list) { + if (e == cur) + len += sprintf(name+len, "[%s] ", e->elevator_name); + else if (elv_support_features(q, e)) + len += sprintf(name+len, "%s ", e->elevator_name); } spin_unlock(&elv_list_lock); - if (q->elevator) - len += sprintf(name+len, "none"); - - len += sprintf(len+name, "\n"); + len += sprintf(name+len, "\n"); return len; } diff --git a/block/elevator.h b/block/elevator.h new file mode 100644 index 000000000000..7ca3d7b6ed82 --- /dev/null +++ b/block/elevator.h @@ -0,0 +1,186 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ELEVATOR_H +#define _ELEVATOR_H + +#include <linux/percpu.h> +#include <linux/hashtable.h> +#include "blk-mq.h" + +struct io_cq; +struct elevator_type; +struct blk_mq_debugfs_attr; + +/* + * Return values from elevator merger + */ +enum elv_merge { + ELEVATOR_NO_MERGE = 0, + ELEVATOR_FRONT_MERGE = 1, + ELEVATOR_BACK_MERGE = 2, + ELEVATOR_DISCARD_MERGE = 3, +}; + +struct blk_mq_alloc_data; +struct blk_mq_hw_ctx; + +struct elevator_mq_ops { + int (*init_sched)(struct request_queue *, struct elevator_type *); + void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*depth_updated)(struct blk_mq_hw_ctx *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct request_queue 
*, struct bio *, unsigned int); + int (*request_merge)(struct request_queue *q, struct request **, struct bio *); + void (*request_merged)(struct request_queue *, struct request *, enum elv_merge); + void (*requests_merged)(struct request_queue *, struct request *, struct request *); + void (*limit_depth)(blk_opf_t, struct blk_mq_alloc_data *); + void (*prepare_request)(struct request *); + void (*finish_request)(struct request *); + void (*insert_requests)(struct blk_mq_hw_ctx *hctx, struct list_head *list, + blk_insert_t flags); + struct request *(*dispatch_request)(struct blk_mq_hw_ctx *); + bool (*has_work)(struct blk_mq_hw_ctx *); + void (*completed_request)(struct request *, u64); + void (*requeue_request)(struct request *); + struct request *(*former_request)(struct request_queue *, struct request *); + struct request *(*next_request)(struct request_queue *, struct request *); + void (*init_icq)(struct io_cq *); + void (*exit_icq)(struct io_cq *); +}; + +#define ELV_NAME_MAX (16) + +struct elv_fs_entry { + struct attribute attr; + ssize_t (*show)(struct elevator_queue *, char *); + ssize_t (*store)(struct elevator_queue *, const char *, size_t); +}; + +/* + * identifies an elevator type, such as AS or deadline + */ +struct elevator_type +{ + /* managed by elevator core */ + struct kmem_cache *icq_cache; + + /* fields provided by elevator implementation */ + struct elevator_mq_ops ops; + + size_t icq_size; /* see iocontext.h */ + size_t icq_align; /* ditto */ + struct elv_fs_entry *elevator_attrs; + const char *elevator_name; + const char *elevator_alias; + const unsigned int elevator_features; + struct module *elevator_owner; +#ifdef CONFIG_BLK_DEBUG_FS + const struct blk_mq_debugfs_attr *queue_debugfs_attrs; + const struct blk_mq_debugfs_attr *hctx_debugfs_attrs; +#endif + + /* managed by elevator core */ + char icq_cache_name[ELV_NAME_MAX + 6]; /* elvname + "_io_cq" */ + struct list_head list; +}; + +static inline bool elevator_tryget(struct elevator_type 
*e) +{ + return try_module_get(e->elevator_owner); +} + +static inline void __elevator_get(struct elevator_type *e) +{ + __module_get(e->elevator_owner); +} + +static inline void elevator_put(struct elevator_type *e) +{ + module_put(e->elevator_owner); +} + +#define ELV_HASH_BITS 6 + +void elv_rqhash_del(struct request_queue *q, struct request *rq); +void elv_rqhash_add(struct request_queue *q, struct request *rq); +void elv_rqhash_reposition(struct request_queue *q, struct request *rq); +struct request *elv_rqhash_find(struct request_queue *q, sector_t offset); + +/* + * each queue has an elevator_queue associated with it + */ +struct elevator_queue +{ + struct elevator_type *type; + void *elevator_data; + struct kobject kobj; + struct mutex sysfs_lock; + unsigned long flags; + DECLARE_HASHTABLE(hash, ELV_HASH_BITS); +}; + +#define ELEVATOR_FLAG_REGISTERED 0 +#define ELEVATOR_FLAG_DISABLE_WBT 1 + +/* + * block elevator interface + */ +extern enum elv_merge elv_merge(struct request_queue *, struct request **, + struct bio *); +extern void elv_merge_requests(struct request_queue *, struct request *, + struct request *); +extern void elv_merged_request(struct request_queue *, struct request *, + enum elv_merge); +extern bool elv_attempt_insert_merge(struct request_queue *, struct request *, + struct list_head *); +extern struct request *elv_former_request(struct request_queue *, struct request *); +extern struct request *elv_latter_request(struct request_queue *, struct request *); +void elevator_init_mq(struct request_queue *q); + +/* + * io scheduler registration + */ +extern int elv_register(struct elevator_type *); +extern void elv_unregister(struct elevator_type *); + +/* + * io scheduler sysfs switching + */ +extern ssize_t elv_iosched_show(struct request_queue *, char *); +extern ssize_t elv_iosched_store(struct request_queue *, const char *, size_t); + +extern bool elv_bio_merge_ok(struct request *, struct bio *); +extern struct elevator_queue 
*elevator_alloc(struct request_queue *, + struct elevator_type *); + +/* + * Helper functions. + */ +extern struct request *elv_rb_former_request(struct request_queue *, struct request *); +extern struct request *elv_rb_latter_request(struct request_queue *, struct request *); + +/* + * rb support functions. + */ +extern void elv_rb_add(struct rb_root *, struct request *); +extern void elv_rb_del(struct rb_root *, struct request *); +extern struct request *elv_rb_find(struct rb_root *, sector_t); + +/* + * Insertion selection + */ +#define ELEVATOR_INSERT_FRONT 1 +#define ELEVATOR_INSERT_BACK 2 +#define ELEVATOR_INSERT_SORT 3 +#define ELEVATOR_INSERT_REQUEUE 4 +#define ELEVATOR_INSERT_FLUSH 5 +#define ELEVATOR_INSERT_SORT_MERGE 6 + +#define rb_entry_rq(node) rb_entry((node), struct request, rb_node) + +#define rq_entry_fifo(ptr) list_entry((ptr), struct request, queuelist) +#define rq_fifo_clear(rq) list_del_init(&(rq)->queuelist) + +#endif /* _ELEVATOR_H */ diff --git a/block/fops.c b/block/fops.c new file mode 100644 index 000000000000..0cf8cf72cdfa --- /dev/null +++ b/block/fops.c @@ -0,0 +1,877 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (C) 1991, 1992 Linus Torvalds + * Copyright (C) 2001 Andrea Arcangeli <andrea@suse.de> SuSE + * Copyright (C) 2016 - 2020 Christoph Hellwig + */ +#include <linux/init.h> +#include <linux/mm.h> +#include <linux/blkdev.h> +#include <linux/buffer_head.h> +#include <linux/mpage.h> +#include <linux/uio.h> +#include <linux/namei.h> +#include <linux/task_io_accounting_ops.h> +#include <linux/falloc.h> +#include <linux/suspend.h> +#include <linux/fs.h> +#include <linux/iomap.h> +#include <linux/module.h> +#include "blk.h" + +static inline struct inode *bdev_file_inode(struct file *file) +{ + return file->f_mapping->host; +} + +static blk_opf_t dio_bio_write_op(struct kiocb *iocb) +{ + blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE; + + /* avoid the need for a I/O completion work item */ + if (iocb_is_dsync(iocb)) 
+ opf |= REQ_FUA; + return opf; +} + +static bool blkdev_dio_unaligned(struct block_device *bdev, loff_t pos, + struct iov_iter *iter) +{ + return pos & (bdev_logical_block_size(bdev) - 1) || + !bdev_iter_is_aligned(bdev, iter); +} + +#define DIO_INLINE_BIO_VECS 4 + +static ssize_t __blkdev_direct_IO_simple(struct kiocb *iocb, + struct iov_iter *iter, unsigned int nr_pages) +{ + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct bio_vec inline_vecs[DIO_INLINE_BIO_VECS], *vecs; + loff_t pos = iocb->ki_pos; + bool should_dirty = false; + struct bio bio; + ssize_t ret; + + if (blkdev_dio_unaligned(bdev, pos, iter)) + return -EINVAL; + + if (nr_pages <= DIO_INLINE_BIO_VECS) + vecs = inline_vecs; + else { + vecs = kmalloc_array(nr_pages, sizeof(struct bio_vec), + GFP_KERNEL); + if (!vecs) + return -ENOMEM; + } + + if (iov_iter_rw(iter) == READ) { + bio_init(&bio, bdev, vecs, nr_pages, REQ_OP_READ); + if (user_backed_iter(iter)) + should_dirty = true; + } else { + bio_init(&bio, bdev, vecs, nr_pages, dio_bio_write_op(iocb)); + } + bio.bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio.bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(&bio, iter); + if (unlikely(ret)) + goto out; + ret = bio.bi_iter.bi_size; + + if (iov_iter_rw(iter) == WRITE) + task_io_account_write(ret); + + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; + + submit_bio_wait(&bio); + + bio_release_pages(&bio, should_dirty); + if (unlikely(bio.bi_status)) + ret = blk_status_to_errno(bio.bi_status); + +out: + if (vecs != inline_vecs) + kfree(vecs); + + bio_uninit(&bio); + + return ret; +} + +enum { + DIO_SHOULD_DIRTY = 1, + DIO_IS_SYNC = 2, +}; + +struct blkdev_dio { + union { + struct kiocb *iocb; + struct task_struct *waiter; + }; + size_t size; + atomic_t ref; + unsigned int flags; + struct bio bio ____cacheline_aligned_in_smp; +}; + +static struct bio_set blkdev_dio_pool; + +static void blkdev_bio_end_io(struct bio *bio) +{ + struct blkdev_dio *dio = 
bio->bi_private; + bool should_dirty = dio->flags & DIO_SHOULD_DIRTY; + + if (bio->bi_status && !dio->bio.bi_status) + dio->bio.bi_status = bio->bi_status; + + if (atomic_dec_and_test(&dio->ref)) { + if (!(dio->flags & DIO_IS_SYNC)) { + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + WRITE_ONCE(iocb->private, NULL); + + if (likely(!dio->bio.bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(dio->bio.bi_status); + } + + dio->iocb->ki_complete(iocb, ret); + bio_put(&dio->bio); + } else { + struct task_struct *waiter = dio->waiter; + + WRITE_ONCE(dio->waiter, NULL); + blk_wake_io_task(waiter); + } + } + + if (should_dirty) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, + unsigned int nr_pages) +{ + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + struct blk_plug plug; + struct blkdev_dio *dio; + struct bio *bio; + bool is_read = (iov_iter_rw(iter) == READ), is_sync; + blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); + loff_t pos = iocb->ki_pos; + int ret = 0; + + if (blkdev_dio_unaligned(bdev, pos, iter)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); + dio = container_of(bio, struct blkdev_dio, bio); + atomic_set(&dio->ref, 1); + /* + * Grab an extra reference to ensure the dio structure which is embedded + * into the first bio stays around. 
+ */ + bio_get(bio); + + is_sync = is_sync_kiocb(iocb); + if (is_sync) { + dio->flags = DIO_IS_SYNC; + dio->waiter = current; + } else { + dio->flags = 0; + dio->iocb = iocb; + } + + dio->size = 0; + if (is_read && user_backed_iter(iter)) + dio->flags |= DIO_SHOULD_DIRTY; + + blk_start_plug(&plug); + + for (;;) { + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_private = dio; + bio->bi_end_io = blkdev_bio_end_io; + bio->bi_ioprio = iocb->ki_ioprio; + + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + break; + } + if (iocb->ki_flags & IOCB_NOWAIT) { + /* + * This is nonblocking IO, and we need to allocate + * another bio if we have data left to map. As we + * cannot guarantee that one of the sub bios will not + * fail getting issued FOR NOWAIT and as error results + * are coalesced across all of them, be safe and ask for + * a retry of this from blocking context. + */ + if (unlikely(iov_iter_count(iter))) { + bio_release_pages(bio, false); + bio_clear_flag(bio, BIO_REFFED); + bio_put(bio); + blk_finish_plug(&plug); + return -EAGAIN; + } + bio->bi_opf |= REQ_NOWAIT; + } + + if (is_read) { + if (dio->flags & DIO_SHOULD_DIRTY) + bio_set_pages_dirty(bio); + } else { + task_io_account_write(bio->bi_iter.bi_size); + } + dio->size += bio->bi_iter.bi_size; + pos += bio->bi_iter.bi_size; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS); + if (!nr_pages) { + submit_bio(bio); + break; + } + atomic_inc(&dio->ref); + submit_bio(bio); + bio = bio_alloc(bdev, nr_pages, opf, GFP_KERNEL); + } + + blk_finish_plug(&plug); + + if (!is_sync) + return -EIOCBQUEUED; + + for (;;) { + set_current_state(TASK_UNINTERRUPTIBLE); + if (!READ_ONCE(dio->waiter)) + break; + blk_io_schedule(); + } + __set_current_state(TASK_RUNNING); + + if (!ret) + ret = blk_status_to_errno(dio->bio.bi_status); + if (likely(!ret)) + ret = dio->size; + + bio_put(&dio->bio); + return ret; +} + +static void 
blkdev_bio_end_io_async(struct bio *bio) +{ + struct blkdev_dio *dio = container_of(bio, struct blkdev_dio, bio); + struct kiocb *iocb = dio->iocb; + ssize_t ret; + + WRITE_ONCE(iocb->private, NULL); + + if (likely(!bio->bi_status)) { + ret = dio->size; + iocb->ki_pos += ret; + } else { + ret = blk_status_to_errno(bio->bi_status); + } + + iocb->ki_complete(iocb, ret); + + if (dio->flags & DIO_SHOULD_DIRTY) { + bio_check_pages_dirty(bio); + } else { + bio_release_pages(bio, false); + bio_put(bio); + } +} + +static ssize_t __blkdev_direct_IO_async(struct kiocb *iocb, + struct iov_iter *iter, + unsigned int nr_pages) +{ + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + bool is_read = iov_iter_rw(iter) == READ; + blk_opf_t opf = is_read ? REQ_OP_READ : dio_bio_write_op(iocb); + struct blkdev_dio *dio; + struct bio *bio; + loff_t pos = iocb->ki_pos; + int ret = 0; + + if (blkdev_dio_unaligned(bdev, pos, iter)) + return -EINVAL; + + if (iocb->ki_flags & IOCB_ALLOC_CACHE) + opf |= REQ_ALLOC_CACHE; + bio = bio_alloc_bioset(bdev, nr_pages, opf, GFP_KERNEL, + &blkdev_dio_pool); + dio = container_of(bio, struct blkdev_dio, bio); + dio->flags = 0; + dio->iocb = iocb; + bio->bi_iter.bi_sector = pos >> SECTOR_SHIFT; + bio->bi_end_io = blkdev_bio_end_io_async; + bio->bi_ioprio = iocb->ki_ioprio; + + if (iov_iter_is_bvec(iter)) { + /* + * Users don't rely on the iterator being in any particular + * state for async I/O returning -EIOCBQUEUED, hence we can + * avoid expensive iov_iter_advance(). Bypass + * bio_iov_iter_get_pages() and set the bvec directly. 
+ */ + bio_iov_bvec_set(bio, iter); + } else { + ret = bio_iov_iter_get_pages(bio, iter); + if (unlikely(ret)) { + bio_put(bio); + return ret; + } + } + dio->size = bio->bi_iter.bi_size; + + if (is_read) { + if (user_backed_iter(iter)) { + dio->flags |= DIO_SHOULD_DIRTY; + bio_set_pages_dirty(bio); + } + } else { + task_io_account_write(bio->bi_iter.bi_size); + } + + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; + + if (iocb->ki_flags & IOCB_HIPRI) { + bio->bi_opf |= REQ_POLLED; + submit_bio(bio); + WRITE_ONCE(iocb->private, bio); + } else { + submit_bio(bio); + } + return -EIOCBQUEUED; +} + +static ssize_t blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +{ + unsigned int nr_pages; + + if (!iov_iter_count(iter)) + return 0; + + nr_pages = bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS + 1); + if (likely(nr_pages <= BIO_MAX_VECS)) { + if (is_sync_kiocb(iocb)) + return __blkdev_direct_IO_simple(iocb, iter, nr_pages); + return __blkdev_direct_IO_async(iocb, iter, nr_pages); + } + return __blkdev_direct_IO(iocb, iter, bio_max_segs(nr_pages)); +} + +static int blkdev_iomap_begin(struct inode *inode, loff_t offset, loff_t length, + unsigned int flags, struct iomap *iomap, struct iomap *srcmap) +{ + struct block_device *bdev = I_BDEV(inode); + loff_t isize = i_size_read(inode); + + iomap->bdev = bdev; + iomap->offset = ALIGN_DOWN(offset, bdev_logical_block_size(bdev)); + if (iomap->offset >= isize) + return -EIO; + iomap->type = IOMAP_MAPPED; + iomap->addr = iomap->offset; + iomap->length = isize - iomap->offset; + iomap->flags |= IOMAP_F_BUFFER_HEAD; /* noop for !CONFIG_BUFFER_HEAD */ + return 0; +} + +static const struct iomap_ops blkdev_iomap_ops = { + .iomap_begin = blkdev_iomap_begin, +}; + +#ifdef CONFIG_BUFFER_HEAD +static int blkdev_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh, int create) +{ + bh->b_bdev = I_BDEV(inode); + bh->b_blocknr = iblock; + set_buffer_mapped(bh); + return 0; +} + +/* + * We cannot call 
mpage_writepages() as it does not take the buffer lock. + * We must use block_write_full_folio() directly which holds the buffer + * lock. The buffer lock provides the synchronisation with writeback + * that filesystems rely on when they use the blockdev's mapping. + */ +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct blk_plug plug; + int err; + + blk_start_plug(&plug); + err = write_cache_pages(mapping, wbc, block_write_full_folio, + blkdev_get_block); + blk_finish_plug(&plug); + + return err; +} + +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return block_read_full_folio(folio, blkdev_get_block); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + mpage_readahead(rac, blkdev_get_block); +} + +static int blkdev_write_begin(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, struct page **pagep, void **fsdata) +{ + return block_write_begin(mapping, pos, len, pagep, blkdev_get_block); +} + +static int blkdev_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, struct page *page, + void *fsdata) +{ + int ret; + ret = block_write_end(file, mapping, pos, len, copied, page, fsdata); + + unlock_page(page); + put_page(page); + + return ret; +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = block_dirty_folio, + .invalidate_folio = block_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .write_begin = blkdev_write_begin, + .write_end = blkdev_write_end, + .migrate_folio = buffer_migrate_folio_norefs, + .is_dirty_writeback = buffer_check_dirty_writeback, +}; +#else /* CONFIG_BUFFER_HEAD */ +static int blkdev_read_folio(struct file *file, struct folio *folio) +{ + return iomap_read_folio(folio, &blkdev_iomap_ops); +} + +static void blkdev_readahead(struct readahead_control *rac) +{ + 
iomap_readahead(rac, &blkdev_iomap_ops); +} + +static int blkdev_map_blocks(struct iomap_writepage_ctx *wpc, + struct inode *inode, loff_t offset) +{ + loff_t isize = i_size_read(inode); + + if (WARN_ON_ONCE(offset >= isize)) + return -EIO; + if (offset >= wpc->iomap.offset && + offset < wpc->iomap.offset + wpc->iomap.length) + return 0; + return blkdev_iomap_begin(inode, offset, isize - offset, + IOMAP_WRITE, &wpc->iomap, NULL); +} + +static const struct iomap_writeback_ops blkdev_writeback_ops = { + .map_blocks = blkdev_map_blocks, +}; + +static int blkdev_writepages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct iomap_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc, &blkdev_writeback_ops); +} + +const struct address_space_operations def_blk_aops = { + .dirty_folio = filemap_dirty_folio, + .release_folio = iomap_release_folio, + .invalidate_folio = iomap_invalidate_folio, + .read_folio = blkdev_read_folio, + .readahead = blkdev_readahead, + .writepages = blkdev_writepages, + .is_partially_uptodate = iomap_is_partially_uptodate, + .error_remove_folio = generic_error_remove_folio, + .migrate_folio = filemap_migrate_folio, +}; +#endif /* CONFIG_BUFFER_HEAD */ + +/* + * for a block special file file_inode(file)->i_size is zero + * so we compute the size by hand (just as in block_read/write above) + */ +static loff_t blkdev_llseek(struct file *file, loff_t offset, int whence) +{ + struct inode *bd_inode = bdev_file_inode(file); + loff_t retval; + + inode_lock(bd_inode); + retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); + inode_unlock(bd_inode); + return retval; +} + +static int blkdev_fsync(struct file *filp, loff_t start, loff_t end, + int datasync) +{ + struct block_device *bdev = I_BDEV(filp->f_mapping->host); + int error; + + error = file_write_and_wait_range(filp, start, end); + if (error) + return error; + + /* + * There is no need to serialise calls to blkdev_issue_flush with + * 
i_mutex and doing so causes performance issues with concurrent + * O_SYNC writers to a block device. + */ + error = blkdev_issue_flush(bdev); + if (error == -EOPNOTSUPP) + error = 0; + + return error; +} + +/** + * file_to_blk_mode - get block open flags from file flags + * @file: file whose open flags should be converted + * + * Look at file open flags and generate corresponding block open flags from + * them. The function works both for file just being open (e.g. during ->open + * callback) and for file that is already open. This is actually non-trivial + * (see comment in the function). + */ +blk_mode_t file_to_blk_mode(struct file *file) +{ + blk_mode_t mode = 0; + struct bdev_handle *handle = file->private_data; + + if (file->f_mode & FMODE_READ) + mode |= BLK_OPEN_READ; + if (file->f_mode & FMODE_WRITE) + mode |= BLK_OPEN_WRITE; + /* + * do_dentry_open() clears O_EXCL from f_flags, use handle->mode to + * determine whether the open was exclusive for already open files. + */ + if (handle) + mode |= handle->mode & BLK_OPEN_EXCL; + else if (file->f_flags & O_EXCL) + mode |= BLK_OPEN_EXCL; + if (file->f_flags & O_NDELAY) + mode |= BLK_OPEN_NDELAY; + + /* + * If all bits in O_ACCMODE set (aka O_RDWR | O_WRONLY), the floppy + * driver has historically allowed ioctls as if the file was opened for + * writing, but does not allow and actual reads or writes. + */ + if ((file->f_flags & O_ACCMODE) == (O_RDWR | O_WRONLY)) + mode |= BLK_OPEN_WRITE_IOCTL; + + return mode; +} + +static int blkdev_open(struct inode *inode, struct file *filp) +{ + struct bdev_handle *handle; + blk_mode_t mode; + + /* + * Preserve backwards compatibility and allow large file access + * even if userspace doesn't ask for it explicitly. Some mkfs + * binary needs it. We might want to drop this workaround + * during an unstable branch. 
+ */ + filp->f_flags |= O_LARGEFILE; + filp->f_mode |= FMODE_BUF_RASYNC | FMODE_CAN_ODIRECT; + + mode = file_to_blk_mode(filp); + handle = bdev_open_by_dev(inode->i_rdev, mode, + mode & BLK_OPEN_EXCL ? filp : NULL, NULL); + if (IS_ERR(handle)) + return PTR_ERR(handle); + + if (bdev_nowait(handle->bdev)) + filp->f_mode |= FMODE_NOWAIT; + + filp->f_mapping = handle->bdev->bd_inode->i_mapping; + filp->f_wb_err = filemap_sample_wb_err(filp->f_mapping); + filp->private_data = handle; + return 0; +} + +static int blkdev_release(struct inode *inode, struct file *filp) +{ + bdev_release(filp->private_data); + return 0; +} + +static ssize_t +blkdev_direct_write(struct kiocb *iocb, struct iov_iter *from) +{ + size_t count = iov_iter_count(from); + ssize_t written; + + written = kiocb_invalidate_pages(iocb, count); + if (written) { + if (written == -EBUSY) + return 0; + return written; + } + + written = blkdev_direct_IO(iocb, from); + if (written > 0) { + kiocb_invalidate_post_direct_write(iocb, count); + iocb->ki_pos += written; + count -= written; + } + if (written != -EIOCBQUEUED) + iov_iter_revert(from, count - iov_iter_count(from)); + return written; +} + +static ssize_t blkdev_buffered_write(struct kiocb *iocb, struct iov_iter *from) +{ + return iomap_file_buffered_write(iocb, from, &blkdev_iomap_ops); +} + +/* + * Write data to the block device. Only intended for the block device itself + * and the raw driver which basically is a fake block device. + * + * Does not take i_mutex for the write and thus is not for general purpose + * use. 
+ */ +static ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct block_device *bdev = I_BDEV(file->f_mapping->host); + struct inode *bd_inode = bdev->bd_inode; + loff_t size = bdev_nr_bytes(bdev); + size_t shorted = 0; + ssize_t ret; + + if (bdev_read_only(bdev)) + return -EPERM; + + if (IS_SWAPFILE(bd_inode) && !is_hibernate_resume_dev(bd_inode->i_rdev)) + return -ETXTBSY; + + if (!iov_iter_count(from)) + return 0; + + if (iocb->ki_pos >= size) + return -ENOSPC; + + if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) + return -EOPNOTSUPP; + + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } + + ret = file_update_time(file); + if (ret) + return ret; + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = blkdev_direct_write(iocb, from); + if (ret >= 0 && iov_iter_count(from)) + ret = direct_write_fallback(iocb, from, ret, + blkdev_buffered_write(iocb, from)); + } else { + ret = blkdev_buffered_write(iocb, from); + } + + if (ret > 0) + ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); + return ret; +} + +static ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + struct block_device *bdev = I_BDEV(iocb->ki_filp->f_mapping->host); + loff_t size = bdev_nr_bytes(bdev); + loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret = 0; + size_t count; + + if (unlikely(pos + iov_iter_count(to) > size)) { + if (pos >= size) + return 0; + size -= pos; + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + count = iov_iter_count(to); + if (!count) + goto reexpand; /* skip atime */ + + if (iocb->ki_flags & IOCB_DIRECT) { + ret = kiocb_write_and_wait(iocb, count); + if (ret < 0) + goto reexpand; + file_accessed(iocb->ki_filp); + + ret = blkdev_direct_IO(iocb, to); + if (ret >= 0) { + iocb->ki_pos += ret; + count -= ret; + } + 
iov_iter_revert(to, count - iov_iter_count(to)); + if (ret < 0 || !count) + goto reexpand; + } + + ret = filemap_read(iocb, to, ret); + +reexpand: + if (unlikely(shorted)) + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; +} + +#define BLKDEV_FALLOC_FL_SUPPORTED \ + (FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE | \ + FALLOC_FL_ZERO_RANGE | FALLOC_FL_NO_HIDE_STALE) + +static long blkdev_fallocate(struct file *file, int mode, loff_t start, + loff_t len) +{ + struct inode *inode = bdev_file_inode(file); + struct block_device *bdev = I_BDEV(inode); + loff_t end = start + len - 1; + loff_t isize; + int error; + + /* Fail if we don't recognize the flags. */ + if (mode & ~BLKDEV_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* Don't go off the end of the device. */ + isize = bdev_nr_bytes(bdev); + if (start >= isize) + return -EINVAL; + if (end >= isize) { + if (mode & FALLOC_FL_KEEP_SIZE) { + len = isize - start; + end = start + len - 1; + } else + return -EINVAL; + } + + /* + * Don't allow IO that isn't aligned to logical block size. + */ + if ((start | len) & (bdev_logical_block_size(bdev) - 1)) + return -EINVAL; + + filemap_invalidate_lock(inode->i_mapping); + + /* + * Invalidate the page cache, including dirty pages, for valid + * de-allocate mode calls to fallocate(). 
+ */ + switch (mode) { + case FALLOC_FL_ZERO_RANGE: + case FALLOC_FL_ZERO_RANGE | FALLOC_FL_KEEP_SIZE: + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_zeroout(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL, + BLKDEV_ZERO_NOFALLBACK); + break; + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE | FALLOC_FL_NO_HIDE_STALE: + error = truncate_bdev_range(bdev, file_to_blk_mode(file), start, end); + if (error) + goto fail; + + error = blkdev_issue_discard(bdev, start >> SECTOR_SHIFT, + len >> SECTOR_SHIFT, GFP_KERNEL); + break; + default: + error = -EOPNOTSUPP; + } + + fail: + filemap_invalidate_unlock(inode->i_mapping); + return error; +} + +static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) +{ + struct inode *bd_inode = bdev_file_inode(file); + + if (bdev_read_only(I_BDEV(bd_inode))) + return generic_file_readonly_mmap(file, vma); + + return generic_file_mmap(file, vma); +} + +const struct file_operations def_blk_fops = { + .open = blkdev_open, + .release = blkdev_release, + .llseek = blkdev_llseek, + .read_iter = blkdev_read_iter, + .write_iter = blkdev_write_iter, + .iopoll = iocb_bio_iopoll, + .mmap = blkdev_mmap, + .fsync = blkdev_fsync, + .unlocked_ioctl = blkdev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = compat_blkdev_ioctl, +#endif + .splice_read = filemap_splice_read, + .splice_write = iter_file_splice_write, + .fallocate = blkdev_fallocate, +}; + +static __init int blkdev_init(void) +{ + return bioset_init(&blkdev_dio_pool, 4, + offsetof(struct blkdev_dio, bio), + BIOSET_NEED_BVECS|BIOSET_PERCPU_CACHE); +} +module_init(blkdev_init); diff --git a/block/genhd.c b/block/genhd.c 
index 1a7659327664..d74fb5b4ae68 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -1,12 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 /* * gendisk handling + * + * Portions Copyright (C) 2020 Christoph Hellwig */ #include <linux/module.h> #include <linux/ctype.h> #include <linux/fs.h> -#include <linux/genhd.h> #include <linux/kdev_t.h> #include <linux/kernel.h> #include <linux/blkdev.h> @@ -17,88 +18,93 @@ #include <linux/seq_file.h> #include <linux/slab.h> #include <linux/kmod.h> -#include <linux/kobj_map.h> +#include <linux/major.h> #include <linux/mutex.h> #include <linux/idr.h> #include <linux/log2.h> #include <linux/pm_runtime.h> #include <linux/badblocks.h> +#include <linux/part_stat.h> +#include <linux/blktrace_api.h> +#include "blk-throttle.h" #include "blk.h" +#include "blk-mq-sched.h" +#include "blk-rq-qos.h" +#include "blk-cgroup.h" -static DEFINE_MUTEX(block_class_lock); static struct kobject *block_depr; -/* for extended dynamic devt allocation, currently only one major is used */ -#define NR_EXT_DEVT (1 << MINORBITS) - -/* For extended devt allocation. ext_devt_lock prevents look up - * results from going away underneath its user. +/* + * Unique, monotonically increasing sequential number associated with block + * devices instances (i.e. incremented each time a device is attached). + * Associating uevents with block devices in userspace is difficult and racy: + * the uevent netlink socket is lossy, and on slow and overloaded systems has + * a very high latency. + * Block devices do not have exclusive owners in userspace, any process can set + * one up (e.g. loop devices). Moreover, device names can be reused (e.g. loop0 + * can be reused again and again). + * A userspace process setting up a block device and watching for its events + * cannot thus reliably tell whether an event relates to the device it just set + * up or another earlier instance with the same name. 
+ * This sequential number allows userspace processes to solve this problem, and + * uniquely associate an uevent to the lifetime to a device. */ -static DEFINE_SPINLOCK(ext_devt_lock); -static DEFINE_IDR(ext_devt_idr); +static atomic64_t diskseq; -static const struct device_type disk_type; +/* for extended dynamic devt allocation, currently only one major is used */ +#define NR_EXT_DEVT (1 << MINORBITS) +static DEFINE_IDA(ext_devt_ida); -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr); -static void disk_alloc_events(struct gendisk *disk); -static void disk_add_events(struct gendisk *disk); -static void disk_del_events(struct gendisk *disk); -static void disk_release_events(struct gendisk *disk); +void set_capacity(struct gendisk *disk, sector_t sectors) +{ + bdev_set_nr_sectors(disk->part0, sectors); +} +EXPORT_SYMBOL(set_capacity); /* - * Set disk capacity and notify if the size is not currently - * zero and will not be set to zero + * Set disk capacity and notify if the size is not currently zero and will not + * be set to zero. Returns true if a uevent was sent, otherwise false. */ -void set_capacity_revalidate_and_notify(struct gendisk *disk, sector_t size, - bool revalidate) +bool set_capacity_and_notify(struct gendisk *disk, sector_t size) { sector_t capacity = get_capacity(disk); + char *envp[] = { "RESIZE=1", NULL }; set_capacity(disk, size); - if (revalidate) - revalidate_disk(disk); - - if (capacity != size && capacity != 0 && size != 0) { - char *envp[] = { "RESIZE=1", NULL }; - - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); - } -} - -EXPORT_SYMBOL_GPL(set_capacity_revalidate_and_notify); - -/* - * Format the device name of the indicated disk into the supplied buffer and - * return a pointer to that same buffer for convenience. 
- */ -char *disk_name(struct gendisk *hd, int partno, char *buf) -{ - if (!partno) - snprintf(buf, BDEVNAME_SIZE, "%s", hd->disk_name); - else if (isdigit(hd->disk_name[strlen(hd->disk_name)-1])) - snprintf(buf, BDEVNAME_SIZE, "%sp%d", hd->disk_name, partno); - else - snprintf(buf, BDEVNAME_SIZE, "%s%d", hd->disk_name, partno); + /* + * Only print a message and send a uevent if the gendisk is user visible + * and alive. This avoids spamming the log and udev when setting the + * initial capacity during probing. + */ + if (size == capacity || + !disk_live(disk) || + (disk->flags & GENHD_FL_HIDDEN)) + return false; - return buf; -} + pr_info("%s: detected capacity change from %lld to %lld\n", + disk->disk_name, capacity, size); -const char *bdevname(struct block_device *bdev, char *buf) -{ - return disk_name(bdev->bd_disk, bdev->bd_part->partno, buf); + /* + * Historically we did not send a uevent for changes to/from an empty + * device. + */ + if (!capacity || !size) + return false; + kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); + return true; } -EXPORT_SYMBOL(bdevname); +EXPORT_SYMBOL_GPL(set_capacity_and_notify); -static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) +static void part_stat_read_all(struct block_device *part, + struct disk_stats *stat) { int cpu; memset(stat, 0, sizeof(struct disk_stats)); for_each_possible_cpu(cpu) { - struct disk_stats *ptr = per_cpu_ptr(part->dkstats, cpu); + struct disk_stats *ptr = per_cpu_ptr(part->bd_stats, cpu); int group; for (group = 0; group < NR_STAT_GROUPS; group++) { @@ -112,8 +118,7 @@ static void part_stat_read_all(struct hd_struct *part, struct disk_stats *stat) } } -static unsigned int part_in_flight(struct request_queue *q, - struct hd_struct *part) +static unsigned int part_in_flight(struct block_device *part) { unsigned int inflight = 0; int cpu; @@ -128,7 +133,7 @@ static unsigned int part_in_flight(struct request_queue *q, return inflight; } -static void 
part_in_flight_rw(struct request_queue *q, struct hd_struct *part, +static void part_in_flight_rw(struct block_device *part, unsigned int inflight[2]) { int cpu; @@ -145,247 +150,6 @@ static void part_in_flight_rw(struct request_queue *q, struct hd_struct *part, inflight[1] = 0; } -struct hd_struct *__disk_get_part(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *ptbl = rcu_dereference(disk->part_tbl); - - if (unlikely(partno < 0 || partno >= ptbl->len)) - return NULL; - return rcu_dereference(ptbl->part[partno]); -} - -/** - * disk_get_part - get partition - * @disk: disk to look partition from - * @partno: partition number - * - * Look for partition @partno from @disk. If found, increment - * reference count and return it. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Pointer to the found partition on success, NULL if not found. - */ -struct hd_struct *disk_get_part(struct gendisk *disk, int partno) -{ - struct hd_struct *part; - - rcu_read_lock(); - part = __disk_get_part(disk, partno); - if (part) - get_device(part_to_dev(part)); - rcu_read_unlock(); - - return part; -} - -/** - * disk_part_iter_init - initialize partition iterator - * @piter: iterator to initialize - * @disk: disk to iterate over - * @flags: DISK_PITER_* flags - * - * Initialize @piter so that it iterates over partitions of @disk. - * - * CONTEXT: - * Don't care. 
- */ -void disk_part_iter_init(struct disk_part_iter *piter, struct gendisk *disk, - unsigned int flags) -{ - struct disk_part_tbl *ptbl; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - piter->disk = disk; - piter->part = NULL; - - if (flags & DISK_PITER_REVERSE) - piter->idx = ptbl->len - 1; - else if (flags & (DISK_PITER_INCL_PART0 | DISK_PITER_INCL_EMPTY_PART0)) - piter->idx = 0; - else - piter->idx = 1; - - piter->flags = flags; - - rcu_read_unlock(); -} -EXPORT_SYMBOL_GPL(disk_part_iter_init); - -/** - * disk_part_iter_next - proceed iterator to the next partition and return it - * @piter: iterator of interest - * - * Proceed @piter to the next partition and return it. - * - * CONTEXT: - * Don't care. - */ -struct hd_struct *disk_part_iter_next(struct disk_part_iter *piter) -{ - struct disk_part_tbl *ptbl; - int inc, end; - - /* put the last partition */ - disk_put_part(piter->part); - piter->part = NULL; - - /* get part_tbl */ - rcu_read_lock(); - ptbl = rcu_dereference(piter->disk->part_tbl); - - /* determine iteration parameters */ - if (piter->flags & DISK_PITER_REVERSE) { - inc = -1; - if (piter->flags & (DISK_PITER_INCL_PART0 | - DISK_PITER_INCL_EMPTY_PART0)) - end = -1; - else - end = 0; - } else { - inc = 1; - end = ptbl->len; - } - - /* iterate to the next partition */ - for (; piter->idx != end; piter->idx += inc) { - struct hd_struct *part; - - part = rcu_dereference(ptbl->part[piter->idx]); - if (!part) - continue; - if (!part_nr_sects_read(part) && - !(piter->flags & DISK_PITER_INCL_EMPTY) && - !(piter->flags & DISK_PITER_INCL_EMPTY_PART0 && - piter->idx == 0)) - continue; - - get_device(part_to_dev(part)); - piter->part = part; - piter->idx += inc; - break; - } - - rcu_read_unlock(); - - return piter->part; -} -EXPORT_SYMBOL_GPL(disk_part_iter_next); - -/** - * disk_part_iter_exit - finish up partition iteration - * @piter: iter of interest - * - * Called when iteration is over. Cleans up @piter. 
- * - * CONTEXT: - * Don't care. - */ -void disk_part_iter_exit(struct disk_part_iter *piter) -{ - disk_put_part(piter->part); - piter->part = NULL; -} -EXPORT_SYMBOL_GPL(disk_part_iter_exit); - -static inline int sector_in_part(struct hd_struct *part, sector_t sector) -{ - return part->start_sect <= sector && - sector < part->start_sect + part_nr_sects_read(part); -} - -/** - * disk_map_sector_rcu - map sector to partition - * @disk: gendisk of interest - * @sector: sector to map - * - * Find out which partition @sector maps to on @disk. This is - * primarily used for stats accounting. - * - * CONTEXT: - * RCU read locked. The returned partition pointer is always valid - * because its refcount is grabbed except for part0, which lifetime - * is same with the disk. - * - * RETURNS: - * Found partition on success, part0 is returned if no partition matches - * or the matched partition is being deleted. - */ -struct hd_struct *disk_map_sector_rcu(struct gendisk *disk, sector_t sector) -{ - struct disk_part_tbl *ptbl; - struct hd_struct *part; - int i; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - part = rcu_dereference(ptbl->last_lookup); - if (part && sector_in_part(part, sector) && hd_struct_try_get(part)) - goto out_unlock; - - for (i = 1; i < ptbl->len; i++) { - part = rcu_dereference(ptbl->part[i]); - - if (part && sector_in_part(part, sector)) { - /* - * only live partition can be cached for lookup, - * so use-after-free on cached & deleting partition - * can be avoided - */ - if (!hd_struct_try_get(part)) - break; - rcu_assign_pointer(ptbl->last_lookup, part); - goto out_unlock; - } - } - - part = &disk->part0; -out_unlock: - rcu_read_unlock(); - return part; -} - -/** - * disk_has_partitions - * @disk: gendisk of interest - * - * Walk through the partition table and check if valid partition exists. - * - * CONTEXT: - * Don't care. - * - * RETURNS: - * True if the gendisk has at least one valid non-zero size partition. - * Otherwise false. 
- */ -bool disk_has_partitions(struct gendisk *disk) -{ - struct disk_part_tbl *ptbl; - int i; - bool ret = false; - - rcu_read_lock(); - ptbl = rcu_dereference(disk->part_tbl); - - /* Iterate partitions skipping the whole device at index 0 */ - for (i = 1; i < ptbl->len; i++) { - if (rcu_dereference(ptbl->part[i])) { - ret = true; - break; - } - } - - rcu_read_unlock(); - - return ret; -} -EXPORT_SYMBOL_GPL(disk_has_partitions); - /* * Can be deleted altogether. Later. * @@ -395,7 +159,12 @@ static struct blk_major_name { struct blk_major_name *next; int major; char name[16]; +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD + void (*probe)(dev_t devt); +#endif } *major_names[BLKDEV_MAJOR_HASH_SIZE]; +static DEFINE_MUTEX(major_names_lock); +static DEFINE_SPINLOCK(major_names_spinlock); /* index in the above - for now: assume no multimajor ranges */ static inline int major_to_index(unsigned major) @@ -408,20 +177,24 @@ void blkdev_show(struct seq_file *seqf, off_t offset) { struct blk_major_name *dp; - mutex_lock(&block_class_lock); + spin_lock(&major_names_spinlock); for (dp = major_names[major_to_index(offset)]; dp; dp = dp->next) if (dp->major == offset) seq_printf(seqf, "%3d %s\n", dp->major, dp->name); - mutex_unlock(&block_class_lock); + spin_unlock(&major_names_spinlock); } #endif /* CONFIG_PROC_FS */ /** - * register_blkdev - register a new block device + * __register_blkdev - register a new block device * * @major: the requested major device number [1..BLKDEV_MAJOR_MAX-1]. If * @major = 0, try to allocate any unused major number. * @name: the name of the new block device as a zero terminated string + * @probe: pre-devtmpfs / pre-udev callback used to create disks when their + * pre-created device node is accessed. When a probe call uses + * add_disk() and it fails the driver must cleanup resources. This + * interface may soon be removed. * * The @name must be unique within the system. 
* @@ -435,13 +208,16 @@ void blkdev_show(struct seq_file *seqf, off_t offset) * * See Documentation/admin-guide/devices.txt for the list of allocated * major numbers. + * + * Use register_blkdev instead for any new code. */ -int register_blkdev(unsigned int major, const char *name) +int __register_blkdev(unsigned int major, const char *name, + void (*probe)(dev_t devt)) { struct blk_major_name **n, *p; int index, ret = 0; - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); /* temporary */ if (major == 0) { @@ -475,10 +251,14 @@ int register_blkdev(unsigned int major, const char *name) } p->major = major; - strlcpy(p->name, name, sizeof(p->name)); +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD + p->probe = probe; +#endif + strscpy(p->name, name, sizeof(p->name)); p->next = NULL; index = major_to_index(major); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) { if ((*n)->major == major) break; @@ -487,6 +267,7 @@ int register_blkdev(unsigned int major, const char *name) *n = p; else ret = -EBUSY; + spin_unlock(&major_names_spinlock); if (ret < 0) { printk("register_blkdev: cannot get major %u for %s\n", @@ -494,11 +275,10 @@ int register_blkdev(unsigned int major, const char *name) kfree(p); } out: - mutex_unlock(&block_class_lock); + mutex_unlock(&major_names_lock); return ret; } - -EXPORT_SYMBOL(register_blkdev); +EXPORT_SYMBOL(__register_blkdev); void unregister_blkdev(unsigned int major, const char *name) { @@ -506,7 +286,8 @@ void unregister_blkdev(unsigned int major, const char *name) struct blk_major_name *p = NULL; int index = major_to_index(major); - mutex_lock(&block_class_lock); + mutex_lock(&major_names_lock); + spin_lock(&major_names_spinlock); for (n = &major_names[index]; *n; n = &(*n)->next) if ((*n)->major == major) break; @@ -516,198 +297,176 @@ void unregister_blkdev(unsigned int major, const char *name) p = *n; *n = p->next; } - mutex_unlock(&block_class_lock); + spin_unlock(&major_names_spinlock); + 
mutex_unlock(&major_names_lock); kfree(p); } EXPORT_SYMBOL(unregister_blkdev); -static struct kobj_map *bdev_map; - -/** - * blk_mangle_minor - scatter minor numbers apart - * @minor: minor number to mangle - * - * Scatter consecutively allocated @minor number apart if MANGLE_DEVT - * is enabled. Mangling twice gives the original value. - * - * RETURNS: - * Mangled value. - * - * CONTEXT: - * Don't care. - */ -static int blk_mangle_minor(int minor) +int blk_alloc_ext_minor(void) { -#ifdef CONFIG_DEBUG_BLOCK_EXT_DEVT - int i; - - for (i = 0; i < MINORBITS / 2; i++) { - int low = minor & (1 << i); - int high = minor & (1 << (MINORBITS - 1 - i)); - int distance = MINORBITS - 1 - 2 * i; + int idx; - minor ^= low | high; /* clear both bits */ - low <<= distance; /* swap the positions */ - high >>= distance; - minor |= low | high; /* and set */ - } -#endif - return minor; + idx = ida_alloc_range(&ext_devt_ida, 0, NR_EXT_DEVT - 1, GFP_KERNEL); + if (idx == -ENOSPC) + return -EBUSY; + return idx; } -/** - * blk_alloc_devt - allocate a dev_t for a partition - * @part: partition to allocate dev_t for - * @devt: out parameter for resulting dev_t - * - * Allocate a dev_t for block device. - * - * RETURNS: - * 0 on success, allocated dev_t is returned in *@devt. -errno on - * failure. - * - * CONTEXT: - * Might sleep. - */ -int blk_alloc_devt(struct hd_struct *part, dev_t *devt) +void blk_free_ext_minor(unsigned int minor) { - struct gendisk *disk = part_to_disk(part); - int idx; - - /* in consecutive minor range? */ - if (part->partno < disk->minors) { - *devt = MKDEV(disk->major, disk->first_minor + part->partno); - return 0; - } - - /* allocate ext devt */ - idr_preload(GFP_KERNEL); - - spin_lock_bh(&ext_devt_lock); - idx = idr_alloc(&ext_devt_idr, part, 0, NR_EXT_DEVT, GFP_NOWAIT); - spin_unlock_bh(&ext_devt_lock); - - idr_preload_end(); - if (idx < 0) - return idx == -ENOSPC ? 
-EBUSY : idx; - - *devt = MKDEV(BLOCK_EXT_MAJOR, blk_mangle_minor(idx)); - return 0; + ida_free(&ext_devt_ida, minor); } -/** - * blk_free_devt - free a dev_t - * @devt: dev_t to free - * - * Free @devt which was allocated using blk_alloc_devt(). - * - * CONTEXT: - * Might sleep. - */ -void blk_free_devt(dev_t devt) +void disk_uevent(struct gendisk *disk, enum kobject_action action) { - if (devt == MKDEV(0, 0)) - return; + struct block_device *part; + unsigned long idx; - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_remove(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); - } -} + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, part) { + if (bdev_is_partition(part) && !bdev_nr_sectors(part)) + continue; + if (!kobject_get_unless_zero(&part->bd_device.kobj)) + continue; -/* - * We invalidate devt by assigning NULL pointer for devt in idr. - */ -void blk_invalidate_devt(dev_t devt) -{ - if (MAJOR(devt) == BLOCK_EXT_MAJOR) { - spin_lock_bh(&ext_devt_lock); - idr_replace(&ext_devt_idr, NULL, blk_mangle_minor(MINOR(devt))); - spin_unlock_bh(&ext_devt_lock); + rcu_read_unlock(); + kobject_uevent(bdev_kobj(part), action); + put_device(&part->bd_device); + rcu_read_lock(); } + rcu_read_unlock(); } +EXPORT_SYMBOL_GPL(disk_uevent); -static char *bdevt_str(dev_t devt, char *buf) +int disk_scan_partitions(struct gendisk *disk, blk_mode_t mode) { - if (MAJOR(devt) <= 0xff && MINOR(devt) <= 0xff) { - char tbuf[BDEVT_SIZE]; - snprintf(tbuf, BDEVT_SIZE, "%02x%02x", MAJOR(devt), MINOR(devt)); - snprintf(buf, BDEVT_SIZE, "%-9s", tbuf); - } else - snprintf(buf, BDEVT_SIZE, "%03x:%05x", MAJOR(devt), MINOR(devt)); - - return buf; -} + struct bdev_handle *handle; + int ret = 0; -/* - * Register device numbers dev..(dev+range-1) - * range must be nonzero - * The hash chain is sorted on range, so that subranges can override. 
- */ -void blk_register_region(dev_t devt, unsigned long range, struct module *module, - struct kobject *(*probe)(dev_t, int *, void *), - int (*lock)(dev_t, void *), void *data) -{ - kobj_map(bdev_map, devt, range, module, probe, lock, data); -} + if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) + return -EINVAL; + if (test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + return -EINVAL; + if (disk->open_partitions) + return -EBUSY; -EXPORT_SYMBOL(blk_register_region); + /* + * If the device is opened exclusively by current thread already, it's + * safe to scan partitons, otherwise, use bd_prepare_to_claim() to + * synchronize with other exclusive openers and other partition + * scanners. + */ + if (!(mode & BLK_OPEN_EXCL)) { + ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions, + NULL); + if (ret) + return ret; + } -void blk_unregister_region(dev_t devt, unsigned long range) -{ - kobj_unmap(bdev_map, devt, range); + set_bit(GD_NEED_PART_SCAN, &disk->state); + handle = bdev_open_by_dev(disk_devt(disk), mode & ~BLK_OPEN_EXCL, NULL, + NULL); + if (IS_ERR(handle)) + ret = PTR_ERR(handle); + else + bdev_release(handle); + + /* + * If blkdev_get_by_dev() failed early, GD_NEED_PART_SCAN is still set, + * and this will cause that re-assemble partitioned raid device will + * creat partition for underlying disk. + */ + clear_bit(GD_NEED_PART_SCAN, &disk->state); + if (!(mode & BLK_OPEN_EXCL)) + bd_abort_claiming(disk->part0, disk_scan_partitions); + return ret; } -EXPORT_SYMBOL(blk_unregister_region); +/** + * device_add_disk - add disk information to kernel list + * @parent: parent device for the disk + * @disk: per-device partitioning information + * @groups: Additional per-device sysfs groups + * + * This function registers the partitioning information in @disk + * with the kernel. 
+ */ +int __must_check device_add_disk(struct device *parent, struct gendisk *disk, + const struct attribute_group **groups) -static struct kobject *exact_match(dev_t devt, int *partno, void *data) { - struct gendisk *p = data; + struct device *ddev = disk_to_dev(disk); + int ret; - return &disk_to_dev(p)->kobj; -} + /* Only makes sense for bio-based to set ->poll_bio */ + if (queue_is_mq(disk->queue) && disk->fops->poll_bio) + return -EINVAL; -static int exact_lock(dev_t devt, void *data) -{ - struct gendisk *p = data; + /* + * The disk queue should now be all set with enough information about + * the device for the elevator code to pick an adequate default + * elevator if one is needed, that is, for devices requesting queue + * registration. + */ + elevator_init_mq(disk->queue); - if (!get_disk_and_module(p)) - return -1; - return 0; -} + /* Mark bdev as having a submit_bio, if needed */ + disk->part0->bd_has_submit_bio = disk->fops->submit_bio != NULL; -static void register_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) -{ - struct device *ddev = disk_to_dev(disk); - struct block_device *bdev; - struct disk_part_iter piter; - struct hd_struct *part; - int err; + /* + * If the driver provides an explicit major number it also must provide + * the number of minors numbers supported, and those will be used to + * setup the gendisk. + * Otherwise just allocate the device numbers for both the whole device + * and all partitions from the extended dev_t space. 
+ */ + ret = -EINVAL; + if (disk->major) { + if (WARN_ON(!disk->minors)) + goto out_exit_elevator; + + if (disk->minors > DISK_MAX_PARTS) { + pr_err("block: can't allocate more than %d partitions\n", + DISK_MAX_PARTS); + disk->minors = DISK_MAX_PARTS; + } + if (disk->first_minor > MINORMASK || + disk->minors > MINORMASK + 1 || + disk->first_minor + disk->minors > MINORMASK + 1) + goto out_exit_elevator; + } else { + if (WARN_ON(disk->minors)) + goto out_exit_elevator; + + ret = blk_alloc_ext_minor(); + if (ret < 0) + goto out_exit_elevator; + disk->major = BLOCK_EXT_MAJOR; + disk->first_minor = ret; + } - ddev->parent = parent; + /* delay uevents, until we scanned partition table */ + dev_set_uevent_suppress(ddev, 1); + ddev->parent = parent; + ddev->groups = groups; dev_set_name(ddev, "%s", disk->disk_name); + if (!(disk->flags & GENHD_FL_HIDDEN)) + ddev->devt = MKDEV(disk->major, disk->first_minor); + ret = device_add(ddev); + if (ret) + goto out_free_ext_minor; - /* delay uevents, until we scanned partition table */ - dev_set_uevent_suppress(ddev, 1); + ret = disk_alloc_events(disk); + if (ret) + goto out_device_del; - if (groups) { - WARN_ON(ddev->groups); - ddev->groups = groups; - } - if (device_add(ddev)) - return; - if (!sysfs_deprecated) { - err = sysfs_create_link(block_depr, &ddev->kobj, - kobject_name(&ddev->kobj)); - if (err) { - device_del(ddev); - return; - } - } + ret = sysfs_create_link(block_depr, &ddev->kobj, + kobject_name(&ddev->kobj)); + if (ret) + goto out_device_del; /* * avoid probable deadlock caused by allocating memory with @@ -716,229 +475,281 @@ static void register_disk(struct device *parent, struct gendisk *disk, */ pm_runtime_set_memalloc_noio(ddev, true); - disk->part0.holder_dir = kobject_create_and_add("holders", &ddev->kobj); + disk->part0->bd_holder_dir = + kobject_create_and_add("holders", &ddev->kobj); + if (!disk->part0->bd_holder_dir) { + ret = -ENOMEM; + goto out_del_block_link; + } disk->slave_dir = 
kobject_create_and_add("slaves", &ddev->kobj); + if (!disk->slave_dir) { + ret = -ENOMEM; + goto out_put_holder_dir; + } + + ret = blk_register_queue(disk); + if (ret) + goto out_put_slave_dir; + + if (!(disk->flags & GENHD_FL_HIDDEN)) { + ret = bdi_register(disk->bdi, "%u:%u", + disk->major, disk->first_minor); + if (ret) + goto out_unregister_queue; + bdi_set_owner(disk->bdi, ddev); + ret = sysfs_create_link(&ddev->kobj, + &disk->bdi->dev->kobj, "bdi"); + if (ret) + goto out_unregister_bdi; + + /* Make sure the first partition scan will be proceed */ + if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + set_bit(GD_NEED_PART_SCAN, &disk->state); + + bdev_add(disk->part0, ddev->devt); + if (get_capacity(disk)) + disk_scan_partitions(disk, BLK_OPEN_READ); - if (disk->flags & GENHD_FL_HIDDEN) { + /* + * Announce the disk and partitions after all partitions are + * created. (for hidden disks uevents remain suppressed forever) + */ dev_set_uevent_suppress(ddev, 0); - return; + disk_uevent(disk, KOBJ_ADD); + } else { + /* + * Even if the block_device for a hidden gendisk is not + * registered, it needs to have a valid bd_dev so that the + * freeing of the dynamic major works. 
+ */ + disk->part0->bd_dev = MKDEV(disk->major, disk->first_minor); } - /* No minors to use for partitions */ - if (!disk_part_scan_enabled(disk)) - goto exit; - - /* No such device (e.g., media were just removed) */ - if (!get_capacity(disk)) - goto exit; - - bdev = bdget_disk(disk, 0); - if (!bdev) - goto exit; - - bdev->bd_invalidated = 1; - err = blkdev_get(bdev, FMODE_READ, NULL); - if (err < 0) - goto exit; - blkdev_put(bdev, FMODE_READ); - -exit: - /* announce disk after possible partitions are created */ - dev_set_uevent_suppress(ddev, 0); - kobject_uevent(&ddev->kobj, KOBJ_ADD); - - /* announce possible partitions */ - disk_part_iter_init(&piter, disk, 0); - while ((part = disk_part_iter_next(&piter))) - kobject_uevent(&part_to_dev(part)->kobj, KOBJ_ADD); - disk_part_iter_exit(&piter); - - if (disk->queue->backing_dev_info->dev) { - err = sysfs_create_link(&ddev->kobj, - &disk->queue->backing_dev_info->dev->kobj, - "bdi"); - WARN_ON(err); - } + disk_update_readahead(disk); + disk_add_events(disk); + set_bit(GD_ADDED, &disk->state); + return 0; + +out_unregister_bdi: + if (!(disk->flags & GENHD_FL_HIDDEN)) + bdi_unregister(disk->bdi); +out_unregister_queue: + blk_unregister_queue(disk); + rq_qos_exit(disk->queue); +out_put_slave_dir: + kobject_put(disk->slave_dir); + disk->slave_dir = NULL; +out_put_holder_dir: + kobject_put(disk->part0->bd_holder_dir); +out_del_block_link: + sysfs_remove_link(block_depr, dev_name(ddev)); + pm_runtime_set_memalloc_noio(ddev, false); +out_device_del: + device_del(ddev); +out_free_ext_minor: + if (disk->major == BLOCK_EXT_MAJOR) + blk_free_ext_minor(disk->first_minor); +out_exit_elevator: + if (disk->queue->elevator) + elevator_exit(disk->queue); + return ret; } +EXPORT_SYMBOL(device_add_disk); -/** - * __device_add_disk - add disk information to kernel list - * @parent: parent device for the disk - * @disk: per-device partitioning information - * @groups: Additional per-device sysfs groups - * @register_queue: register the 
queue if set to true - * - * This function registers the partitioning information in @disk - * with the kernel. - * - * FIXME: error handling - */ -static void __device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups, - bool register_queue) +static void blk_report_disk_dead(struct gendisk *disk, bool surprise) { - dev_t devt; - int retval; + struct block_device *bdev; + unsigned long idx; /* - * The disk queue should now be all set with enough information about - * the device for the elevator code to pick an adequate default - * elevator if one is needed, that is, for devices requesting queue - * registration. + * On surprise disk removal, bdev_mark_dead() may call into file + * systems below. Make it clear that we're expecting to not hold + * disk->open_mutex. */ - if (register_queue) - elevator_init_mq(disk->queue); + lockdep_assert_not_held(&disk->open_mutex); - /* minors == 0 indicates to use ext devt from part0 and should - * be accompanied with EXT_DEVT flag. Make sure all - * parameters make sense. - */ - WARN_ON(disk->minors && !(disk->major || disk->first_minor)); - WARN_ON(!disk->minors && - !(disk->flags & (GENHD_FL_EXT_DEVT | GENHD_FL_HIDDEN))); + rcu_read_lock(); + xa_for_each(&disk->part_tbl, idx, bdev) { + if (!kobject_get_unless_zero(&bdev->bd_device.kobj)) + continue; + rcu_read_unlock(); - disk->flags |= GENHD_FL_UP; + bdev_mark_dead(bdev, surprise); - retval = blk_alloc_devt(&disk->part0, &devt); - if (retval) { - WARN_ON(1); - return; + put_device(&bdev->bd_device); + rcu_read_lock(); } - disk->major = MAJOR(devt); - disk->first_minor = MINOR(devt); + rcu_read_unlock(); +} - disk_alloc_events(disk); +static void __blk_mark_disk_dead(struct gendisk *disk) +{ + /* + * Fail any new I/O. + */ + if (test_and_set_bit(GD_DEAD, &disk->state)) + return; - if (disk->flags & GENHD_FL_HIDDEN) { - /* - * Don't let hidden disks show up in /proc/partitions, - * and don't bother scanning for partitions either. 
- */ - disk->flags |= GENHD_FL_SUPPRESS_PARTITION_INFO; - disk->flags |= GENHD_FL_NO_PART_SCAN; - } else { - struct backing_dev_info *bdi = disk->queue->backing_dev_info; - struct device *dev = disk_to_dev(disk); - int ret; - - /* Register BDI before referencing it from bdev */ - dev->devt = devt; - ret = bdi_register(bdi, "%u:%u", MAJOR(devt), MINOR(devt)); - WARN_ON(ret); - bdi_set_owner(bdi, dev); - blk_register_region(disk_devt(disk), disk->minors, NULL, - exact_match, exact_lock, disk); - } - register_disk(parent, disk, groups); - if (register_queue) - blk_register_queue(disk); + if (test_bit(GD_OWNS_QUEUE, &disk->state)) + blk_queue_flag_set(QUEUE_FLAG_DYING, disk->queue); /* - * Take an extra ref on queue which will be put on disk_release() - * so that it sticks around as long as @disk is there. + * Stop buffered writers from dirtying pages that can't be written out. */ - WARN_ON_ONCE(!blk_get_queue(disk->queue)); + set_capacity(disk, 0); - disk_add_events(disk); - blk_integrity_add(disk); + /* + * Prevent new I/O from crossing bio_queue_enter(). + */ + blk_queue_start_drain(disk->queue); } -void device_add_disk(struct device *parent, struct gendisk *disk, - const struct attribute_group **groups) - +/** + * blk_mark_disk_dead - mark a disk as dead + * @disk: disk to mark as dead + * + * Mark as disk as dead (e.g. surprise removed) and don't accept any new I/O + * to this disk. + */ +void blk_mark_disk_dead(struct gendisk *disk) { - __device_add_disk(parent, disk, groups, true); + __blk_mark_disk_dead(disk); + blk_report_disk_dead(disk, true); } -EXPORT_SYMBOL(device_add_disk); +EXPORT_SYMBOL_GPL(blk_mark_disk_dead); -void device_add_disk_no_queue_reg(struct device *parent, struct gendisk *disk) +/** + * del_gendisk - remove the gendisk + * @disk: the struct gendisk to remove + * + * Removes the gendisk and all its associated resources. This deletes the + * partitions associated with the gendisk, and unregisters the associated + * request_queue. 
+ * + * This is the counter to the respective __device_add_disk() call. + * + * The final removal of the struct gendisk happens when its refcount reaches 0 + * with put_disk(), which should be called after del_gendisk(), if + * __device_add_disk() was used. + * + * Drivers exist which depend on the release of the gendisk to be synchronous, + * it should not be deferred. + * + * Context: can sleep + */ +void del_gendisk(struct gendisk *disk) { - __device_add_disk(parent, disk, NULL, false); -} -EXPORT_SYMBOL(device_add_disk_no_queue_reg); + struct request_queue *q = disk->queue; + struct block_device *part; + unsigned long idx; -static void invalidate_partition(struct gendisk *disk, int partno) -{ - struct block_device *bdev; + might_sleep(); - bdev = bdget_disk(disk, partno); - if (!bdev) + if (WARN_ON_ONCE(!disk_live(disk) && !(disk->flags & GENHD_FL_HIDDEN))) return; - fsync_bdev(bdev); - __invalidate_device(bdev, true); + disk_del_events(disk); /* - * Unhash the bdev inode for this device so that it gets evicted as soon - * as last inode reference is dropped. + * Prevent new openers by unlinked the bdev inode. */ - remove_inode_hash(bdev->bd_inode); - bdput(bdev); -} - -void del_gendisk(struct gendisk *disk) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - blk_integrity_del(disk); - disk_del_events(disk); + mutex_lock(&disk->open_mutex); + xa_for_each(&disk->part_tbl, idx, part) + remove_inode_hash(part->bd_inode); + mutex_unlock(&disk->open_mutex); /* - * Block lookups of the disk until all bdevs are unhashed and the - * disk is marked as dead (GENHD_FL_UP cleared). + * Tell the file system to write back all dirty data and shut down if + * it hasn't been notified earlier. 
*/ - down_write(&disk->lookup_sem); - /* invalidate stuff */ - disk_part_iter_init(&piter, disk, - DISK_PITER_INCL_EMPTY | DISK_PITER_REVERSE); - while ((part = disk_part_iter_next(&piter))) { - invalidate_partition(disk, part->partno); - delete_partition(disk, part); - } - disk_part_iter_exit(&piter); + if (!test_bit(GD_DEAD, &disk->state)) + blk_report_disk_dead(disk, false); + __blk_mark_disk_dead(disk); - invalidate_partition(disk, 0); - set_capacity(disk, 0); - disk->flags &= ~GENHD_FL_UP; - up_write(&disk->lookup_sem); + /* + * Drop all partitions now that the disk is marked dead. + */ + mutex_lock(&disk->open_mutex); + xa_for_each_start(&disk->part_tbl, idx, part, 1) + drop_partition(part); + mutex_unlock(&disk->open_mutex); - if (!(disk->flags & GENHD_FL_HIDDEN)) + if (!(disk->flags & GENHD_FL_HIDDEN)) { sysfs_remove_link(&disk_to_dev(disk)->kobj, "bdi"); - if (disk->queue) { + /* * Unregister bdi before releasing device numbers (as they can * get reused and we'd get clashes in sysfs). */ - if (!(disk->flags & GENHD_FL_HIDDEN)) - bdi_unregister(disk->queue->backing_dev_info); - blk_unregister_queue(disk); - } else { - WARN_ON(1); + bdi_unregister(disk->bdi); } - if (!(disk->flags & GENHD_FL_HIDDEN)) - blk_unregister_region(disk_devt(disk), disk->minors); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. 
- */ - blk_invalidate_devt(disk_devt(disk)); + blk_unregister_queue(disk); - kobject_put(disk->part0.holder_dir); + kobject_put(disk->part0->bd_holder_dir); kobject_put(disk->slave_dir); + disk->slave_dir = NULL; - part_stat_set_all(&disk->part0, 0); - disk->part0.stamp = 0; - if (!sysfs_deprecated) - sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); + part_stat_set_all(disk->part0, 0); + disk->part0->bd_stamp = 0; + sysfs_remove_link(block_depr, dev_name(disk_to_dev(disk))); pm_runtime_set_memalloc_noio(disk_to_dev(disk), false); device_del(disk_to_dev(disk)); + + blk_mq_freeze_queue_wait(q); + + blk_throtl_cancel_bios(disk); + + blk_sync_queue(q); + blk_flush_integrity(); + + if (queue_is_mq(q)) + blk_mq_cancel_work_sync(q); + + blk_mq_quiesce_queue(q); + if (q->elevator) { + mutex_lock(&q->sysfs_lock); + elevator_exit(q); + mutex_unlock(&q->sysfs_lock); + } + rq_qos_exit(q); + blk_mq_unquiesce_queue(q); + + /* + * If the disk does not own the queue, allow using passthrough requests + * again. Else leave the queue frozen to fail all I/O. + */ + if (!test_bit(GD_OWNS_QUEUE, &disk->state)) { + blk_queue_flag_clear(QUEUE_FLAG_INIT_DONE, q); + __blk_mq_unfreeze_queue(q, true); + } else { + if (queue_is_mq(q)) + blk_mq_exit_queue(q); + } } EXPORT_SYMBOL(del_gendisk); +/** + * invalidate_disk - invalidate the disk + * @disk: the struct gendisk to invalidate + * + * A helper to invalidates the disk. It will clean the disk's associated + * buffer/page caches and reset its internal states so that the disk + * can be reused by the drivers. + * + * Context: can sleep + */ +void invalidate_disk(struct gendisk *disk) +{ + struct block_device *bdev = disk->part0; + + invalidate_bdev(bdev); + bdev->bd_inode->i_mapping->wb_err = 0; + set_capacity(disk, 0); +} +EXPORT_SYMBOL(invalidate_disk); + /* sysfs access to bad-blocks list. 
*/ static ssize_t disk_badblocks_show(struct device *dev, struct device_attribute *attr, @@ -964,135 +775,27 @@ static ssize_t disk_badblocks_store(struct device *dev, return badblocks_store(disk->bb, page, len, 0); } -/** - * get_gendisk - get partitioning information for a given device - * @devt: device to get partitioning information for - * @partno: returned partition index - * - * This function gets the structure containing partitioning - * information for the given device @devt. - */ -struct gendisk *get_gendisk(dev_t devt, int *partno) +#ifdef CONFIG_BLOCK_LEGACY_AUTOLOAD +void blk_request_module(dev_t devt) { - struct gendisk *disk = NULL; - - if (MAJOR(devt) != BLOCK_EXT_MAJOR) { - struct kobject *kobj; - - kobj = kobj_lookup(bdev_map, devt, partno); - if (kobj) - disk = dev_to_disk(kobj_to_dev(kobj)); - } else { - struct hd_struct *part; + unsigned int major = MAJOR(devt); + struct blk_major_name **n; - spin_lock_bh(&ext_devt_lock); - part = idr_find(&ext_devt_idr, blk_mangle_minor(MINOR(devt))); - if (part && get_disk_and_module(part_to_disk(part))) { - *partno = part->partno; - disk = part_to_disk(part); + mutex_lock(&major_names_lock); + for (n = &major_names[major_to_index(major)]; *n; n = &(*n)->next) { + if ((*n)->major == major && (*n)->probe) { + (*n)->probe(devt); + mutex_unlock(&major_names_lock); + return; } - spin_unlock_bh(&ext_devt_lock); } + mutex_unlock(&major_names_lock); - if (!disk) - return NULL; - - /* - * Synchronize with del_gendisk() to not return disk that is being - * destroyed. - */ - down_read(&disk->lookup_sem); - if (unlikely((disk->flags & GENHD_FL_HIDDEN) || - !(disk->flags & GENHD_FL_UP))) { - up_read(&disk->lookup_sem); - put_disk_and_module(disk); - disk = NULL; - } else { - up_read(&disk->lookup_sem); - } - return disk; -} - -/** - * bdget_disk - do bdget() by gendisk and partition number - * @disk: gendisk of interest - * @partno: partition number - * - * Find partition @partno from @disk, do bdget() on it. 
- * - * CONTEXT: - * Don't care. - * - * RETURNS: - * Resulting block_device on success, NULL on failure. - */ -struct block_device *bdget_disk(struct gendisk *disk, int partno) -{ - struct hd_struct *part; - struct block_device *bdev = NULL; - - part = disk_get_part(disk, partno); - if (part) - bdev = bdget(part_devt(part)); - disk_put_part(part); - - return bdev; -} -EXPORT_SYMBOL(bdget_disk); - -/* - * print a full list of all partitions - intended for places where the root - * filesystem can't be mounted and thus to give the victim some idea of what - * went wrong - */ -void __init printk_all_partitions(void) -{ - struct class_dev_iter iter; - struct device *dev; - - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct disk_part_iter piter; - struct hd_struct *part; - char name_buf[BDEVNAME_SIZE]; - char devt_buf[BDEVT_SIZE]; - - /* - * Don't show empty devices or things that have been - * suppressed - */ - if (get_capacity(disk) == 0 || - (disk->flags & GENHD_FL_SUPPRESS_PARTITION_INFO)) - continue; - - /* - * Note, unlike /proc/partitions, I am showing the - * numbers in hex - the same format as the root= - * option takes. - */ - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) { - bool is_part0 = part == &disk->part0; - - printk("%s%s %10llu %s %s", is_part0 ? "" : " ", - bdevt_str(part_devt(part), devt_buf), - (unsigned long long)part_nr_sects_read(part) >> 1 - , disk_name(disk, part->partno, name_buf), - part->info ? 
part->info->uuid : ""); - if (is_part0) { - if (dev->parent && dev->parent->driver) - printk(" driver: %s\n", - dev->parent->driver->name); - else - printk(" (driver?)\n"); - } else - printk("\n"); - } - disk_part_iter_exit(&piter); - } - class_dev_iter_exit(&iter); + if (request_module("block-major-%d-%d", MAJOR(devt), MINOR(devt)) > 0) + /* Make old-style 2.4 aliases work */ + request_module("block-major-%d", MAJOR(devt)); } +#endif /* CONFIG_BLOCK_LEGACY_AUTOLOAD */ #ifdef CONFIG_PROC_FS /* iterator */ @@ -1154,26 +857,21 @@ static void *show_partition_start(struct seq_file *seqf, loff_t *pos) static int show_partition(struct seq_file *seqf, void *v) { struct gendisk *sgp = v; - struct disk_part_iter piter; - struct hd_struct *part; - char buf[BDEVNAME_SIZE]; + struct block_device *part; + unsigned long idx; - /* Don't show non-partitionable removeable devices or empty devices */ - if (!get_capacity(sgp) || (!disk_max_parts(sgp) && - (sgp->flags & GENHD_FL_REMOVABLE))) - return 0; - if (sgp->flags & GENHD_FL_SUPPRESS_PARTITION_INFO) + if (!get_capacity(sgp) || (sgp->flags & GENHD_FL_HIDDEN)) return 0; - /* show the full disk and all non-0 size partitions of it */ - disk_part_iter_init(&piter, sgp, DISK_PITER_INCL_PART0); - while ((part = disk_part_iter_next(&piter))) - seq_printf(seqf, "%4d %7d %10llu %s\n", - MAJOR(part_devt(part)), MINOR(part_devt(part)), - (unsigned long long)part_nr_sects_read(part) >> 1, - disk_name(sgp, part->partno, buf)); - disk_part_iter_exit(&piter); - + rcu_read_lock(); + xa_for_each(&sgp->part_tbl, idx, part) { + if (!bdev_nr_sectors(part)) + continue; + seq_printf(seqf, "%4d %7d %10llu %pg\n", + MAJOR(part->bd_dev), MINOR(part->bd_dev), + bdev_nr_sectors(part) >> 1, part); + } + rcu_read_unlock(); return 0; } @@ -1185,31 +883,19 @@ static const struct seq_operations partitions_op = { }; #endif - -static struct kobject *base_probe(dev_t devt, int *partno, void *data) -{ - if (request_module("block-major-%d-%d", MAJOR(devt), 
MINOR(devt)) > 0) - /* Make old-style 2.4 aliases work */ - request_module("block-major-%d", MAJOR(devt)); - return NULL; -} - static int __init genhd_device_init(void) { int error; - block_class.dev_kobj = sysfs_dev_block_kobj; error = class_register(&block_class); if (unlikely(error)) return error; - bdev_map = kobj_map_init(base_probe, &block_class_lock); blk_dev_init(); register_blkdev(BLOCK_EXT_MAJOR, "blkext"); /* create top-level block dir */ - if (!sysfs_deprecated) - block_depr = kobject_create_and_add("block", NULL); + block_depr = kobject_create_and_add("block", NULL); return 0; } @@ -1228,7 +914,8 @@ static ssize_t disk_ext_range_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", disk_max_parts(disk)); + return sprintf(buf, "%d\n", + (disk->flags & GENHD_FL_NO_PART) ? 1 : DISK_MAX_PARTS); } static ssize_t disk_removable_show(struct device *dev, @@ -1260,26 +947,28 @@ static ssize_t disk_ro_show(struct device *dev, ssize_t part_size_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n", - (unsigned long long)part_nr_sects_read(p)); + return sprintf(buf, "%llu\n", bdev_nr_sectors(dev_to_bdev(dev))); } ssize_t part_stat_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); struct disk_stats stat; unsigned int inflight; - part_stat_read_all(p, &stat); if (queue_is_mq(q)) - inflight = blk_mq_in_flight(q, p); + inflight = blk_mq_in_flight(q, bdev); else - inflight = part_in_flight(q, p); + inflight = part_in_flight(bdev); + if (inflight) { + part_stat_lock(); + update_io_ticks(bdev, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(bdev, &stat); return sprintf(buf, "%8lu %8lu %8llu %8u " "%8lu %8lu %8llu %8u 
" @@ -1313,14 +1002,14 @@ ssize_t part_stat_show(struct device *dev, ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - struct request_queue *q = part_to_disk(p)->queue; + struct block_device *bdev = dev_to_bdev(dev); + struct request_queue *q = bdev_get_queue(bdev); unsigned int inflight[2]; if (queue_is_mq(q)) - blk_mq_in_flight_rw(q, p, inflight); + blk_mq_in_flight_rw(q, bdev, inflight); else - part_in_flight_rw(q, p, inflight); + part_in_flight_rw(bdev, inflight); return sprintf(buf, "%8u %8u\n", inflight[0], inflight[1]); } @@ -1328,9 +1017,8 @@ ssize_t part_inflight_show(struct device *dev, struct device_attribute *attr, static ssize_t disk_capability_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct gendisk *disk = dev_to_disk(dev); - - return sprintf(buf, "%x\n", disk->flags); + dev_warn_once(dev, "the capability attribute has been deprecated.\n"); + return sprintf(buf, "0\n"); } static ssize_t disk_alignment_offset_show(struct device *dev, @@ -1339,7 +1027,7 @@ static ssize_t disk_alignment_offset_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_alignment_offset(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); } static ssize_t disk_discard_alignment_show(struct device *dev, @@ -1348,7 +1036,15 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", bdev_alignment_offset(disk->part0)); +} + +static ssize_t diskseq_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct gendisk *disk = dev_to_disk(dev); + + return sprintf(buf, "%llu\n", disk->diskseq); } static DEVICE_ATTR(range, 0444, disk_range_show, NULL); @@ -1363,25 +1059,23 @@ static DEVICE_ATTR(capability, 0444, 
disk_capability_show, NULL); static DEVICE_ATTR(stat, 0444, part_stat_show, NULL); static DEVICE_ATTR(inflight, 0444, part_inflight_show, NULL); static DEVICE_ATTR(badblocks, 0644, disk_badblocks_show, disk_badblocks_store); +static DEVICE_ATTR(diskseq, 0444, diskseq_show, NULL); #ifdef CONFIG_FAIL_MAKE_REQUEST ssize_t part_fail_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->make_it_fail); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_make_it_fail); } ssize_t part_fail_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) { - struct hd_struct *p = dev_to_part(dev); int i; if (count > 0 && sscanf(buf, "%d", &i) > 0) - p->make_it_fail = (i == 0) ? 0 : 1; + dev_to_bdev(dev)->bd_make_it_fail = i; return count; } @@ -1408,6 +1102,10 @@ static struct attribute *disk_attrs[] = { &dev_attr_stat.attr, &dev_attr_inflight.attr, &dev_attr_badblocks.attr, + &dev_attr_events.attr, + &dev_attr_events_async.attr, + &dev_attr_events_poll_msecs.attr, + &dev_attr_diskseq.attr, #ifdef CONFIG_FAIL_MAKE_REQUEST &dev_attr_fail.attr, #endif @@ -1434,104 +1132,81 @@ static struct attribute_group disk_attr_group = { static const struct attribute_group *disk_attr_groups[] = { &disk_attr_group, +#ifdef CONFIG_BLK_DEV_IO_TRACE + &blk_trace_attr_group, +#endif +#ifdef CONFIG_BLK_DEV_INTEGRITY + &blk_integrity_attr_group, +#endif NULL }; /** - * disk_replace_part_tbl - replace disk->part_tbl in RCU-safe way - * @disk: disk to replace part_tbl for - * @new_ptbl: new part_tbl to install + * disk_release - releases all allocated resources of the gendisk + * @dev: the device representing this disk * - * Replace disk->part_tbl with @new_ptbl in RCU-safe way. The - * original ptbl is freed using RCU callback. + * This function releases all allocated resources of the gendisk. * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. 
+ * Drivers which used __device_add_disk() have a gendisk with a request_queue + * assigned. Since the request_queue sits on top of the gendisk for these + * drivers we also call blk_put_queue() for them, and we expect the + * request_queue refcount to reach 0 at this point, and so the request_queue + * will also be freed prior to the disk. + * + * Context: can sleep */ -static void disk_replace_part_tbl(struct gendisk *disk, - struct disk_part_tbl *new_ptbl) +static void disk_release(struct device *dev) { - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); + struct gendisk *disk = dev_to_disk(dev); - rcu_assign_pointer(disk->part_tbl, new_ptbl); + might_sleep(); + WARN_ON_ONCE(disk_live(disk)); - if (old_ptbl) { - rcu_assign_pointer(old_ptbl->last_lookup, NULL); - kfree_rcu(old_ptbl, rcu_head); - } -} - -/** - * disk_expand_part_tbl - expand disk->part_tbl - * @disk: disk to expand part_tbl for - * @partno: expand such that this partno can fit in - * - * Expand disk->part_tbl such that @partno can fit in. disk->part_tbl - * uses RCU to allow unlocked dereferencing for stats and other stuff. - * - * LOCKING: - * Matching bd_mutex locked or the caller is the only user of @disk. - * Might sleep. - * - * RETURNS: - * 0 on success, -errno on failure. - */ -int disk_expand_part_tbl(struct gendisk *disk, int partno) -{ - struct disk_part_tbl *old_ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - struct disk_part_tbl *new_ptbl; - int len = old_ptbl ? old_ptbl->len : 0; - int i, target; + blk_trace_remove(disk->queue); /* - * check for int overflow, since we can get here from blkpg_ioctl() - * with a user passed 'partno'. + * To undo the all initialization from blk_mq_init_allocated_queue in + * case of a probe failure where add_disk is never called we have to + * call blk_mq_exit_queue here. We can't do this for the more common + * teardown case (yet) as the tagset can be gone by the time the disk + * is released once it was added. 
*/ - target = partno + 1; - if (target < 0) - return -EINVAL; + if (queue_is_mq(disk->queue) && + test_bit(GD_OWNS_QUEUE, &disk->state) && + !test_bit(GD_ADDED, &disk->state)) + blk_mq_exit_queue(disk->queue); - /* disk_max_parts() is zero during initialization, ignore if so */ - if (disk_max_parts(disk) && target > disk_max_parts(disk)) - return -EINVAL; + blkcg_exit_disk(disk); - if (target <= len) - return 0; + bioset_exit(&disk->bio_split); - new_ptbl = kzalloc_node(struct_size(new_ptbl, part, target), GFP_KERNEL, - disk->node_id); - if (!new_ptbl) - return -ENOMEM; + disk_release_events(disk); + kfree(disk->random); + disk_free_zone_bitmaps(disk); + xa_destroy(&disk->part_tbl); - new_ptbl->len = target; + disk->queue->disk = NULL; + blk_put_queue(disk->queue); - for (i = 0; i < len; i++) - rcu_assign_pointer(new_ptbl->part[i], old_ptbl->part[i]); + if (test_bit(GD_ADDED, &disk->state) && disk->fops->free_disk) + disk->fops->free_disk(disk); - disk_replace_part_tbl(disk, new_ptbl); - return 0; + iput(disk->part0->bd_inode); /* frees the disk */ } -static void disk_release(struct device *dev) +static int block_uevent(const struct device *dev, struct kobj_uevent_env *env) { - struct gendisk *disk = dev_to_disk(dev); + const struct gendisk *disk = dev_to_disk(dev); - blk_free_devt(dev->devt); - disk_release_events(disk); - kfree(disk->random); - disk_replace_part_tbl(disk, NULL); - hd_free_part(&disk->part0); - if (disk->queue) - blk_put_queue(disk->queue); - kfree(disk); + return add_uevent_var(env, "DISKSEQ=%llu", disk->diskseq); } + struct class block_class = { .name = "block", + .dev_uevent = block_uevent, }; -static char *block_devnode(struct device *dev, umode_t *mode, +static char *block_devnode(const struct device *dev, umode_t *mode, kuid_t *uid, kgid_t *gid) { struct gendisk *disk = dev_to_disk(dev); @@ -1541,7 +1216,7 @@ static char *block_devnode(struct device *dev, umode_t *mode, return NULL; } -static const struct device_type disk_type = { +const 
struct device_type disk_type = { .name = "disk", .groups = disk_attr_groups, .release = disk_release, @@ -1559,11 +1234,10 @@ static const struct device_type disk_type = { static int diskstats_show(struct seq_file *seqf, void *v) { struct gendisk *gp = v; - struct disk_part_iter piter; - struct hd_struct *hd; - char buf[BDEVNAME_SIZE]; + struct block_device *hd; unsigned int inflight; struct disk_stats stat; + unsigned long idx; /* if (&disk_to_dev(gp)->kobj.entry == block_class.devices.next) @@ -1573,23 +1247,29 @@ static int diskstats_show(struct seq_file *seqf, void *v) "\n\n"); */ - disk_part_iter_init(&piter, gp, DISK_PITER_INCL_EMPTY_PART0); - while ((hd = disk_part_iter_next(&piter))) { - part_stat_read_all(hd, &stat); + rcu_read_lock(); + xa_for_each(&gp->part_tbl, idx, hd) { + if (bdev_is_partition(hd) && !bdev_nr_sectors(hd)) + continue; if (queue_is_mq(gp->queue)) inflight = blk_mq_in_flight(gp->queue, hd); else - inflight = part_in_flight(gp->queue, hd); + inflight = part_in_flight(hd); - seq_printf(seqf, "%4d %7d %s " + if (inflight) { + part_stat_lock(); + update_io_ticks(hd, jiffies, true); + part_stat_unlock(); + } + part_stat_read_all(hd, &stat); + seq_printf(seqf, "%4d %7d %pg " "%lu %lu %lu %u " "%lu %lu %lu %u " "%u %u %u " "%lu %lu %lu %u " "%lu %u" "\n", - MAJOR(part_devt(hd)), MINOR(part_devt(hd)), - disk_name(gp, hd->partno, buf), + MAJOR(hd->bd_dev), MINOR(hd->bd_dev), hd, stat.ios[STAT_READ], stat.merges[STAT_READ], stat.sectors[STAT_READ], @@ -1617,7 +1297,7 @@ static int diskstats_show(struct seq_file *seqf, void *v) NSEC_PER_MSEC) ); } - disk_part_iter_exit(&piter); + rcu_read_unlock(); return 0; } @@ -1638,137 +1318,118 @@ static int __init proc_genhd_init(void) module_init(proc_genhd_init); #endif /* CONFIG_PROC_FS */ -dev_t blk_lookup_devt(const char *name, int partno) +dev_t part_devt(struct gendisk *disk, u8 partno) { - dev_t devt = MKDEV(0, 0); - struct class_dev_iter iter; - struct device *dev; + struct block_device *part; + 
dev_t devt = 0; - class_dev_iter_init(&iter, &block_class, NULL, &disk_type); - while ((dev = class_dev_iter_next(&iter))) { - struct gendisk *disk = dev_to_disk(dev); - struct hd_struct *part; - - if (strcmp(dev_name(dev), name)) - continue; + rcu_read_lock(); + part = xa_load(&disk->part_tbl, partno); + if (part) + devt = part->bd_dev; + rcu_read_unlock(); - if (partno < disk->minors) { - /* We need to return the right devno, even - * if the partition doesn't exist yet. - */ - devt = MKDEV(MAJOR(dev->devt), - MINOR(dev->devt) + partno); - break; - } - part = disk_get_part(disk, partno); - if (part) { - devt = part_devt(part); - disk_put_part(part); - break; - } - disk_put_part(part); - } - class_dev_iter_exit(&iter); return devt; } -struct gendisk *__alloc_disk_node(int minors, int node_id) +struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id, + struct lock_class_key *lkclass) { struct gendisk *disk; - struct disk_part_tbl *ptbl; - - if (minors > DISK_MAX_PARTS) { - printk(KERN_ERR - "block: can't allocate more than %d partitions\n", - DISK_MAX_PARTS); - minors = DISK_MAX_PARTS; - } disk = kzalloc_node(sizeof(struct gendisk), GFP_KERNEL, node_id); - if (disk) { - disk->part0.dkstats = alloc_percpu(struct disk_stats); - if (!disk->part0.dkstats) { - kfree(disk); - return NULL; - } - init_rwsem(&disk->lookup_sem); - disk->node_id = node_id; - if (disk_expand_part_tbl(disk, 0)) { - free_percpu(disk->part0.dkstats); - kfree(disk); - return NULL; - } - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - rcu_assign_pointer(ptbl->part[0], &disk->part0); - - /* - * set_capacity() and get_capacity() currently don't use - * seqcounter to read/update the part0->nr_sects. Still init - * the counter as we can read the sectors in IO submission - * patch using seqence counters. - * - * TODO: Ideally set_capacity() and get_capacity() should be - * converted to make use of bd_mutex and sequence counters. 
- */ - hd_sects_seq_init(&disk->part0); - if (hd_ref_init(&disk->part0)) { - hd_free_part(&disk->part0); - kfree(disk); - return NULL; - } + if (!disk) + return NULL; - disk->minors = minors; - rand_initialize_disk(disk); - disk_to_dev(disk)->class = &block_class; - disk_to_dev(disk)->type = &disk_type; - device_initialize(disk_to_dev(disk)); - } + if (bioset_init(&disk->bio_split, BIO_POOL_SIZE, 0, 0)) + goto out_free_disk; + + disk->bdi = bdi_alloc(node_id); + if (!disk->bdi) + goto out_free_bioset; + + /* bdev_alloc() might need the queue, set before the first call */ + disk->queue = q; + + disk->part0 = bdev_alloc(disk, 0); + if (!disk->part0) + goto out_free_bdi; + + disk->node_id = node_id; + mutex_init(&disk->open_mutex); + xa_init(&disk->part_tbl); + if (xa_insert(&disk->part_tbl, 0, disk->part0, GFP_KERNEL)) + goto out_destroy_part_tbl; + + if (blkcg_init_disk(disk)) + goto out_erase_part0; + + rand_initialize_disk(disk); + disk_to_dev(disk)->class = &block_class; + disk_to_dev(disk)->type = &disk_type; + device_initialize(disk_to_dev(disk)); + inc_diskseq(disk); + q->disk = disk; + lockdep_init_map(&disk->lockdep_map, "(bio completion)", lkclass, 0); +#ifdef CONFIG_BLOCK_HOLDER_DEPRECATED + INIT_LIST_HEAD(&disk->slave_bdevs); +#endif return disk; + +out_erase_part0: + xa_erase(&disk->part_tbl, 0); +out_destroy_part_tbl: + xa_destroy(&disk->part_tbl); + disk->part0->bd_disk = NULL; + iput(disk->part0->bd_inode); +out_free_bdi: + bdi_put(disk->bdi); +out_free_bioset: + bioset_exit(&disk->bio_split); +out_free_disk: + kfree(disk); + return NULL; } -EXPORT_SYMBOL(__alloc_disk_node); -struct kobject *get_disk_and_module(struct gendisk *disk) +struct gendisk *__blk_alloc_disk(int node, struct lock_class_key *lkclass) { - struct module *owner; - struct kobject *kobj; + struct request_queue *q; + struct gendisk *disk; - if (!disk->fops) + q = blk_alloc_queue(node); + if (!q) return NULL; - owner = disk->fops->owner; - if (owner && !try_module_get(owner)) - return 
NULL; - kobj = kobject_get_unless_zero(&disk_to_dev(disk)->kobj); - if (kobj == NULL) { - module_put(owner); + + disk = __alloc_disk_node(q, node, lkclass); + if (!disk) { + blk_put_queue(q); return NULL; } - return kobj; - + set_bit(GD_OWNS_QUEUE, &disk->state); + return disk; } -EXPORT_SYMBOL(get_disk_and_module); +EXPORT_SYMBOL(__blk_alloc_disk); +/** + * put_disk - decrements the gendisk refcount + * @disk: the struct gendisk to decrement the refcount for + * + * This decrements the refcount for the struct gendisk. When this reaches 0 + * we'll have disk_release() called. + * + * Note: for blk-mq disk put_disk must be called before freeing the tag_set + * when handling probe errors (that is before add_disk() is called). + * + * Context: Any context, but the last reference must not be dropped from + * atomic context. + */ void put_disk(struct gendisk *disk) { if (disk) - kobject_put(&disk_to_dev(disk)->kobj); + put_device(disk_to_dev(disk)); } EXPORT_SYMBOL(put_disk); -/* - * This is a counterpart of get_disk_and_module() and thus also of - * get_gendisk(). 
- */ -void put_disk_and_module(struct gendisk *disk) -{ - if (disk) { - struct module *owner = disk->fops->owner; - - put_disk(disk); - module_put(owner); - } -} -EXPORT_SYMBOL(put_disk_and_module); - static void set_disk_ro_uevent(struct gendisk *gd, int ro) { char event[] = "DISK_RO=1"; @@ -1779,500 +1440,29 @@ static void set_disk_ro_uevent(struct gendisk *gd, int ro) kobject_uevent_env(&disk_to_dev(gd)->kobj, KOBJ_CHANGE, envp); } -void set_device_ro(struct block_device *bdev, int flag) -{ - bdev->bd_part->policy = flag; -} - -EXPORT_SYMBOL(set_device_ro); - -void set_disk_ro(struct gendisk *disk, int flag) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - if (disk->part0.policy != flag) { - set_disk_ro_uevent(disk, flag); - disk->part0.policy = flag; - } - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - part->policy = flag; - disk_part_iter_exit(&piter); -} - -EXPORT_SYMBOL(set_disk_ro); - -int bdev_read_only(struct block_device *bdev) -{ - if (!bdev) - return 0; - return bdev->bd_part->policy; -} - -EXPORT_SYMBOL(bdev_read_only); - -/* - * Disk events - monitor disk events like media change and eject request. 
- */ -struct disk_events { - struct list_head node; /* all disk_event's */ - struct gendisk *disk; /* the associated disk */ - spinlock_t lock; - - struct mutex block_mutex; /* protects blocking */ - int block; /* event blocking depth */ - unsigned int pending; /* events already sent out */ - unsigned int clearing; /* events being cleared */ - - long poll_msecs; /* interval, -1 for default */ - struct delayed_work dwork; -}; - -static const char *disk_events_strs[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "media_change", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "eject_request", -}; - -static char *disk_uevents[] = { - [ilog2(DISK_EVENT_MEDIA_CHANGE)] = "DISK_MEDIA_CHANGE=1", - [ilog2(DISK_EVENT_EJECT_REQUEST)] = "DISK_EJECT_REQUEST=1", -}; - -/* list of all disk_events */ -static DEFINE_MUTEX(disk_events_mutex); -static LIST_HEAD(disk_events); - -/* disable in-kernel polling by default */ -static unsigned long disk_events_dfl_poll_msecs; - -static unsigned long disk_events_poll_jiffies(struct gendisk *disk) -{ - struct disk_events *ev = disk->ev; - long intv_msecs = 0; - - /* - * If device-specific poll interval is set, always use it. If - * the default is being used, poll if the POLL flag is set. - */ - if (ev->poll_msecs >= 0) - intv_msecs = ev->poll_msecs; - else if (disk->event_flags & DISK_EVENT_FLAG_POLL) - intv_msecs = disk_events_dfl_poll_msecs; - - return msecs_to_jiffies(intv_msecs); -} - /** - * disk_block_events - block and flush disk event checking - * @disk: disk to block events for + * set_disk_ro - set a gendisk read-only + * @disk: gendisk to operate on + * @read_only: %true to set the disk read-only, %false set the disk read/write * - * On return from this function, it is guaranteed that event checking - * isn't in progress and won't happen until unblocked by - * disk_unblock_events(). Events blocking is counted and the actual - * unblocking happens after the matching number of unblocks are done. 
- * - * Note that this intentionally does not block event checking from - * disk_clear_events(). - * - * CONTEXT: - * Might sleep. + * This function is used to indicate whether a given disk device should have its + * read-only flag set. set_disk_ro() is typically used by device drivers to + * indicate whether the underlying physical device is write-protected. */ -void disk_block_events(struct gendisk *disk) +void set_disk_ro(struct gendisk *disk, bool read_only) { - struct disk_events *ev = disk->ev; - unsigned long flags; - bool cancel; - - if (!ev) - return; - - /* - * Outer mutex ensures that the first blocker completes canceling - * the event work before further blockers are allowed to finish. - */ - mutex_lock(&ev->block_mutex); - - spin_lock_irqsave(&ev->lock, flags); - cancel = !ev->block++; - spin_unlock_irqrestore(&ev->lock, flags); - - if (cancel) - cancel_delayed_work_sync(&disk->ev->dwork); - - mutex_unlock(&ev->block_mutex); -} - -static void __disk_unblock_events(struct gendisk *disk, bool check_now) -{ - struct disk_events *ev = disk->ev; - unsigned long intv; - unsigned long flags; - - spin_lock_irqsave(&ev->lock, flags); - - if (WARN_ON_ONCE(ev->block <= 0)) - goto out_unlock; - - if (--ev->block) - goto out_unlock; - - intv = disk_events_poll_jiffies(disk); - if (check_now) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - else if (intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); -out_unlock: - spin_unlock_irqrestore(&ev->lock, flags); -} - -/** - * disk_unblock_events - unblock disk event checking - * @disk: disk to unblock events for - * - * Undo disk_block_events(). When the block count reaches zero, it - * starts events polling if configured. - * - * CONTEXT: - * Don't care. Safe to call from irq context. 
- */ -void disk_unblock_events(struct gendisk *disk) -{ - if (disk->ev) - __disk_unblock_events(disk, false); -} - -/** - * disk_flush_events - schedule immediate event checking and flushing - * @disk: disk to check and flush events for - * @mask: events to flush - * - * Schedule immediate event checking on @disk if not blocked. Events in - * @mask are scheduled to be cleared from the driver. Note that this - * doesn't clear the events from @disk->ev. - * - * CONTEXT: - * If @mask is non-zero must be called with bdev->bd_mutex held. - */ -void disk_flush_events(struct gendisk *disk, unsigned int mask) -{ - struct disk_events *ev = disk->ev; - - if (!ev) - return; - - spin_lock_irq(&ev->lock); - ev->clearing |= mask; - if (!ev->block) - mod_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, 0); - spin_unlock_irq(&ev->lock); -} - -/** - * disk_clear_events - synchronously check, clear and return pending events - * @disk: disk to fetch and clear events from - * @mask: mask of events to be fetched and cleared - * - * Disk events are synchronously checked and pending events in @mask - * are cleared and returned. This ignores the block count. - * - * CONTEXT: - * Might sleep. - */ -unsigned int disk_clear_events(struct gendisk *disk, unsigned int mask) -{ - const struct block_device_operations *bdops = disk->fops; - struct disk_events *ev = disk->ev; - unsigned int pending; - unsigned int clearing = mask; - - if (!ev) { - /* for drivers still using the old ->media_changed method */ - if ((mask & DISK_EVENT_MEDIA_CHANGE) && - bdops->media_changed && bdops->media_changed(disk)) - return DISK_EVENT_MEDIA_CHANGE; - return 0; - } - - disk_block_events(disk); - - /* - * store the union of mask and ev->clearing on the stack so that the - * race with disk_flush_events does not cause ambiguity (ev->clearing - * can still be modified even if events are blocked). 
- */ - spin_lock_irq(&ev->lock); - clearing |= ev->clearing; - ev->clearing = 0; - spin_unlock_irq(&ev->lock); - - disk_check_events(ev, &clearing); - /* - * if ev->clearing is not 0, the disk_flush_events got called in the - * middle of this function, so we want to run the workfn without delay. - */ - __disk_unblock_events(disk, ev->clearing ? true : false); - - /* then, fetch and clear pending events */ - spin_lock_irq(&ev->lock); - pending = ev->pending & mask; - ev->pending &= ~mask; - spin_unlock_irq(&ev->lock); - WARN_ON_ONCE(clearing & mask); - - return pending; -} - -/* - * Separate this part out so that a different pointer for clearing_ptr can be - * passed in for disk_clear_events. - */ -static void disk_events_workfn(struct work_struct *work) -{ - struct delayed_work *dwork = to_delayed_work(work); - struct disk_events *ev = container_of(dwork, struct disk_events, dwork); - - disk_check_events(ev, &ev->clearing); -} - -static void disk_check_events(struct disk_events *ev, - unsigned int *clearing_ptr) -{ - struct gendisk *disk = ev->disk; - char *envp[ARRAY_SIZE(disk_uevents) + 1] = { }; - unsigned int clearing = *clearing_ptr; - unsigned int events; - unsigned long intv; - int nr_events = 0, i; - - /* check events */ - events = disk->fops->check_events(disk, clearing); - - /* accumulate pending events and schedule next poll if necessary */ - spin_lock_irq(&ev->lock); - - events &= ~ev->pending; - ev->pending |= events; - *clearing_ptr &= ~clearing; - - intv = disk_events_poll_jiffies(disk); - if (!ev->block && intv) - queue_delayed_work(system_freezable_power_efficient_wq, - &ev->dwork, intv); - - spin_unlock_irq(&ev->lock); - - /* - * Tell userland about new events. Only the events listed in - * @disk->events are reported, and only if DISK_EVENT_FLAG_UEVENT - * is set. Otherwise, events are processed internally but never - * get reported to userland. 
- */ - for (i = 0; i < ARRAY_SIZE(disk_uevents); i++) - if ((events & disk->events & (1 << i)) && - (disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - envp[nr_events++] = disk_uevents[i]; - - if (nr_events) - kobject_uevent_env(&disk_to_dev(disk)->kobj, KOBJ_CHANGE, envp); -} - -/* - * A disk events enabled device has the following sysfs nodes under - * its /sys/block/X/ directory. - * - * events : list of all supported events - * events_async : list of events which can be detected w/o polling - * (always empty, only for backwards compatibility) - * events_poll_msecs : polling interval, 0: disable, -1: system default - */ -static ssize_t __disk_events_show(unsigned int events, char *buf) -{ - const char *delim = ""; - ssize_t pos = 0; - int i; - - for (i = 0; i < ARRAY_SIZE(disk_events_strs); i++) - if (events & (1 << i)) { - pos += sprintf(buf + pos, "%s%s", - delim, disk_events_strs[i]); - delim = " "; - } - if (pos) - pos += sprintf(buf + pos, "\n"); - return pos; -} - -static ssize_t disk_events_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!(disk->event_flags & DISK_EVENT_FLAG_UEVENT)) - return 0; - - return __disk_events_show(disk->events, buf); -} - -static ssize_t disk_events_async_show(struct device *dev, - struct device_attribute *attr, char *buf) -{ - return 0; -} - -static ssize_t disk_events_poll_msecs_show(struct device *dev, - struct device_attribute *attr, - char *buf) -{ - struct gendisk *disk = dev_to_disk(dev); - - if (!disk->ev) - return sprintf(buf, "-1\n"); - - return sprintf(buf, "%ld\n", disk->ev->poll_msecs); -} - -static ssize_t disk_events_poll_msecs_store(struct device *dev, - struct device_attribute *attr, - const char *buf, size_t count) -{ - struct gendisk *disk = dev_to_disk(dev); - long intv; - - if (!count || !sscanf(buf, "%ld", &intv)) - return -EINVAL; - - if (intv < 0 && intv != -1) - return -EINVAL; - - if (!disk->ev) - return -ENODEV; - - 
disk_block_events(disk); - disk->ev->poll_msecs = intv; - __disk_unblock_events(disk, true); - - return count; -} - -static const DEVICE_ATTR(events, 0444, disk_events_show, NULL); -static const DEVICE_ATTR(events_async, 0444, disk_events_async_show, NULL); -static const DEVICE_ATTR(events_poll_msecs, 0644, - disk_events_poll_msecs_show, - disk_events_poll_msecs_store); - -static const struct attribute *disk_events_attrs[] = { - &dev_attr_events.attr, - &dev_attr_events_async.attr, - &dev_attr_events_poll_msecs.attr, - NULL, -}; - -/* - * The default polling interval can be specified by the kernel - * parameter block.events_dfl_poll_msecs which defaults to 0 - * (disable). This can also be modified runtime by writing to - * /sys/module/block/parameters/events_dfl_poll_msecs. - */ -static int disk_events_set_dfl_poll_msecs(const char *val, - const struct kernel_param *kp) -{ - struct disk_events *ev; - int ret; - - ret = param_set_ulong(val, kp); - if (ret < 0) - return ret; - - mutex_lock(&disk_events_mutex); - - list_for_each_entry(ev, &disk_events, node) - disk_flush_events(ev->disk, 0); - - mutex_unlock(&disk_events_mutex); - - return 0; -} - -static const struct kernel_param_ops disk_events_dfl_poll_msecs_param_ops = { - .set = disk_events_set_dfl_poll_msecs, - .get = param_get_ulong, -}; - -#undef MODULE_PARAM_PREFIX -#define MODULE_PARAM_PREFIX "block." - -module_param_cb(events_dfl_poll_msecs, &disk_events_dfl_poll_msecs_param_ops, - &disk_events_dfl_poll_msecs, 0644); - -/* - * disk_{alloc|add|del|release}_events - initialize and destroy disk_events. 
- */ -static void disk_alloc_events(struct gendisk *disk) -{ - struct disk_events *ev; - - if (!disk->fops->check_events || !disk->events) - return; - - ev = kzalloc(sizeof(*ev), GFP_KERNEL); - if (!ev) { - pr_warn("%s: failed to initialize events\n", disk->disk_name); - return; - } - - INIT_LIST_HEAD(&ev->node); - ev->disk = disk; - spin_lock_init(&ev->lock); - mutex_init(&ev->block_mutex); - ev->block = 1; - ev->poll_msecs = -1; - INIT_DELAYED_WORK(&ev->dwork, disk_events_workfn); - - disk->ev = ev; -} - -static void disk_add_events(struct gendisk *disk) -{ - /* FIXME: error handling */ - if (sysfs_create_files(&disk_to_dev(disk)->kobj, disk_events_attrs) < 0) - pr_warn("%s: failed to create sysfs files for events\n", - disk->disk_name); - - if (!disk->ev) - return; - - mutex_lock(&disk_events_mutex); - list_add_tail(&disk->ev->node, &disk_events); - mutex_unlock(&disk_events_mutex); - - /* - * Block count is initialized to 1 and the following initial - * unblock kicks it into action. 
- */ - __disk_unblock_events(disk, true); -} - -static void disk_del_events(struct gendisk *disk) -{ - if (disk->ev) { - disk_block_events(disk); - - mutex_lock(&disk_events_mutex); - list_del_init(&disk->ev->node); - mutex_unlock(&disk_events_mutex); + if (read_only) { + if (test_and_set_bit(GD_READ_ONLY, &disk->state)) + return; + } else { + if (!test_and_clear_bit(GD_READ_ONLY, &disk->state)) + return; } - - sysfs_remove_files(&disk_to_dev(disk)->kobj, disk_events_attrs); + set_disk_ro_uevent(disk, read_only); } +EXPORT_SYMBOL(set_disk_ro); -static void disk_release_events(struct gendisk *disk) +void inc_diskseq(struct gendisk *disk) { - /* the block count should be 1 from disk_del_events() */ - WARN_ON_ONCE(disk->ev && disk->ev->block != 1); - kfree(disk->ev); + disk->diskseq = atomic64_inc_return(&diskseq); } diff --git a/block/holder.c b/block/holder.c new file mode 100644 index 000000000000..37d18c13d958 --- /dev/null +++ b/block/holder.c @@ -0,0 +1,154 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/blkdev.h> +#include <linux/slab.h> + +struct bd_holder_disk { + struct list_head list; + struct kobject *holder_dir; + int refcnt; +}; + +static struct bd_holder_disk *bd_find_holder_disk(struct block_device *bdev, + struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + list_for_each_entry(holder, &disk->slave_bdevs, list) + if (holder->holder_dir == bdev->bd_holder_dir) + return holder; + return NULL; +} + +static int add_symlink(struct kobject *from, struct kobject *to) +{ + return sysfs_create_link(from, to, kobject_name(to)); +} + +static void del_symlink(struct kobject *from, struct kobject *to) +{ + sysfs_remove_link(from, kobject_name(to)); +} + +/** + * bd_link_disk_holder - create symlinks between holding disk and slave bdev + * @bdev: the claimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * This functions creates the following sysfs symlinks. 
+ * + * - from "slaves" directory of the holder @disk to the claimed @bdev + * - from "holders" directory of the @bdev to the holder @disk + * + * For example, if /dev/dm-0 maps to /dev/sda and disk for dm-0 is + * passed to bd_link_disk_holder(), then: + * + * /sys/block/dm-0/slaves/sda --> /sys/block/sda + * /sys/block/sda/holders/dm-0 --> /sys/block/dm-0 + * + * The caller must have claimed @bdev before calling this function and + * ensure that both @bdev and @disk are valid during the creation and + * lifetime of these symlinks. + * + * CONTEXT: + * Might sleep. + * + * RETURNS: + * 0 on success, -errno on failure. + */ +int bd_link_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + int ret = 0; + + if (WARN_ON_ONCE(!disk->slave_dir)) + return -EINVAL; + + if (bdev->bd_disk == disk) + return -EINVAL; + + /* + * del_gendisk drops the initial reference to bd_holder_dir, so we + * need to keep our own here to allow for cleanup past that point. 
+ */ + mutex_lock(&bdev->bd_disk->open_mutex); + if (!disk_live(bdev->bd_disk)) { + mutex_unlock(&bdev->bd_disk->open_mutex); + return -ENODEV; + } + kobject_get(bdev->bd_holder_dir); + mutex_unlock(&bdev->bd_disk->open_mutex); + + mutex_lock(&disk->open_mutex); + WARN_ON_ONCE(!bdev->bd_holder); + + holder = bd_find_holder_disk(bdev, disk); + if (holder) { + kobject_put(bdev->bd_holder_dir); + holder->refcnt++; + goto out_unlock; + } + + holder = kzalloc(sizeof(*holder), GFP_KERNEL); + if (!holder) { + ret = -ENOMEM; + goto out_unlock; + } + + INIT_LIST_HEAD(&holder->list); + holder->refcnt = 1; + holder->holder_dir = bdev->bd_holder_dir; + + ret = add_symlink(disk->slave_dir, bdev_kobj(bdev)); + if (ret) + goto out_free_holder; + ret = add_symlink(bdev->bd_holder_dir, &disk_to_dev(disk)->kobj); + if (ret) + goto out_del_symlink; + list_add(&holder->list, &disk->slave_bdevs); + + mutex_unlock(&disk->open_mutex); + return 0; + +out_del_symlink: + del_symlink(disk->slave_dir, bdev_kobj(bdev)); +out_free_holder: + kfree(holder); +out_unlock: + mutex_unlock(&disk->open_mutex); + if (ret) + kobject_put(bdev->bd_holder_dir); + return ret; +} +EXPORT_SYMBOL_GPL(bd_link_disk_holder); + +/** + * bd_unlink_disk_holder - destroy symlinks created by bd_link_disk_holder() + * @bdev: the calimed slave bdev + * @disk: the holding disk + * + * DON'T USE THIS UNLESS YOU'RE ALREADY USING IT. + * + * CONTEXT: + * Might sleep. 
+ */ +void bd_unlink_disk_holder(struct block_device *bdev, struct gendisk *disk) +{ + struct bd_holder_disk *holder; + + if (WARN_ON_ONCE(!disk->slave_dir)) + return; + + mutex_lock(&disk->open_mutex); + holder = bd_find_holder_disk(bdev, disk); + if (!WARN_ON_ONCE(holder == NULL) && !--holder->refcnt) { + del_symlink(disk->slave_dir, bdev_kobj(bdev)); + del_symlink(holder->holder_dir, &disk_to_dev(disk)->kobj); + kobject_put(holder->holder_dir); + list_del_init(&holder->list); + kfree(holder); + } + mutex_unlock(&disk->open_mutex); +} +EXPORT_SYMBOL_GPL(bd_unlink_disk_holder); diff --git a/block/ioctl.c b/block/ioctl.c index bdb3bbb253d9..438f79c564cf 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -16,42 +16,37 @@ static int blkpg_do_ioctl(struct block_device *bdev, struct blkpg_partition __user *upart, int op) { + struct gendisk *disk = bdev->bd_disk; struct blkpg_partition p; - long long start, length; + sector_t start, length; if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (copy_from_user(&p, upart, sizeof(struct blkpg_partition))) return -EFAULT; - if (bdev != bdev->bd_contains) + if (bdev_is_partition(bdev)) return -EINVAL; if (p.pno <= 0) return -EINVAL; if (op == BLKPG_DEL_PARTITION) - return bdev_del_partition(bdev, p.pno); + return bdev_del_partition(disk, p.pno); + + if (p.start < 0 || p.length <= 0 || p.start + p.length < 0) + return -EINVAL; + /* Check that the partition is aligned to the block size */ + if (!IS_ALIGNED(p.start | p.length, bdev_logical_block_size(bdev))) + return -EINVAL; start = p.start >> SECTOR_SHIFT; length = p.length >> SECTOR_SHIFT; - /* check for fit in a hd_struct */ - if (sizeof(sector_t) < sizeof(long long)) { - long pstart = start, plength = length; - - if (pstart != start || plength != length || pstart < 0 || - plength < 0 || p.pno > 65535) - return -EINVAL; - } - switch (op) { case BLKPG_ADD_PARTITION: - /* check if partition is aligned to blocksize */ - if (p.start & (bdev_logical_block_size(bdev) - 1)) - return 
-EINVAL; - return bdev_add_partition(bdev, p.pno, start, length); + return bdev_add_partition(disk, p.pno, start, length); case BLKPG_RESIZE_PARTITION: - return bdev_resize_partition(bdev, p.pno, start, length); + return bdev_resize_partition(disk, p.pno, start, length); default: return -EINVAL; } @@ -90,35 +85,18 @@ static int compat_blkpg_ioctl(struct block_device *bdev, } #endif -static int blkdev_reread_part(struct block_device *bdev) -{ - int ret; - - if (!disk_part_scan_enabled(bdev->bd_disk) || bdev != bdev->bd_contains) - return -EINVAL; - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - - mutex_lock(&bdev->bd_mutex); - ret = bdev_disk_changed(bdev, false); - mutex_unlock(&bdev->bd_mutex); - - return ret; -} - -static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, - unsigned long arg, unsigned long flags) +static int blk_ioctl_discard(struct block_device *bdev, blk_mode_t mode, + unsigned long arg) { uint64_t range[2]; uint64_t start, len; - struct request_queue *q = bdev_get_queue(bdev); - struct address_space *mapping = bdev->bd_inode->i_mapping; - + struct inode *inode = bdev->bd_inode; + int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; - if (!blk_queue_discard(q)) + if (!bdev_max_discard_sectors(bdev)) return -EOPNOTSUPP; if (copy_from_user(range, (void __user *)arg, sizeof(range))) @@ -132,21 +110,59 @@ static int blk_ioctl_discard(struct block_device *bdev, fmode_t mode, if (len & 511) return -EINVAL; - if (start + len > i_size_read(bdev->bd_inode)) + if (start + len > bdev_nr_bytes(bdev)) return -EINVAL; - truncate_inode_pages_range(mapping, start, start + len - 1); - return blkdev_issue_discard(bdev, start >> 9, len >> 9, - GFP_KERNEL, flags); + + filemap_invalidate_lock(inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (err) + goto fail; + err = blkdev_issue_discard(bdev, start >> 9, len >> 9, GFP_KERNEL); +fail: + filemap_invalidate_unlock(inode->i_mapping); 
+ return err; +} + +static int blk_ioctl_secure_erase(struct block_device *bdev, blk_mode_t mode, + void __user *argp) +{ + uint64_t start, len; + uint64_t range[2]; + int err; + + if (!(mode & BLK_OPEN_WRITE)) + return -EBADF; + if (!bdev_max_secure_erase_sectors(bdev)) + return -EOPNOTSUPP; + if (copy_from_user(range, argp, sizeof(range))) + return -EFAULT; + + start = range[0]; + len = range[1]; + if ((start & 511) || (len & 511)) + return -EINVAL; + if (start + len > bdev_nr_bytes(bdev)) + return -EINVAL; + + filemap_invalidate_lock(bdev->bd_inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, start + len - 1); + if (!err) + err = blkdev_issue_secure_erase(bdev, start >> 9, len >> 9, + GFP_KERNEL); + filemap_invalidate_unlock(bdev->bd_inode->i_mapping); + return err; } -static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, + +static int blk_ioctl_zeroout(struct block_device *bdev, blk_mode_t mode, unsigned long arg) { uint64_t range[2]; - struct address_space *mapping; uint64_t start, end, len; + struct inode *inode = bdev->bd_inode; + int err; - if (!(mode & FMODE_WRITE)) + if (!(mode & BLK_OPEN_WRITE)) return -EBADF; if (copy_from_user(range, (void __user *)arg, sizeof(range))) @@ -160,17 +176,23 @@ static int blk_ioctl_zeroout(struct block_device *bdev, fmode_t mode, return -EINVAL; if (len & 511) return -EINVAL; - if (end >= (uint64_t)i_size_read(bdev->bd_inode)) + if (end >= (uint64_t)bdev_nr_bytes(bdev)) return -EINVAL; if (end < start) return -EINVAL; /* Invalidate the page cache, including dirty pages */ - mapping = bdev->bd_inode->i_mapping; - truncate_inode_pages_range(mapping, start, end); + filemap_invalidate_lock(inode->i_mapping); + err = truncate_bdev_range(bdev, mode, start, end); + if (err) + goto fail; - return blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, - BLKDEV_ZERO_NOUNMAP); + err = blkdev_issue_zeroout(bdev, start >> 9, len >> 9, GFP_KERNEL, + BLKDEV_ZERO_NOUNMAP); + +fail: + 
filemap_invalidate_unlock(inode->i_mapping); + return err; } static int put_ushort(unsigned short __user *argp, unsigned short val) @@ -215,30 +237,13 @@ static int compat_put_ulong(compat_ulong_t __user *argp, compat_ulong_t val) } #endif -int __blkdev_driver_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) -{ - struct gendisk *disk = bdev->bd_disk; - - if (disk->fops->ioctl) - return disk->fops->ioctl(bdev, mode, cmd, arg); - - return -ENOTTY; -} -/* - * For the record: _GPL here is only because somebody decided to slap it - * on the previous export. Sheer idiocy, since it wasn't copyrightable - * at all and could be open-coded without any exports by anybody who cares. - */ -EXPORT_SYMBOL_GPL(__blkdev_driver_ioctl); - #ifdef CONFIG_COMPAT /* * This is the equivalent of compat_ptr_ioctl(), to be used by block * drivers that implement only commands that are completely compatible * between 32-bit and 64-bit user space */ -int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, +int blkdev_compat_ptr_ioctl(struct block_device *bdev, blk_mode_t mode, unsigned cmd, unsigned long arg) { struct gendisk *disk = bdev->bd_disk; @@ -252,13 +257,28 @@ int blkdev_compat_ptr_ioctl(struct block_device *bdev, fmode_t mode, EXPORT_SYMBOL(blkdev_compat_ptr_ioctl); #endif -static int blkdev_pr_register(struct block_device *bdev, +static bool blkdev_pr_allowed(struct block_device *bdev, blk_mode_t mode) +{ + /* no sense to make reservations for partitions */ + if (bdev_is_partition(bdev)) + return false; + + if (capable(CAP_SYS_ADMIN)) + return true; + /* + * Only allow unprivileged reservations if the file descriptor is open + * for writing. 
+ */ + return mode & BLK_OPEN_WRITE; +} + +static int blkdev_pr_register(struct block_device *bdev, blk_mode_t mode, struct pr_registration __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_registration reg; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_register) return -EOPNOTSUPP; @@ -270,13 +290,13 @@ static int blkdev_pr_register(struct block_device *bdev, return ops->pr_register(bdev, reg.old_key, reg.new_key, reg.flags); } -static int blkdev_pr_reserve(struct block_device *bdev, +static int blkdev_pr_reserve(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_reserve) return -EOPNOTSUPP; @@ -288,13 +308,13 @@ static int blkdev_pr_reserve(struct block_device *bdev, return ops->pr_reserve(bdev, rsv.key, rsv.type, rsv.flags); } -static int blkdev_pr_release(struct block_device *bdev, +static int blkdev_pr_release(struct block_device *bdev, blk_mode_t mode, struct pr_reservation __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_reservation rsv; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_release) return -EOPNOTSUPP; @@ -306,13 +326,13 @@ static int blkdev_pr_release(struct block_device *bdev, return ops->pr_release(bdev, rsv.key, rsv.type); } -static int blkdev_pr_preempt(struct block_device *bdev, +static int blkdev_pr_preempt(struct block_device *bdev, blk_mode_t mode, struct pr_preempt __user *arg, bool abort) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_preempt p; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_preempt) return -EOPNOTSUPP; @@ -324,13 +344,13 @@ static int 
blkdev_pr_preempt(struct block_device *bdev, return ops->pr_preempt(bdev, p.old_key, p.new_key, p.type, abort); } -static int blkdev_pr_clear(struct block_device *bdev, +static int blkdev_pr_clear(struct block_device *bdev, blk_mode_t mode, struct pr_clear __user *arg) { const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; struct pr_clear c; - if (!capable(CAP_SYS_ADMIN)) + if (!blkdev_pr_allowed(bdev, mode)) return -EPERM; if (!ops || !ops->pr_clear) return -EOPNOTSUPP; @@ -342,57 +362,40 @@ static int blkdev_pr_clear(struct block_device *bdev, return ops->pr_clear(bdev, c.key); } -/* - * Is it an unrecognized ioctl? The correct returns are either - * ENOTTY (final) or ENOIOCTLCMD ("I don't know this one, try a - * fallback"). ENOIOCTLCMD gets turned into ENOTTY by the ioctl - * code before returning. - * - * Confused drivers sometimes return EINVAL, which is wrong. It - * means "I understood the ioctl command, but the parameters to - * it were wrong". - * - * We should aim to just fix the broken drivers, the EINVAL case - * should go away. 
- */ -static inline int is_unrecognized_ioctl(int ret) -{ - return ret == -EINVAL || - ret == -ENOTTY || - ret == -ENOIOCTLCMD; -} - -static int blkdev_flushbuf(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_flushbuf(struct block_device *bdev, unsigned cmd, + unsigned long arg) { - int ret; - if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; + mutex_lock(&bdev->bd_holder_lock); + if (bdev->bd_holder_ops && bdev->bd_holder_ops->sync) + bdev->bd_holder_ops->sync(bdev); + else { + mutex_unlock(&bdev->bd_holder_lock); + sync_blockdev(bdev); + } - fsync_bdev(bdev); invalidate_bdev(bdev); return 0; } -static int blkdev_roset(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg) +static int blkdev_roset(struct block_device *bdev, unsigned cmd, + unsigned long arg) { int ret, n; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - ret = __blkdev_driver_ioctl(bdev, mode, cmd, arg); - if (!is_unrecognized_ioctl(ret)) - return ret; if (get_user(n, (int __user *)arg)) return -EFAULT; - set_device_ro(bdev, n); + if (bdev->bd_disk->fops->set_read_only) { + ret = bdev->bd_disk->fops->set_read_only(bdev, n); + if (ret) + return ret; + } + bdev->bd_read_only = n; return 0; } @@ -462,10 +465,11 @@ static int compat_hdio_getgeo(struct block_device *bdev, #endif /* set the logical block size */ -static int blkdev_bszset(struct block_device *bdev, fmode_t mode, +static int blkdev_bszset(struct block_device *bdev, blk_mode_t mode, int __user *argp) { int ret, n; + struct bdev_handle *handle; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -474,15 +478,15 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, if (get_user(n, argp)) return -EFAULT; - if (!(mode & FMODE_EXCL)) { - bdgrab(bdev); - if (blkdev_get(bdev, mode | FMODE_EXCL, &bdev) < 0) - return -EBUSY; - } + if (mode & BLK_OPEN_EXCL) + return 
set_blocksize(bdev, n); + handle = bdev_open_by_dev(bdev->bd_dev, mode, &bdev, NULL); + if (IS_ERR(handle)) + return -EBUSY; ret = set_blocksize(bdev, n); - if (!(mode & FMODE_EXCL)) - blkdev_put(bdev, mode | FMODE_EXCL); + bdev_release(handle); + return ret; } @@ -491,25 +495,27 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, - unsigned cmd, unsigned long arg, void __user *argp) +static int blkdev_common_ioctl(struct block_device *bdev, blk_mode_t mode, + unsigned int cmd, unsigned long arg, + void __user *argp) { unsigned int max_sectors; switch (cmd) { case BLKFLSBUF: - return blkdev_flushbuf(bdev, mode, cmd, arg); + return blkdev_flushbuf(bdev, cmd, arg); case BLKROSET: - return blkdev_roset(bdev, mode, cmd, arg); + return blkdev_roset(bdev, cmd, arg); case BLKDISCARD: - return blk_ioctl_discard(bdev, mode, arg, 0); + return blk_ioctl_discard(bdev, mode, arg); case BLKSECDISCARD: - return blk_ioctl_discard(bdev, mode, arg, - BLKDEV_DISCARD_SECURE); + return blk_ioctl_secure_erase(bdev, mode, argp); case BLKZEROOUT: return blk_ioctl_zeroout(bdev, mode, arg); + case BLKGETDISKSEQ: + return put_u64(argp, bdev->bd_disk->diskseq); case BLKREPORTZONE: - return blkdev_report_zones_ioctl(bdev, mode, cmd, arg); + return blkdev_report_zones_ioctl(bdev, cmd, arg); case BLKRESETZONE: case BLKOPENZONE: case BLKCLOSEZONE: @@ -518,7 +524,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, case BLKGETZONESZ: return put_uint(argp, bdev_zone_sectors(bdev)); case BLKGETNRZONES: - return put_uint(argp, blkdev_nr_zones(bdev->bd_disk)); + return put_uint(argp, bdev_nr_zones(bdev)); case BLKROGET: return put_int(argp, bdev_read_only(bdev) != 0); case BLKSSZGET: /* get block device logical block size */ @@ -538,31 +544,35 @@ static int 
blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, queue_max_sectors(bdev_get_queue(bdev))); return put_ushort(argp, max_sectors); case BLKROTATIONAL: - return put_ushort(argp, !blk_queue_nonrot(bdev_get_queue(bdev))); + return put_ushort(argp, !bdev_nonrot(bdev)); case BLKRASET: case BLKFRASET: if(!capable(CAP_SYS_ADMIN)) return -EACCES; - bdev->bd_bdi->ra_pages = (arg * 512) / PAGE_SIZE; + bdev->bd_disk->bdi->ra_pages = (arg * 512) / PAGE_SIZE; return 0; case BLKRRPART: - return blkdev_reread_part(bdev); + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (bdev_is_partition(bdev)) + return -EINVAL; + return disk_scan_partitions(bdev->bd_disk, mode); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: return blk_trace_ioctl(bdev, cmd, argp); case IOC_PR_REGISTER: - return blkdev_pr_register(bdev, argp); + return blkdev_pr_register(bdev, mode, argp); case IOC_PR_RESERVE: - return blkdev_pr_reserve(bdev, argp); + return blkdev_pr_reserve(bdev, mode, argp); case IOC_PR_RELEASE: - return blkdev_pr_release(bdev, argp); + return blkdev_pr_release(bdev, mode, argp); case IOC_PR_PREEMPT: - return blkdev_pr_preempt(bdev, argp, false); + return blkdev_pr_preempt(bdev, mode, argp, false); case IOC_PR_PREEMPT_ABORT: - return blkdev_pr_preempt(bdev, argp, true); + return blkdev_pr_preempt(bdev, mode, argp, true); case IOC_PR_CLEAR: - return blkdev_pr_clear(bdev, argp); + return blkdev_pr_clear(bdev, mode, argp); default: return -ENOIOCTLCMD; } @@ -574,12 +584,12 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, * * New commands must be compatible and go into blkdev_common_ioctl */ -int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, - unsigned long arg) +long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { - int ret; - loff_t size; + struct block_device *bdev = I_BDEV(file->f_mapping->host); void __user *argp = (void __user *)arg; + blk_mode_t mode = file_to_blk_mode(file); + int ret; switch 
(cmd) { /* These need separate implementations for the data structure */ @@ -593,12 +603,12 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKFRAGET: if (!argp) return -EINVAL; - return put_long(argp, (bdev->bd_bdi->ra_pages*PAGE_SIZE) / 512); + return put_long(argp, + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~0UL) return -EFBIG; - return put_ulong(argp, size >> 9); + return put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET: /* get block device soft block size (cf. BLKSSZGET) */ @@ -606,7 +616,7 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, case BLKBSZSET: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP: @@ -616,12 +626,13 @@ int blkdev_ioctl(struct block_device *bdev, fmode_t mode, unsigned cmd, } ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); - if (ret == -ENOIOCTLCMD) - return __blkdev_driver_ioctl(bdev, mode, cmd, arg); + if (ret != -ENOIOCTLCMD) + return ret; - return ret; + if (!bdev->bd_disk->fops->ioctl) + return -ENOTTY; + return bdev->bd_disk->fops->ioctl(bdev, mode, cmd, arg); } -EXPORT_SYMBOL_GPL(blkdev_ioctl); /* for /dev/raw */ #ifdef CONFIG_COMPAT @@ -636,20 +647,9 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) { int ret; void __user *argp = compat_ptr(arg); - struct inode *inode = file->f_mapping->host; - struct block_device *bdev = inode->i_bdev; + struct block_device *bdev = I_BDEV(file->f_mapping->host); struct gendisk *disk = bdev->bd_disk; - fmode_t mode = file->f_mode; - loff_t size; - - /* - * O_NDELAY can be altered using fcntl(.., F_SETFL, ..), so we have - * to updated it before every 
ioctl. - */ - if (file->f_flags & O_NDELAY) - mode |= FMODE_NDELAY; - else - mode &= ~FMODE_NDELAY; + blk_mode_t mode = file_to_blk_mode(file); switch (cmd) { /* These need separate implementations for the data structure */ @@ -664,12 +664,11 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) if (!argp) return -EINVAL; return compat_put_long(argp, - (bdev->bd_bdi->ra_pages * PAGE_SIZE) / 512); + (bdev->bd_disk->bdi->ra_pages * PAGE_SIZE) / 512); case BLKGETSIZE: - size = i_size_read(bdev->bd_inode); - if ((size >> 9) > ~0UL) + if (bdev_nr_sectors(bdev) > ~(compat_ulong_t)0) return -EFBIG; - return compat_put_ulong(argp, size >> 9); + return compat_put_ulong(argp, bdev_nr_sectors(bdev)); /* The data is compatible, but the command number is different */ case BLKBSZGET_32: /* get the logical block size (cf. BLKSSZGET) */ @@ -677,7 +676,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) case BLKBSZSET_32: return blkdev_bszset(bdev, mode, argp); case BLKGETSIZE64_32: - return put_u64(argp, i_size_read(bdev->bd_inode)); + return put_u64(argp, bdev_nr_bytes(bdev)); /* Incompatible alignment on i386 */ case BLKTRACESETUP32: diff --git a/block/ioprio.c b/block/ioprio.c index 77bcab11dce5..73301a261429 100644 --- a/block/ioprio.c +++ b/block/ioprio.c @@ -22,68 +22,43 @@ */ #include <linux/gfp.h> #include <linux/kernel.h> -#include <linux/export.h> #include <linux/ioprio.h> #include <linux/cred.h> #include <linux/blkdev.h> #include <linux/capability.h> -#include <linux/sched/user.h> -#include <linux/sched/task.h> #include <linux/syscalls.h> #include <linux/security.h> #include <linux/pid_namespace.h> -int set_task_ioprio(struct task_struct *task, int ioprio) -{ - int err; - struct io_context *ioc; - const struct cred *cred = current_cred(), *tcred; - - rcu_read_lock(); - tcred = __task_cred(task); - if (!uid_eq(tcred->uid, cred->euid) && - !uid_eq(tcred->uid, cred->uid) && !capable(CAP_SYS_NICE)) { - 
rcu_read_unlock(); - return -EPERM; - } - rcu_read_unlock(); - - err = security_task_setioprio(task, ioprio); - if (err) - return err; - - ioc = get_task_io_context(task, GFP_ATOMIC, NUMA_NO_NODE); - if (ioc) { - ioc->ioprio = ioprio; - put_io_context(ioc); - } - - return err; -} -EXPORT_SYMBOL_GPL(set_task_ioprio); - int ioprio_check_cap(int ioprio) { int class = IOPRIO_PRIO_CLASS(ioprio); - int data = IOPRIO_PRIO_DATA(ioprio); + int level = IOPRIO_PRIO_LEVEL(ioprio); switch (class) { case IOPRIO_CLASS_RT: - if (!capable(CAP_SYS_ADMIN)) + /* + * Originally this only checked for CAP_SYS_ADMIN, + * which was implicitly allowed for pid 0 by security + * modules such as SELinux. Make sure we check + * CAP_SYS_ADMIN first to avoid a denial/avc for + * possibly missing CAP_SYS_NICE permission. + */ + if (!capable(CAP_SYS_ADMIN) && !capable(CAP_SYS_NICE)) return -EPERM; - /* fall through */ + fallthrough; /* rt has prio field too */ case IOPRIO_CLASS_BE: - if (data >= IOPRIO_BE_NR || data < 0) + if (level >= IOPRIO_NR_LEVELS) return -EINVAL; - break; case IOPRIO_CLASS_IDLE: break; case IOPRIO_CLASS_NONE: - if (data) + if (level) return -EINVAL; break; + case IOPRIO_CLASS_INVALID: default: return -EINVAL; } @@ -119,11 +94,17 @@ SYSCALL_DEFINE3(ioprio_set, int, which, int, who, int, ioprio) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { ret = set_task_ioprio(p, ioprio); - if (ret) - break; + if (ret) { + read_unlock(&tasklist_lock); + goto out; + } } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); @@ -153,6 +134,7 @@ free_uid: ret = -EINVAL; } +out: rcu_read_unlock(); return ret; } @@ -164,22 +146,38 @@ static int get_task_ioprio(struct task_struct *p) ret = security_task_getioprio(p); if (ret) goto out; - ret = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_NONE, IOPRIO_NORM); + task_lock(p); + ret = 
__get_task_ioprio(p); + task_unlock(p); +out: + return ret; +} + +/* + * Return raw IO priority value as set by userspace. We use this for + * ioprio_get(pid, IOPRIO_WHO_PROCESS) so that we keep historical behavior and + * also so that userspace can distinguish unset IO priority (which just gets + * overriden based on task's nice value) from IO priority set to some value. + */ +static int get_task_raw_ioprio(struct task_struct *p) +{ + int ret; + + ret = security_task_getioprio(p); + if (ret) + goto out; task_lock(p); if (p->io_context) ret = p->io_context->ioprio; + else + ret = IOPRIO_DEFAULT; task_unlock(p); out: return ret; } -int ioprio_best(unsigned short aprio, unsigned short bprio) +static int ioprio_best(unsigned short aprio, unsigned short bprio) { - if (!ioprio_valid(aprio)) - aprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - if (!ioprio_valid(bprio)) - bprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, IOPRIO_NORM); - return min(aprio, bprio); } @@ -200,13 +198,14 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else p = find_task_by_vpid(who); if (p) - ret = get_task_ioprio(p); + ret = get_task_raw_ioprio(p); break; case IOPRIO_WHO_PGRP: if (!who) pgrp = task_pgrp(current); else pgrp = find_vpid(who); + read_lock(&tasklist_lock); do_each_pid_thread(pgrp, PIDTYPE_PGID, p) { tmpio = get_task_ioprio(p); if (tmpio < 0) @@ -216,6 +215,8 @@ SYSCALL_DEFINE2(ioprio_get, int, which, int, who) else ret = ioprio_best(ret, tmpio); } while_each_pid_thread(pgrp, PIDTYPE_PGID, p); + read_unlock(&tasklist_lock); + break; case IOPRIO_WHO_USER: uid = make_kuid(current_user_ns(), who); diff --git a/block/keyslot-manager.c b/block/keyslot-manager.c deleted file mode 100644 index 35abcb1ec051..000000000000 --- a/block/keyslot-manager.c +++ /dev/null @@ -1,396 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright 2019 Google LLC - */ - -/** - * DOC: The Keyslot Manager - * - * Many devices with inline encryption support have a limited number of "slots" - * 
into which encryption contexts may be programmed, and requests can be tagged - * with a slot number to specify the key to use for en/decryption. - * - * As the number of slots is limited, and programming keys is expensive on - * many inline encryption hardware, we don't want to program the same key into - * multiple slots - if multiple requests are using the same key, we want to - * program just one slot with that key and use that slot for all requests. - * - * The keyslot manager manages these keyslots appropriately, and also acts as - * an abstraction between the inline encryption hardware and the upper layers. - * - * Lower layer devices will set up a keyslot manager in their request queue - * and tell it how to perform device specific operations like programming/ - * evicting keys from keyslots. - * - * Upper layers will call blk_ksm_get_slot_for_key() to program a - * key into some slot in the inline encryption hardware. - */ - -#define pr_fmt(fmt) "blk-crypto: " fmt - -#include <linux/keyslot-manager.h> -#include <linux/atomic.h> -#include <linux/mutex.h> -#include <linux/pm_runtime.h> -#include <linux/wait.h> -#include <linux/blkdev.h> - -struct blk_ksm_keyslot { - atomic_t slot_refs; - struct list_head idle_slot_node; - struct hlist_node hash_node; - const struct blk_crypto_key *key; - struct blk_keyslot_manager *ksm; -}; - -static inline void blk_ksm_hw_enter(struct blk_keyslot_manager *ksm) -{ - /* - * Calling into the driver requires ksm->lock held and the device - * resumed. But we must resume the device first, since that can acquire - * and release ksm->lock via blk_ksm_reprogram_all_keys(). - */ - if (ksm->dev) - pm_runtime_get_sync(ksm->dev); - down_write(&ksm->lock); -} - -static inline void blk_ksm_hw_exit(struct blk_keyslot_manager *ksm) -{ - up_write(&ksm->lock); - if (ksm->dev) - pm_runtime_put_sync(ksm->dev); -} - -/** - * blk_ksm_init() - Initialize a keyslot manager - * @ksm: The keyslot_manager to initialize. 
- * @num_slots: The number of key slots to manage. - * - * Allocate memory for keyslots and initialize a keyslot manager. Called by - * e.g. storage drivers to set up a keyslot manager in their request_queue. - * - * Return: 0 on success, or else a negative error code. - */ -int blk_ksm_init(struct blk_keyslot_manager *ksm, unsigned int num_slots) -{ - unsigned int slot; - unsigned int i; - unsigned int slot_hashtable_size; - - memset(ksm, 0, sizeof(*ksm)); - - if (num_slots == 0) - return -EINVAL; - - ksm->slots = kvcalloc(num_slots, sizeof(ksm->slots[0]), GFP_KERNEL); - if (!ksm->slots) - return -ENOMEM; - - ksm->num_slots = num_slots; - - init_rwsem(&ksm->lock); - - init_waitqueue_head(&ksm->idle_slots_wait_queue); - INIT_LIST_HEAD(&ksm->idle_slots); - - for (slot = 0; slot < num_slots; slot++) { - ksm->slots[slot].ksm = ksm; - list_add_tail(&ksm->slots[slot].idle_slot_node, - &ksm->idle_slots); - } - - spin_lock_init(&ksm->idle_slots_lock); - - slot_hashtable_size = roundup_pow_of_two(num_slots); - ksm->log_slot_ht_size = ilog2(slot_hashtable_size); - ksm->slot_hashtable = kvmalloc_array(slot_hashtable_size, - sizeof(ksm->slot_hashtable[0]), - GFP_KERNEL); - if (!ksm->slot_hashtable) - goto err_destroy_ksm; - for (i = 0; i < slot_hashtable_size; i++) - INIT_HLIST_HEAD(&ksm->slot_hashtable[i]); - - return 0; - -err_destroy_ksm: - blk_ksm_destroy(ksm); - return -ENOMEM; -} -EXPORT_SYMBOL_GPL(blk_ksm_init); - -static inline struct hlist_head * -blk_ksm_hash_bucket_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - return &ksm->slot_hashtable[hash_ptr(key, ksm->log_slot_ht_size)]; -} - -static void blk_ksm_remove_slot_from_lru_list(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm = slot->ksm; - unsigned long flags; - - spin_lock_irqsave(&ksm->idle_slots_lock, flags); - list_del(&slot->idle_slot_node); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); -} - -static struct blk_ksm_keyslot *blk_ksm_find_keyslot( - 
struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - const struct hlist_head *head = blk_ksm_hash_bucket_for_key(ksm, key); - struct blk_ksm_keyslot *slotp; - - hlist_for_each_entry(slotp, head, hash_node) { - if (slotp->key == key) - return slotp; - } - return NULL; -} - -static struct blk_ksm_keyslot *blk_ksm_find_and_grab_keyslot( - struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - return NULL; - if (atomic_inc_return(&slot->slot_refs) == 1) { - /* Took first reference to this slot; remove it from LRU list */ - blk_ksm_remove_slot_from_lru_list(slot); - } - return slot; -} - -unsigned int blk_ksm_get_slot_idx(struct blk_ksm_keyslot *slot) -{ - return slot - slot->ksm->slots; -} -EXPORT_SYMBOL_GPL(blk_ksm_get_slot_idx); - -/** - * blk_ksm_get_slot_for_key() - Program a key into a keyslot. - * @ksm: The keyslot manager to program the key into. - * @key: Pointer to the key object to program, including the raw key, crypto - * mode, and data unit size. - * @slot_ptr: A pointer to return the pointer of the allocated keyslot. - * - * Get a keyslot that's been programmed with the specified key. If one already - * exists, return it with incremented refcount. Otherwise, wait for a keyslot - * to become idle and program it. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: BLK_STS_OK on success (and keyslot is set to the pointer of the - * allocated keyslot), or some other blk_status_t otherwise (and - * keyslot is set to NULL). 
- */ -blk_status_t blk_ksm_get_slot_for_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key, - struct blk_ksm_keyslot **slot_ptr) -{ - struct blk_ksm_keyslot *slot; - int slot_idx; - int err; - - *slot_ptr = NULL; - down_read(&ksm->lock); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - up_read(&ksm->lock); - if (slot) - goto success; - - for (;;) { - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_and_grab_keyslot(ksm, key); - if (slot) { - blk_ksm_hw_exit(ksm); - goto success; - } - - /* - * If we're here, that means there wasn't a slot that was - * already programmed with the key. So try to program it. - */ - if (!list_empty(&ksm->idle_slots)) - break; - - blk_ksm_hw_exit(ksm); - wait_event(ksm->idle_slots_wait_queue, - !list_empty(&ksm->idle_slots)); - } - - slot = list_first_entry(&ksm->idle_slots, struct blk_ksm_keyslot, - idle_slot_node); - slot_idx = blk_ksm_get_slot_idx(slot); - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot_idx); - if (err) { - wake_up(&ksm->idle_slots_wait_queue); - blk_ksm_hw_exit(ksm); - return errno_to_blk_status(err); - } - - /* Move this slot to the hash list for the new key. */ - if (slot->key) - hlist_del(&slot->hash_node); - slot->key = key; - hlist_add_head(&slot->hash_node, blk_ksm_hash_bucket_for_key(ksm, key)); - - atomic_set(&slot->slot_refs, 1); - - blk_ksm_remove_slot_from_lru_list(slot); - - blk_ksm_hw_exit(ksm); -success: - *slot_ptr = slot; - return BLK_STS_OK; -} - -/** - * blk_ksm_put_slot() - Release a reference to a slot - * @slot: The keyslot to release the reference of. - * - * Context: Any context. 
- */ -void blk_ksm_put_slot(struct blk_ksm_keyslot *slot) -{ - struct blk_keyslot_manager *ksm; - unsigned long flags; - - if (!slot) - return; - - ksm = slot->ksm; - - if (atomic_dec_and_lock_irqsave(&slot->slot_refs, - &ksm->idle_slots_lock, flags)) { - list_add_tail(&slot->idle_slot_node, &ksm->idle_slots); - spin_unlock_irqrestore(&ksm->idle_slots_lock, flags); - wake_up(&ksm->idle_slots_wait_queue); - } -} - -/** - * blk_ksm_crypto_cfg_supported() - Find out if a crypto configuration is - * supported by a ksm. - * @ksm: The keyslot manager to check - * @cfg: The crypto configuration to check for. - * - * Checks for crypto_mode/data unit size/dun bytes support. - * - * Return: Whether or not this ksm supports the specified crypto config. - */ -bool blk_ksm_crypto_cfg_supported(struct blk_keyslot_manager *ksm, - const struct blk_crypto_config *cfg) -{ - if (!ksm) - return false; - if (!(ksm->crypto_modes_supported[cfg->crypto_mode] & - cfg->data_unit_size)) - return false; - if (ksm->max_dun_bytes_supported < cfg->dun_bytes) - return false; - return true; -} - -/** - * blk_ksm_evict_key() - Evict a key from the lower layer device. - * @ksm: The keyslot manager to evict from - * @key: The key to evict - * - * Find the keyslot that the specified key was programmed into, and evict that - * slot from the lower layer device. The slot must not be in use by any - * in-flight IO when this function is called. - * - * Context: Process context. Takes and releases ksm->lock. - * Return: 0 on success or if there's no keyslot with the specified key, -EBUSY - * if the keyslot is still in use, or another -errno value on other - * error. 
- */ -int blk_ksm_evict_key(struct blk_keyslot_manager *ksm, - const struct blk_crypto_key *key) -{ - struct blk_ksm_keyslot *slot; - int err = 0; - - blk_ksm_hw_enter(ksm); - slot = blk_ksm_find_keyslot(ksm, key); - if (!slot) - goto out_unlock; - - if (WARN_ON_ONCE(atomic_read(&slot->slot_refs) != 0)) { - err = -EBUSY; - goto out_unlock; - } - err = ksm->ksm_ll_ops.keyslot_evict(ksm, key, - blk_ksm_get_slot_idx(slot)); - if (err) - goto out_unlock; - - hlist_del(&slot->hash_node); - slot->key = NULL; - err = 0; -out_unlock: - blk_ksm_hw_exit(ksm); - return err; -} - -/** - * blk_ksm_reprogram_all_keys() - Re-program all keyslots. - * @ksm: The keyslot manager - * - * Re-program all keyslots that are supposed to have a key programmed. This is - * intended only for use by drivers for hardware that loses its keys on reset. - * - * Context: Process context. Takes and releases ksm->lock. - */ -void blk_ksm_reprogram_all_keys(struct blk_keyslot_manager *ksm) -{ - unsigned int slot; - - /* This is for device initialization, so don't resume the device */ - down_write(&ksm->lock); - for (slot = 0; slot < ksm->num_slots; slot++) { - const struct blk_crypto_key *key = ksm->slots[slot].key; - int err; - - if (!key) - continue; - - err = ksm->ksm_ll_ops.keyslot_program(ksm, key, slot); - WARN_ON(err); - } - up_write(&ksm->lock); -} -EXPORT_SYMBOL_GPL(blk_ksm_reprogram_all_keys); - -void blk_ksm_destroy(struct blk_keyslot_manager *ksm) -{ - if (!ksm) - return; - kvfree(ksm->slot_hashtable); - kvfree_sensitive(ksm->slots, sizeof(ksm->slots[0]) * ksm->num_slots); - memzero_explicit(ksm, sizeof(*ksm)); -} -EXPORT_SYMBOL_GPL(blk_ksm_destroy); - -bool blk_ksm_register(struct blk_keyslot_manager *ksm, struct request_queue *q) -{ - if (blk_integrity_queue_supports_integrity(q)) { - pr_warn("Integrity and hardware inline encryption are not supported together. 
Disabling hardware inline encryption.\n"); - return false; - } - q->ksm = ksm; - return true; -} -EXPORT_SYMBOL_GPL(blk_ksm_register); - -void blk_ksm_unregister(struct request_queue *q) -{ - q->ksm = NULL; -} diff --git a/block/kyber-iosched.c b/block/kyber-iosched.c index a38c5ab103d1..4155594aefc6 100644 --- a/block/kyber-iosched.c +++ b/block/kyber-iosched.c @@ -8,16 +8,16 @@ #include <linux/kernel.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/elevator.h> #include <linux/module.h> #include <linux/sbitmap.h> +#include <trace/events/block.h> + +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" #include "blk-mq-sched.h" -#include "blk-mq-tag.h" #define CREATE_TRACE_POINTS #include <trace/events/kyber.h> @@ -149,6 +149,7 @@ struct kyber_ctx_queue { struct kyber_queue_data { struct request_queue *q; + dev_t dev; /* * Each scheduling domain has a limited number of in-flight requests @@ -192,9 +193,9 @@ struct kyber_hctx_data { static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags, void *key); -static unsigned int kyber_sched_domain(unsigned int op) +static unsigned int kyber_sched_domain(blk_opf_t opf) { - switch (op & REQ_OP_MASK) { + switch (opf & REQ_OP_MASK) { case REQ_OP_READ: return KYBER_READ; case REQ_OP_WRITE: @@ -255,7 +256,7 @@ static int calculate_percentile(struct kyber_queue_data *kqd, } memset(buckets, 0, sizeof(kqd->latency_buckets[sched_domain][type])); - trace_kyber_latency(kqd->q, kyber_domain_names[sched_domain], + trace_kyber_latency(kqd->dev, kyber_domain_names[sched_domain], kyber_latency_type_names[type], percentile, bucket + 1, 1 << KYBER_LATENCY_SHIFT, samples); @@ -268,7 +269,7 @@ static void kyber_resize_domain(struct kyber_queue_data *kqd, depth = clamp(depth, 1U, kyber_depth[sched_domain]); if (depth != kqd->domain_tokens[sched_domain].sb.depth) { sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth); - trace_kyber_adjust(kqd->q, 
kyber_domain_names[sched_domain], + trace_kyber_adjust(kqd->dev, kyber_domain_names[sched_domain], depth); } } @@ -353,19 +354,9 @@ static void kyber_timer_fn(struct timer_list *t) } } -static unsigned int kyber_sched_tags_shift(struct request_queue *q) -{ - /* - * All of the hardware queues have the same depth, so we can just grab - * the shift of the first one. - */ - return q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift; -} - static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) { struct kyber_queue_data *kqd; - unsigned int shift; int ret = -ENOMEM; int i; @@ -374,6 +365,7 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) goto err; kqd->q = q; + kqd->dev = disk_devt(q->disk); kqd->cpu_latency = alloc_percpu_gfp(struct kyber_cpu_latency, GFP_KERNEL | __GFP_ZERO); @@ -400,9 +392,6 @@ static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q) kqd->latency_targets[i] = kyber_latency_targets[i]; } - shift = kyber_sched_tags_shift(q); - kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; - return kqd; err_buckets: @@ -430,6 +419,8 @@ static int kyber_init_sched(struct request_queue *q, struct elevator_type *e) blk_stat_enable_accounting(q); + blk_queue_flag_clear(QUEUE_FLAG_SQ_SCHED, q); + eq->elevator_data = kqd; q->elevator = eq; @@ -441,7 +432,8 @@ static void kyber_exit_sched(struct elevator_queue *e) struct kyber_queue_data *kqd = e->elevator_data; int i; - del_timer_sync(&kqd->timer); + timer_shutdown_sync(&kqd->timer); + blk_stat_disable_accounting(kqd->q); for (i = 0; i < KYBER_NUM_DOMAINS; i++) sbitmap_queue_free(&kqd->domain_tokens[i]); @@ -458,9 +450,19 @@ static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq) INIT_LIST_HEAD(&kcq->rq_list[i]); } -static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +static void kyber_depth_updated(struct blk_mq_hw_ctx *hctx) { struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data; + 
struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; + + kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U; + + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, kqd->async_depth); +} + +static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ struct kyber_hctx_data *khd; int i; @@ -479,7 +481,8 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) for (i = 0; i < KYBER_NUM_DOMAINS; i++) { if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx, - ilog2(8), GFP_KERNEL, hctx->numa_node)) { + ilog2(8), GFP_KERNEL, hctx->numa_node, + false, false)) { while (--i >= 0) sbitmap_free(&khd->kcq_map[i]); goto err_kcqs; @@ -502,8 +505,7 @@ static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) khd->batching = 0; hctx->sched_data = khd; - sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags, - kqd->async_depth); + kyber_depth_updated(hctx); return 0; @@ -549,31 +551,32 @@ static void rq_clear_domain_token(struct kyber_queue_data *kqd, } } -static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) +static void kyber_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { /* * We use the scheduler tags as per-hardware queue queueing tokens. * Async requests can be limited at this stage. 
*/ - if (!op_is_sync(op)) { + if (!op_is_sync(opf)) { struct kyber_queue_data *kqd = data->q->elevator->elevator_data; data->shallow_depth = kqd->async_depth; } } -static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +static bool kyber_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { + struct blk_mq_ctx *ctx = blk_mq_get_ctx(q); + struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx); struct kyber_hctx_data *khd = hctx->sched_data; - struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue); struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw[hctx->type]]; unsigned int sched_domain = kyber_sched_domain(bio->bi_opf); struct list_head *rq_list = &kcq->rq_list[sched_domain]; bool merged; spin_lock(&kcq->lock); - merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); + merged = blk_bio_list_merge(hctx->queue, rq_list, bio, nr_segs); spin_unlock(&kcq->lock); return merged; @@ -585,7 +588,8 @@ static void kyber_prepare_request(struct request *rq) } static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *rq_list, bool at_head) + struct list_head *rq_list, + blk_insert_t flags) { struct kyber_hctx_data *khd = hctx->sched_data; struct request *rq, *next; @@ -596,13 +600,13 @@ static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx, struct list_head *head = &kcq->rq_list[sched_domain]; spin_lock(&kcq->lock); - if (at_head) + trace_block_rq_insert(rq); + if (flags & BLK_MQ_INSERT_AT_HEAD) list_move(&rq->queuelist, head); else list_move_tail(&rq->queuelist, head); sbitmap_set_bit(&khd->kcq_map[sched_domain], rq->mq_ctx->index_hw[hctx->type]); - blk_mq_sched_request_inserted(rq); spin_unlock(&kcq->lock); } } @@ -774,7 +778,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } else if 
(sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) { @@ -787,7 +791,7 @@ kyber_dispatch_cur_domain(struct kyber_queue_data *kqd, list_del_init(&rq->queuelist); return rq; } else { - trace_kyber_throttled(kqd->q, + trace_kyber_throttled(kqd->dev, kyber_domain_names[khd->cur_domain]); } } @@ -1022,6 +1026,7 @@ static struct elevator_type kyber_sched = { .completed_request = kyber_completed_request, .dispatch_request = kyber_dispatch_request, .has_work = kyber_has_work, + .depth_updated = kyber_depth_updated, }, #ifdef CONFIG_BLK_DEBUG_FS .queue_debugfs_attrs = kyber_queue_debugfs_attrs, diff --git a/block/mq-deadline.c b/block/mq-deadline.c index b57470e154c8..f958e79277b8 100644 --- a/block/mq-deadline.c +++ b/block/mq-deadline.c @@ -8,8 +8,6 @@ #include <linux/kernel.h> #include <linux/fs.h> #include <linux/blkdev.h> -#include <linux/blk-mq.h> -#include <linux/elevator.h> #include <linux/bio.h> #include <linux/module.h> #include <linux/slab.h> @@ -18,10 +16,12 @@ #include <linux/rbtree.h> #include <linux/sbitmap.h> +#include <trace/events/block.h> + +#include "elevator.h" #include "blk.h" #include "blk-mq.h" #include "blk-mq-debugfs.h" -#include "blk-mq-tag.h" #include "blk-mq-sched.h" /* @@ -29,45 +29,117 @@ */ static const int read_expire = HZ / 2; /* max time before a read is submitted. */ static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */ +/* + * Time after which to dispatch lower priority requests even if higher + * priority requests are pending. + */ +static const int prio_aging_expire = 10 * HZ; static const int writes_starved = 2; /* max times reads can starve a write */ static const int fifo_batch = 16; /* # of sequential requests treated as one by the above parameters. For throughput. 
*/ +enum dd_data_dir { + DD_READ = READ, + DD_WRITE = WRITE, +}; + +enum { DD_DIR_COUNT = 2 }; + +enum dd_prio { + DD_RT_PRIO = 0, + DD_BE_PRIO = 1, + DD_IDLE_PRIO = 2, + DD_PRIO_MAX = 2, +}; + +enum { DD_PRIO_COUNT = 3 }; + +/* + * I/O statistics per I/O priority. It is fine if these counters overflow. + * What matters is that these counters are at least as wide as + * log2(max_outstanding_requests). + */ +struct io_stats_per_prio { + uint32_t inserted; + uint32_t merged; + uint32_t dispatched; + atomic_t completed; +}; + +/* + * Deadline scheduler data per I/O priority (enum dd_prio). Requests are + * present on both sort_list[] and fifo_list[]. + */ +struct dd_per_prio { + struct list_head dispatch; + struct rb_root sort_list[DD_DIR_COUNT]; + struct list_head fifo_list[DD_DIR_COUNT]; + /* Position of the most recently dispatched request. */ + sector_t latest_pos[DD_DIR_COUNT]; + struct io_stats_per_prio stats; +}; + struct deadline_data { /* * run time data */ - /* - * requests (deadline_rq s) are present on both sort_list and fifo_list - */ - struct rb_root sort_list[2]; - struct list_head fifo_list[2]; + struct dd_per_prio per_prio[DD_PRIO_COUNT]; - /* - * next in sort order. read, write or both are NULL - */ - struct request *next_rq[2]; + /* Data direction of latest dispatched request. */ + enum dd_data_dir last_dir; unsigned int batching; /* number of sequential requests made */ unsigned int starved; /* times reads have starved writes */ /* * settings that change how the i/o scheduler behaves */ - int fifo_expire[2]; + int fifo_expire[DD_DIR_COUNT]; int fifo_batch; int writes_starved; int front_merges; + u32 async_depth; + int prio_aging_expire; spinlock_t lock; spinlock_t zone_lock; - struct list_head dispatch; +}; + +/* Maps an I/O priority class to a deadline scheduler priority. 
*/ +static const enum dd_prio ioprio_class_to_prio[] = { + [IOPRIO_CLASS_NONE] = DD_BE_PRIO, + [IOPRIO_CLASS_RT] = DD_RT_PRIO, + [IOPRIO_CLASS_BE] = DD_BE_PRIO, + [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, }; static inline struct rb_root * -deadline_rb_root(struct deadline_data *dd, struct request *rq) +deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) { - return &dd->sort_list[rq_data_dir(rq)]; + return &per_prio->sort_list[rq_data_dir(rq)]; +} + +/* + * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a + * request. + */ +static u8 dd_rq_ioclass(struct request *rq) +{ + return IOPRIO_PRIO_CLASS(req_get_ioprio(rq)); +} + +/* + * get the request before `rq' in sector-sorted order + */ +static inline struct request * +deadline_earlier_request(struct request *rq) +{ + struct rb_node *node = rb_prev(&rq->rb_node); + + if (node) + return rb_entry_rq(node); + + return NULL; } /* @@ -84,39 +156,68 @@ deadline_latter_request(struct request *rq) return NULL; } +/* + * Return the first request for which blk_rq_pos() >= @pos. For zoned devices, + * return the first request after the start of the zone containing @pos. + */ +static inline struct request *deadline_from_pos(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir, sector_t pos) +{ + struct rb_node *node = per_prio->sort_list[data_dir].rb_node; + struct request *rq, *res = NULL; + + if (!node) + return NULL; + + rq = rb_entry_rq(node); + /* + * A zoned write may have been requeued with a starting position that + * is below that of the most recently dispatched request. Hence, for + * zoned writes, start searching from the start of a zone. 
+ */ + if (blk_rq_is_seq_zoned_write(rq)) + pos = round_down(pos, rq->q->limits.chunk_sectors); + + while (node) { + rq = rb_entry_rq(node); + if (blk_rq_pos(rq) >= pos) { + res = rq; + node = node->rb_left; + } else { + node = node->rb_right; + } + } + return res; +} + static void -deadline_add_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - struct rb_root *root = deadline_rb_root(dd, rq); + struct rb_root *root = deadline_rb_root(per_prio, rq); elv_rb_add(root, rq); } static inline void -deadline_del_rq_rb(struct deadline_data *dd, struct request *rq) +deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq) { - const int data_dir = rq_data_dir(rq); - - if (dd->next_rq[data_dir] == rq) - dd->next_rq[data_dir] = deadline_latter_request(rq); - - elv_rb_del(deadline_rb_root(dd, rq), rq); + elv_rb_del(deadline_rb_root(per_prio, rq), rq); } /* * remove rq from rbtree and fifo. */ -static void deadline_remove_request(struct request_queue *q, struct request *rq) +static void deadline_remove_request(struct request_queue *q, + struct dd_per_prio *per_prio, + struct request *rq) { - struct deadline_data *dd = q->elevator->elevator_data; - list_del_init(&rq->queuelist); /* * We might not be on the rbtree, if we are doing an insert merge */ if (!RB_EMPTY_NODE(&rq->rb_node)) - deadline_del_rq_rb(dd, rq); + deadline_del_rq_rb(per_prio, rq); elv_rqhash_del(q, rq); if (q->last_merge == rq) @@ -127,19 +228,33 @@ static void dd_request_merged(struct request_queue *q, struct request *req, enum elv_merge type) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(req); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; /* * if the merge was a front merge, we need to reposition request */ if (type == ELEVATOR_FRONT_MERGE) { - elv_rb_del(deadline_rb_root(dd, req), req); - 
deadline_add_rq_rb(dd, req); + elv_rb_del(deadline_rb_root(per_prio, req), req); + deadline_add_rq_rb(per_prio, req); } } +/* + * Callback function that is invoked after @next has been merged into @req. + */ static void dd_merged_requests(struct request_queue *q, struct request *req, struct request *next) { + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(next); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + + lockdep_assert_held(&dd->lock); + + dd->per_prio[prio].stats.merged++; + /* * if next expires before rq, assign its expire time to rq * and move into next position (next will be deleted) in fifo @@ -155,42 +270,72 @@ static void dd_merged_requests(struct request_queue *q, struct request *req, /* * kill knowledge of next, this one is a goner */ - deadline_remove_request(q, next); + deadline_remove_request(q, &dd->per_prio[prio], next); } /* * move an entry to dispatch queue */ static void -deadline_move_request(struct deadline_data *dd, struct request *rq) +deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + struct request *rq) { - const int data_dir = rq_data_dir(rq); - - dd->next_rq[READ] = NULL; - dd->next_rq[WRITE] = NULL; - dd->next_rq[data_dir] = deadline_latter_request(rq); - /* * take it off the sort and fifo list */ - deadline_remove_request(rq->q, rq); + deadline_remove_request(rq->q, per_prio, rq); +} + +/* Number of requests queued for a given priority level. */ +static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio) +{ + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; + + lockdep_assert_held(&dd->lock); + + return stats->inserted - atomic_read(&stats->completed); } /* - * deadline_check_fifo returns 0 if there are no expired requests on the fifo, - * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir]) + * deadline_check_fifo returns true if and only if there are expired requests + * in the FIFO list. 
Requires !list_empty(&dd->fifo_list[data_dir]). */ -static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) +static inline bool deadline_check_fifo(struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { - struct request *rq = rq_entry_fifo(dd->fifo_list[ddir].next); + struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); - /* - * rq is expired! - */ - if (time_after_eq(jiffies, (unsigned long)rq->fifo_time)) - return 1; + return time_is_before_eq_jiffies((unsigned long)rq->fifo_time); +} - return 0; +/* + * Check if rq has a sequential request preceding it. + */ +static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq) +{ + struct request *prev = deadline_earlier_request(rq); + + if (!prev) + return false; + + return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq); +} + +/* + * Skip all write requests that are sequential from @rq, even if we cross + * a zone boundary. + */ +static struct request *deadline_skip_seq_writes(struct deadline_data *dd, + struct request *rq) +{ + sector_t pos = blk_rq_pos(rq); + + do { + pos += blk_rq_sectors(rq); + rq = deadline_latter_request(rq); + } while (rq && blk_rq_pos(rq) == pos); + + return rq; } /* @@ -198,28 +343,36 @@ static inline int deadline_check_fifo(struct deadline_data *dd, int ddir) * dispatch using arrival ordered lists. 
*/ static struct request * -deadline_fifo_request(struct deadline_data *dd, int data_dir) +deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { - struct request *rq; + struct request *rq, *rb_rq, *next; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - if (list_empty(&dd->fifo_list[data_dir])) + if (list_empty(&per_prio->fifo_list[data_dir])) return NULL; - rq = rq_entry_fifo(dd->fifo_list[data_dir].next); - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next); + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); - list_for_each_entry(rq, &dd->fifo_list[WRITE], queuelist) { - if (blk_req_can_dispatch_to_zone(rq)) + list_for_each_entry_safe(rq, next, &per_prio->fifo_list[DD_WRITE], + queuelist) { + /* Check whether a prior request exists for the same zone. */ + rb_rq = deadline_from_pos(per_prio, data_dir, blk_rq_pos(rq)); + if (rb_rq && blk_rq_pos(rb_rq) < blk_rq_pos(rq)) + rq = rb_rq; + if (blk_req_can_dispatch_to_zone(rq) && + (blk_queue_nonrot(rq->q) || + !deadline_is_seq_write(dd, rq))) goto out; } rq = NULL; @@ -234,30 +387,35 @@ out: * dispatch using sector position sorted lists. 
*/ static struct request * -deadline_next_request(struct deadline_data *dd, int data_dir) +deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio, + enum dd_data_dir data_dir) { struct request *rq; unsigned long flags; - if (WARN_ON_ONCE(data_dir != READ && data_dir != WRITE)) - return NULL; - - rq = dd->next_rq[data_dir]; + rq = deadline_from_pos(per_prio, data_dir, + per_prio->latest_pos[data_dir]); if (!rq) return NULL; - if (data_dir == READ || !blk_queue_is_zoned(rq->q)) + if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q)) return rq; /* * Look for a write request that can be dispatched, that is one with - * an unlocked target zone. + * an unlocked target zone. For some HDDs, breaking a sequential + * write stream can lead to lower throughput, so make sure to preserve + * sequential write streams, even if that stream crosses into the next + * zones and these zones are unlocked. */ spin_lock_irqsave(&dd->zone_lock, flags); while (rq) { if (blk_req_can_dispatch_to_zone(rq)) break; - rq = deadline_latter_request(rq); + if (blk_queue_nonrot(rq->q)) + rq = deadline_latter_request(rq); + else + rq = deadline_skip_seq_writes(dd, rq); } spin_unlock_irqrestore(&dd->zone_lock, flags); @@ -265,48 +423,67 @@ deadline_next_request(struct deadline_data *dd, int data_dir) } /* + * Returns true if and only if @rq started after @latest_start where + * @latest_start is in jiffies. + */ +static bool started_after(struct deadline_data *dd, struct request *rq, + unsigned long latest_start) +{ + unsigned long start_time = (unsigned long)rq->fifo_time; + + start_time -= dd->fifo_expire[rq_data_dir(rq)]; + + return time_after(start_time, latest_start); +} + +/* * deadline_dispatch_requests selects the best request according to - * read/write expire, fifo_batch, etc + * read/write expire, fifo_batch, etc and with a start time <= @latest_start. 
*/ -static struct request *__dd_dispatch_request(struct deadline_data *dd) +static struct request *__dd_dispatch_request(struct deadline_data *dd, + struct dd_per_prio *per_prio, + unsigned long latest_start) { struct request *rq, *next_rq; - bool reads, writes; - int data_dir; + enum dd_data_dir data_dir; + enum dd_prio prio; + u8 ioprio_class; + + lockdep_assert_held(&dd->lock); - if (!list_empty(&dd->dispatch)) { - rq = list_first_entry(&dd->dispatch, struct request, queuelist); + if (!list_empty(&per_prio->dispatch)) { + rq = list_first_entry(&per_prio->dispatch, struct request, + queuelist); + if (started_after(dd, rq, latest_start)) + return NULL; list_del_init(&rq->queuelist); + data_dir = rq_data_dir(rq); goto done; } - reads = !list_empty(&dd->fifo_list[READ]); - writes = !list_empty(&dd->fifo_list[WRITE]); - /* * batches are currently reads XOR writes */ - rq = deadline_next_request(dd, WRITE); - if (!rq) - rq = deadline_next_request(dd, READ); - - if (rq && dd->batching < dd->fifo_batch) - /* we have a next request are still entitled to batch */ + rq = deadline_next_request(dd, per_prio, dd->last_dir); + if (rq && dd->batching < dd->fifo_batch) { + /* we have a next request and are still entitled to batch */ + data_dir = rq_data_dir(rq); goto dispatch_request; + } /* * at this point we are not running a batch. 
select the appropriate * data direction (read / write) */ - if (reads) { - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[READ])); + if (!list_empty(&per_prio->fifo_list[DD_READ])) { + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ])); - if (deadline_fifo_request(dd, WRITE) && + if (deadline_fifo_request(dd, per_prio, DD_WRITE) && (dd->starved++ >= dd->writes_starved)) goto dispatch_writes; - data_dir = READ; + data_dir = DD_READ; goto dispatch_find_request; } @@ -315,13 +492,13 @@ static struct request *__dd_dispatch_request(struct deadline_data *dd) * there are either no reads or writes have been starved */ - if (writes) { + if (!list_empty(&per_prio->fifo_list[DD_WRITE])) { dispatch_writes: - BUG_ON(RB_EMPTY_ROOT(&dd->sort_list[WRITE])); + BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE])); dd->starved = 0; - data_dir = WRITE; + data_dir = DD_WRITE; goto dispatch_find_request; } @@ -332,14 +509,14 @@ dispatch_find_request: /* * we are not running a batch, find best request for selected data_dir */ - next_rq = deadline_next_request(dd, data_dir); - if (deadline_check_fifo(dd, data_dir) || !next_rq) { + next_rq = deadline_next_request(dd, per_prio, data_dir); + if (deadline_check_fifo(per_prio, data_dir) || !next_rq) { /* * A deadline has expired, the last request was in the other * direction, or we have run out of higher-sectored requests. * Start again from the request with the earliest expiry time. */ - rq = deadline_fifo_request(dd, data_dir); + rq = deadline_fifo_request(dd, per_prio, data_dir); } else { /* * The last req was the same dir and we have a next request in @@ -355,15 +532,23 @@ dispatch_find_request: if (!rq) return NULL; + dd->last_dir = data_dir; dd->batching = 0; dispatch_request: + if (started_after(dd, rq, latest_start)) + return NULL; + /* * rq is the selected appropriate request. 
*/ dd->batching++; - deadline_move_request(dd, rq); + deadline_move_request(dd, per_prio, rq); done: + ioprio_class = dd_rq_ioclass(rq); + prio = ioprio_class_to_prio[ioprio_class]; + dd->per_prio[prio].latest_pos[data_dir] = blk_rq_pos(rq); + dd->per_prio[prio].stats.dispatched++; /* * If the request needs its target zone locked, do it. */ @@ -373,6 +558,36 @@ done: } /* + * Check whether there are any requests with priority other than DD_RT_PRIO + * that were inserted more than prio_aging_expire jiffies ago. + */ +static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, + unsigned long now) +{ + struct request *rq; + enum dd_prio prio; + int prio_cnt; + + lockdep_assert_held(&dd->lock); + + prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) + + !!dd_queued(dd, DD_IDLE_PRIO); + if (prio_cnt < 2) + return NULL; + + for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], + now - dd->prio_aging_expire); + if (rq) + return rq; + } + + return NULL; +} + +/* + * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). + * * One confusing aspect here is that we get called for a specific * hardware queue, but we may return a request that is for a * different hardware queue. This is because mq-deadline has shared @@ -381,21 +596,92 @@ done: static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + const unsigned long now = jiffies; struct request *rq; + enum dd_prio prio; spin_lock(&dd->lock); - rq = __dd_dispatch_request(dd); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) + goto unlock; + + /* + * Next, dispatch requests in priority order. Ignore lower priority + * requests if any higher priority requests are pending. 
+ */ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now); + if (rq || dd_queued(dd, prio)) + break; + } + +unlock: spin_unlock(&dd->lock); return rq; } -static void dd_exit_queue(struct elevator_queue *e) +/* + * Called by __blk_mq_alloc_request(). The shallow_depth value set by this + * function is used by __blk_mq_get_tag(). + */ +static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) +{ + struct deadline_data *dd = data->q->elevator->elevator_data; + + /* Do not throttle synchronous reads. */ + if (op_is_sync(opf) && !op_is_write(opf)) + return; + + /* + * Throttle asynchronous requests and writes such that these requests + * do not block the allocation of synchronous requests. + */ + data->shallow_depth = dd->async_depth; +} + +/* Called by blk_mq_update_nr_requests(). */ +static void dd_depth_updated(struct blk_mq_hw_ctx *hctx) +{ + struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + struct blk_mq_tags *tags = hctx->sched_tags; + unsigned int shift = tags->bitmap_tags.sb.shift; + + dd->async_depth = max(1U, 3 * (1U << shift) / 4); + + sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth); +} + +/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). 
*/ +static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) +{ + dd_depth_updated(hctx); + return 0; +} + +static void dd_exit_sched(struct elevator_queue *e) { struct deadline_data *dd = e->elevator_data; + enum dd_prio prio; - BUG_ON(!list_empty(&dd->fifo_list[READ])); - BUG_ON(!list_empty(&dd->fifo_list[WRITE])); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + const struct io_stats_per_prio *stats = &per_prio->stats; + uint32_t queued; + + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ])); + WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE])); + + spin_lock(&dd->lock); + queued = dd_queued(dd, prio); + spin_unlock(&dd->lock); + + WARN_ONCE(queued != 0, + "statistics for priority %d: i %u m %u d %u c %u\n", + prio, stats->inserted, stats->merged, + stats->dispatched, atomic_read(&stats->completed)); + } kfree(dd); } @@ -403,55 +689,78 @@ static void dd_exit_queue(struct elevator_queue *e) /* * initialize elevator private data (deadline_data). 
*/ -static int dd_init_queue(struct request_queue *q, struct elevator_type *e) +static int dd_init_sched(struct request_queue *q, struct elevator_type *e) { struct deadline_data *dd; struct elevator_queue *eq; + enum dd_prio prio; + int ret = -ENOMEM; eq = elevator_alloc(q, e); if (!eq) - return -ENOMEM; + return ret; dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node); - if (!dd) { - kobject_put(&eq->kobj); - return -ENOMEM; - } + if (!dd) + goto put_eq; + eq->elevator_data = dd; - INIT_LIST_HEAD(&dd->fifo_list[READ]); - INIT_LIST_HEAD(&dd->fifo_list[WRITE]); - dd->sort_list[READ] = RB_ROOT; - dd->sort_list[WRITE] = RB_ROOT; - dd->fifo_expire[READ] = read_expire; - dd->fifo_expire[WRITE] = write_expire; + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + INIT_LIST_HEAD(&per_prio->dispatch); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]); + INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]); + per_prio->sort_list[DD_READ] = RB_ROOT; + per_prio->sort_list[DD_WRITE] = RB_ROOT; + } + dd->fifo_expire[DD_READ] = read_expire; + dd->fifo_expire[DD_WRITE] = write_expire; dd->writes_starved = writes_starved; dd->front_merges = 1; + dd->last_dir = DD_WRITE; dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; spin_lock_init(&dd->lock); spin_lock_init(&dd->zone_lock); - INIT_LIST_HEAD(&dd->dispatch); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); q->elevator = eq; return 0; + +put_eq: + kobject_put(&eq->kobj); + return ret; } +/* + * Try to merge @bio into an existing request. If @bio has been merged into + * an existing request, store the pointer to that request into *@rq. 
+ */ static int dd_request_merge(struct request_queue *q, struct request **rq, struct bio *bio) { struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; sector_t sector = bio_end_sector(bio); struct request *__rq; if (!dd->front_merges) return ELEVATOR_NO_MERGE; - __rq = elv_rb_find(&dd->sort_list[bio_data_dir(bio)], sector); + __rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector); if (__rq) { BUG_ON(sector != blk_rq_pos(__rq)); if (elv_bio_merge_ok(__rq, bio)) { *rq = __rq; + if (blk_discard_mergable(__rq)) + return ELEVATOR_DISCARD_MERGE; return ELEVATOR_FRONT_MERGE; } } @@ -459,10 +768,13 @@ static int dd_request_merge(struct request_queue *q, struct request **rq, return ELEVATOR_NO_MERGE; } -static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, +/* + * Attempt to merge a bio into an existing request. This function is called + * before @bio is associated with a request. 
+ */ +static bool dd_bio_merge(struct request_queue *q, struct bio *bio, unsigned int nr_segs) { - struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; struct request *free = NULL; bool ret; @@ -481,11 +793,17 @@ static bool dd_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio, * add rq to rbtree and fifo */ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - bool at_head) + blk_insert_t flags, struct list_head *free) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; - const int data_dir = rq_data_dir(rq); + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); + u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio); + struct dd_per_prio *per_prio; + enum dd_prio prio; + + lockdep_assert_held(&dd->lock); /* * This may be a requeue of a write request that has locked its @@ -493,18 +811,25 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, */ blk_req_zone_write_unlock(rq); - if (blk_mq_sched_try_insert_merge(q, rq)) + prio = ioprio_class_to_prio[ioprio_class]; + per_prio = &dd->per_prio[prio]; + if (!rq->elv.priv[0]) { + per_prio->stats.inserted++; + rq->elv.priv[0] = (void *)(uintptr_t)1; + } + + if (blk_mq_sched_try_insert_merge(q, rq, free)) return; - blk_mq_sched_request_inserted(rq); + trace_block_rq_insert(rq); - if (at_head || blk_rq_is_passthrough(rq)) { - if (at_head) - list_add(&rq->queuelist, &dd->dispatch); - else - list_add_tail(&rq->queuelist, &dd->dispatch); + if (flags & BLK_MQ_INSERT_AT_HEAD) { + list_add(&rq->queuelist, &per_prio->dispatch); + rq->fifo_time = jiffies; } else { - deadline_add_rq_rb(dd, rq); + struct list_head *insert_before; + + deadline_add_rq_rb(per_prio, rq); if (rq_mergeable(rq)) { elv_rqhash_add(q, rq); @@ -516,15 +841,33 @@ static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, * set expire time and add to fifo list */ rq->fifo_time = 
jiffies + dd->fifo_expire[data_dir]; - list_add_tail(&rq->queuelist, &dd->fifo_list[data_dir]); + insert_before = &per_prio->fifo_list[data_dir]; +#ifdef CONFIG_BLK_DEV_ZONED + /* + * Insert zoned writes such that requests are sorted by + * position per zone. + */ + if (blk_rq_is_seq_zoned_write(rq)) { + struct request *rq2 = deadline_latter_request(rq); + + if (rq2 && blk_rq_zone_no(rq2) == blk_rq_zone_no(rq)) + insert_before = &rq2->queuelist; + } +#endif + list_add_tail(&rq->queuelist, insert_before); } } +/* + * Called from blk_mq_insert_request() or blk_mq_dispatch_plug_list(). + */ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, bool at_head) + struct list_head *list, + blk_insert_t flags) { struct request_queue *q = hctx->queue; struct deadline_data *dd = q->elevator->elevator_data; + LIST_HEAD(free); spin_lock(&dd->lock); while (!list_empty(list)) { @@ -532,20 +875,34 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, rq = list_first_entry(list, struct request, queuelist); list_del_init(&rq->queuelist); - dd_insert_request(hctx, rq, at_head); + dd_insert_request(hctx, rq, flags, &free); } spin_unlock(&dd->lock); + + blk_mq_free_requests(&free); } -/* - * Nothing to do here. This is defined only to ensure that .finish_request - * method is called upon request completion. - */ +/* Callback from inside blk_mq_rq_ctx_init(). */ static void dd_prepare_request(struct request *rq) { + rq->elv.priv[0] = NULL; +} + +static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx) +{ + struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio p; + + for (p = 0; p <= DD_PRIO_MAX; p++) + if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE])) + return true; + + return false; } /* + * Callback from inside blk_mq_free_request(). + * * For zoned block devices, write unlock the target zone of * completed write requests. 
Do this while holding the zone lock * spinlock so that the zone is never unlocked while deadline_fifo_request() @@ -562,83 +919,103 @@ static void dd_prepare_request(struct request *rq) static void dd_finish_request(struct request *rq) { struct request_queue *q = rq->q; + struct deadline_data *dd = q->elevator->elevator_data; + const u8 ioprio_class = dd_rq_ioclass(rq); + const enum dd_prio prio = ioprio_class_to_prio[ioprio_class]; + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + + /* + * The block layer core may call dd_finish_request() without having + * called dd_insert_requests(). Skip requests that bypassed I/O + * scheduling. See also blk_mq_request_bypass_insert(). + */ + if (!rq->elv.priv[0]) + return; + + atomic_inc(&per_prio->stats.completed); if (blk_queue_is_zoned(q)) { - struct deadline_data *dd = q->elevator->elevator_data; unsigned long flags; spin_lock_irqsave(&dd->zone_lock, flags); blk_req_zone_write_unlock(rq); - if (!list_empty(&dd->fifo_list[WRITE])) - blk_mq_sched_mark_restart_hctx(rq->mq_hctx); spin_unlock_irqrestore(&dd->zone_lock, flags); + + if (dd_has_write_work(rq->mq_hctx)) + blk_mq_sched_mark_restart_hctx(rq->mq_hctx); } } +static bool dd_has_work_for_prio(struct dd_per_prio *per_prio) +{ + return !list_empty_careful(&per_prio->dispatch) || + !list_empty_careful(&per_prio->fifo_list[DD_READ]) || + !list_empty_careful(&per_prio->fifo_list[DD_WRITE]); +} + static bool dd_has_work(struct blk_mq_hw_ctx *hctx) { struct deadline_data *dd = hctx->queue->elevator->elevator_data; + enum dd_prio prio; - return !list_empty_careful(&dd->dispatch) || - !list_empty_careful(&dd->fifo_list[0]) || - !list_empty_careful(&dd->fifo_list[1]); + for (prio = 0; prio <= DD_PRIO_MAX; prio++) + if (dd_has_work_for_prio(&dd->per_prio[prio])) + return true; + + return false; } /* * sysfs parts below */ -static ssize_t -deadline_var_show(int var, char *page) -{ - return sprintf(page, "%d\n", var); -} - -static void -deadline_var_store(int *var, const char 
*page) -{ - char *p = (char *) page; - - *var = simple_strtol(p, &p, 10); -} - -#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ +#define SHOW_INT(__FUNC, __VAR) \ static ssize_t __FUNC(struct elevator_queue *e, char *page) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data = __VAR; \ - if (__CONV) \ - __data = jiffies_to_msecs(__data); \ - return deadline_var_show(__data, (page)); \ -} -SHOW_FUNCTION(deadline_read_expire_show, dd->fifo_expire[READ], 1); -SHOW_FUNCTION(deadline_write_expire_show, dd->fifo_expire[WRITE], 1); -SHOW_FUNCTION(deadline_writes_starved_show, dd->writes_starved, 0); -SHOW_FUNCTION(deadline_front_merges_show, dd->front_merges, 0); -SHOW_FUNCTION(deadline_fifo_batch_show, dd->fifo_batch, 0); -#undef SHOW_FUNCTION + \ + return sysfs_emit(page, "%d\n", __VAR); \ +} +#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR)) +SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]); +SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]); +SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire); +SHOW_INT(deadline_writes_starved_show, dd->writes_starved); +SHOW_INT(deadline_front_merges_show, dd->front_merges); +SHOW_INT(deadline_async_depth_show, dd->async_depth); +SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch); +#undef SHOW_INT +#undef SHOW_JIFFIES #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \ { \ struct deadline_data *dd = e->elevator_data; \ - int __data; \ - deadline_var_store(&__data, (page)); \ + int __data, __ret; \ + \ + __ret = kstrtoint(page, 0, &__data); \ + if (__ret < 0) \ + return __ret; \ if (__data < (MIN)) \ __data = (MIN); \ else if (__data > (MAX)) \ __data = (MAX); \ - if (__CONV) \ - *(__PTR) = msecs_to_jiffies(__data); \ - else \ - *(__PTR) = __data; \ + *(__PTR) = __CONV(__data); \ return count; \ } -STORE_FUNCTION(deadline_read_expire_store, 
&dd->fifo_expire[READ], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_write_expire_store, &dd->fifo_expire[WRITE], 0, INT_MAX, 1); -STORE_FUNCTION(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX, 0); -STORE_FUNCTION(deadline_front_merges_store, &dd->front_merges, 0, 1, 0); -STORE_FUNCTION(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX, 0); +#define STORE_INT(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, ) +#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \ + STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies) +STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX); +STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX); +STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX); +STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX); +STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1); +STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX); +STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX); #undef STORE_FUNCTION +#undef STORE_INT +#undef STORE_JIFFIES #define DD_ATTR(name) \ __ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store) @@ -648,21 +1025,24 @@ static struct elv_fs_entry deadline_attrs[] = { DD_ATTR(write_expire), DD_ATTR(writes_starved), DD_ATTR(front_merges), + DD_ATTR(async_depth), DD_ATTR(fifo_batch), + DD_ATTR(prio_aging_expire), __ATTR_NULL }; #ifdef CONFIG_BLK_DEBUG_FS -#define DEADLINE_DEBUGFS_DDIR_ATTRS(ddir, name) \ +#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \ static void *deadline_##name##_fifo_start(struct seq_file *m, \ loff_t *pos) \ __acquires(&dd->lock) \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ spin_lock(&dd->lock); \ - return seq_list_start(&dd->fifo_list[ddir], *pos); \ + return 
seq_list_start(&per_prio->fifo_list[data_dir], *pos); \ } \ \ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ @@ -670,8 +1050,9 @@ static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \ { \ struct request_queue *q = m->private; \ struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ \ - return seq_list_next(v, &dd->fifo_list[ddir], pos); \ + return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \ } \ \ static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \ @@ -695,14 +1076,22 @@ static int deadline_##name##_next_rq_show(void *data, \ { \ struct request_queue *q = data; \ struct deadline_data *dd = q->elevator->elevator_data; \ - struct request *rq = dd->next_rq[ddir]; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + struct request *rq; \ \ + rq = deadline_from_pos(per_prio, data_dir, \ + per_prio->latest_pos[data_dir]); \ if (rq) \ __blk_mq_debugfs_rq_show(m, rq); \ return 0; \ } -DEADLINE_DEBUGFS_DDIR_ATTRS(READ, read) -DEADLINE_DEBUGFS_DDIR_ATTRS(WRITE, write) + +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2); +DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2); #undef DEADLINE_DEBUGFS_DDIR_ATTRS static int deadline_batching_show(void *data, struct seq_file *m) @@ -723,49 +1112,130 @@ static int deadline_starved_show(void *data, struct seq_file *m) return 0; } -static void *deadline_dispatch_start(struct seq_file *m, loff_t *pos) - __acquires(&dd->lock) +static int dd_async_depth_show(void *data, struct seq_file *m) { - struct request_queue *q = m->private; + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; - spin_lock(&dd->lock); - return 
seq_list_start(&dd->dispatch, *pos); + seq_printf(m, "%u\n", dd->async_depth); + return 0; } -static void *deadline_dispatch_next(struct seq_file *m, void *v, loff_t *pos) +static int dd_queued_show(void *data, struct seq_file *m) { - struct request_queue *q = m->private; + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + + spin_lock(&dd->lock); + rt = dd_queued(dd, DD_RT_PRIO); + be = dd_queued(dd, DD_BE_PRIO); + idle = dd_queued(dd, DD_IDLE_PRIO); + spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); + + return 0; +} + +/* Number of requests owned by the block driver for a given priority. */ +static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio) +{ + const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats; - return seq_list_next(v, &dd->dispatch, pos); + lockdep_assert_held(&dd->lock); + + return stats->dispatched + stats->merged - + atomic_read(&stats->completed); } -static void deadline_dispatch_stop(struct seq_file *m, void *v) - __releases(&dd->lock) +static int dd_owned_by_driver_show(void *data, struct seq_file *m) { - struct request_queue *q = m->private; + struct request_queue *q = data; struct deadline_data *dd = q->elevator->elevator_data; + u32 rt, be, idle; + spin_lock(&dd->lock); + rt = dd_owned_by_driver(dd, DD_RT_PRIO); + be = dd_owned_by_driver(dd, DD_BE_PRIO); + idle = dd_owned_by_driver(dd, DD_IDLE_PRIO); spin_unlock(&dd->lock); + + seq_printf(m, "%u %u %u\n", rt, be, idle); + + return 0; } -static const struct seq_operations deadline_dispatch_seq_ops = { - .start = deadline_dispatch_start, - .next = deadline_dispatch_next, - .stop = deadline_dispatch_stop, - .show = blk_mq_debugfs_rq_show, -}; +#define DEADLINE_DISPATCH_ATTR(prio) \ +static void *deadline_dispatch##prio##_start(struct seq_file *m, \ + loff_t *pos) \ + __acquires(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = 
q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + spin_lock(&dd->lock); \ + return seq_list_start(&per_prio->dispatch, *pos); \ +} \ + \ +static void *deadline_dispatch##prio##_next(struct seq_file *m, \ + void *v, loff_t *pos) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + struct dd_per_prio *per_prio = &dd->per_prio[prio]; \ + \ + return seq_list_next(v, &per_prio->dispatch, pos); \ +} \ + \ +static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \ + __releases(&dd->lock) \ +{ \ + struct request_queue *q = m->private; \ + struct deadline_data *dd = q->elevator->elevator_data; \ + \ + spin_unlock(&dd->lock); \ +} \ + \ +static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \ + .start = deadline_dispatch##prio##_start, \ + .next = deadline_dispatch##prio##_next, \ + .stop = deadline_dispatch##prio##_stop, \ + .show = blk_mq_debugfs_rq_show, \ +} + +DEADLINE_DISPATCH_ATTR(0); +DEADLINE_DISPATCH_ATTR(1); +DEADLINE_DISPATCH_ATTR(2); +#undef DEADLINE_DISPATCH_ATTR -#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ - {#name "_fifo_list", 0400, .seq_ops = &deadline_##name##_fifo_seq_ops}, \ +#define DEADLINE_QUEUE_DDIR_ATTRS(name) \ + {#name "_fifo_list", 0400, \ + .seq_ops = &deadline_##name##_fifo_seq_ops} +#define DEADLINE_NEXT_RQ_ATTR(name) \ {#name "_next_rq", 0400, deadline_##name##_next_rq_show} static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { - DEADLINE_QUEUE_DDIR_ATTRS(read), - DEADLINE_QUEUE_DDIR_ATTRS(write), + DEADLINE_QUEUE_DDIR_ATTRS(read0), + DEADLINE_QUEUE_DDIR_ATTRS(write0), + DEADLINE_QUEUE_DDIR_ATTRS(read1), + DEADLINE_QUEUE_DDIR_ATTRS(write1), + DEADLINE_QUEUE_DDIR_ATTRS(read2), + DEADLINE_QUEUE_DDIR_ATTRS(write2), + DEADLINE_NEXT_RQ_ATTR(read0), + DEADLINE_NEXT_RQ_ATTR(write0), + DEADLINE_NEXT_RQ_ATTR(read1), + DEADLINE_NEXT_RQ_ATTR(write1), + DEADLINE_NEXT_RQ_ATTR(read2), + 
DEADLINE_NEXT_RQ_ATTR(write2), {"batching", 0400, deadline_batching_show}, {"starved", 0400, deadline_starved_show}, - {"dispatch", 0400, .seq_ops = &deadline_dispatch_seq_ops}, + {"async_depth", 0400, dd_async_depth_show}, + {"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops}, + {"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops}, + {"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops}, + {"owned_by_driver", 0400, dd_owned_by_driver_show}, + {"queued", 0400, dd_queued_show}, {}, }; #undef DEADLINE_QUEUE_DDIR_ATTRS @@ -773,6 +1243,8 @@ static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = { static struct elevator_type mq_deadline = { .ops = { + .depth_updated = dd_depth_updated, + .limit_depth = dd_limit_depth, .insert_requests = dd_insert_requests, .dispatch_request = dd_dispatch_request, .prepare_request = dd_prepare_request, @@ -784,8 +1256,9 @@ static struct elevator_type mq_deadline = { .requests_merged = dd_merged_requests, .request_merged = dd_request_merged, .has_work = dd_has_work, - .init_sched = dd_init_queue, - .exit_sched = dd_exit_queue, + .init_sched = dd_init_sched, + .exit_sched = dd_exit_sched, + .init_hctx = dd_init_hctx, }, #ifdef CONFIG_BLK_DEBUG_FS @@ -812,6 +1285,6 @@ static void __exit deadline_exit(void) module_init(deadline_init); module_exit(deadline_exit); -MODULE_AUTHOR("Jens Axboe"); +MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche"); MODULE_LICENSE("GPL"); MODULE_DESCRIPTION("MQ deadline IO scheduler"); diff --git a/block/opal_proto.h b/block/opal_proto.h index b486b3ec7dc4..d247a457bf6e 100644 --- a/block/opal_proto.h +++ b/block/opal_proto.h @@ -39,7 +39,12 @@ enum opal_response_token { #define FIRST_TPER_SESSION_NUM 4096 #define TPER_SYNC_SUPPORTED 0x01 +/* FC_LOCKING features */ +#define LOCKING_SUPPORTED_MASK 0x01 +#define LOCKING_ENABLED_MASK 0x02 +#define LOCKED_MASK 0x04 #define MBR_ENABLED_MASK 0x10 +#define MBR_DONE_MASK 0x20 #define TINY_ATOM_DATA_MASK 0x3F #define 
TINY_ATOM_SIGNED 0x40 @@ -66,6 +71,7 @@ enum opal_response_token { #define SHORT_ATOM_BYTE 0xBF #define MEDIUM_ATOM_BYTE 0xDF #define LONG_ATOM_BYTE 0xE3 +#define EMPTY_ATOM_BYTE 0xFF #define OPAL_INVAL_PARAM 12 #define OPAL_MANUFACTURED_INACTIVE 0x08 @@ -81,6 +87,15 @@ enum opal_response_token { #define OPAL_MSID_KEYLEN 15 #define OPAL_UID_LENGTH_HALF 4 +/* + * Boolean operators from TCG Core spec 2.01 Section: + * 5.1.3.11 + * Table 61 + */ +#define OPAL_BOOLEAN_AND 0 +#define OPAL_BOOLEAN_OR 1 +#define OPAL_BOOLEAN_NOT 2 + /* Enum to index OPALUID array */ enum opal_uid { /* users */ @@ -100,6 +115,7 @@ enum opal_uid { /* tables */ OPAL_TABLE_TABLE, OPAL_LOCKINGRANGE_GLOBAL, + OPAL_LOCKINGRANGE_ACE_START_TO_KEY, OPAL_LOCKINGRANGE_ACE_RDLOCKED, OPAL_LOCKINGRANGE_ACE_WRLOCKED, OPAL_MBRCONTROL, @@ -210,6 +226,10 @@ enum opal_parameter { OPAL_SUM_SET_LIST = 0x060000, }; +enum opal_revertlsp { + OPAL_KEEP_GLOBAL_RANGE_KEY = 0x060000, +}; + /* Packets derived from: * TCG_Storage_Architecture_Core_Spec_v2.01_r1.00 * Secion: 3.2.3 ComPackets, Packets & Subpackets diff --git a/block/partitions/Kconfig b/block/partitions/Kconfig index 6e2a649669e5..7aff4eb81c60 100644 --- a/block/partitions/Kconfig +++ b/block/partitions/Kconfig @@ -2,6 +2,8 @@ # # Partition configuration # +menu "Partition Types" + config PARTITION_ADVANCED bool "Advanced partition selection" help @@ -264,7 +266,8 @@ config SYSV68_PARTITION config CMDLINE_PARTITION bool "Command line partition support" if PARTITION_ADVANCED - select BLK_CMDLINE_PARSER help Say Y here if you want to read the partition table from bootargs. The format for the command line is just like mtdparts. + +endmenu diff --git a/block/partitions/acorn.c b/block/partitions/acorn.c index c64c57b958bf..d2fc122d7426 100644 --- a/block/partitions/acorn.c +++ b/block/partitions/acorn.c @@ -275,20 +275,20 @@ int adfspart_check_ADFS(struct parsed_partitions *state) /* * Work out start of non-adfs partition. 
*/ - nr_sects = (state->bdev->bd_inode->i_size >> 9) - start_sect; + nr_sects = get_capacity(state->disk) - start_sect; if (start_sect) { switch (id) { #ifdef CONFIG_ACORN_PARTITION_RISCIX case PARTITION_RISCIX_SCSI: case PARTITION_RISCIX_MFM: - slot = riscix_partition(state, start_sect, slot, + riscix_partition(state, start_sect, slot, nr_sects); break; #endif case PARTITION_LINUX: - slot = linux_partition(state, start_sect, slot, + linux_partition(state, start_sect, slot, nr_sects); break; } @@ -540,7 +540,7 @@ int adfspart_check_EESOX(struct parsed_partitions *state) if (i != 0) { sector_t size; - size = get_capacity(state->bdev->bd_disk); + size = get_capacity(state->disk); put_partition(state, slot++, start, size - start); strlcat(state->pp_buf, "\n", PAGE_SIZE); } diff --git a/block/partitions/aix.c b/block/partitions/aix.c index c7b4fd1a4a97..85f4b967565e 100644 --- a/block/partitions/aix.c +++ b/block/partitions/aix.c @@ -67,29 +67,13 @@ struct pvd { #define LVM_MAXLVS 256 /** - * last_lba(): return number of last logical block of device - * @bdev: block device - * - * Description: Returns last LBA value on success, 0 on error. - * This is stored (by sd and ide-geometry) in - * the part[0] entry for this disk, and is the number of - * physical sectors available on the disk. - */ -static u64 last_lba(struct block_device *bdev) -{ - if (!bdev || !bdev->bd_inode) - return 0; - return (bdev->bd_inode->i_size >> 9) - 1ULL; -} - -/** * read_lba(): Read bytes from disk, starting at given LBA * @state * @lba * @buffer * @count * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. 
*/ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, @@ -97,7 +81,7 @@ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, { size_t totalreadcount = 0; - if (!buffer || lba + count / 512 > last_lba(state->bdev)) + if (!buffer || lba + count / 512 > get_capacity(state->disk) - 1ULL) return 0; while (count) { diff --git a/block/partitions/amiga.c b/block/partitions/amiga.c index 9526491d9aed..506921095412 100644 --- a/block/partitions/amiga.c +++ b/block/partitions/amiga.c @@ -11,10 +11,18 @@ #define pr_fmt(fmt) fmt #include <linux/types.h> +#include <linux/mm_types.h> +#include <linux/overflow.h> #include <linux/affs_hardblocks.h> #include "check.h" +/* magic offsets in partition DosEnvVec */ +#define NR_HD 3 +#define NR_SECT 5 +#define LO_CYL 9 +#define HI_CYL 10 + static __inline__ u32 checksum_block(__be32 *m, int size) { @@ -31,18 +39,21 @@ int amiga_partition(struct parsed_partitions *state) unsigned char *data; struct RigidDiskBlock *rdb; struct PartitionBlock *pb; - int start_sect, nr_sects, blk, part, res = 0; - int blksize = 1; /* Multiplier for disk block size */ + u64 start_sect, nr_sects; + sector_t blk, end_sect; + u32 cylblk; /* rdb_CylBlocks = nr_heads*sect_per_track */ + u32 nr_hd, nr_sect, lo_cyl, hi_cyl; + int part, res = 0; + unsigned int blksize = 1; /* Multiplier for disk block size */ int slot = 1; - char b[BDEVNAME_SIZE]; for (blk = 0; ; blk++, put_dev_sector(sect)) { if (blk == RDB_ALLOCATION_LIMIT) goto rdb_done; data = read_part_sector(state, blk, &sect); if (!data) { - pr_err("Dev %s: unable to read RDB block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read RDB block %llu\n", + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -58,13 +69,13 @@ int amiga_partition(struct parsed_partitions *state) *(__be32 *)(data+0xdc) = 0; if (checksum_block((__be32 *)data, be32_to_cpu(rdb->rdb_SummedLongs) & 0x7F)==0) { - pr_err("Trashed word at 0xd0 in block %d ignored in
checksum calculation\n", + pr_err("Trashed word at 0xd0 in block %llu ignored in checksum calculation\n", blk); break; } - pr_err("Dev %s: RDB in block %d has bad checksum\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: RDB in block %llu has bad checksum\n", + state->disk->disk_name, blk); } /* blksize is blocks per 512 byte standard block */ @@ -79,12 +90,17 @@ int amiga_partition(struct parsed_partitions *state) } blk = be32_to_cpu(rdb->rdb_PartitionList); put_dev_sector(sect); - for (part = 1; blk>0 && part<=16; part++, put_dev_sector(sect)) { - blk *= blksize; /* Read in terms partition table understands */ + for (part = 1; (s32) blk>0 && part<=16; part++, put_dev_sector(sect)) { + /* Read in terms partition table understands */ + if (check_mul_overflow(blk, (sector_t) blksize, &blk)) { + pr_err("Dev %s: overflow calculating partition block %llu! Skipping partitions %u and beyond\n", + state->disk->disk_name, blk, part); + break; + } data = read_part_sector(state, blk, &sect); if (!data) { - pr_err("Dev %s: unable to read partition block %d\n", - bdevname(state->bdev, b), blk); + pr_err("Dev %s: unable to read partition block %llu\n", + state->disk->disk_name, blk); res = -1; goto rdb_done; } @@ -95,19 +111,70 @@ int amiga_partition(struct parsed_partitions *state) if (checksum_block((__be32 *)pb, be32_to_cpu(pb->pb_SummedLongs) & 0x7F) != 0 ) continue; - /* Tell Kernel about it */ + /* RDB gives us more than enough rope to hang ourselves with, + * many times over (2^128 bytes if all fields max out). + * Some careful checks are in order, so check for potential + * overflows. + * We are multiplying four 32 bit numbers to one sector_t!
+ */ + + nr_hd = be32_to_cpu(pb->pb_Environment[NR_HD]); + nr_sect = be32_to_cpu(pb->pb_Environment[NR_SECT]); + + /* CylBlocks is total number of blocks per cylinder */ + if (check_mul_overflow(nr_hd, nr_sect, &cylblk)) { + pr_err("Dev %s: heads*sects %u overflows u32, skipping partition!\n", + state->disk->disk_name, cylblk); + continue; + } + + /* check for consistency with RDB defined CylBlocks */ + if (cylblk > be32_to_cpu(rdb->rdb_CylBlocks)) { + pr_warn("Dev %s: cylblk %u > rdb_CylBlocks %u!\n", + state->disk->disk_name, cylblk, + be32_to_cpu(rdb->rdb_CylBlocks)); + } + + /* RDB allows for variable logical block size - + * normalize to 512 byte blocks and check result. + */ + + if (check_mul_overflow(cylblk, blksize, &cylblk)) { + pr_err("Dev %s: partition %u bytes per cyl. overflows u32, skipping partition!\n", + state->disk->disk_name, part); + continue; + } + + /* Calculate partition start and end. Limit of 32 bit on cylblk + * guarantees no overflow occurs if LBD support is enabled. 
+ */ + + lo_cyl = be32_to_cpu(pb->pb_Environment[LO_CYL]); + start_sect = ((u64) lo_cyl * cylblk); + + hi_cyl = be32_to_cpu(pb->pb_Environment[HI_CYL]); + nr_sects = (((u64) hi_cyl - lo_cyl + 1) * cylblk); - nr_sects = (be32_to_cpu(pb->pb_Environment[10]) + 1 - - be32_to_cpu(pb->pb_Environment[9])) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; if (!nr_sects) continue; - start_sect = be32_to_cpu(pb->pb_Environment[9]) * - be32_to_cpu(pb->pb_Environment[3]) * - be32_to_cpu(pb->pb_Environment[5]) * - blksize; + + /* Warn user if partition end overflows u32 (AmigaDOS limit) */ + + if ((start_sect + nr_sects) > UINT_MAX) { + pr_warn("Dev %s: partition %u (%llu-%llu) needs 64 bit device support!\n", + state->disk->disk_name, part, + start_sect, start_sect + nr_sects); + } + + if (check_add_overflow(start_sect, nr_sects, &end_sect)) { + pr_err("Dev %s: partition %u (%llu-%llu) needs LBD device support, skipping partition!\n", + state->disk->disk_name, part, + start_sect, end_sect); + continue; + } + + /* Tell Kernel about it */ + put_partition(state,slot++,start_sect,nr_sects); { /* Be even more informative to aid mounting */ diff --git a/block/partitions/atari.c b/block/partitions/atari.c index 2305840c8522..9655c728262a 100644 --- a/block/partitions/atari.c +++ b/block/partitions/atari.c @@ -47,7 +47,7 @@ int atari_partition(struct parsed_partitions *state) * ATARI partition scheme supports 512 lba only. If this is not * the case, bail early to avoid miscalculating hd_size. 
*/ - if (bdev_logical_block_size(state->bdev) != 512) + if (queue_logical_block_size(state->disk->queue) != 512) return 0; rs = read_part_sector(state, 0, &sect); @@ -55,7 +55,7 @@ int atari_partition(struct parsed_partitions *state) return -1; /* Verify this is an Atari rootsector: */ - hd_size = state->bdev->bd_inode->i_size >> 9; + hd_size = get_capacity(state->disk); if (!VALID_PARTITION(&rs->part[0], hd_size) && !VALID_PARTITION(&rs->part[1], hd_size) && !VALID_PARTITION(&rs->part[2], hd_size) && @@ -140,7 +140,6 @@ int atari_partition(struct parsed_partitions *state) /* accept only GEM,BGM,RAW,LNX,SWP partitions */ if (!((pi->flg & 1) && OK_id(pi->id))) continue; - part_fmt = 2; put_partition (state, slot, be32_to_cpu(pi->st), be32_to_cpu(pi->siz)); diff --git a/block/partitions/check.h b/block/partitions/check.h index c577e9ee67f0..8d70a880c372 100644 --- a/block/partitions/check.h +++ b/block/partitions/check.h @@ -1,7 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ #include <linux/pagemap.h> #include <linux/blkdev.h> -#include <linux/genhd.h> #include "../blk.h" /* @@ -9,7 +8,7 @@ * description.
*/ struct parsed_partitions { - struct block_device *bdev; + struct gendisk *disk; char name[BDEVNAME_SIZE]; struct { sector_t from; @@ -25,13 +24,13 @@ struct parsed_partitions { }; typedef struct { - struct page *v; + struct folio *v; } Sector; void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p); static inline void put_dev_sector(Sector p) { - put_page(p.v); + folio_put(p.v); } static inline void diff --git a/block/partitions/cmdline.c b/block/partitions/cmdline.c index 8f545c36cde4..c03bc105e575 100644 --- a/block/partitions/cmdline.c +++ b/block/partitions/cmdline.c @@ -14,20 +14,245 @@ * For further information, see "Documentation/block/cmdline-partition.rst" * */ +#include <linux/blkdev.h> +#include <linux/fs.h> +#include <linux/slab.h> +#include "check.h" -#include <linux/cmdline-parser.h> -#include "check.h" +/* partition flags */ +#define PF_RDONLY 0x01 /* Device is read only */ +#define PF_POWERUP_LOCK 0x02 /* Always locked after reset */ + +struct cmdline_subpart { + char name[BDEVNAME_SIZE]; /* partition name, such as 'rootfs' */ + sector_t from; + sector_t size; + int flags; + struct cmdline_subpart *next_subpart; +}; + +struct cmdline_parts { + char name[BDEVNAME_SIZE]; /* block device, such as 'mmcblk0' */ + unsigned int nr_subparts; + struct cmdline_subpart *subpart; + struct cmdline_parts *next_parts; +}; + +static int parse_subpart(struct cmdline_subpart **subpart, char *partdef) +{ + int ret = 0; + struct cmdline_subpart *new_subpart; + + *subpart = NULL; + + new_subpart = kzalloc(sizeof(struct cmdline_subpart), GFP_KERNEL); + if (!new_subpart) + return -ENOMEM; + + if (*partdef == '-') { + new_subpart->size = (sector_t)(~0ULL); + partdef++; + } else { + new_subpart->size = (sector_t)memparse(partdef, &partdef); + if (new_subpart->size < (sector_t)PAGE_SIZE) { + pr_warn("cmdline partition size is invalid."); + ret = -EINVAL; + goto fail; + } + } + + if (*partdef == '@') { + partdef++; + new_subpart->from = 
(sector_t)memparse(partdef, &partdef); + } else { + new_subpart->from = (sector_t)(~0ULL); + } + + if (*partdef == '(') { + int length; + char *next = strchr(++partdef, ')'); + + if (!next) { + pr_warn("cmdline partition format is invalid."); + ret = -EINVAL; + goto fail; + } + + length = min_t(int, next - partdef, + sizeof(new_subpart->name) - 1); + strscpy(new_subpart->name, partdef, length); + + partdef = ++next; + } else + new_subpart->name[0] = '\0'; + + new_subpart->flags = 0; + + if (!strncmp(partdef, "ro", 2)) { + new_subpart->flags |= PF_RDONLY; + partdef += 2; + } + + if (!strncmp(partdef, "lk", 2)) { + new_subpart->flags |= PF_POWERUP_LOCK; + partdef += 2; + } + + *subpart = new_subpart; + return 0; +fail: + kfree(new_subpart); + return ret; +} + +static void free_subpart(struct cmdline_parts *parts) +{ + struct cmdline_subpart *subpart; + + while (parts->subpart) { + subpart = parts->subpart; + parts->subpart = subpart->next_subpart; + kfree(subpart); + } +} + +static int parse_parts(struct cmdline_parts **parts, const char *bdevdef) +{ + int ret = -EINVAL; + char *next; + int length; + struct cmdline_subpart **next_subpart; + struct cmdline_parts *newparts; + char buf[BDEVNAME_SIZE + 32 + 4]; + + *parts = NULL; + + newparts = kzalloc(sizeof(struct cmdline_parts), GFP_KERNEL); + if (!newparts) + return -ENOMEM; + + next = strchr(bdevdef, ':'); + if (!next) { + pr_warn("cmdline partition has no block device."); + goto fail; + } + + length = min_t(int, next - bdevdef, sizeof(newparts->name) - 1); + strscpy(newparts->name, bdevdef, length); + newparts->nr_subparts = 0; + + next_subpart = &newparts->subpart; + + while (next && *(++next)) { + bdevdef = next; + next = strchr(bdevdef, ','); + + length = (!next) ? 
(sizeof(buf) - 1) : + min_t(int, next - bdevdef, sizeof(buf) - 1); + + strscpy(buf, bdevdef, length); + + ret = parse_subpart(next_subpart, buf); + if (ret) + goto fail; + + newparts->nr_subparts++; + next_subpart = &(*next_subpart)->next_subpart; + } + + if (!newparts->subpart) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + *parts = newparts; + + return 0; +fail: + free_subpart(newparts); + kfree(newparts); + return ret; +} + +static void cmdline_parts_free(struct cmdline_parts **parts) +{ + struct cmdline_parts *next_parts; + + while (*parts) { + next_parts = (*parts)->next_parts; + free_subpart(*parts); + kfree(*parts); + *parts = next_parts; + } +} + +static int cmdline_parts_parse(struct cmdline_parts **parts, + const char *cmdline) +{ + int ret; + char *buf; + char *pbuf; + char *next; + struct cmdline_parts **next_parts; + + *parts = NULL; + + next = pbuf = buf = kstrdup(cmdline, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + next_parts = parts; + + while (next && *pbuf) { + next = strchr(pbuf, ';'); + if (next) + *next = '\0'; + + ret = parse_parts(next_parts, pbuf); + if (ret) + goto fail; + + if (next) + pbuf = ++next; + + next_parts = &(*next_parts)->next_parts; + } + + if (!*parts) { + pr_warn("cmdline partition has no valid partition."); + ret = -EINVAL; + goto fail; + } + + ret = 0; +done: + kfree(buf); + return ret; + +fail: + cmdline_parts_free(parts); + goto done; +} + +static struct cmdline_parts *cmdline_parts_find(struct cmdline_parts *parts, + const char *bdev) +{ + while (parts && strncmp(bdev, parts->name, sizeof(parts->name))) + parts = parts->next_parts; + return parts; +} static char *cmdline; static struct cmdline_parts *bdev_parts; -static int add_part(int slot, struct cmdline_subpart *subpart, void *param) +static int add_part(int slot, struct cmdline_subpart *subpart, + struct parsed_partitions *state) { int label_min; struct partition_meta_info *info; char tmp[sizeof(info->volname) + 
4]; - struct parsed_partitions *state = (struct parsed_partitions *)param; if (slot >= state->limit) return 1; @@ -39,8 +264,7 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) label_min = min_t(int, sizeof(info->volname) - 1, sizeof(subpart->name)); - strncpy(info->volname, subpart->name, label_min); - info->volname[label_min] = '\0'; + strscpy(info->volname, subpart->name, label_min); snprintf(tmp, sizeof(tmp), "(%s)", info->volname); strlcat(state->pp_buf, tmp, PAGE_SIZE); @@ -50,6 +274,35 @@ static int add_part(int slot, struct cmdline_subpart *subpart, void *param) return 0; } +static int cmdline_parts_set(struct cmdline_parts *parts, sector_t disk_size, + struct parsed_partitions *state) +{ + sector_t from = 0; + struct cmdline_subpart *subpart; + int slot = 1; + + for (subpart = parts->subpart; subpart; + subpart = subpart->next_subpart, slot++) { + if (subpart->from == (sector_t)(~0ULL)) + subpart->from = from; + else + from = subpart->from; + + if (from >= disk_size) + break; + + if (subpart->size > (disk_size - from)) + subpart->size = disk_size - from; + + from += subpart->size; + + if (add_part(slot, subpart, state)) + break; + } + + return slot; +} + static int __init cmdline_parts_setup(char *s) { cmdline = s; @@ -123,7 +376,6 @@ static void cmdline_parts_verifier(int slot, struct parsed_partitions *state) int cmdline_partition(struct parsed_partitions *state) { sector_t disk_size; - char bdev[BDEVNAME_SIZE]; struct cmdline_parts *parts; if (cmdline) { @@ -140,14 +392,13 @@ int cmdline_partition(struct parsed_partitions *state) if (!bdev_parts) return 0; - bdevname(state->bdev, bdev); - parts = cmdline_parts_find(bdev_parts, bdev); + parts = cmdline_parts_find(bdev_parts, state->disk->disk_name); if (!parts) return 0; - disk_size = get_capacity(state->bdev->bd_disk) << 9; + disk_size = get_capacity(state->disk) << 9; - cmdline_parts_set(parts, disk_size, 1, add_part, (void *)state); + cmdline_parts_set(parts, disk_size, 
state); cmdline_parts_verifier(1, state); strlcat(state->pp_buf, "\n", PAGE_SIZE); diff --git a/block/partitions/core.c b/block/partitions/core.c index 78951e33b2d7..5f5ed5c75f04 100644 --- a/block/partitions/core.c +++ b/block/partitions/core.c @@ -2,17 +2,17 @@ /* * Copyright (C) 1991-1998 Linus Torvalds * Re-organised Feb 1998 Russell King + * Copyright (C) 2020 Christoph Hellwig */ #include <linux/fs.h> +#include <linux/major.h> #include <linux/slab.h> #include <linux/ctype.h> -#include <linux/genhd.h> #include <linux/vmalloc.h> -#include <linux/blktrace_api.h> #include <linux/raid/detect.h> #include "check.h" -static int (*check_part[])(struct parsed_partitions *) = { +static int (*const check_part[])(struct parsed_partitions *) = { /* * Probe partition formats with tables at disk address 0 * that also have an ADFS boot block at 0xdc0. @@ -88,13 +88,12 @@ static int (*check_part[])(struct parsed_partitions *) = { static struct parsed_partitions *allocate_partitions(struct gendisk *hd) { struct parsed_partitions *state; - int nr; + int nr = DISK_MAX_PARTS; state = kzalloc(sizeof(*state), GFP_KERNEL); if (!state) return NULL; - nr = disk_max_parts(hd); state->parts = vzalloc(array_size(nr, sizeof(state->parts[0]))); if (!state->parts) { kfree(state); @@ -112,8 +111,7 @@ static void free_partitions(struct parsed_partitions *state) kfree(state); } -static struct parsed_partitions *check_partition(struct gendisk *hd, - struct block_device *bdev) +static struct parsed_partitions *check_partition(struct gendisk *hd) { struct parsed_partitions *state; int i, res, err; @@ -128,8 +126,8 @@ static struct parsed_partitions *check_partition(struct gendisk *hd, } state->pp_buf[0] = '\0'; - state->bdev = bdev; - disk_name(hd, 0, state->name); + state->disk = hd; + snprintf(state->name, BDEVNAME_SIZE, "%s", hd->disk_name); snprintf(state->pp_buf, PAGE_SIZE, " %s:", state->name); if (isdigit(state->name[strlen(state->name)-1])) sprintf(state->name, "p"); @@ -175,38 +173,31 @@ 
static struct parsed_partitions *check_partition(struct gendisk *hd, static ssize_t part_partition_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%d\n", p->partno); + return sprintf(buf, "%d\n", dev_to_bdev(dev)->bd_partno); } static ssize_t part_start_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - - return sprintf(buf, "%llu\n",(unsigned long long)p->start_sect); + return sprintf(buf, "%llu\n", dev_to_bdev(dev)->bd_start_sect); } static ssize_t part_ro_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%d\n", p->policy ? 1 : 0); + return sprintf(buf, "%d\n", bdev_read_only(dev_to_bdev(dev))); } static ssize_t part_alignment_offset_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%llu\n", (unsigned long long)p->alignment_offset); + return sprintf(buf, "%u\n", bdev_alignment_offset(dev_to_bdev(dev))); } static ssize_t part_discard_alignment_show(struct device *dev, struct device_attribute *attr, char *buf) { - struct hd_struct *p = dev_to_part(dev); - return sprintf(buf, "%u\n", p->discard_alignment); + return sprintf(buf, "%u\n", bdev_discard_alignment(dev_to_bdev(dev))); } static DEVICE_ATTR(partition, 0444, part_partition_show, NULL); @@ -237,7 +228,7 @@ static struct attribute *part_attrs[] = { NULL }; -static struct attribute_group part_attr_group = { +static const struct attribute_group part_attr_group = { .attrs = part_attrs, }; @@ -251,87 +242,36 @@ static const struct attribute_group *part_attr_groups[] = { static void part_release(struct device *dev) { - struct hd_struct *p = dev_to_part(dev); - blk_free_devt(dev->devt); - hd_free_part(p); - kfree(p); + put_disk(dev_to_bdev(dev)->bd_disk); + iput(dev_to_bdev(dev)->bd_inode); } -static int 
part_uevent(struct device *dev, struct kobj_uevent_env *env) +static int part_uevent(const struct device *dev, struct kobj_uevent_env *env) { - struct hd_struct *part = dev_to_part(dev); + const struct block_device *part = dev_to_bdev(dev); - add_uevent_var(env, "PARTN=%u", part->partno); - if (part->info && part->info->volname[0]) - add_uevent_var(env, "PARTNAME=%s", part->info->volname); + add_uevent_var(env, "PARTN=%u", part->bd_partno); + if (part->bd_meta_info && part->bd_meta_info->volname[0]) + add_uevent_var(env, "PARTNAME=%s", part->bd_meta_info->volname); return 0; } -struct device_type part_type = { +const struct device_type part_type = { .name = "partition", .groups = part_attr_groups, .release = part_release, .uevent = part_uevent, }; -static void hd_struct_free_work(struct work_struct *work) -{ - struct hd_struct *part = - container_of(to_rcu_work(work), struct hd_struct, rcu_work); - - part->start_sect = 0; - part->nr_sects = 0; - part_stat_set_all(part, 0); - put_device(part_to_dev(part)); -} - -static void hd_struct_free(struct percpu_ref *ref) -{ - struct hd_struct *part = container_of(ref, struct hd_struct, ref); - struct gendisk *disk = part_to_disk(part); - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); - - rcu_assign_pointer(ptbl->last_lookup, NULL); - put_device(disk_to_dev(disk)); - - INIT_RCU_WORK(&part->rcu_work, hd_struct_free_work); - queue_rcu_work(system_wq, &part->rcu_work); -} - -int hd_ref_init(struct hd_struct *part) -{ - if (percpu_ref_init(&part->ref, hd_struct_free, 0, GFP_KERNEL)) - return -ENOMEM; - return 0; -} - -/* - * Must be called either with bd_mutex held, before a disk can be opened or - * after all disk users are gone. 
- */ -void delete_partition(struct gendisk *disk, struct hd_struct *part) +void drop_partition(struct block_device *part) { - struct disk_part_tbl *ptbl = - rcu_dereference_protected(disk->part_tbl, 1); + lockdep_assert_held(&part->bd_disk->open_mutex); - /* - * ->part_tbl is referenced in this part's release handler, so - * we have to hold the disk device - */ - get_device(disk_to_dev(part_to_disk(part))); - rcu_assign_pointer(ptbl->part[part->partno], NULL); - kobject_put(part->holder_dir); - device_del(part_to_dev(part)); + xa_erase(&part->bd_disk->part_tbl, part->bd_partno); + kobject_put(part->bd_holder_dir); - /* - * Remove gendisk pointer from idr so that it cannot be looked up - * while RCU period before freeing gendisk is running to prevent - * use-after-free issues. Note that the device number stays - * "in-use" until we really free the gendisk. - */ - blk_invalidate_devt(part_devt(part)); - percpu_ref_kill(&part->ref); + device_del(&part->bd_device); + put_device(&part->bd_device); } static ssize_t whole_disk_show(struct device *dev, @@ -339,84 +279,53 @@ static ssize_t whole_disk_show(struct device *dev, { return 0; } -static DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); +static const DEVICE_ATTR(whole_disk, 0444, whole_disk_show, NULL); /* - * Must be called either with bd_mutex held, before a disk can be opened or + * Must be called either with open_mutex held, before a disk can be opened or * after all disk users are gone. 
*/ -static struct hd_struct *add_partition(struct gendisk *disk, int partno, +static struct block_device *add_partition(struct gendisk *disk, int partno, sector_t start, sector_t len, int flags, struct partition_meta_info *info) { - struct hd_struct *p; dev_t devt = MKDEV(0, 0); struct device *ddev = disk_to_dev(disk); struct device *pdev; - struct disk_part_tbl *ptbl; + struct block_device *bdev; const char *dname; int err; + lockdep_assert_held(&disk->open_mutex); + + if (partno >= DISK_MAX_PARTS) + return ERR_PTR(-EINVAL); + /* * Partitions are not supported on zoned block devices that are used as * such. */ - switch (disk->queue->limits.zoned) { - case BLK_ZONED_HM: + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: partitions not supported on host managed zoned block device\n", disk->disk_name); return ERR_PTR(-ENXIO); - case BLK_ZONED_HA: - pr_info("%s: disabling host aware zoned block device support due to partitions\n", - disk->disk_name); - disk->queue->limits.zoned = BLK_ZONED_NONE; - break; - case BLK_ZONED_NONE: - break; } - err = disk_expand_part_tbl(disk, partno); - if (err) - return ERR_PTR(err); - ptbl = rcu_dereference_protected(disk->part_tbl, 1); - - if (ptbl->part[partno]) - return ERR_PTR(-EBUSY); - - p = kzalloc(sizeof(*p), GFP_KERNEL); - if (!p) + if (xa_load(&disk->part_tbl, partno)) return ERR_PTR(-EBUSY); - p->dkstats = alloc_percpu(struct disk_stats); - if (!p->dkstats) { - err = -ENOMEM; - goto out_free; - } - - hd_sects_seq_init(p); - pdev = part_to_dev(p); + /* ensure we always have a reference to the whole disk */ + get_device(disk_to_dev(disk)); - p->start_sect = start; - p->alignment_offset = - queue_limit_alignment_offset(&disk->queue->limits, start); - p->discard_alignment = - queue_limit_discard_alignment(&disk->queue->limits, start); - p->nr_sects = len; - p->partno = partno; - p->policy = get_disk_ro(disk); - - if (info) { - struct partition_meta_info *pinfo; + err = -ENOMEM; + bdev = bdev_alloc(disk, partno); + if (!bdev) + goto 
out_put_disk; - pinfo = kzalloc_node(sizeof(*pinfo), GFP_KERNEL, disk->node_id); - if (!pinfo) { - err = -ENOMEM; - goto out_free_stats; - } - memcpy(pinfo, info, sizeof(*info)); - p->info = pinfo; - } + bdev->bd_start_sect = start; + bdev_set_nr_sectors(bdev, len); + pdev = &bdev->bd_device; dname = dev_name(ddev); if (isdigit(dname[strlen(dname) - 1])) dev_set_name(pdev, "%sp%d", dname, partno); @@ -428,11 +337,24 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, pdev->type = &part_type; pdev->parent = ddev; - err = blk_alloc_devt(p, &devt); - if (err) - goto out_free_info; + /* in consecutive minor range? */ + if (bdev->bd_partno < disk->minors) { + devt = MKDEV(disk->major, disk->first_minor + bdev->bd_partno); + } else { + err = blk_alloc_ext_minor(); + if (err < 0) + goto out_put; + devt = MKDEV(BLOCK_EXT_MAJOR, err); + } pdev->devt = devt; + if (info) { + err = -ENOMEM; + bdev->bd_meta_info = kmemdup(info, sizeof(*info), GFP_KERNEL); + if (!bdev->bd_meta_info) + goto out_put; + } + /* delay uevent until 'holders' subdir is created */ dev_set_uevent_suppress(pdev, 1); err = device_add(pdev); @@ -440,8 +362,8 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_put; err = -ENOMEM; - p->holder_dir = kobject_create_and_add("holders", &pdev->kobj); - if (!p->holder_dir) + bdev->bd_holder_dir = kobject_create_and_add("holders", &pdev->kobj); + if (!bdev->bd_holder_dir) goto out_del; dev_set_uevent_suppress(pdev, 0); @@ -451,200 +373,168 @@ static struct hd_struct *add_partition(struct gendisk *disk, int partno, goto out_del; } - err = hd_ref_init(p); - if (err) { - if (flags & ADDPART_FLAG_WHOLEDISK) - goto out_remove_file; - goto out_del; - } - /* everything is up and running, commence */ - rcu_assign_pointer(ptbl->part[partno], p); + err = xa_insert(&disk->part_tbl, partno, bdev, GFP_KERNEL); + if (err) + goto out_del; + bdev_add(bdev, devt); /* suppress uevent if the disk suppresses it */ if 
(!dev_get_uevent_suppress(ddev)) kobject_uevent(&pdev->kobj, KOBJ_ADD); - return p; - -out_free_info: - kfree(p->info); -out_free_stats: - free_percpu(p->dkstats); -out_free: - kfree(p); - return ERR_PTR(err); -out_remove_file: - device_remove_file(pdev, &dev_attr_whole_disk); + return bdev; + out_del: - kobject_put(p->holder_dir); + kobject_put(bdev->bd_holder_dir); device_del(pdev); out_put: put_device(pdev); return ERR_PTR(err); +out_put_disk: + put_disk(disk); + return ERR_PTR(err); } static bool partition_overlaps(struct gendisk *disk, sector_t start, sector_t length, int skip_partno) { - struct disk_part_iter piter; - struct hd_struct *part; + struct block_device *part; bool overlap = false; - - disk_part_iter_init(&piter, disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) { - if (part->partno == skip_partno || - start >= part->start_sect + part->nr_sects || - start + length <= part->start_sect) - continue; - overlap = true; - break; + unsigned long idx; + + rcu_read_lock(); + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + if (part->bd_partno != skip_partno && + start < part->bd_start_sect + bdev_nr_sectors(part) && + start + length > part->bd_start_sect) { + overlap = true; + break; + } } + rcu_read_unlock(); - disk_part_iter_exit(&piter); return overlap; } -int bdev_add_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_add_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { - struct hd_struct *part; + sector_t capacity = get_capacity(disk), end; + struct block_device *part; + int ret; - mutex_lock(&bdev->bd_mutex); - if (partition_overlaps(bdev->bd_disk, start, length, -1)) { - mutex_unlock(&bdev->bd_mutex); - return -EBUSY; + mutex_lock(&disk->open_mutex); + if (check_add_overflow(start, length, &end)) { + ret = -EINVAL; + goto out; + } + + if (start >= capacity || end > capacity) { + ret = -EINVAL; + goto out; } - part = add_partition(bdev->bd_disk, 
partno, start, length, + if (!disk_live(disk)) { + ret = -ENXIO; + goto out; + } + + if (disk->flags & GENHD_FL_NO_PART) { + ret = -EINVAL; + goto out; + } + + if (partition_overlaps(disk, start, length, -1)) { + ret = -EBUSY; + goto out; + } + + part = add_partition(disk, partno, start, length, ADDPART_FLAG_NONE, NULL); - mutex_unlock(&bdev->bd_mutex); - return PTR_ERR_OR_ZERO(part); + ret = PTR_ERR_OR_ZERO(part); +out: + mutex_unlock(&disk->open_mutex); + return ret; } -int bdev_del_partition(struct block_device *bdev, int partno) +int bdev_del_partition(struct gendisk *disk, int partno) { - struct block_device *bdevp; - struct hd_struct *part; - int ret = 0; + struct block_device *part = NULL; + int ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) - return -ENXIO; - - ret = -ENOMEM; - bdevp = bdget(part_devt(part)); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); + goto out_unlock; ret = -EBUSY; - if (bdevp->bd_openers) + if (atomic_read(&part->bd_openers)) goto out_unlock; - sync_blockdev(bdevp); - invalidate_bdev(bdevp); - - mutex_lock_nested(&bdev->bd_mutex, 1); - delete_partition(bdev->bd_disk, part); - mutex_unlock(&bdev->bd_mutex); + /* + * We verified that @part->bd_openers is zero above and so + * @part->bd_holder{_ops} can't be set. And since we hold + * @disk->open_mutex the device can't be claimed by anyone. + * + * So no need to call @part->bd_holder_ops->mark_dead() here. + * Just delete the partition and invalidate it. 
+ */ + remove_inode_hash(part->bd_inode); + invalidate_bdev(part); + drop_partition(part); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + mutex_unlock(&disk->open_mutex); return ret; } -int bdev_resize_partition(struct block_device *bdev, int partno, - sector_t start, sector_t length) +int bdev_resize_partition(struct gendisk *disk, int partno, sector_t start, + sector_t length) { - struct block_device *bdevp; - struct hd_struct *part; - int ret = 0; + struct block_device *part = NULL; + int ret = -ENXIO; - part = disk_get_part(bdev->bd_disk, partno); + mutex_lock(&disk->open_mutex); + part = xa_load(&disk->part_tbl, partno); if (!part) - return -ENXIO; - - ret = -ENOMEM; - bdevp = bdget(part_devt(part)); - if (!bdevp) - goto out_put_part; - - mutex_lock(&bdevp->bd_mutex); - mutex_lock_nested(&bdev->bd_mutex, 1); + goto out_unlock; ret = -EINVAL; - if (start != part->start_sect) + if (start != part->bd_start_sect) goto out_unlock; ret = -EBUSY; - if (partition_overlaps(bdev->bd_disk, start, length, partno)) + if (partition_overlaps(disk, start, length, partno)) goto out_unlock; - part_nr_sects_write(part, (sector_t)length); - i_size_write(bdevp->bd_inode, length << SECTOR_SHIFT); + bdev_set_nr_sectors(part, length); ret = 0; out_unlock: - mutex_unlock(&bdevp->bd_mutex); - mutex_unlock(&bdev->bd_mutex); - bdput(bdevp); -out_put_part: - disk_put_part(part); + mutex_unlock(&disk->open_mutex); return ret; } static bool disk_unlock_native_capacity(struct gendisk *disk) { - const struct block_device_operations *bdops = disk->fops; - - if (bdops->unlock_native_capacity && - !(disk->flags & GENHD_FL_NATIVE_CAPACITY)) { - printk(KERN_CONT "enabling native capacity\n"); - bdops->unlock_native_capacity(disk); - disk->flags |= GENHD_FL_NATIVE_CAPACITY; - return true; - } else { + if (!disk->fops->unlock_native_capacity || + test_and_set_bit(GD_NATIVE_CAPACITY, &disk->state)) { printk(KERN_CONT "truncated\n"); 
return false; } -} - -int blk_drop_partitions(struct block_device *bdev) -{ - struct disk_part_iter piter; - struct hd_struct *part; - - if (!disk_part_scan_enabled(bdev->bd_disk)) - return 0; - if (bdev->bd_part_count) - return -EBUSY; - sync_blockdev(bdev); - invalidate_bdev(bdev); - - disk_part_iter_init(&piter, bdev->bd_disk, DISK_PITER_INCL_EMPTY); - while ((part = disk_part_iter_next(&piter))) - delete_partition(bdev->bd_disk, part); - disk_part_iter_exit(&piter); - - return 0; + printk(KERN_CONT "enabling native capacity\n"); + disk->fops->unlock_native_capacity(disk); + return true; } -#ifdef CONFIG_S390 -/* for historic reasons in the DASD driver */ -EXPORT_SYMBOL_GPL(blk_drop_partitions); -#endif -static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, +static bool blk_add_partition(struct gendisk *disk, struct parsed_partitions *state, int p) { sector_t size = state->parts[p].size; sector_t from = state->parts[p].from; - struct hd_struct *part; + struct block_device *part; if (!size) return true; @@ -677,27 +567,30 @@ static bool blk_add_partition(struct gendisk *disk, struct block_device *bdev, part = add_partition(disk, p, from, size, state->parts[p].flags, &state->parts[p].info); if (IS_ERR(part) && PTR_ERR(part) != -ENXIO) { - printk(KERN_ERR " %s: p%d could not be added: %ld\n", - disk->disk_name, p, -PTR_ERR(part)); + printk(KERN_ERR " %s: p%d could not be added: %pe\n", + disk->disk_name, p, part); return true; } if (IS_BUILTIN(CONFIG_BLK_DEV_MD) && (state->parts[p].flags & ADDPART_FLAG_RAID)) - md_autodetect_dev(part_to_dev(part)->devt); + md_autodetect_dev(part->bd_dev); return true; } -int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) +static int blk_add_partitions(struct gendisk *disk) { struct parsed_partitions *state; - int ret = -EAGAIN, p, highest; + int ret = -EAGAIN, p; + + if (disk->flags & GENHD_FL_NO_PART) + return 0; - if (!disk_part_scan_enabled(disk)) + if 
(test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) return 0; - state = check_partition(disk, bdev); + state = check_partition(disk); if (!state) return 0; if (IS_ERR(state)) { @@ -717,7 +610,7 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* * Partitions are not supported on host managed zoned block devices. */ - if (disk->queue->limits.zoned == BLK_ZONED_HM) { + if (bdev_is_zoned(disk->part0)) { pr_warn("%s: ignoring partition table on host managed zoned block device\n", disk->disk_name); ret = 0; @@ -740,17 +633,8 @@ int blk_add_partitions(struct gendisk *disk, struct block_device *bdev) /* tell userspace that the media / partition table may have changed */ kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); - /* - * Detect the highest partition number and preallocate disk->part_tbl. - * This is an optimization and not strictly necessary. - */ - for (p = 1, highest = 0; p < state->limit; p++) - if (state->parts[p].size) - highest = p; - disk_expand_part_tbl(disk, highest); - for (p = 1; p < state->limit; p++) - if (!blk_add_partition(disk, bdev, state, p)) + if (!blk_add_partition(disk, state, p)) goto out_free_state; ret = 0; @@ -759,28 +643,92 @@ out_free_state: return ret; } +int bdev_disk_changed(struct gendisk *disk, bool invalidate) +{ + struct block_device *part; + unsigned long idx; + int ret = 0; + + lockdep_assert_held(&disk->open_mutex); + + if (!disk_live(disk)) + return -ENXIO; + +rescan: + if (disk->open_partitions) + return -EBUSY; + sync_blockdev(disk->part0); + invalidate_bdev(disk->part0); + + xa_for_each_start(&disk->part_tbl, idx, part, 1) { + /* + * Remove the block device from the inode hash, so that + * it cannot be looked up any more even when openers + * still hold references. + */ + remove_inode_hash(part->bd_inode); + + /* + * If @disk->open_partitions isn't elevated but there's + * still an active holder of that block device things + * are broken. 
+ */ + WARN_ON_ONCE(atomic_read(&part->bd_openers)); + invalidate_bdev(part); + drop_partition(part); + } + clear_bit(GD_NEED_PART_SCAN, &disk->state); + + /* + * Historically we only set the capacity to zero for devices that + * support partitions (independ of actually having partitions created). + * Doing that is rather inconsistent, but changing it broke legacy + * udisks polling for legacy ide-cdrom devices. Use the crude check + * below to get the sane behavior for most device while not breaking + * userspace for this particular setup. + */ + if (invalidate) { + if (!(disk->flags & GENHD_FL_NO_PART) || + !(disk->flags & GENHD_FL_REMOVABLE)) + set_capacity(disk, 0); + } + + if (get_capacity(disk)) { + ret = blk_add_partitions(disk); + if (ret == -EAGAIN) + goto rescan; + } else if (invalidate) { + /* + * Tell userspace that the media / partition table may have + * changed. + */ + kobject_uevent(&disk_to_dev(disk)->kobj, KOBJ_CHANGE); + } + + return ret; +} +/* + * Only exported for loop and dasd for historic reasons. Don't use in new + * code! 
+ */ +EXPORT_SYMBOL_GPL(bdev_disk_changed); + void *read_part_sector(struct parsed_partitions *state, sector_t n, Sector *p) { - struct address_space *mapping = state->bdev->bd_inode->i_mapping; - struct page *page; + struct address_space *mapping = state->disk->part0->bd_inode->i_mapping; + struct folio *folio; - if (n >= get_capacity(state->bdev->bd_disk)) { + if (n >= get_capacity(state->disk)) { state->access_beyond_eod = true; - return NULL; + goto out; } - page = read_mapping_page(mapping, - (pgoff_t)(n >> (PAGE_SHIFT - 9)), NULL); - if (IS_ERR(page)) + folio = read_mapping_folio(mapping, n >> PAGE_SECTORS_SHIFT, NULL); + if (IS_ERR(folio)) goto out; - if (PageError(page)) - goto out_put_page; - - p->v = page; - return (unsigned char *)page_address(page) + - ((n & ((1 << (PAGE_SHIFT - 9)) - 1)) << SECTOR_SHIFT); -out_put_page: - put_page(page); + + p->v = folio; + return folio_address(folio) + offset_in_folio(folio, n * SECTOR_SIZE); out: p->v = NULL; return NULL; diff --git a/block/partitions/efi.c b/block/partitions/efi.c index b64bfdd4326c..5e9be13a56a8 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -124,19 +124,17 @@ efi_crc32(const void *buf, unsigned long len) /** * last_lba(): return number of last logical block of device - * @bdev: block device + * @disk: block device * * Description: Returns last LBA value on success, 0 on error. * This is stored (by sd and ide-geometry) in * the part[0] entry for this disk, and is the number of * physical sectors available on the disk. 
*/ -static u64 last_lba(struct block_device *bdev) +static u64 last_lba(struct gendisk *disk) { - if (!bdev || !bdev->bd_inode) - return 0; - return div_u64(bdev->bd_inode->i_size, - bdev_logical_block_size(bdev)) - 1ULL; + return div_u64(bdev_nr_bytes(disk->part0), + queue_logical_block_size(disk->queue)) - 1ULL; } static inline int pmbr_part_valid(gpt_mbr_record *part) @@ -231,17 +229,17 @@ done: * @buffer: destination buffer * @count: bytes to read * - * Description: Reads @count bytes from @state->bdev into @buffer. + * Description: Reads @count bytes from @state->disk into @buffer. * Returns number of bytes read on success, 0 on error. */ static size_t read_lba(struct parsed_partitions *state, u64 lba, u8 *buffer, size_t count) { size_t totalreadcount = 0; - struct block_device *bdev = state->bdev; - sector_t n = lba * (bdev_logical_block_size(bdev) / 512); + sector_t n = lba * + (queue_logical_block_size(state->disk->queue) / 512); - if (!buffer || lba > last_lba(bdev)) + if (!buffer || lba > last_lba(state->disk)) return 0; while (count) { @@ -302,14 +300,14 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, * @lba: the Logical Block Address of the partition table * * Description: returns GPT header on success, NULL on error. Allocates - * and fills a GPT header starting at @ from @state->bdev. + * and fills a GPT header starting at @ from @state->disk. * Note: remember to free gpt when finished with it. 
*/ static gpt_header *alloc_read_gpt_header(struct parsed_partitions *state, u64 lba) { gpt_header *gpt; - unsigned ssz = bdev_logical_block_size(state->bdev); + unsigned ssz = queue_logical_block_size(state->disk->queue); gpt = kmalloc(ssz, GFP_KERNEL); if (!gpt) @@ -356,10 +354,10 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the GUID Partition Table header size is too big */ if (le32_to_cpu((*gpt)->header_size) > - bdev_logical_block_size(state->bdev)) { + queue_logical_block_size(state->disk->queue)) { pr_debug("GUID Partition Table Header size is too large: %u > %u\n", le32_to_cpu((*gpt)->header_size), - bdev_logical_block_size(state->bdev)); + queue_logical_block_size(state->disk->queue)); goto fail; } @@ -395,7 +393,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, /* Check the first_usable_lba and last_usable_lba are * within the disk. */ - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (le64_to_cpu((*gpt)->first_usable_lba) > lastlba) { pr_debug("GPT: first_usable_lba incorrect: %lld > %lld\n", (unsigned long long)le64_to_cpu((*gpt)->first_usable_lba), @@ -587,13 +585,15 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, gpt_header *pgpt = NULL, *agpt = NULL; gpt_entry *pptes = NULL, *aptes = NULL; legacy_mbr *legacymbr; - sector_t total_sectors = i_size_read(state->bdev->bd_inode) >> 9; + struct gendisk *disk = state->disk; + const struct block_device_operations *fops = disk->fops; + sector_t total_sectors = get_capacity(state->disk); u64 lastlba; if (!ptes) return 0; - lastlba = last_lba(state->bdev); + lastlba = last_lba(state->disk); if (!force_gpt) { /* This will be added to the EFI Spec. per Intel after v1.02. 
*/ legacymbr = kzalloc(sizeof(*legacymbr), GFP_KERNEL); @@ -621,6 +621,16 @@ static int find_valid_gpt(struct parsed_partitions *state, gpt_header **gpt, if (!good_agpt && force_gpt) good_agpt = is_gpt_valid(state, lastlba, &agpt, &aptes); + if (!good_agpt && force_gpt && fops->alternative_gpt_sector) { + sector_t agpt_sector; + int err; + + err = fops->alternative_gpt_sector(disk, &agpt_sector); + if (!err) + good_agpt = is_gpt_valid(state, agpt_sector, + &agpt, &aptes); + } + /* The obviously unsuccessful case */ if (!good_pgpt && !good_agpt) goto fail; @@ -682,7 +692,7 @@ static void utf16_le_to_7bit(const __le16 *in, unsigned int size, u8 *out) } /** - * efi_partition(struct parsed_partitions *state) + * efi_partition - scan for GPT partitions * @state: disk parsed partitions * * Description: called from check.c, if the disk contains GPT @@ -705,7 +715,7 @@ int efi_partition(struct parsed_partitions *state) gpt_header *gpt = NULL; gpt_entry *ptes = NULL; u32 i; - unsigned ssz = bdev_logical_block_size(state->bdev) / 512; + unsigned ssz = queue_logical_block_size(state->disk->queue) / 512; if (!find_valid_gpt(state, &gpt, &ptes) || !gpt || !ptes) { kfree(gpt); @@ -722,7 +732,7 @@ int efi_partition(struct parsed_partitions *state) u64 size = le64_to_cpu(ptes[i].ending_lba) - le64_to_cpu(ptes[i].starting_lba) + 1ULL; - if (!is_pte_valid(&ptes[i], last_lba(state->bdev))) + if (!is_pte_valid(&ptes[i], last_lba(state->disk))) continue; put_partition(state, i+1, start * ssz, size * ssz); diff --git a/block/partitions/efi.h b/block/partitions/efi.h index 8cc2b88d0aa8..84b9f36b9e47 100644 --- a/block/partitions/efi.h +++ b/block/partitions/efi.h @@ -13,7 +13,6 @@ #include <linux/types.h> #include <linux/fs.h> -#include <linux/genhd.h> #include <linux/kernel.h> #include <linux/major.h> #include <linux/string.h> diff --git a/block/partitions/ibm.c b/block/partitions/ibm.c index d6e18df9c53c..82d9c4c3fb41 100644 --- a/block/partitions/ibm.c +++ b/block/partitions/ibm.c @@ 
-61,6 +61,47 @@ static sector_t cchhb2blk(struct vtoc_cchhb *ptr, struct hd_geometry *geo) ptr->b; } +/* Volume Label Type/ID Length */ +#define DASD_VOL_TYPE_LEN 4 +#define DASD_VOL_ID_LEN 6 + +/* Volume Label Types */ +#define DASD_VOLLBL_TYPE_VOL1 0 +#define DASD_VOLLBL_TYPE_LNX1 1 +#define DASD_VOLLBL_TYPE_CMS1 2 + +struct dasd_vollabel { + char *type; + int idx; +}; + +static struct dasd_vollabel dasd_vollabels[] = { + [DASD_VOLLBL_TYPE_VOL1] = { + .type = "VOL1", + .idx = DASD_VOLLBL_TYPE_VOL1, + }, + [DASD_VOLLBL_TYPE_LNX1] = { + .type = "LNX1", + .idx = DASD_VOLLBL_TYPE_LNX1, + }, + [DASD_VOLLBL_TYPE_CMS1] = { + .type = "CMS1", + .idx = DASD_VOLLBL_TYPE_CMS1, + }, +}; + +static int get_label_by_type(const char *type) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(dasd_vollabels); i++) { + if (!memcmp(type, dasd_vollabels[i].type, DASD_VOL_TYPE_LEN)) + return dasd_vollabels[i].idx; + } + + return -1; +} + static int find_label(struct parsed_partitions *state, dasd_information2_t *info, struct hd_geometry *geo, @@ -70,12 +111,10 @@ static int find_label(struct parsed_partitions *state, char type[], union label_t *label) { - Sector sect; - unsigned char *data; sector_t testsect[3]; - unsigned char temp[5]; - int found = 0; int i, testcount; + Sector sect; + void *data; /* There a three places where we may find a valid label: * - on an ECKD disk it's block 2 @@ -103,31 +142,27 @@ static int find_label(struct parsed_partitions *state, if (data == NULL) continue; memcpy(label, data, sizeof(*label)); - memcpy(temp, data, 4); - temp[4] = 0; - EBCASC(temp, 4); + memcpy(type, data, DASD_VOL_TYPE_LEN); + EBCASC(type, DASD_VOL_TYPE_LEN); put_dev_sector(sect); - if (!strcmp(temp, "VOL1") || - !strcmp(temp, "LNX1") || - !strcmp(temp, "CMS1")) { - if (!strcmp(temp, "VOL1")) { - strncpy(type, label->vol.vollbl, 4); - strncpy(name, label->vol.volid, 6); - } else { - strncpy(type, label->lnx.vollbl, 4); - strncpy(name, label->lnx.volid, 6); - } - EBCASC(type, 4); - EBCASC(name, 
6); + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: + memcpy(name, label->vol.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); + *labelsect = testsect[i]; + return 1; + case DASD_VOLLBL_TYPE_LNX1: + case DASD_VOLLBL_TYPE_CMS1: + memcpy(name, label->lnx.volid, DASD_VOL_ID_LEN); + EBCASC(name, DASD_VOL_ID_LEN); *labelsect = testsect[i]; - found = 1; + return 1; + default: break; } } - if (!found) - memset(label, 0, sizeof(*label)); - return found; + return 0; } static int find_vol1_partitions(struct parsed_partitions *state, @@ -198,7 +233,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, char name[], union label_t *label, sector_t labelsect, - loff_t i_size, + sector_t nr_sectors, dasd_information2_t *info) { loff_t offset, geo_size, size; @@ -213,14 +248,14 @@ static int find_lnx1_partitions(struct parsed_partitions *state, } else { /* * Formated w/o large volume support. If the sanity check - * 'size based on geo == size based on i_size' is true, then + * 'size based on geo == size based on nr_sectors' is true, then * we can safely assume that we know the formatted size of * the disk, otherwise we need additional information * that we can only get from a real DASD device. 
*/ geo_size = geo->cylinders * geo->heads * geo->sectors * secperblk; - size = i_size >> 9; + size = nr_sectors; if (size != geo_size) { if (!info) { strlcat(state->pp_buf, "\n", PAGE_SIZE); @@ -229,7 +264,7 @@ static int find_lnx1_partitions(struct parsed_partitions *state, if (!strcmp(info->type, "ECKD")) if (geo_size < size) size = geo_size; - /* else keep size based on i_size */ + /* else keep size based on nr_sectors */ } } /* first and only partition starts in the first block after the label */ @@ -290,14 +325,15 @@ static int find_cms1_partitions(struct parsed_partitions *state, int ibm_partition(struct parsed_partitions *state) { int (*fn)(struct gendisk *disk, dasd_information2_t *info); - struct block_device *bdev = state->bdev; - struct gendisk *disk = bdev->bd_disk; + struct gendisk *disk = state->disk; + struct block_device *bdev = disk->part0; int blocksize, res; - loff_t i_size, offset, size; + loff_t offset, size; + sector_t nr_sectors; dasd_information2_t *info; struct hd_geometry *geo; - char type[5] = {0,}; - char name[7] = {0,}; + char type[DASD_VOL_TYPE_LEN + 1] = ""; + char name[DASD_VOL_ID_LEN + 1] = ""; sector_t labelsect; union label_t *label; @@ -305,13 +341,11 @@ int ibm_partition(struct parsed_partitions *state) if (!disk->fops->getgeo) goto out_exit; fn = symbol_get(dasd_biodasdinfo); - if (!fn) - goto out_exit; blocksize = bdev_logical_block_size(bdev); if (blocksize <= 0) goto out_symbol; - i_size = i_size_read(bdev->bd_inode); - if (i_size == 0) + nr_sectors = bdev_nr_sectors(bdev); + if (nr_sectors == 0) goto out_symbol; info = kmalloc(sizeof(dasd_information2_t), GFP_KERNEL); if (info == NULL) @@ -326,23 +360,26 @@ int ibm_partition(struct parsed_partitions *state) geo->start = get_start_sect(bdev); if (disk->fops->getgeo(bdev, geo)) goto out_freeall; - if (fn(disk, info)) { + if (!fn || fn(disk, info)) { kfree(info); info = NULL; } - if (find_label(state, info, geo, blocksize, &labelsect, name, type, - label)) { - if 
(!strncmp(type, "VOL1", 4)) { + if (find_label(state, info, geo, blocksize, &labelsect, name, type, label)) { + switch (get_label_by_type(type)) { + case DASD_VOLLBL_TYPE_VOL1: res = find_vol1_partitions(state, geo, blocksize, name, label); - } else if (!strncmp(type, "LNX1", 4)) { + break; + case DASD_VOLLBL_TYPE_LNX1: res = find_lnx1_partitions(state, geo, blocksize, name, - label, labelsect, i_size, + label, labelsect, nr_sectors, info); - } else if (!strncmp(type, "CMS1", 4)) { + break; + case DASD_VOLLBL_TYPE_CMS1: res = find_cms1_partitions(state, geo, blocksize, name, label, labelsect); + break; } } else if (info) { /* @@ -355,7 +392,7 @@ int ibm_partition(struct parsed_partitions *state) res = 1; if (info->format == DASD_FORMAT_LDL) { strlcat(state->pp_buf, "(nonl)", PAGE_SIZE); - size = i_size >> 9; + size = nr_sectors; offset = (info->label_block + 1) * (blocksize >> 9); put_partition(state, 1, offset, size-offset); strlcat(state->pp_buf, "\n", PAGE_SIZE); @@ -370,7 +407,8 @@ out_nolab: out_nogeo: kfree(info); out_symbol: - symbol_put(dasd_biodasdinfo); + if (fn) + symbol_put(dasd_biodasdinfo); out_exit: return res; } diff --git a/block/partitions/ldm.c b/block/partitions/ldm.c index d333786b5c7e..38e58960ae03 100644 --- a/block/partitions/ldm.c +++ b/block/partitions/ldm.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * ldm - Support for Windows Logical Disk Manager (Dynamic Disks) * * Copyright (C) 2001,2002 Richard Russon <ldm@flatcap.org> @@ -304,7 +304,7 @@ static bool ldm_validate_privheads(struct parsed_partitions *state, } } - num_sects = state->bdev->bd_inode->i_size >> 9; + num_sects = get_capacity(state->disk); if ((ph[0]->config_start > num_sects) || ((ph[0]->config_start + ph[0]->config_size) > num_sects)) { @@ -339,11 +339,11 @@ out: /** * ldm_validate_tocblocks - Validate the table of contents and its backups * @state: Partition check state including device holding the LDM Database - * @base: Offset, into 
@state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * Find and compare the four tables of contents of the LDM Database stored on - * @state->bdev and return the parsed information into @toc1. + * @state->disk and return the parsed information into @toc1. * * The offsets and sizes of the configs are range-checked against a privhead. * @@ -486,8 +486,8 @@ out: * only likely to happen if the underlying device is strange. If that IS * the case we should return zero to let someone else try. * - * Return: 'true' @state->bdev is a dynamic disk - * 'false' @state->bdev is not a dynamic disk, or an error occurred + * Return: 'true' @state->disk is a dynamic disk + * 'false' @state->disk is not a dynamic disk, or an error occurred */ static bool ldm_validate_partition_table(struct parsed_partitions *state) { @@ -510,7 +510,7 @@ static bool ldm_validate_partition_table(struct parsed_partitions *state) p = (struct msdos_partition *)(data + 0x01BE); for (i = 0; i < 4; i++, p++) - if (SYS_IND (p) == LDM_PARTITION) { + if (p->sys_ind == LDM_PARTITION) { result = true; break; } @@ -736,7 +736,6 @@ static bool ldm_parse_cmp3 (const u8 *buffer, int buflen, struct vblk *vb) len = r_cols; } else { r_stripe = 0; - r_cols = 0; len = r_parent; } if (len < 0) @@ -783,11 +782,8 @@ static int ldm_parse_dgr3 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x24, r_diskid); r_id2 = ldm_relative (buffer, buflen, 0x24, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_diskid; - } if (len < 0) return false; @@ -826,11 +822,8 @@ static bool ldm_parse_dgr4 (const u8 *buffer, int buflen, struct vblk *vb) r_id1 = ldm_relative (buffer, buflen, 0x44, r_name); r_id2 = ldm_relative (buffer, buflen, 0x44, r_id1); len = r_id2; - } else { - r_id1 = 0; - r_id2 = 0; + } else len = r_name; - } if (len < 0) return false; @@ -963,10 +956,8 @@ static bool ldm_parse_prt3(const 
u8 *buffer, int buflen, struct vblk *vb) return false; } len = r_index; - } else { - r_index = 0; + } else len = r_diskid; - } if (len < 0) { ldm_error("len %d < 0", len); return false; @@ -1340,7 +1331,7 @@ static bool ldm_frag_commit (struct list_head *frags, struct ldmdb *ldb) /** * ldm_get_vblks - Read the on-disk database of VBLKs into memory * @state: Partition check state including device holding the LDM Database - * @base: Offset, into @state->bdev, of the database + * @base: Offset, into @state->disk, of the database * @ldb: Cache of the database structures * * To use the information from the VBLKs, they need to be read from the disk, @@ -1432,10 +1423,10 @@ static void ldm_free_vblks (struct list_head *lh) * example, if the device is hda, we would have: hda1: LDM database, hda2, hda3, * and so on: the actual data containing partitions. * - * Return: 1 Success, @state->bdev is a dynamic disk and we handled it - * 0 Success, @state->bdev is not a dynamic disk + * Return: 1 Success, @state->disk is a dynamic disk and we handled it + * 0 Success, @state->disk is not a dynamic disk * -1 An error occurred before enough information had been read - * Or @state->bdev is a dynamic disk, but it may be corrupted + * Or @state->disk is a dynamic disk, but it may be corrupted */ int ldm_partition(struct parsed_partitions *state) { diff --git a/block/partitions/ldm.h b/block/partitions/ldm.h index d8d6beaa72c4..0a747a0c782d 100644 --- a/block/partitions/ldm.h +++ b/block/partitions/ldm.h @@ -14,7 +14,6 @@ #include <linux/types.h> #include <linux/list.h> -#include <linux/genhd.h> #include <linux/fs.h> #include <asm/unaligned.h> #include <asm/byteorder.h> @@ -84,9 +83,6 @@ struct parsed_partitions; #define TOC_BITMAP1 "config" /* Names of the two defined */ #define TOC_BITMAP2 "log" /* bitmaps in the TOCBLOCK. 
*/ -/* Borrowed from msdos.c */ -#define SYS_IND(p) (get_unaligned(&(p)->sys_ind)) - struct frag { /* VBLK Fragment handling */ struct list_head list; u32 group; diff --git a/block/partitions/mac.c b/block/partitions/mac.c index b6095335636c..7b521df00a39 100644 --- a/block/partitions/mac.c +++ b/block/partitions/mac.c @@ -133,7 +133,7 @@ int mac_partition(struct parsed_partitions *state) } #ifdef CONFIG_PPC_PMAC if (found_root_goodness) - note_bootable_part(state->bdev->bd_dev, found_root, + note_bootable_part(state->disk->part0->bd_dev, found_root, found_root_goodness); #endif diff --git a/block/partitions/msdos.c b/block/partitions/msdos.c index 8f2fcc080264..b5d5c229cc3b 100644 --- a/block/partitions/msdos.c +++ b/block/partitions/msdos.c @@ -38,8 +38,6 @@ */ #include <asm/unaligned.h> -#define SYS_IND(p) get_unaligned(&p->sys_ind) - static inline sector_t nr_sects(struct msdos_partition *p) { return (sector_t)get_unaligned_le32(&p->nr_sects); @@ -52,9 +50,9 @@ static inline sector_t start_sect(struct msdos_partition *p) static inline int is_extended_partition(struct msdos_partition *p) { - return (SYS_IND(p) == DOS_EXTENDED_PARTITION || - SYS_IND(p) == WIN98_EXTENDED_PARTITION || - SYS_IND(p) == LINUX_EXTENDED_PARTITION); + return (p->sys_ind == DOS_EXTENDED_PARTITION || + p->sys_ind == WIN98_EXTENDED_PARTITION || + p->sys_ind == LINUX_EXTENDED_PARTITION); } #define MSDOS_LABEL_MAGIC1 0x55 @@ -137,11 +135,12 @@ static void parse_extended(struct parsed_partitions *state, Sector sect; unsigned char *data; sector_t this_sector, this_size; - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; int loopct = 0; /* number of links followed without finding a data partition */ int i; + sector_size = queue_logical_block_size(state->disk->queue) / 512; this_sector = first_sector; this_size = first_size; @@ -193,7 +192,7 @@ static void parse_extended(struct parsed_partitions *state, put_partition(state, state->next, next, size); 
set_info(state, state->next, disksig); - if (SYS_IND(p) == LINUX_RAID_PARTITION) + if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[state->next].flags = ADDPART_FLAG_RAID; loopct = 0; if (++state->next == state->limit) @@ -546,7 +545,7 @@ static void parse_minix(struct parsed_partitions *state, * a secondary MBR describing its subpartitions, or * the normal boot sector. */ if (msdos_magic_present(data + 510) && - SYS_IND(p) == MINIX_PARTITION) { /* subpartition table present */ + p->sys_ind == MINIX_PARTITION) { /* subpartition table present */ char tmp[1 + BDEVNAME_SIZE + 10 + 9 + 1]; snprintf(tmp, sizeof(tmp), " %s%d: <minix:", state->name, origin); @@ -555,7 +554,7 @@ static void parse_minix(struct parsed_partitions *state, if (state->next == state->limit) break; /* add each partition in use */ - if (SYS_IND(p) == MINIX_PARTITION) + if (p->sys_ind == MINIX_PARTITION) put_partition(state, state->next++, start_sect(p), nr_sects(p)); } @@ -581,7 +580,7 @@ static struct { int msdos_partition(struct parsed_partitions *state) { - sector_t sector_size = bdev_logical_block_size(state->bdev) / 512; + sector_t sector_size; Sector sect; unsigned char *data; struct msdos_partition *p; @@ -589,6 +588,7 @@ int msdos_partition(struct parsed_partitions *state) int slot; u32 disksig; + sector_size = queue_logical_block_size(state->disk->queue) / 512; data = read_part_sector(state, 0, §); if (!data) return -1; @@ -622,7 +622,7 @@ int msdos_partition(struct parsed_partitions *state) for (slot = 1; slot <= 4; slot++, p++) { if (p->boot_ind != 0 && p->boot_ind != 0x80) { /* - * Even without a valid boot inidicator value + * Even without a valid boot indicator value * its still possible this is valid FAT filesystem * without a partition table. */ @@ -643,7 +643,7 @@ int msdos_partition(struct parsed_partitions *state) p = (struct msdos_partition *) (data + 0x1be); for (slot = 1 ; slot <= 4 ; slot++, p++) { /* If this is an EFI GPT disk, msdos should ignore it. 
*/ - if (SYS_IND(p) == EFI_PMBR_OSTYPE_EFI_GPT) { + if (p->sys_ind == EFI_PMBR_OSTYPE_EFI_GPT) { put_dev_sector(sect); return 0; } @@ -685,11 +685,11 @@ int msdos_partition(struct parsed_partitions *state) } put_partition(state, slot, start, size); set_info(state, slot, disksig); - if (SYS_IND(p) == LINUX_RAID_PARTITION) + if (p->sys_ind == LINUX_RAID_PARTITION) state->parts[slot].flags = ADDPART_FLAG_RAID; - if (SYS_IND(p) == DM6_PARTITION) + if (p->sys_ind == DM6_PARTITION) strlcat(state->pp_buf, "[DM]", PAGE_SIZE); - if (SYS_IND(p) == EZD_PARTITION) + if (p->sys_ind == EZD_PARTITION) strlcat(state->pp_buf, "[EZD]", PAGE_SIZE); } @@ -698,7 +698,7 @@ int msdos_partition(struct parsed_partitions *state) /* second pass - output for each on a separate line */ p = (struct msdos_partition *) (0x1be + data); for (slot = 1 ; slot <= 4 ; slot++, p++) { - unsigned char id = SYS_IND(p); + unsigned char id = p->sys_ind; int n; if (!nr_sects(p)) diff --git a/block/partitions/sgi.c b/block/partitions/sgi.c index 4273f1bb0515..9cc6b8c1eea4 100644 --- a/block/partitions/sgi.c +++ b/block/partitions/sgi.c @@ -43,7 +43,6 @@ int sgi_partition(struct parsed_partitions *state) Sector sect; struct sgi_disklabel *label; struct sgi_partition *p; - char b[BDEVNAME_SIZE]; label = read_part_sector(state, 0, §); if (!label) @@ -52,7 +51,7 @@ int sgi_partition(struct parsed_partitions *state) magic = label->magic_mushroom; if(be32_to_cpu(magic) != SGI_LABEL_MAGIC) { /*printk("Dev %s SGI disklabel: bad magic %08x\n", - bdevname(bdev, b), be32_to_cpu(magic));*/ + state->disk->disk_name, be32_to_cpu(magic));*/ put_dev_sector(sect); return 0; } @@ -63,7 +62,7 @@ int sgi_partition(struct parsed_partitions *state) } if(csum) { printk(KERN_WARNING "Dev %s SGI disklabel: csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/partitions/sun.c b/block/partitions/sun.c index 47dc53eccf77..ddf9e6def4b2 100644 --- 
a/block/partitions/sun.c +++ b/block/partitions/sun.c @@ -65,7 +65,6 @@ int sun_partition(struct parsed_partitions *state) } * label; struct sun_partition *p; unsigned long spc; - char b[BDEVNAME_SIZE]; int use_vtoc; int nparts; @@ -76,7 +75,7 @@ int sun_partition(struct parsed_partitions *state) p = label->partitions; if (be16_to_cpu(label->magic) != SUN_LABEL_MAGIC) { /* printk(KERN_INFO "Dev %s Sun disklabel: bad magic %04x\n", - bdevname(bdev, b), be16_to_cpu(label->magic)); */ + state->disk->disk_name, be16_to_cpu(label->magic)); */ put_dev_sector(sect); return 0; } @@ -86,7 +85,7 @@ int sun_partition(struct parsed_partitions *state) csum ^= *ush--; if (csum) { printk("Dev %s Sun disklabel: Csum bad, label corrupted\n", - bdevname(state->bdev, b)); + state->disk->disk_name); put_dev_sector(sect); return 0; } diff --git a/block/scsi_ioctl.c b/block/scsi_ioctl.c deleted file mode 100644 index ef722f04f88a..000000000000 --- a/block/scsi_ioctl.c +++ /dev/null @@ -1,900 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * Copyright (C) 2001 Jens Axboe <axboe@suse.de> - */ -#include <linux/compat.h> -#include <linux/kernel.h> -#include <linux/errno.h> -#include <linux/string.h> -#include <linux/module.h> -#include <linux/blkdev.h> -#include <linux/capability.h> -#include <linux/completion.h> -#include <linux/cdrom.h> -#include <linux/ratelimit.h> -#include <linux/slab.h> -#include <linux/times.h> -#include <linux/uio.h> -#include <linux/uaccess.h> - -#include <scsi/scsi.h> -#include <scsi/scsi_ioctl.h> -#include <scsi/scsi_cmnd.h> -#include <scsi/sg.h> - -struct blk_cmd_filter { - unsigned long read_ok[BLK_SCSI_CMD_PER_LONG]; - unsigned long write_ok[BLK_SCSI_CMD_PER_LONG]; -}; - -static struct blk_cmd_filter blk_default_cmd_filter; - -/* Command group 3 is reserved and should never be used. 
*/ -const unsigned char scsi_command_size_tbl[8] = -{ - 6, 10, 10, 12, - 16, 12, 10, 10 -}; -EXPORT_SYMBOL(scsi_command_size_tbl); - -#include <scsi/sg.h> - -static int sg_get_version(int __user *p) -{ - static const int sg_version_num = 30527; - return put_user(sg_version_num, p); -} - -static int scsi_get_idlun(struct request_queue *q, int __user *p) -{ - return put_user(0, p); -} - -static int scsi_get_bus(struct request_queue *q, int __user *p) -{ - return put_user(0, p); -} - -static int sg_get_timeout(struct request_queue *q) -{ - return jiffies_to_clock_t(q->sg_timeout); -} - -static int sg_set_timeout(struct request_queue *q, int __user *p) -{ - int timeout, err = get_user(timeout, p); - - if (!err) - q->sg_timeout = clock_t_to_jiffies(timeout); - - return err; -} - -static int max_sectors_bytes(struct request_queue *q) -{ - unsigned int max_sectors = queue_max_sectors(q); - - max_sectors = min_t(unsigned int, max_sectors, INT_MAX >> 9); - - return max_sectors << 9; -} - -static int sg_get_reserved_size(struct request_queue *q, int __user *p) -{ - int val = min_t(int, q->sg_reserved_size, max_sectors_bytes(q)); - - return put_user(val, p); -} - -static int sg_set_reserved_size(struct request_queue *q, int __user *p) -{ - int size, err = get_user(size, p); - - if (err) - return err; - - if (size < 0) - return -EINVAL; - - q->sg_reserved_size = min(size, max_sectors_bytes(q)); - return 0; -} - -/* - * will always return that we are ATAPI even for a real SCSI drive, I'm not - * so sure this is worth doing anything about (why would you care??) 
- */ -static int sg_emulated_host(struct request_queue *q, int __user *p) -{ - return put_user(1, p); -} - -static void blk_set_cmd_filter_defaults(struct blk_cmd_filter *filter) -{ - /* Basic read-only commands */ - __set_bit(TEST_UNIT_READY, filter->read_ok); - __set_bit(REQUEST_SENSE, filter->read_ok); - __set_bit(READ_6, filter->read_ok); - __set_bit(READ_10, filter->read_ok); - __set_bit(READ_12, filter->read_ok); - __set_bit(READ_16, filter->read_ok); - __set_bit(READ_BUFFER, filter->read_ok); - __set_bit(READ_DEFECT_DATA, filter->read_ok); - __set_bit(READ_CAPACITY, filter->read_ok); - __set_bit(READ_LONG, filter->read_ok); - __set_bit(INQUIRY, filter->read_ok); - __set_bit(MODE_SENSE, filter->read_ok); - __set_bit(MODE_SENSE_10, filter->read_ok); - __set_bit(LOG_SENSE, filter->read_ok); - __set_bit(START_STOP, filter->read_ok); - __set_bit(GPCMD_VERIFY_10, filter->read_ok); - __set_bit(VERIFY_16, filter->read_ok); - __set_bit(REPORT_LUNS, filter->read_ok); - __set_bit(SERVICE_ACTION_IN_16, filter->read_ok); - __set_bit(RECEIVE_DIAGNOSTIC, filter->read_ok); - __set_bit(MAINTENANCE_IN, filter->read_ok); - __set_bit(GPCMD_READ_BUFFER_CAPACITY, filter->read_ok); - - /* Audio CD commands */ - __set_bit(GPCMD_PLAY_CD, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_10, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_MSF, filter->read_ok); - __set_bit(GPCMD_PLAY_AUDIO_TI, filter->read_ok); - __set_bit(GPCMD_PAUSE_RESUME, filter->read_ok); - - /* CD/DVD data reading */ - __set_bit(GPCMD_READ_CD, filter->read_ok); - __set_bit(GPCMD_READ_CD_MSF, filter->read_ok); - __set_bit(GPCMD_READ_DISC_INFO, filter->read_ok); - __set_bit(GPCMD_READ_CDVD_CAPACITY, filter->read_ok); - __set_bit(GPCMD_READ_DVD_STRUCTURE, filter->read_ok); - __set_bit(GPCMD_READ_HEADER, filter->read_ok); - __set_bit(GPCMD_READ_TRACK_RZONE_INFO, filter->read_ok); - __set_bit(GPCMD_READ_SUBCHANNEL, filter->read_ok); - __set_bit(GPCMD_READ_TOC_PMA_ATIP, filter->read_ok); - __set_bit(GPCMD_REPORT_KEY, 
filter->read_ok); - __set_bit(GPCMD_SCAN, filter->read_ok); - __set_bit(GPCMD_GET_CONFIGURATION, filter->read_ok); - __set_bit(GPCMD_READ_FORMAT_CAPACITIES, filter->read_ok); - __set_bit(GPCMD_GET_EVENT_STATUS_NOTIFICATION, filter->read_ok); - __set_bit(GPCMD_GET_PERFORMANCE, filter->read_ok); - __set_bit(GPCMD_SEEK, filter->read_ok); - __set_bit(GPCMD_STOP_PLAY_SCAN, filter->read_ok); - - /* Basic writing commands */ - __set_bit(WRITE_6, filter->write_ok); - __set_bit(WRITE_10, filter->write_ok); - __set_bit(WRITE_VERIFY, filter->write_ok); - __set_bit(WRITE_12, filter->write_ok); - __set_bit(WRITE_VERIFY_12, filter->write_ok); - __set_bit(WRITE_16, filter->write_ok); - __set_bit(WRITE_LONG, filter->write_ok); - __set_bit(WRITE_LONG_2, filter->write_ok); - __set_bit(WRITE_SAME, filter->write_ok); - __set_bit(WRITE_SAME_16, filter->write_ok); - __set_bit(WRITE_SAME_32, filter->write_ok); - __set_bit(ERASE, filter->write_ok); - __set_bit(GPCMD_MODE_SELECT_10, filter->write_ok); - __set_bit(MODE_SELECT, filter->write_ok); - __set_bit(LOG_SELECT, filter->write_ok); - __set_bit(GPCMD_BLANK, filter->write_ok); - __set_bit(GPCMD_CLOSE_TRACK, filter->write_ok); - __set_bit(GPCMD_FLUSH_CACHE, filter->write_ok); - __set_bit(GPCMD_FORMAT_UNIT, filter->write_ok); - __set_bit(GPCMD_REPAIR_RZONE_TRACK, filter->write_ok); - __set_bit(GPCMD_RESERVE_RZONE_TRACK, filter->write_ok); - __set_bit(GPCMD_SEND_DVD_STRUCTURE, filter->write_ok); - __set_bit(GPCMD_SEND_EVENT, filter->write_ok); - __set_bit(GPCMD_SEND_KEY, filter->write_ok); - __set_bit(GPCMD_SEND_OPC, filter->write_ok); - __set_bit(GPCMD_SEND_CUE_SHEET, filter->write_ok); - __set_bit(GPCMD_SET_SPEED, filter->write_ok); - __set_bit(GPCMD_PREVENT_ALLOW_MEDIUM_REMOVAL, filter->write_ok); - __set_bit(GPCMD_LOAD_UNLOAD, filter->write_ok); - __set_bit(GPCMD_SET_STREAMING, filter->write_ok); - __set_bit(GPCMD_SET_READ_AHEAD, filter->write_ok); - - /* ZBC Commands */ - __set_bit(ZBC_OUT, filter->write_ok); - __set_bit(ZBC_IN, 
filter->read_ok); -} - -int blk_verify_command(unsigned char *cmd, fmode_t mode) -{ - struct blk_cmd_filter *filter = &blk_default_cmd_filter; - - /* root can do any command. */ - if (capable(CAP_SYS_RAWIO)) - return 0; - - /* Anybody who can open the device can do a read-safe command */ - if (test_bit(cmd[0], filter->read_ok)) - return 0; - - /* Write-safe commands require a writable open */ - if (test_bit(cmd[0], filter->write_ok) && (mode & FMODE_WRITE)) - return 0; - - return -EPERM; -} -EXPORT_SYMBOL(blk_verify_command); - -static int blk_fill_sghdr_rq(struct request_queue *q, struct request *rq, - struct sg_io_hdr *hdr, fmode_t mode) -{ - struct scsi_request *req = scsi_req(rq); - - if (copy_from_user(req->cmd, hdr->cmdp, hdr->cmd_len)) - return -EFAULT; - if (blk_verify_command(req->cmd, mode)) - return -EPERM; - - /* - * fill in request structure - */ - req->cmd_len = hdr->cmd_len; - - rq->timeout = msecs_to_jiffies(hdr->timeout); - if (!rq->timeout) - rq->timeout = q->sg_timeout; - if (!rq->timeout) - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - if (rq->timeout < BLK_MIN_SG_TIMEOUT) - rq->timeout = BLK_MIN_SG_TIMEOUT; - - return 0; -} - -static int blk_complete_sghdr_rq(struct request *rq, struct sg_io_hdr *hdr, - struct bio *bio) -{ - struct scsi_request *req = scsi_req(rq); - int r, ret = 0; - - /* - * fill in all the output members - */ - hdr->status = req->result & 0xff; - hdr->masked_status = status_byte(req->result); - hdr->msg_status = msg_byte(req->result); - hdr->host_status = host_byte(req->result); - hdr->driver_status = driver_byte(req->result); - hdr->info = 0; - if (hdr->masked_status || hdr->host_status || hdr->driver_status) - hdr->info |= SG_INFO_CHECK; - hdr->resid = req->resid_len; - hdr->sb_len_wr = 0; - - if (req->sense_len && hdr->sbp) { - int len = min((unsigned int) hdr->mx_sb_len, req->sense_len); - - if (!copy_to_user(hdr->sbp, req->sense, len)) - hdr->sb_len_wr = len; - else - ret = -EFAULT; - } - - r = blk_rq_unmap_user(bio); - if 
(!ret) - ret = r; - - return ret; -} - -static int sg_io(struct request_queue *q, struct gendisk *bd_disk, - struct sg_io_hdr *hdr, fmode_t mode) -{ - unsigned long start_time; - ssize_t ret = 0; - int writing = 0; - int at_head = 0; - struct request *rq; - struct scsi_request *req; - struct bio *bio; - - if (hdr->interface_id != 'S') - return -EINVAL; - - if (hdr->dxfer_len > (queue_max_hw_sectors(q) << 9)) - return -EIO; - - if (hdr->dxfer_len) - switch (hdr->dxfer_direction) { - default: - return -EINVAL; - case SG_DXFER_TO_DEV: - writing = 1; - break; - case SG_DXFER_TO_FROM_DEV: - case SG_DXFER_FROM_DEV: - break; - } - if (hdr->flags & SG_FLAG_Q_AT_HEAD) - at_head = 1; - - ret = -ENOMEM; - rq = blk_get_request(q, writing ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - req = scsi_req(rq); - - if (hdr->cmd_len > BLK_MAX_CDB) { - req->cmd = kzalloc(hdr->cmd_len, GFP_KERNEL); - if (!req->cmd) - goto out_put_request; - } - - ret = blk_fill_sghdr_rq(q, rq, hdr, mode); - if (ret < 0) - goto out_free_cdb; - - ret = 0; - if (hdr->iovec_count) { - struct iov_iter i; - struct iovec *iov = NULL; - -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) - ret = compat_import_iovec(rq_data_dir(rq), - hdr->dxferp, hdr->iovec_count, - 0, &iov, &i); - else -#endif - ret = import_iovec(rq_data_dir(rq), - hdr->dxferp, hdr->iovec_count, - 0, &iov, &i); - if (ret < 0) - goto out_free_cdb; - - /* SG_IO howto says that the shorter of the two wins */ - iov_iter_truncate(&i, hdr->dxfer_len); - - ret = blk_rq_map_user_iov(q, rq, NULL, &i, GFP_KERNEL); - kfree(iov); - } else if (hdr->dxfer_len) - ret = blk_rq_map_user(q, rq, NULL, hdr->dxferp, hdr->dxfer_len, - GFP_KERNEL); - - if (ret) - goto out_free_cdb; - - bio = rq->bio; - req->retries = 0; - - start_time = jiffies; - - /* ignore return value. All information is passed back to caller - * (if he doesn't check that is his problem). - * N.B. a non-zero SCSI status is _not_ necessarily an error. 
- */ - blk_execute_rq(q, bd_disk, rq, at_head); - - hdr->duration = jiffies_to_msecs(jiffies - start_time); - - ret = blk_complete_sghdr_rq(rq, hdr, bio); - -out_free_cdb: - scsi_req_free_cmd(req); -out_put_request: - blk_put_request(rq); - return ret; -} - -/** - * sg_scsi_ioctl -- handle deprecated SCSI_IOCTL_SEND_COMMAND ioctl - * @q: request queue to send scsi commands down - * @disk: gendisk to operate on (option) - * @mode: mode used to open the file through which the ioctl has been - * submitted - * @sic: userspace structure describing the command to perform - * - * Send down the scsi command described by @sic to the device below - * the request queue @q. If @file is non-NULL it's used to perform - * fine-grained permission checks that allow users to send down - * non-destructive SCSI commands. If the caller has a struct gendisk - * available it should be passed in as @disk to allow the low level - * driver to use the information contained in it. A non-NULL @disk - * is only allowed if the caller knows that the low level driver doesn't - * need it (e.g. in the scsi subsystem). - * - * Notes: - * - This interface is deprecated - users should use the SG_IO - * interface instead, as this is a more flexible approach to - * performing SCSI commands on a device. - * - The SCSI command length is determined by examining the 1st byte - * of the given command. There is no way to override this. - * - Data transfers are limited to PAGE_SIZE - * - The length (x + y) must be at least OMAX_SB_LEN bytes long to - * accommodate the sense buffer when an error occurs. - * The sense buffer is truncated to OMAX_SB_LEN (16) bytes so that - * old code will not be surprised. - * - If a Unix error occurs (e.g. ENOMEM) then the user will receive - * a negative return and the Unix error code in 'errno'. - * If the SCSI command succeeds then 0 is returned. - * Positive numbers returned are the compacted SCSI error codes (4 - * bytes in one int) where the lowest byte is the SCSI status. 
- */ -int sg_scsi_ioctl(struct request_queue *q, struct gendisk *disk, fmode_t mode, - struct scsi_ioctl_command __user *sic) -{ - enum { OMAX_SB_LEN = 16 }; /* For backward compatibility */ - struct request *rq; - struct scsi_request *req; - int err; - unsigned int in_len, out_len, bytes, opcode, cmdlen; - char *buffer = NULL; - - if (!sic) - return -EINVAL; - - /* - * get in an out lengths, verify they don't exceed a page worth of data - */ - if (get_user(in_len, &sic->inlen)) - return -EFAULT; - if (get_user(out_len, &sic->outlen)) - return -EFAULT; - if (in_len > PAGE_SIZE || out_len > PAGE_SIZE) - return -EINVAL; - if (get_user(opcode, sic->data)) - return -EFAULT; - - bytes = max(in_len, out_len); - if (bytes) { - buffer = kzalloc(bytes, q->bounce_gfp | GFP_USER| __GFP_NOWARN); - if (!buffer) - return -ENOMEM; - - } - - rq = blk_get_request(q, in_len ? REQ_OP_SCSI_OUT : REQ_OP_SCSI_IN, 0); - if (IS_ERR(rq)) { - err = PTR_ERR(rq); - goto error_free_buffer; - } - req = scsi_req(rq); - - cmdlen = COMMAND_SIZE(opcode); - - /* - * get command and data to send to device, if any - */ - err = -EFAULT; - req->cmd_len = cmdlen; - if (copy_from_user(req->cmd, sic->data, cmdlen)) - goto error; - - if (in_len && copy_from_user(buffer, sic->data + cmdlen, in_len)) - goto error; - - err = blk_verify_command(req->cmd, mode); - if (err) - goto error; - - /* default. 
possible overriden later */ - req->retries = 5; - - switch (opcode) { - case SEND_DIAGNOSTIC: - case FORMAT_UNIT: - rq->timeout = FORMAT_UNIT_TIMEOUT; - req->retries = 1; - break; - case START_STOP: - rq->timeout = START_STOP_TIMEOUT; - break; - case MOVE_MEDIUM: - rq->timeout = MOVE_MEDIUM_TIMEOUT; - break; - case READ_ELEMENT_STATUS: - rq->timeout = READ_ELEMENT_STATUS_TIMEOUT; - break; - case READ_DEFECT_DATA: - rq->timeout = READ_DEFECT_DATA_TIMEOUT; - req->retries = 1; - break; - default: - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - break; - } - - if (bytes && blk_rq_map_kern(q, rq, buffer, bytes, GFP_NOIO)) { - err = DRIVER_ERROR << 24; - goto error; - } - - blk_execute_rq(q, disk, rq, 0); - - err = req->result & 0xff; /* only 8 bit SCSI status */ - if (err) { - if (req->sense_len && req->sense) { - bytes = (OMAX_SB_LEN > req->sense_len) ? - req->sense_len : OMAX_SB_LEN; - if (copy_to_user(sic->data, req->sense, bytes)) - err = -EFAULT; - } - } else { - if (copy_to_user(sic->data, buffer, out_len)) - err = -EFAULT; - } - -error: - blk_put_request(rq); - -error_free_buffer: - kfree(buffer); - - return err; -} -EXPORT_SYMBOL_GPL(sg_scsi_ioctl); - -/* Send basic block requests */ -static int __blk_send_generic(struct request_queue *q, struct gendisk *bd_disk, - int cmd, int data) -{ - struct request *rq; - int err; - - rq = blk_get_request(q, REQ_OP_SCSI_OUT, 0); - if (IS_ERR(rq)) - return PTR_ERR(rq); - rq->timeout = BLK_DEFAULT_SG_TIMEOUT; - scsi_req(rq)->cmd[0] = cmd; - scsi_req(rq)->cmd[4] = data; - scsi_req(rq)->cmd_len = 6; - blk_execute_rq(q, bd_disk, rq, 0); - err = scsi_req(rq)->result ? 
-EIO : 0; - blk_put_request(rq); - - return err; -} - -static inline int blk_send_start_stop(struct request_queue *q, - struct gendisk *bd_disk, int data) -{ - return __blk_send_generic(q, bd_disk, GPCMD_START_STOP_UNIT, data); -} - -int put_sg_io_hdr(const struct sg_io_hdr *hdr, void __user *argp) -{ -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) { - struct compat_sg_io_hdr hdr32 = { - .interface_id = hdr->interface_id, - .dxfer_direction = hdr->dxfer_direction, - .cmd_len = hdr->cmd_len, - .mx_sb_len = hdr->mx_sb_len, - .iovec_count = hdr->iovec_count, - .dxfer_len = hdr->dxfer_len, - .dxferp = (uintptr_t)hdr->dxferp, - .cmdp = (uintptr_t)hdr->cmdp, - .sbp = (uintptr_t)hdr->sbp, - .timeout = hdr->timeout, - .flags = hdr->flags, - .pack_id = hdr->pack_id, - .usr_ptr = (uintptr_t)hdr->usr_ptr, - .status = hdr->status, - .masked_status = hdr->masked_status, - .msg_status = hdr->msg_status, - .sb_len_wr = hdr->sb_len_wr, - .host_status = hdr->host_status, - .driver_status = hdr->driver_status, - .resid = hdr->resid, - .duration = hdr->duration, - .info = hdr->info, - }; - - if (copy_to_user(argp, &hdr32, sizeof(hdr32))) - return -EFAULT; - - return 0; - } -#endif - - if (copy_to_user(argp, hdr, sizeof(*hdr))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL(put_sg_io_hdr); - -int get_sg_io_hdr(struct sg_io_hdr *hdr, const void __user *argp) -{ -#ifdef CONFIG_COMPAT - struct compat_sg_io_hdr hdr32; - - if (in_compat_syscall()) { - if (copy_from_user(&hdr32, argp, sizeof(hdr32))) - return -EFAULT; - - *hdr = (struct sg_io_hdr) { - .interface_id = hdr32.interface_id, - .dxfer_direction = hdr32.dxfer_direction, - .cmd_len = hdr32.cmd_len, - .mx_sb_len = hdr32.mx_sb_len, - .iovec_count = hdr32.iovec_count, - .dxfer_len = hdr32.dxfer_len, - .dxferp = compat_ptr(hdr32.dxferp), - .cmdp = compat_ptr(hdr32.cmdp), - .sbp = compat_ptr(hdr32.sbp), - .timeout = hdr32.timeout, - .flags = hdr32.flags, - .pack_id = hdr32.pack_id, - .usr_ptr = compat_ptr(hdr32.usr_ptr), - .status = 
hdr32.status, - .masked_status = hdr32.masked_status, - .msg_status = hdr32.msg_status, - .sb_len_wr = hdr32.sb_len_wr, - .host_status = hdr32.host_status, - .driver_status = hdr32.driver_status, - .resid = hdr32.resid, - .duration = hdr32.duration, - .info = hdr32.info, - }; - - return 0; - } -#endif - - if (copy_from_user(hdr, argp, sizeof(*hdr))) - return -EFAULT; - - return 0; -} -EXPORT_SYMBOL(get_sg_io_hdr); - -#ifdef CONFIG_COMPAT -struct compat_cdrom_generic_command { - unsigned char cmd[CDROM_PACKET_SIZE]; - compat_caddr_t buffer; - compat_uint_t buflen; - compat_int_t stat; - compat_caddr_t sense; - unsigned char data_direction; - compat_int_t quiet; - compat_int_t timeout; - compat_caddr_t reserved[1]; -}; -#endif - -static int scsi_get_cdrom_generic_arg(struct cdrom_generic_command *cgc, - const void __user *arg) -{ -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) { - struct compat_cdrom_generic_command cgc32; - - if (copy_from_user(&cgc32, arg, sizeof(cgc32))) - return -EFAULT; - - *cgc = (struct cdrom_generic_command) { - .buffer = compat_ptr(cgc32.buffer), - .buflen = cgc32.buflen, - .stat = cgc32.stat, - .sense = compat_ptr(cgc32.sense), - .data_direction = cgc32.data_direction, - .quiet = cgc32.quiet, - .timeout = cgc32.timeout, - .reserved[0] = compat_ptr(cgc32.reserved[0]), - }; - memcpy(&cgc->cmd, &cgc32.cmd, CDROM_PACKET_SIZE); - return 0; - } -#endif - if (copy_from_user(cgc, arg, sizeof(*cgc))) - return -EFAULT; - - return 0; -} - -static int scsi_put_cdrom_generic_arg(const struct cdrom_generic_command *cgc, - void __user *arg) -{ -#ifdef CONFIG_COMPAT - if (in_compat_syscall()) { - struct compat_cdrom_generic_command cgc32 = { - .buffer = (uintptr_t)(cgc->buffer), - .buflen = cgc->buflen, - .stat = cgc->stat, - .sense = (uintptr_t)(cgc->sense), - .data_direction = cgc->data_direction, - .quiet = cgc->quiet, - .timeout = cgc->timeout, - .reserved[0] = (uintptr_t)(cgc->reserved[0]), - }; - memcpy(&cgc32.cmd, &cgc->cmd, CDROM_PACKET_SIZE); - - 
if (copy_to_user(arg, &cgc32, sizeof(cgc32))) - return -EFAULT; - - return 0; - } -#endif - if (copy_to_user(arg, cgc, sizeof(*cgc))) - return -EFAULT; - - return 0; -} - -static int scsi_cdrom_send_packet(struct request_queue *q, - struct gendisk *bd_disk, - fmode_t mode, void __user *arg) -{ - struct cdrom_generic_command cgc; - struct sg_io_hdr hdr; - int err; - - err = scsi_get_cdrom_generic_arg(&cgc, arg); - if (err) - return err; - - cgc.timeout = clock_t_to_jiffies(cgc.timeout); - memset(&hdr, 0, sizeof(hdr)); - hdr.interface_id = 'S'; - hdr.cmd_len = sizeof(cgc.cmd); - hdr.dxfer_len = cgc.buflen; - switch (cgc.data_direction) { - case CGC_DATA_UNKNOWN: - hdr.dxfer_direction = SG_DXFER_UNKNOWN; - break; - case CGC_DATA_WRITE: - hdr.dxfer_direction = SG_DXFER_TO_DEV; - break; - case CGC_DATA_READ: - hdr.dxfer_direction = SG_DXFER_FROM_DEV; - break; - case CGC_DATA_NONE: - hdr.dxfer_direction = SG_DXFER_NONE; - break; - default: - return -EINVAL; - } - - hdr.dxferp = cgc.buffer; - hdr.sbp = cgc.sense; - if (hdr.sbp) - hdr.mx_sb_len = sizeof(struct request_sense); - hdr.timeout = jiffies_to_msecs(cgc.timeout); - hdr.cmdp = ((struct cdrom_generic_command __user*) arg)->cmd; - hdr.cmd_len = sizeof(cgc.cmd); - - err = sg_io(q, bd_disk, &hdr, mode); - if (err == -EFAULT) - return -EFAULT; - - if (hdr.status) - return -EIO; - - cgc.stat = err; - cgc.buflen = hdr.resid; - if (scsi_put_cdrom_generic_arg(&cgc, arg)) - return -EFAULT; - - return err; -} - -int scsi_cmd_ioctl(struct request_queue *q, struct gendisk *bd_disk, fmode_t mode, - unsigned int cmd, void __user *arg) -{ - int err; - - if (!q) - return -ENXIO; - - switch (cmd) { - /* - * new sgv3 interface - */ - case SG_GET_VERSION_NUM: - err = sg_get_version(arg); - break; - case SCSI_IOCTL_GET_IDLUN: - err = scsi_get_idlun(q, arg); - break; - case SCSI_IOCTL_GET_BUS_NUMBER: - err = scsi_get_bus(q, arg); - break; - case SG_SET_TIMEOUT: - err = sg_set_timeout(q, arg); - break; - case SG_GET_TIMEOUT: - err = 
sg_get_timeout(q); - break; - case SG_GET_RESERVED_SIZE: - err = sg_get_reserved_size(q, arg); - break; - case SG_SET_RESERVED_SIZE: - err = sg_set_reserved_size(q, arg); - break; - case SG_EMULATED_HOST: - err = sg_emulated_host(q, arg); - break; - case SG_IO: { - struct sg_io_hdr hdr; - - err = get_sg_io_hdr(&hdr, arg); - if (err) - break; - err = sg_io(q, bd_disk, &hdr, mode); - if (err == -EFAULT) - break; - - if (put_sg_io_hdr(&hdr, arg)) - err = -EFAULT; - break; - } - case CDROM_SEND_PACKET: - err = scsi_cdrom_send_packet(q, bd_disk, mode, arg); - break; - - /* - * old junk scsi send command ioctl - */ - case SCSI_IOCTL_SEND_COMMAND: - printk(KERN_WARNING "program %s is using a deprecated SCSI ioctl, please convert it to SG_IO\n", current->comm); - err = -EINVAL; - if (!arg) - break; - - err = sg_scsi_ioctl(q, bd_disk, mode, arg); - break; - case CDROMCLOSETRAY: - err = blk_send_start_stop(q, bd_disk, 0x03); - break; - case CDROMEJECT: - err = blk_send_start_stop(q, bd_disk, 0x02); - break; - default: - err = -ENOTTY; - } - - return err; -} -EXPORT_SYMBOL(scsi_cmd_ioctl); - -int scsi_verify_blk_ioctl(struct block_device *bd, unsigned int cmd) -{ - if (bd && bd == bd->bd_contains) - return 0; - - if (capable(CAP_SYS_RAWIO)) - return 0; - - return -ENOIOCTLCMD; -} -EXPORT_SYMBOL(scsi_verify_blk_ioctl); - -int scsi_cmd_blk_ioctl(struct block_device *bd, fmode_t mode, - unsigned int cmd, void __user *arg) -{ - int ret; - - ret = scsi_verify_blk_ioctl(bd, cmd); - if (ret < 0) - return ret; - - return scsi_cmd_ioctl(bd->bd_disk->queue, bd->bd_disk, mode, cmd, arg); -} -EXPORT_SYMBOL(scsi_cmd_blk_ioctl); - -/** - * scsi_req_init - initialize certain fields of a scsi_request structure - * @req: Pointer to a scsi_request structure. - * Initializes .__cmd[], .cmd, .cmd_len and .sense_len but no other members - * of struct scsi_request. 
- */ -void scsi_req_init(struct scsi_request *req) -{ - memset(req->__cmd, 0, sizeof(req->__cmd)); - req->cmd = req->__cmd; - req->cmd_len = BLK_MAX_CDB; - req->sense_len = 0; -} -EXPORT_SYMBOL(scsi_req_init); - -static int __init blk_scsi_ioctl_init(void) -{ - blk_set_cmd_filter_defaults(&blk_default_cmd_filter); - return 0; -} -fs_initcall(blk_scsi_ioctl_init); diff --git a/block/sed-opal.c b/block/sed-opal.c index daafadbb88ca..fa4dba5d8531 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -13,13 +13,17 @@ #include <linux/device.h> #include <linux/kernel.h> #include <linux/list.h> -#include <linux/genhd.h> +#include <linux/blkdev.h> #include <linux/slab.h> #include <linux/uaccess.h> #include <uapi/linux/sed-opal.h> #include <linux/sed-opal.h> +#include <linux/sed-opal-key.h> #include <linux/string.h> #include <linux/kdev_t.h> +#include <linux/key.h> +#include <linux/key-type.h> +#include <keys/user-type.h> #include "opal_proto.h" @@ -29,6 +33,8 @@ /* Number of bytes needed by cmd_finalize. 
*/ #define CMD_FINALIZE_BYTES_NEEDED 7 +static struct key *sed_opal_keyring; + struct opal_step { int (*fn)(struct opal_dev *dev, void *data); void *data; @@ -74,8 +80,7 @@ struct parsed_resp { }; struct opal_dev { - bool supported; - bool mbr_enabled; + u32 flags; void *data; sec_send_recv *send_recv; @@ -84,12 +89,14 @@ struct opal_dev { u16 comid; u32 hsn; u32 tsn; - u64 align; + u64 align; /* alignment granularity */ u64 lowest_lba; + u32 logical_block_size; + u8 align_required; /* ALIGN: 0 or 1 */ size_t pos; - u8 cmd[IO_BUFFER_LENGTH]; - u8 resp[IO_BUFFER_LENGTH]; + u8 *cmd; + u8 *resp; struct parsed_resp parsed; size_t prev_d_len; @@ -133,6 +140,8 @@ static const u8 opaluid[][OPAL_UID_LENGTH] = { { 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01 }, [OPAL_LOCKINGRANGE_GLOBAL] = { 0x00, 0x00, 0x08, 0x02, 0x00, 0x00, 0x00, 0x01 }, + [OPAL_LOCKINGRANGE_ACE_START_TO_KEY] = + { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xD0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_RDLOCKED] = { 0x00, 0x00, 0x00, 0x08, 0x00, 0x03, 0xE0, 0x01 }, [OPAL_LOCKINGRANGE_ACE_WRLOCKED] = @@ -266,6 +275,101 @@ static void print_buffer(const u8 *ptr, u32 length) #endif } +/* + * Allocate/update a SED Opal key and add it to the SED Opal keyring. + */ +static int update_sed_opal_key(const char *desc, u_char *key_data, int keylen) +{ + key_ref_t kr; + + if (!sed_opal_keyring) + return -ENOKEY; + + kr = key_create_or_update(make_key_ref(sed_opal_keyring, true), "user", + desc, (const void *)key_data, keylen, + KEY_USR_VIEW | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA | KEY_ALLOC_BUILT_IN | + KEY_ALLOC_BYPASS_RESTRICTION); + if (IS_ERR(kr)) { + pr_err("Error adding SED key (%ld)\n", PTR_ERR(kr)); + return PTR_ERR(kr); + } + + return 0; +} + +/* + * Read a SED Opal key from the SED Opal keyring. 
+ */ +static int read_sed_opal_key(const char *key_name, u_char *buffer, int buflen) +{ + int ret; + key_ref_t kref; + struct key *key; + + if (!sed_opal_keyring) + return -ENOKEY; + + kref = keyring_search(make_key_ref(sed_opal_keyring, true), + &key_type_user, key_name, true); + + if (IS_ERR(kref)) + ret = PTR_ERR(kref); + + key = key_ref_to_ptr(kref); + down_read(&key->sem); + ret = key_validate(key); + if (ret == 0) { + if (buflen > key->datalen) + buflen = key->datalen; + + ret = key->type->read(key, (char *)buffer, buflen); + } + up_read(&key->sem); + + key_ref_put(kref); + + return ret; +} + +static int opal_get_key(struct opal_dev *dev, struct opal_key *key) +{ + int ret = 0; + + switch (key->key_type) { + case OPAL_INCLUDED: + /* the key is ready to use */ + break; + case OPAL_KEYRING: + /* the key is in the keyring */ + ret = read_sed_opal_key(OPAL_AUTH_KEY, key->key, OPAL_KEY_MAX); + if (ret > 0) { + if (ret > U8_MAX) { + ret = -ENOSPC; + goto error; + } + key->key_len = ret; + key->key_type = OPAL_INCLUDED; + } + break; + default: + ret = -EINVAL; + break; + } + if (ret < 0) + goto error; + + /* must have a PEK by now or it's an error */ + if (key->key_type != OPAL_INCLUDED || key->key_len == 0) { + ret = -EINVAL; + goto error; + } + return 0; +error: + pr_debug("Error getting password: %d\n", ret); + return ret; +} + static bool check_tper(const void *data) { const struct d0_tper_features *tper = data; @@ -280,6 +384,30 @@ static bool check_tper(const void *data) return true; } +static bool check_lcksuppt(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_SUPPORTED_MASK); +} + +static bool check_lckenabled(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKING_ENABLED_MASK); +} + +static bool check_locked(const void *data) +{ + const struct d0_locking_features *lfeat 
= data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & LOCKED_MASK); +} + static bool check_mbrenabled(const void *data) { const struct d0_locking_features *lfeat = data; @@ -288,6 +416,14 @@ static bool check_mbrenabled(const void *data) return !!(sup_feat & MBR_ENABLED_MASK); } +static bool check_mbrdone(const void *data) +{ + const struct d0_locking_features *lfeat = data; + u8 sup_feat = lfeat->supported_features; + + return !!(sup_feat & MBR_DONE_MASK); +} + static bool check_sum(const void *data) { const struct d0_single_user_mode *sum = data; @@ -376,6 +512,8 @@ static void check_geometry(struct opal_dev *dev, const void *data) dev->align = be64_to_cpu(geo->alignment_granularity); dev->lowest_lba = be64_to_cpu(geo->lowest_aligned_lba); + dev->logical_block_size = be32_to_cpu(geo->logical_block_size); + dev->align_required = geo->reserved01 & 1; } static int execute_step(struct opal_dev *dev, @@ -426,8 +564,11 @@ out_error: return error; } -static int opal_discovery0_end(struct opal_dev *dev) +static int opal_discovery0_end(struct opal_dev *dev, void *data) { + struct opal_discovery *discv_out = data; /* may be NULL */ + u8 __user *buf_out; + u64 len_out; bool found_com_id = false, supported = true, single_user = false; const struct d0_header *hdr = (struct d0_header *)dev->resp; const u8 *epos = dev->resp, *cpos = dev->resp; @@ -435,7 +576,7 @@ static int opal_discovery0_end(struct opal_dev *dev) u32 hlen = be32_to_cpu(hdr->length); print_buffer(dev->resp, hlen); - dev->mbr_enabled = false; + dev->flags &= OPAL_FL_SUPPORTED; if (hlen > IO_BUFFER_LENGTH - sizeof(*hdr)) { pr_debug("Discovery length overflows buffer (%zu+%u)/%u\n", @@ -443,6 +584,15 @@ static int opal_discovery0_end(struct opal_dev *dev) return -EFAULT; } + if (discv_out) { + buf_out = (u8 __user *)(uintptr_t)discv_out->data; + len_out = min_t(u64, discv_out->size, hlen); + if (buf_out && copy_to_user(buf_out, dev->resp, len_out)) + return -EFAULT; + + discv_out->size = 
hlen; /* actual size of data */ + } + epos += hlen; /* end of buffer */ cpos += sizeof(*hdr); /* current position on buffer */ @@ -456,12 +606,23 @@ static int opal_discovery0_end(struct opal_dev *dev) break; case FC_SINGLEUSER: single_user = check_sum(body->features); + if (single_user) + dev->flags |= OPAL_FL_SUM_SUPPORTED; break; case FC_GEOMETRY: check_geometry(dev, body); break; case FC_LOCKING: - dev->mbr_enabled = check_mbrenabled(body->features); + if (check_lcksuppt(body->features)) + dev->flags |= OPAL_FL_LOCKING_SUPPORTED; + if (check_lckenabled(body->features)) + dev->flags |= OPAL_FL_LOCKING_ENABLED; + if (check_locked(body->features)) + dev->flags |= OPAL_FL_LOCKED; + if (check_mbrenabled(body->features)) + dev->flags |= OPAL_FL_MBR_ENABLED; + if (check_mbrdone(body->features)) + dev->flags |= OPAL_FL_MBR_DONE; break; case FC_ENTERPRISE: case FC_DATASTORE: @@ -517,13 +678,13 @@ static int opal_discovery0(struct opal_dev *dev, void *data) if (ret) return ret; - return opal_discovery0_end(dev); + return opal_discovery0_end(dev, data); } static int opal_discovery0_step(struct opal_dev *dev) { const struct opal_step discovery0_step = { - opal_discovery0, + opal_discovery0, NULL }; return execute_step(dev, &discovery0_step, 0); @@ -895,16 +1056,20 @@ static int response_parse(const u8 *buf, size_t length, token_length = response_parse_medium(iter, pos); else if (pos[0] <= LONG_ATOM_BYTE) /* long atom */ token_length = response_parse_long(iter, pos); + else if (pos[0] == EMPTY_ATOM_BYTE) /* empty atom */ + token_length = 1; else /* TOKEN */ token_length = response_parse_token(iter, pos); if (token_length < 0) return token_length; + if (pos[0] != EMPTY_ATOM_BYTE) + num_entries++; + pos += token_length; total -= token_length; iter++; - num_entries++; } resp->num = num_entries; @@ -1105,12 +1270,8 @@ static int finalize_and_send(struct opal_dev *dev, cont_fn cont) return opal_send_recv(dev, cont); } -/* - * request @column from table @table on device @dev. 
On success, the column - * data will be available in dev->resp->tok[4] - */ -static int generic_get_column(struct opal_dev *dev, const u8 *table, - u64 column) +static int generic_get_columns(struct opal_dev *dev, const u8 *table, + u64 start_column, u64 end_column) { int err; @@ -1120,12 +1281,12 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_STARTCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, start_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_STARTNAME); add_token_u8(&err, dev, OPAL_ENDCOLUMN); - add_token_u64(&err, dev, column); + add_token_u64(&err, dev, end_column); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); @@ -1137,6 +1298,16 @@ static int generic_get_column(struct opal_dev *dev, const u8 *table, } /* + * request @column from table @table on device @dev. On success, the column + * data will be available in dev->resp->tok[4] + */ +static int generic_get_column(struct opal_dev *dev, const u8 *table, + u64 column) +{ + return generic_get_columns(dev, table, column, column); +} + +/* * see TCG SAS 5.3.2.3 for a description of the available columns * * the result is provided in dev->resp->tok[4] @@ -1395,6 +1566,129 @@ static int setup_locking_range(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int response_get_column(const struct parsed_resp *resp, + int *iter, + u8 column, + u64 *value) +{ + const struct opal_resp_tok *tok; + int n = *iter; + u64 val; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_STARTNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + if (response_get_u64(resp, n) != column) { + pr_debug("Token %d does not match expected column %u.\n", + n, column); + return OPAL_INVAL_PARAM; + } 
+ n++; + + val = response_get_u64(resp, n); + n++; + + tok = response_get_token(resp, n); + if (IS_ERR(tok)) + return PTR_ERR(tok); + + if (!response_token_matches(tok, OPAL_ENDNAME)) { + pr_debug("Unexpected response token type %d.\n", n); + return OPAL_INVAL_PARAM; + } + n++; + + *value = val; + *iter = n; + + return 0; +} + +static int locking_range_status(struct opal_dev *dev, void *data) +{ + u8 lr_buffer[OPAL_UID_LENGTH]; + u64 resp; + bool rlocked, wlocked; + int err, tok_n = 2; + struct opal_lr_status *lrst = data; + + err = build_locking_range(lr_buffer, sizeof(lr_buffer), + lrst->session.opal_key.lr); + if (err) + return err; + + err = generic_get_columns(dev, lr_buffer, OPAL_RANGESTART, + OPAL_WRITELOCKED); + if (err) { + pr_debug("Couldn't get lr %u table columns %d to %d.\n", + lrst->session.opal_key.lr, OPAL_RANGESTART, + OPAL_WRITELOCKED); + return err; + } + + /* range start */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGESTART, + &lrst->range_start); + if (err) + return err; + + /* range length */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_RANGELENGTH, + &lrst->range_length); + if (err) + return err; + + /* RLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKENABLED, + &resp); + if (err) + return err; + + lrst->RLE = !!resp; + + /* WLE */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKENABLED, + &resp); + if (err) + return err; + + lrst->WLE = !!resp; + + /* read locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_READLOCKED, &resp); + if (err) + return err; + + rlocked = !!resp; + + /* write locked */ + err = response_get_column(&dev->parsed, &tok_n, OPAL_WRITELOCKED, &resp); + if (err) + return err; + + wlocked = !!resp; + + /* opal_lock_state can not map 'read locked' only state. 
*/ + lrst->l_state = OPAL_RW; + if (rlocked && wlocked) + lrst->l_state = OPAL_LK; + else if (wlocked) + lrst->l_state = OPAL_RO; + else if (rlocked) { + pr_debug("Can not report read locked only state.\n"); + return -EINVAL; + } + + return 0; +} + static int start_generic_opal_session(struct opal_dev *dev, enum opal_uid auth, enum opal_uid sp_type, @@ -1580,6 +1874,26 @@ static int internal_activate_user(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int revert_lsp(struct opal_dev *dev, void *data) +{ + struct opal_revert_lsp *rev = data; + int err; + + err = cmd_start(dev, opaluid[OPAL_THISSP_UID], + opalmethod[OPAL_REVERTSP]); + add_token_u8(&err, dev, OPAL_STARTNAME); + add_token_u64(&err, dev, OPAL_KEEP_GLOBAL_RANGE_KEY); + add_token_u8(&err, dev, (rev->options & OPAL_PRESERVE) ? + OPAL_TRUE : OPAL_FALSE); + add_token_u8(&err, dev, OPAL_ENDNAME); + if (err) { + pr_debug("Error building REVERT SP command.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int erase_locking_range(struct opal_dev *dev, void *data) { struct opal_session_info *session = data; @@ -1717,25 +2031,43 @@ static int set_sid_cpin_pin(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } -static int add_user_to_lr(struct opal_dev *dev, void *data) +static void add_authority_object_ref(int *err, + struct opal_dev *dev, + const u8 *uid, + size_t uid_len) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, + opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], + OPAL_UID_LENGTH/2); + add_token_bytestring(err, dev, uid, uid_len); + add_token_u8(err, dev, OPAL_ENDNAME); +} + +static void add_boolean_object_ref(int *err, + struct opal_dev *dev, + u8 boolean_op) +{ + add_token_u8(err, dev, OPAL_STARTNAME); + add_token_bytestring(err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], + OPAL_UID_LENGTH/2); + add_token_u8(err, dev, boolean_op); + add_token_u8(err, 
dev, OPAL_ENDNAME); +} + +static int set_lr_boolean_ace(struct opal_dev *dev, + unsigned int opal_uid, + u8 lr, + const u8 *users, + size_t users_len) { u8 lr_buffer[OPAL_UID_LENGTH]; u8 user_uid[OPAL_UID_LENGTH]; - struct opal_lock_unlock *lkul = data; + u8 u; int err; - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_RDLOCKED], - OPAL_UID_LENGTH); - - if (lkul->l_state == OPAL_RW) - memcpy(lr_buffer, opaluid[OPAL_LOCKINGRANGE_ACE_WRLOCKED], - OPAL_UID_LENGTH); - - lr_buffer[7] = lkul->session.opal_key.lr; - - memcpy(user_uid, opaluid[OPAL_USER1_UID], OPAL_UID_LENGTH); - - user_uid[7] = lkul->session.who; + memcpy(lr_buffer, opaluid[opal_uid], OPAL_UID_LENGTH); + lr_buffer[7] = lr; err = cmd_start(dev, lr_buffer, opalmethod[OPAL_SET]); @@ -1748,35 +2080,49 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) add_token_u8(&err, dev, OPAL_STARTLIST); + for (u = 0; u < users_len; u++) { + if (users[u] == OPAL_ADMIN1) + memcpy(user_uid, opaluid[OPAL_ADMIN1_UID], + OPAL_UID_LENGTH); + else { + memcpy(user_uid, opaluid[OPAL_USER1_UID], + OPAL_UID_LENGTH); + user_uid[7] = users[u]; + } - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, - opaluid[OPAL_HALF_UID_AUTHORITY_OBJ_REF], - OPAL_UID_LENGTH/2); - add_token_bytestring(&err, dev, user_uid, OPAL_UID_LENGTH); - add_token_u8(&err, dev, OPAL_ENDNAME); - - - add_token_u8(&err, dev, OPAL_STARTNAME); - add_token_bytestring(&err, dev, opaluid[OPAL_HALF_UID_BOOLEAN_ACE], - OPAL_UID_LENGTH/2); - add_token_u8(&err, dev, 1); - add_token_u8(&err, dev, OPAL_ENDNAME); + add_authority_object_ref(&err, dev, user_uid, sizeof(user_uid)); + /* + * Add boolean operator in postfix only with + * two or more authorities being added in ACE + * expresion. 
+ * */ + if (u > 0) + add_boolean_object_ref(&err, dev, OPAL_BOOLEAN_OR); + } add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); add_token_u8(&err, dev, OPAL_ENDLIST); add_token_u8(&err, dev, OPAL_ENDNAME); + return err; +} + +static int add_user_to_lr(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, + lkul->l_state == OPAL_RW ? + OPAL_LOCKINGRANGE_ACE_WRLOCKED : + OPAL_LOCKINGRANGE_ACE_RDLOCKED, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); if (err) { pr_debug("Error building add user to locking range command.\n"); return err; @@ -1785,6 +2131,27 @@ static int add_user_to_lr(struct opal_dev *dev, void *data) return finalize_and_send(dev, parse_and_check_status); } +static int add_user_to_lr_ace(struct opal_dev *dev, void *data) +{ + int err; + struct opal_lock_unlock *lkul = data; + const u8 users[] = { + OPAL_ADMIN1, + lkul->session.who + }; + + err = set_lr_boolean_ace(dev, OPAL_LOCKINGRANGE_ACE_START_TO_KEY, + lkul->session.opal_key.lr, users, + ARRAY_SIZE(users)); + + if (err) { + pr_debug("Error building add user to locking ranges ACEs.\n"); + return err; + } + + return finalize_and_send(dev, parse_and_check_status); +} + static int lock_unlock_locking_range(struct opal_dev *dev, void *data) { u8 lr_buffer[OPAL_UID_LENGTH]; @@ -2109,7 +2476,8 @@ static int check_opal_support(struct opal_dev *dev) mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = opal_discovery0_step(dev); - dev->supported = !ret; + if (!ret) + dev->flags |= OPAL_FL_SUPPORTED; mutex_unlock(&dev->dev_lock); return ret; @@ -2134,6 +2502,8 @@ void free_opal_dev(struct opal_dev *dev) return; clean_opal_dev(dev); + kfree(dev->resp); + kfree(dev->cmd); kfree(dev); } EXPORT_SYMBOL(free_opal_dev); @@ -2146,17 +2516,40 @@ struct opal_dev *init_opal_dev(void *data, sec_send_recv *send_recv) if (!dev) return NULL; + /* + * Presumably DMA-able 
buffers must be cache-aligned. Kmalloc makes + * sure the allocated buffer is DMA-safe in that regard. + */ + dev->cmd = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL); + if (!dev->cmd) + goto err_free_dev; + + dev->resp = kmalloc(IO_BUFFER_LENGTH, GFP_KERNEL); + if (!dev->resp) + goto err_free_cmd; + INIT_LIST_HEAD(&dev->unlk_lst); mutex_init(&dev->dev_lock); + dev->flags = 0; dev->data = data; dev->send_recv = send_recv; if (check_opal_support(dev) != 0) { pr_debug("Opal is not supported on this device\n"); - kfree(dev); - return NULL; + goto err_free_resp; } return dev; + +err_free_resp: + kfree(dev->resp); + +err_free_cmd: + kfree(dev->cmd); + +err_free_dev: + kfree(dev); + + return NULL; } EXPORT_SYMBOL(init_opal_dev); @@ -2171,6 +2564,9 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2179,6 +2575,42 @@ static int opal_secure_erase_locking_range(struct opal_dev *dev, return ret; } +static int opal_get_discv(struct opal_dev *dev, struct opal_discovery *discv) +{ + const struct opal_step discovery0_step = { + opal_discovery0, discv + }; + int ret = 0; + + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_step(dev, &discovery0_step, 0); + mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + return discv->size; /* modified to actual length of data */ +} + +static int opal_revertlsp(struct opal_dev *dev, struct opal_revert_lsp *rev) +{ + /* controller will terminate session */ + const struct opal_step steps[] = { + { start_admin1LSP_opal_session, &rev->key }, + { revert_lsp, rev } + }; + int ret; + + ret = opal_get_key(dev, &rev->key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + static int 
opal_erase_locking_range(struct opal_dev *dev, struct opal_session_info *opal_session) { @@ -2189,6 +2621,9 @@ static int opal_erase_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, erase_steps, ARRAY_SIZE(erase_steps)); @@ -2217,6 +2652,9 @@ static int opal_enable_disable_shadow_mbr(struct opal_dev *dev, opal_mbr->enable_disable != OPAL_MBR_DISABLE) return -EINVAL; + ret = opal_get_key(dev, &opal_mbr->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2242,6 +2680,9 @@ static int opal_set_mbr_done(struct opal_dev *dev, mbr_done->done_flag != OPAL_MBR_NOT_DONE) return -EINVAL; + ret = opal_get_key(dev, &mbr_done->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2263,6 +2704,9 @@ static int opal_write_shadow_mbr(struct opal_dev *dev, if (info->size == 0) return 0; + ret = opal_get_key(dev, &info->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, mbr_steps, ARRAY_SIZE(mbr_steps)); @@ -2296,6 +2740,7 @@ static int opal_add_user_to_lr(struct opal_dev *dev, const struct opal_step steps[] = { { start_admin1LSP_opal_session, &lk_unlk->session.opal_key }, { add_user_to_lr, lk_unlk }, + { add_user_to_lr_ace, lk_unlk }, { end_opal_session, } }; int ret; @@ -2319,6 +2764,9 @@ static int opal_add_user_to_lr(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, steps, ARRAY_SIZE(steps)); @@ -2341,6 +2789,10 @@ static int opal_reverttper(struct opal_dev *dev, struct opal_key *opal, bool psi int ret; + ret = opal_get_key(dev, opal); + + if (ret) + return 
ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); if (psid) @@ -2395,6 +2847,44 @@ static int __opal_set_mbr_done(struct opal_dev *dev, struct opal_key *key) return execute_steps(dev, mbrdone_step, ARRAY_SIZE(mbrdone_step)); } +static void opal_lock_check_for_saved_key(struct opal_dev *dev, + struct opal_lock_unlock *lk_unlk) +{ + struct opal_suspend_data *iter; + + if (lk_unlk->l_state != OPAL_LK || + lk_unlk->session.opal_key.key_len > 0) + return; + + /* + * Usually when closing a crypto device (eg: dm-crypt with LUKS) the + * volume key is not required, as it requires root privileges anyway, + * and root can deny access to a disk in many ways regardless. + * Requiring the volume key to lock the device is a peculiarity of the + * OPAL specification. Given we might already have saved the key if + * the user requested it via the 'IOC_OPAL_SAVE' ioctl, we can use + * that key to lock the device if no key was provided here, the + * locking range matches and the appropriate flag was passed with + * 'IOC_OPAL_SAVE'. + * This allows integrating OPAL with tools and libraries that are used + * to the common behaviour and do not ask for the volume key when + * closing a device. 
+ */ + setup_opal_dev(dev); + list_for_each_entry(iter, &dev->unlk_lst, node) { + if ((iter->unlk.flags & OPAL_SAVE_FOR_LOCK) && + iter->lr == lk_unlk->session.opal_key.lr && + iter->unlk.session.opal_key.key_len > 0) { + lk_unlk->session.opal_key.key_len = + iter->unlk.session.opal_key.key_len; + memcpy(lk_unlk->session.opal_key.key, + iter->unlk.session.opal_key.key, + iter->unlk.session.opal_key.key_len); + break; + } + } +} + static int opal_lock_unlock(struct opal_dev *dev, struct opal_lock_unlock *lk_unlk) { @@ -2404,7 +2894,10 @@ static int opal_lock_unlock(struct opal_dev *dev, return -EINVAL; mutex_lock(&dev->dev_lock); - ret = __opal_lock_unlock(dev, lk_unlk); + opal_lock_check_for_saved_key(dev, lk_unlk); + ret = opal_get_key(dev, &lk_unlk->session.opal_key); + if (!ret) + ret = __opal_lock_unlock(dev, lk_unlk); mutex_unlock(&dev->dev_lock); return ret; @@ -2425,6 +2918,9 @@ static int opal_take_ownership(struct opal_dev *dev, struct opal_key *opal) if (!dev) return -ENODEV; + ret = opal_get_key(dev, opal); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, owner_steps, ARRAY_SIZE(owner_steps)); @@ -2447,6 +2943,9 @@ static int opal_activate_lsp(struct opal_dev *dev, if (!opal_lr_act->num_lrs || opal_lr_act->num_lrs > OPAL_MAX_LRS) return -EINVAL; + ret = opal_get_key(dev, &opal_lr_act->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, active_steps, ARRAY_SIZE(active_steps)); @@ -2465,11 +2964,41 @@ static int opal_setup_locking_range(struct opal_dev *dev, }; int ret; + ret = opal_get_key(dev, &opal_lrs->session.opal_key); + if (ret) + return ret; + mutex_lock(&dev->dev_lock); + setup_opal_dev(dev); + ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); + mutex_unlock(&dev->dev_lock); + + return ret; +} + +static int opal_locking_range_status(struct opal_dev *dev, + struct opal_lr_status *opal_lrst, + void __user *data) +{ + const struct opal_step 
lr_steps[] = { + { start_auth_opal_session, &opal_lrst->session }, + { locking_range_status, opal_lrst }, + { end_opal_session, } + }; + int ret; + mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, lr_steps, ARRAY_SIZE(lr_steps)); mutex_unlock(&dev->dev_lock); + /* skip session info when copying back to uspace */ + if (!ret && copy_to_user(data + offsetof(struct opal_lr_status, range_start), + (void *)opal_lrst + offsetof(struct opal_lr_status, range_start), + sizeof(*opal_lrst) - offsetof(struct opal_lr_status, range_start))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + return ret; } @@ -2491,6 +3020,20 @@ static int opal_set_new_pw(struct opal_dev *dev, struct opal_new_pw *opal_pw) ret = execute_steps(dev, pw_steps, ARRAY_SIZE(pw_steps)); mutex_unlock(&dev->dev_lock); + if (ret) + return ret; + + /* update keyring and key store with new password */ + ret = sed_write_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + if (ret != -EOPNOTSUPP) + pr_warn("error updating SED key: %d\n", ret); + + ret = update_sed_opal_key(OPAL_AUTH_KEY, + opal_pw->new_user_pw.opal_key.key, + opal_pw->new_user_pw.opal_key.key_len); + return ret; } @@ -2511,6 +3054,9 @@ static int opal_activate_user(struct opal_dev *dev, return -EINVAL; } + ret = opal_get_key(dev, &opal_session->opal_key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); ret = execute_steps(dev, act_steps, ARRAY_SIZE(act_steps)); @@ -2528,7 +3074,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) if (!dev) return false; - if (!dev->supported) + if (!(dev->flags & OPAL_FL_SUPPORTED)) return false; mutex_lock(&dev->dev_lock); @@ -2546,7 +3092,7 @@ bool opal_unlock_from_suspend(struct opal_dev *dev) was_failure = true; } - if (dev->mbr_enabled) { + if (dev->flags & OPAL_FL_MBR_ENABLED) { ret = __opal_set_mbr_done(dev, &suspend->unlk.session.opal_key); if (ret) pr_debug("Failed to set MBR 
Done in S3 resume\n"); @@ -2597,6 +3143,9 @@ static int opal_generic_read_write_table(struct opal_dev *dev, { int ret, bit_set; + ret = opal_get_key(dev, &rw_tbl->key); + if (ret) + return ret; mutex_lock(&dev->dev_lock); setup_opal_dev(dev); @@ -2620,6 +3169,43 @@ static int opal_generic_read_write_table(struct opal_dev *dev, return ret; } +static int opal_get_status(struct opal_dev *dev, void __user *data) +{ + struct opal_status sts = {0}; + + /* + * check_opal_support() error is not fatal, + * !dev->supported is a valid condition + */ + if (!check_opal_support(dev)) + sts.flags = dev->flags; + if (copy_to_user(data, &sts, sizeof(sts))) { + pr_debug("Error copying status to userspace\n"); + return -EFAULT; + } + return 0; +} + +static int opal_get_geometry(struct opal_dev *dev, void __user *data) +{ + struct opal_geometry geo = {0}; + + if (check_opal_support(dev)) + return -EINVAL; + + geo.align = dev->align_required; + geo.logical_block_size = dev->logical_block_size; + geo.alignment_granularity = dev->align; + geo.lowest_aligned_lba = dev->lowest_lba; + + if (copy_to_user(data, &geo, sizeof(geo))) { + pr_debug("Error copying geometry data to userspace\n"); + return -EFAULT; + } + + return 0; +} + int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) { void *p; @@ -2628,13 +3214,15 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EACCES; if (!dev) - return -ENOTSUPP; - if (!dev->supported) - return -ENOTSUPP; + return -EOPNOTSUPP; + if (!(dev->flags & OPAL_FL_SUPPORTED)) + return -EOPNOTSUPP; - p = memdup_user(arg, _IOC_SIZE(cmd)); - if (IS_ERR(p)) - return PTR_ERR(p); + if (cmd & IOC_IN) { + p = memdup_user(arg, _IOC_SIZE(cmd)); + if (IS_ERR(p)) + return PTR_ERR(p); + } switch (cmd) { case IOC_OPAL_SAVE: @@ -2685,11 +3273,54 @@ int sed_ioctl(struct opal_dev *dev, unsigned int cmd, void __user *arg) case IOC_OPAL_GENERIC_TABLE_RW: ret = opal_generic_read_write_table(dev, p); 
break; + case IOC_OPAL_GET_STATUS: + ret = opal_get_status(dev, arg); + break; + case IOC_OPAL_GET_LR_STATUS: + ret = opal_locking_range_status(dev, p, arg); + break; + case IOC_OPAL_GET_GEOMETRY: + ret = opal_get_geometry(dev, arg); + break; + case IOC_OPAL_REVERT_LSP: + ret = opal_revertlsp(dev, p); + break; + case IOC_OPAL_DISCOVERY: + ret = opal_get_discv(dev, p); + break; + default: break; } - kfree(p); + if (cmd & IOC_IN) + kfree(p); return ret; } EXPORT_SYMBOL_GPL(sed_ioctl); + +static int __init sed_opal_init(void) +{ + struct key *kr; + char init_sed_key[OPAL_KEY_MAX]; + int keylen = OPAL_KEY_MAX - 1; + + kr = keyring_alloc(".sed_opal", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + (KEY_POS_ALL & ~KEY_POS_SETATTR) | KEY_USR_VIEW | + KEY_USR_READ | KEY_USR_SEARCH | KEY_USR_WRITE, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(kr)) + return PTR_ERR(kr); + + sed_opal_keyring = kr; + + if (sed_read_key(OPAL_AUTH_KEY, init_sed_key, &keylen) < 0) { + memset(init_sed_key, '\0', sizeof(init_sed_key)); + keylen = OPAL_KEY_MAX - 1; + } + + return update_sed_opal_key(OPAL_AUTH_KEY, init_sed_key, keylen); +} +late_initcall(sed_opal_init); diff --git a/block/t10-pi.c b/block/t10-pi.c index d910534b3a41..914d8cddd43a 100644 --- a/block/t10-pi.c +++ b/block/t10-pi.c @@ -5,10 +5,12 @@ */ #include <linux/t10-pi.h> -#include <linux/blkdev.h> +#include <linux/blk-integrity.h> #include <linux/crc-t10dif.h> +#include <linux/crc64.h> #include <linux/module.h> #include <net/checksum.h> +#include <asm/unaligned.h> typedef __be16 (csum_fn) (void *, unsigned int); @@ -44,7 +46,7 @@ static blk_status_t t10_pi_generate(struct blk_integrity_iter *iter, pi->ref_tag = 0; iter->data_buf += iter->interval; - iter->prot_buf += sizeof(struct t10_pi_tuple); + iter->prot_buf += iter->tuple_size; iter->seed++; } @@ -93,7 +95,7 @@ static blk_status_t t10_pi_verify(struct blk_integrity_iter *iter, next: iter->data_buf += iter->interval; - iter->prot_buf += sizeof(struct 
t10_pi_tuple); + iter->prot_buf += iter->tuple_size; iter->seed++; } @@ -147,11 +149,10 @@ static void t10_pi_type1_prepare(struct request *rq) break; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -161,8 +162,7 @@ static void t10_pi_type1_prepare(struct request *rq) ref_tag++; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } bip->bip_flags |= BIP_MAPPED_INTEGRITY; @@ -195,11 +195,10 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) struct bvec_iter iter; bip_for_each_vec(iv, bip, iter) { - void *p, *pmap; unsigned int j; + void *p; - pmap = kmap_atomic(iv.bv_page); - p = pmap + iv.bv_offset; + p = bvec_kmap_local(&iv); for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { struct t10_pi_tuple *pi = p; @@ -210,8 +209,7 @@ static void t10_pi_type1_complete(struct request *rq, unsigned int nr_bytes) intervals--; p += tuple_sz; } - - kunmap_atomic(pmap); + kunmap_local(p); } } } @@ -282,4 +280,196 @@ const struct blk_integrity_profile t10_pi_type3_ip = { }; EXPORT_SYMBOL(t10_pi_type3_ip); +static __be64 ext_pi_crc64(void *data, unsigned int len) +{ + return cpu_to_be64(crc64_rocksoft(data, len)); +} + +static blk_status_t ext_pi_crc64_generate(struct blk_integrity_iter *iter, + enum t10_dif_type type) +{ + unsigned int i; + + for (i = 0 ; i < iter->data_size ; i += iter->interval) { + struct crc64_pi_tuple *pi = iter->prot_buf; + + pi->guard_tag = ext_pi_crc64(iter->data_buf, iter->interval); + pi->app_tag = 0; + + if (type == T10_PI_TYPE1_PROTECTION) + put_unaligned_be48(iter->seed, pi->ref_tag); + else + put_unaligned_be48(0ULL, pi->ref_tag); + + iter->data_buf += iter->interval; + iter->prot_buf += iter->tuple_size; + iter->seed++; + } + + return BLK_STS_OK; +} + +static bool ext_pi_ref_escape(u8 *ref_tag) +{ + static u8 
ref_escape[6] = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }; + + return memcmp(ref_tag, ref_escape, sizeof(ref_escape)) == 0; +} + +static blk_status_t ext_pi_crc64_verify(struct blk_integrity_iter *iter, + enum t10_dif_type type) +{ + unsigned int i; + + for (i = 0; i < iter->data_size; i += iter->interval) { + struct crc64_pi_tuple *pi = iter->prot_buf; + u64 ref, seed; + __be64 csum; + + if (type == T10_PI_TYPE1_PROTECTION) { + if (pi->app_tag == T10_PI_APP_ESCAPE) + goto next; + + ref = get_unaligned_be48(pi->ref_tag); + seed = lower_48_bits(iter->seed); + if (ref != seed) { + pr_err("%s: ref tag error at location %llu (rcvd %llu)\n", + iter->disk_name, seed, ref); + return BLK_STS_PROTECTION; + } + } else if (type == T10_PI_TYPE3_PROTECTION) { + if (pi->app_tag == T10_PI_APP_ESCAPE && + ext_pi_ref_escape(pi->ref_tag)) + goto next; + } + + csum = ext_pi_crc64(iter->data_buf, iter->interval); + if (pi->guard_tag != csum) { + pr_err("%s: guard tag error at sector %llu " \ + "(rcvd %016llx, want %016llx)\n", + iter->disk_name, (unsigned long long)iter->seed, + be64_to_cpu(pi->guard_tag), be64_to_cpu(csum)); + return BLK_STS_PROTECTION; + } + +next: + iter->data_buf += iter->interval; + iter->prot_buf += iter->tuple_size; + iter->seed++; + } + + return BLK_STS_OK; +} + +static blk_status_t ext_pi_type1_verify_crc64(struct blk_integrity_iter *iter) +{ + return ext_pi_crc64_verify(iter, T10_PI_TYPE1_PROTECTION); +} + +static blk_status_t ext_pi_type1_generate_crc64(struct blk_integrity_iter *iter) +{ + return ext_pi_crc64_generate(iter, T10_PI_TYPE1_PROTECTION); +} + +static void ext_pi_type1_prepare(struct request *rq) +{ + const int tuple_sz = rq->q->integrity.tuple_size; + u64 ref_tag = ext_pi_ref_tag(rq); + struct bio *bio; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u64 virt = lower_48_bits(bip_get_seed(bip)); + struct bio_vec iv; + struct bvec_iter iter; + + /* Already remapped? 
*/ + if (bip->bip_flags & BIP_MAPPED_INTEGRITY) + break; + + bip_for_each_vec(iv, bip, iter) { + unsigned int j; + void *p; + + p = bvec_kmap_local(&iv); + for (j = 0; j < iv.bv_len; j += tuple_sz) { + struct crc64_pi_tuple *pi = p; + u64 ref = get_unaligned_be48(pi->ref_tag); + + if (ref == virt) + put_unaligned_be48(ref_tag, pi->ref_tag); + virt++; + ref_tag++; + p += tuple_sz; + } + kunmap_local(p); + } + + bip->bip_flags |= BIP_MAPPED_INTEGRITY; + } +} + +static void ext_pi_type1_complete(struct request *rq, unsigned int nr_bytes) +{ + unsigned intervals = nr_bytes >> rq->q->integrity.interval_exp; + const int tuple_sz = rq->q->integrity.tuple_size; + u64 ref_tag = ext_pi_ref_tag(rq); + struct bio *bio; + + __rq_for_each_bio(bio, rq) { + struct bio_integrity_payload *bip = bio_integrity(bio); + u64 virt = lower_48_bits(bip_get_seed(bip)); + struct bio_vec iv; + struct bvec_iter iter; + + bip_for_each_vec(iv, bip, iter) { + unsigned int j; + void *p; + + p = bvec_kmap_local(&iv); + for (j = 0; j < iv.bv_len && intervals; j += tuple_sz) { + struct crc64_pi_tuple *pi = p; + u64 ref = get_unaligned_be48(pi->ref_tag); + + if (ref == ref_tag) + put_unaligned_be48(virt, pi->ref_tag); + virt++; + ref_tag++; + intervals--; + p += tuple_sz; + } + kunmap_local(p); + } + } +} + +static blk_status_t ext_pi_type3_verify_crc64(struct blk_integrity_iter *iter) +{ + return ext_pi_crc64_verify(iter, T10_PI_TYPE3_PROTECTION); +} + +static blk_status_t ext_pi_type3_generate_crc64(struct blk_integrity_iter *iter) +{ + return ext_pi_crc64_generate(iter, T10_PI_TYPE3_PROTECTION); +} + +const struct blk_integrity_profile ext_pi_type1_crc64 = { + .name = "EXT-DIF-TYPE1-CRC64", + .generate_fn = ext_pi_type1_generate_crc64, + .verify_fn = ext_pi_type1_verify_crc64, + .prepare_fn = ext_pi_type1_prepare, + .complete_fn = ext_pi_type1_complete, +}; +EXPORT_SYMBOL_GPL(ext_pi_type1_crc64); + +const struct blk_integrity_profile ext_pi_type3_crc64 = { + .name = "EXT-DIF-TYPE3-CRC64", + 
.generate_fn = ext_pi_type3_generate_crc64, + .verify_fn = ext_pi_type3_verify_crc64, + .prepare_fn = t10_pi_type3_prepare, + .complete_fn = t10_pi_type3_complete, +}; +EXPORT_SYMBOL_GPL(ext_pi_type3_crc64); + +MODULE_LICENSE("GPL"); MODULE_LICENSE("GPL"); |