aboutsummaryrefslogtreecommitdiffstats
path: root/drivers/nvme
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/nvme')
-rw-r--r--drivers/nvme/host/core.c77
-rw-r--r--drivers/nvme/host/fc.c18
-rw-r--r--drivers/nvme/host/multipath.c98
-rw-r--r--drivers/nvme/host/nvme.h15
-rw-r--r--drivers/nvme/host/pci.c62
-rw-r--r--drivers/nvme/host/rdma.c40
-rw-r--r--drivers/nvme/host/tcp.c41
-rw-r--r--drivers/nvme/target/core.c6
-rw-r--r--drivers/nvme/target/fabrics-cmd.c15
-rw-r--r--drivers/nvme/target/rdma.c6
-rw-r--r--drivers/nvme/target/tcp.c14
11 files changed, 265 insertions, 127 deletions
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 2bdac578ee51..eee1711d9639 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -6,6 +6,7 @@
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
+#include <linux/compat.h>
#include <linux/delay.h>
#include <linux/errno.h>
#include <linux/hdreg.h>
@@ -66,8 +67,8 @@ MODULE_PARM_DESC(streams, "turn on support for Streams write directives");
* nvme_reset_wq - hosts nvme reset works
* nvme_delete_wq - hosts nvme delete works
*
- * nvme_wq will host works such are scan, aen handling, fw activation,
- * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq
+ * nvme_wq will host works such as scan, aen handling, fw activation,
+ * keep-alive, periodic reconnects etc. nvme_reset_wq
* runs reset works which also flush works hosted on nvme_wq for
* serialization purposes. nvme_delete_wq host controller deletion
* works which flush reset works for serialization.
@@ -565,8 +566,14 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
struct nvme_dsm_range *range;
struct bio *bio;
- range = kmalloc_array(segments, sizeof(*range),
- GFP_ATOMIC | __GFP_NOWARN);
+ /*
+ * Some devices do not consider the DSM 'Number of Ranges' field when
+ * determining how much data to DMA. Always allocate memory for maximum
+ * number of segments to prevent device reading beyond end of buffer.
+ */
+ static const size_t alloc_size = sizeof(*range) * NVME_DSM_MAX_RANGES;
+
+ range = kzalloc(alloc_size, GFP_ATOMIC | __GFP_NOWARN);
if (!range) {
/*
* If we fail allocation our range, fallback to the controller
@@ -606,7 +613,7 @@ static blk_status_t nvme_setup_discard(struct nvme_ns *ns, struct request *req,
req->special_vec.bv_page = virt_to_page(range);
req->special_vec.bv_offset = offset_in_page(range);
- req->special_vec.bv_len = sizeof(*range) * segments;
+ req->special_vec.bv_len = alloc_size;
req->rq_flags |= RQF_SPECIAL_PAYLOAD;
return BLK_STS_OK;
@@ -929,7 +936,7 @@ static void nvme_keep_alive_end_io(struct request *rq, blk_status_t status)
startka = true;
spin_unlock_irqrestore(&ctrl->lock, flags);
if (startka)
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
static int nvme_keep_alive(struct nvme_ctrl *ctrl)
@@ -959,7 +966,7 @@ static void nvme_keep_alive_work(struct work_struct *work)
dev_dbg(ctrl->device,
"reschedule traffic based keep-alive timer\n");
ctrl->comp_seen = false;
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
return;
}
@@ -976,7 +983,7 @@ static void nvme_start_keep_alive(struct nvme_ctrl *ctrl)
if (unlikely(ctrl->kato == 0))
return;
- schedule_delayed_work(&ctrl->ka_work, ctrl->kato * HZ);
+ queue_delayed_work(nvme_wq, &ctrl->ka_work, ctrl->kato * HZ);
}
void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
@@ -988,6 +995,19 @@ void nvme_stop_keep_alive(struct nvme_ctrl *ctrl)
}
EXPORT_SYMBOL_GPL(nvme_stop_keep_alive);
+/*
+ * In NVMe 1.0 the CNS field was just a binary controller or namespace
+ * flag, thus sending any new CNS opcodes has a big chance of not working.
+ * Qemu unfortunately had that bug after reporting a 1.1 version compliance
+ * (but not for any later version).
+ */
+static bool nvme_ctrl_limited_cns(struct nvme_ctrl *ctrl)
+{
+ if (ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)
+ return ctrl->vs < NVME_VS(1, 2, 0);
+ return ctrl->vs < NVME_VS(1, 1, 0);
+}
+
static int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id)
{
struct nvme_command c = { };
@@ -1120,8 +1140,8 @@ static struct nvme_id_ns *nvme_identify_ns(struct nvme_ctrl *ctrl,
static int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11,
void *buffer, size_t buflen, u32 *result)
{
+ union nvme_result res = { 0 };
struct nvme_command c;
- union nvme_result res;
int ret;
memset(&c, 0, sizeof(c));
@@ -1182,6 +1202,18 @@ static void nvme_enable_aen(struct nvme_ctrl *ctrl)
supported_aens);
}
+/*
+ * Convert integer values from ioctl structures to user pointers, silently
+ * ignoring the upper bits in the compat case to match behaviour of 32-bit
+ * kernels.
+ */
+static void __user *nvme_to_user_ptr(uintptr_t ptrval)
+{
+ if (in_compat_syscall())
+ ptrval = (compat_uptr_t)ptrval;
+ return (void __user *)ptrval;
+}
+
static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
{
struct nvme_user_io io;
@@ -1205,7 +1237,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
length = (io.nblocks + 1) << ns->lba_shift;
meta_len = (io.nblocks + 1) * ns->ms;
- metadata = (void __user *)(uintptr_t)io.metadata;
+ metadata = nvme_to_user_ptr(io.metadata);
if (ns->ext) {
length += meta_len;
@@ -1228,7 +1260,7 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio)
c.rw.appmask = cpu_to_le16(io.appmask);
return nvme_submit_user_cmd(ns->queue, &c,
- (void __user *)(uintptr_t)io.addr, length,
+ nvme_to_user_ptr(io.addr), length,
metadata, meta_len, lower_32_bits(io.slba), NULL, 0);
}
@@ -1349,9 +1381,9 @@ static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata,
- cmd.metadata_len, 0, &result, timeout);
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
+ 0, &result, timeout);
nvme_passthru_end(ctrl, effects);
if (status >= 0) {
@@ -1396,8 +1428,8 @@ static int nvme_user_cmd64(struct nvme_ctrl *ctrl, struct nvme_ns *ns,
effects = nvme_passthru_start(ctrl, ns, cmd.opcode);
status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c,
- (void __user *)(uintptr_t)cmd.addr, cmd.data_len,
- (void __user *)(uintptr_t)cmd.metadata, cmd.metadata_len,
+ nvme_to_user_ptr(cmd.addr), cmd.data_len,
+ nvme_to_user_ptr(cmd.metadata), cmd.metadata_len,
0, &cmd.result, timeout);
nvme_passthru_end(ctrl, effects);
@@ -1750,7 +1782,7 @@ static void __nvme_revalidate_disk(struct gendisk *disk, struct nvme_id_ns *id)
if (ns->head->disk) {
nvme_update_disk_info(ns->head->disk, ns, id);
blk_queue_stack_limits(ns->head->disk->queue, ns->queue);
- revalidate_disk(ns->head->disk);
+ nvme_mpath_update_disk_size(ns->head->disk);
}
#endif
}
@@ -2450,8 +2482,8 @@ static bool nvme_validate_cntlid(struct nvme_subsystem *subsys,
lockdep_assert_held(&nvme_subsystems_lock);
list_for_each_entry(tmp, &subsys->ctrls, subsys_entry) {
- if (ctrl->state == NVME_CTRL_DELETING ||
- ctrl->state == NVME_CTRL_DEAD)
+ if (tmp->state == NVME_CTRL_DELETING ||
+ tmp->state == NVME_CTRL_DEAD)
continue;
if (tmp->cntlid == ctrl->cntlid) {
@@ -3414,6 +3446,8 @@ static int nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid)
return 0;
out_put_disk:
+ /* prevent double queue cleanup */
+ ns->disk->queue = NULL;
put_disk(ns->disk);
out_unlink_ns:
mutex_lock(&ctrl->subsys->lock);
@@ -3587,8 +3621,7 @@ static void nvme_scan_work(struct work_struct *work)
mutex_lock(&ctrl->scan_lock);
nn = le32_to_cpu(id->nn);
- if (ctrl->vs >= NVME_VS(1, 1, 0) &&
- !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) {
+ if (!nvme_ctrl_limited_cns(ctrl)) {
if (!nvme_scan_ns_list(ctrl, nn))
goto out_free_id;
}
@@ -3686,7 +3719,7 @@ static void nvme_get_fw_slot_info(struct nvme_ctrl *ctrl)
if (!log)
return;
- if (nvme_get_log(ctrl, NVME_NSID_ALL, 0, NVME_LOG_FW_SLOT, log,
+ if (nvme_get_log(ctrl, NVME_NSID_ALL, NVME_LOG_FW_SLOT, 0, log,
sizeof(*log), 0))
dev_warn(ctrl->device, "Get FW SLOT INFO log error\n");
kfree(log);
diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c
index dd8169bbf0d2..551d5f16e83c 100644
--- a/drivers/nvme/host/fc.c
+++ b/drivers/nvme/host/fc.c
@@ -2882,10 +2882,22 @@ nvme_fc_reconnect_or_delete(struct nvme_fc_ctrl *ctrl, int status)
static void
__nvme_fc_terminate_io(struct nvme_fc_ctrl *ctrl)
{
- nvme_stop_keep_alive(&ctrl->ctrl);
+ /*
+ * if state is connecting - the error occurred as part of a
+ * reconnect attempt. The create_association error paths will
+ * clean up any outstanding io.
+ *
+ * if it's a different state - ensure all pending io is
+ * terminated. Given this can delay while waiting for the
+ * aborted io to return, we recheck adapter state below
+ * before changing state.
+ */
+ if (ctrl->ctrl.state != NVME_CTRL_CONNECTING) {
+ nvme_stop_keep_alive(&ctrl->ctrl);
- /* will block will waiting for io to terminate */
- nvme_fc_delete_association(ctrl);
+ /* will block will waiting for io to terminate */
+ nvme_fc_delete_association(ctrl);
+ }
if (ctrl->ctrl.state != NVME_CTRL_CONNECTING &&
!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING))
diff --git a/drivers/nvme/host/multipath.c b/drivers/nvme/host/multipath.c
index f928bcfc57b5..ec3c0f2ba405 100644
--- a/drivers/nvme/host/multipath.c
+++ b/drivers/nvme/host/multipath.c
@@ -3,6 +3,7 @@
* Copyright (c) 2017-2018 Christoph Hellwig.
*/
+#include <linux/backing-dev.h>
#include <linux/moduleparam.h>
#include <trace/events/block.h>
#include "nvme.h"
@@ -158,20 +159,27 @@ void nvme_mpath_clear_ctrl_paths(struct nvme_ctrl *ctrl)
struct nvme_ns *ns;
mutex_lock(&ctrl->scan_lock);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list)
if (nvme_mpath_clear_current_path(ns))
kblockd_schedule_work(&ns->head->requeue_work);
+ up_read(&ctrl->namespaces_rwsem);
mutex_unlock(&ctrl->scan_lock);
}
+static bool nvme_path_is_disabled(struct nvme_ns *ns)
+{
+ return ns->ctrl->state != NVME_CTRL_LIVE ||
+ test_bit(NVME_NS_ANA_PENDING, &ns->flags);
+}
+
static struct nvme_ns *__nvme_find_path(struct nvme_ns_head *head, int node)
{
int found_distance = INT_MAX, fallback_distance = INT_MAX, distance;
struct nvme_ns *found = NULL, *fallback = NULL, *ns;
list_for_each_entry_rcu(ns, &head->list, siblings) {
- if (ns->ctrl->state != NVME_CTRL_LIVE ||
- test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ if (nvme_path_is_disabled(ns))
continue;
if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_NUMA)
@@ -225,8 +233,7 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
for (ns = nvme_next_ns(head, old);
ns != old;
ns = nvme_next_ns(head, ns)) {
- if (ns->ctrl->state != NVME_CTRL_LIVE ||
- test_bit(NVME_NS_ANA_PENDING, &ns->flags))
+ if (nvme_path_is_disabled(ns))
continue;
if (ns->ana_state == NVME_ANA_OPTIMIZED) {
@@ -237,6 +244,17 @@ static struct nvme_ns *nvme_round_robin_path(struct nvme_ns_head *head,
fallback = ns;
}
+ /*
+ * The loop above skips the current path for round-robin semantics.
+ * Fall back to the current path if either:
+ * - no other optimized path found and current is optimized,
+ * - no other usable path found and current is usable.
+ */
+ if (!nvme_path_is_disabled(old) &&
+ (old->ana_state == NVME_ANA_OPTIMIZED ||
+ (!fallback && old->ana_state == NVME_ANA_NONOPTIMIZED)))
+ return old;
+
if (!fallback)
return NULL;
found = fallback;
@@ -257,10 +275,13 @@ inline struct nvme_ns *nvme_find_path(struct nvme_ns_head *head)
struct nvme_ns *ns;
ns = srcu_dereference(head->current_path[node], &head->srcu);
- if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR && ns)
- ns = nvme_round_robin_path(head, node, ns);
- if (unlikely(!ns || !nvme_path_is_optimized(ns)))
- ns = __nvme_find_path(head, node);
+ if (unlikely(!ns))
+ return __nvme_find_path(head, node);
+
+ if (READ_ONCE(head->subsys->iopolicy) == NVME_IOPOLICY_RR)
+ return nvme_round_robin_path(head, node, ns);
+ if (unlikely(!nvme_path_is_optimized(ns)))
+ return __nvme_find_path(head, node);
return ns;
}
@@ -402,15 +423,14 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
{
struct nvme_ns_head *head = ns->head;
- lockdep_assert_held(&ns->head->lock);
-
if (!head->disk)
return;
- if (!(head->disk->flags & GENHD_FL_UP))
+ if (!test_and_set_bit(NVME_NSHEAD_DISK_LIVE, &head->flags))
device_add_disk(&head->subsys->dev, head->disk,
nvme_ns_id_attr_groups);
+ mutex_lock(&head->lock);
if (nvme_path_is_optimized(ns)) {
int node, srcu_idx;
@@ -419,9 +439,10 @@ static void nvme_mpath_set_live(struct nvme_ns *ns)
__nvme_find_path(head, node);
srcu_read_unlock(&head->srcu, srcu_idx);
}
+ mutex_unlock(&head->lock);
- synchronize_srcu(&ns->head->srcu);
- kblockd_schedule_work(&ns->head->requeue_work);
+ synchronize_srcu(&head->srcu);
+ kblockd_schedule_work(&head->requeue_work);
}
static int nvme_parse_ana_log(struct nvme_ctrl *ctrl, void *data,
@@ -472,14 +493,12 @@ static inline bool nvme_state_is_live(enum nvme_ana_state state)
static void nvme_update_ns_ana_state(struct nvme_ana_group_desc *desc,
struct nvme_ns *ns)
{
- mutex_lock(&ns->head->lock);
ns->ana_grpid = le32_to_cpu(desc->grpid);
ns->ana_state = desc->state;
clear_bit(NVME_NS_ANA_PENDING, &ns->flags);
if (nvme_state_is_live(ns->ana_state))
nvme_mpath_set_live(ns);
- mutex_unlock(&ns->head->lock);
}
static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
@@ -499,7 +518,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (!nr_nsids)
return 0;
- down_write(&ctrl->namespaces_rwsem);
+ down_read(&ctrl->namespaces_rwsem);
list_for_each_entry(ns, &ctrl->namespaces, list) {
unsigned nsid = le32_to_cpu(desc->nsids[n]);
@@ -510,7 +529,7 @@ static int nvme_update_ana_state(struct nvme_ctrl *ctrl,
if (++n == nr_nsids)
break;
}
- up_write(&ctrl->namespaces_rwsem);
+ up_read(&ctrl->namespaces_rwsem);
return 0;
}
@@ -630,31 +649,45 @@ static ssize_t ana_state_show(struct device *dev, struct device_attribute *attr,
}
DEVICE_ATTR_RO(ana_state);
-static int nvme_set_ns_ana_state(struct nvme_ctrl *ctrl,
+static int nvme_lookup_ana_group_desc(struct nvme_ctrl *ctrl,
struct nvme_ana_group_desc *desc, void *data)
{
- struct nvme_ns *ns = data;
+ struct nvme_ana_group_desc *dst = data;
- if (ns->ana_grpid == le32_to_cpu(desc->grpid)) {
- nvme_update_ns_ana_state(desc, ns);
- return -ENXIO; /* just break out of the loop */
- }
+ if (desc->grpid != dst->grpid)
+ return 0;
- return 0;
+ *dst = *desc;
+ return -ENXIO; /* just break out of the loop */
}
void nvme_mpath_add_disk(struct nvme_ns *ns, struct nvme_id_ns *id)
{
if (nvme_ctrl_use_ana(ns->ctrl)) {
+ struct nvme_ana_group_desc desc = {
+ .grpid = id->anagrpid,
+ .state = 0,
+ };
+
mutex_lock(&ns->ctrl->ana_lock);
ns->ana_grpid = le32_to_cpu(id->anagrpid);
- nvme_parse_ana_log(ns->ctrl, ns, nvme_set_ns_ana_state);
+ nvme_parse_ana_log(ns->ctrl, &desc, nvme_lookup_ana_group_desc);
mutex_unlock(&ns->ctrl->ana_lock);
+ if (desc.state) {
+ /* found the group desc: update */
+ nvme_update_ns_ana_state(&desc, ns);
+ }
} else {
- mutex_lock(&ns->head->lock);
ns->ana_state = NVME_ANA_OPTIMIZED;
nvme_mpath_set_live(ns);
- mutex_unlock(&ns->head->lock);
+ }
+
+ if (bdi_cap_stable_pages_required(ns->queue->backing_dev_info)) {
+ struct gendisk *disk = ns->head->disk;
+
+ if (disk)
+ disk->queue->backing_dev_info->capabilities |=
+ BDI_CAP_STABLE_WRITES;
}
}
@@ -669,6 +702,14 @@ void nvme_mpath_remove_disk(struct nvme_ns_head *head)
kblockd_schedule_work(&head->requeue_work);
flush_work(&head->requeue_work);
blk_cleanup_queue(head->disk->queue);
+ if (!test_bit(NVME_NSHEAD_DISK_LIVE, &head->flags)) {
+ /*
+ * if device_add_disk wasn't called, prevent
+ * disk release to put a bogus reference on the
+ * request queue
+ */
+ head->disk->queue = NULL;
+ }
put_disk(head->disk);
}
@@ -701,13 +742,14 @@ int nvme_mpath_init(struct nvme_ctrl *ctrl, struct nvme_id_ctrl *id)
}
INIT_WORK(&ctrl->ana_work, nvme_ana_work);
+ kfree(ctrl->ana_log_buf);
ctrl->ana_log_buf = kmalloc(ctrl->ana_log_size, GFP_KERNEL);
if (!ctrl->ana_log_buf) {
error = -ENOMEM;
goto out;
}
- error = nvme_read_ana_log(ctrl, true);
+ error = nvme_read_ana_log(ctrl, false);
if (error)
goto out_free_ana_log_buf;
return 0;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 81215ca32671..9036cf3c44b4 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -309,6 +309,8 @@ struct nvme_ns_head {
spinlock_t requeue_lock;
struct work_struct requeue_work;
struct mutex lock;
+ unsigned long flags;
+#define NVME_NSHEAD_DISK_LIVE 0
struct nvme_ns __rcu *current_path[];
#endif
};
@@ -502,6 +504,16 @@ static inline void nvme_mpath_check_last_path(struct nvme_ns *ns)
kblockd_schedule_work(&head->requeue_work);
}
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+ struct block_device *bdev = bdget_disk(disk, 0);
+
+ if (bdev) {
+ bd_set_size(bdev, get_capacity(disk) << SECTOR_SHIFT);
+ bdput(bdev);
+ }
+}
+
extern struct device_attribute dev_attr_ana_grpid;
extern struct device_attribute dev_attr_ana_state;
extern struct device_attribute subsys_attr_iopolicy;
@@ -572,6 +584,9 @@ static inline void nvme_mpath_wait_freeze(struct nvme_subsystem *subsys)
static inline void nvme_mpath_start_freeze(struct nvme_subsystem *subsys)
{
}
+static inline void nvme_mpath_update_disk_size(struct gendisk *disk)
+{
+}
#endif /* CONFIG_NVME_MULTIPATH */
#ifdef CONFIG_NVM
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index c9f5983a9f82..727fec79b6cf 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -73,13 +73,13 @@ static const struct kernel_param_ops queue_count_ops = {
.get = param_get_int,
};
-static int write_queues;
+static unsigned int write_queues;
module_param_cb(write_queues, &queue_count_ops, &write_queues, 0644);
MODULE_PARM_DESC(write_queues,
"Number of queues to use for writes. If not set, reads and writes "
"will share a queue set.");
-static int poll_queues = 0;
+static unsigned int poll_queues = 0;
module_param_cb(poll_queues, &queue_count_ops, &poll_queues, 0644);
MODULE_PARM_DESC(poll_queues, "Number of queues to use for polled IO.");
@@ -183,7 +183,6 @@ struct nvme_queue {
/* only used for poll queues: */
spinlock_t cq_poll_lock ____cacheline_aligned_in_smp;
volatile struct nvme_completion *cqes;
- struct blk_mq_tags **tags;
dma_addr_t sq_dma_addr;
dma_addr_t cq_dma_addr;
u32 __iomem *q_db;
@@ -392,29 +391,17 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
WARN_ON(hctx_idx != 0);
WARN_ON(dev->admin_tagset.tags[0] != hctx->tags);
- WARN_ON(nvmeq->tags);
hctx->driver_data = nvmeq;
- nvmeq->tags = &dev->admin_tagset.tags[0];
return 0;
}
-static void nvme_admin_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
-{
- struct nvme_queue *nvmeq = hctx->driver_data;
-
- nvmeq->tags = NULL;
-}
-
static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
unsigned int hctx_idx)
{
struct nvme_dev *dev = data;
struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1];
- if (!nvmeq->tags)
- nvmeq->tags = &dev->tagset.tags[hctx_idx];
-
WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags);
hctx->driver_data = nvmeq;
return 0;
@@ -963,6 +950,13 @@ static inline void nvme_ring_cq_doorbell(struct nvme_queue *nvmeq)
writel(head, nvmeq->q_db + nvmeq->dev->db_stride);
}
+static inline struct blk_mq_tags *nvme_queue_tagset(struct nvme_queue *nvmeq)
+{
+ if (!nvmeq->qid)
+ return nvmeq->dev->admin_tagset.tags[0];
+ return nvmeq->dev->tagset.tags[nvmeq->qid - 1];
+}
+
static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
{
volatile struct nvme_completion *cqe = &nvmeq->cqes[idx];
@@ -988,7 +982,7 @@ static inline void nvme_handle_cqe(struct nvme_queue *nvmeq, u16 idx)
return;
}
- req = blk_mq_tag_to_rq(*nvmeq->tags, cqe->command_id);
+ req = blk_mq_tag_to_rq(nvme_queue_tagset(nvmeq), cqe->command_id);
trace_nvme_sq(req, cqe->sq_head, nvmeq->sq_tail);
nvme_end_request(req, cqe->status, cqe->result);
}
@@ -1103,9 +1097,9 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx)
spin_lock(&nvmeq->cq_poll_lock);
found = nvme_process_cq(nvmeq, &start, &end, -1);
+ nvme_complete_cqes(nvmeq, start, end);
spin_unlock(&nvmeq->cq_poll_lock);
- nvme_complete_cqes(nvmeq, start, end);
return found;
}
@@ -1426,6 +1420,25 @@ static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown)
nvme_poll_irqdisable(nvmeq, -1);
}
+/*
+ * Called only on a device that has been disabled and after all other threads
+ * that can check this device's completion queues have synced, except
+ * nvme_poll(). This is the last chance for the driver to see a natural
+ * completion before nvme_cancel_request() terminates all incomplete requests.
+ */
+static void nvme_reap_pending_cqes(struct nvme_dev *dev)
+{
+ u16 start, end;
+ int i;
+
+ for (i = dev->ctrl.queue_count - 1; i > 0; i--) {
+ spin_lock(&dev->queues[i].cq_poll_lock);
+ nvme_process_cq(&dev->queues[i], &start, &end, -1);
+ spin_unlock(&dev->queues[i].cq_poll_lock);
+ nvme_complete_cqes(&dev->queues[i], start, end);
+ }
+}
+
static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues,
int entry_size)
{
@@ -1591,7 +1604,6 @@ static const struct blk_mq_ops nvme_mq_admin_ops = {
.queue_rq = nvme_queue_rq,
.complete = nvme_pci_complete_rq,
.init_hctx = nvme_admin_init_hctx,
- .exit_hctx = nvme_admin_exit_hctx,
.init_request = nvme_init_request,
.timeout = nvme_timeout,
};
@@ -2073,7 +2085,6 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
.priv = dev,
};
unsigned int irq_queues, this_p_queues;
- unsigned int nr_cpus = num_possible_cpus();
/*
* Poll queues don't need interrupts, but we need at least one IO
@@ -2084,10 +2095,7 @@ static int nvme_setup_irqs(struct nvme_dev *dev, unsigned int nr_io_queues)
this_p_queues = nr_io_queues - 1;
irq_queues = 1;
} else {
- if (nr_cpus < nr_io_queues - this_p_queues)
- irq_queues = nr_cpus + 1;
- else
- irq_queues = nr_io_queues - this_p_queues + 1;
+ irq_queues = nr_io_queues - this_p_queues + 1;
}
dev->io_queues[HCTX_TYPE_POLL] = this_p_queues;
@@ -2250,11 +2258,6 @@ static bool __nvme_disable_io_queues(struct nvme_dev *dev, u8 opcode)
if (timeout == 0)
return false;
- /* handle any remaining CQEs */
- if (opcode == nvme_admin_delete_cq &&
- !test_bit(NVMEQ_DELETE_ERROR, &nvmeq->flags))
- nvme_poll_irqdisable(nvmeq, -1);
-
sent--;
if (nr_queues)
goto retry;
@@ -2417,6 +2420,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown)
nvme_suspend_io_queues(dev);
nvme_suspend_queue(&dev->queues[0]);
nvme_pci_disable(dev);
+ nvme_reap_pending_cqes(dev);
blk_mq_tagset_busy_iter(&dev->tagset, nvme_cancel_request, &dev->ctrl);
blk_mq_tagset_busy_iter(&dev->admin_tagset, nvme_cancel_request, &dev->ctrl);
@@ -2966,6 +2970,8 @@ static const struct pci_device_id nvme_id_table[] = {
{ PCI_DEVICE(0x1cc1, 0x8201), /* ADATA SX8200PNP 512GB */
.driver_data = NVME_QUIRK_NO_DEEPEST_PS |
NVME_QUIRK_IGNORE_DEV_SUBNQN, },
+ { PCI_DEVICE(0x1c5c, 0x1504), /* SK Hynix PC400 */
+ .driver_data = NVME_QUIRK_DISABLE_WRITE_ZEROES, },
{ PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) },
{ PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2003) },
diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c
index 59e1cedfca92..ad17c79fd1e2 100644
--- a/drivers/nvme/host/rdma.c
+++ b/drivers/nvme/host/rdma.c
@@ -142,14 +142,6 @@ static void nvme_rdma_recv_done(struct ib_cq *cq, struct ib_wc *wc);
static const struct blk_mq_ops nvme_rdma_mq_ops;
static const struct blk_mq_ops nvme_rdma_admin_mq_ops;
-/* XXX: really should move to a generic header sooner or later.. */
-static inline void put_unaligned_le24(u32 val, u8 *p)
-{
- *p++ = val;
- *p++ = val >> 8;
- *p++ = val >> 16;
-}
-
static inline int nvme_rdma_queue_idx(struct nvme_rdma_queue *queue)
{
return queue - queue->ctrl->queues;
@@ -451,7 +443,7 @@ static int nvme_rdma_create_queue_ib(struct nvme_rdma_queue *queue)
* Spread I/O queues completion vectors according their queue index.
* Admin queues can always go on completion vector 0.
*/
- comp_vector = idx == 0 ? idx : idx - 1;
+ comp_vector = (idx == 0 ? idx : idx - 1) % ibdev->num_comp_vectors;
/* Polling queues need direct cq polling context */
if (nvme_rdma_poll_queue(queue))
@@ -843,9 +835,11 @@ out_free_tagset:
if (new)
blk_mq_free_tag_set(ctrl->ctrl.admin_tagset);
out_free_async_qe:
- nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
- sizeof(struct nvme_command), DMA_TO_DEVICE);
- ctrl->async_event_sqe.data = NULL;
+ if (ctrl->async_event_sqe.data) {
+ nvme_rdma_free_qe(ctrl->device->dev, &ctrl->async_event_sqe,
+ sizeof(struct nvme_command), DMA_TO_DEVICE);
+ ctrl->async_event_sqe.data = NULL;
+ }
out_free_queue:
nvme_rdma_free_queue(&ctrl->queues[0]);
return error;
@@ -881,15 +875,20 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->ctrl.connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(&ctrl->tag_set,
- ctrl->ctrl.queue_count - 1);
}
ret = nvme_rdma_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(&ctrl->ctrl);
+ nvme_wait_freeze(&ctrl->ctrl);
+ blk_mq_update_nr_hw_queues(ctrl->ctrl.tagset,
+ ctrl->ctrl.queue_count - 1);
+ nvme_unfreeze(&ctrl->ctrl);
+ }
+
return 0;
out_cleanup_connect_q:
@@ -919,6 +918,7 @@ static void nvme_rdma_teardown_io_queues(struct nvme_rdma_ctrl *ctrl,
bool remove)
{
if (ctrl->ctrl.queue_count > 1) {
+ nvme_start_freeze(&ctrl->ctrl);
nvme_stop_queues(&ctrl->ctrl);
nvme_rdma_stop_io_queues(ctrl);
if (ctrl->ctrl.tagset)
@@ -1075,7 +1075,7 @@ static void nvme_rdma_error_recovery(struct nvme_rdma_ctrl *ctrl)
if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_RESETTING))
return;
- queue_work(nvme_wq, &ctrl->err_work);
+ queue_work(nvme_reset_wq, &ctrl->err_work);
}
static void nvme_rdma_wr_error(struct ib_cq *cq, struct ib_wc *wc,
@@ -2111,8 +2111,16 @@ err_unreg_client:
static void __exit nvme_rdma_cleanup_module(void)
{
+ struct nvme_rdma_ctrl *ctrl;
+
nvmf_unregister_transport(&nvme_rdma_transport);
ib_unregister_client(&nvme_rdma_ib_client);
+
+ mutex_lock(&nvme_rdma_ctrl_mutex);
+ list_for_each_entry(ctrl, &nvme_rdma_ctrl_list, list)
+ nvme_delete_ctrl(&ctrl->ctrl);
+ mutex_unlock(&nvme_rdma_ctrl_mutex);
+ flush_workqueue(nvme_delete_wq);
}
module_init(nvme_rdma_init_module);
diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c
index bdadb27b28bb..8651b6cfd3ee 100644
--- a/drivers/nvme/host/tcp.c
+++ b/drivers/nvme/host/tcp.c
@@ -162,16 +162,14 @@ static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
{
struct request *rq;
- unsigned int bytes;
if (unlikely(nvme_tcp_async_req(req)))
return false; /* async events don't have a request */
rq = blk_mq_rq_from_pdu(req);
- bytes = blk_rq_payload_bytes(rq);
- return rq_data_dir(rq) == WRITE && bytes &&
- bytes <= nvme_tcp_inline_data_size(req->queue);
+ return rq_data_dir(rq) == WRITE && req->data_len &&
+ req->data_len <= nvme_tcp_inline_data_size(req->queue);
}
static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
@@ -420,7 +418,7 @@ static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING))
return;
- queue_work(nvme_wq, &to_tcp_ctrl(ctrl)->err_work);
+ queue_work(nvme_reset_wq, &to_tcp_ctrl(ctrl)->err_work);
}
static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
@@ -785,11 +783,11 @@ static void nvme_tcp_data_ready(struct sock *sk)
{
struct nvme_tcp_queue *queue;
- read_lock(&sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
queue = sk->sk_user_data;
if (likely(queue && queue->rd_enabled))
queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
- read_unlock(&sk->sk_callback_lock);
+ read_unlock_bh(&sk->sk_callback_lock);
}
static void nvme_tcp_write_space(struct sock *sk)
@@ -1051,7 +1049,12 @@ static void nvme_tcp_io_work(struct work_struct *w)
} else if (unlikely(result < 0)) {
dev_err(queue->ctrl->ctrl.device,
"failed to send request %d\n", result);
- if (result != -EPIPE)
+
+ /*
+ * Fail the request unless peer closed the connection,
+ * in which case error recovery flow will complete all.
+ */
+ if ((result != -EPIPE) && (result != -ECONNRESET))
nvme_tcp_fail_request(queue->request);
nvme_tcp_done_send_req(queue);
return;
@@ -1301,6 +1304,9 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_sock;
}
+ /* Set 10 seconds timeout for icresp recvmsg */
+ queue->sock->sk->sk_rcvtimeo = 10 * HZ;
+
queue->sock->sk->sk_allocation = GFP_ATOMIC;
if (!qid)
n = 0;
@@ -1653,15 +1659,20 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
ret = PTR_ERR(ctrl->connect_q);
goto out_free_tag_set;
}
- } else {
- blk_mq_update_nr_hw_queues(ctrl->tagset,
- ctrl->queue_count - 1);
}
ret = nvme_tcp_start_io_queues(ctrl);
if (ret)
goto out_cleanup_connect_q;
+ if (!new) {
+ nvme_start_queues(ctrl);
+ nvme_wait_freeze(ctrl);
+ blk_mq_update_nr_hw_queues(ctrl->tagset,
+ ctrl->queue_count - 1);
+ nvme_unfreeze(ctrl);
+ }
+
return 0;
out_cleanup_connect_q:
@@ -1760,6 +1771,7 @@ static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
{
if (ctrl->queue_count <= 1)
return;
+ nvme_start_freeze(ctrl);
nvme_stop_queues(ctrl);
nvme_tcp_stop_io_queues(ctrl);
if (ctrl->tagset)
@@ -2042,7 +2054,9 @@ static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
c->common.flags |= NVME_CMD_SGL_METABUF;
- if (rq_data_dir(rq) == WRITE && req->data_len &&
+ if (!blk_rq_nr_phys_segments(rq))
+ nvme_tcp_set_sg_null(c);
+ else if (rq_data_dir(rq) == WRITE &&
req->data_len <= nvme_tcp_inline_data_size(queue))
nvme_tcp_set_sg_inline(queue, c, req->data_len);
else
@@ -2069,7 +2083,8 @@ static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
req->data_sent = 0;
req->pdu_len = 0;
req->pdu_sent = 0;
- req->data_len = blk_rq_payload_bytes(rq);
+ req->data_len = blk_rq_nr_phys_segments(rq) ?
+ blk_rq_payload_bytes(rq) : 0;
req->curr_bio = rq->bio;
if (rq_data_dir(rq) == WRITE &&
diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c
index 396cbc7ea353..cabe50eaa745 100644
--- a/drivers/nvme/target/core.c
+++ b/drivers/nvme/target/core.c
@@ -550,7 +550,8 @@ int nvmet_ns_enable(struct nvmet_ns *ns)
} else {
struct nvmet_ns *old;
- list_for_each_entry_rcu(old, &subsys->namespaces, dev_link) {
+ list_for_each_entry_rcu(old, &subsys->namespaces, dev_link,
+ lockdep_is_held(&subsys->lock)) {
BUG_ON(ns->nsid == old->nsid);
if (ns->nsid < old->nsid)
break;
@@ -1164,7 +1165,8 @@ static void nvmet_setup_p2p_ns_map(struct nvmet_ctrl *ctrl,
ctrl->p2p_client = get_device(req->p2p_client);
- list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link)
+ list_for_each_entry_rcu(ns, &ctrl->subsys->namespaces, dev_link,
+ lockdep_is_held(&ctrl->subsys->lock))
nvmet_p2pmem_ns_add_p2p(ctrl, ns);
}
diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c
index 3b9f79aba98f..129f11a8a522 100644
--- a/drivers/nvme/target/fabrics-cmd.c
+++ b/drivers/nvme/target/fabrics-cmd.c
@@ -105,6 +105,7 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
u16 qid = le16_to_cpu(c->qid);
u16 sqsize = le16_to_cpu(c->sqsize);
struct nvmet_ctrl *old;
+ u16 ret;
old = cmpxchg(&req->sq->ctrl, NULL, ctrl);
if (old) {
@@ -115,7 +116,8 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
if (!sqsize) {
pr_warn("queue size zero!\n");
req->error_loc = offsetof(struct nvmf_connect_command, sqsize);
- return NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+ ret = NVME_SC_CONNECT_INVALID_PARAM | NVME_SC_DNR;
+ goto err;
}
/* note: convert queue size from 0's-based value to 1's-based value */
@@ -128,16 +130,19 @@ static u16 nvmet_install_queue(struct nvmet_ctrl *ctrl, struct nvmet_req *req)
}
if (ctrl->ops->install_queue) {
- u16 ret = ctrl->ops->install_queue(req->sq);
-
+ ret = ctrl->ops->install_queue(req->sq);
if (ret) {
pr_err("failed to install queue %d cntlid %d ret %x\n",
- qid, ret, ctrl->cntlid);
- return ret;
+ qid, ctrl->cntlid, ret);
+ goto err;
}
}
return 0;
+
+err:
+ req->sq->ctrl = NULL;
+ return ret;
}
static void nvmet_execute_admin_connect(struct nvmet_req *req)
diff --git a/drivers/nvme/target/rdma.c b/drivers/nvme/target/rdma.c
index 36d906a7f70d..dc193526d4da 100644
--- a/drivers/nvme/target/rdma.c
+++ b/drivers/nvme/target/rdma.c
@@ -143,12 +143,6 @@ static int num_pages(int len)
return 1 + (((len - 1) & PAGE_MASK) >> PAGE_SHIFT);
}
-/* XXX: really should move to a generic header sooner or later.. */
-static inline u32 get_unaligned_le24(const u8 *p)
-{
- return (u32)p[0] | (u32)p[1] << 8 | (u32)p[2] << 16;
-}
-
static inline bool nvmet_rdma_need_data_in(struct nvmet_rdma_rsp *rsp)
{
return nvme_is_write(rsp->req.cmd) &&
diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c
index 69b83fa0c76c..23a42a06b543 100644
--- a/drivers/nvme/target/tcp.c
+++ b/drivers/nvme/target/tcp.c
@@ -515,7 +515,7 @@ static int nvmet_try_send_data_pdu(struct nvmet_tcp_cmd *cmd)
return 1;
}
-static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
+static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd, bool last_in_batch)
{
struct nvmet_tcp_queue *queue = cmd->queue;
int ret;
@@ -523,9 +523,15 @@ static int nvmet_try_send_data(struct nvmet_tcp_cmd *cmd)
while (cmd->cur_sg) {
struct page *page = sg_page(cmd->cur_sg);
u32 left = cmd->cur_sg->length - cmd->offset;
+ int flags = MSG_DONTWAIT;
+
+ if ((!last_in_batch && cmd->queue->send_list_len) ||
+ cmd->wbytes_done + left < cmd->req.transfer_len ||
+ queue->data_digest || !queue->nvme_sq.sqhd_disabled)
+ flags |= MSG_MORE;
ret = kernel_sendpage(cmd->queue->sock, page, cmd->offset,
- left, MSG_DONTWAIT | MSG_MORE);
+ left, flags);
if (ret <= 0)
return ret;
@@ -660,7 +666,7 @@ static int nvmet_tcp_try_send_one(struct nvmet_tcp_queue *queue,
}
if (cmd->state == NVMET_TCP_SEND_DATA) {
- ret = nvmet_try_send_data(cmd);
+ ret = nvmet_try_send_data(cmd, last_in_batch);
if (ret <= 0)
goto done_send;
}
@@ -788,7 +794,7 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue)
icresp->hdr.pdo = 0;
icresp->hdr.plen = cpu_to_le32(icresp->hdr.hlen);
icresp->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
- icresp->maxdata = cpu_to_le32(0xffff); /* FIXME: support r2t */
+ icresp->maxdata = cpu_to_le32(0x400000); /* 16M arbitrary limit */
icresp->cpda = 0;
if (queue->hdr_digest)
icresp->digest |= NVME_TCP_HDR_DIGEST_ENABLE;