From 1abc696174f1ba27c9563c722029373e1a692933 Mon Sep 17 00:00:00 2001 From: Linjun Bao Date: Thu, 11 Aug 2022 15:40:24 +0800 Subject: [PATCH 01/16] nvme: add comment for unaligned "fake" nqn The current "fake" nqn field is "nqn.2014.08.org.nvmexpress:"; it is not aligned with the canonical version for historical reasons. Signed-off-by: Linjun Bao Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index af367b22871b..112881664695 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2704,7 +2704,11 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct dev_warn(ctrl->device, "missing or invalid SUBNQN field.\n"); } - /* Generate a "fake" NQN per Figure 254 in NVMe 1.3 + ECN 001 */ + /* + * Generate a "fake" NQN similar to the one in Section 4.5 of the NVMe + * Base Specification 2.0. It is slightly different from the format + * specified there due to historic reasons, and we can't change it now. + */ off = snprintf(subsys->subnqn, NVMF_NQN_SIZE, "nqn.2014.08.org.nvmexpress:%04x%04x", le16_to_cpu(id->vid), le16_to_cpu(id->ssvid)); From a8817cc09d8e6140c8561a81dded7b3f68e15a1b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 18 Aug 2022 23:00:52 +0200 Subject: [PATCH 02/16] nvme: move from strlcpy with unused retval to strscpy Follow the advice of the link below and prefer 'strscpy' in this subsystem. The conversion is 1:1 because the return value is not used. Generated by a coccinelle script. Link: https://lore.kernel.org/r/CAHk-=wgfRnXz0W3D37d01q3JFkr_i_uTL=V6A6G1oUZcprmknw@mail.gmail.com/ Signed-off-by: Wolfram Sang Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- drivers/nvme/host/fabrics.c | 2 +- drivers/nvme/target/admin-cmd.c | 2 +- drivers/nvme/target/discovery.c | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 112881664695..8c9c1176624d 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -2696,7 +2696,7 @@ static void nvme_init_subnqn(struct nvme_subsystem *subsys, struct nvme_ctrl *ct if (!(ctrl->quirks & NVME_QUIRK_IGNORE_DEV_SUBNQN)) { nqnlen = strnlen(id->subnqn, NVMF_NQN_SIZE); if (nqnlen > 0 && nqnlen < NVMF_NQN_SIZE) { - strlcpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); + strscpy(subsys->subnqn, id->subnqn, NVMF_NQN_SIZE); return; } diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 10cc4a814602..61637c1dd722 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -49,7 +49,7 @@ static struct nvmf_host *nvmf_host_add(const char *hostnqn) goto out_unlock; kref_init(&host->ref); - strlcpy(host->nqn, hostnqn, NVMF_NQN_SIZE); + strscpy(host->nqn, hostnqn, NVMF_NQN_SIZE); list_add_tail(&host->list, &nvmf_hosts); out_unlock: diff --git a/drivers/nvme/target/admin-cmd.c b/drivers/nvme/target/admin-cmd.c index fc8a957fad0a..c8a061ce3ee5 100644 --- a/drivers/nvme/target/admin-cmd.c +++ b/drivers/nvme/target/admin-cmd.c @@ -449,7 +449,7 @@ static void nvmet_execute_identify_ctrl(struct nvmet_req *req) if (req->port->inline_data_size) id->sgls |= cpu_to_le32(1 << 20); - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); /* * Max command capsule size is sqe + in-capsule data size.
diff --git a/drivers/nvme/target/discovery.c b/drivers/nvme/target/discovery.c index c2162eef8ce1..668d257fa986 100644 --- a/drivers/nvme/target/discovery.c +++ b/drivers/nvme/target/discovery.c @@ -292,7 +292,7 @@ static void nvmet_execute_disc_identify(struct nvmet_req *req) id->oaes = cpu_to_le32(NVMET_DISC_AEN_CFG_OPTIONAL); - strlcpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); + strscpy(id->subnqn, ctrl->subsys->subsysnqn, sizeof(id->subnqn)); status = nvmet_copy_to_sgl(req, 0, id, sizeof(*id)); From 6e6fee569d4733f028f306036aaa12669c25b362 Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 12 Aug 2022 11:12:31 +0800 Subject: [PATCH 03/16] nvme-auth: remove the redundant req->cqe->result.u16 assignment operation req->cqe->result.u16 has already been assigned in the previous line; there is no need to do it again. Signed-off-by: Jackie Liu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/nvme/target/fabrics-cmd.c b/drivers/nvme/target/fabrics-cmd.c index f91a56180d3d..c1dfdfb92ebf 100644 --- a/drivers/nvme/target/fabrics-cmd.c +++ b/drivers/nvme/target/fabrics-cmd.c @@ -332,7 +332,6 @@ static void nvmet_execute_io_connect(struct nvmet_req *req) req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); pr_debug("adding queue %d to ctrl %d.\n", qid, ctrl->cntlid); - req->cqe->result.u16 = cpu_to_le16(ctrl->cntlid); if (nvmet_has_auth(ctrl)) nvmet_init_auth(ctrl, req); From 42147981561c3344d2c6781fe7029e5900daa9fb Mon Sep 17 00:00:00 2001 From: Jackie Liu Date: Fri, 12 Aug 2022 11:12:30 +0800 Subject: [PATCH 04/16] nvmet-auth: clean up with done_kfree Jump directly to done_kfree to release d, which is consistent with the error-handling style of the surrounding code. Reported-by: Genjian Zhang Signed-off-by: Jackie Liu Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd-auth.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index ebdf9aa81041..8458ec40340b 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -229,10 +229,8 @@ void nvmet_execute_auth_send(struct nvmet_req *req) } status = nvmet_copy_from_sgl(req, 0, d, tl); - if (status) { - kfree(d); - goto done; - } + if (status) + goto done_kfree; data = d; pr_debug("%s: ctrl %d qid %d type %d id %d step %x\n", __func__, From c46724cb8947697d4cfa5db44763d7c63b93a02b Mon Sep 17 00:00:00 2001 From: Genjian Zhang Date: Tue, 23 Aug 2022 10:14:41 +0800 Subject: [PATCH 05/16] nvmet-auth: remove redundant parameter req The parameter is not used in this function, so remove it.
Signed-off-by: Genjian Zhang Signed-off-by: Christoph Hellwig --- drivers/nvme/target/fabrics-cmd-auth.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/target/fabrics-cmd-auth.c b/drivers/nvme/target/fabrics-cmd-auth.c index 8458ec40340b..84601e38a335 100644 --- a/drivers/nvme/target/fabrics-cmd-auth.c +++ b/drivers/nvme/target/fabrics-cmd-auth.c @@ -177,7 +177,7 @@ static u16 nvmet_auth_reply(struct nvmet_req *req, void *d) return 0; } -static u16 nvmet_auth_failure2(struct nvmet_req *req, void *d) +static u16 nvmet_auth_failure2(void *d) { struct nvmf_auth_dhchap_failure_data *data = d; @@ -308,7 +308,7 @@ void nvmet_execute_auth_send(struct nvmet_req *req) goto done_kfree; break; case NVME_AUTH_DHCHAP_MESSAGE_FAILURE2: - status = nvmet_auth_failure2(req, d); + status = nvmet_auth_failure2(d); if (status) { pr_warn("ctrl %d qid %d: authentication failed (%d)\n", ctrl->cntlid, req->sq->qid, status); From d416800776b530ad629b2959909062287339defd Mon Sep 17 00:00:00 2001 From: Guixin Liu Date: Fri, 29 Jul 2022 11:53:05 +0800 Subject: [PATCH 06/16] nvmet: avoid unnecessary flush bio For a block device backend without a volatile write cache, sending a flush bio is unnecessary, so avoid doing that. Signed-off-by: Guixin Liu Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/target/io-cmd-bdev.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/nvme/target/io-cmd-bdev.c b/drivers/nvme/target/io-cmd-bdev.c index 2dc1c1035626..8d527a8c0f54 100644 --- a/drivers/nvme/target/io-cmd-bdev.c +++ b/drivers/nvme/target/io-cmd-bdev.c @@ -334,6 +334,11 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) { struct bio *bio = &req->b.inline_bio; + if (!bdev_write_cache(req->ns->bdev)) { + nvmet_req_complete(req, NVME_SC_SUCCESS); + return; + } + if (!nvmet_check_transfer_len(req, 0)) return; @@ -347,6 +352,9 @@ static void nvmet_bdev_execute_flush(struct nvmet_req *req) u16 nvmet_bdev_flush(struct nvmet_req *req) { + if (!bdev_write_cache(req->ns->bdev)) + return 0; + if (blkdev_issue_flush(req->ns->bdev)) return NVME_SC_INTERNAL | NVME_SC_DNR; return 0; From 3e980f5995e0bb4d86fef873a9c9ad66721580d0 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 29 Aug 2022 11:28:39 +0200 Subject: [PATCH 07/16] nvmet: expose max queues to configfs Allow setting the maximum number of queues the target supports. This is useful for testing the host's reconnect behavior with changing numbers of supported queues.
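For illustration, the new attribute is driven through configfs like the existing subsystem attributes; the subsystem name "testnqn" and the values shown are made-up examples, not output from a real run:

$ cat /sys/kernel/config/nvmet/subsystems/testnqn/attr_qid_max
128
$ echo 4 > /sys/kernel/config/nvmet/subsystems/testnqn/attr_qid_max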
Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/target/configfs.c | 29 +++++++++++++++++++++++++++++ 1 file changed, 29 insertions(+) diff --git a/drivers/nvme/target/configfs.c b/drivers/nvme/target/configfs.c index 2bcd60758919..e34a2896fedb 100644 --- a/drivers/nvme/target/configfs.c +++ b/drivers/nvme/target/configfs.c @@ -1281,6 +1281,34 @@ static ssize_t nvmet_subsys_attr_pi_enable_store(struct config_item *item, CONFIGFS_ATTR(nvmet_subsys_, attr_pi_enable); #endif +static ssize_t nvmet_subsys_attr_qid_max_show(struct config_item *item, + char *page) +{ + return snprintf(page, PAGE_SIZE, "%u\n", to_subsys(item)->max_qid); +} + +static ssize_t nvmet_subsys_attr_qid_max_store(struct config_item *item, + const char *page, size_t cnt) +{ + struct nvmet_port *port = to_nvmet_port(item); + u16 qid_max; + + if (nvmet_is_port_enabled(port, __func__)) + return -EACCES; + + if (sscanf(page, "%hu\n", &qid_max) != 1) + return -EINVAL; + + if (qid_max < 1 || qid_max > NVMET_NR_QUEUES) + return -EINVAL; + + down_write(&nvmet_config_sem); + to_subsys(item)->max_qid = qid_max; + up_write(&nvmet_config_sem); + return cnt; +} +CONFIGFS_ATTR(nvmet_subsys_, attr_qid_max); + static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_allow_any_host, &nvmet_subsys_attr_attr_version, @@ -1288,6 +1316,7 @@ static struct configfs_attribute *nvmet_subsys_attrs[] = { &nvmet_subsys_attr_attr_cntlid_min, &nvmet_subsys_attr_attr_cntlid_max, &nvmet_subsys_attr_attr_model, + &nvmet_subsys_attr_attr_qid_max, #ifdef CONFIG_BLK_DEV_INTEGRITY &nvmet_subsys_attr_attr_pi_enable, #endif From 09035f86496d8dea7a05a07f6dcb8083c0a3d885 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 29 Aug 2022 11:28:40 +0200 Subject: [PATCH 08/16] nvme-tcp: handle number of queue changes On reconnect, the number of queues might have changed. In the case where we have more queues available than previously, we try to access queues which are not initialized yet. In the other case, where we have fewer queues than previously, the connection attempt will fail because the target doesn't support the old number of queues, and we end up in a reconnect loop. Thus, only start the queues which are currently present in the tagset, limited by the number of available queues. Then we update the tagset, and we can start any new queues.
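To make the two phases concrete, here is a minimal standalone sketch of the bookkeeping described above; it models only the queue-count arithmetic, and the counts are invented examples, not driver state:

#include <stdio.h>

static int min_int(int a, int b) { return a < b ? a : b; }

int main(void)
{
	int old_nr_hw_queues = 4;	/* I/O queues known to the existing tagset */
	int queue_count = 9;		/* admin queue + 8 I/O queues after reconnect */

	/* Phase 1: start only the queues the old tagset already covers. */
	int nr_queues = min_int(old_nr_hw_queues + 1, queue_count);
	printf("phase 1: start I/O queues [1, %d)\n", nr_queues);

	/* blk_mq_update_nr_hw_queues() would grow the tagset at this point. */
	int new_nr_hw_queues = queue_count - 1;

	/* Phase 2: start the queues the reconnect added. */
	printf("phase 2: start I/O queues [%d, %d)\n", nr_queues, new_nr_hw_queues + 1);
	return 0;
}

With these numbers, phase 1 starts queues 1 through 4 and phase 2 starts queues 5 through 8.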
Signed-off-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index ef151c23d495..4c2d198145a7 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -1762,11 +1762,12 @@ static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl) nvme_tcp_stop_queue(ctrl, i); } -static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) +static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl, + int first, int last) { int i, ret; - for (i = 1; i < ctrl->queue_count; i++) { + for (i = first; i < last; i++) { ret = nvme_tcp_start_queue(ctrl, i); if (ret) goto out_stop_queues; @@ -1775,7 +1776,7 @@ static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl) return 0; out_stop_queues: - for (i--; i >= 1; i--) + for (i--; i >= first; i--) nvme_tcp_stop_queue(ctrl, i); return ret; } @@ -1901,7 +1902,7 @@ static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove) static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) { - int ret; + int ret, nr_queues; ret = nvme_tcp_alloc_io_queues(ctrl); if (ret) @@ -1917,7 +1918,13 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) goto out_free_tag_set; } - ret = nvme_tcp_start_io_queues(ctrl); + /* + * Only start IO queues for which we have allocated the tagset + * and limited it to the available queues. On reconnects, the + * queue number might have changed. + */ + nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count); + ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues); if (ret) goto out_cleanup_connect_q; @@ -1937,6 +1944,15 @@ static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new) nvme_unfreeze(ctrl); } + /* + * If the number of queues has increased (reconnect case) + * start all new queues now. + */ + ret = nvme_tcp_start_io_queues(ctrl, nr_queues, + ctrl->tagset->nr_hw_queues + 1); + if (ret) + goto out_wait_freeze_timed_out; + return 0; out_wait_freeze_timed_out: From 1c467e259599864ec925d5b85066a0960320fb3c Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Mon, 29 Aug 2022 11:28:41 +0200 Subject: [PATCH 09/16] nvme-rdma: handle number of queue changes On reconnect, the number of queues might have changed. In the case where we have more queues available than previously, we try to access queues which are not initialized yet. In the other case, where we have fewer queues than previously, the connection attempt will fail because the target doesn't support the old number of queues, and we end up in a reconnect loop. Thus, only start the queues which are currently present in the tagset, limited by the number of available queues. Then we update the tagset, and we can start any new queues.
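The reworked start helper takes a [first, last) range and, on failure, unwinds only the queues it started itself. A standalone sketch of that pattern (function names invented for illustration, not driver code):

#include <stdio.h>

static int start_one(int i)
{
	return i == 3 ? -1 : 0;		/* pretend queue 3 fails to start */
}

static void stop_one(int i)
{
	printf("stopping queue %d\n", i);
}

static int start_range(int first, int last)
{
	int i, ret;

	for (i = first; i < last; i++) {
		ret = start_one(i);
		if (ret)
			goto out_stop;
	}
	return 0;

out_stop:
	/* Unwind only down to 'first'; earlier queues belong to the caller. */
	for (i--; i >= first; i--)
		stop_one(i);
	return ret;
}

int main(void)
{
	return start_range(1, 5) ? 1 : 0;	/* stops queues 2, then 1 */
}

Unwinding to 'first' rather than a hard-coded 1 matters for the phase-two call, which must not tear down queues started in phase one.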
Signed-off-by: Daniel Wagner Reviewed-by: Sagi Grimberg Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/rdma.c | 26 +++++++++++++++++++++----- 1 file changed, 21 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index ba08851e42c3..4c6df34f9d7a 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -696,11 +696,12 @@ static int nvme_rdma_start_queue(struct nvme_rdma_ctrl *ctrl, int idx) return ret; } -static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) +static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl, + int first, int last) { int i, ret = 0; - for (i = 1; i < ctrl->ctrl.queue_count; i++) { + for (i = first; i < last; i++) { ret = nvme_rdma_start_queue(ctrl, i); if (ret) goto out_stop_queues; @@ -709,7 +710,7 @@ static int nvme_rdma_start_io_queues(struct nvme_rdma_ctrl *ctrl) return 0; out_stop_queues: - for (i--; i >= 1; i--) + for (i--; i >= first; i--) nvme_rdma_stop_queue(&ctrl->queues[i]); return ret; } @@ -964,7 +965,7 @@ static void nvme_rdma_destroy_io_queues(struct nvme_rdma_ctrl *ctrl, static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) { - int ret; + int ret, nr_queues; ret = nvme_rdma_alloc_io_queues(ctrl); if (ret) @@ -980,7 +981,13 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) goto out_free_tag_set; } - ret = nvme_rdma_start_io_queues(ctrl); + /* + * Only start IO queues for which we have allocated the tagset + * and limited it to the available queues. On reconnects, the + * queue number might have changed. + */ + nr_queues = min(ctrl->tag_set.nr_hw_queues + 1, ctrl->ctrl.queue_count); + ret = nvme_rdma_start_io_queues(ctrl, 1, nr_queues); if (ret) goto out_cleanup_connect_q; @@ -1000,6 +1007,15 @@ static int nvme_rdma_configure_io_queues(struct nvme_rdma_ctrl *ctrl, bool new) nvme_unfreeze(&ctrl->ctrl); } + /* + * If the number of queues has increased (reconnect case) + * start all new queues now. + */ + ret = nvme_rdma_start_io_queues(ctrl, nr_queues, + ctrl->tag_set.nr_hw_queues + 1); + if (ret) + goto out_wait_freeze_timed_out; + return 0; out_wait_freeze_timed_out: From 4cde03d82e2d0056d20fd5af6a264c7f5e6a3e76 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Fri, 29 Jul 2022 16:26:30 +0200 Subject: [PATCH 10/16] nvme: consider also host_iface when checking ip options It's perfectly fine to use the same traddr and trsvcid more than once as long as we use a different host interface. This is used in setups where the host has more than one interface but the target exposes only one traddr/trsvcid combination. Use the same acceptance rules for host_iface as we have for host_traddr. Signed-off-by: Daniel Wagner Reviewed-by: Chao Leng Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 23 ++++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 61637c1dd722..ce27276f552d 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -971,13 +971,17 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, return false; /* - * Checking the local address is rough. In most cases, none is specified - * and the host port is selected by the stack. + * Checking the local address or host interfaces is rough. + * + * In most cases, none is specified and the host port or + * host interface is selected by the stack.
* * Assume no match if: - * - local address is specified and address is not the same - * - local address is not specified but remote is, or vice versa - * (admin using specific host_traddr when it matters). + * - local address or host interface is specified and address + * or host interface is not the same + * - local address or host interface is not specified but + * remote is, or vice versa (admin using specific + * host_traddr/host_iface when it matters). */ if ((opts->mask & NVMF_OPT_HOST_TRADDR) && (ctrl->opts->mask & NVMF_OPT_HOST_TRADDR)) { @@ -988,6 +992,15 @@ bool nvmf_ip_options_match(struct nvme_ctrl *ctrl, return false; } + if ((opts->mask & NVMF_OPT_HOST_IFACE) && + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { + if (strcmp(opts->host_iface, ctrl->opts->host_iface)) + return false; + } else if ((opts->mask & NVMF_OPT_HOST_IFACE) || + (ctrl->opts->mask & NVMF_OPT_HOST_IFACE)) { + return false; + } + return true; } EXPORT_SYMBOL_GPL(nvmf_ip_options_match); From a53232cb3abef51524f06ee9d8fbc3364ad95794 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 Sep 2022 09:07:35 -0700 Subject: [PATCH 11/16] nvme-pci: remove nvme_queue from nvme_iod We can get the nvme_queue from the req just as easily, so remove the duplicate path to the same structure to save some space. Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 28 +++++++++++++--------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4a8cfb360d31..403876ad3234 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -226,7 +226,6 @@ struct nvme_queue { struct nvme_iod { struct nvme_request req; struct nvme_command cmd; - struct nvme_queue *nvmeq; bool use_sgl; int aborted; int npages; /* In the PRP list. 0 means small pool in use */ @@ -430,11 +429,6 @@ static int nvme_pci_init_request(struct blk_mq_tag_set *set, { struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - int queue_idx = (set == &dev->tagset) ? 
hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = &dev->queues[queue_idx]; - - BUG_ON(!nvmeq); - iod->nvmeq = nvmeq; nvme_req(req)->ctrl = &dev->ctrl; nvme_req(req)->cmd = &iod->cmd; @@ -526,7 +520,7 @@ static void **nvme_pci_iod_list(struct request *req) static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; int nseg = blk_rq_nr_phys_segments(req); unsigned int avg_seg_size; @@ -534,7 +528,7 @@ static inline bool nvme_pci_use_sgls(struct nvme_dev *dev, struct request *req) if (!nvme_ctrl_sgl_supported(&dev->ctrl)) return false; - if (!iod->nvmeq->qid) + if (!nvmeq->qid) return false; if (!sgl_threshold || avg_seg_size < sgl_threshold) return false; @@ -831,6 +825,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, int rc; if (blk_rq_nr_phys_segments(req) == 1) { + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct bio_vec bv = req_bvec(req); if (!is_pci_p2pdma_page(bv.bv_page)) { @@ -838,7 +833,7 @@ static blk_status_t nvme_map_data(struct nvme_dev *dev, struct request *req, return nvme_setup_prp_simple(dev, req, &cmnd->rw, &bv); - if (iod->nvmeq->qid && sgl_threshold && + if (nvmeq->qid && sgl_threshold && nvme_ctrl_sgl_supported(&dev->ctrl)) return nvme_setup_sgl_simple(dev, req, &cmnd->rw, &bv); @@ -1017,12 +1012,16 @@ static void nvme_queue_rqs(struct request **rqlist) static __always_inline void nvme_pci_unmap_rq(struct request *req) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_dev *dev = iod->nvmeq->dev; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; + + if (blk_integrity_rq(req)) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - if (blk_integrity_rq(req)) dma_unmap_page(dev->dev, iod->meta_dma, rq_integrity_vec(req)->bv_len, rq_data_dir(req)); + } + if (blk_rq_nr_phys_segments(req)) nvme_unmap_data(dev, req); } @@ -1270,8 +1269,7 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) static void abort_endio(struct request *req, blk_status_t error) { - struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; dev_warn(nvmeq->dev->ctrl.device, "Abort status: 0x%x", nvme_req(req)->status); @@ -1333,7 +1331,7 @@ static void nvme_warn_reset(struct nvme_dev *dev, u32 csts) static enum blk_eh_timer_return nvme_timeout(struct request *req) { struct nvme_iod *iod = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = iod->nvmeq; + struct nvme_queue *nvmeq = req->mq_hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; struct nvme_command cmd = { }; From 52da4f3f5cbd42815946c0544774167241f9f45f Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 Sep 2022 09:07:36 -0700 Subject: [PATCH 12/16] nvme-pci: iod's 'aborted' is a bool It's only true or false, so make this a bool to reflect that and save some space in nvme_iod. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 403876ad3234..045ebdd8e8f3 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -227,7 +227,7 @@ struct nvme_iod { struct nvme_request req; struct nvme_command cmd; bool use_sgl; - int aborted; + bool aborted; int npages; /* In the PRP list. 
0 means small pool in use */ dma_addr_t first_dma; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t meta_dma; struct sg_table sgt; }; @@ -891,7 +891,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) struct nvme_iod *iod = blk_mq_rq_to_pdu(req); blk_status_t ret; - iod->aborted = 0; + iod->aborted = false; iod->npages = -1; iod->sgt.nents = 0; @@ -1412,7 +1412,7 @@ static enum blk_eh_timer_return nvme_timeout(struct request *req) atomic_inc(&dev->ctrl.abort_limit); return BLK_EH_RESET_TIMER; } - iod->aborted = 1; + iod->aborted = true; cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = nvme_cid(req); From c372cdd1efdf2b8e51fdde36a2a05c263ab5ebb7 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Tue, 6 Sep 2022 09:07:37 -0700 Subject: [PATCH 13/16] nvme-pci: iod npages fits in s8 The largest allowed transfer is 4MB, which can use at most 1025 PRPs. Each PRP is 8 bytes, so the maximum number of 4k nvme pages needed for the iod_list is 3, which fits in an 's8' type. While modifying this field, change the name to "nr_allocations" to better represent that this is referring to the number of units allocated from a dma_pool. Also introduce a BUILD_BUG_ON to ensure we never accidentally increase the largest transfer limit beyond 127 chained prp lists. Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 045ebdd8e8f3..a553062ff3ba 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -228,7 +228,8 @@ struct nvme_iod { struct nvme_command cmd; bool use_sgl; bool aborted; - int npages; /* In the PRP list. 0 means small pool in use */ + s8 nr_allocations; /* PRP list pool allocations.
0 means small + pool in use */ dma_addr_t first_dma; unsigned int dma_len; /* length of single DMA segment mapping */ dma_addr_t meta_dma; @@ -542,7 +543,7 @@ static void nvme_free_prps(struct nvme_dev *dev, struct request *req) dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->npages; i++) { + for (i = 0; i < iod->nr_allocations; i++) { __le64 *prp_list = nvme_pci_iod_list(req)[i]; dma_addr_t next_dma_addr = le64_to_cpu(prp_list[last_prp]); @@ -558,7 +559,7 @@ static void nvme_free_sgls(struct nvme_dev *dev, struct request *req) dma_addr_t dma_addr = iod->first_dma; int i; - for (i = 0; i < iod->npages; i++) { + for (i = 0; i < iod->nr_allocations; i++) { struct nvme_sgl_desc *sg_list = nvme_pci_iod_list(req)[i]; dma_addr_t next_dma_addr = le64_to_cpu((sg_list[last_sg]).addr); @@ -581,7 +582,7 @@ static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) dma_unmap_sgtable(dev->dev, &iod->sgt, rq_dma_dir(req), 0); - if (iod->npages == 0) + if (iod->nr_allocations == 0) dma_pool_free(dev->prp_small_pool, nvme_pci_iod_list(req)[0], iod->first_dma); else if (iod->use_sgl) @@ -643,15 +644,15 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, nprps = DIV_ROUND_UP(length, NVME_CTRL_PAGE_SIZE); if (nprps <= (256 / 8)) { pool = dev->prp_small_pool; - iod->npages = 0; + iod->nr_allocations = 0; } else { pool = dev->prp_page_pool; - iod->npages = 1; + iod->nr_allocations = 1; } prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { - iod->npages = -1; + iod->nr_allocations = -1; return BLK_STS_RESOURCE; } list[0] = prp_list; @@ -663,7 +664,7 @@ static blk_status_t nvme_pci_setup_prps(struct nvme_dev *dev, prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) goto free_prps; - list[iod->npages++] = prp_list; + list[iod->nr_allocations++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); i = 1; @@ -738,15 +739,15 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, if (entries <= (256 / sizeof(struct nvme_sgl_desc))) { pool = dev->prp_small_pool; - iod->npages = 0; + iod->nr_allocations = 0; } else { pool = dev->prp_page_pool; - iod->npages = 1; + iod->nr_allocations = 1; } sg_list = dma_pool_alloc(pool, GFP_ATOMIC, &sgl_dma); if (!sg_list) { - iod->npages = -1; + iod->nr_allocations = -1; return BLK_STS_RESOURCE; } @@ -765,7 +766,7 @@ static blk_status_t nvme_pci_setup_sgls(struct nvme_dev *dev, goto free_sgls; i = 0; - nvme_pci_iod_list(req)[iod->npages++] = sg_list; + nvme_pci_iod_list(req)[iod->nr_allocations++] = sg_list; sg_list[i++] = *link; nvme_pci_sgl_set_seg(link, sgl_dma, entries); } @@ -892,7 +893,7 @@ static blk_status_t nvme_prep_rq(struct nvme_dev *dev, struct request *req) blk_status_t ret; iod->aborted = false; - iod->npages = -1; + iod->nr_allocations = -1; iod->sgt.nents = 0; ret = nvme_setup_cmd(req->q->queuedata, req); @@ -3559,6 +3560,8 @@ static int __init nvme_init(void) BUILD_BUG_ON(sizeof(struct nvme_create_sq) != 64); BUILD_BUG_ON(sizeof(struct nvme_delete_queue) != 64); BUILD_BUG_ON(IRQ_AFFINITY_MAX_SETS < 2); + BUILD_BUG_ON(DIV_ROUND_UP(nvme_pci_npages_prp(), NVME_CTRL_PAGE_SIZE) > + S8_MAX); return pci_register_driver(&nvme_driver); } From c4c22c52081385f026b430f35776f80073a22076 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Wed, 7 Sep 2022 08:42:07 +0200 Subject: [PATCH 14/16] nvme-pci: move iod dma_len fill gaps The 32-bit field, dma_len, packs better in the iod struct above the dma_addr_t on 64-bit systems. 
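To illustrate the saving, a standalone sketch with a simplified field subset (assuming a typical LP64 ABI where dma_addr_t is 8 bytes wide; not the actual driver struct):

#include <stdio.h>
#include <stdint.h>

struct iod_before {			/* s8, u64, u32, u64 */
	int8_t	 nr_allocations;	/* 7 padding bytes follow */
	uint64_t first_dma;
	uint32_t dma_len;		/* 4 padding bytes follow */
	uint64_t meta_dma;
};

struct iod_after {			/* s8, u32, u64, u64 */
	int8_t	 nr_allocations;	/* only 3 padding bytes follow */
	uint32_t dma_len;
	uint64_t first_dma;
	uint64_t meta_dma;
};

int main(void)
{
	/* Typically prints "before: 32 bytes, after: 24 bytes". */
	printf("before: %zu bytes, after: %zu bytes\n",
	       sizeof(struct iod_before), sizeof(struct iod_after));
	return 0;
}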
Reviewed-by: Chaitanya Kulkarni Signed-off-by: Keith Busch Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index a553062ff3ba..70b1922c8953 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -230,8 +230,8 @@ struct nvme_iod { bool aborted; s8 nr_allocations; /* PRP list pool allocations. 0 means small pool in use */ - dma_addr_t first_dma; unsigned int dma_len; /* length of single DMA segment mapping */ + dma_addr_t first_dma; dma_addr_t meta_dma; struct sg_table sgt; }; From 5bfaba275ae6486700194cad962574e3eb7ae60d Mon Sep 17 00:00:00 2001 From: "Fabio M. De Francesco" Date: Wed, 31 Aug 2022 00:05:33 +0200 Subject: [PATCH 15/16] nvmet-tcp: don't map pages which can't come from HIGHMEM MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit kmap() is being deprecated in favor of kmap_local_page().[1] There are two main problems with kmap(): (1) It comes with an overhead as mapping space is restricted and protected by a global lock for synchronization and (2) it also requires global TLB invalidation when the kmap's pool wraps, and it might block when the mapping space is fully utilized until a slot becomes available. The pages which will be mapped are allocated in nvmet_tcp_map_data(), using the GFP_KERNEL flag. This ensures that they cannot come from HIGHMEM. This implies that a straight page_address() can replace the kmap() of sg_page(sg) in nvmet_tcp_map_pdu_iovec(). As a side effect, we can also delete the field "nr_mapped" from struct "nvmet_tcp_cmd" because, after removing the kmap() calls, there would no longer be any need for it. In addition, there is no reason to use a kvec for the command receive data buffer iovec; use a bio_vec instead and let iov_iter handle the buffer mapping and data copy. Tested with blktests on a QEMU/KVM x86_32 VM, 6GB RAM, booting a kernel with HIGHMEM64GB enabled. [1] "[PATCH] checkpatch: Add kmap and kmap_atomic to the deprecated list" https://lore.kernel.org/all/20220813220034.806698-1-ira.weiny@intel.com/ Cc: Chaitanya Kulkarni Cc: Keith Busch Suggested-by: Ira Weiny Signed-off-by: Fabio M.
De Francesco Suggested-by: Christoph Hellwig Suggested-by: Al Viro [sagi: added bio_vec plus minor naming changes] Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/tcp.c | 44 ++++++++++++--------------------------- 1 file changed, 13 insertions(+), 31 deletions(-) diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c index dc3b4dc8fe08..c07de4f4f719 100644 --- a/drivers/nvme/target/tcp.c +++ b/drivers/nvme/target/tcp.c @@ -77,9 +77,8 @@ struct nvmet_tcp_cmd { u32 pdu_len; u32 pdu_recv; int sg_idx; - int nr_mapped; struct msghdr recv_msg; - struct kvec *iov; + struct bio_vec *iov; u32 flags; struct list_head entry; @@ -167,7 +166,6 @@ static const struct nvmet_fabrics_ops nvmet_tcp_ops; static void nvmet_tcp_free_cmd(struct nvmet_tcp_cmd *c); static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd); static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd); -static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd); static inline u16 nvmet_tcp_cmd_tag(struct nvmet_tcp_queue *queue, struct nvmet_tcp_cmd *cmd) @@ -301,35 +299,21 @@ static int nvmet_tcp_check_ddgst(struct nvmet_tcp_queue *queue, void *pdu) static void nvmet_tcp_free_cmd_buffers(struct nvmet_tcp_cmd *cmd) { - WARN_ON(unlikely(cmd->nr_mapped > 0)); - kfree(cmd->iov); sgl_free(cmd->req.sg); cmd->iov = NULL; cmd->req.sg = NULL; } -static void nvmet_tcp_unmap_pdu_iovec(struct nvmet_tcp_cmd *cmd) +static void nvmet_tcp_build_pdu_iovec(struct nvmet_tcp_cmd *cmd) { - struct scatterlist *sg; - int i; - - sg = &cmd->req.sg[cmd->sg_idx]; - - for (i = 0; i < cmd->nr_mapped; i++) - kunmap(sg_page(&sg[i])); - - cmd->nr_mapped = 0; -} - -static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) -{ - struct kvec *iov = cmd->iov; + struct bio_vec *iov = cmd->iov; struct scatterlist *sg; u32 length, offset, sg_offset; + int nr_pages; length = cmd->pdu_len; - cmd->nr_mapped = DIV_ROUND_UP(length, PAGE_SIZE); + nr_pages = DIV_ROUND_UP(length, PAGE_SIZE); offset = cmd->rbytes_done; cmd->sg_idx = offset / PAGE_SIZE; sg_offset = offset % PAGE_SIZE; @@ -338,8 +322,9 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) while (length) { u32 iov_len = min_t(u32, length, sg->length - sg_offset); - iov->iov_base = kmap(sg_page(sg)) + sg->offset + sg_offset; - iov->iov_len = iov_len; + iov->bv_page = sg_page(sg); + iov->bv_len = sg->length; + iov->bv_offset = sg->offset + sg_offset; length -= iov_len; sg = sg_next(sg); @@ -347,8 +332,8 @@ static void nvmet_tcp_map_pdu_iovec(struct nvmet_tcp_cmd *cmd) sg_offset = 0; } - iov_iter_kvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, - cmd->nr_mapped, cmd->pdu_len); + iov_iter_bvec(&cmd->recv_msg.msg_iter, READ, cmd->iov, + nr_pages, cmd->pdu_len); } static void nvmet_tcp_fatal_error(struct nvmet_tcp_queue *queue) @@ -926,7 +911,7 @@ static void nvmet_tcp_handle_req_failure(struct nvmet_tcp_queue *queue, } queue->rcv_state = NVMET_TCP_RECV_DATA; - nvmet_tcp_map_pdu_iovec(cmd); + nvmet_tcp_build_pdu_iovec(cmd); cmd->flags |= NVMET_TCP_F_INIT_FAILED; } @@ -952,7 +937,7 @@ static int nvmet_tcp_handle_h2c_data_pdu(struct nvmet_tcp_queue *queue) cmd->pdu_len = le32_to_cpu(data->data_length); cmd->pdu_recv = 0; - nvmet_tcp_map_pdu_iovec(cmd); + nvmet_tcp_build_pdu_iovec(cmd); queue->cmd = cmd; queue->rcv_state = NVMET_TCP_RECV_DATA; @@ -1021,7 +1006,7 @@ static int nvmet_tcp_done_recv_pdu(struct nvmet_tcp_queue *queue) if (nvmet_tcp_need_data_in(queue->cmd)) { if (nvmet_tcp_has_inline_data(queue->cmd)) { queue->rcv_state = 
NVMET_TCP_RECV_DATA; - nvmet_tcp_map_pdu_iovec(queue->cmd); + nvmet_tcp_build_pdu_iovec(queue->cmd); return 0; } /* send back R2T */ @@ -1141,7 +1126,6 @@ static int nvmet_tcp_try_recv_data(struct nvmet_tcp_queue *queue) cmd->rbytes_done += ret; } - nvmet_tcp_unmap_pdu_iovec(cmd); if (queue->data_digest) { nvmet_tcp_prep_recv_ddgst(cmd); return 0; @@ -1411,7 +1395,6 @@ static void nvmet_tcp_restore_socket_callbacks(struct nvmet_tcp_queue *queue) static void nvmet_tcp_finish_cmd(struct nvmet_tcp_cmd *cmd) { nvmet_req_uninit(&cmd->req); - nvmet_tcp_unmap_pdu_iovec(cmd); nvmet_tcp_free_cmd_buffers(cmd); } @@ -1424,7 +1407,6 @@ static void nvmet_tcp_uninit_data_in_cmds(struct nvmet_tcp_queue *queue) if (nvmet_tcp_need_data_in(cmd)) nvmet_req_uninit(&cmd->req); - nvmet_tcp_unmap_pdu_iovec(cmd); nvmet_tcp_free_cmd_buffers(cmd); } From 02c57a82c0081141abc19150beab48ef47f97f18 Mon Sep 17 00:00:00 2001 From: Martin Belanger Date: Wed, 7 Sep 2022 08:27:37 -0400 Subject: [PATCH 16/16] nvme-tcp: print actual source IP address through sysfs "address" attr TCP transport relies on the routing table to determine which source address and interface to use when making a connection. Currently, there is no way to tell from userspace where a connection was made. This patch exposes the actual source address using a new field named "src_addr=" in the "address" attribute. This is needed to diagnose and identify connectivity issues. With the source address we can infer the interface associated with each connection. This was tested with nvme-cli 2.0 to verify it does not have any adverse effect. The new "src_addr=" field will simply be displayed in the output of the "list-subsys" or "list -v" commands as shown here. $ nvme list-subsys nvme-subsys0 - NQN=nqn.2014-08.org.nvmexpress.discovery \ +- nvme0 tcp traddr=192.168.56.1,trsvcid=8009,src_addr=192.168.56.101 live Signed-off-by: Martin Belanger Reviewed-by: Sagi Grimberg Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 21 ++++++++++++++++++++- 1 file changed, 20 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index 4c2d198145a7..b5f22ceaae82 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2546,6 +2546,25 @@ static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob) return queue->nr_cqe; } +static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) +{ + struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0]; + struct sockaddr_storage src_addr; + int ret, len; + + len = nvmf_get_address(ctrl, buf, size); + + ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); + if (ret > 0) { + if (len > 0) + len--; /* strip trailing newline */ + len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", + (len) ? "," : "", &src_addr); + } + + return len; +} + static const struct blk_mq_ops nvme_tcp_mq_ops = { .queue_rq = nvme_tcp_queue_rq, .commit_rqs = nvme_tcp_commit_rqs, @@ -2577,7 +2596,7 @@ static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = { .free_ctrl = nvme_tcp_free_ctrl, .submit_async_event = nvme_tcp_submit_async_event, .delete_ctrl = nvme_tcp_delete_ctrl, - .get_address = nvmf_get_address, + .get_address = nvme_tcp_get_address, .stop_ctrl = nvme_tcp_stop_ctrl, };
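As a closing usage note, the combined attribute can be read back from sysfs once a TCP controller is connected; a minimal userspace sketch (the controller name nvme0 is an assumption about a particular setup):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char line[512];
	FILE *f = fopen("/sys/class/nvme/nvme0/address", "r");

	if (!f) {
		perror("open");
		return 1;
	}
	if (!fgets(line, sizeof(line), f)) {
		fclose(f);
		return 1;
	}
	fclose(f);

	/* e.g. "traddr=192.168.56.1,trsvcid=8009,src_addr=192.168.56.101" */
	char *src = strstr(line, "src_addr=");
	printf("%s", src ? src : "no src_addr field\n");
	return 0;
}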