diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c index b7b03082c54e..00e619bfdc46 100644 --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -1,1703 +1,1702 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2012-2016 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include "opt_nvme.h" #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" #define B4_CHK_RDY_DELAY_MS 2300 /* work around controller bug */ static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer); static void nvme_ctrlr_barrier(struct nvme_controller *ctrlr, int flags) { bus_barrier(ctrlr->resource, 0, rman_get_size(ctrlr->resource), flags); } static void nvme_ctrlr_devctl_va(struct nvme_controller *ctrlr, const char *type, const char *msg, va_list ap) { struct sbuf sb; int error; if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL) return; sbuf_printf(&sb, "name=\"%s\" ", device_get_nameunit(ctrlr->dev)); sbuf_vprintf(&sb, msg, ap); error = sbuf_finish(&sb); if (error == 0) devctl_notify("nvme", "controller", type, sbuf_data(&sb)); sbuf_delete(&sb); } static void nvme_ctrlr_devctl(struct nvme_controller *ctrlr, const char *type, const char *msg, ...) { va_list ap; va_start(ap, msg); nvme_ctrlr_devctl_va(ctrlr, type, msg, ap); va_end(ap); } static void nvme_ctrlr_devctl_log(struct nvme_controller *ctrlr, const char *type, const char *msg, ...) 
{ struct sbuf sb; va_list ap; int error; if (sbuf_new(&sb, NULL, 0, SBUF_AUTOEXTEND | SBUF_NOWAIT) == NULL) return; sbuf_printf(&sb, "%s: ", device_get_nameunit(ctrlr->dev)); va_start(ap, msg); sbuf_vprintf(&sb, msg, ap); va_end(ap); error = sbuf_finish(&sb); if (error == 0) printf("%s\n", sbuf_data(&sb)); sbuf_delete(&sb); va_start(ap, msg); nvme_ctrlr_devctl_va(ctrlr, type, msg, ap); va_end(ap); } static int nvme_ctrlr_construct_admin_qpair(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t num_entries; int error; qpair = &ctrlr->adminq; qpair->id = 0; qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; qpair->domain = ctrlr->domain; num_entries = NVME_ADMIN_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.admin_entries", &num_entries); /* * If admin_entries was overridden to an invalid value, revert it * back to our default value. */ if (num_entries < NVME_MIN_ADMIN_ENTRIES || num_entries > NVME_MAX_ADMIN_ENTRIES) { nvme_printf(ctrlr, "invalid hw.nvme.admin_entries=%d " "specified\n", num_entries); num_entries = NVME_ADMIN_ENTRIES; } /* * The admin queue's max xfer size is treated differently than the * max I/O xfer size. 16KB is sufficient here - maybe even less? */ error = nvme_qpair_construct(qpair, num_entries, NVME_ADMIN_TRACKERS, ctrlr); return (error); } #define QP(ctrlr, c) ((c) * (ctrlr)->num_io_queues / mp_ncpus) static int nvme_ctrlr_construct_io_qpairs(struct nvme_controller *ctrlr) { struct nvme_qpair *qpair; uint32_t cap_lo; uint16_t mqes; int c, error, i, n; int num_entries, num_trackers, max_entries; /* * NVMe spec sets a hard limit of 64K max entries, but devices may * specify a smaller limit, so we need to check the MQES field in the * capabilities register. We have to cap the number of entries to the * current stride allows for in BAR 0/1, otherwise the remainder entries * are inaccessible. MQES should reflect this, and this is just a * fail-safe. 
*/ max_entries = (rman_get_size(ctrlr->resource) - nvme_mmio_offsetof(doorbell[0])) / (1 << (ctrlr->dstrd + 1)); num_entries = NVME_IO_ENTRIES; TUNABLE_INT_FETCH("hw.nvme.io_entries", &num_entries); cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); mqes = NVME_CAP_LO_MQES(cap_lo); num_entries = min(num_entries, mqes + 1); num_entries = min(num_entries, max_entries); num_trackers = NVME_IO_TRACKERS; TUNABLE_INT_FETCH("hw.nvme.io_trackers", &num_trackers); num_trackers = max(num_trackers, NVME_MIN_IO_TRACKERS); num_trackers = min(num_trackers, NVME_MAX_IO_TRACKERS); /* * No need to have more trackers than entries in the submit queue. Note * also that for a queue size of N, we can only have (N-1) commands * outstanding, hence the "-1" here. */ num_trackers = min(num_trackers, (num_entries-1)); /* * Our best estimate for the maximum number of I/Os that we should * normally have in flight at one time. This should be viewed as a hint, * not a hard limit and will need to be revisited when the upper layers * of the storage system grows multi-queue support. */ ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4; ctrlr->ioq = malloc(ctrlr->num_io_queues * sizeof(struct nvme_qpair), M_NVME, M_ZERO | M_WAITOK); for (i = c = n = 0; i < ctrlr->num_io_queues; i++, c += n) { qpair = &ctrlr->ioq[i]; /* * Admin queue has ID=0. IO queues start at ID=1 - * hence the 'i+1' here. */ qpair->id = i + 1; if (ctrlr->num_io_queues > 1) { /* Find number of CPUs served by this queue. */ for (n = 1; QP(ctrlr, c + n) == i; n++) ; /* Shuffle multiple NVMe devices between CPUs. */ qpair->cpu = c + (device_get_unit(ctrlr->dev)+n/2) % n; qpair->domain = pcpu_find(qpair->cpu)->pc_domain; } else { qpair->cpu = CPU_FFS(&cpuset_domain[ctrlr->domain]) - 1; qpair->domain = ctrlr->domain; } /* * For I/O queues, use the controller-wide max_xfer_size * calculated in nvme_attach(). 
*/ error = nvme_qpair_construct(qpair, num_entries, num_trackers, ctrlr); if (error) return (error); /* * Do not bother binding interrupts if we only have one I/O * interrupt thread for this controller. */ if (ctrlr->num_io_queues > 1) bus_bind_intr(ctrlr->dev, qpair->res, qpair->cpu); } return (0); } static void nvme_ctrlr_fail(struct nvme_controller *ctrlr) { int i; /* * No need to disable queues before failing them. Failing is a superet * of disabling (though pedantically we'd abort the AERs silently with * a different error, though when we fail, that hardly matters). */ ctrlr->is_failed = true; nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) { nvme_qpair_fail(&ctrlr->ioq[i]); } } nvme_notify_fail_consumers(ctrlr); } /* * Wait for RDY to change. * * Starts sleeping for 1us and geometrically increases it the longer we wait, * capped at 1ms. */ static int nvme_ctrlr_wait_for_ready(struct nvme_controller *ctrlr, int desired_val) { int timeout = ticks + MSEC_2_TICKS(ctrlr->ready_timeout_in_ms); sbintime_t delta_t = SBT_1US; uint32_t csts; while (1) { csts = nvme_mmio_read_4(ctrlr, csts); if (csts == NVME_GONE) /* Hot unplug. 
*/ return (ENXIO); if (NVMEV(NVME_CSTS_REG_RDY, csts) == desired_val) break; if (timeout - ticks < 0) { nvme_printf(ctrlr, "controller ready did not become %d " "within %d ms\n", desired_val, ctrlr->ready_timeout_in_ms); return (ENXIO); } pause_sbt("nvmerdy", delta_t, 0, C_PREL(1)); delta_t = min(SBT_1MS, delta_t * 3 / 2); } return (0); } static int nvme_ctrlr_disable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = NVMEV(NVME_CC_REG_EN, cc); rdy = NVMEV(NVME_CSTS_REG_RDY, csts); /* * Per 3.1.5 in NVME 1.3 spec, transitioning CC.EN from 0 to 1 * when CSTS.RDY is 1 or transitioning CC.EN from 1 to 0 when * CSTS.RDY is 0 "has undefined results" So make sure that CSTS.RDY * isn't the desired value. Short circuit if we're already disabled. */ if (en == 0) { /* Wait for RDY == 0 or timeout & fail */ if (rdy == 0) return (0); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } if (rdy == 0) { /* EN == 1, wait for RDY == 1 or timeout & fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 1); if (err != 0) return (err); } cc &= ~NVMEM(NVME_CC_REG_EN); nvme_mmio_write_4(ctrlr, cc, cc); /* * A few drives have firmware bugs that freeze the drive if we access * the mmio too soon after we disable. */ if (ctrlr->quirks & QUIRK_DELAY_B4_CHK_RDY) pause("nvmeR", MSEC_2_TICKS(B4_CHK_RDY_DELAY_MS)); return (nvme_ctrlr_wait_for_ready(ctrlr, 0)); } static int nvme_ctrlr_enable(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; uint32_t aqa; uint32_t qsize; uint8_t en, rdy; int err; cc = nvme_mmio_read_4(ctrlr, cc); csts = nvme_mmio_read_4(ctrlr, csts); en = NVMEV(NVME_CC_REG_EN, cc); rdy = NVMEV(NVME_CSTS_REG_RDY, csts); /* * See note in nvme_ctrlr_disable. Short circuit if we're already enabled. 
*/ if (en == 1) { if (rdy == 1) return (0); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } /* EN == 0 already wait for RDY == 0 or timeout & fail */ err = nvme_ctrlr_wait_for_ready(ctrlr, 0); if (err != 0) return (err); nvme_mmio_write_8(ctrlr, asq, ctrlr->adminq.cmd_bus_addr); nvme_mmio_write_8(ctrlr, acq, ctrlr->adminq.cpl_bus_addr); /* acqs and asqs are 0-based. */ qsize = ctrlr->adminq.num_entries - 1; aqa = 0; aqa |= NVMEF(NVME_AQA_REG_ACQS, qsize); aqa |= NVMEF(NVME_AQA_REG_ASQS, qsize); nvme_mmio_write_4(ctrlr, aqa, aqa); /* Initialization values for CC */ cc = 0; cc |= NVMEF(NVME_CC_REG_EN, 1); cc |= NVMEF(NVME_CC_REG_CSS, 0); cc |= NVMEF(NVME_CC_REG_AMS, 0); cc |= NVMEF(NVME_CC_REG_SHN, 0); cc |= NVMEF(NVME_CC_REG_IOSQES, 6); /* SQ entry size == 64 == 2^6 */ cc |= NVMEF(NVME_CC_REG_IOCQES, 4); /* CQ entry size == 16 == 2^4 */ /* * Use the Memory Page Size selected during device initialization. Note * that value stored in mps is suitable to use here without adjusting by * NVME_MPS_SHIFT. */ cc |= NVMEF(NVME_CC_REG_MPS, ctrlr->mps); nvme_ctrlr_barrier(ctrlr, BUS_SPACE_BARRIER_WRITE); nvme_mmio_write_4(ctrlr, cc, cc); return (nvme_ctrlr_wait_for_ready(ctrlr, 1)); } static void nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr) { int i; nvme_admin_qpair_disable(&ctrlr->adminq); /* * I/O queues are not allocated before the initial HW * reset, so do not try to disable them. Use is_initialized * to determine if this is the initial HW reset. 
*/ if (ctrlr->is_initialized) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_disable(&ctrlr->ioq[i]); } } static int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { int err; TSENTER(); nvme_ctrlr_disable_qpairs(ctrlr); err = nvme_ctrlr_disable(ctrlr); if (err != 0) goto out; err = nvme_ctrlr_enable(ctrlr); out: TSEXIT(); return (err); } void nvme_ctrlr_reset(struct nvme_controller *ctrlr) { int cmpset; cmpset = atomic_cmpset_32(&ctrlr->is_resetting, 0, 1); if (cmpset == 0 || ctrlr->is_failed) /* * Controller is already resetting or has failed. Return * immediately since there is no need to kick off another * reset in these cases. */ return; if (!ctrlr->is_dying) taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); } static int nvme_ctrlr_identify(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; status.done = 0; nvme_ctrlr_cmd_identify_controller(ctrlr, &ctrlr->cdata, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_identify_controller failed!\n"); return (ENXIO); } /* Convert data to host endian */ nvme_controller_data_swapbytes(&ctrlr->cdata); /* * Use MDTS to ensure our default max_xfer_size doesn't exceed what the * controller supports. */ if (ctrlr->cdata.mdts > 0) ctrlr->max_xfer_size = min(ctrlr->max_xfer_size, 1 << (ctrlr->cdata.mdts + NVME_MPS_SHIFT + NVME_CAP_HI_MPSMIN(ctrlr->cap_hi))); return (0); } static int nvme_ctrlr_set_num_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; int cq_allocated, sq_allocated; status.done = 0; nvme_ctrlr_cmd_set_num_queues(ctrlr, ctrlr->num_io_queues, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_ctrlr_set_num_qpairs failed!\n"); return (ENXIO); } /* * Data in cdw0 is 0-based. * Lower 16-bits indicate number of submission queues allocated. 
* Upper 16-bits indicate number of completion queues allocated. */ sq_allocated = (status.cpl.cdw0 & 0xFFFF) + 1; cq_allocated = (status.cpl.cdw0 >> 16) + 1; /* * Controller may allocate more queues than we requested, * so use the minimum of the number requested and what was * actually allocated. */ ctrlr->num_io_queues = min(ctrlr->num_io_queues, sq_allocated); ctrlr->num_io_queues = min(ctrlr->num_io_queues, cq_allocated); if (ctrlr->num_io_queues > vm_ndomains) ctrlr->num_io_queues -= ctrlr->num_io_queues % vm_ndomains; return (0); } static int nvme_ctrlr_create_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; int i; for (i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_create_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_cq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_create_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_create_io_sq failed!\n"); return (ENXIO); } } return (0); } static int nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_qpair *qpair; for (int i = 0; i < ctrlr->num_io_queues; i++) { qpair = &ctrlr->ioq[i]; status.done = 0; nvme_ctrlr_cmd_delete_io_sq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_sq failed!\n"); return (ENXIO); } status.done = 0; nvme_ctrlr_cmd_delete_io_cq(ctrlr, qpair, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_destroy_io_cq failed!\n"); return (ENXIO); } } return (0); } static int 
nvme_ctrlr_construct_namespaces(struct nvme_controller *ctrlr) { struct nvme_namespace *ns; uint32_t i; for (i = 0; i < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); i++) { ns = &ctrlr->ns[i]; nvme_ns_construct(ns, i+1, ctrlr); } return (0); } static bool is_log_page_id_valid(uint8_t page_id) { switch (page_id) { case NVME_LOG_ERROR: case NVME_LOG_HEALTH_INFORMATION: case NVME_LOG_FIRMWARE_SLOT: case NVME_LOG_CHANGED_NAMESPACE: case NVME_LOG_COMMAND_EFFECT: case NVME_LOG_RES_NOTIFICATION: case NVME_LOG_SANITIZE_STATUS: return (true); } return (false); } static uint32_t nvme_ctrlr_get_log_page_size(struct nvme_controller *ctrlr, uint8_t page_id) { uint32_t log_page_size; switch (page_id) { case NVME_LOG_ERROR: log_page_size = min( sizeof(struct nvme_error_information_entry) * (ctrlr->cdata.elpe + 1), NVME_MAX_AER_LOG_SIZE); break; case NVME_LOG_HEALTH_INFORMATION: log_page_size = sizeof(struct nvme_health_information_page); break; case NVME_LOG_FIRMWARE_SLOT: log_page_size = sizeof(struct nvme_firmware_page); break; case NVME_LOG_CHANGED_NAMESPACE: log_page_size = sizeof(struct nvme_ns_list); break; case NVME_LOG_COMMAND_EFFECT: log_page_size = sizeof(struct nvme_command_effects_page); break; case NVME_LOG_RES_NOTIFICATION: log_page_size = sizeof(struct nvme_res_notification_page); break; case NVME_LOG_SANITIZE_STATUS: log_page_size = sizeof(struct nvme_sanitize_status_page); break; default: log_page_size = 0; break; } return (log_page_size); } static void nvme_ctrlr_log_critical_warnings(struct nvme_controller *ctrlr, uint8_t state) { if (state & NVME_CRIT_WARN_ST_AVAILABLE_SPARE) nvme_printf(ctrlr, "SMART WARNING: available spare space below threshold\n"); if (state & NVME_CRIT_WARN_ST_TEMPERATURE) nvme_printf(ctrlr, "SMART WARNING: temperature above threshold\n"); if (state & NVME_CRIT_WARN_ST_DEVICE_RELIABILITY) nvme_printf(ctrlr, "SMART WARNING: device reliability degraded\n"); if (state & NVME_CRIT_WARN_ST_READ_ONLY) nvme_printf(ctrlr, "SMART WARNING: media 
placed in read only mode\n"); if (state & NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP) nvme_printf(ctrlr, "SMART WARNING: volatile memory backup device failed\n"); if (state & NVME_CRIT_WARN_ST_PERSISTENT_MEMORY_REGION) nvme_printf(ctrlr, "SMART WARNING: persistent memory read only or unreliable\n"); if (state & NVME_CRIT_WARN_ST_RESERVED_MASK) nvme_printf(ctrlr, "SMART WARNING: unknown critical warning(s): state = 0x%02x\n", state & NVME_CRIT_WARN_ST_RESERVED_MASK); nvme_ctrlr_devctl(ctrlr, "critical", "SMART_ERROR", "state=0x%02x", state); } static void nvme_ctrlr_async_event_log_page_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; struct nvme_health_information_page *health_info; struct nvme_ns_list *nsl; struct nvme_error_information_entry *err; int i; /* * If the log page fetch for some reason completed with an error, * don't pass log page data to the consumers. In practice, this case * should never happen. */ if (nvme_completion_is_error(cpl)) nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, NULL, 0); else { /* Convert data to host endian */ switch (aer->log_page_id) { case NVME_LOG_ERROR: err = (struct nvme_error_information_entry *)aer->log_page_buffer; for (i = 0; i < (aer->ctrlr->cdata.elpe + 1); i++) nvme_error_information_entry_swapbytes(err++); break; case NVME_LOG_HEALTH_INFORMATION: nvme_health_information_page_swapbytes( (struct nvme_health_information_page *)aer->log_page_buffer); break; case NVME_LOG_CHANGED_NAMESPACE: nvme_ns_list_swapbytes( (struct nvme_ns_list *)aer->log_page_buffer); break; case NVME_LOG_COMMAND_EFFECT: nvme_command_effects_page_swapbytes( (struct nvme_command_effects_page *)aer->log_page_buffer); break; case NVME_LOG_RES_NOTIFICATION: nvme_res_notification_page_swapbytes( (struct nvme_res_notification_page *)aer->log_page_buffer); break; case NVME_LOG_SANITIZE_STATUS: nvme_sanitize_status_page_swapbytes( (struct nvme_sanitize_status_page *)aer->log_page_buffer); 
break; default: break; } if (aer->log_page_id == NVME_LOG_HEALTH_INFORMATION) { health_info = (struct nvme_health_information_page *) aer->log_page_buffer; nvme_ctrlr_log_critical_warnings(aer->ctrlr, health_info->critical_warning); /* * Critical warnings reported through the * SMART/health log page are persistent, so * clear the associated bits in the async event * config so that we do not receive repeated * notifications for the same event. */ aer->ctrlr->async_event_config &= ~health_info->critical_warning; nvme_ctrlr_cmd_set_async_event_config(aer->ctrlr, aer->ctrlr->async_event_config, NULL, NULL); } else if (aer->log_page_id == NVME_LOG_CHANGED_NAMESPACE && !nvme_use_nvd) { nsl = (struct nvme_ns_list *)aer->log_page_buffer; for (i = 0; i < nitems(nsl->ns) && nsl->ns[i] != 0; i++) { if (nsl->ns[i] > NVME_MAX_NAMESPACES) break; nvme_notify_ns(aer->ctrlr, nsl->ns[i]); } } /* * Pass the cpl data from the original async event completion, * not the log page fetch. */ nvme_notify_async_consumers(aer->ctrlr, &aer->cpl, aer->log_page_id, aer->log_page_buffer, aer->log_page_size); } /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } static void nvme_ctrlr_async_event_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_async_event_request *aer = arg; if (nvme_completion_is_error(cpl)) { /* * Do not retry failed async event requests. This avoids * infinite loops where a new async event request is submitted * to replace the one just failed, only to fail again and * perpetuate the loop. */ return; } /* Associated log page is in bits 23:16 of completion entry dw0. 
*/ aer->log_page_id = NVMEV(NVME_ASYNC_EVENT_LOG_PAGE_ID, cpl->cdw0); nvme_printf(aer->ctrlr, "async event occurred (type 0x%x, info 0x%02x," " page 0x%02x)\n", NVMEV(NVME_ASYNC_EVENT_TYPE, cpl->cdw0), NVMEV(NVME_ASYNC_EVENT_INFO, cpl->cdw0), aer->log_page_id); if (is_log_page_id_valid(aer->log_page_id)) { aer->log_page_size = nvme_ctrlr_get_log_page_size(aer->ctrlr, aer->log_page_id); memcpy(&aer->cpl, cpl, sizeof(*cpl)); nvme_ctrlr_cmd_get_log_page(aer->ctrlr, aer->log_page_id, NVME_GLOBAL_NAMESPACE_TAG, aer->log_page_buffer, aer->log_page_size, nvme_ctrlr_async_event_log_page_cb, aer); /* Wait to notify consumers until after log page is fetched. */ } else { nvme_notify_async_consumers(aer->ctrlr, cpl, aer->log_page_id, NULL, 0); /* * Repost another asynchronous event request to replace the one * that just completed. */ nvme_ctrlr_construct_and_submit_aer(aer->ctrlr, aer); } } static void nvme_ctrlr_construct_and_submit_aer(struct nvme_controller *ctrlr, struct nvme_async_event_request *aer) { struct nvme_request *req; aer->ctrlr = ctrlr; req = nvme_allocate_request_null(nvme_ctrlr_async_event_cb, aer); aer->req = req; /* * Disable timeout here, since asynchronous event requests should by * nature never be timed out. 
*/ req->timeout = false; req->cmd.opc = NVME_OPC_ASYNC_EVENT_REQUEST; nvme_ctrlr_submit_admin_request(ctrlr, req); } static void nvme_ctrlr_configure_aer(struct nvme_controller *ctrlr) { struct nvme_completion_poll_status status; struct nvme_async_event_request *aer; uint32_t i; ctrlr->async_event_config = NVME_CRIT_WARN_ST_AVAILABLE_SPARE | NVME_CRIT_WARN_ST_DEVICE_RELIABILITY | NVME_CRIT_WARN_ST_READ_ONLY | NVME_CRIT_WARN_ST_VOLATILE_MEMORY_BACKUP; if (ctrlr->cdata.ver >= NVME_REV(1, 2)) ctrlr->async_event_config |= ctrlr->cdata.oaes & (NVME_ASYNC_EVENT_NS_ATTRIBUTE | NVME_ASYNC_EVENT_FW_ACTIVATE); status.done = 0; nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_TEMPERATURE_THRESHOLD, 0, NULL, 0, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl) || (status.cpl.cdw0 & 0xFFFF) == 0xFFFF || (status.cpl.cdw0 & 0xFFFF) == 0x0000) { nvme_printf(ctrlr, "temperature threshold not supported\n"); } else ctrlr->async_event_config |= NVME_CRIT_WARN_ST_TEMPERATURE; nvme_ctrlr_cmd_set_async_event_config(ctrlr, ctrlr->async_event_config, NULL, NULL); /* aerl is a zero-based value, so we need to add 1 here. 
*/ ctrlr->num_aers = min(NVME_MAX_ASYNC_EVENTS, (ctrlr->cdata.aerl+1)); for (i = 0; i < ctrlr->num_aers; i++) { aer = &ctrlr->aer[i]; nvme_ctrlr_construct_and_submit_aer(ctrlr, aer); } } static void nvme_ctrlr_configure_int_coalescing(struct nvme_controller *ctrlr) { ctrlr->int_coal_time = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_time", &ctrlr->int_coal_time); ctrlr->int_coal_threshold = 0; TUNABLE_INT_FETCH("hw.nvme.int_coal_threshold", &ctrlr->int_coal_threshold); nvme_ctrlr_cmd_set_interrupt_coalescing(ctrlr, ctrlr->int_coal_time, ctrlr->int_coal_threshold, NULL, NULL); } static void nvme_ctrlr_hmb_free(struct nvme_controller *ctrlr) { struct nvme_hmb_chunk *hmbc; int i; if (ctrlr->hmb_desc_paddr) { bus_dmamap_unload(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map); bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, ctrlr->hmb_desc_map); ctrlr->hmb_desc_paddr = 0; } if (ctrlr->hmb_desc_tag) { bus_dma_tag_destroy(ctrlr->hmb_desc_tag); ctrlr->hmb_desc_tag = NULL; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { hmbc = &ctrlr->hmb_chunks[i]; bus_dmamap_unload(ctrlr->hmb_tag, hmbc->hmbc_map); bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, hmbc->hmbc_map); } ctrlr->hmb_nchunks = 0; if (ctrlr->hmb_tag) { bus_dma_tag_destroy(ctrlr->hmb_tag); ctrlr->hmb_tag = NULL; } if (ctrlr->hmb_chunks) { free(ctrlr->hmb_chunks, M_NVME); ctrlr->hmb_chunks = NULL; } } static void nvme_ctrlr_hmb_alloc(struct nvme_controller *ctrlr) { struct nvme_hmb_chunk *hmbc; size_t pref, min, minc, size; int err, i; uint64_t max; /* Limit HMB to 5% of RAM size per device by default. */ max = (uint64_t)physmem * PAGE_SIZE / 20; TUNABLE_UINT64_FETCH("hw.nvme.hmb_max", &max); /* * Units of Host Memory Buffer in the Identify info are always in terms * of 4k units. 
*/ min = (long long unsigned)ctrlr->cdata.hmmin * NVME_HMB_UNITS; if (max == 0 || max < min) return; pref = MIN((long long unsigned)ctrlr->cdata.hmpre * NVME_HMB_UNITS, max); minc = MAX(ctrlr->cdata.hmminds * NVME_HMB_UNITS, ctrlr->page_size); if (min > 0 && ctrlr->cdata.hmmaxd > 0) minc = MAX(minc, min / ctrlr->cdata.hmmaxd); ctrlr->hmb_chunk = pref; again: /* * However, the chunk sizes, number of chunks, and alignment of chunks * are all based on the current MPS (ctrlr->page_size). */ ctrlr->hmb_chunk = roundup2(ctrlr->hmb_chunk, ctrlr->page_size); ctrlr->hmb_nchunks = howmany(pref, ctrlr->hmb_chunk); if (ctrlr->cdata.hmmaxd > 0 && ctrlr->hmb_nchunks > ctrlr->cdata.hmmaxd) ctrlr->hmb_nchunks = ctrlr->cdata.hmmaxd; ctrlr->hmb_chunks = malloc(sizeof(struct nvme_hmb_chunk) * ctrlr->hmb_nchunks, M_NVME, M_WAITOK); err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), ctrlr->page_size, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, ctrlr->hmb_chunk, 1, ctrlr->hmb_chunk, 0, NULL, NULL, &ctrlr->hmb_tag); if (err != 0) { nvme_printf(ctrlr, "HMB tag create failed %d\n", err); nvme_ctrlr_hmb_free(ctrlr); return; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { hmbc = &ctrlr->hmb_chunks[i]; if (bus_dmamem_alloc(ctrlr->hmb_tag, (void **)&hmbc->hmbc_vaddr, BUS_DMA_NOWAIT, &hmbc->hmbc_map)) { nvme_printf(ctrlr, "failed to alloc HMB\n"); break; } if (bus_dmamap_load(ctrlr->hmb_tag, hmbc->hmbc_map, hmbc->hmbc_vaddr, ctrlr->hmb_chunk, nvme_single_map, &hmbc->hmbc_paddr, BUS_DMA_NOWAIT) != 0) { bus_dmamem_free(ctrlr->hmb_tag, hmbc->hmbc_vaddr, hmbc->hmbc_map); nvme_printf(ctrlr, "failed to load HMB\n"); break; } bus_dmamap_sync(ctrlr->hmb_tag, hmbc->hmbc_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); } if (i < ctrlr->hmb_nchunks && i * ctrlr->hmb_chunk < min && ctrlr->hmb_chunk / 2 >= minc) { ctrlr->hmb_nchunks = i; nvme_ctrlr_hmb_free(ctrlr); ctrlr->hmb_chunk /= 2; goto again; } ctrlr->hmb_nchunks = i; if (ctrlr->hmb_nchunks * ctrlr->hmb_chunk < min) { 
nvme_ctrlr_hmb_free(ctrlr); return; } size = sizeof(struct nvme_hmb_desc) * ctrlr->hmb_nchunks; err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 16, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, size, 1, size, 0, NULL, NULL, &ctrlr->hmb_desc_tag); if (err != 0) { nvme_printf(ctrlr, "HMB desc tag create failed %d\n", err); nvme_ctrlr_hmb_free(ctrlr); return; } if (bus_dmamem_alloc(ctrlr->hmb_desc_tag, (void **)&ctrlr->hmb_desc_vaddr, BUS_DMA_WAITOK, &ctrlr->hmb_desc_map)) { nvme_printf(ctrlr, "failed to alloc HMB desc\n"); nvme_ctrlr_hmb_free(ctrlr); return; } if (bus_dmamap_load(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, ctrlr->hmb_desc_vaddr, size, nvme_single_map, &ctrlr->hmb_desc_paddr, BUS_DMA_NOWAIT) != 0) { bus_dmamem_free(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_vaddr, ctrlr->hmb_desc_map); nvme_printf(ctrlr, "failed to load HMB desc\n"); nvme_ctrlr_hmb_free(ctrlr); return; } for (i = 0; i < ctrlr->hmb_nchunks; i++) { memset(&ctrlr->hmb_desc_vaddr[i], 0, sizeof(struct nvme_hmb_desc)); ctrlr->hmb_desc_vaddr[i].addr = htole64(ctrlr->hmb_chunks[i].hmbc_paddr); ctrlr->hmb_desc_vaddr[i].size = htole32(ctrlr->hmb_chunk / ctrlr->page_size); } bus_dmamap_sync(ctrlr->hmb_desc_tag, ctrlr->hmb_desc_map, BUS_DMASYNC_PREWRITE); nvme_printf(ctrlr, "Allocated %lluMB host memory buffer\n", (long long unsigned)ctrlr->hmb_nchunks * ctrlr->hmb_chunk / 1024 / 1024); } static void nvme_ctrlr_hmb_enable(struct nvme_controller *ctrlr, bool enable, bool memret) { struct nvme_completion_poll_status status; uint32_t cdw11; cdw11 = 0; if (enable) cdw11 |= 1; if (memret) cdw11 |= 2; status.done = 0; nvme_ctrlr_cmd_set_feature(ctrlr, NVME_FEAT_HOST_MEMORY_BUFFER, cdw11, ctrlr->hmb_nchunks * ctrlr->hmb_chunk / ctrlr->page_size, ctrlr->hmb_desc_paddr, ctrlr->hmb_desc_paddr >> 32, ctrlr->hmb_nchunks, NULL, 0, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) nvme_printf(ctrlr, "nvme_ctrlr_hmb_enable failed!\n"); } static 
void nvme_ctrlr_start(void *ctrlr_arg, bool resetting) { /* * Bring the controller to an operational state. ctrlr_arg is the * struct nvme_controller; resetting is true when restarting after a * reset and false on the initial start. Failures are reported only by * calling nvme_ctrlr_fail(); nothing is returned to the caller. * * NOTE(review): every early return below skips the TSEXIT() that pairs * with TSENTER() -- confirm that is acceptable. */ struct nvme_controller *ctrlr = ctrlr_arg; uint32_t old_num_io_queues; int i; TSENTER(); /* * Only reset adminq here when we are restarting the * controller after a reset. During initialization, * we have already submitted admin commands to get * the number of I/O queues supported, so cannot reset * the adminq again here. */ if (resetting) { nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); } if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_qpair_reset(&ctrlr->ioq[i]); } /* * If it was a reset on initialization command timeout, just * return here, letting initialization code fail gracefully. */ if (resetting && !ctrlr->is_initialized) return; if (resetting && nvme_ctrlr_identify(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } /* * The number of qpairs is determined during controller initialization, * including using NVMe SET_FEATURES/NUMBER_OF_QUEUES to determine the * HW limit. We call SET_FEATURES again here so that it gets called * after any reset for controllers that depend on the driver to * explicitly specify how many queues it will use. This value should * never change between resets, so panic if somehow that does happen. 
*/ if (resetting) { old_num_io_queues = ctrlr->num_io_queues; if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (old_num_io_queues != ctrlr->num_io_queues) { panic("num_io_queues changed from %u to %u", old_num_io_queues, ctrlr->num_io_queues); } } /* * Allocate and enable the host memory buffer the first time through; * when chunks already exist, re-enable them with memret set. */ if (ctrlr->cdata.hmpre > 0 && ctrlr->hmb_nchunks == 0) { nvme_ctrlr_hmb_alloc(ctrlr); if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, true, false); } else if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, true, true); if (nvme_ctrlr_create_qpairs(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } if (nvme_ctrlr_construct_namespaces(ctrlr) != 0) { nvme_ctrlr_fail(ctrlr); return; } nvme_ctrlr_configure_aer(ctrlr); nvme_ctrlr_configure_int_coalescing(ctrlr); for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_enable(&ctrlr->ioq[i]); TSEXIT(); } /* * Deferred-initialization hook: hardware-reset the controller, enable the * admin queue, identify the controller, negotiate and construct the I/O * qpairs, then start it. config_hook is disestablished on both the success * and the failure path. * * NOTE(review): nvme_ctrlr_start() returns void, so a failure inside it is * not seen here and is_initialized is still set afterwards -- confirm that * consumers cope with a failed controller. The fail path also returns * without TSEXIT(). */ void nvme_ctrlr_start_config_hook(void *arg) { struct nvme_controller *ctrlr = arg; TSENTER(); if (nvme_ctrlr_hw_reset(ctrlr) != 0) { fail: nvme_ctrlr_fail(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); return; } nvme_qpair_reset(&ctrlr->adminq); nvme_admin_qpair_enable(&ctrlr->adminq); if (nvme_ctrlr_identify(ctrlr) == 0 && nvme_ctrlr_set_num_qpairs(ctrlr) == 0 && nvme_ctrlr_construct_io_qpairs(ctrlr) == 0) nvme_ctrlr_start(ctrlr, false); else goto fail; nvme_sysctl_initialize_ctrlr(ctrlr); config_intrhook_disestablish(&ctrlr->config_hook); ctrlr->is_initialized = 1; nvme_notify_new_controller(ctrlr); TSEXIT(); } /* * Reset task handler: hardware-reset the controller and, on success, * restart it; on failure mark the controller failed. RESET start, * success and timed_out events are emitted through * nvme_ctrlr_devctl_log(), and is_resetting is cleared at the end. * The pending argument is unused. */ static void nvme_ctrlr_reset_task(void *arg, int pending) { struct nvme_controller *ctrlr = arg; int status; nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"start\""); status = nvme_ctrlr_hw_reset(ctrlr); if (status == 0) { nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"success\""); nvme_ctrlr_start(ctrlr, true); } else { nvme_ctrlr_devctl_log(ctrlr, "RESET", "event=\"timed_out\""); nvme_ctrlr_fail(ctrlr); } atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); } /* * Poll all the queues enabled on the device for 
completion. */ void nvme_ctrlr_poll(struct nvme_controller *ctrlr) { int i; nvme_qpair_process_completions(&ctrlr->adminq); for (i = 0; i < ctrlr->num_io_queues; i++) if (ctrlr->ioq && ctrlr->ioq[i].cpl) nvme_qpair_process_completions(&ctrlr->ioq[i]); } /* * Poll the single-vector interrupt case: num_io_queues will be 1 and * there's only a single vector. While we're polling, we mask further * interrupts in the controller. */ void nvme_ctrlr_shared_handler(void *arg) { struct nvme_controller *ctrlr = arg; nvme_mmio_write_4(ctrlr, intms, 1); nvme_ctrlr_poll(ctrlr); nvme_mmio_write_4(ctrlr, intmc, 1); } static void nvme_pt_done(void *arg, const struct nvme_completion *cpl) { struct nvme_pt_command *pt = arg; struct mtx *mtx = pt->driver_lock; uint16_t status; bzero(&pt->cpl, sizeof(pt->cpl)); pt->cpl.cdw0 = cpl->cdw0; status = cpl->status; status &= ~NVMEM(NVME_STATUS_P); pt->cpl.status = status; mtx_lock(mtx); pt->driver_lock = NULL; wakeup(pt); mtx_unlock(mtx); } int nvme_ctrlr_passthrough_cmd(struct nvme_controller *ctrlr, struct nvme_pt_command *pt, uint32_t nsid, int is_user_buffer, int is_admin_cmd) { struct nvme_request *req; struct mtx *mtx; struct buf *buf = NULL; int ret = 0; if (pt->len > 0) { if (pt->len > ctrlr->max_xfer_size) { nvme_printf(ctrlr, "pt->len (%d) " "exceeds max_xfer_size (%d)\n", pt->len, ctrlr->max_xfer_size); return EIO; } if (is_user_buffer) { /* * Ensure the user buffer is wired for the duration of * this pass-through command. */ PHOLD(curproc); buf = uma_zalloc(pbuf_zone, M_WAITOK); buf->b_iocmd = pt->is_read ? 
BIO_READ : BIO_WRITE; if (vmapbuf(buf, pt->buf, pt->len, 1) < 0) { ret = EFAULT; goto err; } req = nvme_allocate_request_vaddr(buf->b_data, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_vaddr(pt->buf, pt->len, nvme_pt_done, pt); } else req = nvme_allocate_request_null(nvme_pt_done, pt); /* Assume user space already converted to little-endian */ req->cmd.opc = pt->cmd.opc; req->cmd.fuse = pt->cmd.fuse; req->cmd.rsvd2 = pt->cmd.rsvd2; req->cmd.rsvd3 = pt->cmd.rsvd3; req->cmd.cdw10 = pt->cmd.cdw10; req->cmd.cdw11 = pt->cmd.cdw11; req->cmd.cdw12 = pt->cmd.cdw12; req->cmd.cdw13 = pt->cmd.cdw13; req->cmd.cdw14 = pt->cmd.cdw14; req->cmd.cdw15 = pt->cmd.cdw15; req->cmd.nsid = htole32(nsid); mtx = mtx_pool_find(mtxpool_sleep, pt); pt->driver_lock = mtx; if (is_admin_cmd) nvme_ctrlr_submit_admin_request(ctrlr, req); else nvme_ctrlr_submit_io_request(ctrlr, req); mtx_lock(mtx); while (pt->driver_lock != NULL) mtx_sleep(pt, mtx, PRIBIO, "nvme_pt", 0); mtx_unlock(mtx); if (buf != NULL) { vunmapbuf(buf); err: uma_zfree(pbuf_zone, buf); PRELE(curproc); } return (ret); } static int nvme_ctrlr_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvme_controller *ctrlr; struct nvme_pt_command *pt; ctrlr = cdev->si_drv1; switch (cmd) { case NVME_RESET_CONTROLLER: nvme_ctrlr_reset(ctrlr); break; case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, le32toh(pt->cmd.nsid), 1 /* is_user_buffer */, 1 /* is_admin_cmd */)); case NVME_GET_NSID: { struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = 0; break; } case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = ctrlr->max_xfer_size; break; default: return (ENOTTY); } return (0); } static struct cdevsw nvme_ctrlr_cdevsw = { .d_version = 
D_VERSION, .d_flags = 0, .d_ioctl = nvme_ctrlr_ioctl }; /* * Initialize the software state of a controller: record dev, set up the * controller lock, read the CAP registers (printing them when bootverbose * is set), derive doorbell stride, page size and timeouts from CAP and the * hw.nvme.* tunables, construct the admin qpair, create the two-thread * taskqueue and create the nvme%d control device. * * Returns 0 on success, or ENXIO if the admin qpair or the device node * cannot be created. */ int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev) { struct make_dev_args md_args; uint32_t cap_lo; uint32_t cap_hi; uint32_t to, vs, pmrcap; int status, timeout_period; ctrlr->dev = dev; mtx_init(&ctrlr->lock, "nvme ctrlr lock", NULL, MTX_DEF); if (bus_get_domain(dev, &ctrlr->domain) != 0) ctrlr->domain = 0; ctrlr->cap_lo = cap_lo = nvme_mmio_read_4(ctrlr, cap_lo); if (bootverbose) { device_printf(dev, "CapLo: 0x%08x: MQES %u%s%s%s%s, TO %u\n", cap_lo, NVME_CAP_LO_MQES(cap_lo), NVME_CAP_LO_CQR(cap_lo) ? ", CQR" : "", NVME_CAP_LO_AMS(cap_lo) ? ", AMS" : "", (NVME_CAP_LO_AMS(cap_lo) & 0x1) ? " WRRwUPC" : "", (NVME_CAP_LO_AMS(cap_lo) & 0x2) ? " VS" : "", NVME_CAP_LO_TO(cap_lo)); } ctrlr->cap_hi = cap_hi = nvme_mmio_read_4(ctrlr, cap_hi); if (bootverbose) { device_printf(dev, "CapHi: 0x%08x: DSTRD %u%s, CSS %x%s, " "CPS %x, MPSMIN %u, MPSMAX %u%s%s%s%s%s\n", cap_hi, NVME_CAP_HI_DSTRD(cap_hi), NVME_CAP_HI_NSSRS(cap_hi) ? ", NSSRS" : "", NVME_CAP_HI_CSS(cap_hi), NVME_CAP_HI_BPS(cap_hi) ? ", BPS" : "", NVME_CAP_HI_CPS(cap_hi), NVME_CAP_HI_MPSMIN(cap_hi), NVME_CAP_HI_MPSMAX(cap_hi), NVME_CAP_HI_PMRS(cap_hi) ? ", PMRS" : "", NVME_CAP_HI_CMBS(cap_hi) ? ", CMBS" : "", NVME_CAP_HI_NSSS(cap_hi) ? ", NSSS" : "", NVME_CAP_HI_CRWMS(cap_hi) ? ", CRWMS" : "", NVME_CAP_HI_CRIMS(cap_hi) ? ", CRIMS" : ""); } if (bootverbose) { vs = nvme_mmio_read_4(ctrlr, vs); device_printf(dev, "Version: 0x%08x: %d.%d\n", vs, NVME_MAJOR(vs), NVME_MINOR(vs)); } if (bootverbose && NVME_CAP_HI_PMRS(cap_hi)) { pmrcap = nvme_mmio_read_4(ctrlr, pmrcap); device_printf(dev, "PMRCap: 0x%08x: BIR %u%s%s, PMRTU %u, " "PMRWBM %x, PMRTO %u%s\n", pmrcap, NVME_PMRCAP_BIR(pmrcap), NVME_PMRCAP_RDS(pmrcap) ? ", RDS" : "", NVME_PMRCAP_WDS(pmrcap) ? ", WDS" : "", NVME_PMRCAP_PMRTU(pmrcap), NVME_PMRCAP_PMRWBM(pmrcap), NVME_PMRCAP_PMRTO(pmrcap), NVME_PMRCAP_CMSS(pmrcap) ? 
", CMSS" : ""); } ctrlr->dstrd = NVME_CAP_HI_DSTRD(cap_hi) + 2; ctrlr->mps = NVME_CAP_HI_MPSMIN(cap_hi); ctrlr->page_size = 1 << (NVME_MPS_SHIFT + ctrlr->mps); /* Get ready timeout value from controller, in units of 500ms. */ to = NVME_CAP_LO_TO(cap_lo) + 1; ctrlr->ready_timeout_in_ms = to * 500; /* Both timeout tunables are clamped to [NVME_MIN_TIMEOUT_PERIOD, NVME_MAX_TIMEOUT_PERIOD]. */ timeout_period = NVME_ADMIN_TIMEOUT_PERIOD; TUNABLE_INT_FETCH("hw.nvme.admin_timeout_period", &timeout_period); timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); ctrlr->admin_timeout_period = timeout_period; timeout_period = NVME_DEFAULT_TIMEOUT_PERIOD; TUNABLE_INT_FETCH("hw.nvme.timeout_period", &timeout_period); timeout_period = min(timeout_period, NVME_MAX_TIMEOUT_PERIOD); timeout_period = max(timeout_period, NVME_MIN_TIMEOUT_PERIOD); ctrlr->timeout_period = timeout_period; nvme_retry_count = NVME_DEFAULT_RETRY_COUNT; TUNABLE_INT_FETCH("hw.nvme.retry_count", &nvme_retry_count); ctrlr->enable_aborts = 0; TUNABLE_INT_FETCH("hw.nvme.enable_aborts", &ctrlr->enable_aborts); /* Cap transfers by the maximum addressable by page-sized PRP (4KB pages -> 2MB). */ ctrlr->max_xfer_size = MIN(maxphys, (ctrlr->page_size / 8 * ctrlr->page_size)); if (nvme_ctrlr_construct_admin_qpair(ctrlr) != 0) return (ENXIO); /* * Create 2 threads for the taskqueue. The reset thread will block when * it detects that the controller has failed until all I/O has been * failed up the stack. The fail_req task needs to be able to run in * this case to finish the request failure for some cases. * * We could partially solve this race by draining the failed request * queue before proceeding to free the sim, though nothing would stop * new I/O from coming in after we do that drain, but before we reach * cam_sim_free, so this big hammer is used instead. 
*/ ctrlr->taskqueue = taskqueue_create("nvme_taskq", M_WAITOK, taskqueue_thread_enqueue, &ctrlr->taskqueue); taskqueue_start_threads(&ctrlr->taskqueue, 2, PI_DISK, "nvme taskq"); ctrlr->is_resetting = 0; ctrlr->is_initialized = 0; ctrlr->notification_sent = 0; TASK_INIT(&ctrlr->reset_task, 0, nvme_ctrlr_reset_task, ctrlr); STAILQ_INIT(&ctrlr->fail_req); ctrlr->is_failed = false; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ctrlr_cdevsw; md_args.mda_uid = UID_ROOT; md_args.mda_gid = GID_WHEEL; md_args.mda_mode = 0600; md_args.mda_unit = device_get_unit(dev); md_args.mda_si_drv1 = (void *)ctrlr; status = make_dev_s(&md_args, &ctrlr->cdev, "nvme%d", device_get_unit(dev)); if (status != 0) return (ENXIO); return (0); } /* * Tear down a controller. The nores and noadminq labels skip the stages * that depend on the register resource or on an initialized admin queue * lock, so a partially constructed controller can still be destructed. */ void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev) { int gone, i; ctrlr->is_dying = true; if (ctrlr->resource == NULL) goto nores; if (!mtx_initialized(&ctrlr->adminq.lock)) goto noadminq; /* * Check whether it is a hot unplug or a clean driver detach. * If device is not there any more, skip any shutdown commands. */ gone = (nvme_mmio_read_4(ctrlr, csts) == NVME_GONE); if (gone) nvme_ctrlr_fail(ctrlr); else nvme_notify_fail_consumers(ctrlr); for (i = 0; i < NVME_MAX_NAMESPACES; i++) nvme_ns_destruct(&ctrlr->ns[i]); if (ctrlr->cdev) destroy_dev(ctrlr->cdev); if (ctrlr->is_initialized) { if (!gone) { if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, false, false); nvme_ctrlr_delete_qpairs(ctrlr); } nvme_ctrlr_hmb_free(ctrlr); } if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) nvme_io_qpair_destroy(&ctrlr->ioq[i]); free(ctrlr->ioq, M_NVME); } nvme_admin_qpair_destroy(&ctrlr->adminq); /* * Notify the controller of a shutdown, even though this is due to * a driver unload, not a system shutdown (this path is not invoked * during shutdown). This ensures the controller receives a * shutdown notification in case the system is shutdown before * reloading the driver. 
*/ if (!gone) nvme_ctrlr_shutdown(ctrlr); if (!gone) nvme_ctrlr_disable(ctrlr); noadminq: if (ctrlr->taskqueue) taskqueue_free(ctrlr->taskqueue); if (ctrlr->tag) bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag); if (ctrlr->res) bus_release_resource(ctrlr->dev, SYS_RES_IRQ, rman_get_rid(ctrlr->res), ctrlr->res); if (ctrlr->bar4_resource != NULL) { bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->bar4_resource_id, ctrlr->bar4_resource); } bus_release_resource(dev, SYS_RES_MEMORY, ctrlr->resource_id, ctrlr->resource); nores: mtx_destroy(&ctrlr->lock); } void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr) { uint32_t cc; uint32_t csts; int timeout; cc = nvme_mmio_read_4(ctrlr, cc); cc &= ~NVMEM(NVME_CC_REG_SHN); cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); nvme_mmio_write_4(ctrlr, cc, cc); timeout = ticks + (ctrlr->cdata.rtd3e == 0 ? 5 * hz : ((uint64_t)ctrlr->cdata.rtd3e * hz + 999999) / 1000000); while (1) { csts = nvme_mmio_read_4(ctrlr, csts); if (csts == NVME_GONE) /* Hot unplug. */ break; if (NVME_CSTS_GET_SHST(csts) == NVME_SHST_COMPLETE) break; if (timeout - ticks < 0) { nvme_printf(ctrlr, "shutdown timeout\n"); break; } pause("nvmeshut", 1); } } void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req) { nvme_qpair_submit_request(&ctrlr->adminq, req); } void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req) { struct nvme_qpair *qpair; qpair = &ctrlr->ioq[QP(ctrlr, curcpu)]; nvme_qpair_submit_request(qpair, req); } device_t nvme_ctrlr_get_device(struct nvme_controller *ctrlr) { return (ctrlr->dev); } const struct nvme_controller_data * nvme_ctrlr_get_data(struct nvme_controller *ctrlr) { return (&ctrlr->cdata); } int nvme_ctrlr_suspend(struct nvme_controller *ctrlr) { int to = hz; /* * Can't touch failed controllers, so it's already suspended. 
*/ if (ctrlr->is_failed) return (0); /* * We don't want the reset taskqueue running, since it does similar * things, so prevent it from running after we start. Wait for any reset * that may have been started to complete. The reset process we follow * will ensure that any new I/O will queue and be given to the hardware * after we resume (though there should be none). */ /* * NOTE(review): if the cmpset succeeds on the pass where "to" has * already been counted down to 0, the loop exits holding is_resetting, * yet the check below still returns EWOULDBLOCK and leaves it set -- * confirm, and consider recording the cmpset result separately. */ while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0) pause("nvmesusp", 1); if (to <= 0) { nvme_printf(ctrlr, "Competing reset task didn't finish. Try again later.\n"); return (EWOULDBLOCK); } if (ctrlr->hmb_nchunks > 0) nvme_ctrlr_hmb_enable(ctrlr, false, false); /* * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to * delete the hardware I/O queues, and then shut down. This properly * flushes any metadata the drive may have stored so it can survive * having its power removed and prevents the unsafe shutdown count from * incrementing. Once we delete the qpairs, we have to disable them * before shutting down. */ nvme_ctrlr_delete_qpairs(ctrlr); nvme_ctrlr_disable_qpairs(ctrlr); nvme_ctrlr_shutdown(ctrlr); return (0); } /* * Resume a controller: hardware-reset it, restart it and clear * is_resetting. Always returns 0; when the hardware reset fails the * controller is marked failed instead of reporting an error. */ int nvme_ctrlr_resume(struct nvme_controller *ctrlr) { /* * Can't touch failed controllers, so nothing to do to resume. */ if (ctrlr->is_failed) return (0); if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; /* * Now that we've reset the hardware, we can restart the controller. Any * I/O that was pending is requeued. Any admin commands are aborted with * an error. Once we've restarted, take the controller out of reset. */ nvme_ctrlr_start(ctrlr, true); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); return (0); fail: /* * Since we can't bring the controller out of reset, announce and fail * the controller. However, we have to return success for the resume * itself, due to questionable APIs. 
*/ nvme_printf(ctrlr, "Failed to reset on resume, failing.\n"); nvme_ctrlr_fail(ctrlr); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); return (0); } diff --git a/sys/dev/nvme/nvme_ns.c b/sys/dev/nvme/nvme_ns.c index 360b9f982c20..e494b8cd857a 100644 --- a/sys/dev/nvme/nvme_ns.c +++ b/sys/dev/nvme/nvme_ns.c @@ -1,623 +1,622 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (C) 2012-2013 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. 
*/ #include #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme_private.h" static void nvme_bio_child_inbed(struct bio *parent, int bio_error); static void nvme_bio_child_done(void *arg, const struct nvme_completion *cpl); static uint32_t nvme_get_num_segments(uint64_t addr, uint64_t size, uint32_t alignment); static void nvme_free_child_bios(int num_bios, struct bio **child_bios); static struct bio ** nvme_allocate_child_bios(int num_bios); static struct bio ** nvme_construct_child_bios(struct bio *bp, uint32_t alignment, int *num_bios); static int nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp, uint32_t alignment); static int nvme_ns_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvme_namespace *ns; struct nvme_controller *ctrlr; struct nvme_pt_command *pt; ns = cdev->si_drv1; ctrlr = ns->ctrlr; switch (cmd) { case NVME_IO_TEST: case NVME_BIO_TEST: nvme_ns_test(ns, cmd, arg); break; case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvme_ctrlr_passthrough_cmd(ctrlr, pt, ns->id, 1 /* is_user_buffer */, 0 /* is_admin_cmd */)); case NVME_GET_NSID: { struct nvme_get_nsid *gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ctrlr->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = ns->id; break; } case DIOCGMEDIASIZE: *(off_t *)arg = (off_t)nvme_ns_get_size(ns); break; case DIOCGSECTORSIZE: *(u_int *)arg = nvme_ns_get_sector_size(ns); break; default: return (ENOTTY); } return (0); } static int nvme_ns_open(struct cdev *dev __unused, int flags, int fmt __unused, struct thread *td) { int error = 0; if (flags & FWRITE) error = securelevel_gt(td->td_ucred, 0); return (error); } static int nvme_ns_close(struct cdev *dev __unused, int flags, int fmt __unused, struct thread *td) { return (0); } static void 
nvme_ns_strategy_done(void *arg, const struct nvme_completion *cpl) { struct bio *bp = arg; /* * TODO: add more extensive translation of NVMe status codes * to different bio error codes (i.e. EIO, EINVAL, etc.) */ if (nvme_completion_is_error(cpl)) { bp->bio_error = EIO; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; } else bp->bio_resid = 0; biodone(bp); } static void nvme_ns_strategy(struct bio *bp) { struct nvme_namespace *ns; int err; ns = bp->bio_dev->si_drv1; err = nvme_ns_bio_process(ns, bp, nvme_ns_strategy_done); if (err) { bp->bio_error = err; bp->bio_flags |= BIO_ERROR; bp->bio_resid = bp->bio_bcount; biodone(bp); } } static struct cdevsw nvme_ns_cdevsw = { .d_version = D_VERSION, .d_flags = D_DISK, .d_read = physread, .d_write = physwrite, .d_open = nvme_ns_open, .d_close = nvme_ns_close, .d_strategy = nvme_ns_strategy, .d_ioctl = nvme_ns_ioctl }; uint32_t nvme_ns_get_max_io_xfer_size(struct nvme_namespace *ns) { return ns->ctrlr->max_xfer_size; } uint32_t nvme_ns_get_sector_size(struct nvme_namespace *ns) { uint8_t flbas_fmt, lbads; flbas_fmt = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, ns->data.flbas); lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, ns->data.lbaf[flbas_fmt]); return (1 << lbads); } uint64_t nvme_ns_get_num_sectors(struct nvme_namespace *ns) { return (ns->data.nsze); } uint64_t nvme_ns_get_size(struct nvme_namespace *ns) { return (nvme_ns_get_num_sectors(ns) * nvme_ns_get_sector_size(ns)); } uint32_t nvme_ns_get_flags(struct nvme_namespace *ns) { return (ns->flags); } const char * nvme_ns_get_serial_number(struct nvme_namespace *ns) { return ((const char *)ns->ctrlr->cdata.sn); } const char * nvme_ns_get_model_number(struct nvme_namespace *ns) { return ((const char *)ns->ctrlr->cdata.mn); } const struct nvme_namespace_data * nvme_ns_get_data(struct nvme_namespace *ns) { return (&ns->data); } uint32_t nvme_ns_get_stripesize(struct nvme_namespace *ns) { uint32_t ss; if (NVMEV(NVME_NS_DATA_NSFEAT_NPVALID, ns->data.nsfeat) != 0) { ss = 
nvme_ns_get_sector_size(ns); if (ns->data.npwa != 0) return ((ns->data.npwa + 1) * ss); else if (ns->data.npwg != 0) return ((ns->data.npwg + 1) * ss); } return (ns->boundary); } static void nvme_ns_bio_done(void *arg, const struct nvme_completion *status) { struct bio *bp = arg; nvme_cb_fn_t bp_cb_fn; bp_cb_fn = bp->bio_driver1; if (bp->bio_driver2) free(bp->bio_driver2, M_NVME); if (nvme_completion_is_error(status)) { bp->bio_flags |= BIO_ERROR; if (bp->bio_error == 0) bp->bio_error = EIO; } if ((bp->bio_flags & BIO_ERROR) == 0) bp->bio_resid = 0; else bp->bio_resid = bp->bio_bcount; bp_cb_fn(bp, status); } static void nvme_bio_child_inbed(struct bio *parent, int bio_error) { struct nvme_completion parent_cpl; int children, inbed; if (bio_error != 0) { parent->bio_flags |= BIO_ERROR; parent->bio_error = bio_error; } /* * atomic_fetchadd will return value before adding 1, so we still * must add 1 to get the updated inbed number. Save bio_children * before incrementing to guard against race conditions when * two children bios complete on different queues. */ children = atomic_load_acq_int(&parent->bio_children); inbed = atomic_fetchadd_int(&parent->bio_inbed, 1) + 1; if (inbed == children) { bzero(&parent_cpl, sizeof(parent_cpl)); if (parent->bio_flags & BIO_ERROR) { parent_cpl.status &= ~NVMEM(NVME_STATUS_SC); parent_cpl.status |= NVMEF(NVME_STATUS_SC, NVME_SC_DATA_TRANSFER_ERROR); } nvme_ns_bio_done(parent, &parent_cpl); } } static void nvme_bio_child_done(void *arg, const struct nvme_completion *cpl) { struct bio *child = arg; struct bio *parent; int bio_error; parent = child->bio_parent; g_destroy_bio(child); bio_error = nvme_completion_is_error(cpl) ? 
EIO : 0; nvme_bio_child_inbed(parent, bio_error); } static uint32_t nvme_get_num_segments(uint64_t addr, uint64_t size, uint32_t align) { uint32_t num_segs, offset, remainder; if (align == 0) return (1); KASSERT((align & (align - 1)) == 0, ("alignment not power of 2\n")); num_segs = size / align; remainder = size & (align - 1); offset = addr & (align - 1); if (remainder > 0 || offset > 0) num_segs += 1 + (remainder + offset - 1) / align; return (num_segs); } static void nvme_free_child_bios(int num_bios, struct bio **child_bios) { int i; for (i = 0; i < num_bios; i++) { if (child_bios[i] != NULL) g_destroy_bio(child_bios[i]); } free(child_bios, M_NVME); } static struct bio ** nvme_allocate_child_bios(int num_bios) { struct bio **child_bios; int err = 0, i; child_bios = malloc(num_bios * sizeof(struct bio *), M_NVME, M_NOWAIT); if (child_bios == NULL) return (NULL); for (i = 0; i < num_bios; i++) { child_bios[i] = g_new_bio(); if (child_bios[i] == NULL) err = ENOMEM; } if (err == ENOMEM) { nvme_free_child_bios(num_bios, child_bios); return (NULL); } return (child_bios); } static struct bio ** nvme_construct_child_bios(struct bio *bp, uint32_t alignment, int *num_bios) { struct bio **child_bios; struct bio *child; uint64_t cur_offset; caddr_t data; uint32_t rem_bcount; int i; struct vm_page **ma; uint32_t ma_offset; *num_bios = nvme_get_num_segments(bp->bio_offset, bp->bio_bcount, alignment); child_bios = nvme_allocate_child_bios(*num_bios); if (child_bios == NULL) return (NULL); bp->bio_children = *num_bios; bp->bio_inbed = 0; cur_offset = bp->bio_offset; rem_bcount = bp->bio_bcount; data = bp->bio_data; ma_offset = bp->bio_ma_offset; ma = bp->bio_ma; for (i = 0; i < *num_bios; i++) { child = child_bios[i]; child->bio_parent = bp; child->bio_cmd = bp->bio_cmd; child->bio_offset = cur_offset; child->bio_bcount = min(rem_bcount, alignment - (cur_offset & (alignment - 1))); child->bio_flags = bp->bio_flags; if (bp->bio_flags & BIO_UNMAPPED) { child->bio_ma_offset = 
ma_offset; child->bio_ma = ma; child->bio_ma_n = nvme_get_num_segments(child->bio_ma_offset, child->bio_bcount, PAGE_SIZE); ma_offset = (ma_offset + child->bio_bcount) & PAGE_MASK; ma += child->bio_ma_n; if (ma_offset != 0) ma -= 1; } else { child->bio_data = data; data += child->bio_bcount; } cur_offset += child->bio_bcount; rem_bcount -= child->bio_bcount; } return (child_bios); } static int nvme_ns_split_bio(struct nvme_namespace *ns, struct bio *bp, uint32_t alignment) { struct bio *child; struct bio **child_bios; int err, i, num_bios; child_bios = nvme_construct_child_bios(bp, alignment, &num_bios); if (child_bios == NULL) return (ENOMEM); for (i = 0; i < num_bios; i++) { child = child_bios[i]; err = nvme_ns_bio_process(ns, child, nvme_bio_child_done); if (err != 0) { nvme_bio_child_inbed(bp, err); g_destroy_bio(child); } } free(child_bios, M_NVME); return (0); } int nvme_ns_bio_process(struct nvme_namespace *ns, struct bio *bp, nvme_cb_fn_t cb_fn) { struct nvme_dsm_range *dsm_range; uint32_t num_bios; int err; bp->bio_driver1 = cb_fn; if (ns->boundary > 0 && (bp->bio_cmd == BIO_READ || bp->bio_cmd == BIO_WRITE)) { num_bios = nvme_get_num_segments(bp->bio_offset, bp->bio_bcount, ns->boundary); if (num_bios > 1) return (nvme_ns_split_bio(ns, bp, ns->boundary)); } switch (bp->bio_cmd) { case BIO_READ: err = nvme_ns_cmd_read_bio(ns, bp, nvme_ns_bio_done, bp); break; case BIO_WRITE: err = nvme_ns_cmd_write_bio(ns, bp, nvme_ns_bio_done, bp); break; case BIO_FLUSH: err = nvme_ns_cmd_flush(ns, nvme_ns_bio_done, bp); break; case BIO_DELETE: dsm_range = malloc(sizeof(struct nvme_dsm_range), M_NVME, M_ZERO | M_NOWAIT); if (!dsm_range) { err = ENOMEM; break; } dsm_range->length = htole32(bp->bio_bcount/nvme_ns_get_sector_size(ns)); dsm_range->starting_lba = htole64(bp->bio_offset/nvme_ns_get_sector_size(ns)); bp->bio_driver2 = dsm_range; err = nvme_ns_cmd_deallocate(ns, dsm_range, 1, nvme_ns_bio_done, bp); if (err != 0) free(dsm_range, M_NVME); break; default: err = 
EOPNOTSUPP; break; } return (err); } /* * Exported wrapper that forwards an ioctl to nvme_ns_ioctl() using the * namespace's own cdev. */ int nvme_ns_ioctl_process(struct nvme_namespace *ns, u_long cmd, caddr_t arg, int flag, struct thread *td) { return (nvme_ns_ioctl(ns->cdev, cmd, arg, flag, td)); } /* * Identify namespace "id" on ctrlr and fill in *ns: identify data, I/O * boundary, capability flags and, on first construction, the * nvme%dns%d device node. * * Returns 0 on success, or ENXIO when the identify command fails, the * namespace size is zero, the LBA format index is out of range, or the * device node cannot be created. */ int nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, struct nvme_controller *ctrlr) { struct make_dev_args md_args; struct nvme_completion_poll_status status; int res; int unit; uint8_t flbas_fmt; uint8_t vwc_present; ns->ctrlr = ctrlr; ns->id = id; /* * Namespaces are reconstructed after a controller reset, so check * to make sure we only call mtx_init once on each mtx. * * TODO: Move this somewhere where it gets called at controller * construction time, which is not invoked as part of each * controller reset. */ if (!mtx_initialized(&ns->lock)) mtx_init(&ns->lock, "nvme ns lock", NULL, MTX_DEF); status.done = 0; nvme_ctrlr_cmd_identify_namespace(ctrlr, id, &ns->data, nvme_completion_poll_cb, &status); nvme_completion_poll(&status); if (nvme_completion_is_error(&status.cpl)) { nvme_printf(ctrlr, "nvme_identify_namespace failed\n"); return (ENXIO); } /* Convert data to host endian */ nvme_namespace_data_swapbytes(&ns->data); /* * If the namespace size is zero, chances are this isn't a valid * namespace (e.g., one that's not been configured yet). The * standard says the entire id will be zeros, so this is a * cheap way to test for that. */ if (ns->data.nsze == 0) return (ENXIO); flbas_fmt = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, ns->data.flbas); /* * Note: format is a 0-based value, so > is appropriate here, * not >=. */ if (flbas_fmt > ns->data.nlbaf) { nvme_printf(ctrlr, "lba format %d exceeds number supported (%d)\n", flbas_fmt, ns->data.nlbaf + 1); return (ENXIO); } /* * Older Intel devices (like the PC35xxx and P45xx series) advertise in * vendor specific space an alignment that improves performance. If * present use for the stripe size. NVMe 1.3 standardized this as * NOIOB, and newer Intel drives use that. 
*/ if ((ctrlr->quirks & QUIRK_INTEL_ALIGNMENT) != 0) { if (ctrlr->cdata.vs[3] != 0) ns->boundary = 1 << (ctrlr->cdata.vs[3] + NVME_MPS_SHIFT + NVME_CAP_HI_MPSMIN(ctrlr->cap_hi)); else ns->boundary = 0; } else { ns->boundary = ns->data.noiob * nvme_ns_get_sector_size(ns); } if (nvme_ctrlr_has_dataset_mgmt(&ctrlr->cdata)) ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; vwc_present = NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, ctrlr->cdata.vwc); if (vwc_present) ns->flags |= NVME_NS_FLUSH_SUPPORTED; /* * cdev may have already been created, if we are reconstructing the * namespace after a controller-level reset. */ if (ns->cdev != NULL) return (0); /* * Namespace IDs start at 1, so we need to subtract 1 to create a * correct unit number. */ unit = device_get_unit(ctrlr->dev) * NVME_MAX_NAMESPACES + ns->id - 1; make_dev_args_init(&md_args); md_args.mda_devsw = &nvme_ns_cdevsw; md_args.mda_unit = unit; md_args.mda_mode = 0600; md_args.mda_si_drv1 = ns; res = make_dev_s(&md_args, &ns->cdev, "nvme%dns%d", device_get_unit(ctrlr->dev), ns->id); if (res != 0) return (ENXIO); ns->cdev->si_flags |= SI_UNMAPPED; return (0); } void nvme_ns_destruct(struct nvme_namespace *ns) { if (ns->cdev != NULL) destroy_dev(ns->cdev); } diff --git a/sys/dev/nvmf/host/nvmf.c b/sys/dev/nvmf/host/nvmf.c index 0902bc78a7b5..e43d438aaa8c 100644 --- a/sys/dev/nvmf/host/nvmf.c +++ b/sys/dev/nvmf/host/nvmf.c @@ -1,939 +1,938 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. 
* Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include static struct cdevsw nvmf_cdevsw; MALLOC_DEFINE(M_NVMF, "nvmf", "NVMe over Fabrics host"); static void nvmf_disconnect_task(void *arg, int pending); void nvmf_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->cqe = *cqe; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_io_complete(void *arg, size_t xfered, int error) { struct nvmf_completion_status *status = arg; struct mtx *mtx; status->io_error = error; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); status->io_done = true; mtx_unlock(mtx); wakeup(status); } void nvmf_wait_for_reply(struct nvmf_completion_status *status) { struct mtx *mtx; mtx = mtx_pool_find(mtxpool_sleep, status); mtx_lock(mtx); while (!status->done || !status->io_done) mtx_sleep(status, mtx, 0, "nvmfcmd", 0); mtx_unlock(mtx); } static int nvmf_read_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t *value) { const struct nvmf_fabric_prop_get_rsp *rsp; struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_get_property(sc, offset, size, nvmf_complete, &status, M_WAITOK)) return (ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_GET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } rsp = (const struct nvmf_fabric_prop_get_rsp *)&status.cqe; if (size == 8) *value = le64toh(rsp->value.u64); else *value = le32toh(rsp->value.u32.low); return (0); } static int nvmf_write_property(struct nvmf_softc *sc, uint32_t offset, uint8_t size, uint64_t value) { struct nvmf_completion_status status; nvmf_status_init(&status); if (!nvmf_cmd_set_property(sc, offset, size, value, nvmf_complete, &status, M_WAITOK)) return 
(ECONNABORTED); nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "PROPERTY_SET failed, status %#x\n", le16toh(status.cqe.status)); return (EIO); } return (0); } static void nvmf_shutdown_controller(struct nvmf_softc *sc) { uint64_t cc; int error; error = nvmf_read_property(sc, NVMF_PROP_CC, 4, &cc); if (error != 0) { device_printf(sc->dev, "Failed to fetch CC for shutdown\n"); return; } cc |= NVMEF(NVME_CC_REG_SHN, NVME_SHN_NORMAL); error = nvmf_write_property(sc, NVMF_PROP_CC, 4, cc); if (error != 0) device_printf(sc->dev, "Failed to set CC to trigger shutdown\n"); } static void nvmf_check_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; traffic = atomic_readandclear_int(&sc->ka_active_rx_traffic); if (traffic == 0) { device_printf(sc->dev, "disconnecting due to KeepAlive timeout\n"); nvmf_disconnect(sc); return; } callout_schedule_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, C_HARDCLOCK); } static void nvmf_keep_alive_complete(void *arg, const struct nvme_completion *cqe) { struct nvmf_softc *sc = arg; atomic_store_int(&sc->ka_active_rx_traffic, 1); if (cqe->status != 0) { device_printf(sc->dev, "KeepAlive response reported status %#x\n", le16toh(cqe->status)); } } static void nvmf_send_keep_alive(void *arg) { struct nvmf_softc *sc = arg; int traffic; /* * Don't bother sending a KeepAlive command if TKAS is active * and another command has been sent during the interval. */ traffic = atomic_load_int(&sc->ka_active_tx_traffic); if (traffic == 0 && !nvmf_cmd_keep_alive(sc, nvmf_keep_alive_complete, sc, M_NOWAIT)) device_printf(sc->dev, "Failed to allocate KeepAlive command\n"); /* Clear ka_active_tx_traffic after sending the keep alive command. 
*/ atomic_store_int(&sc->ka_active_tx_traffic, 0); callout_schedule_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, C_HARDCLOCK); } int nvmf_init_ivars(struct nvmf_ivars *ivars, struct nvmf_handoff_host *hh) { size_t len; u_int i; int error; memset(ivars, 0, sizeof(*ivars)); if (!hh->admin.admin || hh->num_io_queues < 1) return (EINVAL); ivars->cdata = malloc(sizeof(*ivars->cdata), M_NVMF, M_WAITOK); error = copyin(hh->cdata, ivars->cdata, sizeof(*ivars->cdata)); if (error != 0) goto out; nvme_controller_data_swapbytes(ivars->cdata); len = hh->num_io_queues * sizeof(*ivars->io_params); ivars->io_params = malloc(len, M_NVMF, M_WAITOK); error = copyin(hh->io, ivars->io_params, len); if (error != 0) goto out; for (i = 0; i < hh->num_io_queues; i++) { if (ivars->io_params[i].admin) { error = EINVAL; goto out; } /* Require all I/O queues to be the same size. */ if (ivars->io_params[i].qsize != ivars->io_params[0].qsize) { error = EINVAL; goto out; } } ivars->hh = hh; return (0); out: free(ivars->io_params, M_NVMF); free(ivars->cdata, M_NVMF); return (error); } void nvmf_free_ivars(struct nvmf_ivars *ivars) { free(ivars->io_params, M_NVMF); free(ivars->cdata, M_NVMF); } static int nvmf_probe(device_t dev) { struct nvmf_ivars *ivars = device_get_ivars(dev); char desc[260]; if (ivars == NULL) return (ENXIO); snprintf(desc, sizeof(desc), "Fabrics: %.256s", ivars->cdata->subnqn); device_set_desc_copy(dev, desc); return (BUS_PROBE_DEFAULT); } static int nvmf_establish_connection(struct nvmf_softc *sc, struct nvmf_ivars *ivars) { char name[16]; /* Setup the admin queue. */ sc->admin = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->hh->admin, "admin queue"); if (sc->admin == NULL) { device_printf(sc->dev, "Failed to setup admin queue\n"); return (ENXIO); } /* Setup I/O queues. 
*/ sc->io = malloc(ivars->hh->num_io_queues * sizeof(*sc->io), M_NVMF, M_WAITOK | M_ZERO); sc->num_io_queues = ivars->hh->num_io_queues; for (u_int i = 0; i < sc->num_io_queues; i++) { snprintf(name, sizeof(name), "I/O queue %u", i); sc->io[i] = nvmf_init_qp(sc, ivars->hh->trtype, &ivars->io_params[i], name); if (sc->io[i] == NULL) { device_printf(sc->dev, "Failed to setup I/O queue %u\n", i + 1); return (ENXIO); } } /* Start KeepAlive timers. */ if (ivars->hh->kato != 0) { sc->ka_traffic = NVMEV(NVME_CTRLR_DATA_CTRATT_TBKAS, sc->cdata->ctratt) != 0; sc->ka_rx_sbt = mstosbt(ivars->hh->kato); sc->ka_tx_sbt = sc->ka_rx_sbt / 2; callout_reset_sbt(&sc->ka_rx_timer, sc->ka_rx_sbt, 0, nvmf_check_keep_alive, sc, C_HARDCLOCK); callout_reset_sbt(&sc->ka_tx_timer, sc->ka_tx_sbt, 0, nvmf_send_keep_alive, sc, C_HARDCLOCK); } return (0); } static bool nvmf_scan_nslist(struct nvmf_softc *sc, struct nvme_ns_list *nslist, struct nvme_namespace_data *data, uint32_t *nsidp) { struct nvmf_completion_status status; uint32_t nsid; nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_active_namespaces(sc, *nsidp, nslist, nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY active namespaces command\n"); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed, status %#x\n", le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY active namespaces failed with I/O error %d\n", status.io_error); return (false); } for (u_int i = 0; i < nitems(nslist->ns); i++) { nsid = nslist->ns[i]; if (nsid == 0) { *nsidp = 0; return (true); } if (sc->ns[nsid - 1] != NULL) { device_printf(sc->dev, "duplicate namespace %u in active namespace list\n", nsid); return (false); } nvmf_status_init(&status); nvmf_status_wait_io(&status); if (!nvmf_cmd_identify_namespace(sc, nsid, data, 
nvmf_complete, &status, nvmf_io_complete, &status, M_WAITOK)) { device_printf(sc->dev, "failed to send IDENTIFY namespace %u command\n", nsid); return (false); } nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); return (false); } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); return (false); } /* * As in nvme_ns_construct, a size of zero indicates an * invalid namespace. */ nvme_namespace_data_swapbytes(data); if (data->nsze == 0) { device_printf(sc->dev, "ignoring active namespace %u with zero size\n", nsid); continue; } sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); nvmf_sim_rescan_ns(sc, nsid); } MPASS(nsid == nslist->ns[nitems(nslist->ns) - 1] && nsid != 0); if (nsid >= 0xfffffffd) *nsidp = 0; else *nsidp = nsid + 1; return (true); } static bool nvmf_add_namespaces(struct nvmf_softc *sc) { struct nvme_namespace_data *data; struct nvme_ns_list *nslist; uint32_t nsid; bool retval; sc->ns = mallocarray(sc->cdata->nn, sizeof(*sc->ns), M_NVMF, M_WAITOK | M_ZERO); nslist = malloc(sizeof(*nslist), M_NVMF, M_WAITOK); data = malloc(sizeof(*data), M_NVMF, M_WAITOK); nsid = 0; retval = true; for (;;) { if (!nvmf_scan_nslist(sc, nslist, data, &nsid)) { retval = false; break; } if (nsid == 0) break; } free(data, M_NVMF); free(nslist, M_NVMF); return (retval); } static int nvmf_attach(device_t dev) { struct make_dev_args mda; struct nvmf_softc *sc = device_get_softc(dev); struct nvmf_ivars *ivars = device_get_ivars(dev); uint64_t val; u_int i; int error; if (ivars == NULL) return (ENXIO); sc->dev = dev; sc->trtype = ivars->hh->trtype; callout_init(&sc->ka_rx_timer, 1); callout_init(&sc->ka_tx_timer, 1); sx_init(&sc->connection_lock, "nvmf connection"); TASK_INIT(&sc->disconnect_task, 0, nvmf_disconnect_task, sc); /* Claim the cdata pointer from ivars. 
*/ sc->cdata = ivars->cdata; ivars->cdata = NULL; nvmf_init_aer(sc); /* TODO: Multiqueue support. */ sc->max_pending_io = ivars->io_params[0].qsize /* * sc->num_io_queues */; error = nvmf_establish_connection(sc, ivars); if (error != 0) goto out; error = nvmf_read_property(sc, NVMF_PROP_CAP, 8, &sc->cap); if (error != 0) { device_printf(sc->dev, "Failed to fetch CAP\n"); error = ENXIO; goto out; } error = nvmf_read_property(sc, NVMF_PROP_VS, 4, &val); if (error != 0) { device_printf(sc->dev, "Failed to fetch VS\n"); error = ENXIO; goto out; } sc->vs = val; /* Honor MDTS if it is set. */ sc->max_xfer_size = maxphys; if (sc->cdata->mdts != 0) { sc->max_xfer_size = ulmin(sc->max_xfer_size, 1 << (sc->cdata->mdts + NVME_MPS_SHIFT + NVME_CAP_HI_MPSMIN(sc->cap >> 32))); } error = nvmf_init_sim(sc); if (error != 0) goto out; error = nvmf_start_aer(sc); if (error != 0) { nvmf_destroy_sim(sc); goto out; } if (!nvmf_add_namespaces(sc)) { nvmf_destroy_sim(sc); goto out; } make_dev_args_init(&mda); mda.mda_devsw = &nvmf_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = sc; error = make_dev_s(&mda, &sc->cdev, "%s", device_get_nameunit(dev)); if (error != 0) { nvmf_destroy_sim(sc); goto out; } return (0); out: if (sc->ns != NULL) { for (i = 0; i < sc->cdata->nn; i++) { if (sc->ns[i] != NULL) nvmf_destroy_ns(sc->ns[i]); } free(sc->ns, M_NVMF); } callout_drain(&sc->ka_tx_timer); callout_drain(&sc->ka_rx_timer); if (sc->admin != NULL) nvmf_shutdown_controller(sc); for (i = 0; i < sc->num_io_queues; i++) { if (sc->io[i] != NULL) nvmf_destroy_qp(sc->io[i]); } free(sc->io, M_NVMF); if (sc->admin != NULL) nvmf_destroy_qp(sc->admin); nvmf_destroy_aer(sc); taskqueue_drain(taskqueue_thread, &sc->disconnect_task); sx_destroy(&sc->connection_lock); free(sc->cdata, M_NVMF); return (error); } void nvmf_disconnect(struct nvmf_softc *sc) { taskqueue_enqueue(taskqueue_thread, &sc->disconnect_task); } static void nvmf_disconnect_task(void *arg, int 
/*
 * Establish a new association for an existing, currently disconnected
 * controller using the queues described by 'hh'.
 *
 * The request is rejected with EINVAL if the transport type or the
 * subsystem NQN differs from the one recorded in the softc, and with
 * EBUSY if an admin queue still exists or the device is detaching.
 * On success the keep-alive timers, AER handling, namespaces and SIM
 * are restarted.
 *
 * Returns 0 on success or an errno value.
 */
static int
nvmf_reconnect_host(struct nvmf_softc *sc, struct nvmf_handoff_host *hh)
{
	struct nvmf_ivars ivars;
	u_int i;
	int error;

	/* XXX: Should we permit changing the transport type? */
	if (sc->trtype != hh->trtype) {
		device_printf(sc->dev,
		    "transport type mismatch on reconnect\n");
		return (EINVAL);
	}

	/* On failure nvmf_init_ivars() has already freed its buffers. */
	error = nvmf_init_ivars(&ivars, hh);
	if (error != 0)
		return (error);

	/* The rest runs with the connection lock held exclusively. */
	sx_xlock(&sc->connection_lock);
	if (sc->admin != NULL || sc->detaching) {
		error = EBUSY;
		goto out;
	}

	/*
	 * Ensure this is for the same controller.  Note that the
	 * controller ID can vary across associations if the remote
	 * system is using the dynamic controller model.  This merely
	 * ensures the new association is connected to the same NVMe
	 * subsystem.
	 */
	if (memcmp(sc->cdata->subnqn, ivars.cdata->subnqn,
	    sizeof(ivars.cdata->subnqn)) != 0) {
		device_printf(sc->dev,
		    "controller subsystem NQN mismatch on reconnect\n");
		error = EINVAL;
		goto out;
	}

	/*
	 * XXX: Require same number and size of I/O queues so that
	 * max_pending_io is still correct?
	 */

	/*
	 * NOTE(review): if nvmf_establish_connection() fails after
	 * creating the admin queue, or nvmf_start_aer() fails, the
	 * 'out' path below only drops the lock and frees ivars; the
	 * queues and keep-alive timers set up so far do not appear to
	 * be torn down here -- confirm whether another path cleans
	 * them up.
	 */
	error = nvmf_establish_connection(sc, &ivars);
	if (error != 0)
		goto out;

	error = nvmf_start_aer(sc);
	if (error != 0)
		goto out;

	device_printf(sc->dev,
	    "established new association with %u I/O queues\n",
	    sc->num_io_queues);

	/* Restart namespace consumers. */
	for (i = 0; i < sc->cdata->nn; i++) {
		if (sc->ns[i] != NULL)
			nvmf_reconnect_ns(sc->ns[i]);
	}
	nvmf_reconnect_sim(sc);
out:
	/* Common exit: the temporary ivars are always released. */
	sx_xunlock(&sc->connection_lock);
	nvmf_free_ivars(&ivars);
	return (error);
}
nvmf_wait_for_reply(&status); if (status.cqe.status != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed, status %#x\n", nsid, le16toh(status.cqe.status)); free(data, M_NVMF); return; } if (status.io_error != 0) { device_printf(sc->dev, "IDENTIFY namespace %u failed with I/O error %d\n", nsid, status.io_error); free(data, M_NVMF); return; } nvme_namespace_data_swapbytes(data); /* XXX: Needs locking around sc->ns[]. */ ns = sc->ns[nsid - 1]; if (data->nsze == 0) { /* XXX: Needs locking */ if (ns != NULL) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } else { /* XXX: Needs locking */ if (ns == NULL) { sc->ns[nsid - 1] = nvmf_init_ns(sc, nsid, data); } else { if (!nvmf_update_ns(ns, data)) { nvmf_destroy_ns(ns); sc->ns[nsid - 1] = NULL; } } } free(data, M_NVMF); nvmf_sim_rescan_ns(sc, nsid); } int nvmf_passthrough_cmd(struct nvmf_softc *sc, struct nvme_pt_command *pt, bool admin) { struct nvmf_completion_status status; struct nvme_command cmd; struct memdesc mem; struct nvmf_host_qpair *qp; struct nvmf_request *req; void *buf; int error; if (pt->len > sc->max_xfer_size) return (EINVAL); buf = NULL; if (pt->len != 0) { /* * XXX: Depending on the size we may want to pin the * user pages and use a memdesc with vm_page_t's * instead. */ buf = malloc(pt->len, M_NVMF, M_WAITOK); if (pt->is_read == 0) { error = copyin(pt->buf, buf, pt->len); if (error != 0) { free(buf, M_NVMF); return (error); } } else { /* Ensure no kernel data is leaked to userland. 
*/ memset(buf, 0, pt->len); } } memset(&cmd, 0, sizeof(cmd)); cmd.opc = pt->cmd.opc; cmd.fuse = pt->cmd.fuse; cmd.nsid = pt->cmd.nsid; cmd.cdw10 = pt->cmd.cdw10; cmd.cdw11 = pt->cmd.cdw11; cmd.cdw12 = pt->cmd.cdw12; cmd.cdw13 = pt->cmd.cdw13; cmd.cdw14 = pt->cmd.cdw14; cmd.cdw15 = pt->cmd.cdw15; if (admin) qp = sc->admin; else qp = nvmf_select_io_queue(sc); nvmf_status_init(&status); req = nvmf_allocate_request(qp, &cmd, nvmf_complete, &status, M_WAITOK); if (req == NULL) { device_printf(sc->dev, "failed to send passthrough command\n"); error = ECONNABORTED; goto error; } if (pt->len != 0) { mem = memdesc_vaddr(buf, pt->len); nvmf_capsule_append_data(req->nc, &mem, pt->len, pt->is_read == 0, nvmf_io_complete, &status); nvmf_status_wait_io(&status); } nvmf_submit_request(req); nvmf_wait_for_reply(&status); memset(&pt->cpl, 0, sizeof(pt->cpl)); pt->cpl.cdw0 = status.cqe.cdw0; pt->cpl.status = status.cqe.status; error = status.io_error; if (error == 0 && pt->len != 0 && pt->is_read != 0) error = copyout(buf, pt->buf, pt->len); error: free(buf, M_NVMF); return (error); } static int nvmf_ioctl(struct cdev *cdev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_softc *sc = cdev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; struct nvmf_reconnect_params *rp; struct nvmf_handoff_host *hh; switch (cmd) { case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; return (nvmf_passthrough_cmd(sc, pt, true)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(sc->dev), + strlcpy(gnsid->cdev, device_get_nameunit(sc->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = 0; return (0); case NVME_GET_MAX_XFER_SIZE: *(uint64_t *)arg = sc->max_xfer_size; return (0); case NVMF_RECONNECT_PARAMS: rp = (struct nvmf_reconnect_params *)arg; if ((sc->cdata->fcatt & 1) == 0) rp->cntlid = NVMF_CNTLID_DYNAMIC; else rp->cntlid = sc->cdata->ctrlr_id; memcpy(rp->subnqn, 
sc->cdata->subnqn, sizeof(rp->subnqn)); return (0); case NVMF_RECONNECT_HOST: hh = (struct nvmf_handoff_host *)arg; return (nvmf_reconnect_host(sc, hh)); default: return (ENOTTY); } } static struct cdevsw nvmf_cdevsw = { .d_version = D_VERSION, .d_ioctl = nvmf_ioctl }; static int nvmf_modevent(module_t mod, int what, void *arg) { switch (what) { case MOD_LOAD: return (nvmf_ctl_load()); case MOD_QUIESCE: return (0); case MOD_UNLOAD: nvmf_ctl_unload(); destroy_dev_drain(&nvmf_cdevsw); return (0); default: return (EOPNOTSUPP); } } static device_method_t nvmf_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nvmf_probe), DEVMETHOD(device_attach, nvmf_attach), DEVMETHOD(device_detach, nvmf_detach), #if 0 DEVMETHOD(device_shutdown, nvmf_shutdown), #endif DEVMETHOD_END }; driver_t nvme_nvmf_driver = { "nvme", nvmf_methods, sizeof(struct nvmf_softc), }; DRIVER_MODULE(nvme, root, nvme_nvmf_driver, nvmf_modevent, NULL); MODULE_DEPEND(nvmf, nvmf_transport, 1, 1, 1); diff --git a/sys/dev/nvmf/host/nvmf_ns.c b/sys/dev/nvmf/host/nvmf_ns.c index 3ce434bf7c50..3428cbadc974 100644 --- a/sys/dev/nvmf/host/nvmf_ns.c +++ b/sys/dev/nvmf/host/nvmf_ns.c @@ -1,483 +1,482 @@ /*- * SPDX-License-Identifier: BSD-2-Clause * * Copyright (c) 2023-2024 Chelsio Communications, Inc. * Written by: John Baldwin */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include struct nvmf_namespace { struct nvmf_softc *sc; uint64_t size; uint32_t id; u_int flags; uint32_t lba_size; bool disconnected; TAILQ_HEAD(, bio) pending_bios; struct mtx lock; volatile u_int active_bios; struct cdev *cdev; }; static void nvmf_ns_strategy(struct bio *bio); static void ns_printf(struct nvmf_namespace *ns, const char *fmt, ...) 
{ char buf[128]; struct sbuf sb; va_list ap; sbuf_new(&sb, buf, sizeof(buf), SBUF_FIXEDLEN); sbuf_set_drain(&sb, sbuf_printf_drain, NULL); sbuf_printf(&sb, "%sns%u: ", device_get_nameunit(ns->sc->dev), ns->id); va_start(ap, fmt); sbuf_vprintf(&sb, fmt, ap); va_end(ap); sbuf_finish(&sb); sbuf_delete(&sb); } /* * The I/O completion may trigger after the received CQE if the I/O * used a zero-copy mbuf that isn't harvested until after the NIC * driver processes TX completions. Abuse bio_driver1 as a refcount. * Store I/O errors in bio_driver2. */ static __inline u_int * bio_refs(struct bio *bio) { return ((u_int *)&bio->bio_driver1); } static void nvmf_ns_biodone(struct bio *bio) { struct nvmf_namespace *ns; int error; if (!refcount_release(bio_refs(bio))) return; ns = bio->bio_dev->si_drv1; /* If a request is aborted, resubmit or queue it for resubmission. */ if (bio->bio_error == ECONNABORTED) { bio->bio_error = 0; bio->bio_driver2 = 0; mtx_lock(&ns->lock); if (ns->disconnected) { TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); mtx_unlock(&ns->lock); } else { mtx_unlock(&ns->lock); nvmf_ns_strategy(bio); } } else { /* * I/O errors take precedence over generic EIO from * CQE errors. 
*/ error = (intptr_t)bio->bio_driver2; if (error != 0) bio->bio_error = error; if (bio->bio_error != 0) bio->bio_flags |= BIO_ERROR; biodone(bio); } if (refcount_release(&ns->active_bios)) wakeup(ns); } static void nvmf_ns_io_complete(void *arg, size_t xfered, int error) { struct bio *bio = arg; KASSERT(xfered <= bio->bio_bcount, ("%s: xfered > bio_bcount", __func__)); bio->bio_driver2 = (void *)(intptr_t)error; bio->bio_resid = bio->bio_bcount - xfered; nvmf_ns_biodone(bio); } static void nvmf_ns_delete_complete(void *arg, size_t xfered, int error) { struct bio *bio = arg; if (error != 0) bio->bio_resid = bio->bio_bcount; else bio->bio_resid = 0; free(bio->bio_driver2, M_NVMF); bio->bio_driver2 = (void *)(intptr_t)error; nvmf_ns_biodone(bio); } static void nvmf_ns_bio_complete(void *arg, const struct nvme_completion *cqe) { struct bio *bio = arg; if (nvmf_cqe_aborted(cqe)) bio->bio_error = ECONNABORTED; else if (cqe->status != 0) bio->bio_error = EIO; nvmf_ns_biodone(bio); } static int nvmf_ns_submit_bio(struct nvmf_namespace *ns, struct bio *bio) { struct nvme_command cmd; struct nvmf_request *req; struct nvme_dsm_range *dsm_range; struct memdesc mem; uint64_t lba, lba_count; dsm_range = NULL; memset(&cmd, 0, sizeof(cmd)); switch (bio->bio_cmd) { case BIO_READ: lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; nvme_ns_read_cmd(&cmd, ns->id, lba, lba_count); break; case BIO_WRITE: lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; nvme_ns_write_cmd(&cmd, ns->id, lba, lba_count); break; case BIO_FLUSH: nvme_ns_flush_cmd(&cmd, ns->id); break; case BIO_DELETE: dsm_range = malloc(sizeof(*dsm_range), M_NVMF, M_NOWAIT | M_ZERO); if (dsm_range == NULL) return (ENOMEM); lba = bio->bio_offset / ns->lba_size; lba_count = bio->bio_bcount / ns->lba_size; dsm_range->starting_lba = htole64(lba); dsm_range->length = htole32(lba_count); cmd.opc = NVME_OPC_DATASET_MANAGEMENT; cmd.nsid = htole32(ns->id); cmd.cdw10 = 
htole32(0); /* 1 range */ cmd.cdw11 = htole32(NVME_DSM_ATTR_DEALLOCATE); break; default: return (EOPNOTSUPP); } mtx_lock(&ns->lock); if (ns->disconnected) { TAILQ_INSERT_TAIL(&ns->pending_bios, bio, bio_queue); mtx_unlock(&ns->lock); free(dsm_range, M_NVMF); return (0); } req = nvmf_allocate_request(nvmf_select_io_queue(ns->sc), &cmd, nvmf_ns_bio_complete, bio, M_NOWAIT); if (req == NULL) { mtx_unlock(&ns->lock); free(dsm_range, M_NVMF); return (ENOMEM); } switch (bio->bio_cmd) { case BIO_READ: case BIO_WRITE: refcount_init(bio_refs(bio), 2); mem = memdesc_bio(bio); nvmf_capsule_append_data(req->nc, &mem, bio->bio_bcount, bio->bio_cmd == BIO_WRITE, nvmf_ns_io_complete, bio); break; case BIO_DELETE: refcount_init(bio_refs(bio), 2); mem = memdesc_vaddr(dsm_range, sizeof(*dsm_range)); nvmf_capsule_append_data(req->nc, &mem, sizeof(*dsm_range), true, nvmf_ns_delete_complete, bio); bio->bio_driver2 = dsm_range; break; default: refcount_init(bio_refs(bio), 1); KASSERT(bio->bio_resid == 0, ("%s: input bio_resid != 0", __func__)); break; } refcount_acquire(&ns->active_bios); nvmf_submit_request(req); mtx_unlock(&ns->lock); return (0); } static int nvmf_ns_ioctl(struct cdev *dev, u_long cmd, caddr_t arg, int flag, struct thread *td) { struct nvmf_namespace *ns = dev->si_drv1; struct nvme_get_nsid *gnsid; struct nvme_pt_command *pt; switch (cmd) { case NVME_PASSTHROUGH_CMD: pt = (struct nvme_pt_command *)arg; pt->cmd.nsid = htole32(ns->id); return (nvmf_passthrough_cmd(ns->sc, pt, false)); case NVME_GET_NSID: gnsid = (struct nvme_get_nsid *)arg; - strncpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), + strlcpy(gnsid->cdev, device_get_nameunit(ns->sc->dev), sizeof(gnsid->cdev)); - gnsid->cdev[sizeof(gnsid->cdev) - 1] = '\0'; gnsid->nsid = ns->id; return (0); case DIOCGMEDIASIZE: *(off_t *)arg = ns->size; return (0); case DIOCGSECTORSIZE: *(u_int *)arg = ns->lba_size; return (0); default: return (ENOTTY); } } static int nvmf_ns_open(struct cdev *dev, int oflags, int devtype, 
struct thread *td) { int error; error = 0; if ((oflags & FWRITE) != 0) error = securelevel_gt(td->td_ucred, 0); return (error); } void nvmf_ns_strategy(struct bio *bio) { struct nvmf_namespace *ns; int error; ns = bio->bio_dev->si_drv1; error = nvmf_ns_submit_bio(ns, bio); if (error != 0) { bio->bio_error = error; bio->bio_flags |= BIO_ERROR; bio->bio_resid = bio->bio_bcount; biodone(bio); } } static struct cdevsw nvmf_ns_cdevsw = { .d_version = D_VERSION, .d_flags = D_DISK, .d_open = nvmf_ns_open, .d_read = physread, .d_write = physwrite, .d_strategy = nvmf_ns_strategy, .d_ioctl = nvmf_ns_ioctl }; struct nvmf_namespace * nvmf_init_ns(struct nvmf_softc *sc, uint32_t id, struct nvme_namespace_data *data) { struct make_dev_args mda; struct nvmf_namespace *ns; int error; uint8_t lbads, lbaf; ns = malloc(sizeof(*ns), M_NVMF, M_WAITOK | M_ZERO); ns->sc = sc; ns->id = id; TAILQ_INIT(&ns->pending_bios); mtx_init(&ns->lock, "nvmf ns", NULL, MTX_DEF); /* One dummy bio avoids dropping to 0 until destroy. */ refcount_init(&ns->active_bios, 1); if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { ns_printf(ns, "End-to-end data protection not supported\n"); goto fail; } lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); if (lbaf > data->nlbaf) { ns_printf(ns, "Invalid LBA format index\n"); goto fail; } if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { ns_printf(ns, "Namespaces with metadata are not supported\n"); goto fail; } lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); if (lbads == 0) { ns_printf(ns, "Invalid LBA format index\n"); goto fail; } ns->lba_size = 1 << lbads; ns->size = data->nsze * ns->lba_size; if (nvme_ctrlr_has_dataset_mgmt(sc->cdata)) ns->flags |= NVME_NS_DEALLOCATE_SUPPORTED; if (NVMEV(NVME_CTRLR_DATA_VWC_PRESENT, sc->cdata->vwc) != 0) ns->flags |= NVME_NS_FLUSH_SUPPORTED; /* * XXX: Does any of the boundary splitting for NOIOB make any * sense for Fabrics? 
*/ make_dev_args_init(&mda); mda.mda_devsw = &nvmf_ns_cdevsw; mda.mda_uid = UID_ROOT; mda.mda_gid = GID_WHEEL; mda.mda_mode = 0600; mda.mda_si_drv1 = ns; error = make_dev_s(&mda, &ns->cdev, "%sns%u", device_get_nameunit(sc->dev), id); if (error != 0) goto fail; ns->cdev->si_flags |= SI_UNMAPPED; return (ns); fail: mtx_destroy(&ns->lock); free(ns, M_NVMF); return (NULL); } void nvmf_disconnect_ns(struct nvmf_namespace *ns) { mtx_lock(&ns->lock); ns->disconnected = true; mtx_unlock(&ns->lock); } void nvmf_reconnect_ns(struct nvmf_namespace *ns) { TAILQ_HEAD(, bio) bios; struct bio *bio; mtx_lock(&ns->lock); ns->disconnected = false; TAILQ_INIT(&bios); TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); mtx_unlock(&ns->lock); while (!TAILQ_EMPTY(&bios)) { bio = TAILQ_FIRST(&bios); TAILQ_REMOVE(&bios, bio, bio_queue); nvmf_ns_strategy(bio); } } void nvmf_destroy_ns(struct nvmf_namespace *ns) { TAILQ_HEAD(, bio) bios; struct bio *bio; destroy_dev(ns->cdev); /* * Wait for active I/O requests to drain. The release drops * the reference on the "dummy bio" when the namespace is * created. */ mtx_lock(&ns->lock); if (!refcount_release(&ns->active_bios)) { while (ns->active_bios != 0) mtx_sleep(ns, &ns->lock, 0, "nvmfrmns", 0); } /* Abort any pending I/O requests. 
*/ TAILQ_INIT(&bios); TAILQ_CONCAT(&bios, &ns->pending_bios, bio_queue); mtx_unlock(&ns->lock); while (!TAILQ_EMPTY(&bios)) { bio = TAILQ_FIRST(&bios); TAILQ_REMOVE(&bios, bio, bio_queue); bio->bio_error = ECONNABORTED; bio->bio_flags |= BIO_ERROR; bio->bio_resid = bio->bio_bcount; biodone(bio); } mtx_destroy(&ns->lock); free(ns, M_NVMF); } bool nvmf_update_ns(struct nvmf_namespace *ns, struct nvme_namespace_data *data) { uint8_t lbads, lbaf; if (NVMEV(NVME_NS_DATA_DPS_PIT, data->dps) != 0) { ns_printf(ns, "End-to-end data protection not supported\n"); return (false); } lbaf = NVMEV(NVME_NS_DATA_FLBAS_FORMAT, data->flbas); if (lbaf > data->nlbaf) { ns_printf(ns, "Invalid LBA format index\n"); return (false); } if (NVMEV(NVME_NS_DATA_LBAF_MS, data->lbaf[lbaf]) != 0) { ns_printf(ns, "Namespaces with metadata are not supported\n"); return (false); } lbads = NVMEV(NVME_NS_DATA_LBAF_LBADS, data->lbaf[lbaf]); if (lbads == 0) { ns_printf(ns, "Invalid LBA format index\n"); return (false); } ns->lba_size = 1 << lbads; ns->size = data->nsze * ns->lba_size; return (true); }