Index: head/share/man/man4/nvme.4 =================================================================== --- head/share/man/man4/nvme.4 (revision 350117) +++ head/share/man/man4/nvme.4 (revision 350118) @@ -1,221 +1,232 @@ .\" .\" Copyright (c) 2012-2016 Intel Corporation .\" All rights reserved. .\" .\" Redistribution and use in source and binary forms, with or without .\" modification, are permitted provided that the following conditions .\" are met: .\" 1. Redistributions of source code must retain the above copyright .\" notice, this list of conditions, and the following disclaimer, .\" without modification. .\" 2. Redistributions in binary form must reproduce at minimum a disclaimer .\" substantially similar to the "NO WARRANTY" disclaimer below .\" ("Disclaimer") and any redistribution must be conditioned upon .\" including a substantially similar Disclaimer requirement for further .\" binary redistribution. .\" .\" NO WARRANTY .\" THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS .\" "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT .\" LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR .\" A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT .\" HOLDERS OR CONTRIBUTORS BE LIABLE FOR SPECIAL, EXEMPLARY, OR CONSEQUENTIAL .\" DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS .\" OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) .\" HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, .\" STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING .\" IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE .\" POSSIBILITY OF SUCH DAMAGES. .\" .\" nvme driver man page. .\" .\" Author: Jim Harris .\" .\" $FreeBSD$ .\" .Dd May 18, 2019 .Dt NVME 4 .Os .Sh NAME .Nm nvme .Nd NVM Express core driver .Sh SYNOPSIS To compile this driver into your kernel, place the following line in your kernel configuration file: .Bd -ragged -offset indent .Cd "device nvme" .Ed .Pp Or, to load the driver as a module at boot, place the following line in .Xr loader.conf 5 : .Bd -literal -offset indent nvme_load="YES" .Ed .Pp Most users will also want to enable .Xr nvd 4 to expose NVM Express namespaces as disk devices which can be partitioned. Note that in NVM Express terms, a namespace is roughly equivalent to a SCSI LUN. .Sh DESCRIPTION The .Nm driver provides support for NVM Express (NVMe) controllers, such as: .Bl -bullet .It Hardware initialization .It Per-CPU IO queue pairs .It API for registering NVMe namespace consumers such as .Xr nvd 4 or .Xr nda 4 .It API for submitting NVM commands to namespaces .It Ioctls for controller and namespace configuration and management .El .Pp The .Nm driver creates controller device nodes in the format .Pa /dev/nvmeX and namespace device nodes in the format .Pa /dev/nvmeXnsY . Note that the NVM Express specification starts numbering namespaces at 1, not 0, and this driver follows that convention. .Sh CONFIGURATION By default, .Nm will create an I/O queue pair for each CPU, provided enough MSI-X vectors and NVMe queue pairs can be allocated. If not enough vectors or queue pairs are available, nvme(4) will use a smaller number of queue pairs and assign multiple CPUs per queue pair. .Pp To force a single I/O queue pair shared by all CPUs, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.per_cpu_io_queues=0 .Ed .Pp To assign more than one CPU per I/O queue pair, thereby reducing the number of MSI-X vectors consumed by the device, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.min_cpus_per_ioq=X .Ed .Pp To force legacy interrupts for all .Nm driver instances, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.force_intx=1 .Ed .Pp Note that use of INTx implies disabling of per-CPU I/O queue pairs. .Pp The .Xr nvd 4 driver is used to provide a disk driver to the system by default. The .Xr nda 4 driver can also be used instead. The .Xr nvd 4 driver performs better with smaller transactions and few TRIM commands. It sends all commands directly to the drive immediately. The .Xr nda 4 driver performs better with larger transactions and also collapses TRIM commands giving better performance. It can queue commands to the drive; combine .Dv BIO_DELETE commands into a single trip; and use the CAM I/O scheduler to bias one type of operation over another. To select the .Xr nda 4 driver, set the following tunable value in .Xr loader.conf 5 : .Bd -literal -offset indent hw.nvme.use_nvd=0 .Ed .Pp This value may also be set in the kernel config file with .Bd -literal -offset indent .Cd options NVME_USE_NVD=0 .Ed +.Pp +When there is an error, +.Nm +prints only the most relevant information about the command by default. +To enable dumping of all information about the command, set the following tunable +value in +.Xr loader.conf 5 : +.Bd -literal -offset indent +hw.nvme.verbose_cmd_dump=1 +.Ed +.Pp .Sh SYSCTL VARIABLES The following controller-level sysctls are currently implemented: .Bl -tag -width indent .It Va dev.nvme.0.num_cpus_per_ioq (R) Number of CPUs associated with each I/O queue pair. .It Va dev.nvme.0.int_coal_time (R/W) Interrupt coalescing timer period in microseconds. Set to 0 to disable. .It Va dev.nvme.0.int_coal_threshold (R/W) Interrupt coalescing threshold in number of command completions. Set to 0 to disable. .El .Pp The following queue pair-level sysctls are currently implemented. Admin queue sysctls take the format of dev.nvme.0.adminq and I/O queue sysctls take the format of dev.nvme.0.ioq0. .Bl -tag -width indent .It Va dev.nvme.0.ioq0.num_entries (R) Number of entries in this queue pair's command and completion queue. .It Va dev.nvme.0.ioq0.num_tr (R) Number of nvme_tracker structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.num_prp_list (R) Number of nvme_prp_list structures currently allocated for this queue pair. .It Va dev.nvme.0.ioq0.sq_head (R) Current location of the submission queue head pointer as observed by the driver. The head pointer is incremented by the controller as it takes commands off of the submission queue. .It Va dev.nvme.0.ioq0.sq_tail (R) Current location of the submission queue tail pointer as observed by the driver. The driver increments the tail pointer after writing a command into the submission queue to signal that a new command is ready to be processed. .It Va dev.nvme.0.ioq0.cq_head (R) Current location of the completion queue head pointer as observed by the driver. The driver increments the head pointer after finishing with a completion entry that was posted by the controller. .It Va dev.nvme.0.ioq0.num_cmds (R) Number of commands that have been submitted on this queue pair. .It Va dev.nvme.0.ioq0.dump_debug (W) Writing 1 to this sysctl will dump the full contents of the submission and completion queues to the console. .El .Sh SEE ALSO .Xr nda 4 , .Xr nvd 4 , .Xr pci 4 , .Xr nvmecontrol 8 , .Xr disk 9 .Sh HISTORY The .Nm driver first appeared in .Fx 9.2 . .Sh AUTHORS .An -nosplit The .Nm driver was developed by Intel and originally written by .An Jim Harris Aq Mt jimharris@FreeBSD.org , with contributions from .An Joe Golio at EMC. .Pp This man page was written by .An Jim Harris Aq Mt jimharris@FreeBSD.org . Index: head/sys/dev/nvme/nvme.c =================================================================== --- head/sys/dev/nvme/nvme.c (revision 350117) +++ head/sys/dev/nvme/nvme.c (revision 350118) @@ -1,492 +1,495 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include #include #include "nvme_private.h" struct nvme_consumer { uint32_t id; nvme_cons_ns_fn_t ns_fn; nvme_cons_ctrlr_fn_t ctrlr_fn; nvme_cons_async_fn_t async_fn; nvme_cons_fail_fn_t fail_fn; }; struct nvme_consumer nvme_consumer[NVME_MAX_CONSUMERS]; #define INVALID_CONSUMER_ID 0xFFFF uma_zone_t nvme_request_zone; int32_t nvme_retry_count; +int nvme_verbose_cmd_dump; + +TUNABLE_INT("hw.nvme.verbose_cmd_dump", &nvme_verbose_cmd_dump); MALLOC_DEFINE(M_NVME, "nvme", "nvme(4) memory allocations"); static int nvme_probe(device_t); static int nvme_attach(device_t); static int nvme_detach(device_t); static int nvme_shutdown(device_t); static devclass_t nvme_devclass; static device_method_t nvme_pci_methods[] = { /* Device interface */ DEVMETHOD(device_probe, nvme_probe), DEVMETHOD(device_attach, nvme_attach), DEVMETHOD(device_detach, nvme_detach), DEVMETHOD(device_shutdown, nvme_shutdown), { 0, 0 } }; static driver_t nvme_pci_driver = { "nvme", nvme_pci_methods, sizeof(struct nvme_controller), }; DRIVER_MODULE(nvme, pci, nvme_pci_driver, nvme_devclass, NULL, NULL); MODULE_VERSION(nvme, 1); MODULE_DEPEND(nvme, cam, 1, 1, 1); static struct _pcsid { uint32_t devid; int match_subdevice; uint16_t subdevice; const char *desc; uint32_t quirks; } pci_ids[] = { { 0x01118086, 0, 0, "NVMe Controller" }, { IDT32_PCI_ID, 0, 0, "IDT NVMe Controller (32 channel)" }, { IDT8_PCI_ID, 0, 0, "IDT NVMe Controller (8 channel)" }, { 0x09538086, 1, 0x3702, "DC P3700 SSD" }, { 0x09538086, 1, 0x3703, "DC P3700 SSD [2.5\" SFF]" }, { 0x09538086, 1, 0x3704, "DC P3500 SSD [Add-in Card]" }, { 0x09538086, 1, 0x3705, "DC P3500 SSD [2.5\" SFF]" }, { 0x09538086, 1, 0x3709, "DC P3600 SSD [Add-in Card]" }, { 0x09538086, 1, 0x370a, "DC P3600 SSD [2.5\" SFF]" }, { 0x00031c58, 0, 0, "HGST SN100", QUIRK_DELAY_B4_CHK_RDY }, { 0x00231c58, 0, 0, "WDC SN200", QUIRK_DELAY_B4_CHK_RDY }, { 0x05401c5f, 0, 0, "Memblaze Pblaze4", QUIRK_DELAY_B4_CHK_RDY }, { 0xa821144d, 0, 0, "Samsung PM1725", QUIRK_DELAY_B4_CHK_RDY }, { 0xa822144d, 0, 0, "Samsung PM1725a", QUIRK_DELAY_B4_CHK_RDY }, { 0x01161179, 0, 0, "Toshiba XG5", QUIRK_DISABLE_TIMEOUT }, { 0x00000000, 0, 0, NULL } }; static int nvme_match(uint32_t devid, uint16_t subdevice, struct _pcsid *ep) { if (devid != ep->devid) return 0; if (!ep->match_subdevice) return 1; if (subdevice == ep->subdevice) return 1; else return 0; } static int nvme_probe (device_t device) { struct _pcsid *ep; uint32_t devid; uint16_t subdevice; devid = pci_get_devid(device); subdevice = pci_get_subdevice(device); ep = pci_ids; while (ep->devid) { if (nvme_match(devid, subdevice, ep)) break; ++ep; } if (ep->desc) { device_set_desc(device, ep->desc); return (BUS_PROBE_DEFAULT); } #if defined(PCIS_STORAGE_NVM) if (pci_get_class(device) == PCIC_STORAGE && pci_get_subclass(device) == PCIS_STORAGE_NVM && pci_get_progif(device) == PCIP_STORAGE_NVM_ENTERPRISE_NVMHCI_1_0) { device_set_desc(device, "Generic NVMe Device"); return (BUS_PROBE_GENERIC); } #endif return (ENXIO); } static void nvme_init(void) { uint32_t i; nvme_request_zone = uma_zcreate("nvme_request", sizeof(struct nvme_request), NULL, NULL, NULL, NULL, 0, 0); for (i = 0; i < NVME_MAX_CONSUMERS; i++) nvme_consumer[i].id = INVALID_CONSUMER_ID; } SYSINIT(nvme_register, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_init, NULL); static void nvme_uninit(void) { uma_zdestroy(nvme_request_zone); } SYSUNINIT(nvme_unregister, SI_SUB_DRIVERS, SI_ORDER_SECOND, nvme_uninit, NULL); static int nvme_shutdown(device_t dev) { struct nvme_controller *ctrlr; ctrlr = DEVICE2SOFTC(dev); nvme_ctrlr_shutdown(ctrlr); return (0); } void nvme_dump_command(struct nvme_command *cmd) { printf( "opc:%x f:%x cid:%x nsid:%x r2:%x r3:%x mptr:%jx prp1:%jx prp2:%jx cdw:%x %x %x %x %x %x\n", cmd->opc, cmd->fuse, cmd->cid, le32toh(cmd->nsid), cmd->rsvd2, cmd->rsvd3, (uintmax_t)le64toh(cmd->mptr), (uintmax_t)le64toh(cmd->prp1), (uintmax_t)le64toh(cmd->prp2), le32toh(cmd->cdw10), le32toh(cmd->cdw11), le32toh(cmd->cdw12), le32toh(cmd->cdw13), le32toh(cmd->cdw14), le32toh(cmd->cdw15)); } void nvme_dump_completion(struct nvme_completion *cpl) { uint8_t p, sc, sct, m, dnr; uint16_t status; status = le16toh(cpl->status); p = NVME_STATUS_GET_P(status); sc = NVME_STATUS_GET_SC(status); sct = NVME_STATUS_GET_SCT(status); m = NVME_STATUS_GET_M(status); dnr = NVME_STATUS_GET_DNR(status); printf("cdw0:%08x sqhd:%04x sqid:%04x " "cid:%04x p:%x sc:%02x sct:%x m:%x dnr:%x\n", le32toh(cpl->cdw0), le16toh(cpl->sqhd), le16toh(cpl->sqid), cpl->cid, p, sc, sct, m, dnr); } static int nvme_attach(device_t dev) { struct nvme_controller *ctrlr = DEVICE2SOFTC(dev); int status; struct _pcsid *ep; uint32_t devid; uint16_t subdevice; devid = pci_get_devid(dev); subdevice = pci_get_subdevice(dev); ep = pci_ids; while (ep->devid) { if (nvme_match(devid, subdevice, ep)) break; ++ep; } ctrlr->quirks = ep->quirks; status = nvme_ctrlr_construct(ctrlr, dev); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } /* * Some drives do not implement the completion timeout feature * correctly. There's a WAR from the manufacturer to just disable it. * The driver wouldn't respond correctly to a timeout anyway. */ if (ep->quirks & QUIRK_DISABLE_TIMEOUT) { int ptr; uint16_t devctl2; status = pci_find_cap(dev, PCIY_EXPRESS, &ptr); if (status) { device_printf(dev, "Can't locate PCIe capability?"); return (status); } devctl2 = pci_read_config(dev, ptr + PCIER_DEVICE_CTL2, sizeof(devctl2)); devctl2 |= PCIEM_CTL2_COMP_TIMO_DISABLE; pci_write_config(dev, ptr + PCIER_DEVICE_CTL2, devctl2, sizeof(devctl2)); } /* * Enable busmastering so the completion status messages can * be busmastered back to the host. */ pci_enable_busmaster(dev); /* * Reset controller twice to ensure we do a transition from cc.en==1 * to cc.en==0. This is because we don't really know what status * the controller was left in when boot handed off to OS. */ status = nvme_ctrlr_hw_reset(ctrlr); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } status = nvme_ctrlr_hw_reset(ctrlr); if (status != 0) { nvme_ctrlr_destruct(ctrlr, dev); return (status); } ctrlr->config_hook.ich_func = nvme_ctrlr_start_config_hook; ctrlr->config_hook.ich_arg = ctrlr; config_intrhook_establish(&ctrlr->config_hook); return (0); } static int nvme_detach (device_t dev) { struct nvme_controller *ctrlr = DEVICE2SOFTC(dev); nvme_ctrlr_destruct(ctrlr, dev); pci_disable_busmaster(dev); return (0); } static void nvme_notify(struct nvme_consumer *cons, struct nvme_controller *ctrlr) { struct nvme_namespace *ns; void *ctrlr_cookie; int cmpset, ns_idx; /* * The consumer may register itself after the nvme devices * have registered with the kernel, but before the * driver has completed initialization. In that case, * return here, and when initialization completes, the * controller will make sure the consumer gets notified. */ if (!ctrlr->is_initialized) return; cmpset = atomic_cmpset_32(&ctrlr->notification_sent, 0, 1); if (cmpset == 0) return; if (cons->ctrlr_fn != NULL) ctrlr_cookie = (*cons->ctrlr_fn)(ctrlr); else ctrlr_cookie = NULL; ctrlr->cons_cookie[cons->id] = ctrlr_cookie; if (ctrlr->is_failed) { if (cons->fail_fn != NULL) (*cons->fail_fn)(ctrlr_cookie); /* * Do not notify consumers about the namespaces of a * failed controller. */ return; } for (ns_idx = 0; ns_idx < min(ctrlr->cdata.nn, NVME_MAX_NAMESPACES); ns_idx++) { ns = &ctrlr->ns[ns_idx]; if (ns->data.nsze == 0) continue; if (cons->ns_fn != NULL) ns->cons_cookie[cons->id] = (*cons->ns_fn)(ns, ctrlr_cookie); } } void nvme_notify_new_controller(struct nvme_controller *ctrlr) { int i; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { if (nvme_consumer[i].id != INVALID_CONSUMER_ID) { nvme_notify(&nvme_consumer[i], ctrlr); } } } static void nvme_notify_new_consumer(struct nvme_consumer *cons) { device_t *devlist; struct nvme_controller *ctrlr; int dev_idx, devcount; if (devclass_get_devices(nvme_devclass, &devlist, &devcount)) return; for (dev_idx = 0; dev_idx < devcount; dev_idx++) { ctrlr = DEVICE2SOFTC(devlist[dev_idx]); nvme_notify(cons, ctrlr); } free(devlist, M_TEMP); } void nvme_notify_async_consumers(struct nvme_controller *ctrlr, const struct nvme_completion *async_cpl, uint32_t log_page_id, void *log_page_buffer, uint32_t log_page_size) { struct nvme_consumer *cons; uint32_t i; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->async_fn != NULL) (*cons->async_fn)(ctrlr->cons_cookie[i], async_cpl, log_page_id, log_page_buffer, log_page_size); } } void nvme_notify_fail_consumers(struct nvme_controller *ctrlr) { struct nvme_consumer *cons; uint32_t i; /* * This controller failed during initialization (i.e. IDENTIFY * command failed or timed out). Do not notify any nvme * consumers of the failure here, since the consumer does not * even know about the controller yet. */ if (!ctrlr->is_initialized) return; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->fail_fn != NULL) cons->fail_fn(ctrlr->cons_cookie[i]); } } void nvme_notify_ns(struct nvme_controller *ctrlr, int nsid) { struct nvme_consumer *cons; struct nvme_namespace *ns = &ctrlr->ns[nsid - 1]; uint32_t i; if (!ctrlr->is_initialized) return; for (i = 0; i < NVME_MAX_CONSUMERS; i++) { cons = &nvme_consumer[i]; if (cons->id != INVALID_CONSUMER_ID && cons->ns_fn != NULL) ns->cons_cookie[cons->id] = (*cons->ns_fn)(ns, ctrlr->cons_cookie[cons->id]); } } struct nvme_consumer * nvme_register_consumer(nvme_cons_ns_fn_t ns_fn, nvme_cons_ctrlr_fn_t ctrlr_fn, nvme_cons_async_fn_t async_fn, nvme_cons_fail_fn_t fail_fn) { int i; /* * TODO: add locking around consumer registration. */ for (i = 0; i < NVME_MAX_CONSUMERS; i++) if (nvme_consumer[i].id == INVALID_CONSUMER_ID) { nvme_consumer[i].id = i; nvme_consumer[i].ns_fn = ns_fn; nvme_consumer[i].ctrlr_fn = ctrlr_fn; nvme_consumer[i].async_fn = async_fn; nvme_consumer[i].fail_fn = fail_fn; nvme_notify_new_consumer(&nvme_consumer[i]); return (&nvme_consumer[i]); } printf("nvme(4): consumer not registered - no slots available\n"); return (NULL); } void nvme_unregister_consumer(struct nvme_consumer *consumer) { consumer->id = INVALID_CONSUMER_ID; } void nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl) { struct nvme_completion_poll_status *status = arg; /* * Copy status into the argument passed by the caller, so that * the caller can check the status to determine if the * the request passed or failed. */ memcpy(&status->cpl, cpl, sizeof(*cpl)); atomic_store_rel_int(&status->done, 1); } Index: head/sys/dev/nvme/nvme_private.h =================================================================== --- head/sys/dev/nvme/nvme_private.h (revision 350117) +++ head/sys/dev/nvme/nvme_private.h (revision 350118) @@ -1,529 +1,530 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. * * $FreeBSD$ */ #ifndef __NVME_PRIVATE_H__ #define __NVME_PRIVATE_H__ #include #include #include #include #include #include #include #include #include #include #include #include #include "nvme.h" #define DEVICE2SOFTC(dev) ((struct nvme_controller *) device_get_softc(dev)) MALLOC_DECLARE(M_NVME); #define IDT32_PCI_ID 0x80d0111d /* 32 channel board */ #define IDT8_PCI_ID 0x80d2111d /* 8 channel board */ /* * For commands requiring more than 2 PRP entries, one PRP will be * embedded in the command (prp1), and the rest of the PRP entries * will be in a list pointed to by the command (prp2). This means * that real max number of PRP entries we support is 32+1, which * results in a max xfer size of 32*PAGE_SIZE. */ #define NVME_MAX_PRP_LIST_ENTRIES (NVME_MAX_XFER_SIZE / PAGE_SIZE) #define NVME_ADMIN_TRACKERS (16) #define NVME_ADMIN_ENTRIES (128) /* min and max are defined in admin queue attributes section of spec */ #define NVME_MIN_ADMIN_ENTRIES (2) #define NVME_MAX_ADMIN_ENTRIES (4096) /* * NVME_IO_ENTRIES defines the size of an I/O qpair's submission and completion * queues, while NVME_IO_TRACKERS defines the maximum number of I/O that we * will allow outstanding on an I/O qpair at any time. The only advantage in * having IO_ENTRIES > IO_TRACKERS is for debugging purposes - when dumping * the contents of the submission and completion queues, it will show a longer * history of data. */ #define NVME_IO_ENTRIES (256) #define NVME_IO_TRACKERS (128) #define NVME_MIN_IO_TRACKERS (4) #define NVME_MAX_IO_TRACKERS (1024) /* * NVME_MAX_IO_ENTRIES is not defined, since it is specified in CC.MQES * for each controller. */ #define NVME_INT_COAL_TIME (0) /* disabled */ #define NVME_INT_COAL_THRESHOLD (0) /* 0-based */ #define NVME_MAX_NAMESPACES (16) #define NVME_MAX_CONSUMERS (2) #define NVME_MAX_ASYNC_EVENTS (8) #define NVME_DEFAULT_TIMEOUT_PERIOD (30) /* in seconds */ #define NVME_MIN_TIMEOUT_PERIOD (5) #define NVME_MAX_TIMEOUT_PERIOD (120) #define NVME_DEFAULT_RETRY_COUNT (4) /* Maximum log page size to fetch for AERs. */ #define NVME_MAX_AER_LOG_SIZE (4096) /* * Define CACHE_LINE_SIZE here for older FreeBSD versions that do not define * it. */ #ifndef CACHE_LINE_SIZE #define CACHE_LINE_SIZE (64) #endif extern uma_zone_t nvme_request_zone; extern int32_t nvme_retry_count; +extern int32_t nvme_verbose_cmd_dump; struct nvme_completion_poll_status { struct nvme_completion cpl; int done; }; #define NVME_REQUEST_VADDR 1 #define NVME_REQUEST_NULL 2 /* For requests with no payload. */ #define NVME_REQUEST_UIO 3 #define NVME_REQUEST_BIO 4 #define NVME_REQUEST_CCB 5 struct nvme_request { struct nvme_command cmd; struct nvme_qpair *qpair; union { void *payload; struct bio *bio; } u; uint32_t type; uint32_t payload_size; boolean_t timeout; nvme_cb_fn_t cb_fn; void *cb_arg; int32_t retries; STAILQ_ENTRY(nvme_request) stailq; }; struct nvme_async_event_request { struct nvme_controller *ctrlr; struct nvme_request *req; struct nvme_completion cpl; uint32_t log_page_id; uint32_t log_page_size; uint8_t log_page_buffer[NVME_MAX_AER_LOG_SIZE]; }; struct nvme_tracker { TAILQ_ENTRY(nvme_tracker) tailq; struct nvme_request *req; struct nvme_qpair *qpair; struct callout timer; bus_dmamap_t payload_dma_map; uint16_t cid; uint64_t *prp; bus_addr_t prp_bus_addr; }; struct nvme_qpair { struct nvme_controller *ctrlr; uint32_t id; uint32_t phase; uint16_t vector; int rid; struct resource *res; void *tag; uint32_t num_entries; uint32_t num_trackers; uint32_t sq_tdbl_off; uint32_t cq_hdbl_off; uint32_t sq_head; uint32_t sq_tail; uint32_t cq_head; int64_t num_cmds; int64_t num_intr_handler_calls; struct nvme_command *cmd; struct nvme_completion *cpl; bus_dma_tag_t dma_tag; bus_dma_tag_t dma_tag_payload; bus_dmamap_t queuemem_map; uint64_t cmd_bus_addr; uint64_t cpl_bus_addr; TAILQ_HEAD(, nvme_tracker) free_tr; TAILQ_HEAD(, nvme_tracker) outstanding_tr; STAILQ_HEAD(, nvme_request) queued_req; struct nvme_tracker **act_tr; boolean_t is_enabled; struct mtx lock __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); struct nvme_namespace { struct nvme_controller *ctrlr; struct nvme_namespace_data data; uint32_t id; uint32_t flags; struct cdev *cdev; void *cons_cookie[NVME_MAX_CONSUMERS]; uint32_t stripesize; struct mtx lock; }; /* * One of these per allocated PCI device. */ struct nvme_controller { device_t dev; struct mtx lock; uint32_t ready_timeout_in_ms; uint32_t quirks; #define QUIRK_DELAY_B4_CHK_RDY 1 /* Can't touch MMIO on disable */ #define QUIRK_DISABLE_TIMEOUT 2 /* Disable broken completion timeout feature */ bus_space_tag_t bus_tag; bus_space_handle_t bus_handle; int resource_id; struct resource *resource; /* * The NVMe spec allows for the MSI-X table to be placed in BAR 4/5, * separate from the control registers which are in BAR 0/1. These * members track the mapping of BAR 4/5 for that reason. */ int bar4_resource_id; struct resource *bar4_resource; uint32_t msix_enabled; uint32_t force_intx; uint32_t enable_aborts; uint32_t num_io_queues; uint32_t num_cpus_per_ioq; uint32_t max_hw_pend_io; /* Fields for tracking progress during controller initialization. */ struct intr_config_hook config_hook; uint32_t ns_identified; uint32_t queues_created; struct task reset_task; struct task fail_req_task; struct taskqueue *taskqueue; /* For shared legacy interrupt. */ int rid; struct resource *res; void *tag; bus_dma_tag_t hw_desc_tag; bus_dmamap_t hw_desc_map; /** maximum i/o size in bytes */ uint32_t max_xfer_size; /** minimum page size supported by this controller in bytes */ uint32_t min_page_size; /** interrupt coalescing time period (in microseconds) */ uint32_t int_coal_time; /** interrupt coalescing threshold */ uint32_t int_coal_threshold; /** timeout period in seconds */ uint32_t timeout_period; struct nvme_qpair adminq; struct nvme_qpair *ioq; struct nvme_registers *regs; struct nvme_controller_data cdata; struct nvme_namespace ns[NVME_MAX_NAMESPACES]; struct cdev *cdev; /** bit mask of event types currently enabled for async events */ uint32_t async_event_config; uint32_t num_aers; struct nvme_async_event_request aer[NVME_MAX_ASYNC_EVENTS]; void *cons_cookie[NVME_MAX_CONSUMERS]; uint32_t is_resetting; uint32_t is_initialized; uint32_t notification_sent; boolean_t is_failed; STAILQ_HEAD(, nvme_request) fail_req; }; #define nvme_mmio_offsetof(reg) \ offsetof(struct nvme_registers, reg) #define nvme_mmio_read_4(sc, reg) \ bus_space_read_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg)) #define nvme_mmio_write_4(sc, reg, val) \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg), val) #define nvme_mmio_write_8(sc, reg, val) \ do { \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg), val & 0xFFFFFFFF); \ bus_space_write_4((sc)->bus_tag, (sc)->bus_handle, \ nvme_mmio_offsetof(reg)+4, \ (val & 0xFFFFFFFF00000000ULL) >> 32); \ } while (0); #define nvme_printf(ctrlr, fmt, args...) \ device_printf(ctrlr->dev, fmt, ##args) void nvme_ns_test(struct nvme_namespace *ns, u_long cmd, caddr_t arg); void nvme_ctrlr_cmd_identify_controller(struct nvme_controller *ctrlr, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_identify_namespace(struct nvme_controller *ctrlr, uint32_t nsid, void *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_interrupt_coalescing(struct nvme_controller *ctrlr, uint32_t microseconds, uint32_t threshold, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_error_page(struct nvme_controller *ctrlr, struct nvme_error_information_entry *payload, uint32_t num_entries, /* 0 = max */ nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_health_information_page(struct nvme_controller *ctrlr, uint32_t nsid, struct nvme_health_information_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_get_firmware_page(struct nvme_controller *ctrlr, struct nvme_firmware_page *payload, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, uint16_t vector, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_create_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_delete_io_cq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_delete_io_sq(struct nvme_controller *ctrlr, struct nvme_qpair *io_que, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_num_queues(struct nvme_controller *ctrlr, uint32_t num_queues, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_set_async_event_config(struct nvme_controller *ctrlr, uint32_t state, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_ctrlr_cmd_abort(struct nvme_controller *ctrlr, uint16_t cid, uint16_t sqid, nvme_cb_fn_t cb_fn, void *cb_arg); void nvme_completion_poll_cb(void *arg, const struct nvme_completion *cpl); int nvme_ctrlr_construct(struct nvme_controller *ctrlr, device_t dev); void nvme_ctrlr_destruct(struct nvme_controller *ctrlr, device_t dev); void nvme_ctrlr_shutdown(struct nvme_controller *ctrlr); int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr); void nvme_ctrlr_reset(struct nvme_controller *ctrlr); /* ctrlr defined as void * to allow use with config_intrhook. */ void nvme_ctrlr_start_config_hook(void *ctrlr_arg); void nvme_ctrlr_submit_admin_request(struct nvme_controller *ctrlr, struct nvme_request *req); void nvme_ctrlr_submit_io_request(struct nvme_controller *ctrlr, struct nvme_request *req); void nvme_ctrlr_post_failed_request(struct nvme_controller *ctrlr, struct nvme_request *req); int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, uint16_t vector, uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr); void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr); bool nvme_qpair_process_completions(struct nvme_qpair *qpair); void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); void nvme_qpair_reset(struct nvme_qpair *qpair); void nvme_qpair_fail(struct nvme_qpair *qpair); void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, struct nvme_request *req, uint32_t sct, uint32_t sc); void nvme_admin_qpair_enable(struct nvme_qpair *qpair); void nvme_admin_qpair_disable(struct nvme_qpair *qpair); void nvme_admin_qpair_destroy(struct nvme_qpair *qpair); void nvme_io_qpair_enable(struct nvme_qpair *qpair); void nvme_io_qpair_disable(struct nvme_qpair *qpair); void nvme_io_qpair_destroy(struct nvme_qpair *qpair); int nvme_ns_construct(struct nvme_namespace *ns, uint32_t id, struct nvme_controller *ctrlr); void nvme_ns_destruct(struct nvme_namespace *ns); void nvme_sysctl_initialize_ctrlr(struct nvme_controller *ctrlr); void nvme_dump_command(struct nvme_command *cmd); void nvme_dump_completion(struct nvme_completion *cpl); static __inline void nvme_single_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { uint64_t *bus_addr = (uint64_t *)arg; if (error != 0) printf("nvme_single_map err %d\n", error); *bus_addr = seg[0].ds_addr; } static __inline struct nvme_request * _nvme_allocate_request(nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = uma_zalloc(nvme_request_zone, M_NOWAIT | M_ZERO); if (req != NULL) { req->cb_fn = cb_fn; req->cb_arg = cb_arg; req->timeout = TRUE; } return (req); } static __inline struct nvme_request * nvme_allocate_request_vaddr(void *payload, uint32_t payload_size, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->type = NVME_REQUEST_VADDR; req->u.payload = payload; req->payload_size = payload_size; } return (req); } static __inline struct nvme_request * nvme_allocate_request_null(nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) req->type = NVME_REQUEST_NULL; return (req); } static __inline struct nvme_request * nvme_allocate_request_bio(struct bio *bio, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->type = NVME_REQUEST_BIO; req->u.bio = bio; } return (req); } static __inline struct nvme_request * nvme_allocate_request_ccb(union ccb *ccb, nvme_cb_fn_t cb_fn, void *cb_arg) { struct nvme_request *req; req = _nvme_allocate_request(cb_fn, cb_arg); if (req != NULL) { req->type = NVME_REQUEST_CCB; req->u.payload = ccb; } return (req); } #define nvme_free_request(req) uma_zfree(nvme_request_zone, req) void nvme_notify_async_consumers(struct nvme_controller *ctrlr, const struct nvme_completion *async_cpl, uint32_t log_page_id, void *log_page_buffer, uint32_t log_page_size); void nvme_notify_fail_consumers(struct nvme_controller *ctrlr); void nvme_notify_new_controller(struct nvme_controller *ctrlr); void nvme_notify_ns(struct nvme_controller *ctrlr, int nsid); void nvme_ctrlr_intx_handler(void *arg); void nvme_ctrlr_poll(struct nvme_controller *ctrlr); #endif /* __NVME_PRIVATE_H__ */ Index: head/sys/dev/nvme/nvme_qpair.c =================================================================== --- head/sys/dev/nvme/nvme_qpair.c (revision 350117) +++ head/sys/dev/nvme/nvme_qpair.c (revision 350118) @@ -1,1213 +1,1223 @@ /*- * SPDX-License-Identifier: BSD-2-Clause-FreeBSD * * Copyright (C) 2012-2014 Intel Corporation * All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions * are met: * 1. Redistributions of source code must retain the above copyright * notice, this list of conditions and the following disclaimer. * 2. Redistributions in binary form must reproduce the above copyright * notice, this list of conditions and the following disclaimer in the * documentation and/or other materials provided with the distribution. * * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF * SUCH DAMAGE. */ #include __FBSDID("$FreeBSD$"); #include #include #include #include #include #include "nvme_private.h" typedef enum error_print { ERROR_PRINT_NONE, ERROR_PRINT_NO_RETRY, ERROR_PRINT_ALL } error_print_t; #define DO_NOT_RETRY 1 static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req); static void nvme_qpair_destroy(struct nvme_qpair *qpair); struct nvme_opcode_string { uint16_t opc; const char * str; }; static struct nvme_opcode_string admin_opcode[] = { { NVME_OPC_DELETE_IO_SQ, "DELETE IO SQ" }, { NVME_OPC_CREATE_IO_SQ, "CREATE IO SQ" }, { NVME_OPC_GET_LOG_PAGE, "GET LOG PAGE" }, { NVME_OPC_DELETE_IO_CQ, "DELETE IO CQ" }, { NVME_OPC_CREATE_IO_CQ, "CREATE IO CQ" }, { NVME_OPC_IDENTIFY, "IDENTIFY" }, { NVME_OPC_ABORT, "ABORT" }, { NVME_OPC_SET_FEATURES, "SET FEATURES" }, { NVME_OPC_GET_FEATURES, "GET FEATURES" }, { NVME_OPC_ASYNC_EVENT_REQUEST, "ASYNC EVENT REQUEST" }, { NVME_OPC_FIRMWARE_ACTIVATE, "FIRMWARE ACTIVATE" }, { NVME_OPC_FIRMWARE_IMAGE_DOWNLOAD, "FIRMWARE IMAGE DOWNLOAD" }, { NVME_OPC_DEVICE_SELF_TEST, "DEVICE SELF-TEST" }, { NVME_OPC_NAMESPACE_ATTACHMENT, "NAMESPACE ATTACHMENT" }, { NVME_OPC_KEEP_ALIVE, "KEEP ALIVE" }, { NVME_OPC_DIRECTIVE_SEND, "DIRECTIVE SEND" }, { NVME_OPC_DIRECTIVE_RECEIVE, "DIRECTIVE RECEIVE" }, { NVME_OPC_VIRTUALIZATION_MANAGEMENT, "VIRTUALIZATION MANAGEMENT" }, { NVME_OPC_NVME_MI_SEND, "NVME-MI SEND" }, { NVME_OPC_NVME_MI_RECEIVE, "NVME-MI RECEIVE" }, { NVME_OPC_DOORBELL_BUFFER_CONFIG, "DOORBELL BUFFER CONFIG" }, { NVME_OPC_FORMAT_NVM, "FORMAT NVM" }, { NVME_OPC_SECURITY_SEND, "SECURITY SEND" }, { NVME_OPC_SECURITY_RECEIVE, "SECURITY RECEIVE" }, { NVME_OPC_SANITIZE, "SANITIZE" }, { 0xFFFF, "ADMIN COMMAND" } }; static struct nvme_opcode_string io_opcode[] = { { NVME_OPC_FLUSH, "FLUSH" }, { NVME_OPC_WRITE, "WRITE" }, { NVME_OPC_READ, "READ" }, { NVME_OPC_WRITE_UNCORRECTABLE, "WRITE UNCORRECTABLE" }, { NVME_OPC_COMPARE, "COMPARE" }, { NVME_OPC_WRITE_ZEROES, "WRITE ZEROES" }, { NVME_OPC_DATASET_MANAGEMENT, "DATASET MANAGEMENT" }, { NVME_OPC_RESERVATION_REGISTER, "RESERVATION REGISTER" }, { NVME_OPC_RESERVATION_REPORT, "RESERVATION REPORT" }, { NVME_OPC_RESERVATION_ACQUIRE, "RESERVATION ACQUIRE" }, { NVME_OPC_RESERVATION_RELEASE, "RESERVATION RELEASE" }, { 0xFFFF, "IO COMMAND" } }; static const char * get_admin_opcode_string(uint16_t opc) { struct nvme_opcode_string *entry; entry = admin_opcode; while (entry->opc != 0xFFFF) { if (entry->opc == opc) return (entry->str); entry++; } return (entry->str); } static const char * get_io_opcode_string(uint16_t opc) { struct nvme_opcode_string *entry; entry = io_opcode; while (entry->opc != 0xFFFF) { if (entry->opc == opc) return (entry->str); entry++; } return (entry->str); } static void nvme_admin_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%x " "cdw10:%08x cdw11:%08x\n", get_admin_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid), le32toh(cmd->cdw10), le32toh(cmd->cdw11)); } static void nvme_io_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { switch (cmd->opc) { case NVME_OPC_WRITE: case NVME_OPC_READ: case NVME_OPC_WRITE_UNCORRECTABLE: case NVME_OPC_COMPARE: case NVME_OPC_WRITE_ZEROES: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d " "lba:%llu len:%d\n", get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid), ((unsigned long long)le32toh(cmd->cdw11) << 32) + le32toh(cmd->cdw10), (le32toh(cmd->cdw12) & 0xFFFF) + 1); break; case NVME_OPC_FLUSH: case NVME_OPC_DATASET_MANAGEMENT: case NVME_OPC_RESERVATION_REGISTER: case NVME_OPC_RESERVATION_REPORT: case NVME_OPC_RESERVATION_ACQUIRE: case NVME_OPC_RESERVATION_RELEASE: nvme_printf(qpair->ctrlr, "%s sqid:%d cid:%d nsid:%d\n", get_io_opcode_string(cmd->opc), qpair->id, cmd->cid, le32toh(cmd->nsid)); break; default: nvme_printf(qpair->ctrlr, "%s (%02x) sqid:%d cid:%d nsid:%d\n", get_io_opcode_string(cmd->opc), cmd->opc, qpair->id, cmd->cid, le32toh(cmd->nsid)); break; } } static void nvme_qpair_print_command(struct nvme_qpair *qpair, struct nvme_command *cmd) { if (qpair->id == 0) nvme_admin_qpair_print_command(qpair, cmd); else nvme_io_qpair_print_command(qpair, cmd); + if (nvme_verbose_cmd_dump) { + nvme_printf(qpair->ctrlr, + "nsid:%#x rsvd2:%#x rsvd3:%#x mptr:%#jx prp1:%#jx prp2:%#jx\n", + cmd->nsid, cmd->rsvd2, cmd->rsvd3, (uintmax_t)cmd->mptr, + (uintmax_t)cmd->prp1, (uintmax_t)cmd->prp2); + nvme_printf(qpair->ctrlr, + "cdw10: %#x cdw11:%#x cdw12:%#x cdw13:%#x cdw14:%#x cdw15:%#x\n", + cmd->cdw10, cmd->cdw11, cmd->cdw12, cmd->cdw13, cmd->cdw14, + cmd->cdw15); + } } struct nvme_status_string { uint16_t sc; const char * str; }; static struct nvme_status_string generic_status[] = { { NVME_SC_SUCCESS, "SUCCESS" }, { NVME_SC_INVALID_OPCODE, "INVALID OPCODE" }, { NVME_SC_INVALID_FIELD, "INVALID_FIELD" }, { NVME_SC_COMMAND_ID_CONFLICT, "COMMAND ID CONFLICT" }, { NVME_SC_DATA_TRANSFER_ERROR, "DATA TRANSFER ERROR" }, { NVME_SC_ABORTED_POWER_LOSS, "ABORTED - POWER LOSS" }, { NVME_SC_INTERNAL_DEVICE_ERROR, "INTERNAL DEVICE ERROR" }, { NVME_SC_ABORTED_BY_REQUEST, "ABORTED - BY REQUEST" }, { NVME_SC_ABORTED_SQ_DELETION, "ABORTED - SQ DELETION" }, { NVME_SC_ABORTED_FAILED_FUSED, "ABORTED - FAILED FUSED" }, { NVME_SC_ABORTED_MISSING_FUSED, "ABORTED - MISSING FUSED" }, { NVME_SC_INVALID_NAMESPACE_OR_FORMAT, "INVALID NAMESPACE OR FORMAT" }, { NVME_SC_COMMAND_SEQUENCE_ERROR, "COMMAND SEQUENCE ERROR" }, { NVME_SC_INVALID_SGL_SEGMENT_DESCR, "INVALID SGL SEGMENT DESCRIPTOR" }, { NVME_SC_INVALID_NUMBER_OF_SGL_DESCR, "INVALID NUMBER OF SGL DESCRIPTORS" }, { NVME_SC_DATA_SGL_LENGTH_INVALID, "DATA SGL LENGTH INVALID" }, { NVME_SC_METADATA_SGL_LENGTH_INVALID, "METADATA SGL LENGTH INVALID" }, { NVME_SC_SGL_DESCRIPTOR_TYPE_INVALID, "SGL DESCRIPTOR TYPE INVALID" }, { NVME_SC_INVALID_USE_OF_CMB, "INVALID USE OF CONTROLLER MEMORY BUFFER" }, { NVME_SC_PRP_OFFET_INVALID, "PRP OFFET INVALID" }, { NVME_SC_ATOMIC_WRITE_UNIT_EXCEEDED, "ATOMIC WRITE UNIT EXCEEDED" }, { NVME_SC_OPERATION_DENIED, "OPERATION DENIED" }, { NVME_SC_SGL_OFFSET_INVALID, "SGL OFFSET INVALID" }, { NVME_SC_HOST_ID_INCONSISTENT_FORMAT, "HOST IDENTIFIER INCONSISTENT FORMAT" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_EXPIRED, "KEEP ALIVE TIMEOUT EXPIRED" }, { NVME_SC_KEEP_ALIVE_TIMEOUT_INVALID, "KEEP ALIVE TIMEOUT INVALID" }, { NVME_SC_ABORTED_DUE_TO_PREEMPT, "COMMAND ABORTED DUE TO PREEMPT AND ABORT" }, { NVME_SC_SANITIZE_FAILED, "SANITIZE FAILED" }, { NVME_SC_SANITIZE_IN_PROGRESS, "SANITIZE IN PROGRESS" }, { NVME_SC_SGL_DATA_BLOCK_GRAN_INVALID, "SGL_DATA_BLOCK_GRANULARITY_INVALID" }, { NVME_SC_NOT_SUPPORTED_IN_CMB, "COMMAND NOT SUPPORTED FOR QUEUE IN CMB" }, { NVME_SC_LBA_OUT_OF_RANGE, "LBA OUT OF RANGE" }, { NVME_SC_CAPACITY_EXCEEDED, "CAPACITY EXCEEDED" }, { NVME_SC_NAMESPACE_NOT_READY, "NAMESPACE NOT READY" }, { NVME_SC_RESERVATION_CONFLICT, "RESERVATION CONFLICT" }, { NVME_SC_FORMAT_IN_PROGRESS, "FORMAT IN PROGRESS" }, { 0xFFFF, "GENERIC" } }; static struct nvme_status_string command_specific_status[] = { { NVME_SC_COMPLETION_QUEUE_INVALID, "INVALID COMPLETION QUEUE" }, { NVME_SC_INVALID_QUEUE_IDENTIFIER, "INVALID QUEUE IDENTIFIER" }, { NVME_SC_MAXIMUM_QUEUE_SIZE_EXCEEDED, "MAX QUEUE SIZE EXCEEDED" }, { NVME_SC_ABORT_COMMAND_LIMIT_EXCEEDED, "ABORT CMD LIMIT EXCEEDED" }, { NVME_SC_ASYNC_EVENT_REQUEST_LIMIT_EXCEEDED, "ASYNC LIMIT EXCEEDED" }, { NVME_SC_INVALID_FIRMWARE_SLOT, "INVALID FIRMWARE SLOT" }, { NVME_SC_INVALID_FIRMWARE_IMAGE, "INVALID FIRMWARE IMAGE" }, { NVME_SC_INVALID_INTERRUPT_VECTOR, "INVALID INTERRUPT VECTOR" }, { NVME_SC_INVALID_LOG_PAGE, "INVALID LOG PAGE" }, { NVME_SC_INVALID_FORMAT, "INVALID FORMAT" }, { NVME_SC_FIRMWARE_REQUIRES_RESET, "FIRMWARE REQUIRES RESET" }, { NVME_SC_INVALID_QUEUE_DELETION, "INVALID QUEUE DELETION" }, { NVME_SC_FEATURE_NOT_SAVEABLE, "FEATURE IDENTIFIER NOT SAVEABLE" }, { NVME_SC_FEATURE_NOT_CHANGEABLE, "FEATURE NOT CHANGEABLE" }, { NVME_SC_FEATURE_NOT_NS_SPECIFIC, "FEATURE NOT NAMESPACE SPECIFIC" }, { NVME_SC_FW_ACT_REQUIRES_NVMS_RESET, "FIRMWARE ACTIVATION REQUIRES NVM SUBSYSTEM RESET" }, { NVME_SC_FW_ACT_REQUIRES_RESET, "FIRMWARE ACTIVATION REQUIRES RESET" }, { NVME_SC_FW_ACT_REQUIRES_TIME, "FIRMWARE ACTIVATION REQUIRES MAXIMUM TIME VIOLATION" }, { NVME_SC_FW_ACT_PROHIBITED, "FIRMWARE ACTIVATION PROHIBITED" }, { NVME_SC_OVERLAPPING_RANGE, "OVERLAPPING RANGE" }, { NVME_SC_NS_INSUFFICIENT_CAPACITY, "NAMESPACE INSUFFICIENT CAPACITY" }, { NVME_SC_NS_ID_UNAVAILABLE, "NAMESPACE IDENTIFIER UNAVAILABLE" }, { NVME_SC_NS_ALREADY_ATTACHED, "NAMESPACE ALREADY ATTACHED" }, { NVME_SC_NS_IS_PRIVATE, "NAMESPACE IS PRIVATE" }, { NVME_SC_NS_NOT_ATTACHED, "NS NOT ATTACHED" }, { NVME_SC_THIN_PROV_NOT_SUPPORTED, "THIN PROVISIONING NOT SUPPORTED" }, { NVME_SC_CTRLR_LIST_INVALID, "CONTROLLER LIST INVALID" }, { NVME_SC_SELT_TEST_IN_PROGRESS, "DEVICE SELT-TEST IN PROGRESS" }, { NVME_SC_BOOT_PART_WRITE_PROHIB, "BOOT PARTITION WRITE PROHIBITED" }, { NVME_SC_INVALID_CTRLR_ID, "INVALID CONTROLLER IDENTIFIER" }, { NVME_SC_INVALID_SEC_CTRLR_STATE, "INVALID SECONDARY CONTROLLER STATE" }, { NVME_SC_INVALID_NUM_OF_CTRLR_RESRC, "INVALID NUMBER OF CONTROLLER RESOURCES" }, { NVME_SC_INVALID_RESOURCE_ID, "INVALID RESOURCE IDENTIFIER" }, { NVME_SC_CONFLICTING_ATTRIBUTES, "CONFLICTING ATTRIBUTES" }, { NVME_SC_INVALID_PROTECTION_INFO, "INVALID PROTECTION INFO" }, { NVME_SC_ATTEMPTED_WRITE_TO_RO_PAGE, "WRITE TO RO PAGE" }, { 0xFFFF, "COMMAND SPECIFIC" } }; static struct nvme_status_string media_error_status[] = { { NVME_SC_WRITE_FAULTS, "WRITE FAULTS" }, { NVME_SC_UNRECOVERED_READ_ERROR, "UNRECOVERED READ ERROR" }, { NVME_SC_GUARD_CHECK_ERROR, "GUARD CHECK ERROR" }, { NVME_SC_APPLICATION_TAG_CHECK_ERROR, "APPLICATION TAG CHECK ERROR" }, { NVME_SC_REFERENCE_TAG_CHECK_ERROR, "REFERENCE TAG CHECK ERROR" }, { NVME_SC_COMPARE_FAILURE, "COMPARE FAILURE" }, { NVME_SC_ACCESS_DENIED, "ACCESS DENIED" }, { NVME_SC_DEALLOCATED_OR_UNWRITTEN, "DEALLOCATED OR UNWRITTEN LOGICAL BLOCK" }, { 0xFFFF, "MEDIA ERROR" } }; static const char * get_status_string(uint16_t sct, uint16_t sc) { struct nvme_status_string *entry; switch (sct) { case NVME_SCT_GENERIC: entry = generic_status; break; case NVME_SCT_COMMAND_SPECIFIC: entry = command_specific_status; break; case NVME_SCT_MEDIA_ERROR: entry = media_error_status; break; case NVME_SCT_VENDOR_SPECIFIC: return ("VENDOR SPECIFIC"); default: return ("RESERVED"); } while (entry->sc != 0xFFFF) { if (entry->sc == sc) return (entry->str); entry++; } return (entry->str); } static void nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl) { uint16_t sct, sc; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); nvme_printf(qpair->ctrlr, "%s (%02x/%02x) sqid:%d cid:%d cdw0:%x\n", get_status_string(sct, sc), sct, sc, cpl->sqid, cpl->cid, cpl->cdw0); } static boolean_t nvme_completion_is_retry(const struct nvme_completion *cpl) { uint8_t sct, sc, dnr; sct = NVME_STATUS_GET_SCT(cpl->status); sc = NVME_STATUS_GET_SC(cpl->status); dnr = NVME_STATUS_GET_DNR(cpl->status); /* Do Not Retry Bit */ /* * TODO: spec is not clear how commands that are aborted due * to TLER will be marked. So for now, it seems * NAMESPACE_NOT_READY is the only case where we should * look at the DNR bit. Requests failed with ABORTED_BY_REQUEST * set the DNR bit correctly since the driver controls that. */ switch (sct) { case NVME_SCT_GENERIC: switch (sc) { case NVME_SC_ABORTED_BY_REQUEST: case NVME_SC_NAMESPACE_NOT_READY: if (dnr) return (0); else return (1); case NVME_SC_INVALID_OPCODE: case NVME_SC_INVALID_FIELD: case NVME_SC_COMMAND_ID_CONFLICT: case NVME_SC_DATA_TRANSFER_ERROR: case NVME_SC_ABORTED_POWER_LOSS: case NVME_SC_INTERNAL_DEVICE_ERROR: case NVME_SC_ABORTED_SQ_DELETION: case NVME_SC_ABORTED_FAILED_FUSED: case NVME_SC_ABORTED_MISSING_FUSED: case NVME_SC_INVALID_NAMESPACE_OR_FORMAT: case NVME_SC_COMMAND_SEQUENCE_ERROR: case NVME_SC_LBA_OUT_OF_RANGE: case NVME_SC_CAPACITY_EXCEEDED: default: return (0); } case NVME_SCT_COMMAND_SPECIFIC: case NVME_SCT_MEDIA_ERROR: case NVME_SCT_VENDOR_SPECIFIC: default: return (0); } } static void nvme_qpair_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, struct nvme_completion *cpl, error_print_t print_on_error) { struct nvme_request *req; boolean_t retry, error; req = tr->req; error = nvme_completion_is_error(cpl); retry = error && nvme_completion_is_retry(cpl) && req->retries < nvme_retry_count; if (error && (print_on_error == ERROR_PRINT_ALL || (!retry && print_on_error == ERROR_PRINT_NO_RETRY))) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, cpl); } qpair->act_tr[cpl->cid] = NULL; KASSERT(cpl->cid == req->cmd.cid, ("cpl cid does not match cmd cid\n")); if (req->cb_fn && !retry) req->cb_fn(req->cb_arg, cpl); mtx_lock(&qpair->lock); callout_stop(&tr->timer); if (retry) { req->retries++; nvme_qpair_submit_tracker(qpair, tr); } else { if (req->type != NVME_REQUEST_NULL) { bus_dmamap_sync(qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); bus_dmamap_unload(qpair->dma_tag_payload, tr->payload_dma_map); } nvme_free_request(req); tr->req = NULL; TAILQ_REMOVE(&qpair->outstanding_tr, tr, tailq); TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); /* * If the controller is in the middle of resetting, don't * try to submit queued requests here - let the reset logic * handle that instead. */ if (!STAILQ_EMPTY(&qpair->queued_req) && !qpair->ctrlr->is_resetting) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); _nvme_qpair_submit_request(qpair, req); } } mtx_unlock(&qpair->lock); } static void nvme_qpair_manual_complete_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr, uint32_t sct, uint32_t sc, uint32_t dnr, error_print_t print_on_error) { struct nvme_completion cpl; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.cid = tr->cid; cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT; cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; cpl.status |= (dnr & NVME_STATUS_DNR_MASK) << NVME_STATUS_DNR_SHIFT; nvme_qpair_complete_tracker(qpair, tr, &cpl, print_on_error); } void nvme_qpair_manual_complete_request(struct nvme_qpair *qpair, struct nvme_request *req, uint32_t sct, uint32_t sc) { struct nvme_completion cpl; boolean_t error; memset(&cpl, 0, sizeof(cpl)); cpl.sqid = qpair->id; cpl.status |= (sct & NVME_STATUS_SCT_MASK) << NVME_STATUS_SCT_SHIFT; cpl.status |= (sc & NVME_STATUS_SC_MASK) << NVME_STATUS_SC_SHIFT; error = nvme_completion_is_error(&cpl); if (error) { nvme_qpair_print_command(qpair, &req->cmd); nvme_qpair_print_completion(qpair, &cpl); } if (req->cb_fn) req->cb_fn(req->cb_arg, &cpl); nvme_free_request(req); } bool nvme_qpair_process_completions(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_completion cpl; int done = 0; bool in_panic = dumping || SCHEDULER_STOPPED(); qpair->num_intr_handler_calls++; /* * qpair is not enabled, likely because a controller reset is is in * progress. Ignore the interrupt - any I/O that was associated with * this interrupt will get retried when the reset is complete. */ if (!qpair->is_enabled) return (false); /* * A panic can stop the CPU this routine is running on at any point. If * we're called during a panic, complete the sq_head wrap protocol for * the case where we are interrupted just after the increment at 1 * below, but before we can reset cq_head to zero at 2. Also cope with * the case where we do the zero at 2, but may or may not have done the * phase adjustment at step 3. The panic machinery flushes all pending * memory writes, so we can make these strong ordering assumptions * that would otherwise be unwise if we were racing in real time. */ if (__predict_false(in_panic)) { if (qpair->cq_head == qpair->num_entries) { /* * Here we know that we need to zero cq_head and then negate * the phase, which hasn't been assigned if cq_head isn't * zero due to the atomic_store_rel. */ qpair->cq_head = 0; qpair->phase = !qpair->phase; } else if (qpair->cq_head == 0) { /* * In this case, we know that the assignment at 2 * happened below, but we don't know if it 3 happened or * not. To do this, we look at the last completion * entry and set the phase to the opposite phase * that it has. This gets us back in sync */ cpl = qpair->cpl[qpair->num_entries - 1]; nvme_completion_swapbytes(&cpl); qpair->phase = !NVME_STATUS_GET_P(cpl.status); } } bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); while (1) { cpl = qpair->cpl[qpair->cq_head]; /* Convert to host endian */ nvme_completion_swapbytes(&cpl); if (NVME_STATUS_GET_P(cpl.status) != qpair->phase) break; tr = qpair->act_tr[cpl.cid]; if (tr != NULL) { nvme_qpair_complete_tracker(qpair, tr, &cpl, ERROR_PRINT_ALL); qpair->sq_head = cpl.sqhd; done++; } else if (!in_panic) { /* * A missing tracker is normally an error. However, a * panic can stop the CPU this routine is running on * after completing an I/O but before updating * qpair->cq_head at 1 below. Later, we re-enter this * routine to poll I/O associated with the kernel * dump. We find that the tr has been set to null before * calling the completion routine. If it hasn't * completed (or it triggers a panic), then '1' below * won't have updated cq_head. Rather than panic again, * ignore this condition because it's not unexpected. */ nvme_printf(qpair->ctrlr, "cpl does not map to outstanding cmd\n"); /* nvme_dump_completion expects device endianess */ nvme_dump_completion(&qpair->cpl[qpair->cq_head]); KASSERT(0, ("received completion for unknown cmd")); } /* * There's a number of races with the following (see above) when * the system panics. We compensate for each one of them by * using the atomic store to force strong ordering (at least when * viewed in the aftermath of a panic). */ if (++qpair->cq_head == qpair->num_entries) { /* 1 */ atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */ qpair->phase = !qpair->phase; /* 3 */ } nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl, qpair->cq_head); } return (done != 0); } static void nvme_qpair_msix_handler(void *arg) { struct nvme_qpair *qpair = arg; nvme_qpair_process_completions(qpair); } int nvme_qpair_construct(struct nvme_qpair *qpair, uint32_t id, uint16_t vector, uint32_t num_entries, uint32_t num_trackers, struct nvme_controller *ctrlr) { struct nvme_tracker *tr; size_t cmdsz, cplsz, prpsz, allocsz, prpmemsz; uint64_t queuemem_phys, prpmem_phys, list_phys; uint8_t *queuemem, *prpmem, *prp_list; int i, err; qpair->id = id; qpair->vector = vector; qpair->num_entries = num_entries; qpair->num_trackers = num_trackers; qpair->ctrlr = ctrlr; if (ctrlr->msix_enabled) { /* * MSI-X vector resource IDs start at 1, so we add one to * the queue's vector to get the corresponding rid to use. */ qpair->rid = vector + 1; qpair->res = bus_alloc_resource_any(ctrlr->dev, SYS_RES_IRQ, &qpair->rid, RF_ACTIVE); bus_setup_intr(ctrlr->dev, qpair->res, INTR_TYPE_MISC | INTR_MPSAFE, NULL, nvme_qpair_msix_handler, qpair, &qpair->tag); if (id == 0) { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "admin"); } else { bus_describe_intr(ctrlr->dev, qpair->res, qpair->tag, "io%d", id - 1); } } mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); /* Note: NVMe PRP format is restricted to 4-byte alignment. */ err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), 4, PAGE_SIZE, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, NVME_MAX_XFER_SIZE, (NVME_MAX_XFER_SIZE/PAGE_SIZE)+1, PAGE_SIZE, 0, NULL, NULL, &qpair->dma_tag_payload); if (err != 0) { nvme_printf(ctrlr, "payload tag create failed %d\n", err); goto out; } /* * Each component must be page aligned, and individual PRP lists * cannot cross a page boundary. */ cmdsz = qpair->num_entries * sizeof(struct nvme_command); cmdsz = roundup2(cmdsz, PAGE_SIZE); cplsz = qpair->num_entries * sizeof(struct nvme_completion); cplsz = roundup2(cplsz, PAGE_SIZE); prpsz = sizeof(uint64_t) * NVME_MAX_PRP_LIST_ENTRIES;; prpmemsz = qpair->num_trackers * prpsz; allocsz = cmdsz + cplsz + prpmemsz; err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), PAGE_SIZE, 0, BUS_SPACE_MAXADDR, BUS_SPACE_MAXADDR, NULL, NULL, allocsz, 1, allocsz, 0, NULL, NULL, &qpair->dma_tag); if (err != 0) { nvme_printf(ctrlr, "tag create failed %d\n", err); goto out; } if (bus_dmamem_alloc(qpair->dma_tag, (void **)&queuemem, BUS_DMA_NOWAIT, &qpair->queuemem_map)) { nvme_printf(ctrlr, "failed to alloc qpair memory\n"); goto out; } if (bus_dmamap_load(qpair->dma_tag, qpair->queuemem_map, queuemem, allocsz, nvme_single_map, &queuemem_phys, 0) != 0) { nvme_printf(ctrlr, "failed to load qpair memory\n"); goto out; } qpair->num_cmds = 0; qpair->num_intr_handler_calls = 0; qpair->cmd = (struct nvme_command *)queuemem; qpair->cpl = (struct nvme_completion *)(queuemem + cmdsz); prpmem = (uint8_t *)(queuemem + cmdsz + cplsz); qpair->cmd_bus_addr = queuemem_phys; qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; qpair->sq_tdbl_off = nvme_mmio_offsetof(doorbell[id].sq_tdbl); qpair->cq_hdbl_off = nvme_mmio_offsetof(doorbell[id].cq_hdbl); TAILQ_INIT(&qpair->free_tr); TAILQ_INIT(&qpair->outstanding_tr); STAILQ_INIT(&qpair->queued_req); list_phys = prpmem_phys; prp_list = prpmem; for (i = 0; i < qpair->num_trackers; i++) { if (list_phys + prpsz > prpmem_phys + prpmemsz) { qpair->num_trackers = i; break; } /* * Make sure that the PRP list for this tracker doesn't * overflow to another page. */ if (trunc_page(list_phys) != trunc_page(list_phys + prpsz - 1)) { list_phys = roundup2(list_phys, PAGE_SIZE); prp_list = (uint8_t *)roundup2((uintptr_t)prp_list, PAGE_SIZE); } tr = malloc(sizeof(*tr), M_NVME, M_ZERO | M_WAITOK); bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); callout_init(&tr->timer, 1); tr->cid = i; tr->qpair = qpair; tr->prp = (uint64_t *)prp_list; tr->prp_bus_addr = list_phys; TAILQ_INSERT_HEAD(&qpair->free_tr, tr, tailq); list_phys += prpsz; prp_list += prpsz; } if (qpair->num_trackers == 0) { nvme_printf(ctrlr, "failed to allocate enough trackers\n"); goto out; } qpair->act_tr = malloc(sizeof(struct nvme_tracker *) * qpair->num_entries, M_NVME, M_ZERO | M_WAITOK); return (0); out: nvme_qpair_destroy(qpair); return (ENOMEM); } static void nvme_qpair_destroy(struct nvme_qpair *qpair) { struct nvme_tracker *tr; if (qpair->tag) bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); if (mtx_initialized(&qpair->lock)) mtx_destroy(&qpair->lock); if (qpair->res) bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, rman_get_rid(qpair->res), qpair->res); if (qpair->cmd != NULL) { bus_dmamap_unload(qpair->dma_tag, qpair->queuemem_map); bus_dmamem_free(qpair->dma_tag, qpair->cmd, qpair->queuemem_map); } if (qpair->act_tr) free(qpair->act_tr, M_NVME); while (!TAILQ_EMPTY(&qpair->free_tr)) { tr = TAILQ_FIRST(&qpair->free_tr); TAILQ_REMOVE(&qpair->free_tr, tr, tailq); bus_dmamap_destroy(qpair->dma_tag_payload, tr->payload_dma_map); free(tr, M_NVME); } if (qpair->dma_tag) bus_dma_tag_destroy(qpair->dma_tag); if (qpair->dma_tag_payload) bus_dma_tag_destroy(qpair->dma_tag_payload); } static void nvme_admin_qpair_abort_aers(struct nvme_qpair *qpair) { struct nvme_tracker *tr; tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, ERROR_PRINT_NONE); tr = TAILQ_FIRST(&qpair->outstanding_tr); } else { tr = TAILQ_NEXT(tr, tailq); } } } void nvme_admin_qpair_destroy(struct nvme_qpair *qpair) { nvme_admin_qpair_abort_aers(qpair); nvme_qpair_destroy(qpair); } void nvme_io_qpair_destroy(struct nvme_qpair *qpair) { nvme_qpair_destroy(qpair); } static void nvme_abort_complete(void *arg, const struct nvme_completion *status) { struct nvme_tracker *tr = arg; /* * If cdw0 == 1, the controller was not able to abort the command * we requested. We still need to check the active tracker array, * to cover race where I/O timed out at same time controller was * completing the I/O. */ if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { /* * An I/O has timed out, and the controller was unable to * abort it for some reason. Construct a fake completion * status, and then complete the I/O's tracker manually. */ nvme_printf(tr->qpair->ctrlr, "abort command failed, aborting command manually\n"); nvme_qpair_manual_complete_tracker(tr->qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); } } static void nvme_timeout(void *arg) { struct nvme_tracker *tr = arg; struct nvme_qpair *qpair = tr->qpair; struct nvme_controller *ctrlr = qpair->ctrlr; uint32_t csts; uint8_t cfs; /* * Read csts to get value of cfs - controller fatal status. * If no fatal status, try to call the completion routine, and * if completes transactions, report a missed interrupt and * return (this may need to be rate limited). Otherwise, if * aborts are enabled and the controller is not reporting * fatal status, abort the command. Otherwise, just reset the * controller and hope for the best. */ csts = nvme_mmio_read_4(ctrlr, csts); cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; if (cfs == 0 && nvme_qpair_process_completions(qpair)) { nvme_printf(ctrlr, "Missing interrupt\n"); return; } if (ctrlr->enable_aborts && cfs == 0) { nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, nvme_abort_complete, tr); } else { nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", cfs ? " and fatal error status" : ""); nvme_ctrlr_reset(ctrlr); } } void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) { struct nvme_request *req; struct nvme_controller *ctrlr; mtx_assert(&qpair->lock, MA_OWNED); req = tr->req; req->cmd.cid = tr->cid; qpair->act_tr[tr->cid] = tr; ctrlr = qpair->ctrlr; if (req->timeout) callout_reset_curcpu(&tr->timer, ctrlr->timeout_period * hz, nvme_timeout, tr); /* Copy the command from the tracker to the submission queue. */ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); if (++qpair->sq_tail == qpair->num_entries) qpair->sq_tail = 0; bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); #ifndef __powerpc__ /* * powerpc's bus_dmamap_sync() already includes a heavyweight sync, but * no other archs do. */ wmb(); #endif nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].sq_tdbl, qpair->sq_tail); qpair->num_cmds++; } static void nvme_payload_map(void *arg, bus_dma_segment_t *seg, int nseg, int error) { struct nvme_tracker *tr = arg; uint32_t cur_nseg; /* * If the mapping operation failed, return immediately. The caller * is responsible for detecting the error status and failing the * tracker manually. */ if (error != 0) { nvme_printf(tr->qpair->ctrlr, "nvme_payload_map err %d\n", error); return; } /* * Note that we specified PAGE_SIZE for alignment and max * segment size when creating the bus dma tags. So here * we can safely just transfer each segment to its * associated PRP entry. */ tr->req->cmd.prp1 = htole64(seg[0].ds_addr); if (nseg == 2) { tr->req->cmd.prp2 = htole64(seg[1].ds_addr); } else if (nseg > 2) { cur_nseg = 1; tr->req->cmd.prp2 = htole64((uint64_t)tr->prp_bus_addr); while (cur_nseg < nseg) { tr->prp[cur_nseg-1] = htole64((uint64_t)seg[cur_nseg].ds_addr); cur_nseg++; } } else { /* * prp2 should not be used by the controller * since there is only one segment, but set * to 0 just to be safe. */ tr->req->cmd.prp2 = 0; } bus_dmamap_sync(tr->qpair->dma_tag_payload, tr->payload_dma_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); nvme_qpair_submit_tracker(tr->qpair, tr); } static void _nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { struct nvme_tracker *tr; int err = 0; mtx_assert(&qpair->lock, MA_OWNED); tr = TAILQ_FIRST(&qpair->free_tr); req->qpair = qpair; if (tr == NULL || !qpair->is_enabled) { /* * No tracker is available, or the qpair is disabled due to * an in-progress controller-level reset or controller * failure. */ if (qpair->ctrlr->is_failed) { /* * The controller has failed. Post the request to a * task where it will be aborted, so that we do not * invoke the request's callback in the context * of the submission. */ nvme_ctrlr_post_failed_request(qpair->ctrlr, req); } else { /* * Put the request on the qpair's request queue to be * processed when a tracker frees up via a command * completion or when the controller reset is * completed. */ STAILQ_INSERT_TAIL(&qpair->queued_req, req, stailq); } return; } TAILQ_REMOVE(&qpair->free_tr, tr, tailq); TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); tr->req = req; switch (req->type) { case NVME_REQUEST_VADDR: KASSERT(req->payload_size <= qpair->ctrlr->max_xfer_size, ("payload_size (%d) exceeds max_xfer_size (%d)\n", req->payload_size, qpair->ctrlr->max_xfer_size)); err = bus_dmamap_load(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.payload, req->payload_size, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load returned 0x%x!\n", err); break; case NVME_REQUEST_NULL: nvme_qpair_submit_tracker(tr->qpair, tr); break; case NVME_REQUEST_BIO: KASSERT(req->u.bio->bio_bcount <= qpair->ctrlr->max_xfer_size, ("bio->bio_bcount (%jd) exceeds max_xfer_size (%d)\n", (intmax_t)req->u.bio->bio_bcount, qpair->ctrlr->max_xfer_size)); err = bus_dmamap_load_bio(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.bio, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load_bio returned 0x%x!\n", err); break; case NVME_REQUEST_CCB: err = bus_dmamap_load_ccb(tr->qpair->dma_tag_payload, tr->payload_dma_map, req->u.payload, nvme_payload_map, tr, 0); if (err != 0) nvme_printf(qpair->ctrlr, "bus_dmamap_load_ccb returned 0x%x!\n", err); break; default: panic("unknown nvme request type 0x%x\n", req->type); break; } if (err != 0) { /* * The dmamap operation failed, so we manually fail the * tracker here with DATA_TRANSFER_ERROR status. * * nvme_qpair_manual_complete_tracker must not be called * with the qpair lock held. */ mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_DATA_TRANSFER_ERROR, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } } void nvme_qpair_submit_request(struct nvme_qpair *qpair, struct nvme_request *req) { mtx_lock(&qpair->lock); _nvme_qpair_submit_request(qpair, req); mtx_unlock(&qpair->lock); } static void nvme_qpair_enable(struct nvme_qpair *qpair) { qpair->is_enabled = TRUE; } void nvme_qpair_reset(struct nvme_qpair *qpair) { qpair->sq_head = qpair->sq_tail = qpair->cq_head = 0; /* * First time through the completion queue, HW will set phase * bit on completions to 1. So set this to 1 here, indicating * we're looking for a 1 to know which entries have completed. * we'll toggle the bit each time when the completion queue * rolls over. */ qpair->phase = 1; memset(qpair->cmd, 0, qpair->num_entries * sizeof(struct nvme_command)); memset(qpair->cpl, 0, qpair->num_entries * sizeof(struct nvme_completion)); } void nvme_admin_qpair_enable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_tracker *tr_temp; /* * Manually abort each outstanding admin command. Do not retry * admin commands found here, since they will be left over from * a controller reset and its likely the context in which the * command was issued no longer applies. */ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding admin command\n"); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); } nvme_qpair_enable(qpair); } void nvme_io_qpair_enable(struct nvme_qpair *qpair) { STAILQ_HEAD(, nvme_request) temp; struct nvme_tracker *tr; struct nvme_tracker *tr_temp; struct nvme_request *req; /* * Manually abort each outstanding I/O. This normally results in a * retry, unless the retry count on the associated request has * reached its limit. */ TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { nvme_printf(qpair->ctrlr, "aborting outstanding i/o\n"); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_NO_RETRY); } mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); STAILQ_INIT(&temp); STAILQ_SWAP(&qpair->queued_req, &temp, nvme_request); while (!STAILQ_EMPTY(&temp)) { req = STAILQ_FIRST(&temp); STAILQ_REMOVE_HEAD(&temp, stailq); nvme_printf(qpair->ctrlr, "resubmitting queued i/o\n"); nvme_qpair_print_command(qpair, &req->cmd); _nvme_qpair_submit_request(qpair, req); } mtx_unlock(&qpair->lock); } static void nvme_qpair_disable(struct nvme_qpair *qpair) { struct nvme_tracker *tr; qpair->is_enabled = FALSE; mtx_lock(&qpair->lock); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) callout_stop(&tr->timer); mtx_unlock(&qpair->lock); } void nvme_admin_qpair_disable(struct nvme_qpair *qpair) { nvme_qpair_disable(qpair); nvme_admin_qpair_abort_aers(qpair); } void nvme_io_qpair_disable(struct nvme_qpair *qpair) { nvme_qpair_disable(qpair); } void nvme_qpair_fail(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_request *req; if (!mtx_initialized(&qpair->lock)) return; mtx_lock(&qpair->lock); while (!STAILQ_EMPTY(&qpair->queued_req)) { req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); nvme_printf(qpair->ctrlr, "failing queued i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST); mtx_lock(&qpair->lock); } /* Manually abort each outstanding I/O. */ while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { tr = TAILQ_FIRST(&qpair->outstanding_tr); /* * Do not remove the tracker. The abort_tracker path will * do that for us. */ nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(qpair, tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); mtx_lock(&qpair->lock); } mtx_unlock(&qpair->lock); }