Index: head/sys/dev/nvme/nvme.c
===================================================================
--- head/sys/dev/nvme/nvme.c
+++ head/sys/dev/nvme/nvme.c
@@ -137,9 +137,10 @@
 	}
 
 	/*
-	 * Reset controller twice to ensure we do a transition from cc.en==1
-	 * to cc.en==0. This is because we don't really know what status
-	 * the controller was left in when boot handed off to OS.
+	 * Reset controller twice to ensure we do a transition from cc.en==1 to
+	 * cc.en==0. This is because we don't really know what status the
+	 * controller was left in when boot handed off to OS. Linux doesn't do
+	 * this, however. If we adopt that policy, see also nvme_ctrlr_resume().
 	 */
 	status = nvme_ctrlr_hw_reset(ctrlr);
 	if (status != 0) {
Index: head/sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- head/sys/dev/nvme/nvme_ctrlr.c
+++ head/sys/dev/nvme/nvme_ctrlr.c
@@ -118,8 +118,8 @@
 
 	/*
 	 * Our best estimate for the maximum number of I/Os that we should
-	 * noramlly have in flight at one time. This should be viewed as a hint,
-	 * not a hard limit and will need to be revisitted when the upper layers
+	 * normally have in flight at one time. This should be viewed as a hint,
+	 * not a hard limit and will need to be revisited when the upper layers
 	 * of the storage system grows multi-queue support.
 	 */
 	ctrlr->max_hw_pend_io = num_trackers * ctrlr->num_io_queues * 3 / 4;
@@ -344,10 +344,10 @@
 	return (nvme_ctrlr_wait_for_ready(ctrlr, 1));
 }
 
-int
-nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
+static void
+nvme_ctrlr_disable_qpairs(struct nvme_controller *ctrlr)
 {
-	int i, err;
+	int i;
 
 	nvme_admin_qpair_disable(&ctrlr->adminq);
 	/*
@@ -359,7 +359,15 @@
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_io_qpair_disable(&ctrlr->ioq[i]);
 	}
+}
 
+int
+nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr)
+{
+	int err;
+
+	nvme_ctrlr_disable_qpairs(ctrlr);
+
 	DELAY(100*1000);
 
 	err = nvme_ctrlr_disable(ctrlr);
@@ -481,7 +489,7 @@
 }
 
 static int
-nvme_ctrlr_destroy_qpairs(struct nvme_controller *ctrlr)
+nvme_ctrlr_delete_qpairs(struct nvme_controller *ctrlr)
 {
 	struct nvme_completion_poll_status status;
 	struct nvme_qpair *qpair;
@@ -820,7 +828,7 @@
 }
 
 static void
-nvme_ctrlr_start(void *ctrlr_arg)
+nvme_ctrlr_start(void *ctrlr_arg, bool resetting)
 {
 	struct nvme_controller *ctrlr = ctrlr_arg;
 	uint32_t old_num_io_queues;
@@ -833,7 +841,7 @@
 	 * the number of I/O queues supported, so cannot reset
 	 * the adminq again here.
 	 */
-	if (ctrlr->is_resetting)
+	if (resetting)
 		nvme_qpair_reset(&ctrlr->adminq);
 
 	for (i = 0; i < ctrlr->num_io_queues; i++)
@@ -854,7 +862,7 @@
 	 * explicit specify how many queues it will use. This value should
 	 * never change between resets, so panic if somehow that does happen.
 	 */
-	if (ctrlr->is_resetting) {
+	if (resetting) {
 		old_num_io_queues = ctrlr->num_io_queues;
 		if (nvme_ctrlr_set_num_qpairs(ctrlr) != 0) {
 			nvme_ctrlr_fail(ctrlr);
@@ -894,7 +902,7 @@
 
 	if (nvme_ctrlr_set_num_qpairs(ctrlr) == 0 &&
 	    nvme_ctrlr_construct_io_qpairs(ctrlr) == 0)
-		nvme_ctrlr_start(ctrlr);
+		nvme_ctrlr_start(ctrlr, false);
 	else
 		nvme_ctrlr_fail(ctrlr);
 
@@ -923,7 +931,7 @@
 	 */
 	pause("nvmereset", hz / 10);
 	if (status == 0)
-		nvme_ctrlr_start(ctrlr);
+		nvme_ctrlr_start(ctrlr, true);
 	else
 		nvme_ctrlr_fail(ctrlr);
 
@@ -946,7 +954,7 @@
 }
 
 /*
- * Poll the single-vector intertrupt case: num_io_queues will be 1 and
+ * Poll the single-vector interrupt case: num_io_queues will be 1 and
  * there's only a single vector. While we're polling, we mask further
  * interrupts in the controller.
  */
@@ -1012,7 +1020,7 @@
 		if (is_user_buffer) {
 			/*
 			 * Ensure the user buffer is wired for the duration of
-			 * this passthrough command.
+			 * this pass-through command.
 			 */
 			PHOLD(curproc);
 			buf = uma_zalloc(pbuf_zone, M_WAITOK);
@@ -1031,7 +1039,7 @@
 	} else
 		req = nvme_allocate_request_null(nvme_pt_done, pt);
 
-	/* Assume userspace already converted to little-endian */
+	/* Assume user space already converted to little-endian */
 	req->cmd.opc = pt->cmd.opc;
 	req->cmd.fuse = pt->cmd.fuse;
 	req->cmd.rsvd2 = pt->cmd.rsvd2;
@@ -1206,7 +1214,7 @@
 
 	if (ctrlr->is_initialized) {
 		if (!gone)
-			nvme_ctrlr_destroy_qpairs(ctrlr);
+			nvme_ctrlr_delete_qpairs(ctrlr);
 		for (i = 0; i < ctrlr->num_io_queues; i++)
 			nvme_io_qpair_destroy(&ctrlr->ioq[i]);
 		free(ctrlr->ioq, M_NVME);
@@ -1305,4 +1313,89 @@
 {
 
 	return (&ctrlr->cdata);
+}
+
+int
+nvme_ctrlr_suspend(struct nvme_controller *ctrlr)
+{
+	int to = hz;
+
+	/*
+	 * Can't touch failed controllers, so it's already suspended.
+	 */
+	if (ctrlr->is_failed)
+		return (0);
+
+	/*
+	 * We don't want the reset taskqueue running, since it does similar
+	 * things, so prevent it from running after we start. Wait for any reset
+	 * that may have been started to complete. The reset process we follow
+	 * will ensure that any new I/O will queue and be given to the hardware
+	 * after we resume (though there should be none).
+	 */
+	while (atomic_cmpset_32(&ctrlr->is_resetting, 0, 1) == 0 && to-- > 0)
+		pause("nvmesusp", 1);
+	/* 'to' goes negative only on timeout; 0 is a last-try success. */
+	if (to < 0) {
+		nvme_printf(ctrlr,
+		    "Competing reset task didn't finish. Try again later.\n");
+		return (EWOULDBLOCK);
+	}
+
+	/*
+	 * Per Section 7.6.2 of NVMe spec 1.4, to properly suspend, we need to
+	 * delete the hardware I/O queues, and then shutdown. This properly
+	 * flushes any metadata the drive may have stored so it can survive
+	 * having its power removed and prevents the unsafe shutdown count from
+	 * incrementing. Once we delete the qpairs, we have to disable them
+	 * before shutting down. The delay is out of paranoia in
+	 * nvme_ctrlr_hw_reset, and is repeated here (though we should have no
+	 * pending I/O that the delay copes with).
+	 */
+	nvme_ctrlr_delete_qpairs(ctrlr);
+	nvme_ctrlr_disable_qpairs(ctrlr);
+	DELAY(100*1000);
+	nvme_ctrlr_shutdown(ctrlr);
+
+	return (0);
+}
+
+int
+nvme_ctrlr_resume(struct nvme_controller *ctrlr)
+{
+
+	/*
+	 * Can't touch failed controllers, so nothing to do to resume.
+	 */
+	if (ctrlr->is_failed)
+		return (0);
+
+	/*
+	 * Have to reset the hardware twice, just like we do on attach. See
+	 * nvme_attach() for why.
+	 */
+	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
+		goto fail;
+	if (nvme_ctrlr_hw_reset(ctrlr) != 0)
+		goto fail;
+
+	/*
+	 * Now that we've reset the hardware, we can restart the controller. Any
+	 * I/O that was pending is requeued. Any admin commands are aborted with
+	 * an error. Once we've restarted, take the controller out of reset.
+	 */
+	nvme_ctrlr_start(ctrlr, true);
+	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
+
+	return (0);
+fail:
+	/*
+	 * Since we can't bring the controller out of reset, announce and fail
+	 * the controller. However, we have to return success for the resume
+	 * itself, due to questionable APIs.
+	 */
+	nvme_printf(ctrlr, "Failed to reset on resume, failing.\n");
+	nvme_ctrlr_fail(ctrlr);
+	atomic_cmpset_32(&ctrlr->is_resetting, 1, 0);
+	return (0);
 }
Index: head/sys/dev/nvme/nvme_pci.c
===================================================================
--- head/sys/dev/nvme/nvme_pci.c
+++ head/sys/dev/nvme/nvme_pci.c
@@ -43,6 +43,8 @@
 static int nvme_pci_probe(device_t);
 static int nvme_pci_attach(device_t);
 static int nvme_pci_detach(device_t);
+static int nvme_pci_suspend(device_t);
+static int nvme_pci_resume(device_t);
 
 static void nvme_ctrlr_setup_interrupts(struct nvme_controller *ctrlr);
 
@@ -51,6 +53,8 @@
 	DEVMETHOD(device_probe, nvme_pci_probe),
 	DEVMETHOD(device_attach, nvme_pci_attach),
 	DEVMETHOD(device_detach, nvme_pci_detach),
+	DEVMETHOD(device_suspend, nvme_pci_suspend),
+	DEVMETHOD(device_resume, nvme_pci_resume),
 	DEVMETHOD(device_shutdown, nvme_shutdown),
 	{ 0, 0 }
 };
@@ -331,4 +335,22 @@
 	}
 
 	ctrlr->msix_enabled = 1;
+}
+
+static int
+nvme_pci_suspend(device_t dev)
+{
+	struct nvme_controller *ctrlr;
+
+	ctrlr = DEVICE2SOFTC(dev);
+	return (nvme_ctrlr_suspend(ctrlr));
+}
+
+static int
+nvme_pci_resume(device_t dev)
+{
+	struct nvme_controller *ctrlr;
+
+	ctrlr = DEVICE2SOFTC(dev);
+	return (nvme_ctrlr_resume(ctrlr));
 }
Index: head/sys/dev/nvme/nvme_private.h
===================================================================
--- head/sys/dev/nvme/nvme_private.h
+++ head/sys/dev/nvme/nvme_private.h
@@ -556,4 +556,7 @@
 void nvme_ctrlr_intx_handler(void *arg);
 void nvme_ctrlr_poll(struct nvme_controller *ctrlr);
 
+int nvme_ctrlr_suspend(struct nvme_controller *ctrlr);
+int nvme_ctrlr_resume(struct nvme_controller *ctrlr);
+
 #endif /* __NVME_PRIVATE_H__ */