diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -217,12 +217,15 @@ { int i; + /* + * No need to disable queues before failing them. Failing is a superset + * of disabling (though pedantically we'd abort the AERs silently with + * a different error, but when we fail, that hardly matters). + */ ctrlr->is_failed = true; - nvme_admin_qpair_disable(&ctrlr->adminq); nvme_qpair_fail(&ctrlr->adminq); if (ctrlr->ioq != NULL) { for (i = 0; i < ctrlr->num_io_queues; i++) { - nvme_io_qpair_disable(&ctrlr->ioq[i]); nvme_qpair_fail(&ctrlr->ioq[i]); } } @@ -416,34 +419,6 @@ } } -static void -nvme_pre_reset(struct nvme_controller *ctrlr) -{ - /* - * Make sure that all the ISRs are done before proceeding with the reset - * (and also keep any stray interrupts that happen during this process - * from racing this process). For startup, this is a nop, since the - * hardware is in a good state. But for recovery, where we randomly - * reset the hardware, this ensure that we're not racing the ISRs. 
- */ - mtx_lock(&ctrlr->adminq.recovery); - for (int i = 0; i < ctrlr->num_io_queues; i++) { - mtx_lock(&ctrlr->ioq[i].recovery); - } -} - -static void -nvme_post_reset(struct nvme_controller *ctrlr) -{ - /* - * Reset complete, unblock ISRs - */ - mtx_unlock(&ctrlr->adminq.recovery); - for (int i = 0; i < ctrlr->num_io_queues; i++) { - mtx_unlock(&ctrlr->ioq[i].recovery); - } -} - static int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { @@ -1236,9 +1211,7 @@ int status; nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller"); - nvme_pre_reset(ctrlr); status = nvme_ctrlr_hw_reset(ctrlr); - nvme_post_reset(ctrlr); if (status == 0) nvme_ctrlr_start(ctrlr, true); else @@ -1725,19 +1698,8 @@ if (ctrlr->is_failed) return (0); - nvme_pre_reset(ctrlr); if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; -#ifdef NVME_2X_RESET - /* - * Prior to FreeBSD 13.1, FreeBSD's nvme driver reset the hardware twice - * to get it into a known good state. However, the hardware's state is - * good and we don't need to do this for proper functioning. - */ - if (nvme_ctrlr_hw_reset(ctrlr) != 0) - goto fail; -#endif - nvme_post_reset(ctrlr); /* * Now that we've reset the hardware, we can restart the controller. Any diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -150,6 +150,7 @@ enum nvme_recovery { RECOVERY_NONE = 0, /* Normal operations */ RECOVERY_WAITING, /* waiting for the reset to complete */ + RECOVERY_FAILED, /* We have failed, no more I/O */ }; struct nvme_qpair { struct nvme_controller *ctrlr; diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -945,22 +945,38 @@ { struct nvme_tracker *tr; + /* + * nvme_complete_tracker must be called without the qpair lock held. It + * takes the lock to adjust outstanding_tr list, so make sure we don't + * have it yet (since this is a general purpose routine). 
We take the + * lock to make the list traverse safe, but have to drop the lock to + * complete any AER. We restart the list scan when we do this to make + * this safe. There's interlock with the ISR so we know this tracker + * won't be completed twice. + */ + mtx_assert(&qpair->lock, MA_NOTOWNED); + + mtx_lock(&qpair->lock); tr = TAILQ_FIRST(&qpair->outstanding_tr); while (tr != NULL) { if (tr->req->cmd.opc == NVME_OPC_ASYNC_EVENT_REQUEST) { + mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_SQ_DELETION, 0, ERROR_PRINT_NONE); + mtx_lock(&qpair->lock); tr = TAILQ_FIRST(&qpair->outstanding_tr); } else { tr = TAILQ_NEXT(tr, tailq); } } + mtx_unlock(&qpair->lock); } void nvme_admin_qpair_destroy(struct nvme_qpair *qpair) { + mtx_assert(&qpair->lock, MA_NOTOWNED); nvme_admin_qpair_abort_aers(qpair); nvme_qpair_destroy(qpair); @@ -1097,6 +1113,9 @@ nvme_printf(ctrlr, "waiting for reset to complete\n"); idle = false; /* We want to keep polling */ break; + case RECOVERY_FAILED: + idle = true; /* nothing to monitor */ + break; } /* @@ -1286,6 +1305,8 @@ mtx_assert(&qpair->recovery, MA_OWNED); if (mtx_initialized(&qpair->lock)) mtx_assert(&qpair->lock, MA_OWNED); + KASSERT(qpair->recovery_state != RECOVERY_FAILED, + ("Enabling a failed qpair\n")); qpair->recovery_state = RECOVERY_NONE; } @@ -1410,12 +1431,13 @@ nvme_admin_qpair_disable(struct nvme_qpair *qpair) { mtx_lock(&qpair->recovery); - mtx_lock(&qpair->lock); + mtx_lock(&qpair->lock); nvme_qpair_disable(qpair); + mtx_unlock(&qpair->lock); + nvme_admin_qpair_abort_aers(qpair); - mtx_unlock(&qpair->lock); mtx_unlock(&qpair->recovery); } @@ -1440,18 +1462,27 @@ if (!mtx_initialized(&qpair->lock)) return; + mtx_lock(&qpair->recovery); + qpair->recovery_state = RECOVERY_FAILED; + mtx_unlock(&qpair->recovery); + mtx_lock(&qpair->lock); + if (!STAILQ_EMPTY(&qpair->queued_req)) { + nvme_printf(qpair->ctrlr, "failing queued i/o\n"); + } while (!STAILQ_EMPTY(&qpair->queued_req)) 
{ req = STAILQ_FIRST(&qpair->queued_req); STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq); - nvme_printf(qpair->ctrlr, "failing queued i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_request(qpair, req, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST); mtx_lock(&qpair->lock); } + if (!TAILQ_EMPTY(&qpair->outstanding_tr)) { + nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); + } /* Manually abort each outstanding I/O. */ while (!TAILQ_EMPTY(&qpair->outstanding_tr)) { tr = TAILQ_FIRST(&qpair->outstanding_tr); @@ -1459,7 +1490,6 @@ * Do not remove the tracker. The abort_tracker path will * do that for us. */ - nvme_printf(qpair->ctrlr, "failing outstanding i/o\n"); mtx_unlock(&qpair->lock); nvme_qpair_manual_complete_tracker(tr, NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);