diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c
--- a/sys/dev/nvme/nvme_ctrlr.c
+++ b/sys/dev/nvme/nvme_ctrlr.c
@@ -425,13 +425,35 @@
 
 	TSENTER();
 
+	/*
+	 * Make sure that all the ISRs are done before proceeding with the reset
+	 * (and also keep any stray interrupts that happen during this process
+	 * from racing this process). For startup, this is a nop, since the
+	 * hardware is in a good state. But for recovery, where we randomly
+	 * reset the hardware, this ensures that we're not racing the ISRs.
+	 */
+	mtx_lock_spin(&ctrlr->adminq.isr_spin);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_lock_spin(&ctrlr->ioq[i].isr_spin);
+	}
+
 	nvme_ctrlr_disable_qpairs(ctrlr);
 
 	err = nvme_ctrlr_disable(ctrlr);
 	if (err != 0)
-		return err;
+		goto out;
 
 	err = nvme_ctrlr_enable(ctrlr);
+out:
+
+	/*
+	 * Reset complete, unblock ISRs
+	 */
+	mtx_unlock_spin(&ctrlr->adminq.isr_spin);
+	for (int i = 0; i < ctrlr->num_io_queues; i++) {
+		mtx_unlock_spin(&ctrlr->ioq[i].isr_spin);
+	}
+
 	TSEXIT();
 	return (err);
 }
diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h
--- a/sys/dev/nvme/nvme_private.h
+++ b/sys/dev/nvme/nvme_private.h
@@ -184,6 +184,7 @@
 	int64_t			num_retries;
 	int64_t			num_failures;
 	int64_t			num_ignored;
+	int64_t			num_spinlock_fail;
 
 	struct nvme_command	*cmd;
 	struct nvme_completion	*cpl;
@@ -201,8 +202,8 @@
 
 	struct nvme_tracker	**act_tr;
 
-	struct mtx		lock __aligned(CACHE_LINE_SIZE);
-
+	struct mtx_padalign	isr_spin;
+	struct mtx_padalign	lock;
 } __aligned(CACHE_LINE_SIZE);
 
 struct nvme_namespace {
diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -538,6 +538,17 @@
 	int done = 0;
 	bool in_panic = dumping || SCHEDULER_STOPPED();
 
+	/*
+	 * Interlock with reset / recovery code. This is a usually uncontended
+	 * spinlock to make sure that we drain out of the ISRs before we reset
+	 * the card and to prevent races with the recovery process called from
+	 * a timeout context.
+	 */
+	if (!mtx_trylock_spin(&qpair->isr_spin)) {
+		qpair->num_spinlock_fail++;
+		return (false);
+	}
+
 	/*
 	 * qpair is not enabled, likely because a controller reset is in
 	 * progress. Ignore the interrupt - any I/O that was associated with
@@ -548,6 +559,7 @@
 	 */
 	if (qpair->recovery_state != RECOVERY_NONE) {
 		qpair->num_ignored++;
+		mtx_unlock_spin(&qpair->isr_spin);
 		return (false);
 	}
 
@@ -673,6 +685,7 @@
 		    qpair->cq_hdbl_off, qpair->cq_head);
 	}
 
+	mtx_unlock_spin(&qpair->isr_spin);
 	return (done != 0);
 }
 
@@ -701,6 +714,7 @@
 	qpair->ctrlr = ctrlr;
 
 	mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF);
+	mtx_init(&qpair->isr_spin, "nvme qpair ISR spinlock", NULL, MTX_SPIN);
 
 	/* Note: NVMe PRP format is restricted to 4-byte alignment. */
 	err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev),
@@ -905,6 +919,8 @@
 
 	if (mtx_initialized(&qpair->lock))
 		mtx_destroy(&qpair->lock);
+	if (mtx_initialized(&qpair->isr_spin))
+		mtx_destroy(&qpair->isr_spin);
 
 	if (qpair->res) {
 		bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ,
@@ -1007,10 +1023,14 @@
 	/*
 	 * Next, check to see if we have any completions. If we do,
 	 * we've likely missed an interrupt, but the card is otherwise
-	 * fine. This will also catch all the commands that are about
-	 * to timeout (but there's still a tiny race). Since the timeout
-	 * is long relative to the race between here and the check below,
+	 * fine. This will also catch all the commands that are about to
+	 * timeout (but there's still a tiny race). Since the timeout is
+	 * long relative to the race between here and the check below,
 	 * this is still a win.
+	 *
+	 * It will also catch misconfigured interrupts. In addition, the
+	 * isr_spin lock taken out in nvme_qpair_process_completions
+	 * protects against races here, no matter how small.
 	 */
 	mtx_unlock(&qpair->lock);
 	nvme_qpair_process_completions(qpair);
@@ -1019,7 +1039,8 @@
 		/*
 		 * Somebody else adjusted recovery state while unlocked,
 		 * we should bail. Unlock the qpair and return without
-		 * doing anything else.
+		 * doing anything else. That somebody else has stepped
+		 * the state machine into the next stage.
 		 */
 		mtx_unlock(&qpair->lock);
 		return;
@@ -1032,6 +1053,10 @@
 	now = getsbinuptime();
 	needs_reset = false;
 	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+		/*
+		 * Skip async commands; they are posted to the card for
+		 * an indefinite amount of time and have no deadline.
+		 */
 		if (tr->deadline == SBT_MAX)
 			continue;
 		if (now > tr->deadline) {
diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c
--- a/sys/dev/nvme/nvme_sysctl.c
+++ b/sys/dev/nvme/nvme_sysctl.c
@@ -165,6 +165,7 @@
 	qpair->num_retries = 0;
 	qpair->num_failures = 0;
 	qpair->num_ignored = 0;
+	qpair->num_spinlock_fail = 0;
 }
 
 static int
@@ -242,6 +243,21 @@
 	return (sysctl_handle_64(oidp, &num_ignored, 0, req));
 }
 
+static int
+nvme_sysctl_num_spinlock_fail(SYSCTL_HANDLER_ARGS)
+{
+	struct nvme_controller *ctrlr = arg1;
+	int64_t num_spinfail = 0;
+	int i;
+
+	num_spinfail = ctrlr->adminq.num_spinlock_fail;
+
+	for (i = 0; i < ctrlr->num_io_queues; i++)
+		num_spinfail += ctrlr->ioq[i].num_spinlock_fail;
+
+	return (sysctl_handle_64(oidp, &num_spinfail, 0, req));
+}
+
 static int
 nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS)
 {
@@ -300,6 +316,9 @@
 	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored",
 	    CTLFLAG_RD, &qpair->num_ignored,
 	    "Number of interrupts posted, but were administratively ignored");
+	SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_spinlock_fail",
+	    CTLFLAG_RD, &qpair->num_spinlock_fail,
+	    "Number of times that we can't get a spin lock in the ISR");
 
 	SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO, "dump_debug",
 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE,
@@ -368,6 +387,11 @@
 	    ctrlr, 0, nvme_sysctl_num_ignored, "IU",
 	    "Number of interrupts ignored administratively");
 
+	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO,
+	    "num_spinlock_fail", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE,
+	    ctrlr, 0, nvme_sysctl_num_spinlock_fail, "IU",
+	    "Number of times that we can't get a spin lock in the ISR");
+
 	SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, "reset_stats",
 	    CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr, 0,
 	    nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");
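
For reference, here is a minimal standalone sketch of the interlock pattern the patch introduces: the interrupt path trylocks a spinlock and bails out (counting the failure) if the reset path currently holds it, while the reset path holds the lock across the whole disable/enable sequence. This is an illustration only, not part of the patch; pthread_spin_trylock stands in for the kernel's mtx_trylock_spin, and isr()/reset() are hypothetical stand-ins for nvme_qpair_process_completions() and nvme_ctrlr_hw_reset().

```c
/* Userspace analogue of the isr_spin interlock; names are illustrative. */
#include <pthread.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static pthread_spinlock_t isr_spin;
static atomic_long num_spinlock_fail;

/* Interrupt path: never block; if a reset holds the lock, count and bail. */
static bool isr(void)
{
	if (pthread_spin_trylock(&isr_spin) != 0) {
		atomic_fetch_add(&num_spinlock_fail, 1);
		return (false);		/* reset in progress, skip this pass */
	}
	/* ... drain the completion queue here ... */
	pthread_spin_unlock(&isr_spin);
	return (true);
}

/* Reset path: hold the lock so no ISR touches the queues mid-reset. */
static void reset(void)
{
	pthread_spin_lock(&isr_spin);
	/* ... disable queues, reset and re-enable the controller ... */
	pthread_spin_unlock(&isr_spin);
}

int main(void)
{
	pthread_spin_init(&isr_spin, PTHREAD_PROCESS_PRIVATE);
	reset();
	printf("isr ran: %d, trylock failures: %ld\n",
	    isr(), atomic_load(&num_spinlock_fail));
	pthread_spin_destroy(&isr_spin);
	return (0);
}
```

The trylock-and-bail choice mirrors the patch's intent: the ISR must never spin (or sleep) waiting for a reset to finish, and a skipped pass is harmless because the post-reset path re-drains the queues; the num_spinlock_fail counter exported via sysctl simply records how often that happens.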