diff --git a/sys/dev/nvme/nvme_ctrlr.c b/sys/dev/nvme/nvme_ctrlr.c --- a/sys/dev/nvme/nvme_ctrlr.c +++ b/sys/dev/nvme/nvme_ctrlr.c @@ -418,6 +418,34 @@ } } +static void +nvme_pre_reset(struct nvme_controller *ctrlr) +{ + /* + * Make sure that all the ISRs are done before proceeding with the reset + * (and also keep any stray interrupts that happen during this process + * from racing this process). For startup, this is a nop, since the + * hardware is in a good state. But for recovery, where we randomly + * reset the hardware, this ensure that we're not racing the ISRs. + */ + mtx_lock(&ctrlr->adminq.recovery); + for (int i = 0; i < ctrlr->num_io_queues; i++) { + mtx_lock(&ctrlr->ioq[i].recovery); + } +} + +static void +nvme_post_reset(struct nvme_controller *ctrlr) +{ + /* + * Reset complete, unblock ISRs + */ + mtx_unlock(&ctrlr->adminq.recovery); + for (int i = 0; i < ctrlr->num_io_queues; i++) { + mtx_unlock(&ctrlr->ioq[i].recovery); + } +} + static int nvme_ctrlr_hw_reset(struct nvme_controller *ctrlr) { @@ -429,9 +457,11 @@ err = nvme_ctrlr_disable(ctrlr); if (err != 0) - return err; + goto out; err = nvme_ctrlr_enable(ctrlr); +out: + TSEXIT(); return (err); } @@ -1159,6 +1189,11 @@ TSENTER(); + /* + * Don't call pre/post reset here. We've not yet created the qpairs, + * haven't setup the ISRs, so there's no need to 'drain' them or + * 'exclude' them. + */ if (nvme_ctrlr_hw_reset(ctrlr) != 0) { fail: nvme_ctrlr_fail(ctrlr); @@ -1203,16 +1238,9 @@ int status; nvme_ctrlr_devctl_log(ctrlr, "RESET", "resetting controller"); + nvme_pre_reset(ctrlr); status = nvme_ctrlr_hw_reset(ctrlr); - /* - * Use pause instead of DELAY, so that we yield to any nvme interrupt - * handlers on this CPU that were blocked on a qpair lock. We want - * all nvme interrupts completed before proceeding with restarting the - * controller. - * - * XXX - any way to guarantee the interrupt handlers have quiesced? - */ - pause("nvmereset", hz / 10); + nvme_post_reset(ctrlr); if (status == 0) nvme_ctrlr_start(ctrlr, true); else @@ -1699,6 +1727,7 @@ if (ctrlr->is_failed) return (0); + nvme_pre_reset(ctrlr); if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; #ifdef NVME_2X_RESET @@ -1710,6 +1739,7 @@ if (nvme_ctrlr_hw_reset(ctrlr) != 0) goto fail; #endif + nvme_post_reset(ctrlr); /* * Now that we've reset the hardware, we can restart the controller. Any @@ -1726,6 +1756,7 @@ * the controller. However, we have to return success for the resume * itself, due to questionable APIs. */ + nvme_pre_reset(ctrlr); nvme_printf(ctrlr, "Failed to reset on resume, failing.\n"); nvme_ctrlr_fail(ctrlr); (void)atomic_cmpset_32(&ctrlr->is_resetting, 1, 0); diff --git a/sys/dev/nvme/nvme_private.h b/sys/dev/nvme/nvme_private.h --- a/sys/dev/nvme/nvme_private.h +++ b/sys/dev/nvme/nvme_private.h @@ -164,10 +164,9 @@ struct resource *res; void *tag; - struct callout timer; - sbintime_t deadline; - bool timer_armed; - enum nvme_recovery recovery_state; + struct callout timer; /* recovery lock */ + bool timer_armed; /* recovery lock */ + enum nvme_recovery recovery_state; /* recovery lock */ uint32_t num_entries; uint32_t num_trackers; @@ -184,6 +183,7 @@ int64_t num_retries; int64_t num_failures; int64_t num_ignored; + int64_t num_recovery_nolock; struct nvme_command *cmd; struct nvme_completion *cpl; @@ -202,7 +202,7 @@ struct nvme_tracker **act_tr; struct mtx_padalign lock; - + struct mtx_padalign recovery; } __aligned(CACHE_LINE_SIZE); struct nvme_namespace { diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c --- a/sys/dev/nvme/nvme_qpair.c +++ b/sys/dev/nvme/nvme_qpair.c @@ -530,14 +530,17 @@ nvme_free_request(req); } -bool -nvme_qpair_process_completions(struct nvme_qpair *qpair) +/* Locked version of completion processor */ +static bool +_nvme_qpair_process_completions(struct nvme_qpair *qpair) { struct nvme_tracker *tr; struct nvme_completion cpl; - int done = 0; + bool done = false; bool in_panic = dumping || SCHEDULER_STOPPED(); + mtx_assert(&qpair->recovery, MA_OWNED); + /* * qpair is not enabled, likely because a controller reset is in * progress. Ignore the interrupt - any I/O that was associated with @@ -547,6 +550,7 @@ * to the device. */ if (qpair->recovery_state != RECOVERY_NONE) { + qpair->num_ignored++; return (false); } @@ -631,7 +635,7 @@ else tr = NULL; - done++; + done = true; if (tr != NULL) { nvme_qpair_complete_tracker(tr, &cpl, ERROR_PRINT_ALL); qpair->sq_head = cpl.sqhd; @@ -668,12 +672,35 @@ } } - if (done != 0) { + if (done) { bus_space_write_4(qpair->ctrlr->bus_tag, qpair->ctrlr->bus_handle, qpair->cq_hdbl_off, qpair->cq_head); } - return (done != 0); + return (done); +} + +bool +nvme_qpair_process_completions(struct nvme_qpair *qpair) +{ + bool done; + + /* + * Interlock with reset / recovery code. This is an usually uncontended + * to make sure that we drain out of the ISRs before we reset the card + * and to prevent races with the recovery process called from a timeout + * context. + */ + if (!mtx_trylock(&qpair->recovery)) { + qpair->num_recovery_nolock++; + return (false); + } + + done = _nvme_qpair_process_completions(qpair); + + mtx_unlock(&qpair->recovery); + + return (done); } static void @@ -701,6 +728,7 @@ qpair->ctrlr = ctrlr; mtx_init(&qpair->lock, "nvme qpair lock", NULL, MTX_DEF); + mtx_init(&qpair->recovery, "nvme qpair recovery", NULL, MTX_DEF); /* Note: NVMe PRP format is restricted to 4-byte alignment. */ err = bus_dma_tag_create(bus_get_dma_tag(ctrlr->dev), @@ -767,7 +795,7 @@ qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; - callout_init(&qpair->timer, 1); + callout_init_mtx(&qpair->timer, &qpair->recovery, 0); qpair->timer_armed = false; qpair->recovery_state = RECOVERY_WAITING; @@ -905,6 +933,8 @@ if (mtx_initialized(&qpair->lock)) mtx_destroy(&qpair->lock); + if (mtx_initialized(&qpair->recovery)) + mtx_destroy(&qpair->recovery); if (qpair->res) { bus_release_resource(qpair->ctrlr->dev, SYS_RES_IRQ, @@ -977,14 +1007,12 @@ struct nvme_controller *ctrlr = qpair->ctrlr; struct nvme_tracker *tr; sbintime_t now; - bool idle; + bool idle = false; bool needs_reset; uint32_t csts; uint8_t cfs; - - mtx_lock(&qpair->lock); - idle = TAILQ_EMPTY(&qpair->outstanding_tr); + mtx_assert(&qpair->recovery, MA_OWNED); switch (qpair->recovery_state) { case RECOVERY_NONE: @@ -1005,25 +1033,10 @@ goto do_reset; /* - * Next, check to see if we have any completions. If we do, - * we've likely missed an interrupt, but the card is otherwise - * fine. This will also catch all the commands that are about - * to timeout (but there's still a tiny race). Since the timeout - * is long relative to the race between here and the check below, - * this is still a win. + * Process completions. We already have the recovery lock, so + * call the locked version. */ - mtx_unlock(&qpair->lock); - nvme_qpair_process_completions(qpair); - mtx_lock(&qpair->lock); - if (qpair->recovery_state != RECOVERY_NONE) { - /* - * Somebody else adjusted recovery state while unlocked, - * we should bail. Unlock the qpair and return without - * doing anything else. - */ - mtx_unlock(&qpair->lock); - return; - } + _nvme_qpair_process_completions(qpair); /* * Check to see if we need to timeout any commands. If we do, then @@ -1031,7 +1044,13 @@ */ now = getsbinuptime(); needs_reset = false; + idle = true; + mtx_lock(&qpair->lock); TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) { + /* + * Skip async commands, they are posted to the card for + * an indefinite amount of time and have no deadline. + */ if (tr->deadline == SBT_MAX) continue; if (now > tr->deadline) { @@ -1057,6 +1076,7 @@ idle = false; } } + mtx_unlock(&qpair->lock); if (!needs_reset) break; @@ -1078,6 +1098,7 @@ break; case RECOVERY_WAITING: nvme_printf(ctrlr, "waiting for reset to complete\n"); + idle = false; /* We want to keep polling */ break; } @@ -1089,7 +1110,6 @@ } else { qpair->timer_armed = false; } - mtx_unlock(&qpair->lock); } /* @@ -1198,9 +1218,12 @@ if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) { /* - * No tracker is available, or the qpair is disabled due to - * an in-progress controller-level reset or controller - * failure. + * No tracker is available, or the qpair is disabled due to an + * in-progress controller-level reset or controller failure. If + * we lose the race with recovery_state, then we may add an + * extra request to the queue which will be resubmitted later. + * We only set recovery_state to NONE with qpair->lock also + * held. */ if (qpair->ctrlr->is_failed) { @@ -1262,7 +1285,10 @@ static void nvme_qpair_enable(struct nvme_qpair *qpair) { - mtx_assert(&qpair->lock, MA_OWNED); + if (mtx_initialized(&qpair->recovery)) + mtx_assert(&qpair->recovery, MA_OWNED); + if (mtx_initialized(&qpair->lock)) + mtx_assert(&qpair->lock, MA_OWNED); qpair->recovery_state = RECOVERY_NONE; } @@ -1313,9 +1339,11 @@ nvme_printf(qpair->ctrlr, "done aborting outstanding admin\n"); + mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); mtx_unlock(&qpair->lock); + mtx_unlock(&qpair->recovery); } void @@ -1342,8 +1370,8 @@ if (report) nvme_printf(qpair->ctrlr, "done aborting outstanding i/o\n"); + mtx_lock(&qpair->recovery); mtx_lock(&qpair->lock); - nvme_qpair_enable(qpair); STAILQ_INIT(&temp); @@ -1362,6 +1390,7 @@ nvme_printf(qpair->ctrlr, "done resubmitting i/o\n"); mtx_unlock(&qpair->lock); + mtx_unlock(&qpair->recovery); } static void @@ -1369,27 +1398,40 @@ { struct nvme_tracker *tr, *tr_temp; - mtx_lock(&qpair->lock); + if (mtx_initialized(&qpair->recovery)) + mtx_assert(&qpair->recovery, MA_OWNED); + if (mtx_initialized(&qpair->lock)) + mtx_assert(&qpair->lock, MA_OWNED); + qpair->recovery_state = RECOVERY_WAITING; TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { tr->deadline = SBT_MAX; } - mtx_unlock(&qpair->lock); } void nvme_admin_qpair_disable(struct nvme_qpair *qpair) { + mtx_lock(&qpair->recovery); + mtx_lock(&qpair->lock); nvme_qpair_disable(qpair); nvme_admin_qpair_abort_aers(qpair); + + mtx_unlock(&qpair->lock); + mtx_unlock(&qpair->recovery); } void nvme_io_qpair_disable(struct nvme_qpair *qpair) { + mtx_lock(&qpair->recovery); + mtx_lock(&qpair->lock); nvme_qpair_disable(qpair); + + mtx_unlock(&qpair->lock); + mtx_unlock(&qpair->recovery); } void diff --git a/sys/dev/nvme/nvme_sysctl.c b/sys/dev/nvme/nvme_sysctl.c --- a/sys/dev/nvme/nvme_sysctl.c +++ b/sys/dev/nvme/nvme_sysctl.c @@ -165,6 +165,7 @@ qpair->num_retries = 0; qpair->num_failures = 0; qpair->num_ignored = 0; + qpair->num_recovery_nolock = 0; } static int @@ -242,6 +243,21 @@ return (sysctl_handle_64(oidp, &num_ignored, 0, req)); } +static int +nvme_sysctl_num_recovery_nolock(SYSCTL_HANDLER_ARGS) +{ + struct nvme_controller *ctrlr = arg1; + int64_t num; + int i; + + num = ctrlr->adminq.num_recovery_nolock; + + for (i = 0; i < ctrlr->num_io_queues; i++) + num += ctrlr->ioq[i].num_recovery_nolock; + + return (sysctl_handle_64(oidp, &num, 0, req)); +} + static int nvme_sysctl_reset_stats(SYSCTL_HANDLER_ARGS) { @@ -300,6 +316,9 @@ SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_ignored", CTLFLAG_RD, &qpair->num_ignored, "Number of interrupts posted, but were administratively ignored"); + SYSCTL_ADD_QUAD(ctrlr_ctx, que_list, OID_AUTO, "num_recovery_nolock", + CTLFLAG_RD, &qpair->num_recovery_nolock, + "Number of times that we failed to lock recovery in the ISR"); SYSCTL_ADD_PROC(ctrlr_ctx, que_list, OID_AUTO, "dump_debug", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, @@ -368,6 +387,11 @@ ctrlr, 0, nvme_sysctl_num_ignored, "IU", "Number of interrupts ignored administratively"); + SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, + "num_recovery_nolock", CTLTYPE_S64 | CTLFLAG_RD | CTLFLAG_MPSAFE, + ctrlr, 0, nvme_sysctl_num_recovery_nolock, "IU", + "Number of times that we failed to lock recovery in the ISR"); + SYSCTL_ADD_PROC(ctrlr_ctx, ctrlr_list, OID_AUTO, "reset_stats", CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_MPSAFE, ctrlr, 0, nvme_sysctl_reset_stats, "IU", "Reset statistics to zero");