Index: sys/dev/nvme/nvme_ctrlr.c =================================================================== --- sys/dev/nvme/nvme_ctrlr.c +++ sys/dev/nvme/nvme_ctrlr.c @@ -232,7 +232,8 @@ mtx_lock(&ctrlr->lock); STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq); mtx_unlock(&ctrlr->lock); - taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); + if (!ctrlr->is_dying) + taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task); } static void @@ -435,7 +436,8 @@ */ return; - taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); + if (!ctrlr->is_dying) + taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task); } static int @@ -1481,6 +1483,8 @@ { int gone, i; + ctrlr->is_dying = true; + if (ctrlr->resource == NULL) goto nores; if (!mtx_initialized(&ctrlr->adminq.lock)) Index: sys/dev/nvme/nvme_private.h =================================================================== --- sys/dev/nvme/nvme_private.h +++ sys/dev/nvme/nvme_private.h @@ -151,7 +151,7 @@ TAILQ_ENTRY(nvme_tracker) tailq; struct nvme_request *req; struct nvme_qpair *qpair; - struct callout timer; + sbintime_t deadline; bus_dmamap_t payload_dma_map; uint16_t cid; @@ -159,6 +159,12 @@ bus_addr_t prp_bus_addr; }; +enum nvme_recovery { + RECOVERY_NONE = 0, /* Normal operations */ + RECOVERY_START, /* Deadline has passed, start recovering */ + RECOVERY_RESET, /* This pass, initiate reset of controller */ + RECOVERY_WAITING, /* waiting for the reset to complete */ +}; struct nvme_qpair { struct nvme_controller *ctrlr; uint32_t id; @@ -170,6 +176,11 @@ struct resource *res; void *tag; + struct callout timer; + sbintime_t deadline; + bool timer_armed; + enum nvme_recovery recovery_state; + uint32_t num_entries; uint32_t num_trackers; uint32_t sq_tdbl_off; @@ -201,8 +212,6 @@ struct nvme_tracker **act_tr; - bool is_enabled; - struct mtx lock __aligned(CACHE_LINE_SIZE); } __aligned(CACHE_LINE_SIZE); @@ -305,6 +314,7 @@ uint32_t notification_sent; bool is_failed; + bool is_dying; STAILQ_HEAD(, nvme_request) fail_req; /* Host Memory Buffer */ Index: sys/dev/nvme/nvme_qpair.c =================================================================== --- sys/dev/nvme/nvme_qpair.c +++ sys/dev/nvme/nvme_qpair.c @@ -452,7 +452,6 @@ } mtx_lock(&qpair->lock); - callout_stop(&tr->timer); if (retry) { req->retries++; @@ -544,7 +543,7 @@ * progress. Ignore the interrupt - any I/O that was associated with * this interrupt will get retried when the reset is complete. */ - if (!qpair->is_enabled) + if (qpair->recovery_state != RECOVERY_NONE) return (false); bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, @@ -739,6 +738,10 @@ qpair->cpl_bus_addr = queuemem_phys + cmdsz; prpmem_phys = queuemem_phys + cmdsz + cplsz; + callout_init(&qpair->timer, 1); + qpair->timer_armed = false; + qpair->recovery_state = RECOVERY_NONE; + /* * Calcuate the stride of the doorbell register. Many emulators set this * value to correspond to a cache line. However, some hardware has set @@ -776,7 +779,6 @@ DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK); bus_dmamap_create(qpair->dma_tag_payload, 0, &tr->payload_dma_map); - callout_init(&tr->timer, 1); tr->cid = i; tr->qpair = qpair; tr->prp = (uint64_t *)prp_list; @@ -835,6 +837,8 @@ { struct nvme_tracker *tr; + callout_drain(&qpair->timer); + if (qpair->tag) { bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag); qpair->tag = NULL; @@ -914,65 +918,102 @@ } static void -nvme_abort_complete(void *arg, const struct nvme_completion *status) +nvme_qpair_timeout(void *arg) { - struct nvme_tracker *tr = arg; + struct nvme_qpair *qpair = arg; + struct nvme_controller *ctrlr = qpair->ctrlr; + struct nvme_tracker *tr; + struct nvme_tracker *tr_temp; + sbintime_t now; + bool idle; + uint32_t csts; + uint8_t cfs; - /* - * If cdw0 == 1, the controller was not able to abort the command - * we requested. We still need to check the active tracker array, - * to cover race where I/O timed out at same time controller was - * completing the I/O. - */ - if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) { + mtx_lock(&qpair->lock); + idle = TAILQ_EMPTY(&qpair->outstanding_tr); +again: + switch (qpair->recovery_state) { + case RECOVERY_NONE: + if (idle) + break; + now = getsbinuptime(); + TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { + if (now > tr->deadline && tr->deadline != 0) { + /* + * We're now passed our earliest deadline. We + * need to do expensive things to cope, but next + * time. Flag that and close the door to any + * further processing. + */ + qpair->recovery_state = RECOVERY_START; + nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd\n", + (uintmax_t)now, (uintmax_t)tr->deadline); + break; + } + } + break; + case RECOVERY_START: /* - * An I/O has timed out, and the controller was unable to - * abort it for some reason. Construct a fake completion - * status, and then complete the I/O's tracker manually. + * Read csts to get value of cfs - controller fatal status. + * If no fatal status, try to call the completion routine, and + * if completes transactions, report a missed interrupt and + * return (this may need to be rate limited). Otherwise, if + * aborts are enabled and the controller is not reporting + * fatal status, abort the command. Otherwise, just reset the + * controller and hope for the best. */ - nvme_printf(tr->qpair->ctrlr, - "abort command failed, aborting command manually\n"); - nvme_qpair_manual_complete_tracker(tr, - NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); + csts = nvme_mmio_read_4(ctrlr, csts); + cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; + if (cfs) { + nvme_printf(ctrlr, "Controller in fatal status, resetting\n"); + qpair->recovery_state = RECOVERY_RESET; + goto again; + } + mtx_unlock(&qpair->lock); + if (nvme_qpair_process_completions(qpair)) { + nvme_printf(ctrlr, "Missing interrupt\n"); + qpair->recovery_state = RECOVERY_NONE; + // XXX do I need a NOP here to not race? + } else { + nvme_printf(ctrlr, "missed interrupt with nothing complete\n"); + qpair->recovery_state = RECOVERY_RESET; + mtx_lock(&qpair->lock); + goto again; + } + mtx_lock(&qpair->lock); + break; + case RECOVERY_RESET: + /* + * If we get here due to a possible surprise hot-unplug event, + * then we let nvme_ctrlr_reset confirm and fail the + * controller. + */ + nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", + cfs ? " and fatal error status" : ""); + nvme_printf(ctrlr, "RECOVERY_WAITING\n"); + qpair->recovery_state = RECOVERY_WAITING; + nvme_ctrlr_reset(ctrlr); + break; + case RECOVERY_WAITING: + nvme_printf(ctrlr, "waiting\n"); + break; } -} - -static void -nvme_timeout(void *arg) -{ - struct nvme_tracker *tr = arg; - struct nvme_qpair *qpair = tr->qpair; - struct nvme_controller *ctrlr = qpair->ctrlr; - uint32_t csts; - uint8_t cfs; /* - * Read csts to get value of cfs - controller fatal status. - * If no fatal status, try to call the completion routine, and - * if completes transactions, report a missed interrupt and - * return (this may need to be rate limited). Otherwise, if - * aborts are enabled and the controller is not reporting - * fatal status, abort the command. Otherwise, just reset the - * controller and hope for the best. + * Rearm the timeout. */ - csts = nvme_mmio_read_4(ctrlr, csts); - cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; - if (cfs == 0 && nvme_qpair_process_completions(qpair)) { - nvme_printf(ctrlr, "Missing interrupt\n"); - return; - } - if (ctrlr->enable_aborts && cfs == 0) { - nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); - nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, - nvme_abort_complete, tr); + if (!idle) { + callout_schedule(&qpair->timer, hz / 2); } else { - nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", - (csts == NVME_GONE) ? " and possible hot unplug" : - (cfs ? " and fatal error status" : "")); - nvme_ctrlr_reset(ctrlr); + qpair->timer_armed = false; } + mtx_unlock(&qpair->lock); } +/* + * Submit the tracker to the hardware. Must already be in the + * outstanding queue when called. + */ void nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) { @@ -989,12 +1030,17 @@ if (req->timeout) { if (req->cb_fn == nvme_completion_poll_cb) - timeout = hz; + timeout = 1; else - timeout = ctrlr->timeout_period * hz; - callout_reset_on(&tr->timer, timeout, nvme_timeout, tr, - qpair->cpu); - } + timeout = ctrlr->timeout_period; + tr->deadline = getsbinuptime() + timeout * SBT_1S; + if (!qpair->timer_armed) { + qpair->timer_armed = true; + callout_reset_on(&qpair->timer, hz / 2, + nvme_qpair_timeout, qpair, qpair->cpu); + } + } else + tr->deadline = SBT_MAX; /* Copy the command from the tracker to the submission queue. */ memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd)); @@ -1069,7 +1115,7 @@ tr = TAILQ_FIRST(&qpair->free_tr); req->qpair = qpair; - if (tr == NULL || !qpair->is_enabled) { + if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) { /* * No tracker is available, or the qpair is disabled due to * an in-progress controller-level reset or controller @@ -1096,6 +1142,8 @@ TAILQ_REMOVE(&qpair->free_tr, tr, tailq); TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq); + if (!qpair->timer_armed) + tr->deadline = SBT_MAX; tr->req = req; switch (req->type) { @@ -1164,8 +1212,9 @@ static void nvme_qpair_enable(struct nvme_qpair *qpair) { + mtx_assert(&qpair->lock, MA_OWNED); - qpair->is_enabled = true; + qpair->recovery_state = RECOVERY_NONE; } void @@ -1208,7 +1257,9 @@ NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL); } + mtx_lock(&qpair->lock); nvme_qpair_enable(qpair); + mtx_unlock(&qpair->lock); } void @@ -1251,12 +1302,13 @@ static void nvme_qpair_disable(struct nvme_qpair *qpair) { - struct nvme_tracker *tr; + struct nvme_tracker *tr, *tr_temp; - qpair->is_enabled = false; mtx_lock(&qpair->lock); - TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) - callout_stop(&tr->timer); + qpair->recovery_state = RECOVERY_WAITING; + TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) { + tr->deadline = SBT_MAX; + } mtx_unlock(&qpair->lock); }