Index: sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- sys/dev/nvme/nvme_ctrlr.c
+++ sys/dev/nvme/nvme_ctrlr.c
@@ -232,7 +232,8 @@
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 static void
@@ -433,7 +434,8 @@
 	 */
 		return;
 
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
@@ -1464,6 +1466,8 @@
 {
 	int gone, i;
 
+	ctrlr->is_dying = true;
+
 	if (ctrlr->resource == NULL)
 		goto nores;
 
@@ -1483,6 +1487,9 @@
 	if (ctrlr->cdev)
 		destroy_dev(ctrlr->cdev);
 
+	if (ctrlr->taskqueue)
+		taskqueue_free(ctrlr->taskqueue);
+
 	if (ctrlr->is_initialized) {
 		if (!gone) {
 			if (ctrlr->hmb_nchunks > 0)
@@ -1511,9 +1518,6 @@
 	if (!gone)
 		nvme_ctrlr_disable(ctrlr);
 
-	if (ctrlr->taskqueue)
-		taskqueue_free(ctrlr->taskqueue);
-
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 
Index: sys/dev/nvme/nvme_private.h
===================================================================
--- sys/dev/nvme/nvme_private.h
+++ sys/dev/nvme/nvme_private.h
@@ -151,7 +151,7 @@
 	TAILQ_ENTRY(nvme_tracker)	tailq;
 	struct nvme_request		*req;
 	struct nvme_qpair		*qpair;
-	struct callout			timer;
+	sbintime_t			deadline;
 	bus_dmamap_t			payload_dma_map;
 	uint16_t			cid;
 
@@ -159,6 +159,12 @@
 	bus_addr_t		prp_bus_addr;
 };
 
+enum nvme_recovery {
+	RECOVERY_NONE = 0,		/* Normal operations */
+	RECOVERY_START,			/* Deadline has passed, start recovering */
+	RECOVERY_RESET,			/* This pass, initiate reset of controller */
+	RECOVERY_WAITING,		/* waiting for the reset to complete */
+};
 struct nvme_qpair {
 	struct nvme_controller	*ctrlr;
 	uint32_t		id;
@@ -170,6 +176,12 @@
 	struct resource		*res;
 	void			*tag;
 
+	struct callout		timer;
+	sbintime_t		deadline;
+	bool			timer_armed;
+	enum nvme_recovery	recovery_state;
+	int			ticks;
+
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
 	uint32_t		sq_tdbl_off;
@@ -201,8 +213,6 @@
 
 	struct nvme_tracker	**act_tr;
 
-	bool			is_enabled;
-
 	struct mtx		lock __aligned(CACHE_LINE_SIZE);
 
 } __aligned(CACHE_LINE_SIZE);
@@ -305,6 +315,7 @@
 	uint32_t		notification_sent;
 
 	bool			is_failed;
+	bool			is_dying;
 	STAILQ_HEAD(, nvme_request)	fail_req;
 
 	/* Host Memory Buffer */
Index: sys/dev/nvme/nvme_qpair.c
===================================================================
--- sys/dev/nvme/nvme_qpair.c
+++ sys/dev/nvme/nvme_qpair.c
@@ -452,7 +452,6 @@
 	}
 
 	mtx_lock(&qpair->lock);
-	callout_stop(&tr->timer);
 
 	if (retry) {
 		req->retries++;
@@ -479,6 +478,8 @@
 		req = STAILQ_FIRST(&qpair->queued_req);
 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
 		_nvme_qpair_submit_request(qpair, req);
+	} else if (TAILQ_EMPTY(&qpair->outstanding_tr)) {
+		qpair->deadline = SBT_MAX;
 	}
 }
 
@@ -544,7 +545,7 @@
 	 * progress. Ignore the interrupt - any I/O that was associated with
 	 * this interrupt will get retried when the reset is complete.
 	 */
-	if (!qpair->is_enabled)
+	if (qpair->recovery_state != RECOVERY_NONE)
 		return (false);
 
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
@@ -746,6 +747,11 @@
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
+	callout_init(&qpair->timer, 1);
+	qpair->timer_armed = false;
+	qpair->deadline = SBT_MAX;
+	qpair->recovery_state = RECOVERY_NONE;
+
 	/*
 	 * Calcuate the stride of the doorbell register. Many emulators set this
 	 * value to correspond to a cache line. However, some hardware has set
@@ -783,7 +789,6 @@
 		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
 		bus_dmamap_create(qpair->dma_tag_payload, 0,
 		    &tr->payload_dma_map);
-		callout_init(&tr->timer, 1);
 		tr->cid = i;
 		tr->qpair = qpair;
 		tr->prp = (uint64_t *)prp_list;
@@ -813,6 +818,8 @@
 {
 	struct nvme_tracker	*tr;
 
+	callout_drain(&qpair->timer);
+
 	if (qpair->tag) {
 		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
 		qpair->tag = NULL;
@@ -892,65 +899,126 @@
 }
 
 static void
-nvme_abort_complete(void *arg, const struct nvme_completion *status)
+nvme_qpair_timeout(void *arg)
 {
-	struct nvme_tracker	*tr = arg;
-
-	/*
-	 * If cdw0 == 1, the controller was not able to abort the command
-	 * we requested. We still need to check the active tracker array,
-	 * to cover race where I/O timed out at same time controller was
-	 * completing the I/O.
-	 */
-	if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
-		/*
-		 * An I/O has timed out, and the controller was unable to
-		 * abort it for some reason. Construct a fake completion
-		 * status, and then complete the I/O's tracker manually.
-		 */
-		nvme_printf(tr->qpair->ctrlr,
-		    "abort command failed, aborting command manually\n");
-		nvme_qpair_manual_complete_tracker(tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
-	}
-}
-
-static void
-nvme_timeout(void *arg)
-{
-	struct nvme_tracker	*tr = arg;
-	struct nvme_qpair	*qpair = tr->qpair;
+	struct nvme_qpair	*qpair = arg;
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
+	struct nvme_tracker	*tr;
+	struct nvme_tracker	*tr_temp;
+	sbintime_t		now;
+	bool			idle;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-	/*
-	 * Read csts to get value of cfs - controller fatal status.
-	 * If no fatal status, try to call the completion routine, and
-	 * if completes transactions, report a missed interrupt and
-	 * return (this may need to be rate limited). Otherwise, if
-	 * aborts are enabled and the controller is not reporting
-	 * fatal status, abort the command. Otherwise, just reset the
-	 * controller and hope for the best.
-	 */
-	csts = nvme_mmio_read_4(ctrlr, csts);
-	cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
-	if (cfs == 0 && nvme_qpair_process_completions(qpair)) {
-		nvme_printf(ctrlr, "Missing interrupt\n");
-		return;
+	mtx_lock(&qpair->lock);
+	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+	if (idle && qpair->deadline != SBT_MAX) {
+		nvme_printf(ctrlr, "Had to reset deadline from %jd to MAX due to idle\n",
+		    qpair->deadline);
+		qpair->deadline = SBT_MAX;
 	}
-	if (ctrlr->enable_aborts && cfs == 0) {
-		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
-		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
-		    nvme_abort_complete, tr);
-	} else {
+again:
+	switch (qpair->recovery_state) {
+	case RECOVERY_NONE:
+		now = getsbinuptime();
+		if (!idle && now > qpair->deadline) {
+			qpair->deadline = SBT_MAX;
+			TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+				if (now > tr->deadline && tr->deadline != 0) {
+					/*
+					 * We're now passed our earliest
+					 * deadline. We need to do expensive
+					 * things to cope, but next time. Flag
+					 * that and close the door to any
+					 * further processing.
+					 */
+					qpair->recovery_state = RECOVERY_START;
+					nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd %jd\n",
+					    (uintmax_t)now, (uintmax_t)qpair->deadline,
+					    (uintmax_t)tr->deadline);
+					qpair->deadline = SBT_MAX;
+					break;
+				}
+				if (qpair->deadline > tr->deadline && tr->deadline != 0) {
+					/*
+					 * If we haven't started recovery, then
+					 * we need to adjust the current
+					 * outstanding deadlines. Since we
+					 * expect the trackers to be in order of
+					 * deadline we print when we have to
+					 * adjust the qpair deadline except when
+					 * it's SBT_MAX that we set it to above
+					 * which is expected for the first one.
+					 */
+					if (qpair->deadline != SBT_MAX)
+						nvme_printf(ctrlr, "Adjusting: %jd to %jd\n",
+						    (uintmax_t)qpair->deadline,
+						    (uintmax_t)tr->deadline);
+					qpair->deadline = tr->deadline;
+				}
+			}
+		}
+		break;
+	case RECOVERY_START:
+		/*
+		 * Read csts to get value of cfs - controller fatal status.
+		 * If no fatal status, try to call the completion routine, and
+		 * if completes transactions, report a missed interrupt and
+		 * return (this may need to be rate limited). Otherwise, if
+		 * aborts are enabled and the controller is not reporting
+		 * fatal status, abort the command. Otherwise, just reset the
+		 * controller and hope for the best.
+		 */
+		csts = nvme_mmio_read_4(ctrlr, csts);
+		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
+		if (cfs) {
+			nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			goto again;
+		}
+		mtx_unlock(&qpair->lock);
+		if (nvme_qpair_process_completions(qpair)) {
+			nvme_printf(ctrlr, "Missing interrupt\n");
+			qpair->recovery_state = RECOVERY_NONE;
+			// XXX do I need a NOP here to not race?
+		} else {
+			nvme_printf(ctrlr, "missed interrupt with nothing complete\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			mtx_lock(&qpair->lock);
+			goto again;
+		}
+		mtx_lock(&qpair->lock);
+		break;
+	case RECOVERY_RESET:
 		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
 		    (csts == NVME_GONE) ? " and possible hot unplug" :
 		    (cfs ? " and fatal error status" : ""));
+		nvme_printf(ctrlr, "RECOVERY_WAITING\n");
+		qpair->recovery_state = RECOVERY_WAITING;
 		nvme_ctrlr_reset(ctrlr);
+		break;
+	case RECOVERY_WAITING:
+		nvme_printf(ctrlr, "waiting\n");
+		break;
 	}
+
+	/*
+	 * Rearm the timeout.
+	 */
+	if (!idle) {
+		callout_reset_on(&qpair->timer, qpair->ticks,
+		    nvme_qpair_timeout, qpair, qpair->cpu);
+	} else {
+		qpair->timer_armed = false;
+		qpair->deadline = SBT_MAX;
+	}
+	mtx_unlock(&qpair->lock);
 }
 
+/*
+ * Submit the tracker to the hardware. Must already be in the
+ * outstanding queue when called.
+ */
 void
 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 {
@@ -967,12 +1035,18 @@
 
 	if (req->timeout) {
 		if (req->cb_fn == nvme_completion_poll_cb)
-			timeout = hz;
+			timeout = 1;
 		else
-			timeout = ctrlr->timeout_period * hz;
-		callout_reset_on(&tr->timer, timeout, nvme_timeout, tr,
-		    qpair->cpu);
+			timeout = ctrlr->timeout_period;
+		tr->deadline = getsbinuptime() + timeout * SBT_1S;
+		if (!qpair->timer_armed) {
+			qpair->ticks = hz / 2;
+			qpair->timer_armed = true;
+			callout_reset_on(&qpair->timer, qpair->ticks,
+			    nvme_qpair_timeout, qpair, qpair->cpu);
+		}
 	}
+	qpair->deadline = TAILQ_FIRST(&qpair->outstanding_tr)->deadline;
 
 	/* Copy the command from the tracker to the submission queue. */
 	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
@@ -1047,7 +1121,7 @@
 	tr = TAILQ_FIRST(&qpair->free_tr);
 	req->qpair = qpair;
 
-	if (tr == NULL || !qpair->is_enabled) {
+	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
 		 * an in-progress controller-level reset or controller
@@ -1076,6 +1150,8 @@
 
 	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
 	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
+	if (!qpair->timer_armed)
+		tr->deadline = SBT_MAX;
 	tr->req = req;
 
 	switch (req->type) {
@@ -1144,8 +1220,9 @@
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	mtx_assert(&qpair->lock, MA_OWNED);
 
-	qpair->is_enabled = true;
+	qpair->recovery_state = RECOVERY_NONE;
 }
 
 void
@@ -1188,7 +1265,9 @@
 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
 	}
 
+	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
+	mtx_unlock(&qpair->lock);
 }
 
 void
@@ -1231,12 +1310,9 @@
 static void
 nvme_qpair_disable(struct nvme_qpair *qpair)
 {
-	struct nvme_tracker *tr;
 
-	qpair->is_enabled = false;
 	mtx_lock(&qpair->lock);
-	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
-		callout_stop(&tr->timer);
+	qpair->recovery_state = RECOVERY_WAITING;
 	mtx_unlock(&qpair->lock);
 }
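For readers new to the approach, the sketch below is not part of the patch and not kernel code; it is a small userland C program with invented names (io_tracker, io_queue, queue_timeout_scan) that illustrates the idea behind the new nvme_qpair_timeout(): stamp each outstanding request with an absolute deadline and let one periodic scan per queue find the earliest one, rather than arming a separate callout per request. It is simplified in that it rescans the list on every call, whereas the patch caches a queue-wide deadline (qpair->deadline) so the common, non-expired case does no list walk.

/*
 * Illustrative sketch only.  One periodic scan over outstanding requests,
 * each stamped with an absolute deadline, replaces a per-request timer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEADLINE_MAX	INT64_MAX	/* stands in for SBT_MAX */
#define NREQ		4

enum recovery { RECOVERY_NONE, RECOVERY_START };

struct io_tracker {
	int64_t	deadline;	/* absolute time; 0 means "never times out" */
	bool	outstanding;
};

struct io_queue {
	struct io_tracker tr[NREQ];
	int64_t	deadline;	/* earliest deadline among outstanding requests */
	enum recovery state;
};

/* Runs periodically, like the per-qpair callout the patch arms. */
static void
queue_timeout_scan(struct io_queue *q, int64_t now)
{
	if (q->state != RECOVERY_NONE)
		return;		/* recovery already under way, leave it alone */

	q->deadline = DEADLINE_MAX;
	for (int i = 0; i < NREQ; i++) {
		struct io_tracker *t = &q->tr[i];

		if (!t->outstanding || t->deadline == 0)
			continue;
		if (now > t->deadline) {
			/* A request is overdue: switch to recovery. */
			q->state = RECOVERY_START;
			q->deadline = DEADLINE_MAX;
			printf("deadline %lld passed at %lld, starting recovery\n",
			    (long long)t->deadline, (long long)now);
			return;
		}
		if (t->deadline < q->deadline)
			q->deadline = t->deadline;	/* remember the earliest */
	}
}

int
main(void)
{
	struct io_queue q = { .deadline = DEADLINE_MAX, .state = RECOVERY_NONE };

	q.tr[0] = (struct io_tracker){ .deadline = 100, .outstanding = true };
	q.tr[1] = (struct io_tracker){ .deadline = 250, .outstanding = true };

	queue_timeout_scan(&q, 50);	/* nothing overdue; earliest becomes 100 */
	printf("earliest deadline now %lld\n", (long long)q.deadline);
	queue_timeout_scan(&q, 150);	/* request 0 is overdue; recovery starts */
	printf("state is %s\n",
	    q.state == RECOVERY_START ? "RECOVERY_START" : "RECOVERY_NONE");
	return (0);
}

The trade-off the patch makes is visible in the hunk that sets qpair->ticks = hz / 2: with one scan every half second, a timeout can be noticed up to roughly half a second late, which is negligible against timeout periods measured in seconds, and in exchange the hot I/O path no longer touches a callout per request.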