Page MenuHomeFreeBSD

D46026.id141089.diff
No OneTemporary

D46026.id141089.diff

diff --git a/sys/dev/nvme/nvme_qpair.c b/sys/dev/nvme/nvme_qpair.c
--- a/sys/dev/nvme/nvme_qpair.c
+++ b/sys/dev/nvme/nvme_qpair.c
@@ -1037,8 +1037,8 @@
struct nvme_controller *ctrlr = qpair->ctrlr;
struct nvme_tracker *tr;
sbintime_t now;
- bool idle = false;
- bool needs_reset;
+ bool idle = true;
+ bool fast;
uint32_t csts;
uint8_t cfs;
@@ -1084,23 +1084,69 @@
*/
csts = nvme_mmio_read_4(ctrlr, csts);
cfs = NVMEV(NVME_CSTS_REG_CFS, csts);
- if (csts == NVME_GONE || cfs == 1)
- goto do_reset;
+ if (csts == NVME_GONE || cfs == 1) {
+ /*
+ * We've had a command timeout that we weren't able to
+ * abort or we have aborts disabled and any command
+ * timed out.
+ *
+ * If we get here due to a possible surprise hot-unplug
+ * event, then we let nvme_ctrlr_reset confirm and fail
+ * the controller.
+ */
+do_reset:
+ nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
+ (csts == 0xffffffff) ? " and possible hot unplug" :
+ (cfs ? " and fatal error status" : ""));
+ qpair->recovery_state = RECOVERY_WAITING;
+ nvme_ctrlr_reset(ctrlr);
+ idle = false;
+ break;
+ }
+
/*
- * Process completions. We already have the recovery lock, so
- * call the locked version.
+ * See if there's any recovery needed. First, do a fast check to
+ * see if anything could have timed out. If not, then skip
+ * everything else.
+ */
+ fast = false;
+ mtx_lock(&qpair->lock);
+ TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
+ /*
+ * Skip async commands, they are posted to the card for
+ * an indefinite amount of time and have no deadline.
+ */
+ if (tr->deadline == SBT_MAX)
+ continue;
+
+ /*
+ * If the first real transaction is not in timeout, then
+ * we're done. Otherwise, we try recovery.
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ fast = true;
+ break;
+ }
+ mtx_unlock(&qpair->lock);
+ if (idle || fast)
+ break;
+
+ /*
+ * There's a stale transaction at the start of the queue that's
+ * at least 1/2s old. Call the completion ISR to catch 'missed'
+ * interrupts.
*/
_nvme_qpair_process_completions(qpair);
/*
- * Check to see if we need to timeout any commands. If we do, then
- * we also enter a recovery phase.
+ * Now that we've run the ISR, re-rheck to see if there's any
+ * timed out commands and abort them or reset the card if so.
*/
now = getsbinuptime();
- needs_reset = false;
- idle = true;
mtx_lock(&qpair->lock);
+ idle = true;
TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq) {
/*
* Skip async commands, they are posted to the card for
@@ -1108,48 +1154,44 @@
*/
if (tr->deadline == SBT_MAX)
continue;
- if (now > tr->deadline) {
- if (tr->req->cb_fn != nvme_abort_complete &&
- ctrlr->enable_aborts) {
- /*
- * This isn't an abort command, ask
- * for a hardware abort.
- */
- nvme_ctrlr_cmd_abort(ctrlr, tr->cid,
- qpair->id, nvme_abort_complete, tr);
- } else {
- /*
- * Otherwise we have a live command in
- * the card (either one we couldn't
- * abort, or aborts weren't enabled).
- * The only safe way to proceed is to do
- * a reset.
- */
- needs_reset = true;
- }
+
+ /*
+ * If we know this tracker hasn't timed out, we also
+ * know all subsequent ones haven't timed out. The tr
+ * queue is in submission order and all normal commands
+ * in a queue have the same timeout (or the timeout was
+ * changed by the user, but we eventually timeout then).
+ */
+ idle = false;
+ if (now <= tr->deadline)
+ break;
+
+ /*
+ * Timeout expired, abort it or reset controller.
+ */
+ if (ctrlr->enable_aborts &&
+ tr->req->cb_fn != nvme_abort_complete) {
+ /*
+ * This isn't an abort command, ask for a
+ * hardware abort. This goes to the admin
+ * queue which will reset the card if it
+ * times out.
+ */
+ nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
+ nvme_abort_complete, tr);
} else {
- idle = false;
+ /*
+ * We have a live command in the card (either
+ * one we couldn't abort, or aborts weren't
+ * enabled). We can only reset.
+ */
+ mtx_unlock(&qpair->lock);
+ goto do_resest;
}
}
mtx_unlock(&qpair->lock);
- if (!needs_reset)
- break;
-
- /*
- * We've had a command timeout that we weren't able to abort
- *
- * If we get here due to a possible surprise hot-unplug event,
- * then we let nvme_ctrlr_reset confirm and fail the
- * controller.
- */
- do_reset:
- nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
- (csts == 0xffffffff) ? " and possible hot unplug" :
- (cfs ? " and fatal error status" : ""));
- qpair->recovery_state = RECOVERY_WAITING;
- nvme_ctrlr_reset(ctrlr);
- idle = false; /* We want to keep polling */
break;
+
case RECOVERY_WAITING:
/*
* These messages aren't interesting while we're suspended. We

File Metadata

Mime Type
text/plain
Expires
Tue, Feb 3, 5:34 AM (10 h, 51 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28421397
Default Alt Text
D46026.id141089.diff (4 KB)

Event Timeline