Index: sys/dev/nvme/nvme_ctrlr.c
===================================================================
--- sys/dev/nvme/nvme_ctrlr.c
+++ sys/dev/nvme/nvme_ctrlr.c
@@ -232,7 +232,8 @@
 	mtx_lock(&ctrlr->lock);
 	STAILQ_INSERT_TAIL(&ctrlr->fail_req, req, stailq);
 	mtx_unlock(&ctrlr->lock);
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->fail_req_task);
 }
 
 static void
@@ -433,7 +434,8 @@
 	 */
 		return;
 
-	taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
+	if (!ctrlr->is_dying)
+		taskqueue_enqueue(ctrlr->taskqueue, &ctrlr->reset_task);
 }
 
 static int
@@ -1464,6 +1466,8 @@
 {
 	int gone, i;
 
+	ctrlr->is_dying = true;
+
 	if (ctrlr->resource == NULL)
 		goto nores;
 
@@ -1483,6 +1487,9 @@
 	if (ctrlr->cdev)
 		destroy_dev(ctrlr->cdev);
 
+	if (ctrlr->taskqueue)
+		taskqueue_free(ctrlr->taskqueue);
+
 	if (ctrlr->is_initialized) {
 		if (!gone) {
 			if (ctrlr->hmb_nchunks > 0)
@@ -1511,9 +1518,6 @@
 	if (!gone)
 		nvme_ctrlr_disable(ctrlr);
 
-	if (ctrlr->taskqueue)
-		taskqueue_free(ctrlr->taskqueue);
-
 	if (ctrlr->tag)
 		bus_teardown_intr(ctrlr->dev, ctrlr->res, ctrlr->tag);
 
Index: sys/dev/nvme/nvme_private.h
===================================================================
--- sys/dev/nvme/nvme_private.h
+++ sys/dev/nvme/nvme_private.h
@@ -151,7 +151,7 @@
 	TAILQ_ENTRY(nvme_tracker)	tailq;
 	struct nvme_request		*req;
 	struct nvme_qpair		*qpair;
-	struct callout			timer;
+	sbintime_t			deadline;
 	bus_dmamap_t			payload_dma_map;
 	uint16_t			cid;
 
@@ -159,6 +159,12 @@
 	bus_addr_t		prp_bus_addr;
 };
 
+enum nvme_recovery {
+	RECOVERY_NONE = 0,		/* Normal operations */
+	RECOVERY_START,			/* Deadline has passed, start recovering */
+	RECOVERY_RESET,			/* This pass, initiate reset of controller */
+	RECOVERY_WAITING,		/* waiting for the reset to complete */
+};
 struct nvme_qpair {
 	struct nvme_controller	*ctrlr;
 	uint32_t		id;
@@ -170,6 +176,12 @@
 	struct resource		*res;
 	void			*tag;
 
+	struct callout		timer;
+	sbintime_t		deadline;
+	bool			timer_armed;
+	enum nvme_recovery	recovery_state;
+	int			ticks;
+
 	uint32_t		num_entries;
 	uint32_t		num_trackers;
 	uint32_t		sq_tdbl_off;
@@ -201,8 +213,6 @@
 
 	struct nvme_tracker	**act_tr;
 
-	bool			is_enabled;
-
 	struct mtx		lock __aligned(CACHE_LINE_SIZE);
 
 } __aligned(CACHE_LINE_SIZE);
@@ -305,6 +315,7 @@
 	uint32_t		notification_sent;
 
 	bool			is_failed;
+	bool			is_dying;
 	STAILQ_HEAD(, nvme_request)	fail_req;
 
 	/* Host Memory Buffer */
Index: sys/dev/nvme/nvme_qpair.c
===================================================================
--- sys/dev/nvme/nvme_qpair.c
+++ sys/dev/nvme/nvme_qpair.c
@@ -452,7 +452,6 @@
 	}
 
 	mtx_lock(&qpair->lock);
-	callout_stop(&tr->timer);
 
 	if (retry) {
 		req->retries++;
@@ -479,6 +478,8 @@
 		req = STAILQ_FIRST(&qpair->queued_req);
 		STAILQ_REMOVE_HEAD(&qpair->queued_req, stailq);
 		_nvme_qpair_submit_request(qpair, req);
+	} else if (TAILQ_EMPTY(&qpair->outstanding_tr)) {
+		qpair->deadline = SBT_MAX;
 	}
 }
 
@@ -544,7 +545,7 @@
 	 * progress. Ignore the interrupt - any I/O that was associated with
 	 * this interrupt will get retried when the reset is complete.
 	 */
-	if (!qpair->is_enabled)
+	if (qpair->recovery_state != RECOVERY_NONE)
 		return (false);
 
 	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map,
@@ -746,6 +747,11 @@
 	qpair->cpl_bus_addr = queuemem_phys + cmdsz;
 	prpmem_phys = queuemem_phys + cmdsz + cplsz;
 
+	callout_init(&qpair->timer, 1);
+	qpair->timer_armed = false;
+	qpair->deadline = SBT_MAX;
+	qpair->recovery_state = RECOVERY_NONE;
+
 	/*
 	 * Calcuate the stride of the doorbell register. Many emulators set this
 	 * value to correspond to a cache line. However, some hardware has set
@@ -783,7 +789,6 @@
 		    DOMAINSET_PREF(qpair->domain), M_ZERO | M_WAITOK);
 		bus_dmamap_create(qpair->dma_tag_payload, 0,
 		    &tr->payload_dma_map);
-		callout_init(&tr->timer, 1);
 		tr->cid = i;
 		tr->qpair = qpair;
 		tr->prp = (uint64_t *)prp_list;
@@ -813,6 +818,8 @@
 {
 	struct nvme_tracker	*tr;
 
+	callout_drain(&qpair->timer);
+
 	if (qpair->tag) {
 		bus_teardown_intr(qpair->ctrlr->dev, qpair->res, qpair->tag);
 		qpair->tag = NULL;
@@ -892,65 +899,126 @@
 }
 
 static void
-nvme_abort_complete(void *arg, const struct nvme_completion *status)
+nvme_qpair_timeout(void *arg)
 {
-	struct nvme_tracker	*tr = arg;
-
-	/*
-	 * If cdw0 == 1, the controller was not able to abort the command
-	 * we requested. We still need to check the active tracker array,
-	 * to cover race where I/O timed out at same time controller was
-	 * completing the I/O.
-	 */
-	if (status->cdw0 == 1 && tr->qpair->act_tr[tr->cid] != NULL) {
-		/*
-		 * An I/O has timed out, and the controller was unable to
-		 * abort it for some reason. Construct a fake completion
-		 * status, and then complete the I/O's tracker manually.
-		 */
-		nvme_printf(tr->qpair->ctrlr,
-		    "abort command failed, aborting command manually\n");
-		nvme_qpair_manual_complete_tracker(tr,
-		    NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
-	}
-}
-
-static void
-nvme_timeout(void *arg)
-{
-	struct nvme_tracker	*tr = arg;
-	struct nvme_qpair	*qpair = tr->qpair;
+	struct nvme_qpair	*qpair = arg;
 	struct nvme_controller	*ctrlr = qpair->ctrlr;
+	struct nvme_tracker	*tr;
+	struct nvme_tracker	*tr_temp;
+	sbintime_t		now;
+	bool			idle;
 	uint32_t		csts;
 	uint8_t			cfs;
 
-	/*
-	 * Read csts to get value of cfs - controller fatal status.
-	 * If no fatal status, try to call the completion routine, and
-	 * if completes transactions, report a missed interrupt and
-	 * return (this may need to be rate limited). Otherwise, if
-	 * aborts are enabled and the controller is not reporting
-	 * fatal status, abort the command. Otherwise, just reset the
-	 * controller and hope for the best.
-	 */
-	csts = nvme_mmio_read_4(ctrlr, csts);
-	cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
-	if (cfs == 0 && nvme_qpair_process_completions(qpair)) {
-		nvme_printf(ctrlr, "Missing interrupt\n");
-		return;
+	mtx_lock(&qpair->lock);
+	idle = TAILQ_EMPTY(&qpair->outstanding_tr);
+	if (idle && qpair->deadline != SBT_MAX) {
+		nvme_printf(ctrlr, "Had to reset deadline from %jd to MAX due to idle\n",
+		    qpair->deadline);
+		qpair->deadline = SBT_MAX;
 	}
-	if (ctrlr->enable_aborts && cfs == 0) {
-		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
-		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
-		    nvme_abort_complete, tr);
-	} else {
+again:
+	switch (qpair->recovery_state) {
+	case RECOVERY_NONE:
+		now = getsbinuptime();
+		if (!idle && now > qpair->deadline) {
+			qpair->deadline = SBT_MAX;
+			TAILQ_FOREACH_SAFE(tr, &qpair->outstanding_tr, tailq, tr_temp) {
+				if (now > tr->deadline && tr->deadline != 0) {
+					/*
+					 * We're now passed our earliest
+					 * deadline. We need to do expensive
+					 * things to cope, but next time. Flag
+					 * that and close the door to any
+					 * further processing.
+					 */
+					qpair->recovery_state = RECOVERY_START;
+					nvme_printf(ctrlr, "RECOVERY_START %jd vs %jd %jd\n",
+					    (uintmax_t)now, (uintmax_t)qpair->deadline,
+					    (uintmax_t)tr->deadline);
+					qpair->deadline = SBT_MAX;
+					break;
+				}
+				if (qpair->deadline > tr->deadline && tr->deadline != 0) {
+					/*
+					 * If we haven't started recovery, then
+					 * we need to adjust the current
+					 * outstanding deadlines. Since we
+					 * expect the trackers to be in order of
+					 * deadline we print when we have to
+					 * adjust the qpair deadline except when
+					 * it's SBT_MAX that we set it to above
+					 * which is expected for the first one.
+					 */
+					if (qpair->deadline != SBT_MAX)
+						nvme_printf(ctrlr, "Adjusting: %jd to %jd\n",
+						    (uintmax_t)qpair->deadline,
+						    (uintmax_t)tr->deadline);
+					qpair->deadline = tr->deadline;
+				}
+			}
+		}
+		break;
+	case RECOVERY_START:
+		/*
+		 * Read csts to get value of cfs - controller fatal status.
+		 * If no fatal status, try to call the completion routine, and
+		 * if completes transactions, report a missed interrupt and
+		 * return (this may need to be rate limited). Otherwise, if
+		 * aborts are enabled and the controller is not reporting
+		 * fatal status, abort the command. Otherwise, just reset the
+		 * controller and hope for the best.
+		 */
+		csts = nvme_mmio_read_4(ctrlr, csts);
+		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
+		if (cfs) {
+			nvme_printf(ctrlr, "Controller in fatal status, resetting\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			goto again;
+		}
+		mtx_unlock(&qpair->lock);
+		if (nvme_qpair_process_completions(qpair)) {
+			nvme_printf(ctrlr, "Missing interrupt\n");
+			qpair->recovery_state = RECOVERY_NONE;
+			// XXX do I need a NOP here to not race?
+		} else {
+			nvme_printf(ctrlr, "missed interrupt with nothing complete\n");
+			qpair->recovery_state = RECOVERY_RESET;
+			mtx_lock(&qpair->lock);
+			goto again;
+		}
+		mtx_lock(&qpair->lock);
+		break;
+	case RECOVERY_RESET:
 		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
 		    (csts == NVME_GONE) ? " and possible hot unplug" :
 		    (cfs ? " and fatal error status" : ""));
+		nvme_printf(ctrlr, "RECOVERY_WAITING\n");
+		qpair->recovery_state = RECOVERY_WAITING;
 		nvme_ctrlr_reset(ctrlr);
+		break;
+	case RECOVERY_WAITING:
+		nvme_printf(ctrlr, "waiting\n");
+		break;
 	}
+
+	/*
+	 * Rearm the timeout.
+	 */
+	if (!idle) {
+		callout_reset_on(&qpair->timer, qpair->ticks,
+		    nvme_qpair_timeout, qpair, qpair->cpu);
+	} else {
+		qpair->timer_armed = false;
+		qpair->deadline = SBT_MAX;
+	}
+	mtx_unlock(&qpair->lock);
 }
 
+/*
+ * Submit the tracker to the hardware. Must already be in the
+ * outstanding queue when called.
+ */
 void
 nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr)
 {
@@ -967,12 +1035,18 @@
 
 	if (req->timeout) {
 		if (req->cb_fn == nvme_completion_poll_cb)
-			timeout = hz;
+			timeout = 1;
 		else
-			timeout = ctrlr->timeout_period * hz;
-		callout_reset_on(&tr->timer, timeout, nvme_timeout, tr,
-		    qpair->cpu);
+			timeout = ctrlr->timeout_period;
+		tr->deadline = getsbinuptime() + timeout * SBT_1S;
+		if (!qpair->timer_armed) {
+			qpair->ticks = hz / 2;
+			qpair->timer_armed = true;
+			callout_reset_on(&qpair->timer, qpair->ticks,
+			    nvme_qpair_timeout, qpair, qpair->cpu);
+		}
 	}
+	qpair->deadline = TAILQ_FIRST(&qpair->outstanding_tr)->deadline;
 
 	/* Copy the command from the tracker to the submission queue. */
 	memcpy(&qpair->cmd[qpair->sq_tail], &req->cmd, sizeof(req->cmd));
@@ -1047,7 +1121,7 @@
 	tr = TAILQ_FIRST(&qpair->free_tr);
 	req->qpair = qpair;
 
-	if (tr == NULL || !qpair->is_enabled) {
+	if (tr == NULL || qpair->recovery_state != RECOVERY_NONE) {
 		/*
 		 * No tracker is available, or the qpair is disabled due to
 		 * an in-progress controller-level reset or controller
@@ -1076,6 +1150,8 @@
 
 	TAILQ_REMOVE(&qpair->free_tr, tr, tailq);
 	TAILQ_INSERT_TAIL(&qpair->outstanding_tr, tr, tailq);
+	if (!qpair->timer_armed)
+		tr->deadline = SBT_MAX;
 	tr->req = req;
 
 	switch (req->type) {
@@ -1144,8 +1220,9 @@
 static void
 nvme_qpair_enable(struct nvme_qpair *qpair)
 {
+	mtx_assert(&qpair->lock, MA_OWNED);
 
-	qpair->is_enabled = true;
+	qpair->recovery_state = RECOVERY_NONE;
 }
 
 void
@@ -1188,7 +1265,9 @@
 		    NVME_SC_ABORTED_BY_REQUEST, DO_NOT_RETRY, ERROR_PRINT_ALL);
 	}
 
+	mtx_lock(&qpair->lock);
 	nvme_qpair_enable(qpair);
+	mtx_unlock(&qpair->lock);
 }
 
 void
@@ -1231,12 +1310,9 @@
 static void
 nvme_qpair_disable(struct nvme_qpair *qpair)
 {
-	struct nvme_tracker *tr;
 
-	qpair->is_enabled = false;
 	mtx_lock(&qpair->lock);
-	TAILQ_FOREACH(tr, &qpair->outstanding_tr, tailq)
-		callout_stop(&tr->timer);
+	qpair->recovery_state = RECOVERY_WAITING;
 	mtx_unlock(&qpair->lock);
 }
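For readers new to the approach, the sketch below is not part of the patch and not kernel code; it is a small userland C program with invented names (io_tracker, io_queue, queue_timeout_scan) that illustrates the idea behind the new nvme_qpair_timeout(): stamp each outstanding request with an absolute deadline and let one periodic scan per queue find the earliest one, rather than arming a separate callout per request. It is simplified in that it rescans the list on every call, whereas the patch caches a queue-wide deadline (qpair->deadline) so the common, non-expired case does no list walk.

/*
 * Illustrative sketch only.  One periodic scan over outstanding requests,
 * each stamped with an absolute deadline, replaces a per-request timer.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define DEADLINE_MAX	INT64_MAX	/* stands in for SBT_MAX */
#define NREQ		4

enum recovery { RECOVERY_NONE, RECOVERY_START };

struct io_tracker {
	int64_t	deadline;	/* absolute time; 0 means "never times out" */
	bool	outstanding;
};

struct io_queue {
	struct io_tracker tr[NREQ];
	int64_t	deadline;	/* earliest deadline among outstanding requests */
	enum recovery state;
};

/* Runs periodically, like the per-qpair callout the patch arms. */
static void
queue_timeout_scan(struct io_queue *q, int64_t now)
{
	if (q->state != RECOVERY_NONE)
		return;		/* recovery already under way, leave it alone */

	q->deadline = DEADLINE_MAX;
	for (int i = 0; i < NREQ; i++) {
		struct io_tracker *t = &q->tr[i];

		if (!t->outstanding || t->deadline == 0)
			continue;
		if (now > t->deadline) {
			/* A request is overdue: switch to recovery. */
			q->state = RECOVERY_START;
			q->deadline = DEADLINE_MAX;
			printf("deadline %lld passed at %lld, starting recovery\n",
			    (long long)t->deadline, (long long)now);
			return;
		}
		if (t->deadline < q->deadline)
			q->deadline = t->deadline;	/* remember the earliest */
	}
}

int
main(void)
{
	struct io_queue q = { .deadline = DEADLINE_MAX, .state = RECOVERY_NONE };

	q.tr[0] = (struct io_tracker){ .deadline = 100, .outstanding = true };
	q.tr[1] = (struct io_tracker){ .deadline = 250, .outstanding = true };

	queue_timeout_scan(&q, 50);	/* nothing overdue; earliest becomes 100 */
	printf("earliest deadline now %lld\n", (long long)q.deadline);
	queue_timeout_scan(&q, 150);	/* request 0 is overdue; recovery starts */
	printf("state is %s\n",
	    q.state == RECOVERY_START ? "RECOVERY_START" : "RECOVERY_NONE");
	return (0);
}

The trade-off the patch makes is visible in the hunk that sets qpair->ticks = hz / 2: with one scan every half second, a timeout can be noticed up to roughly half a second late, which is negligible against timeout periods measured in seconds, and in exchange the hot I/O path no longer touches a callout per request.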