Differential D24284 Diff 70172 sys/dev/nvme/nvme_qpair.c

Changeset View

Standalone View

sys/dev/nvme/nvme_qpair.c

Show First 20 Lines • Show All 870 Lines • ▼ Show 20 Lines
void		void
nvme_io_qpair_destroy(struct nvme_qpair *qpair)		nvme_io_qpair_destroy(struct nvme_qpair *qpair)
{		{

nvme_qpair_destroy(qpair);		nvme_qpair_destroy(qpair);
}		}

static void		static void
		nvme_unwedge_complete(void *arg, const struct nvme_completion *status)
		{
		struct nvme_controller *ctrlr = arg;

		if (nvme_completion_is_error(status)) {
		impAuthorUnsubmitted Done Inline Actions Maybe I need a comment here justifying reset on error. The GET FEATURES command is a can't fail sort of thing for a mandatory feature like arbitration. imp: Maybe I need a comment here justifying reset on error. The GET FEATURES command is a can't fail…
		nvme_printf(ctrlr, "Unwedge command failed, resetting.\n");
		nvme_ctrlr_reset(ctrlr);
		}
		}


		static void
nvme_abort_complete(void *arg, const struct nvme_completion *status)		nvme_abort_complete(void *arg, const struct nvme_completion *status)
{		{
struct nvme_tracker *tr = arg;		struct nvme_tracker *tr = arg;

/*		/*
* If cdw0 == 1, the controller was not able to abort the command		* If cdw0 == 1, the controller was not able to abort the command
* we requested. We still need to check the active tracker array,		* we requested. We still need to check the active tracker array,
* to cover race where I/O timed out at same time controller was		* to cover race where I/O timed out at same time controller was
Show All 11 Lines	nvme_qpair_manual_complete_tracker(tr,
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);		NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL);
}		}
}		}

static void		static void
nvme_timeout(void *arg)		nvme_timeout(void *arg)
{		{
struct nvme_tracker *tr = arg;		struct nvme_tracker *tr = arg;
		struct nvme_request *req = tr->req;
		cpercivaUnsubmitted Not Done Inline Actions If nvme_qpair_complete_tracker lost a race, we could enter nvme_timeout with tr->req = NULL, resulting in a panic when we dereference req. For that matter, with sufficient racing it's possible that tr->req is non-NULL and refers to a completely different request to the one which triggered the timeout. cperciva: If nvme_qpair_complete_tracker lost a race, we could enter nvme_timeout with tr->req = NULL…
		impAuthorUnsubmitted Done Inline Actions Wouldn't cancelling the timeout be sufficient, which is done as part of the tracking tear down during completion? imp: Wouldn't cancelling the timeout be sufficient, which is done as part of the tracking tear down…
struct nvme_qpair *qpair = tr->qpair;		struct nvme_qpair *qpair = tr->qpair;
struct nvme_controller *ctrlr = qpair->ctrlr;		struct nvme_controller *ctrlr = qpair->ctrlr;
uint32_t csts;		uint32_t csts;
uint8_t cfs;		uint8_t cfs;

/*
* Read csts to get value of cfs - controller fatal status.
* If no fatal status, try to call the completion routine, and
* if completes transactions, report a missed interrupt and
* return (this may need to be rate limited). Otherwise, if
* aborts are enabled and the controller is not reporting
* fatal status, abort the command. Otherwise, just reset the
* controller and hope for the best.
*/
csts = nvme_mmio_read_4(ctrlr, csts);		csts = nvme_mmio_read_4(ctrlr, csts);
cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;		cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK;
if (cfs == 0 && nvme_qpair_process_completions(qpair)) {		/*
nvme_printf(ctrlr, "Missing interrupt\n");		* If this command has previously timed out, then we've sent the
return;		* unwedge command already and it didn't work for some reason.
}		*
if (ctrlr->enable_aborts && cfs == 0) {		* If the cfs isn't 0, then the controller status is reporting
		* as fatal, which means no good can come from waiting further.
		*
		* In all these cases just reset and return (unless aborts
		* are enabled for a time out, in which case send an abort).
		*
		* Note: if the command times out, the drive might legitimately just be
		* slow. We don't try to do any adaptive things to increase the timeout
		* for slow drives. They should have their timeout bumped by the system
		* administrator.
		*/
		if (req->timed_out || cfs != 0) {
		if (cfs == 0 && req->timed_out && ctrlr->enable_aborts) {
nvme_printf(ctrlr, "Aborting command due to a timeout.\n");		nvme_printf(ctrlr, "Aborting command due to a timeout.\n");
nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,		nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id,
nvme_abort_complete, tr);		nvme_abort_complete, tr);
} else {		return;
		}
nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",		nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n",
(csts == 0xffffffff) ? " and possible hot unplug" :		(csts == 0xffffffff) ? " and possible hot unplug" :
(cfs ? " and fatal error status" : ""));		(cfs ? " and fatal error status" : ""));
nvme_ctrlr_reset(ctrlr);		nvme_ctrlr_reset(ctrlr);
		return;
}		}

		/*
		* The controller hasn't failed. If this is an unwedge that timed out,
		* though, then we tried to send a trivial command to the card and it
		* timed out as well. All we can do is reset the controller and hope for
		* the best. This gives us better diagnostics, though, before we hit the
		* big red panic button.
		*/
		if (req->cb_fn == nvme_unwedge_complete) {
		nvme_printf(ctrlr, "Unwedge attempted timed out, resetting.\n");
		nvme_ctrlr_reset(ctrlr);
		return;
		}

		/*
		* Next, we could try to send an abort and have it fail to send at all,
		* so if that happens, we reset as well. Note, even though we send the
		* command, if we have no trackers available for the request, then it
		* gets queued and may be delayed. It isn't 100% that we can't send the
		* unwedge command itself, but since it's to the administrative queue,
		* not the qpair of the original request, that's likely the result of a
		* crapton of these commands being sent.
		*/
		if (req->cb_fn == nvme_abort_complete) {
		nvme_printf(ctrlr, "Command abort timed out, resetting.\n");
		nvme_ctrlr_reset(ctrlr);
		return;
		}

		/*
		* OK. If we get this far, this is the first time into the timeout and
		* it's nothing otherwise special. Note that this request has timed out
		* and reset the timeout to the recovery timeout value. Then send a
		* boring unwedge command to see if that gets things going again.
		* The above code detects when the unwedge code has failed.
		*/
		ctrlr->unwedges++;
		req->timed_out = true;
		callout_reset_on(&tr->timer, 5 * hz, nvme_timeout, tr, qpair->cpu);
		nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_ARBITRATION, 0,
		NULL, 0, nvme_unwedge_complete, ctrlr);
}		}

void		void
nvme_qpair_submit_tracker(struct nvme_qpair qpair, struct nvme_tracker tr)		nvme_qpair_submit_tracker(struct nvme_qpair qpair, struct nvme_tracker tr)
{		{
struct nvme_request *req;		struct nvme_request *req;
struct nvme_controller *ctrlr;		struct nvme_controller *ctrlr;

▲ Show 20 Lines • Show All 337 Lines • Show Last 20 Lines