Changeset View
Changeset View
Standalone View
Standalone View
sys/dev/nvme/nvme_qpair.c
Show First 20 Lines • Show All 870 Lines • ▼ Show 20 Lines | |||||
void | void | ||||
nvme_io_qpair_destroy(struct nvme_qpair *qpair) | nvme_io_qpair_destroy(struct nvme_qpair *qpair) | ||||
{ | { | ||||
nvme_qpair_destroy(qpair); | nvme_qpair_destroy(qpair); | ||||
} | } | ||||
static void | static void | ||||
nvme_unwedge_complete(void *arg, const struct nvme_completion *status) | |||||
{ | |||||
struct nvme_controller *ctrlr = arg; | |||||
if (nvme_completion_is_error(status)) { | |||||
imp: Maybe I need a comment here justifying reset on error. The GET FEATURES command is a can't fail… | |||||
nvme_printf(ctrlr, "Unwedge command failed, resetting.\n"); | |||||
nvme_ctrlr_reset(ctrlr); | |||||
} | |||||
} | |||||
static void | |||||
nvme_abort_complete(void *arg, const struct nvme_completion *status) | nvme_abort_complete(void *arg, const struct nvme_completion *status) | ||||
{ | { | ||||
struct nvme_tracker *tr = arg; | struct nvme_tracker *tr = arg; | ||||
/* | /* | ||||
* If cdw0 == 1, the controller was not able to abort the command | * If cdw0 == 1, the controller was not able to abort the command | ||||
* we requested. We still need to check the active tracker array, | * we requested. We still need to check the active tracker array, | ||||
* to cover race where I/O timed out at same time controller was | * to cover race where I/O timed out at same time controller was | ||||
Show All 11 Lines | nvme_qpair_manual_complete_tracker(tr, | ||||
NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); | NVME_SCT_GENERIC, NVME_SC_ABORTED_BY_REQUEST, 0, ERROR_PRINT_ALL); | ||||
} | } | ||||
} | } | ||||
static void | static void | ||||
nvme_timeout(void *arg) | nvme_timeout(void *arg) | ||||
{ | { | ||||
struct nvme_tracker *tr = arg; | struct nvme_tracker *tr = arg; | ||||
struct nvme_request *req = tr->req; | |||||
cpercivaUnsubmitted Not Done Inline ActionsIf nvme_qpair_complete_tracker lost a race, we could enter nvme_timeout with tr->req = NULL, resulting in a panic when we dereference req. For that matter, with sufficient racing it's possible that tr->req is non-NULL and refers to a *completely different request* to the one which triggered the timeout. cperciva: If nvme_qpair_complete_tracker lost a race, we could enter nvme_timeout with tr->req = NULL… | |||||
impAuthorUnsubmitted Done Inline ActionsWouldn't cancelling the timeout be sufficient, which is done as part of the tracking tear down during completion? imp: Wouldn't cancelling the timeout be sufficient, which is done as part of the tracking tear down… | |||||
struct nvme_qpair *qpair = tr->qpair; | struct nvme_qpair *qpair = tr->qpair; | ||||
struct nvme_controller *ctrlr = qpair->ctrlr; | struct nvme_controller *ctrlr = qpair->ctrlr; | ||||
uint32_t csts; | uint32_t csts; | ||||
uint8_t cfs; | uint8_t cfs; | ||||
/* | |||||
* Read csts to get value of cfs - controller fatal status. | |||||
* If no fatal status, try to call the completion routine, and | |||||
* if completes transactions, report a missed interrupt and | |||||
* return (this may need to be rate limited). Otherwise, if | |||||
* aborts are enabled and the controller is not reporting | |||||
* fatal status, abort the command. Otherwise, just reset the | |||||
* controller and hope for the best. | |||||
*/ | |||||
csts = nvme_mmio_read_4(ctrlr, csts); | csts = nvme_mmio_read_4(ctrlr, csts); | ||||
cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; | cfs = (csts >> NVME_CSTS_REG_CFS_SHIFT) & NVME_CSTS_REG_CFS_MASK; | ||||
if (cfs == 0 && nvme_qpair_process_completions(qpair)) { | /* | ||||
nvme_printf(ctrlr, "Missing interrupt\n"); | * If this command has previously timed out, then we've sent the | ||||
return; | * unwedge command already and it didn't work for some reason. | ||||
} | * | ||||
if (ctrlr->enable_aborts && cfs == 0) { | * If the cfs isn't 0, then the controller status is reporting | ||||
* as fatal, which means no good can come from waiting further. | |||||
* | |||||
* In all these cases just reset and return (unless aborts | |||||
* are enabled for a time out, in which case send an abort). | |||||
* | |||||
* Note: if the command times out, the drive might legitimately just be | |||||
* slow. We don't try to do any adaptive things to increase the timeout | |||||
* for slow drives. They should have their timeout bumped by the system | |||||
* administrator. | |||||
*/ | |||||
if (req->timed_out || cfs != 0) { | |||||
if (cfs == 0 && req->timed_out && ctrlr->enable_aborts) { | |||||
nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); | nvme_printf(ctrlr, "Aborting command due to a timeout.\n"); | ||||
nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, | nvme_ctrlr_cmd_abort(ctrlr, tr->cid, qpair->id, | ||||
nvme_abort_complete, tr); | nvme_abort_complete, tr); | ||||
} else { | return; | ||||
} | |||||
nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", | nvme_printf(ctrlr, "Resetting controller due to a timeout%s.\n", | ||||
(csts == 0xffffffff) ? " and possible hot unplug" : | (csts == 0xffffffff) ? " and possible hot unplug" : | ||||
(cfs ? " and fatal error status" : "")); | (cfs ? " and fatal error status" : "")); | ||||
nvme_ctrlr_reset(ctrlr); | nvme_ctrlr_reset(ctrlr); | ||||
return; | |||||
} | } | ||||
/* | |||||
* The controller hasn't failed. If this is an unwedge that timed out, | |||||
* though, then we tried to send a trivial command to the card and it | |||||
* timed out as well. All we can do is reset the controller and hope for | |||||
* the best. This gives us better diagnostics, though, before we hit the | |||||
* big red panic button. | |||||
*/ | |||||
if (req->cb_fn == nvme_unwedge_complete) { | |||||
nvme_printf(ctrlr, "Unwedge attempted timed out, resetting.\n"); | |||||
nvme_ctrlr_reset(ctrlr); | |||||
return; | |||||
} | |||||
/* | |||||
* Next, we could try to send an abort and have it fail to send at all, | |||||
* so if that happens, we reset as well. Note, even though we send the | |||||
* command, if we have no trackers available for the request, then it | |||||
* gets queued and may be delayed. It isn't 100% that we can't send the | |||||
* unwedge command itself, but since it's to the administrative queue, | |||||
* not the qpair of the original request, that's likely the result of a | |||||
* crapton of these commands being sent. | |||||
*/ | |||||
if (req->cb_fn == nvme_abort_complete) { | |||||
nvme_printf(ctrlr, "Command abort timed out, resetting.\n"); | |||||
nvme_ctrlr_reset(ctrlr); | |||||
return; | |||||
} | |||||
/* | |||||
* OK. If we get this far, this is the first time into the timeout and | |||||
* it's nothing otherwise special. Note that this request has timed out | |||||
* and reset the timeout to the recovery timeout value. Then send a | |||||
* boring unwedge command to see if that gets things going again. | |||||
* The above code detects when the unwedge code has failed. | |||||
*/ | |||||
ctrlr->unwedges++; | |||||
req->timed_out = true; | |||||
callout_reset_on(&tr->timer, 5 * hz, nvme_timeout, tr, qpair->cpu); | |||||
nvme_ctrlr_cmd_get_feature(ctrlr, NVME_FEAT_ARBITRATION, 0, | |||||
NULL, 0, nvme_unwedge_complete, ctrlr); | |||||
} | } | ||||
void | void | ||||
nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) | nvme_qpair_submit_tracker(struct nvme_qpair *qpair, struct nvme_tracker *tr) | ||||
{ | { | ||||
struct nvme_request *req; | struct nvme_request *req; | ||||
struct nvme_controller *ctrlr; | struct nvme_controller *ctrlr; | ||||
▲ Show 20 Lines • Show All 337 Lines • Show Last 20 Lines |
Maybe I need a comment here justifying reset on error. The GET FEATURES command is a can't fail sort of thing for a mandatory feature like arbitration.