Index: head/sys/dev/nvme/nvme_qpair.c =================================================================== --- head/sys/dev/nvme/nvme_qpair.c +++ head/sys/dev/nvme/nvme_qpair.c @@ -31,6 +31,8 @@ #include #include +#include +#include #include @@ -308,7 +310,7 @@ } static void -nvme_qpair_print_completion(struct nvme_qpair *qpair, +nvme_qpair_print_completion(struct nvme_qpair *qpair, struct nvme_completion *cpl) { uint16_t sct, sc; @@ -479,18 +481,51 @@ struct nvme_tracker *tr; struct nvme_completion cpl; int done = 0; + bool in_panic = dumping || SCHEDULER_STOPPED(); qpair->num_intr_handler_calls++; + /* + * qpair is not enabled, likely because a controller reset is in + * progress. Ignore the interrupt - any I/O that was associated with + * this interrupt will get retried when the reset is complete. + */ if (!qpair->is_enabled) - /* - * qpair is not enabled, likely because a controller reset is - * is in progress. Ignore the interrupt - any I/O that was - * associated with this interrupt will get retried when the - * reset is complete. - */ return (false); + /* + * A panic can stop the CPU this routine is running on at any point. If + * we're called during a panic, complete the cq_head wrap protocol for + * the case where we are interrupted just after the increment at 1 + * below, but before we can reset cq_head to zero at 2. Also cope with + * the case where we do the zero at 2, but may or may not have done the + * phase adjustment at step 3. The panic machinery flushes all pending + * memory writes, so we can make these strong ordering assumptions + * that would otherwise be unwise if we were racing in real time. + */ + if (__predict_false(in_panic)) { + if (qpair->cq_head == qpair->num_entries) { + /* + * Here we know that we need to zero cq_head and then negate + * the phase, which hasn't been assigned if cq_head isn't + * zero due to the atomic_store_rel. 
+ */ + qpair->cq_head = 0; + qpair->phase = !qpair->phase; + } else if (qpair->cq_head == 0) { + /* + * In this case, we know that the assignment at 2 + * happened below, but we don't know if step 3 happened or + * not. To determine this, we look at the last completion + * entry and set the phase to the opposite phase + * that it has. This gets us back in sync. + */ + cpl = qpair->cpl[qpair->num_entries - 1]; + nvme_completion_swapbytes(&cpl); + qpair->phase = !NVME_STATUS_GET_P(cpl.status); + } + } + bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE); while (1) { @@ -508,17 +543,35 @@ nvme_qpair_complete_tracker(qpair, tr, &cpl, ERROR_PRINT_ALL); qpair->sq_head = cpl.sqhd; done++; - } else { - nvme_printf(qpair->ctrlr, + } else if (!in_panic) { + /* + * A missing tracker is normally an error. However, a + * panic can stop the CPU this routine is running on + * after completing an I/O but before updating + * qpair->cq_head at 1 below. Later, we re-enter this + * routine to poll I/O associated with the kernel + * dump. We find that the tr has been set to null before + * calling the completion routine. If it hasn't + * completed (or it triggers a panic), then '1' below + * won't have updated cq_head. Rather than panic again, + * ignore this condition because it's not unexpected. + */ + nvme_printf(qpair->ctrlr, "cpl does not map to outstanding cmd\n"); /* nvme_dump_completion expects device endianess */ nvme_dump_completion(&qpair->cpl[qpair->cq_head]); - KASSERT(0, ("received completion for unknown cmd\n")); + KASSERT(0, ("received completion for unknown cmd")); } - if (++qpair->cq_head == qpair->num_entries) { - qpair->cq_head = 0; - qpair->phase = !qpair->phase; + /* + * There are a number of races with the following (see above) when + * the system panics. We compensate for each one of them by + * using the atomic store to force strong ordering (at least when + * viewed in the aftermath of a panic). 
+ */ + if (++qpair->cq_head == qpair->num_entries) { /* 1 */ + atomic_store_rel_int(&qpair->cq_head, 0); /* 2 */ + qpair->phase = !qpair->phase; /* 3 */ } nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl,