Index: sys/dev/nvme/nvme_qpair.c
===================================================================
--- sys/dev/nvme/nvme_qpair.c
+++ sys/dev/nvme/nvme_qpair.c
@@ -31,6 +31,8 @@
 #include
 #include
+#include
+#include
 #include
@@ -308,7 +310,7 @@
 }
 
 static void
-nvme_qpair_print_completion(struct nvme_qpair *qpair, 
+nvme_qpair_print_completion(struct nvme_qpair *qpair,
     struct nvme_completion *cpl)
 {
 	uint16_t sct, sc;
@@ -479,18 +481,30 @@
 	struct nvme_tracker	*tr;
 	struct nvme_completion	cpl;
 	int done = 0;
+	bool in_panic = dumping || SCHEDULER_STOPPED();
 
 	qpair->num_intr_handler_calls++;
 
+	/*
+	 * qpair is not enabled, likely because a controller reset is in
+	 * progress. Ignore the interrupt - any I/O that was associated with
+	 * this interrupt will get retried when the reset is complete.
+	 */
 	if (!qpair->is_enabled)
-		/*
-		 * qpair is not enabled, likely because a controller reset is
-		 * is in progress. Ignore the interrupt - any I/O that was
-		 * associated with this interrupt will get retried when the
-		 * reset is complete.
-		 */
 		return (false);
 
+	/*
+	 * A panic can stop the CPU this routine is running on at any point. If
+	 * we're called during a panic, complete the cq_head wrap protocol for
+	 * the case where we are interrupted just after the increment at 1
+	 * below, but before we can reset cq_head to zero at 2.
+	 */
+	if (__predict_false(in_panic) && qpair->cq_head == qpair->num_entries) {
+		qpair->cq_head = 0;
+		qpair->phase = !qpair->phase;
+	}
+
+	bus_dmamap_sync(qpair->dma_tag, qpair->queuemem_map, BUS_DMASYNC_POSTREAD | BUS_DMASYNC_POSTWRITE);
 
 	while (1) {
@@ -499,6 +513,14 @@
 		/* Convert to host endian */
 		nvme_completion_swapbytes(&cpl);
 
+		/*
+		 * If we're polling for completions due to a panic, there is
+		 * one hole: if the thread the CPU stopped on executed 2 below,
+		 * but not 3, then we'll have the wrong phase and stop.
+		 * However, if it did execute 3 and there aren't actually any
+		 * new completions, we'll also have the wrong phase and stop.
+		 * Since it's hard to guard against this case, we just document it.
+		 */
 		if (NVME_STATUS_GET_P(cpl.status) != qpair->phase)
 			break;
 
@@ -508,17 +530,33 @@
 			nvme_qpair_complete_tracker(qpair, tr, &cpl, ERROR_PRINT_ALL);
 			qpair->sq_head = cpl.sqhd;
 			done++;
-		} else {
-			nvme_printf(qpair->ctrlr,
+		} else if (!in_panic) {
+			/*
+			 * A missing tracker is normally an error. However, a
+			 * panic can stop the CPU this routine is running on
+			 * after completing an I/O but before updating
+			 * qpair->cq_head at 1 below. Later, we re-enter this
+			 * routine to poll I/O associated with the kernel
+			 * dump. Don't panic in that case: we need the kernel
+			 * dump more.
+			 */
+			nvme_printf(qpair->ctrlr,
 			    "cpl does not map to outstanding cmd\n");
 			/* nvme_dump_completion expects device endianess */
 			nvme_dump_completion(&qpair->cpl[qpair->cq_head]);
 			KASSERT(0, ("received completion for unknown cmd\n"));
 		}
 
-		if (++qpair->cq_head == qpair->num_entries) {
-			qpair->cq_head = 0;
-			qpair->phase = !qpair->phase;
+		/*
+		 * There are a number of races with the following (see above)
+		 * when the system panics. They might be solved with atomics,
+		 * but that would slow down normal completions too much just
+		 * to make kernel panics more reliable, which is a poor
+		 * tradeoff for this driver.
+		 */
+		if (++qpair->cq_head == qpair->num_entries) {	/* 1 */
+			qpair->cq_head = 0;			/* 2 */
+			qpair->phase = !qpair->phase;		/* 3 */
 		}
 
 		nvme_mmio_write_4(qpair->ctrlr, doorbell[qpair->id].cq_hdbl,
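
For readers not steeped in NVMe, here is a minimal userspace sketch of the completion-queue phase-bit wrap protocol that the numbered steps 1/2/3 in the patch refer to. It is not driver code: the names (cq_model, cq_post, cq_drain) and the queue size are invented for illustration, and it models only the head/phase bookkeeping, not DMA syncing, doorbells, or trackers.

/*
 * Hypothetical standalone model of the CQ phase-bit protocol.
 * Build with: cc -std=c99 -Wall cq_phase_demo.c
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define CQ_ENTRIES	4

struct cq_model {
	uint8_t	phase[CQ_ENTRIES];	/* phase bit the "controller" wrote */
	int	cq_head;		/* next entry the host will inspect */
	bool	host_phase;		/* phase the host expects for new work */
	int	ctrlr_tail;		/* next slot the controller writes */
	bool	ctrlr_phase;		/* phase the controller writes */
};

/* Controller side: post one completion, flipping its phase on wrap. */
static void
cq_post(struct cq_model *cq)
{
	cq->phase[cq->ctrlr_tail] = cq->ctrlr_phase;
	if (++cq->ctrlr_tail == CQ_ENTRIES) {
		cq->ctrlr_tail = 0;
		cq->ctrlr_phase = !cq->ctrlr_phase;
	}
}

/* Host side: consume completions using the same wrap steps as the patch. */
static int
cq_drain(struct cq_model *cq)
{
	int done = 0;

	while (cq->phase[cq->cq_head] == cq->host_phase) {
		done++;
		if (++cq->cq_head == CQ_ENTRIES) {	  /* 1 */
			cq->cq_head = 0;		  /* 2 */
			cq->host_phase = !cq->host_phase; /* 3 */
		}
	}
	return (done);
}

int
main(void)
{
	struct cq_model cq = { .host_phase = true, .ctrlr_phase = true };
	int i;

	/* First pass: controller fills the queue, host drains it and wraps. */
	for (i = 0; i < CQ_ENTRIES; i++)
		cq_post(&cq);
	printf("first pass drained %d completions\n", cq_drain(&cq));

	/*
	 * Second pass: the controller writes the opposite phase bit, so the
	 * drain below only sees the two new completions because step 3
	 * flipped host_phase on the wrap. Had a panic stopped a CPU between
	 * steps 2 and 3, host_phase would be stale, the phase check would
	 * fail on the new entries, and the poller would stop early - which
	 * looks exactly like "no new completions", the hole the patch
	 * comments document.
	 */
	for (i = 0; i < 2; i++)
		cq_post(&cq);
	printf("second pass drained %d completions\n", cq_drain(&cq));
	return (0);
}

Running this prints 4 and then 2 drained completions; the second drain terminates correctly only because step 3 flipped host_phase after the wrap, which is the invariant the panic-path fixup above the bus_dmamap_sync call restores.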