Index: head/sys/dev/bnxt/bnxt_txrx.c
===================================================================
--- head/sys/dev/bnxt/bnxt_txrx.c
+++ head/sys/dev/bnxt/bnxt_txrx.c
@@ -316,10 +316,9 @@
 	if (softc->rx_cp_rings[rxqid].cons != UINT32_MAX)
 		BNXT_CP_IDX_DISABLE_DB(&softc->rx_cp_rings[rxqid].ring,
 		    softc->rx_cp_rings[rxqid].cons);
-	/* We're given the last filled RX buffer here, not the next empty one */
-	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
+	BNXT_RX_DB(rx_ring, pidx);
 	/* TODO: Cumulus+ doesn't need the double doorbell */
-	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
+	BNXT_RX_DB(rx_ring, pidx);
 	return;
 }
Index: head/sys/dev/mgb/if_mgb.h
===================================================================
--- head/sys/dev/mgb/if_mgb.h
+++ head/sys/dev/mgb/if_mgb.h
@@ -178,7 +178,8 @@
 #define MGB_DESC_GET_FRAME_LEN(_desc) \
 	(((_desc)->ctl & MGB_DESC_FRAME_LEN_MASK) >> 16)
 
-#define MGB_NEXT_RING_IDX(_idx)	(((_idx) + 1) % MGB_DMA_RING_SIZE)
+#define MGB_NEXT_RING_IDX(_idx)	(((_idx) == MGB_DMA_RING_SIZE - 1) ? 0 : ((_idx) + 1))
+#define MGB_PREV_RING_IDX(_idx)	(((_idx) == 0) ? (MGB_DMA_RING_SIZE - 1) : ((_idx) - 1))
 
 #define MGB_RING_SPACE(_sc) \
 	((((_sc)->tx_ring_data.last_head - (_sc)->tx_ring_data.last_tail - 1) \
 	+ MGB_DMA_RING_SIZE ) % MGB_DMA_RING_SIZE )
Index: head/sys/dev/mgb/if_mgb.c
===================================================================
--- head/sys/dev/mgb/if_mgb.c
+++ head/sys/dev/mgb/if_mgb.c
@@ -1191,7 +1191,12 @@
 	sc = xsc;
 	KASSERT(rxqid == 0, ("tried to flush RX Channel %d.\n", rxqid));
 
-	sc->rx_ring_data.last_tail = pidx;
+	/*
+	 * According to the programming guide, last_tail must be set to
+	 * the last valid RX descriptor, rather than to the one past it.
+	 * Note that this is not true for the TX ring!
+	 */
+	sc->rx_ring_data.last_tail = MGB_PREV_RING_IDX(pidx);
 	CSR_WRITE_REG(sc, MGB_DMA_RX_TAIL(rxqid), sc->rx_ring_data.last_tail);
 	return;
 }
Index: head/sys/dev/vmware/vmxnet3/if_vmx.c
===================================================================
--- head/sys/dev/vmware/vmxnet3/if_vmx.c
+++ head/sys/dev/vmware/vmxnet3/if_vmx.c
@@ -1744,13 +1744,6 @@
 	else
 		r = VMXNET3_BAR0_RXH2(rxqid);
 
-	/*
-	 * pidx is the index of the last descriptor with a buffer the device
-	 * can use, and the device needs to be told which index is one past
-	 * that.
-	 */
-	if (++pidx == rxr->vxrxr_ndesc)
-		pidx = 0;
 	vmxnet3_write_bar0(sc, r, pidx);
 }
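The driver hunks above all converge on one convention: iflib's isc_rxd_flush callback now
receives the index of the next empty descriptor (pidx) rather than the last filled one. bnxt
and vmxnet3 hardware already want exactly that index, so they can write pidx to the doorbell
directly; mgb hardware instead wants the last valid descriptor, so it steps back one slot with
the new MGB_PREV_RING_IDX(). The standalone sketch below illustrates the two conventions and
the wrap-around macros; RING_SIZE, ring_next(), and ring_prev() are hypothetical stand-ins for
MGB_DMA_RING_SIZE and the MGB_*_RING_IDX macros, not code from this patch.

/*
 * Minimal sketch (not driver code) of the two doorbell conventions
 * reconciled by the hunks above.
 */
#include <assert.h>
#include <stdint.h>

#define RING_SIZE	256	/* hypothetical ring size */

/* Wrap without '%', mirroring the updated MGB_NEXT_RING_IDX(). */
static inline uint32_t
ring_next(uint32_t idx)
{
	return ((idx == RING_SIZE - 1) ? 0 : idx + 1);
}

/* Mirror of the new MGB_PREV_RING_IDX(). */
static inline uint32_t
ring_prev(uint32_t idx)
{
	return ((idx == 0) ? RING_SIZE - 1 : idx - 1);
}

int
main(void)
{
	uint32_t pidx = 0;	/* next empty slot, as iflib now passes it */

	/*
	 * Convention A (bnxt, vmxnet3): the device wants the index one
	 * past the last valid descriptor, i.e. pidx itself.
	 */
	uint32_t doorbell_a = pidx;

	/*
	 * Convention B (mgb): the device wants the last valid descriptor,
	 * i.e. the slot just before pidx.
	 */
	uint32_t doorbell_b = ring_prev(pidx);

	assert(ring_next(doorbell_b) == doorbell_a);
	return (0);
}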
Index: head/sys/net/iflib.c
===================================================================
--- head/sys/net/iflib.c
+++ head/sys/net/iflib.c
@@ -1959,7 +1959,8 @@
  * @count: the number of new buffers to allocate
  *
  * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
- * The caller must assure that @count does not exceed the queue's capacity.
+ * The caller must ensure that @count does not exceed the queue's capacity
+ * minus one (since we always leave one descriptor unavailable).
  */
 static uint8_t
 iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
@@ -1974,6 +1975,8 @@
 	int err, frag_idx, i, idx, n, pidx;
 	qidx_t credits;
 
+	MPASS(count <= fl->ifl_size - fl->ifl_credits - 1);
+
 	sd_m = fl->ifl_sds.ifsd_m;
 	sd_map = fl->ifl_sds.ifsd_map;
 	sd_cl = fl->ifl_sds.ifsd_cl;
@@ -2081,15 +2084,10 @@
 		fl->ifl_credits = credits;
 	}
 	DBG_COUNTER_INC(rxd_flush);
-	if (fl->ifl_pidx == 0)
-		pidx = fl->ifl_size - 1;
-	else
-		pidx = fl->ifl_pidx - 1;
-
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
-	    fl->ifl_id, pidx);
+	    fl->ifl_id, fl->ifl_pidx);
 	if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
 		fl->ifl_fragidx = frag_idx + 1;
 		if (fl->ifl_fragidx == fl->ifl_size)
@@ -2105,7 +2103,17 @@
 static inline uint8_t
 iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
 {
-	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
+	/*
+	 * We leave one descriptor unused so that pidx never catches up
+	 * with cidx, a condition that confuses most NICs. For instance,
+	 * Intel NICs have (per receive ring) RDH and RDT registers, where
+	 * RDH points to the next receive descriptor to be used by the NIC,
+	 * and RDT points to the next receive descriptor to be published by
+	 * the driver to the NIC (RDT - 1 is thus the last valid one).
+	 * The condition RDH == RDT means that no descriptors are available
+	 * to the NIC; it would be ambiguous if it could also mean that all
+	 * the descriptors are available to the NIC.
+	 */
 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
@@ -2210,12 +2218,15 @@
 
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
-	/* avoid pre-allocating zillions of clusters to an idle card
-	 * potentially speeding up attach
+	/*
+	 * Avoid pre-allocating zillions of clusters to an idle card,
+	 * potentially speeding up attach. In any case, make sure to
+	 * leave one descriptor unavailable. See the comment in
+	 * iflib_fl_refill_all().
 	 */
-	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
-	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
-	if (min(128, fl->ifl_size) != fl->ifl_credits)
+	MPASS(fl->ifl_size > 0);
+	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
+	if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
 		return (ENOBUFS);
 	/*
 	 * handle failure
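To make the new invariant concrete: with ifl_size descriptors, the free list now posts at most
ifl_size - 1 buffers to the NIC, so any refill budget is size - credits - 1, as in the new
MPASS() and in iflib_fl_refill_all(). The toy model below walks through that arithmetic;
struct toy_fl, toy_refill_budget(), and the values used are illustrative assumptions, not
iflib code.

/*
 * Toy model of the "leave one descriptor unused" budget: with one slot
 * always left empty, head == tail unambiguously means the ring holds no
 * buffers for the NIC.
 */
#include <assert.h>
#include <stdint.h>

struct toy_fl {
	uint32_t size;		/* total descriptors in the ring */
	uint32_t credits;	/* descriptors currently owned by the NIC */
};

/* Largest refill that still keeps one descriptor unused. */
static uint32_t
toy_refill_budget(const struct toy_fl *fl)
{
	return (fl->size - fl->credits - 1);
}

int
main(void)
{
	struct toy_fl fl = { .size = 1024, .credits = 0 };

	/* Even from empty, we may post at most size - 1 buffers... */
	assert(toy_refill_budget(&fl) == 1023);

	/* ...so a "full" ring keeps one hole and head == tail stays unambiguous. */
	fl.credits = 1023;
	assert(toy_refill_budget(&fl) == 0);
	return (0);
}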