Index: sys/dev/bnxt/bnxt_txrx.c
===================================================================
--- sys/dev/bnxt/bnxt_txrx.c
+++ sys/dev/bnxt/bnxt_txrx.c
@@ -316,10 +316,9 @@
 	if (softc->rx_cp_rings[rxqid].cons != UINT32_MAX)
 		BNXT_CP_IDX_DISABLE_DB(&softc->rx_cp_rings[rxqid].ring,
 		    softc->rx_cp_rings[rxqid].cons);
-	/* We're given the last filled RX buffer here, not the next empty one */
-	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
+	BNXT_RX_DB(rx_ring, pidx);
 	/* TODO: Cumulus+ doesn't need the double doorbell */
-	BNXT_RX_DB(rx_ring, RING_NEXT(rx_ring, pidx));
+	BNXT_RX_DB(rx_ring, pidx);
 
 	return;
 }
Index: sys/dev/mgb/if_mgb.h
===================================================================
--- sys/dev/mgb/if_mgb.h
+++ sys/dev/mgb/if_mgb.h
@@ -178,7 +178,8 @@
 #define MGB_DESC_GET_FRAME_LEN(_desc) \
 	(((_desc)->ctl & MGB_DESC_FRAME_LEN_MASK) >> 16)
 
-#define MGB_NEXT_RING_IDX(_idx) (((_idx) + 1) % MGB_DMA_RING_SIZE)
+#define MGB_NEXT_RING_IDX(_idx) (((_idx) == MGB_DMA_RING_SIZE - 1) ? 0 : ((_idx) + 1))
+#define MGB_PREV_RING_IDX(_idx) (((_idx) == 0) ? (MGB_DMA_RING_SIZE - 1) : ((_idx) - 1))
 #define MGB_RING_SPACE(_sc) \
 	((((_sc)->tx_ring_data.last_head - (_sc)->tx_ring_data.last_tail - 1) \
 	+ MGB_DMA_RING_SIZE ) % MGB_DMA_RING_SIZE )
Index: sys/dev/mgb/if_mgb.c
===================================================================
--- sys/dev/mgb/if_mgb.c
+++ sys/dev/mgb/if_mgb.c
@@ -1191,7 +1191,12 @@
 	sc = xsc;
 	KASSERT(rxqid == 0, ("tried to flush RX Channel %d.\n", rxqid));
 
-	sc->rx_ring_data.last_tail = pidx;
+	/*
+	 * According to the programming guide, last_tail must be set to
+	 * the last valid RX descriptor, rather than to the one past that.
+	 * Note that this is not true for the TX ring!
+	 */
+	sc->rx_ring_data.last_tail = MGB_PREV_RING_IDX(pidx);
 	CSR_WRITE_REG(sc, MGB_DMA_RX_TAIL(rxqid), sc->rx_ring_data.last_tail);
 	return;
 }
Index: sys/dev/vmware/vmxnet3/if_vmx.c
===================================================================
--- sys/dev/vmware/vmxnet3/if_vmx.c
+++ sys/dev/vmware/vmxnet3/if_vmx.c
@@ -1744,13 +1744,6 @@
 	else
 		r = VMXNET3_BAR0_RXH2(rxqid);
 
-	/*
-	 * pidx is the index of the last descriptor with a buffer the device
-	 * can use, and the device needs to be told which index is one past
-	 * that.
-	 */
-	if (++pidx == rxr->vxrxr_ndesc)
-		pidx = 0;
 	vmxnet3_write_bar0(sc, r, pidx);
 }
 
Index: sys/net/iflib.c
===================================================================
--- sys/net/iflib.c
+++ sys/net/iflib.c
@@ -832,7 +832,6 @@
 {
 	struct netmap_adapter *na = kring->na;
 	u_int const lim = kring->nkr_num_slots - 1;
-	u_int head = kring->rhead;
 	u_int nm_i = kring->nr_hwcur;
 	struct netmap_ring *ring = kring->ring;
 	bus_dmamap_t *map;
@@ -840,39 +839,46 @@
 	if_ctx_t ctx = rxq->ifr_ctx;
 	iflib_fl_t fl = &rxq->ifr_fl[0];
 	u_int nic_i_first, nic_i;
-	int i;
+	int i, n;
 #if IFLIB_DEBUG_COUNTERS
 	int rf_count = 0;
 #endif
 
 	/*
-	 * Netmap requires that we leave (at least) one free slot
-	 * in the ring, so that it can distinguish between an empty
-	 * ring (nr_hwcur == nr_hwtail, i.e. all the buffers owned by the
-	 * user) and a full ring (nr_hwtail == (nr_hwcur - 1) mod N, i.e.
-	 * all the buffers owned by the kernel).
-	 * We thus set head (the refill limit) to nr_hwcur - 1
-	 * at initialization. The rest of the code will then make sure
-	 * than nr_hwtail never overcomes nr_hwcur.
+	 * This function is used both at initialization and in rxsync.
+	 * At initialization we need to prepare (with isc_rxd_refill())
+	 * all the (N) netmap buffers in the ring, so as to keep
+	 * fl->ifl_pidx and kring->nr_hwcur in sync (except for
+	 * kring->nkr_hwofs); at rxsync time, both indexes point to the
+	 * next buffer to be refilled.
+	 * In any case we publish (with isc_rxd_flush()) up to
+	 * (fl->ifl_pidx - 1) % N (included), to prevent the NIC tail/prod
+	 * pointer from overrunning the head/cons pointer, although this is
+	 * not necessary for some NICs (e.g. vmx).
 	 */
-	if (__predict_false(init)) {
-		head = nm_prev(nm_i, lim);
-	} else if (nm_i == head) {
-		/* Nothing to do. We can leave early. */
-		return (0);
+	if (__predict_false(init))
+		n = kring->nkr_num_slots;
+	else {
+		n = kring->rhead - nm_i;
+		if (n == 0)
+			return (0); /* Nothing to do. */
+		if (n < 0)
+			n += kring->nkr_num_slots;
 	}
 
+	/* Start to refill from nr_hwcur, publishing n buffers. */
 	iru_init(&iru, rxq, 0 /* flid */);
 	map = fl->ifl_sds.ifsd_map;
-	nic_i = netmap_idx_k2n(kring, nm_i);
+	nic_i = fl->ifl_pidx;
+	MPASS(nic_i == netmap_idx_k2n(kring, nm_i));
 	DBG_COUNTER_INC(fl_refills);
-	while (nm_i != head) {
+	while (n > 0) {
 #if IFLIB_DEBUG_COUNTERS
 		if (++rf_count == 9)
 			DBG_COUNTER_INC(fl_refills_large);
 #endif
 		nic_i_first = nic_i;
-		for (i = 0; i < IFLIB_MAX_RX_REFRESH && nm_i != head; i++) {
+		for (i = 0; n > 0 && i < IFLIB_MAX_RX_REFRESH; n--, i++) {
 			struct netmap_slot *slot = &ring->slot[nm_i];
 			void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[i]);
 
@@ -903,11 +909,11 @@
 		iru.iru_count = i;
 		ctx->isc_rxd_refill(ctx->ifc_softc, &iru);
 	}
-	kring->nr_hwcur = head;
+	fl->ifl_pidx = nic_i;
+	MPASS(!init || nm_i == 0);
+	MPASS(nm_i == kring->rhead);
+	kring->nr_hwcur = nm_i;
 
-	/* The pidx argument of isc_rxd_flush() is the index of the last valid
-	 * slot in the free list ring. We need therefore to decrement nic_i,
-	 * similarly to what happens in iflib_fl_refill() for ifl_pidx. */
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id,
@@ -1953,7 +1959,8 @@
  * @count: the number of new buffers to allocate
  *
  * (Re)populate an rxq free-buffer list with up to @count new packet buffers.
- * The caller must assure that @count does not exceed the queue's capacity.
+ * The caller must ensure that @count does not exceed the queue's capacity
+ * minus one (since we always leave a descriptor unavailable).
  */
 static uint8_t
 iflib_fl_refill(if_ctx_t ctx, iflib_fl_t fl, int count)
@@ -2075,15 +2082,10 @@
 		fl->ifl_credits = credits;
 	}
 	DBG_COUNTER_INC(rxd_flush);
-	if (fl->ifl_pidx == 0)
-		pidx = fl->ifl_size - 1;
-	else
-		pidx = fl->ifl_pidx - 1;
-
 	bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map,
 	    BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
 	ctx->isc_rxd_flush(ctx->ifc_softc, fl->ifl_rxq->ifr_id,
-	    fl->ifl_id, pidx);
+	    fl->ifl_id, fl->ifl_pidx);
 	if (__predict_true(bit_test(fl->ifl_rx_bitmap, frag_idx))) {
 		fl->ifl_fragidx = frag_idx + 1;
 		if (fl->ifl_fragidx == fl->ifl_size)
@@ -2099,7 +2101,17 @@
 static inline uint8_t
 iflib_fl_refill_all(if_ctx_t ctx, iflib_fl_t fl)
 {
-	/* we avoid allowing pidx to catch up with cidx as it confuses ixl */
+	/*
+	 * We leave an unused descriptor to prevent pidx from catching up
+	 * with cidx, since that condition confuses most NICs. For instance,
+	 * Intel NICs have (per receive ring) RDH and RDT registers, where
+	 * RDH points to the next receive descriptor to be used by the NIC,
+	 * and RDT points to the next receive descriptor to be published by
+	 * the driver to the NIC (RDT - 1 is thus the last valid one).
+	 * The condition RDH == RDT means that no descriptors are available
+	 * to the NIC, so it cannot also be used to mean that all the
+	 * descriptors are available to the NIC.
+	 */
 	int32_t reclaimable = fl->ifl_size - fl->ifl_credits - 1;
 #ifdef INVARIANTS
 	int32_t delta = fl->ifl_size - get_inuse(fl->ifl_size, fl->ifl_cidx, fl->ifl_pidx, fl->ifl_gen) - 1;
@@ -2204,12 +2216,15 @@
 	fl->ifl_zone = m_getzone(fl->ifl_buf_size);
 
 
-	/* avoid pre-allocating zillions of clusters to an idle card
-	 * potentially speeding up attach
+	/*
+	 * Avoid pre-allocating zillions of clusters to an idle card,
+	 * potentially speeding up attach. In any case, make sure
+	 * to leave a descriptor unavailable.
 	 */
-	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size));
-	MPASS(min(128, fl->ifl_size) == fl->ifl_credits);
-	if (min(128, fl->ifl_size) != fl->ifl_credits)
+	MPASS(fl->ifl_size > 0);
+	(void)iflib_fl_refill(ctx, fl, min(128, fl->ifl_size - 1));
+	MPASS(min(128, fl->ifl_size - 1) == fl->ifl_credits);
+	if (min(128, fl->ifl_size - 1) != fl->ifl_credits)
 		return (ENOBUFS);
 	/*
 	 * handle failure
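
Note (not part of the patch): the sketch below is a minimal, self-contained illustration of the ring-index conventions described in the comments above. RING_SIZE, ring_next() and ring_prev() are hypothetical stand-ins, written in the same style as MGB_NEXT_RING_IDX()/MGB_PREV_RING_IDX(); only the indexing rules mirror the RDH/RDT discussion and the new isc_rxd_flush() convention (pidx is the next empty slot, one past the last valid descriptor).

/*
 * Toy descriptor ring: the producer index always stops one slot short of
 * the consumer index, so prod == cons unambiguously means "ring empty".
 */
#include <assert.h>
#include <stdio.h>

#define RING_SIZE	8	/* hypothetical number of descriptors */

/* Same wrap-around style as MGB_NEXT_RING_IDX()/MGB_PREV_RING_IDX(). */
static unsigned
ring_next(unsigned idx)
{
	return (idx == RING_SIZE - 1) ? 0 : idx + 1;
}

static unsigned
ring_prev(unsigned idx)
{
	return (idx == 0) ? RING_SIZE - 1 : idx - 1;
}

int
main(void)
{
	unsigned cons = 0;	/* next descriptor the device consumes (RDH-like) */
	unsigned prod = 0;	/* next empty descriptor to be filled (RDT-like) */
	unsigned filled = 0;

	/*
	 * Publish buffers but always leave one descriptor unused: if prod
	 * were allowed to catch up with cons, prod == cons would mean both
	 * "ring empty" and "ring full", which is the RDH == RDT ambiguity
	 * described above.
	 */
	while (ring_next(prod) != cons) {
		/* "fill" the descriptor at index prod, then advance */
		prod = ring_next(prod);
		filled++;
	}
	assert(filled == RING_SIZE - 1);

	/*
	 * With the new convention, 'prod' itself (the next empty slot) is
	 * what gets handed to the driver; a device such as mgb, whose tail
	 * register must point at the last valid descriptor, converts it
	 * with ring_prev(), while devices like vmx or bnxt write it as-is.
	 */
	printf("next empty slot: %u, last valid slot: %u\n",
	    prod, ring_prev(prod));
	return (0);
}

Running the sketch fills RING_SIZE - 1 descriptors and prints the two views of the same position (next empty vs. last valid), which is exactly the conversion the per-driver hunks above perform.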