diff --git a/sys/dev/axgbe/if_axgbe_pci.c b/sys/dev/axgbe/if_axgbe_pci.c --- a/sys/dev/axgbe/if_axgbe_pci.c +++ b/sys/dev/axgbe/if_axgbe_pci.c @@ -1922,7 +1922,7 @@ channel = pdata->channel[i]; snprintf(buf, sizeof(buf), "rxq%d", i); - error = iflib_irq_alloc_generic(ctx, &irq, rid, IFLIB_INTR_RX, + error = iflib_irq_alloc_generic(ctx, &irq, rid, IFLIB_INTR_RXTX, axgbe_msix_que, channel, channel->queue_index, buf); if (error) { diff --git a/sys/dev/bnxt/if_bnxt.c b/sys/dev/bnxt/if_bnxt.c --- a/sys/dev/bnxt/if_bnxt.c +++ b/sys/dev/bnxt/if_bnxt.c @@ -1534,7 +1534,7 @@ for (i=0; iscctx->isc_nrxqsets; i++) { snprintf(irq_name, sizeof(irq_name), "rxq%d", i); rc = iflib_irq_alloc_generic(ctx, &softc->rx_cp_rings[i].irq, - softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RX, + softc->rx_cp_rings[i].ring.id + 1, IFLIB_INTR_RXTX, bnxt_handle_rx_cp, &softc->rx_cp_rings[i], i, irq_name); if (rc) { device_printf(iflib_get_dev(ctx), diff --git a/sys/dev/ice/if_ice_iflib.c b/sys/dev/ice/if_ice_iflib.c --- a/sys/dev/ice/if_ice_iflib.c +++ b/sys/dev/ice/if_ice_iflib.c @@ -1493,7 +1493,7 @@ snprintf(irq_name, sizeof(irq_name), "rxq%d", i); err = iflib_irq_alloc_generic(ctx, &sc->irqvs[vector].irq, rid, - IFLIB_INTR_RX, ice_msix_que, + IFLIB_INTR_RXTX, ice_msix_que, rxq, rxq->me, irq_name); if (err) { device_printf(sc->dev, diff --git a/sys/dev/ixgbe/if_ix.c b/sys/dev/ixgbe/if_ix.c --- a/sys/dev/ixgbe/if_ix.c +++ b/sys/dev/ixgbe/if_ix.c @@ -2026,7 +2026,7 @@ snprintf(buf, sizeof(buf), "rxq%d", i); error = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid, - IFLIB_INTR_RX, ixgbe_msix_que, rx_que, rx_que->rxr.me, buf); + IFLIB_INTR_RXTX, ixgbe_msix_que, rx_que, rx_que->rxr.me, buf); if (error) { device_printf(iflib_get_dev(ctx), diff --git a/sys/dev/ixgbe/if_ixv.c b/sys/dev/ixgbe/if_ixv.c --- a/sys/dev/ixgbe/if_ixv.c +++ b/sys/dev/ixgbe/if_ixv.c @@ -1030,7 +1030,7 @@ snprintf(buf, sizeof(buf), "rxq%d", i); error = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid, - IFLIB_INTR_RX, ixv_msix_que, rx_que, rx_que->rxr.me, buf); + IFLIB_INTR_RXTX, ixv_msix_que, rx_que, rx_que->rxr.me, buf); if (error) { device_printf(iflib_get_dev(ctx), diff --git a/sys/dev/ixl/if_iavf.c b/sys/dev/ixl/if_iavf.c --- a/sys/dev/ixl/if_iavf.c +++ b/sys/dev/ixl/if_iavf.c @@ -904,7 +904,7 @@ snprintf(buf, sizeof(buf), "rxq%d", i); err = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid, - IFLIB_INTR_RX, iavf_msix_que, rx_que, rx_que->rxr.me, buf); + IFLIB_INTR_RXTX, iavf_msix_que, rx_que, rx_que->rxr.me, buf); /* XXX: Does the driver work as expected if there are fewer num_rx_queues than * what's expected in the iflib context? */ if (err) { diff --git a/sys/dev/ixl/if_ixl.c b/sys/dev/ixl/if_ixl.c --- a/sys/dev/ixl/if_ixl.c +++ b/sys/dev/ixl/if_ixl.c @@ -1051,7 +1051,7 @@ snprintf(buf, sizeof(buf), "rxq%d", i); err = iflib_irq_alloc_generic(ctx, &rx_que->que_irq, rid, - IFLIB_INTR_RX, ixl_msix_que, rx_que, rx_que->rxr.me, buf); + IFLIB_INTR_RXTX, ixl_msix_que, rx_que, rx_que->rxr.me, buf); /* XXX: Does the driver work as expected if there are fewer num_rx_queues than * what's expected in the iflib context? */ if (err) { diff --git a/sys/dev/mgb/if_mgb.c b/sys/dev/mgb/if_mgb.c --- a/sys/dev/mgb/if_mgb.c +++ b/sys/dev/mgb/if_mgb.c @@ -855,7 +855,7 @@ vectorid++; snprintf(irq_name, sizeof(irq_name), "rxq%d", i); error = iflib_irq_alloc_generic(ctx, &sc->rx_irq, vectorid + 1, - IFLIB_INTR_RX, mgb_rxq_intr, sc, i, irq_name); + IFLIB_INTR_RXTX, mgb_rxq_intr, sc, i, irq_name); if (error) { device_printf(sc->dev, "Failed to register rxq %d interrupt handler\n", i); diff --git a/sys/dev/vmware/vmxnet3/if_vmx.c b/sys/dev/vmware/vmxnet3/if_vmx.c --- a/sys/dev/vmware/vmxnet3/if_vmx.c +++ b/sys/dev/vmware/vmxnet3/if_vmx.c @@ -480,7 +480,7 @@ rxq = &sc->vmx_rxq[i]; error = iflib_irq_alloc_generic(ctx, &rxq->vxrxq_irq, i + 1, - IFLIB_INTR_RX, vmxnet3_rxq_intr, rxq, i, irq_name); + IFLIB_INTR_RXTX, vmxnet3_rxq_intr, rxq, i, irq_name); if (error) { device_printf(iflib_get_dev(ctx), "Failed to register rxq %d interrupt handler\n", i); diff --git a/sys/net/iflib.h b/sys/net/iflib.h --- a/sys/net/iflib.h +++ b/sys/net/iflib.h @@ -288,10 +288,27 @@ #define IFLIB_MAGIC 0xCAFEF00D typedef enum { + /* Interrupt or softirq handles only receive */ IFLIB_INTR_RX, + + /* Interrupt or softirq handles only transmit */ IFLIB_INTR_TX, + + /* + * Interrupt will check for both pending receive + * and available tx credits and dispatch a task + * for one or both depending on the disposition + * of the respective queues. + */ IFLIB_INTR_RXTX, + + /* + * Other interrupt - typically link status and + * or error conditions. + */ IFLIB_INTR_ADMIN, + + /* Softirq (task) for iov handling */ IFLIB_INTR_IOV, } iflib_intr_type_t; diff --git a/sys/net/iflib.c b/sys/net/iflib.c --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -585,6 +585,10 @@ static int iflib_no_tx_batch = 0; SYSCTL_INT(_net_iflib, OID_AUTO, no_tx_batch, CTLFLAG_RW, &iflib_no_tx_batch, 0, "minimize transmit latency at the possible expense of throughput"); +static int iflib_timer_default = 1000; +SYSCTL_INT(_net_iflib, OID_AUTO, timer_default, CTLFLAG_RW, + &iflib_timer_default, 0, "number of ticks between iflib_timer calls"); + #if IFLIB_DEBUG_COUNTERS @@ -2293,7 +2297,7 @@ ** can be done without the lock because its RO ** and the HUNG state will be static if set. */ - if (this_tick - txq->ift_last_timer_tick >= hz / 2) { + if (this_tick - txq->ift_last_timer_tick >= iflib_timer_default) { txq->ift_last_timer_tick = this_tick; IFDI_TIMER(ctx, txq->ift_id); if ((txq->ift_qstatus == IFLIB_QUEUE_HUNG) && @@ -2303,7 +2307,8 @@ if (txq->ift_qstatus != IFLIB_QUEUE_IDLE && ifmp_ring_is_stalled(txq->ift_br)) { - KASSERT(ctx->ifc_link_state == LINK_STATE_UP, ("queue can't be marked as hung if interface is down")); + KASSERT(ctx->ifc_link_state == LINK_STATE_UP, + ("queue can't be marked as hung if interface is down")); txq->ift_qstatus = IFLIB_QUEUE_HUNG; } txq->ift_cleaned_prev = txq->ift_cleaned; @@ -2314,7 +2319,8 @@ sctx->isc_pause_frames = 0; if (if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_RUNNING) - callout_reset_on(&txq->ift_timer, hz / 2, iflib_timer, txq, txq->ift_timer.c_cpu); + callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, + txq, txq->ift_timer.c_cpu); return; hung: @@ -2426,7 +2432,7 @@ IFDI_INTR_ENABLE(ctx); txq = ctx->ifc_txqs; for (i = 0; i < sctx->isc_ntxqsets; i++, txq++) - callout_reset_on(&txq->ift_timer, hz/2, iflib_timer, txq, + callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq, txq->ift_timer.c_cpu); } @@ -3001,26 +3007,37 @@ /* XXX we should be setting this to something other than zero */ #define RECLAIM_THRESH(ctx) ((ctx)->ifc_sctx->isc_tx_reclaim_thresh) -#define MAX_TX_DESC(ctx) max((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \ +#define MAX_TX_DESC(ctx) MAX((ctx)->ifc_softc_ctx.isc_tx_tso_segments_max, \ (ctx)->ifc_softc_ctx.isc_tx_nsegments) static inline bool -iflib_txd_db_check(if_ctx_t ctx, iflib_txq_t txq, int ring, qidx_t in_use) +iflib_txd_db_check(iflib_txq_t txq, int ring) { + if_ctx_t ctx = txq->ift_ctx; qidx_t dbval, max; - bool rang; - rang = false; - max = TXQ_MAX_DB_DEFERRED(txq, in_use); - if (ring || txq->ift_db_pending >= max) { + max = TXQ_MAX_DB_DEFERRED(txq, txq->ift_in_use); + + /* force || threshold exceeded || at the edge of the ring */ + if (ring || (txq->ift_db_pending >= max) || (TXQ_AVAIL(txq) <= MAX_TX_DESC(ctx) + 2)) { + + /* + * 'npending' is used if the card's doorbell is in terms of the number of descriptors + * pending flush (BRCM). 'pidx' is used in cases where the card's doorbeel uses the + * producer index explicitly (INTC). + */ dbval = txq->ift_npending ? txq->ift_npending : txq->ift_pidx; bus_dmamap_sync(txq->ift_ifdi->idi_tag, txq->ift_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); ctx->isc_txd_flush(ctx->ifc_softc, txq->ift_id, dbval); + + /* + * Absent bugs there are zero packets pending so reset pending counts to zero. + */ txq->ift_db_pending = txq->ift_npending = 0; - rang = true; + return (true); } - return (rang); + return (false); } #ifdef PKT_DEBUG @@ -3481,6 +3498,7 @@ MPASS(pi.ipi_new_pidx != pidx); MPASS(ndesc > 0); txq->ift_in_use += ndesc; + txq->ift_db_pending += ndesc; /* * We update the last software descriptor again here because there may @@ -3647,8 +3665,8 @@ if_ctx_t ctx = txq->ift_ctx; if_t ifp = ctx->ifc_ifp; struct mbuf *m, **mp; - int avail, bytes_sent, consumed, count, err, i, in_use_prev; - int mcast_sent, pkt_sent, reclaimed, txq_avail; + int avail, bytes_sent, skipped, count, err, i; + int mcast_sent, pkt_sent, reclaimed; bool do_prefetch, rang, ring; if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) || @@ -3657,9 +3675,13 @@ return (0); } reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx)); - rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use); + rang = iflib_txd_db_check(txq, reclaimed && txq->ift_db_pending); avail = IDXDIFF(pidx, cidx, r->size); + if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) { + /* + * The driver is unloading so we need to free all pending packets. + */ DBG_COUNTER_INC(txq_drain_flushing); for (i = 0; i < avail; i++) { if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq)) @@ -3677,9 +3699,13 @@ DBG_COUNTER_INC(txq_drain_oactive); return (0); } + + /* + * If we've reclaimed any packets this queue cannot be hung. + */ if (reclaimed) txq->ift_qstatus = IFLIB_QUEUE_IDLE; - consumed = mcast_sent = bytes_sent = pkt_sent = 0; + skipped = mcast_sent = bytes_sent = pkt_sent = 0; count = MIN(avail, TX_BATCH_SIZE); #ifdef INVARIANTS if (iflib_verbose_debug) @@ -3687,54 +3713,57 @@ avail, ctx->ifc_flags, TXQ_AVAIL(txq)); #endif do_prefetch = (ctx->ifc_flags & IFC_PREFETCH); - txq_avail = TXQ_AVAIL(txq); err = 0; - for (i = 0; i < count && txq_avail > MAX_TX_DESC(ctx) + 2; i++) { + for (i = 0; i < count && TXQ_AVAIL(txq) >= MAX_TX_DESC(ctx) + 2; i++) { int rem = do_prefetch ? count - i : 0; mp = _ring_peek_one(r, cidx, i, rem); MPASS(mp != NULL && *mp != NULL); + + /* + * Completion interrupts will use the address of the txq + * as a sentinel to enqueue _something_ in order to acquire + * the lock on the mp_ring (there's no direct lock call). + * We obviously whave to check for these sentinel cases + * and skip them. + */ if (__predict_false(*mp == (struct mbuf *)txq)) { - consumed++; + skipped++; continue; } - in_use_prev = txq->ift_in_use; err = iflib_encap(txq, mp); if (__predict_false(err)) { /* no room - bail out */ if (err == ENOBUFS) break; - consumed++; + skipped++; /* we can't send this packet - skip it */ continue; } - consumed++; pkt_sent++; m = *mp; DBG_COUNTER_INC(tx_sent); bytes_sent += m->m_pkthdr.len; mcast_sent += !!(m->m_flags & M_MCAST); - txq_avail = TXQ_AVAIL(txq); - txq->ift_db_pending += (txq->ift_in_use - in_use_prev); - ETHER_BPF_MTAP(ifp, m); if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING))) break; - rang = iflib_txd_db_check(ctx, txq, false, in_use_prev); + ETHER_BPF_MTAP(ifp, m); + rang = iflib_txd_db_check(txq, false); } /* deliberate use of bitwise or to avoid gratuitous short-circuit */ - ring = rang ? false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx)); - iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use); + ring = rang ? false : (iflib_min_tx_latency | err); + iflib_txd_db_check(txq, ring); if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent); if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent); if (mcast_sent) if_inc_counter(ifp, IFCOUNTER_OMCASTS, mcast_sent); #ifdef INVARIANTS if (iflib_verbose_debug) - printf("consumed=%d\n", consumed); + printf("consumed=%d\n", skipped + pkt_sent); #endif - return (consumed); + return (skipped + pkt_sent); } static uint32_t @@ -3907,7 +3936,7 @@ } IFDI_UPDATE_ADMIN_STATUS(ctx); for (txq = ctx->ifc_txqs, i = 0; i < sctx->isc_ntxqsets; i++, txq++) { - callout_reset_on(&txq->ift_timer, hz / 2, iflib_timer, txq, + callout_reset_on(&txq->ift_timer, iflib_timer_default, iflib_timer, txq, txq->ift_timer.c_cpu); } IFDI_LINK_INTR_ENABLE(ctx); @@ -5301,6 +5330,7 @@ static int iflib_module_init(void) { + iflib_timer_default = hz / 2; return (0); } @@ -6861,7 +6891,7 @@ txq = &ctx->ifc_txqs[0]; error = iflib_encap(txq, &m); if (error == 0) - (void)iflib_txd_db_check(ctx, txq, true, txq->ift_in_use); + (void)iflib_txd_db_check(txq, true); return (error); }