Index: sys/net/iflib.c
===================================================================
--- sys/net/iflib.c
+++ sys/net/iflib.c
@@ -191,6 +191,7 @@
 	uint16_t ifc_sysctl_tx_abdicate;
 	uint16_t ifc_sysctl_core_offset;
 #define	CORE_OFFSET_UNSPECIFIED	0xffff
+	int ifc_sysctl_bypassmp;
 	uint8_t  ifc_sysctl_separate_txrx;
 	qidx_t ifc_sysctl_ntxds[8];
@@ -3511,7 +3512,11 @@
 
 	/*
 	 * Need a rate-limiting check so that this isn't called every time
+	 * XXX: this early return is a crude rate limit; needs discussion.
 	 */
+	if (txq->ift_in_use < thresh) {
+		return (0);
+	}
 	iflib_tx_credits_update(ctx, txq);
 	reclaim = DESC_RECLAIMABLE(txq);
 
@@ -3534,14 +3539,14 @@
 }
 
 static struct mbuf **
-_ring_peek_one(struct ifmp_ring *r, int cidx, int offset, int remaining)
+_ring_peek_one(struct ifmp_ring *r, void *volatile *itms, int cidx, int offset, int remaining)
 {
 	int next, size;
 	struct mbuf **items;
 
 	size = r->size;
 	next = (cidx + CACHE_PTR_INCREMENT) & (size-1);
-	items = __DEVOLATILE(struct mbuf **, &r->items[0]);
+	items = __DEVOLATILE(struct mbuf **, &itms[0]);
 
 	prefetch(items[(cidx + offset) & (size-1)]);
 	if (remaining > 1) {
@@ -3550,14 +3555,16 @@
 		prefetch2cachelines(&items[next]);
 		prefetch2cachelines(items[(cidx + offset + 2) & (size-1)]);
 		prefetch2cachelines(items[(cidx + offset + 3) & (size-1)]);
 	}
-	return (__DEVOLATILE(struct mbuf **, &r->items[(cidx + offset) & (size-1)]));
+	return (__DEVOLATILE(struct mbuf **, &itms[(cidx + offset) & (size-1)]));
 }
 
 static void
 iflib_txq_check_drain(iflib_txq_t txq, int budget)
 {
+	if_ctx_t ctx = txq->ift_ctx;
 
-	ifmp_ring_check_drainage(txq->ift_br, budget);
+	if (!ctx->ifc_sysctl_bypassmp)
+		ifmp_ring_check_drainage(txq->ift_br, budget);
 }
 
 static uint32_t
@@ -3575,11 +3582,90 @@
 }
 
 static uint32_t
-iflib_txq_drain(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
+iflib_txq_drain_bypass(struct ifmp_ring *r, struct mbuf *m)
 {
 	iflib_txq_t txq = r->cookie;
 	if_ctx_t ctx = txq->ift_ctx;
 	if_t ifp = ctx->ifc_ifp;
+	int reclaimed, err, consumed, in_use_prev;
+	bool rang, ring;
+
+	if (__predict_false(!(if_getdrvflags(ifp) & IFF_DRV_RUNNING) ||
+	    !LINK_ACTIVE(ctx))) {
+		DBG_COUNTER_INC(txq_drain_notready);
+		return (0);
+	}
+
+	CALLOUT_LOCK(txq);
+
+	reclaimed = iflib_completed_tx_reclaim(txq, txq->ift_size/2);
+	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
+
+	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
+		CALLOUT_UNLOCK(txq);
+		DBG_COUNTER_INC(txq_drain_flushing);
+		m_freem(m);
+		return (1);
+	}
+
+	if (__predict_false(if_getdrvflags(ctx->ifc_ifp) & IFF_DRV_OACTIVE)) {
+		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
+		callout_stop(&txq->ift_timer);
+		CALLOUT_UNLOCK(txq);
+		DBG_COUNTER_INC(txq_drain_oactive);
+		return (0);
+	}
+
+	if (reclaimed) {
+		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
+	}
+
+	consumed = err = 0;
+	if (__predict_true(TXQ_AVAIL(txq) > MAX_TX_DESC(ctx) + 2)) {
+		in_use_prev = txq->ift_in_use;
+		err = iflib_encap(txq, &m);
+		if (__predict_false(err)) {
+			/* no room - bail out */
+			CALLOUT_UNLOCK(txq);
+			if (err == ENOBUFS)
+				goto done;
+			consumed++;
+			/* We can't send this packet - skip it. */
+			/* iflib_encap() already freed the mbuf for this error, */
+			/* so it must not be freed again here. */
+			goto done;
+		}
+		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
+		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
+		/* deliberate use of bitwise or to avoid gratuitous short-circuit */
+		ring = rang ? false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
+		iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
+		CALLOUT_UNLOCK(txq);
+		if_inc_counter(ifp, IFCOUNTER_OBYTES, m->m_pkthdr.len);
+		if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
+		if_inc_counter(ifp, IFCOUNTER_OMCASTS, !!(m->m_flags & M_MCAST));
+		consumed++;
+		DBG_COUNTER_INC(tx_sent);
+
+		ETHER_BPF_MTAP(ifp, m);
+	}
+	else
+		CALLOUT_UNLOCK(txq);
+
+done:
+#ifdef INVARIANTS
+	if (iflib_verbose_debug)
+		printf("consumed=%d\n", consumed);
+#endif
+	return (consumed);
+}
+
+static uint32_t
+iflib_txq_drain(struct ifmp_ring *r, void *volatile *items, uint32_t cidx, uint32_t pidx)
+{
+	iflib_txq_t txq = r->cookie;
+	if_ctx_t ctx = txq->ift_ctx;
+	if_t ifp = ctx->ifc_ifp;
 	struct mbuf *m, **mp;
 	int avail, bytes_sent, consumed, count, err, i, in_use_prev;
 	int mcast_sent, pkt_sent, reclaimed, txq_avail;
@@ -3590,15 +3676,25 @@
 		DBG_COUNTER_INC(txq_drain_notready);
 		return (0);
 	}
-	reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_LOCK(txq);
+	if (!ctx->ifc_sysctl_bypassmp)
+		reclaimed = iflib_completed_tx_reclaim(txq, RECLAIM_THRESH(ctx));
+	else if (cidx == pidx)
+		reclaimed = iflib_completed_tx_reclaim(txq, 0);
+	else
+		reclaimed = iflib_completed_tx_reclaim(txq, txq->ift_size/2);
 	rang = iflib_txd_db_check(ctx, txq, reclaimed, txq->ift_in_use);
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_UNLOCK(txq);
 	avail = IDXDIFF(pidx, cidx, r->size);
 	if (__predict_false(ctx->ifc_flags & IFC_QFLUSH)) {
 		DBG_COUNTER_INC(txq_drain_flushing);
 		for (i = 0; i < avail; i++) {
-			if (__predict_true(r->items[(cidx + i) & (r->size-1)] != (void *)txq))
-				m_free(r->items[(cidx + i) & (r->size-1)]);
-			r->items[(cidx + i) & (r->size-1)] = NULL;
+			if (__predict_true(items[(cidx + i) & (r->size-1)] != (void *)txq))
+				m_free(items[(cidx + i) & (r->size-1)]);
+			items[(cidx + i) & (r->size-1)] = NULL;
 		}
 		return (avail);
 	}
@@ -3611,8 +3707,13 @@
 		DBG_COUNTER_INC(txq_drain_oactive);
 		return (0);
 	}
-	if (reclaimed)
+	if (reclaimed) {
+		if (ctx->ifc_sysctl_bypassmp)
+			CALLOUT_LOCK(txq);
 		txq->ift_qstatus = IFLIB_QUEUE_IDLE;
+		if (ctx->ifc_sysctl_bypassmp)
+			CALLOUT_UNLOCK(txq);
+	}
 	consumed = mcast_sent = bytes_sent = pkt_sent = 0;
 	count = MIN(avail, TX_BATCH_SIZE);
 #ifdef INVARIANTS
@@ -3621,45 +3722,60 @@
 		avail, ctx->ifc_flags, TXQ_AVAIL(txq));
 #endif
 	do_prefetch = (ctx->ifc_flags & IFC_PREFETCH);
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_LOCK(txq);
 	txq_avail = TXQ_AVAIL(txq);
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_UNLOCK(txq);
 	err = 0;
+	/* XXX: txq_avail is sampled under the lock but used after it is dropped. */
 	for (i = 0; i < count && txq_avail > MAX_TX_DESC(ctx) + 2; i++) {
 		int rem = do_prefetch ? count - i : 0;
-		mp = _ring_peek_one(r, cidx, i, rem);
+		mp = _ring_peek_one(r, items, cidx, i, rem);
 		MPASS(mp != NULL && *mp != NULL);
 		if (__predict_false(*mp == (struct mbuf *)txq)) {
 			consumed++;
 			continue;
 		}
+		if (ctx->ifc_sysctl_bypassmp)
+			CALLOUT_LOCK(txq);
 		in_use_prev = txq->ift_in_use;
 		err = iflib_encap(txq, mp);
 		if (__predict_false(err)) {
 			/* no room - bail out */
+			if (ctx->ifc_sysctl_bypassmp)
+				CALLOUT_UNLOCK(txq);
 			if (err == ENOBUFS)
 				break;
 			consumed++;
 			/* we can't send this packet - skip it */
 			continue;
 		}
+		txq_avail = TXQ_AVAIL(txq);
+		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
+		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
+		if (ctx->ifc_sysctl_bypassmp)
+			CALLOUT_UNLOCK(txq);
 		consumed++;
 		pkt_sent++;
 		m = *mp;
 		DBG_COUNTER_INC(tx_sent);
 		bytes_sent += m->m_pkthdr.len;
 		mcast_sent += !!(m->m_flags & M_MCAST);
-		txq_avail = TXQ_AVAIL(txq);
-		txq->ift_db_pending += (txq->ift_in_use - in_use_prev);
 		ETHER_BPF_MTAP(ifp, m);
 		if (__predict_false(!(ifp->if_drv_flags & IFF_DRV_RUNNING)))
			break;
-		rang = iflib_txd_db_check(ctx, txq, false, in_use_prev);
 	}
 
 	/* deliberate use of bitwise or to avoid gratuitous short-circuit */
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_LOCK(txq);
 	ring = rang ? false : (iflib_min_tx_latency | err) || (TXQ_AVAIL(txq) < MAX_TX_DESC(ctx));
 	iflib_txd_db_check(ctx, txq, ring, txq->ift_in_use);
+	if (ctx->ifc_sysctl_bypassmp)
+		CALLOUT_UNLOCK(txq);
 	if_inc_counter(ifp, IFCOUNTER_OBYTES, bytes_sent);
 	if_inc_counter(ifp, IFCOUNTER_OPACKETS, pkt_sent);
 	if (mcast_sent)
@@ -3678,7 +3794,7 @@
 }
 
 static uint32_t
-iflib_txq_drain_free(struct ifmp_ring *r, uint32_t cidx, uint32_t pidx)
+iflib_txq_drain_free(struct ifmp_ring *r, void *volatile *items, uint32_t cidx, uint32_t pidx)
 {
 	int i, avail;
 	struct mbuf **mp;
@@ -3693,7 +3809,7 @@
 
 	avail = IDXDIFF(pidx, cidx, r->size);
 	for (i = 0; i < avail; i++) {
-		mp = _ring_peek_one(r, cidx, i, avail - i);
+		mp = _ring_peek_one(r, items, cidx, i, avail - i);
 		if (__predict_false(*mp == (struct mbuf *)txq))
 			continue;
 		m_freem(*mp);
@@ -3706,13 +3822,15 @@
 static void
 iflib_ifmp_purge(iflib_txq_t txq)
 {
+	if_ctx_t ctx = txq->ift_ctx;
 	struct ifmp_ring *r;
 
 	r = txq->ift_br;
 	r->drain = iflib_txq_drain_free;
 	r->can_drain = iflib_txq_drain_always;
 
-	ifmp_ring_check_drainage(r, r->size);
+	if (!ctx->ifc_sysctl_bypassmp)
+		ifmp_ring_check_drainage(r, r->size);
 
 	r->drain = iflib_txq_drain;
 	r->can_drain = iflib_txq_can_drain;
@@ -3750,15 +3868,24 @@
 	if (ALTQ_IS_ENABLED(&ifp->if_snd))
 		iflib_altq_if_start(ifp);
 #endif
-	if (txq->ift_db_pending)
-		ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
-	else if (!abdicate)
-		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
-	/*
-	 * When abdicating, we always need to check drainage, not just when we don't enqueue
-	 */
-	if (abdicate)
-		ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
+	if (ctx->ifc_sysctl_bypassmp) {
+		CALLOUT_LOCK(txq);
+		if (txq->ift_db_pending)
+			iflib_txd_db_check(ctx, txq, true, txq->ift_in_use);
+		if (txq->ift_in_use)
+			txq->ift_br->drain(txq->ift_br, NULL, 0, 0);	/* cidx == pidx: reclaim only */
+		CALLOUT_UNLOCK(txq);
+	} else {
+		if (txq->ift_db_pending)
+			ifmp_ring_enqueue(txq->ift_br, (void **)&txq, 1, TX_BATCH_SIZE, abdicate);
+		else if (!abdicate)
+			ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
+		/*
+		 * When abdicating, we always need to check drainage, not just when we don't enqueue
+		 */
+		if (abdicate)
+			ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE);
+	}
 	if (ctx->ifc_flags & IFC_LEGACY)
 		IFDI_INTR_ENABLE(ctx);
 	else
@@ -3994,6 +4121,41 @@
 	return (err);
 }
 
+static int
+iflib_if_transmit_bypass(if_t ifp, struct mbuf *m)
+{
+	if_ctx_t ctx = if_getsoftc(ifp);
+
+	iflib_txq_t txq;
+	int err, qidx;
+
+	if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) {
+		DBG_COUNTER_INC(tx_frees);
+		m_freem(m);
+		return (ENETDOWN);
+	}
+
+	MPASS(m->m_nextpkt == NULL);
+	/* ALTQ-enabled interfaces always use queue 0. */
+	qidx = 0;
+	if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd))
+		qidx = QIDX(ctx, m);
+	/*
+	 * XXX calculate buf_ring based on flowid (divvy up bits?)
+	 */
+	txq = &ctx->ifc_txqs[qidx];
+
+	DBG_COUNTER_INC(tx_seen);
+	err = (iflib_txq_drain_bypass(txq->ift_br, m) != 1);
+
+	if (err) {
+		m_freem(m);
+		DBG_COUNTER_INC(tx_frees);
+	}
+
+	return (err ? EAGAIN : 0);
+}
+
 #ifdef ALTQ
 /*
  * The overall approach to integrating iflib with ALTQ is to continue to use
@@ -4021,11 +4183,15 @@
 {
 	struct ifaltq *ifq = &ifp->if_snd;
 	struct mbuf *m;
-	
+	if_ctx_t ctx = if_getsoftc(ifp);
+
 	IFQ_LOCK(ifq);
 	IFQ_DEQUEUE_NOLOCK(ifq, m);
 	while (m != NULL) {
-		iflib_if_transmit(ifp, m);
+		if (ctx->ifc_sysctl_bypassmp)
+			iflib_if_transmit_bypass(ifp, m);
+		else
+			iflib_if_transmit(ifp, m);
 		IFQ_DEQUEUE_NOLOCK(ifq, m);
 	}
 	IFQ_UNLOCK(ifq);
@@ -4034,14 +4200,19 @@
 static int
 iflib_altq_if_transmit(if_t ifp, struct mbuf *m)
 {
+	if_ctx_t ctx = if_getsoftc(ifp);
 	int err;
 
 	if (ALTQ_IS_ENABLED(&ifp->if_snd)) {
 		IFQ_ENQUEUE(&ifp->if_snd, m, err);
 		if (err == 0)
 			iflib_altq_if_start(ifp);
-	} else
-		err = iflib_if_transmit(ifp, m);
+	} else {
+		if (ctx->ifc_sysctl_bypassmp)
+			err = iflib_if_transmit_bypass(ifp, m);
+		else
+			err = iflib_if_transmit(ifp, m);
+	}
 
 	return (err);
 }
@@ -4057,9 +4228,12 @@
 	STATE_LOCK(ctx);
 	ctx->ifc_flags |= IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
-	for (i = 0; i < NTXQSETS(ctx); i++, txq++)
-		while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
-			iflib_txq_check_drain(txq, 0);
+
+	if (!ctx->ifc_sysctl_bypassmp) {
+		for (i = 0; i < NTXQSETS(ctx); i++, txq++)
+			while (!(ifmp_ring_is_idle(txq->ift_br) || ifmp_ring_is_stalled(txq->ift_br)))
+				iflib_txq_check_drain(txq, 0);
+	}
 	STATE_LOCK(ctx);
 	ctx->ifc_flags &= ~IFC_QFLUSH;
 	STATE_UNLOCK(ctx);
@@ -4571,6 +4745,8 @@
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
+	if (ctx->ifc_sysctl_bypassmp)
+		if_settransmitfn(ctx->ifc_ifp, iflib_if_transmit_bypass);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
@@ -4820,6 +4996,8 @@
 		goto fail_ctx_free;
 	}
 	iflib_add_device_sysctl_pre(ctx);
+	if (ctx->ifc_sysctl_bypassmp)
+		if_settransmitfn(ctx->ifc_ifp, iflib_if_transmit_bypass);
 
 	scctx = &ctx->ifc_softc_ctx;
 	ifp = ctx->ifc_ifp;
@@ -5326,6 +5504,7 @@
 	driver_t *driver = sctx->isc_driver;
 	device_t dev = ctx->ifc_dev;
 	if_t ifp;
+	char path[64];
 
 	_iflib_assert(sctx);
@@ -5349,12 +5528,19 @@
 	if_setdev(ifp, dev);
 	if_setinitfn(ifp, iflib_if_init);
 	if_setioctlfn(ifp, iflib_if_ioctl);
+	snprintf(path, sizeof(path), "dev.%s.%d.iflib.bypass_mpring",
+	    device_get_name(ctx->ifc_dev), device_get_unit(ctx->ifc_dev));
+	TUNABLE_INT_FETCH(path, &ctx->ifc_sysctl_bypassmp);
#ifdef ALTQ
 	if_setstartfn(ifp, iflib_altq_if_start);
 	if_settransmitfn(ifp, iflib_altq_if_transmit);
 	if_setsendqready(ifp);
 #else
-	if_settransmitfn(ifp, iflib_if_transmit);
+	if (ctx->ifc_sysctl_bypassmp) {
+		/* The sysctl node doesn't exist yet; TUNABLE_INT_FETCH() above supplies the value. */
+		if_settransmitfn(ifp, iflib_if_transmit_bypass);
+	} else
+		if_settransmitfn(ifp, iflib_if_transmit);
 #endif
 	if_setqflushfn(ifp, iflib_if_qflush);
 	if_setflags(ifp, IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST);
Index: sys/net/mp_ring.h
===================================================================
--- sys/net/mp_ring.h
+++ sys/net/mp_ring.h
@@ -36,7 +36,7 @@
 #endif
 
 struct ifmp_ring;
-typedef u_int (*mp_ring_drain_t)(struct ifmp_ring *, u_int, u_int);
+typedef u_int (*mp_ring_drain_t)(struct ifmp_ring *, void *volatile *, u_int, u_int);
 typedef u_int (*mp_ring_can_drain_t)(struct ifmp_ring *);
 typedef void (*mp_ring_serial_t)(struct ifmp_ring *);
Index: sys/net/mp_ring.c
===================================================================
--- sys/net/mp_ring.c
+++ sys/net/mp_ring.c
@@ -111,7 +111,7 @@
 
 	while (cidx != pidx) {
 		/* Items from cidx to pidx are available for consumption. */
-		n = r->drain(r, cidx, pidx);
+		n = r->drain(r, r->items, cidx, pidx);
 		if (n == 0) {
 			os.state = ns.state = r->state;
 			ns.cidx = cidx;
@@ -186,7 +186,7 @@
 
 	while (cidx != pidx) {
 		/* Items from cidx to pidx are available for consumption. */
-		n = r->drain(r, cidx, pidx);
+		n = r->drain(r, r->items, cidx, pidx);
 		if (n == 0) {
 			critical_enter();
 			os.state = r->state;
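
Notes:

With the bypass enabled, if_transmit points at iflib_if_transmit_bypass(), which
reclaims completed descriptors and encapsulates the mbuf directly under the txq
callout lock instead of enqueueing it on the per-queue mp_ring. The bypass is
controlled by a per-device loader tunable fetched with TUNABLE_INT_FETCH() before
the transmit function is installed, so it must be set before the driver attaches.
A hypothetical em(4) unit 0 is shown below; substitute the driver name and unit
for your device:

    # /boot/loader.conf
    dev.em.0.iflib.bypass_mpring=1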