Index: share/man/man4/Makefile =================================================================== --- share/man/man4/Makefile +++ share/man/man4/Makefile @@ -202,6 +202,7 @@ icmp6.4 \ ida.4 \ if_ipsec.4 \ + iflib.4 \ ifmib.4 \ ig4.4 \ igmp.4 \ Index: share/man/man4/bnxt.4 =================================================================== --- share/man/man4/bnxt.4 +++ share/man/man4/bnxt.4 @@ -110,7 +110,7 @@ or through the use of .Xr kenv 1 . These are provided by the -.Xr iflib 9 +.Xr iflib 4 framework, and might be better documented there. .Bl -tag -width indent .It Va dev.bnxt.X.iflib.override_nrxds Index: share/man/man4/em.4 =================================================================== --- share/man/man4/em.4 +++ share/man/man4/em.4 @@ -298,6 +298,7 @@ .Sh SEE ALSO .Xr altq 4 , .Xr arp 4 , +.Xr iflib 4 , .Xr led 4 , .Xr netintro 4 , .Xr ng_ether 4 , Index: share/man/man4/iflib.4 =================================================================== --- /dev/null +++ share/man/man4/iflib.4 @@ -0,0 +1,180 @@ +.\" $FreeBSD$ +.Dd July 26, 2018 +.Dt IFLIB 4 +.Os +.Sh NAME +.Nm iflib +.Nd Network Interface Driver Framework +.Sh SYNOPSIS +.Cd "device pci" +.Cd "device iflib" +.Sh DESCRIPTION +.Nm +is a framework for network interface drivers for FreeBSD. +It is designed to remove a large amount of the boilerplate that is often +needed for modern network interface devices, allowing driver authors to +focus on the specific code needed for their hardware. This allows for +a shared set of +.Xr sysctl 8 +names, rather than each driver naming them individually. + +.Sh SYSCTL VARIABLES +These variables must be set before loading the driver, either via +.Xr loader.conf 5 +or through the use of +.Xr kenv 1 . +They are all prefixed by dev.X.Y.iflib. where X is the driver name, and Y is +the instance number. +.Bl -tag -width indent +.It Va override_nrxds +Override the number of RX descriptors for each queue. +The value is a comma separated list of positive integers. 
+Some drivers only use a single value, but others may use more. +These numbers must be powers of two, and zero means to use the default. +Individual drivers may have additional restrictions on allowable values. +Defaults to all zeros. +.It Va override_ntxds +Override the number of TX descriptors for each queue. +The value is a comma separated list of positive integers. +Some drivers only use a single value, but others may use more. +These numbers must be powers of two, and zero means to use the default. +Individual drivers may have additional restrictions on allowable values. +Defaults to all zeros. +.It Va override_qs_enable +When set, allows the number of transmit and receive queues to be different. +If not set, the lower of the number of TX or RX queues will be used for both. +.It Va override_nrxqs +Set the number of RX queues. +If zero, the number of RX queues is derived from the number of cores on the +socket connected to the controller. +Defaults to 0. +.It Va override_ntxqs +Set the number of TX queues. +If zero, the number of TX queues is derived from the number of cores on the +socket connected to the controller. +.It Va disable_msix +Disables MSI-X interrupts for the device. +.El +.Pp +These +.Xr sysctl 8 +variables can be changed at any time: +.Bl -tag -width indent +.It Va tx_abdicate +Controls how the transmit ring is serviced. +If set to zero, when a frame is submitted to the transmission ring, the same +task that is submitting it will service the ring unless there's already a +task servicing the TX ring. +This ensures that whenever there is a pending transmission, the transmit ring +is being serviced. This results in higher transmit throughput. +If set to a non-zero value, task returns immediately and the transmit ring is +serviced by a different task. This returns control to the caller faster and +under high receive load, may result in fewer dropped RX frames. +.It Va rx_budget +Sets the maximum number of frames to be received at a time. 
+Zero (the default) indicates the default (currently 16) should be used. +.El +.Pp +There are also some global sysctls which can change behaviour for all drivers, and +may be changed at any time. +.Bl -tag -width indent +.It Va net.iflib.min_tx_latency +If this is set to a non-zero value, iflib will avoid any attempt to combine +multiple transmits, and notify the hardware as quickly as possible of new +descriptors. This will lower the maximum throughput, but will also lower +transmit latency. +.It Va net.iflib.no_tx_batch +Some NICs allow processing completed transmit descriptors in batches. Doing so +usually increases the transmit throughput by reducing the number of transmit +interrupts. Setting this to a non-zero value will disable the use of this feature. +.El +.Pp +These +.Xr sysctl 8 +variables are read-only: +.Bl -tag -width indent +.It Va driver_version +A string indicating the internal version of the driver. +.El +.Pp +There are a number of queue state +.Xr sysctl 8 +variables as well: +.Bl -tag -width indent +.It Va txqZ +The following are repeated for each transmit queue, where Z is the transmit queue instance number +.Bl -tag -width indent +.It Va r_abdications +Number of consumer abdications in the MP ring for this queue. An abdication occurs +on every ring submission when tx_abdicate is true. +.It Va r_restarts +Number of consumer restarts in the MP ring for this queue. A restart occurs when an attempt +to drain a non-empty ring fails, and the ring is already in the STALLED state. +.It Va r_stalls +Number of consumer stalls in the MP ring for this queue. A stall occurs when an attempt to +drain a non-empty ring fails. +.It Va r_starts +Number of normal consumer starts in the MP ring for this queue. A start occurs when the MP ring +transitions from IDLE to BUSY. +.It Va r_drops +Number of drops in the MP ring for this queue. A drop occurs when there is an attempt to add an +entry to an MP ring with no available space. 
+.It Va r_enqueues +Number of entries which have been enqueued to the MP ring for this queue. +.It Va ring_state +MP (soft) ring state. This provides a snapshot of the current MP ring state, including the producer +head and tail indexes, the consumer index, and the state. The state is one of "IDLE", "BUSY", +"STALLED", or "ABDICATED". +.It Va txq_cleaned +The number of transmit descriptors which have been reclaimed. +Total cleaned. +.It Va txq_processed +The number of transmit descriptors which have been processed, but may not yet have been reclaimed. +.It Va txq_in_use +Descriptors which have been added to the transmit queue, but have not yet been cleaned. +This value will include both untransmitted descriptors as well as descriptors which have been +processed. +.It Va txq_cidx_processed +The transmit queue consumer index of the next descriptor to process. +.It Va txq_cidx +The transmit queue consumer index of the oldest descriptor to reclaim. +.It Va txq_pidx +The transmit queue producer index where the next descriptor to transmit will be inserted. +.It Va no_tx_dma_setup +Number of times DMA mapping a transmit mbuf failed for reasons other than EFBIG. +.It Va txd_encap_efbig +Number of times DMA mapping a transmit mbuf failed due to requiring too many segments. +.It Va tx_map_failed +Number of times DMA mapping a transmit mbuf failed for any reason +(sum of no_tx_dma_setup and txd_encap_efbig) +.It Va no_desc_avail +Number of times a descriptor couldn't be added to the transmit ring because the transmit ring was full. +.It Va mbuf_defrag_failed +Number of times both m_collapse() and m_defrag() failed after an EFBIG error result from DMA mapping +a transmit mbuf. +.It Va m_pullups +Number of times m_pullup was called attempting to parse a header. +.It Va mbuf_defrag +Number of times m_defrag was called. 
+.El +.Pp +.It Va rxqZ +The following are repeated for each receive queue, where Z is the receive queue instance number +.Bl -tag -width indent +.It Va rxq_fl0.credits +Credits currently available in the receive ring. +.It Va rxq_fl0.cidx +Current receive ring consumer index. +.It Va rxq_fl0.pidx +Current receive ring producer index. +.El +.Pp +.El +.El +Additional OIDs useful for driver and iflib development are exposed when the +INVARIANTS and/or WITNESS options are enabled in the kernel. +.Pp +.Sh SEE ALSO +.Xr iflib 9 +.Sh HISTORY +This framework was introduced in 11.0. Index: share/man/man9/iflib.9 =================================================================== --- share/man/man9/iflib.9 +++ share/man/man9/iflib.9 @@ -32,6 +32,7 @@ .Nm based drivers. .Sh SEE ALSO +.Xr iflib 4 , .Xr iflibdd 9 , .Xr iflibdi 9 , .Xr iflibtxrx 9 , Index: sys/net/iflib.c =================================================================== --- sys/net/iflib.c +++ sys/net/iflib.c @@ -641,7 +641,6 @@ static int iflib_txq_drain_flushing; static int iflib_txq_drain_oactive; static int iflib_txq_drain_notready; -static int iflib_txq_drain_encapfail; SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_flushing, CTLFLAG_RD, &iflib_txq_drain_flushing, 0, "# drain flushes"); @@ -649,8 +648,6 @@ &iflib_txq_drain_oactive, 0, "# drain oactives"); SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_notready, CTLFLAG_RD, &iflib_txq_drain_notready, 0, "# drain notready"); -SYSCTL_INT(_net_iflib, OID_AUTO, txq_drain_encapfail, CTLFLAG_RD, - &iflib_txq_drain_encapfail, 0, "# drain encap fails"); static int iflib_encap_load_mbuf_fail; @@ -670,21 +667,14 @@ static int iflib_task_fn_rxs; static int iflib_rx_intr_enables; static int iflib_fast_intrs; -static int iflib_intr_link; -static int iflib_intr_msix; static int iflib_rx_unavail; static int iflib_rx_ctx_inactive; -static int iflib_rx_zero_len; static int iflib_rx_if_input; static int iflib_rx_mbuf_null; static int iflib_rxd_flush; static int iflib_verbose_debug; 
-SYSCTL_INT(_net_iflib, OID_AUTO, intr_link, CTLFLAG_RD, - &iflib_intr_link, 0, "# intr link calls"); -SYSCTL_INT(_net_iflib, OID_AUTO, intr_msix, CTLFLAG_RD, - &iflib_intr_msix, 0, "# intr msix calls"); SYSCTL_INT(_net_iflib, OID_AUTO, task_fn_rx, CTLFLAG_RD, &iflib_task_fn_rxs, 0, "# task_fn_rx calls"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_intr_enables, CTLFLAG_RD, @@ -695,8 +685,6 @@ &iflib_rx_unavail, 0, "# times rxeof called with no available data"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_ctx_inactive, CTLFLAG_RD, &iflib_rx_ctx_inactive, 0, "# times rxeof called with inactive context"); -SYSCTL_INT(_net_iflib, OID_AUTO, rx_zero_len, CTLFLAG_RD, - &iflib_rx_zero_len, 0, "# times rxeof saw zero len mbuf"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_if_input, CTLFLAG_RD, &iflib_rx_if_input, 0, "# times rxeof called if_input"); SYSCTL_INT(_net_iflib, OID_AUTO, rx_mbuf_null, CTLFLAG_RD, @@ -713,12 +701,12 @@ iflib_tx_seen = iflib_tx_sent = iflib_tx_encap = iflib_rx_allocs = iflib_fl_refills = iflib_fl_refills_large = iflib_tx_frees = iflib_txq_drain_flushing = iflib_txq_drain_oactive = - iflib_txq_drain_notready = iflib_txq_drain_encapfail = + iflib_txq_drain_notready = iflib_encap_load_mbuf_fail = iflib_encap_pad_mbuf_fail = iflib_encap_txq_avail_fail = iflib_encap_txd_encap_fail = iflib_task_fn_rxs = iflib_rx_intr_enables = iflib_fast_intrs = - iflib_intr_link = iflib_intr_msix = iflib_rx_unavail = - iflib_rx_ctx_inactive = iflib_rx_zero_len = iflib_rx_if_input = + iflib_rx_unavail = + iflib_rx_ctx_inactive = iflib_rx_if_input = iflib_rx_mbuf_null = iflib_rxd_flush = 0; } @@ -839,6 +827,9 @@ if_ctx_t ctx = rxq->ifr_ctx; iflib_fl_t fl = &rxq->ifr_fl[0]; uint32_t refill_pidx, nic_i; +#if IFLIB_DEBUG_COUNTERS + int rf_count = 0; +#endif if (nm_i == head && __predict_true(!init)) return 0; @@ -851,7 +842,12 @@ */ head = nm_prev(head, lim); nic_i = UINT_MAX; + DBG_COUNTER_INC(fl_refills); while (nm_i != head) { +#if IFLIB_DEBUG_COUNTERS + if (++rf_count == 9) + 
DBG_COUNTER_INC(fl_refills_large); +#endif for (int tmp_pidx = 0; tmp_pidx < IFLIB_MAX_RX_REFRESH && nm_i != head; tmp_pidx++) { struct netmap_slot *slot = &ring->slot[nm_i]; void *addr = PNMB(na, slot, &fl->ifl_bus_addrs[tmp_pidx]); @@ -898,8 +894,10 @@ if (map) bus_dmamap_sync(fl->ifl_ifdi->idi_tag, fl->ifl_ifdi->idi_map, BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE); - if (__predict_true(nic_i != UINT_MAX)) + if (__predict_true(nic_i != UINT_MAX)) { ctx->isc_rxd_flush(ctx->ifc_softc, rxq->ifr_id, fl->ifl_id, nic_i); + DBG_COUNTER_INC(rxd_flush); + } return (0); } @@ -998,6 +996,7 @@ /* Fill the slot in the NIC ring. */ ctx->isc_txd_encap(ctx->ifc_softc, &pi); + DBG_COUNTER_INC(tx_encap); /* prefetch for next round */ __builtin_prefetch(&ring->slot[nm_i + 1]); @@ -1524,8 +1523,10 @@ cidx = rxq->ifr_fl[0].ifl_cidx; if (iflib_rxd_avail(ctx, rxq, cidx, 1)) GROUPTASK_ENQUEUE(gtask); - else + else { IFDI_RX_QUEUE_INTR_ENABLE(ctx, rxq->ifr_id); + DBG_COUNTER_INC(rx_intr_enables); + } return (FILTER_HANDLED); } @@ -2887,6 +2888,7 @@ return (ENOMEM); } else { m_freem(*mp); + DBG_COUNTER_INC(tx_frees); *mp = m; } } @@ -2994,6 +2996,7 @@ pi->ipi_ip_hlen = sizeof(struct ip6_hdr); if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { + txq->ift_pullups++; if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) return (ENOMEM); } @@ -3041,34 +3044,6 @@ return (0); } -static __noinline struct mbuf * -collapse_pkthdr(struct mbuf *m0) -{ - struct mbuf *m, *m_next, *tmp; - - m = m0; - m_next = m->m_next; - while (m_next != NULL && m_next->m_len == 0) { - m = m_next; - m->m_next = NULL; - m_free(m); - m_next = m_next->m_next; - } - m = m0; - m->m_next = m_next; - if (m_next == NULL) - return (m); - if ((m_next->m_flags & M_EXT) == 0) { - m = m_defrag(m, M_NOWAIT); - } else { - tmp = m_next->m_next; - memcpy(m_next, m, MPKTHSIZE); - m = m_next; - m->m_next = tmp; - } - return (m); -} - /* * If dodgy hardware rejects the scatter 
gather chain we've handed it * we'll need to remove the mbuf chain from ifsg_m[] before we can add the @@ -3117,8 +3092,7 @@ /* * Please don't ever do this */ - if (__predict_false(m->m_len == 0)) - *m0 = collapse_pkthdr(m); + MPASS(__predict_true(m->m_len > 0)); ctx = txq->ift_ctx; sctx = ctx->ifc_sctx; @@ -3259,6 +3233,7 @@ m_freem(*m_head); device_printf(dev, "cannot pad short frame, m_dup() failed"); DBG_COUNTER_INC(encap_pad_mbuf_fail); + DBG_COUNTER_INC(tx_frees); return ENOMEM; } m_freem(*m_head); @@ -3274,6 +3249,7 @@ m_freem(*m_head); device_printf(dev, "cannot pad short frame\n"); DBG_COUNTER_INC(encap_pad_mbuf_fail); + DBG_COUNTER_INC(tx_frees); return (ENOBUFS); } @@ -3337,8 +3313,10 @@ if ((sctx->isc_flags & IFLIB_NEED_ETHER_PAD) && __predict_false(m_head->m_pkthdr.len < scctx->isc_min_frame_size)) { err = iflib_ether_pad(ctx->ifc_dev, m_headp, scctx->isc_min_frame_size); - if (err) + if (err) { + DBG_COUNTER_INC(encap_txd_encap_fail); return err; + } } m_head = *m_headp; @@ -3352,8 +3330,10 @@ /* deliberate bitwise OR to make one condition */ if (__predict_true((pi.ipi_csum_flags | pi.ipi_vtag))) { - if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) + if (__predict_false((err = iflib_parse_header(txq, &pi, m_headp)) != 0)) { + DBG_COUNTER_INC(encap_txd_encap_fail); return (err); + } m_head = *m_headp; } @@ -3370,12 +3350,13 @@ if (m_head == NULL) remap++; } - if (remap == 1) + if (remap == 1) { + txq->ift_mbuf_defrag++; m_head = m_defrag(*m_headp, M_NOWAIT); + } remap++; if (__predict_false(m_head == NULL)) goto defrag_failed; - txq->ift_mbuf_defrag++; *m_headp = m_head; goto retry; break; @@ -3391,6 +3372,7 @@ } txq->ift_map_failed++; DBG_COUNTER_INC(encap_load_mbuf_fail); + DBG_COUNTER_INC(encap_txd_encap_fail); return (err); } @@ -3404,6 +3386,7 @@ if (map != NULL) bus_dmamap_unload(desc_tag, map); DBG_COUNTER_INC(encap_txq_avail_fail); + DBG_COUNTER_INC(encap_txd_encap_fail); if ((txq->ift_task.gt_task.ta_flags & 
TASK_ENQUEUED) == 0) GROUPTASK_ENQUEUE(&txq->ift_task); return (ENOBUFS); @@ -3466,9 +3449,12 @@ goto defrag; } } - DBG_COUNTER_INC(encap_txd_encap_fail); goto defrag_failed; } + /* + * err can't possibly be non-zero here, so we don't need to test it + * to see if we need to DBG_COUNTER_INC(encap_txd_encap_fail). + */ return (err); defrag_failed: @@ -3477,6 +3463,7 @@ m_freem(*m_headp); DBG_COUNTER_INC(tx_frees); *m_headp = NULL; + DBG_COUNTER_INC(encap_txd_encap_fail); return (ENOMEM); } @@ -3675,12 +3662,10 @@ in_use_prev = txq->ift_in_use; err = iflib_encap(txq, mp); if (__predict_false(err)) { - DBG_COUNTER_INC(txq_drain_encapfail); /* no room - bail out */ if (err == ENOBUFS) break; consumed++; - DBG_COUNTER_INC(txq_drain_encapfail); /* we can't send this packet - skip it */ continue; } @@ -3740,6 +3725,7 @@ if (__predict_false(*mp == (struct mbuf *)txq)) continue; m_freem(*mp); + DBG_COUNTER_INC(tx_frees); } MPASS(ifmp_ring_is_stalled(r) == 0); return (avail); @@ -3986,6 +3972,7 @@ next = m->m_nextpkt; m->m_nextpkt = NULL; m_freem(m); + DBG_COUNTER_INC(tx_frees); m = next; } return (ENOBUFS); @@ -4028,6 +4015,7 @@ #endif ifmp_ring_check_drainage(txq->ift_br, TX_BATCH_SIZE); m_freem(m); + DBG_COUNTER_INC(tx_frees); } return (err);