diff --git a/sys/net/iflib.h b/sys/net/iflib.h --- a/sys/net/iflib.h +++ b/sys/net/iflib.h @@ -131,7 +131,9 @@ uint8_t ipi_mflags; /* packet mbuf flags */ uint32_t ipi_tcp_seq; /* tcp seqno */ - uint32_t __spare0__; + uint8_t ipi_ip_tos; /* IPv4 ToS or IPv6 traffic class */ + uint8_t __spare0__; + uint16_t __spare1__; } *if_pkt_info_t; typedef struct if_irq { @@ -188,6 +190,7 @@ void (*ift_rxd_flush) (void *, uint16_t qsidx, uint8_t flidx, qidx_t pidx); int (*ift_legacy_intr) (void *); qidx_t (*ift_txq_select) (void *, struct mbuf *); + qidx_t (*ift_txq_select_v2) (void *, struct mbuf *, if_pkt_info_t); } *if_txrx_t; typedef struct if_softc_ctx { @@ -416,6 +419,13 @@ * as ift_txq_select in struct if_txrx */ #define IFLIB_FEATURE_QUEUE_SELECT 1400050 +/* + * Driver can set its own TX queue selection function + * as ift_txq_select_v2 in struct if_txrx. This includes + * having iflib send L3+ extra header information to the + * function. + */ +#define IFLIB_FEATURE_QUEUE_SELECT_V2 1400073 /* * These enum values are used in iflib_needs_restart to indicate to iflib diff --git a/sys/net/iflib.c b/sys/net/iflib.c --- a/sys/net/iflib.c +++ b/sys/net/iflib.c @@ -210,6 +210,7 @@ #define isc_rxd_flush ifc_txrx.ift_rxd_flush #define isc_legacy_intr ifc_txrx.ift_legacy_intr #define isc_txq_select ifc_txrx.ift_txq_select +#define isc_txq_select_v2 ifc_txrx.ift_txq_select_v2 eventhandler_tag ifc_vlan_attach_event; eventhandler_tag ifc_vlan_detach_event; struct ether_addr ifc_mac; @@ -3195,32 +3196,24 @@ #define IS_TSO6(pi) ((pi)->ipi_csum_flags & CSUM_IP6_TSO) #define IS_TX_OFFLOAD6(pi) ((pi)->ipi_csum_flags & (CSUM_IP6_TCP | CSUM_IP6_TSO)) +/** + * Parses out ethernet header information in the given mbuf. + * Returns in pi: ipi_etype (EtherType) and ipi_ehdrlen (Ethernet header length) + * Bumps *pullups on m_pullup(); returns 0 with *mp updated, or ENOMEM. + * This will account for the VLAN header if present. + * + * XXX: This doesn't handle QinQ, which could prevent TX offloads for those + * types of packets. 
+ */ static int -iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) +iflib_parse_ether_header(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups) { - if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; struct ether_vlan_header *eh; struct mbuf *m; m = *mp; - if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && - M_WRITABLE(m) == 0) { - if ((m = m_dup(m, M_NOWAIT)) == NULL) { - return (ENOMEM); - } else { - m_freem(*mp); - DBG_COUNTER_INC(tx_frees); - *mp = m; - } - } - - /* - * Determine where frame payload starts. - * Jump over vlan headers if already present, - * helpful for QinQ too. - */ if (__predict_false(m->m_len < sizeof(*eh))) { - txq->ift_pullups++; + (*pullups)++; if (__predict_false((m = m_pullup(m, sizeof(*eh))) == NULL)) return (ENOMEM); } @@ -3232,6 +3225,143 @@ pi->ipi_etype = ntohs(eh->evl_encap_proto); pi->ipi_ehdrlen = ETHER_HDR_LEN; } + *mp = m; + + return (0); +} + +/** + * Parse up to the L3 header and fill pi with the IPv4/IPv6 header length, + * protocol, ToS (IPv6 traffic class) and IPI_TX_IPV4/IPI_TX_IPV6 flag. + * + * This is missing some checks and doesn't edit the packet content as it goes, + * unlike iflib_parse_header(), in order to keep the amount of code here minimal. 
+ */ +static int +iflib_parse_header_partial(if_pkt_info_t pi, struct mbuf **mp, uint64_t *pullups) +{ + struct mbuf *m; + int err; + + *pullups = 0; + m = *mp; + if (!M_WRITABLE(m)) { + if ((m = m_dup(m, M_NOWAIT)) == NULL) { + return (ENOMEM); + } else { + m_freem(*mp); + DBG_COUNTER_INC(tx_frees); + *mp = m; + } + } + + /* Fills out pi->ipi_etype */ + err = iflib_parse_ether_header(pi, mp, pullups); + if (err) + return (err); + m = *mp; + + switch (pi->ipi_etype) { +#ifdef INET + case ETHERTYPE_IP: + { + struct mbuf *n; + struct ip *ip = NULL; + int miniplen; + + miniplen = min(m->m_pkthdr.len, pi->ipi_ehdrlen + sizeof(*ip)); + if (__predict_false(m->m_len < miniplen)) { + /* + * Check for common case where the first mbuf only contains + * the Ethernet header + */ + if (m->m_len == pi->ipi_ehdrlen) { + n = m->m_next; + MPASS(n); + /* If next mbuf contains at least the minimal IP header, then stop */ + if (n->m_len >= sizeof(*ip)) { + ip = (struct ip *)n->m_data; + } else { + (*pullups)++; + if (__predict_false((m = m_pullup(m, miniplen)) == NULL)) + return (ENOMEM); + ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); + } + } else { + (*pullups)++; + if (__predict_false((m = m_pullup(m, miniplen)) == NULL)) + return (ENOMEM); + ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); + } + } else { + ip = (struct ip *)(m->m_data + pi->ipi_ehdrlen); + } + + /* Have the IPv4 header w/ no options here */ + pi->ipi_ip_hlen = ip->ip_hl << 2; + pi->ipi_ipproto = ip->ip_p; + pi->ipi_ip_tos = ip->ip_tos; + pi->ipi_flags |= IPI_TX_IPV4; + + break; + } +#endif +#ifdef INET6 + case ETHERTYPE_IPV6: + { + struct ip6_hdr *ip6; + + if (__predict_false(m->m_len < pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) { + (*pullups)++; + if (__predict_false((m = m_pullup(m, pi->ipi_ehdrlen + sizeof(struct ip6_hdr))) == NULL)) + return (ENOMEM); + } + ip6 = (struct ip6_hdr *)(m->m_data + pi->ipi_ehdrlen); + + /* Have the IPv6 fixed header here */ + pi->ipi_ip_hlen = sizeof(struct ip6_hdr); + 
pi->ipi_ipproto = ip6->ip6_nxt; + pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6); + pi->ipi_flags |= IPI_TX_IPV6; + + break; + } +#endif + default: + pi->ipi_csum_flags &= ~CSUM_OFFLOAD; + pi->ipi_ip_hlen = 0; + break; + } + *mp = m; + + return (0); + +} + +static int +iflib_parse_header(iflib_txq_t txq, if_pkt_info_t pi, struct mbuf **mp) +{ + if_shared_ctx_t sctx = txq->ift_ctx->ifc_sctx; + struct mbuf *m; + int err; + + m = *mp; + if ((sctx->isc_flags & IFLIB_NEED_SCRATCH) && + M_WRITABLE(m) == 0) { + if ((m = m_dup(m, M_NOWAIT)) == NULL) { + return (ENOMEM); + } else { + m_freem(*mp); + DBG_COUNTER_INC(tx_frees); + *mp = m; + } + } + + /* Fills out pi->ipi_etype */ + err = iflib_parse_ether_header(pi, mp, &txq->ift_pullups); + if (__predict_false(err)) + return (err); + m = *mp; switch (pi->ipi_etype) { #ifdef INET @@ -3276,6 +3406,7 @@ } pi->ipi_ip_hlen = ip->ip_hl << 2; pi->ipi_ipproto = ip->ip_p; + pi->ipi_ip_tos = ip->ip_tos; pi->ipi_flags |= IPI_TX_IPV4; /* TCP checksum offload may require TCP header length */ @@ -3329,6 +3460,7 @@ /* XXX-BZ this will go badly in case of ext hdrs. */ pi->ipi_ipproto = ip6->ip6_nxt; + pi->ipi_ip_tos = IPV6_TRAFFIC_CLASS(ip6); pi->ipi_flags |= IPI_TX_IPV6; /* TCP checksum offload may require TCP header length */ @@ -4146,11 +4278,10 @@ static int iflib_if_transmit(if_t ifp, struct mbuf *m) { - if_ctx_t ctx = if_getsoftc(ifp); - + if_ctx_t ctx = if_getsoftc(ifp); iflib_txq_t txq; int err, qidx; - int abdicate = ctx->ifc_sysctl_tx_abdicate; + int abdicate; if (__predict_false((ifp->if_drv_flags & IFF_DRV_RUNNING) == 0 || !LINK_ACTIVE(ctx))) { DBG_COUNTER_INC(tx_frees); @@ -4162,7 +4293,24 @@ /* ALTQ-enabled interfaces always use queue 0. 
*/ qidx = 0; /* Use driver-supplied queue selection method if it exists */ - if (ctx->isc_txq_select) + if (ctx->isc_txq_select_v2) { + struct if_pkt_info pi; + uint64_t early_pullups = 0; + pkt_info_zero(&pi); + + err = iflib_parse_header_partial(&pi, &m, &early_pullups); + if (__predict_false(err != 0)) { + /* Assign pullups for bad pkts to default queue */ + ctx->ifc_txqs[0].ift_pullups += early_pullups; + DBG_COUNTER_INC(encap_txd_encap_fail); + return (err); + } + /* Let driver make queueing decision */ + qidx = ctx->isc_txq_select_v2(ctx->ifc_softc, m, &pi); + ctx->ifc_txqs[qidx].ift_pullups += early_pullups; + } + /* Backwards compatibility w/ simpler queue select */ + else if (ctx->isc_txq_select) qidx = ctx->isc_txq_select(ctx->ifc_softc, m); /* If not, use iflib's standard method */ else if ((NTXQSETS(ctx) > 1) && M_HASHTYPE_GET(m) && !ALTQ_IS_ENABLED(&ifp->if_snd)) @@ -4207,6 +4355,8 @@ } #endif DBG_COUNTER_INC(tx_seen); + abdicate = ctx->ifc_sysctl_tx_abdicate; + err = ifmp_ring_enqueue(txq->ift_br, (void **)&m, 1, TX_BATCH_SIZE, abdicate); if (abdicate) diff --git a/sys/sys/param.h b/sys/sys/param.h --- a/sys/sys/param.h +++ b/sys/sys/param.h @@ -76,7 +76,7 @@ * cannot include sys/param.h and should only be updated here. */ #undef __FreeBSD_version -#define __FreeBSD_version 1400072 +#define __FreeBSD_version 1400073 /* * __FreeBSD_kernel__ indicates that this system uses the kernel of FreeBSD,