Index: sbin/ifconfig/ifconfig.8 =================================================================== --- sbin/ifconfig/ifconfig.8 +++ sbin/ifconfig/ifconfig.8 @@ -28,7 +28,7 @@ .\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94 .\" $FreeBSD$ .\" -.Dd June 4, 2020 +.Dd XXX .Dt IFCONFIG 8 .Os .Sh NAME @@ -597,6 +597,21 @@ reception of extended frames, tag processing in hardware, frame filtering in hardware, or TSO on VLAN, respectively. +.It Cm vxlanhwcsum , vxlanhwtso +If the driver offers user-configurable VXLAN support, enable checksum +offloading (receive and transmit) or TSO on VXLAN, respectively. +Note that this must be issued on a physical interface associated with +.Xr vxlan 4 , +not on a +.Xr vxlan 4 +interface itself. +The physical interface is either the interface specified as the vxlandev +or the interface hosting the vxlanlocal address. +The driver will offload as much checksum work and TSO as it can reliably +support; the exact level of offloading may vary between drivers. +.It Fl vxlanhwcsum , vxlanhwtso +If the driver offers user-configurable VXLAN support, disable checksum +offloading (receive and transmit) or TSO on VXLAN, respectively. .It Cm vnet Ar jail Move the interface to the .Xr jail 8 , Index: sbin/ifconfig/ifconfig.c =================================================================== --- sbin/ifconfig/ifconfig.c +++ sbin/ifconfig/ifconfig.c @@ -1343,7 +1343,8 @@ "\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \ "\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \ "\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \ -"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6" +"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6" \ +"\36VXLAN_HWCSUM\37VXLAN_HWTSO" /* * Print the status of the interface. 
If an address family was Index: sbin/ifconfig/ifvxlan.c =================================================================== --- sbin/ifconfig/ifvxlan.c +++ sbin/ifconfig/ifvxlan.c @@ -620,6 +620,11 @@ DEF_CMD("vxlanflush", 0, setvxlan_flush), DEF_CMD("vxlanflushall", 1, setvxlan_flush), + + DEF_CMD("vxlanhwcsum", IFCAP_VXLAN_HWCSUM, setifcap), + DEF_CMD("-vxlanhwcsum", -IFCAP_VXLAN_HWCSUM, setifcap), + DEF_CMD("vxlanhwtso", IFCAP_VXLAN_HWTSO, setifcap), + DEF_CMD("-vxlanhwtso", -IFCAP_VXLAN_HWTSO, setifcap), }; static struct afswtch af_vxlan = { Index: share/man/man4/cxgbe.4 =================================================================== --- share/man/man4/cxgbe.4 +++ share/man/man4/cxgbe.4 @@ -31,7 +31,7 @@ .\" .\" $FreeBSD$ .\" -.Dd Dec 10, 2019 +.Dd XXX .Dt CXGBE 4 .Os .Sh NAME @@ -61,8 +61,8 @@ the Chelsio Terminator 4, Terminator 5, and Terminator 6 ASICs (T4, T5, and T6). The driver supports Jumbo Frames, Transmit/Receive checksum offload, TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN -tag insertion/extraction, VLAN checksum offload, VLAN TSO, and -Receive Side Steering (RSS). +tag insertion/extraction, VLAN checksum offload, VLAN TSO, VXLAN checksum +offload, VXLAN TSO, and Receive Side Steering (RSS). For further hardware information and questions related to hardware requirements, see .Pa http://www.chelsio.com/ . Index: share/man/man4/vxlan.4 =================================================================== --- share/man/man4/vxlan.4 +++ share/man/man4/vxlan.4 @@ -24,7 +24,7 @@ .\" .\" $FreeBSD$ .\" -.Dd December 31, 2017 +.Dd XXX .Dt VXLAN 4 .Os .Sh NAME @@ -182,6 +182,37 @@ .Nm interface to allow the encapsulated frame to fit in the current MTU of the physical network. +.Sh HARDWARE +The +.Nm +driver supports hardware checksum offload (receive and transmit) and TSO on the +encapsulated traffic over physical interfaces that support these features. 
+The +.Nm +interface examines the +.Cm vxlandev +interface, if one is specified, or the interface hosting the +.Cm vxlanlocal +address, and configures its capabilities based on the hardware offload +capabilities of that physical interface. +If multiple physical interfaces will transmit or receive traffic for the +.Nm +interface, then they all must have the same hardware capabilities. +The transmit routine of a +.Nm +interface may fail with ENXIO if an outbound physical interface does not support +an offload that the +.Nm +interface is requesting. +This can happen if there are multiple physical interfaces involved, with +different hardware capabilities, or an interface capability was disabled after +the +.Nm +interface had already started. +.Pp +At present, these devices are capable of generating checksums and performing TSO +on the inner frames in hardware: +.Xr cxgbe 4 . .Sh EXAMPLES Create a .Nm @@ -244,3 +275,7 @@ .Nm driver was written by .An Bryan Venteicher Aq bryanv@freebsd.org . +Support for stateless hardware offloads was added by +.An Navdeep Parhar Aq np@freebsd.org +in +.Fx 13.0 . Index: share/man/man9/EVENTHANDLER.9 =================================================================== --- share/man/man9/EVENTHANDLER.9 +++ share/man/man9/EVENTHANDLER.9 @@ -23,7 +23,7 @@ .\" SUCH DAMAGE. .\" $FreeBSD$ .\" -.Dd October 21, 2018 +.Dd October XXX .Dt EVENTHANDLER 9 .Os .Sh NAME @@ -389,6 +389,10 @@ Callback invoked when a vlan is destroyed. .It Vt vm_lowmem Callbacks invoked when virtual memory is low. +.It Vt vxlan_start +Callback invoked when a vxlan interface starts. +.It Vt vxlan_stop +Callback invoked when a vxlan interface stops. .It Vt watchdog_list Callbacks invoked when the system watchdog timer is reinitialized. 
.El Index: sys/dev/cxgbe/adapter.h =================================================================== --- sys/dev/cxgbe/adapter.h +++ sys/dev/cxgbe/adapter.h @@ -119,6 +119,7 @@ TX_SGL_SEGS = 39, TX_SGL_SEGS_TSO = 38, TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */ + TX_SGL_SEGS_VXLAN_TSO = 37, TX_WR_FLITS = SGE_MAX_WR_LEN / 8 }; @@ -286,6 +287,7 @@ int nvi; int up_vis; int uld_vis; + bool vxlan_tcam_entry; struct tx_sched_params *sched_params; @@ -593,6 +595,8 @@ uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */ uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */ uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */ + uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */ + uint64_t vxlan_txcsum; uint64_t kern_tls_records; uint64_t kern_tls_short; @@ -625,6 +629,7 @@ uint64_t rxcsum; /* # of times hardware assisted with checksum */ uint64_t vlan_extraction;/* # of times VLAN tag was extracted */ + uint64_t vxlan_rxcsum; /* stats for not-that-common events */ @@ -847,6 +852,11 @@ struct sge sge; int lro_timeout; int sc_do_rxcopy; + + int vxlan_port; + u_int vxlan_refcount; + int rawf_base; + int nrawf; struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */ struct task async_event_task; Index: sys/dev/cxgbe/common/common.h =================================================================== --- sys/dev/cxgbe/common/common.h +++ sys/dev/cxgbe/common/common.h @@ -247,7 +247,7 @@ uint32_t vlan_pri_map; uint32_t ingress_config; uint64_t hash_filter_mask; - __be16 err_vec_mask; + bool rx_pkt_encap; int8_t fcoe_shift; int8_t port_shift; Index: sys/dev/cxgbe/common/t4_hw.c =================================================================== --- sys/dev/cxgbe/common/t4_hw.c +++ sys/dev/cxgbe/common/t4_hw.c @@ -9632,14 +9632,11 @@ * rx_pkt.err_vec. T6+ can use a compressed error vector to make room * for information about outer encapsulation (GENEVE/VXLAN/NVGRE). 
*/ - tpp->err_vec_mask = htobe16(0xffff); if (chip_id(adap) > CHELSIO_T5) { v = t4_read_reg(adap, A_TP_OUT_CONFIG); - if (v & F_CRXPKTENC) { - tpp->err_vec_mask = - htobe16(V_T6_COMPR_RXERR_VEC(M_T6_COMPR_RXERR_VEC)); - } - } + tpp->rx_pkt_encap = v & F_CRXPKTENC; + } else + tpp->rx_pkt_encap = false; return 0; } Index: sys/dev/cxgbe/firmware/t6fw_cfg.txt =================================================================== --- sys/dev/cxgbe/firmware/t6fw_cfg.txt +++ sys/dev/cxgbe/firmware/t6fw_cfg.txt @@ -146,7 +146,8 @@ nethctrl = 1024 neq = 2048 nqpcq = 8192 - nexactf = 456 + nexactf = 454 + nrawf = 2 cmask = all pmask = all ncrypto_lookaside = 16 @@ -272,7 +273,7 @@ [fini] version = 0x1 - checksum = 0x4528a6ac + checksum = 0x82be65fd # # $FreeBSD$ # Index: sys/dev/cxgbe/t4_main.c =================================================================== --- sys/dev/cxgbe/t4_main.c +++ sys/dev/cxgbe/t4_main.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -1068,6 +1069,8 @@ TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc); #endif + refcount_init(&sc->vxlan_refcount, 0); + rc = t4_map_bars_0_and_4(sc); if (rc != 0) goto done; /* error message displayed already */ @@ -1715,6 +1718,7 @@ struct ifnet *ifp; struct sbuf *sb; struct pfil_head_args pa; + struct adapter *sc = vi->adapter; vi->xact_addr_filt = -1; callout_init(&vi->tick, 1); @@ -1748,28 +1752,36 @@ ifp->if_capabilities = T4_CAP; ifp->if_capenable = T4_CAP_ENABLE; + ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | + CSUM_UDP_IPV6 | CSUM_TCP_IPV6; + if (chip_id(sc) >= CHELSIO_T6) { + ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; + ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO; + ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | + CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | + CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN; + } + #ifdef TCP_OFFLOAD - if (vi->nofldrxq != 0 && (vi->adapter->flags 
& KERN_TLS_OK) == 0) + if (vi->nofldrxq != 0 && (sc->flags & KERN_TLS_OK) == 0) ifp->if_capabilities |= IFCAP_TOE; #endif #ifdef RATELIMIT - if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) { + if (is_ethoffload(sc) && vi->nofldtxq != 0) { ifp->if_capabilities |= IFCAP_TXRTLMT; ifp->if_capenable |= IFCAP_TXRTLMT; } #endif - ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO | - CSUM_UDP_IPV6 | CSUM_TCP_IPV6; ifp->if_hw_tsomax = IP_MAXPACKET; ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO; #ifdef RATELIMIT - if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) + if (is_ethoffload(sc) && vi->nofldtxq != 0) ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO; #endif ifp->if_hw_tsomaxsegsize = 65536; #ifdef KERN_TLS - if (vi->adapter->flags & KERN_TLS_OK) { + if (sc->flags & KERN_TLS_OK) { ifp->if_capabilities |= IFCAP_TXTLS; ifp->if_capenable |= IFCAP_TXTLS; } @@ -2099,6 +2111,17 @@ if (mask & IFCAP_TXTLS) ifp->if_capenable ^= (mask & IFCAP_TXTLS); #endif + if (mask & IFCAP_VXLAN_HWCSUM) { + ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM; + ifp->if_hwassist ^= CSUM_INNER_IP6_UDP | + CSUM_INNER_IP6_TCP | CSUM_INNER_IP | + CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP; + } + if (mask & IFCAP_VXLAN_HWTSO) { + ifp->if_capenable ^= IFCAP_VXLAN_HWTSO; + ifp->if_hwassist ^= CSUM_INNER_IP6_TSO | + CSUM_INNER_IP_TSO; + } #ifdef VLAN_CAPABILITIES VLAN_CAPABILITIES(ifp); @@ -4410,6 +4433,19 @@ MPASS(sc->tids.hpftid_base == 0); MPASS(sc->tids.tid_base == sc->tids.nhpftids); } + + param[0] = FW_PARAM_PFVF(RAWF_START); + param[1] = FW_PARAM_PFVF(RAWF_END); + rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val); + if (rc != 0) { + device_printf(sc->dev, + "failed to query rawf parameters: %d.\n", rc); + return (rc); + } + if ((int)val[1] > (int)val[0]) { + sc->rawf_base = val[0]; + sc->nrawf = val[1] - val[0] + 1; + } } /* @@ -5141,6 +5177,7 @@ struct port_info *pi = vi->pi; struct adapter *sc = pi->adapter; int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1; + uint8_t 
match_all_mac[ETHER_ADDR_LEN] = {0}; ASSERT_SYNCHRONIZED_OP(sc); KASSERT(flags, ("%s: not told what to update.", __func__)); @@ -5214,7 +5251,7 @@ rc = -rc; for (j = 0; j < ctx.i; j++) { if_printf(ifp, - "failed to add mc address" + "failed to add mcast address" " %02x:%02x:%02x:" "%02x:%02x:%02x rc=%d\n", ctx.mcaddr[j][0], ctx.mcaddr[j][1], @@ -5224,14 +5261,36 @@ } return (rc); } + ctx.del = 0; } else NET_EPOCH_EXIT(et); rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0); if (rc != 0) - if_printf(ifp, "failed to set mc address hash: %d", rc); + if_printf(ifp, "failed to set mcast address hash: %d\n", + rc); + if (ctx.del == 0) { + /* We clobbered the VXLAN entry if there was one. */ + pi->vxlan_tcam_entry = false; + } } + if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 && + pi->vxlan_tcam_entry == false) { + rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac, + match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id, + true); + if (rc < 0) { + rc = -rc; + if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n", + rc); + } else { + MPASS(rc == sc->rawf_base + pi->port_id); + rc = 0; + pi->vxlan_tcam_entry = true; + } + } + return (rc); } @@ -10374,6 +10433,7 @@ #endif rxq->rxcsum = 0; rxq->vlan_extraction = 0; + rxq->vxlan_rxcsum = 0; rxq->fl.cl_allocated = 0; rxq->fl.cl_recycled = 0; @@ -10392,6 +10452,8 @@ txq->txpkts0_pkts = 0; txq->txpkts1_pkts = 0; txq->raw_wrs = 0; + txq->vxlan_tso_wrs = 0; + txq->vxlan_txcsum = 0; txq->kern_tls_records = 0; txq->kern_tls_short = 0; txq->kern_tls_partial = 0; @@ -11202,6 +11264,116 @@ } #endif +static eventhandler_tag vxlan_start_evtag; +static eventhandler_tag vxlan_stop_evtag; + +struct vxlan_evargs { + struct ifnet *ifp; + uint16_t port; +}; + +static void +t4_vxlan_start(struct adapter *sc, void *arg) +{ + struct vxlan_evargs *v = arg; + struct port_info *pi; + uint8_t match_all_mac[ETHER_ADDR_LEN] = {0}; + int i, rc; + + if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) + return; + if 
(begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0) + return; + + if (sc->vxlan_refcount == 0) { + sc->vxlan_port = v->port; + sc->vxlan_refcount = 1; + t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE, + V_VXLAN(v->port) | F_VXLAN_EN); + for_each_port(sc, i) { + pi = sc->port[i]; + if (pi->vxlan_tcam_entry == true) + continue; + rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid, + match_all_mac, match_all_mac, + sc->rawf_base + pi->port_id, 1, pi->port_id, true); + if (rc < 0) { + rc = -rc; + log(LOG_ERR, + "%s: failed to add VXLAN TCAM entry: %d.\n", + device_get_name(pi->vi[0].dev), rc); + } else { + MPASS(rc == sc->rawf_base + pi->port_id); + rc = 0; + pi->vxlan_tcam_entry = true; + } + } + } else if (sc->vxlan_port == v->port) { + sc->vxlan_refcount++; + } else { + log(LOG_ERR, "%s: VXLAN already configured on port %d; " + "ignoring attempt to configure it on port %d\n", + device_get_nameunit(sc->dev), sc->vxlan_port, v->port); + } + end_synchronized_op(sc, 0); +} + +static void +t4_vxlan_stop(struct adapter *sc, void *arg) +{ + struct vxlan_evargs *v = arg; + + if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5) + return; + if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0) + return; + + /* + * VXLANs may have been configured before the driver was loaded so we + * may see more stops than starts. This is not handled cleanly but at + * least we keep the refcount sane. 
+ */ + if (sc->vxlan_port != v->port) + goto done; + if (sc->vxlan_refcount == 0) { + log(LOG_ERR, + "%s: VXLAN operation on port %d was stopped earlier; " + "ignoring attempt to stop it again.\n", + device_get_nameunit(sc->dev), sc->vxlan_port); + } else if (--sc->vxlan_refcount == 0) { + t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0); + } +done: + end_synchronized_op(sc, 0); +} + +static void +t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp, + sa_family_t family, u_int port) +{ + struct vxlan_evargs v; + + MPASS(family == AF_INET || family == AF_INET6); + v.ifp = ifp; + v.port = port; + + t4_iterate(t4_vxlan_start, &v); +} + +static void +t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family, + u_int port) +{ + struct vxlan_evargs v; + + MPASS(family == AF_INET || family == AF_INET6); + v.ifp = ifp; + v.port = port; + + t4_iterate(t4_vxlan_stop, &v); +} + + static struct sx mlu; /* mod load unload */ SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload"); @@ -11245,6 +11417,14 @@ #endif t4_tracer_modload(); tweak_tunables(); + vxlan_start_evtag = + EVENTHANDLER_REGISTER(vxlan_start, + t4_vxlan_start_handler, NULL, + EVENTHANDLER_PRI_ANY); + vxlan_stop_evtag = + EVENTHANDLER_REGISTER(vxlan_stop, + t4_vxlan_stop_handler, NULL, + EVENTHANDLER_PRI_ANY); } sx_xunlock(&mlu); break; @@ -11281,6 +11461,10 @@ sx_sunlock(&t4_list_lock); if (t4_sge_extfree_refs() == 0) { + EVENTHANDLER_DEREGISTER(vxlan_start, + vxlan_start_evtag); + EVENTHANDLER_DEREGISTER(vxlan_stop, + vxlan_stop_evtag); t4_tracer_modunload(); #ifdef KERN_TLS t6_ktls_modunload(); Index: sys/dev/cxgbe/t4_sge.c =================================================================== --- sys/dev/cxgbe/t4_sge.c +++ sys/dev/cxgbe/t4_sge.c @@ -55,6 +55,7 @@ #include #include #include +#include #include #include #include @@ -266,8 +267,9 @@ static void add_fl_to_sfl(struct adapter *, struct sge_fl *); static inline void get_pkt_gl(struct mbuf *, struct sglist *); -static 
inline u_int txpkt_len16(u_int, u_int); -static inline u_int txpkt_vm_len16(u_int, u_int); +static inline u_int txpkt_len16(u_int, const u_int); +static inline u_int txpkt_vm_len16(u_int, const u_int); +static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *); static inline u_int txpkts0_len16(u_int); static inline u_int txpkts1_len16(void); static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int); @@ -1911,6 +1913,7 @@ #if defined(INET) || defined(INET6) struct lro_ctrl *lro = &rxq->lro; #endif + uint16_t err_vec, tnl_type, tnlhdr_len; static const int sw_hashtype[4][2] = { {M_HASHTYPE_NONE, M_HASHTYPE_NONE}, {M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6}, @@ -1957,23 +1960,54 @@ m0->m_pkthdr.flowid = be32toh(d->rss.hash_val); cpl = (const void *)(&d->rss + 1); - if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) { - if (ifp->if_capenable & IFCAP_RXCSUM && - cpl->l2info & htobe32(F_RXF_IP)) { - m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED | - CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR); + if (sc->params.tp.rx_pkt_encap) { + const uint16_t ev = be16toh(cpl->err_vec); + + err_vec = G_T6_COMPR_RXERR_VEC(ev); + tnl_type = G_T6_RX_TNL_TYPE(ev); + tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev); + } else { + err_vec = be16toh(cpl->err_vec); + tnl_type = 0; + } + if (cpl->csum_calc && err_vec == 0 && + ((ifp->if_capenable & IFCAP_RXCSUM && cpl->l2info & htobe32(F_RXF_IP)) || + (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && cpl->l2info & htobe32(F_RXF_IP6)))) { + m0->m_pkthdr.csum_data = be16toh(cpl->csum); + if (tnl_type == 0) { + m0->m_pkthdr.csum_flags = CSUM_L3_CALC | CSUM_L3_VALID | + CSUM_L4_CALC | CSUM_L4_VALID; rxq->rxcsum++; - } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 && - cpl->l2info & htobe32(F_RXF_IP6)) { - m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 | - CSUM_PSEUDO_HDR); - rxq->rxcsum++; + } else { + MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN); + if (__predict_false(cpl->ip_frag)) { + /* + * csum_data is for the inner 
frame (which is an + * IP fragment) and is not 0xffff. There is no + * way to pass the inner csum_data to the stack. + * We don't want the stack to use the inner + * csum_data to validate the outer frame or it + * will get rejected. So we fix csum_data here + * and let sw do the checksum of inner IP + * fragments. + * + * XXX: Need 32b for csum_data2 in an rx mbuf. + * Maybe stuff it into rcv_tstmp? + */ + m0->m_pkthdr.csum_data = 0xffff; + m0->m_pkthdr.csum_flags = CSUM_L3_CALC | + CSUM_L3_VALID | CSUM_L4_CALC | + CSUM_L4_VALID; + } else { + m0->m_pkthdr.csum_flags = CSUM_ENCAP_VXLAN | + CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | + CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | + CSUM_L3_CALC | CSUM_L3_VALID | + CSUM_L4_CALC | CSUM_L4_VALID; + MPASS(m0->m_pkthdr.csum_data == 0xffff); + } + rxq->vxlan_rxcsum++; } - - if (__predict_false(cpl->ip_frag)) - m0->m_pkthdr.csum_data = be16toh(cpl->csum); - else - m0->m_pkthdr.csum_data = 0xffff; } if (cpl->vlan_ex) { @@ -2001,7 +2035,7 @@ m0->m_pkthdr.numa_domain = ifp->if_numa_domain; #endif #if defined(INET) || defined(INET6) - if (rxq->iq.flags & IQ_LRO_ENABLED && + if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 && (M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 || M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) { if (sort_before_lro(lro)) { @@ -2172,10 +2206,10 @@ { M_ASSERTPKTHDR(m); - KASSERT(m->m_pkthdr.l5hlen > 0, + KASSERT(m->m_pkthdr.inner_l5hlen > 0, ("%s: mbuf %p missing information on # of segments.", __func__, m)); - return (m->m_pkthdr.l5hlen); + return (m->m_pkthdr.inner_l5hlen); } static inline void @@ -2183,7 +2217,7 @@ { M_ASSERTPKTHDR(m); - m->m_pkthdr.l5hlen = nsegs; + m->m_pkthdr.inner_l5hlen = nsegs; } static inline int @@ -2309,63 +2343,108 @@ return (m); } -static inline int +static inline bool needs_hwcsum(struct mbuf *m) { + const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | + CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP | + CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP 
| + CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP | + CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP | - CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6)); + return (m->m_pkthdr.csum_flags & csum_flags); } -static inline int +static inline bool needs_tso(struct mbuf *m) { + const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO | + CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & CSUM_TSO); + return (m->m_pkthdr.csum_flags & csum_flags); } -static inline int +static inline bool +needs_vxlan_csum(struct mbuf *m) +{ + + M_ASSERTPKTHDR(m); + + return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN); +} + +static inline bool +needs_vxlan_tso(struct mbuf *m) +{ + const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO | + CSUM_INNER_IP6_TSO; + + M_ASSERTPKTHDR(m); + + return ((m->m_pkthdr.csum_flags & csum_flags) != 0 && + (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN); +} + +static inline bool +needs_inner_tcp_csum(struct mbuf *m) +{ + const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO; + + M_ASSERTPKTHDR(m); + + return (m->m_pkthdr.csum_flags & csum_flags); +} + +static inline bool needs_l3_csum(struct mbuf *m) { + const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP | + CSUM_INNER_IP_TSO; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO)); + return (m->m_pkthdr.csum_flags & csum_flags); } -static inline int -needs_tcp_csum(struct mbuf *m) +static inline bool +needs_outer_tcp_csum(struct mbuf *m) { + const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP | + CSUM_IP6_TSO; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO)); + + return (m->m_pkthdr.csum_flags & csum_flags); } #ifdef RATELIMIT -static inline int -needs_l4_csum(struct mbuf *m) +static inline bool +needs_outer_l4_csum(struct mbuf *m) { + const uint32_t csum_flags = 
CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO | + CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 | - CSUM_TCP_IPV6 | CSUM_TSO)); + return (m->m_pkthdr.csum_flags & csum_flags); } -static inline int -needs_udp_csum(struct mbuf *m) +static inline bool +needs_outer_udp_csum(struct mbuf *m) { + const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP; M_ASSERTPKTHDR(m); - return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6)); + + return (m->m_pkthdr.csum_flags & csum_flags); } #endif -static inline int +static inline bool needs_vlan_insertion(struct mbuf *m) { @@ -2506,6 +2585,23 @@ } /* + * The maximum number of segments that can fit in a WR. + */ +static int +max_nsegs_allowed(struct mbuf *m) +{ + + if (needs_tso(m)) { + if (needs_vxlan_tso(m)) + return (TX_SGL_SEGS_VXLAN_TSO); + else + return (TX_SGL_SEGS_TSO); + } + + return (TX_SGL_SEGS); +} + +/* * Analyze the mbuf to determine its tx needs. The mbuf passed in may change: * a) caller can assume it's been freed if this function returns with an error. * b) it may get defragged up if the gather list is too long for the hardware. @@ -2563,7 +2659,7 @@ return (0); } #endif - if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) { + if (nsegs > max_nsegs_allowed(m0)) { if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) { rc = EFBIG; goto fail; @@ -2585,18 +2681,15 @@ } set_mbuf_nsegs(m0, nsegs); set_mbuf_cflags(m0, cflags); - if (sc->flags & IS_VF) - set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0))); - else - set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0))); + calculate_mbuf_len16(sc, m0); #ifdef RATELIMIT /* * Ethofld is limited to TCP and UDP for now, and only when L4 hw - * checksumming is enabled. needs_l4_csum happens to check for all the - * right things. + * checksumming is enabled. needs_outer_l4_csum happens to check for + * all the right things. 
*/ - if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) { + if (__predict_false(needs_eo(cst) && !needs_outer_l4_csum(m0))) { m_snd_tag_rele(m0->m_pkthdr.snd_tag); m0->m_pkthdr.snd_tag = NULL; m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG; @@ -2628,21 +2721,27 @@ switch (eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: - { - struct ip6_hdr *ip6 = l3hdr; - - MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP); - - m0->m_pkthdr.l3hlen = sizeof(*ip6); + m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr); break; - } #endif #ifdef INET case ETHERTYPE_IP: { struct ip *ip = l3hdr; - m0->m_pkthdr.l3hlen = ip->ip_hl * 4; + if (needs_vxlan_csum(m0)) { + /* Driver will do the outer IP hdr checksum. */ + ip->ip_sum = 0; + if (needs_vxlan_tso(m0)) { + const uint16_t ipl = ip->ip_len; + + ip->ip_len = 0; + ip->ip_sum = ~in_cksum_hdr(ip); + ip->ip_len = ipl; + } else + ip->ip_sum = in_cksum_hdr(ip); + } + m0->m_pkthdr.l3hlen = ip->ip_hl << 2; break; } #endif @@ -2652,8 +2751,59 @@ __func__, eh_type); } + if (needs_vxlan_csum(m0)) { + m0->m_pkthdr.l4hlen = sizeof(struct udphdr); + m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header); + + /* Inner headers. */ + eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen + + sizeof(struct udphdr) + sizeof(struct vxlan_header)); + eh_type = ntohs(eh->ether_type); + if (eh_type == ETHERTYPE_VLAN) { + struct ether_vlan_header *evh = (void *)eh; + + eh_type = ntohs(evh->evl_proto); + m0->m_pkthdr.inner_l2hlen = sizeof(*evh); + } else + m0->m_pkthdr.inner_l2hlen = sizeof(*eh); + l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen); + + switch (eh_type) { +#ifdef INET6 + case ETHERTYPE_IPV6: + m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr); + break; +#endif +#ifdef INET + case ETHERTYPE_IP: + { + struct ip *ip = l3hdr; + + m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2; + break; + } +#endif + default: + panic("%s: VXLAN hw offload requested with unknown " + "ethertype 0x%04x. 
if_cxgbe must be compiled" + " with the same INET/INET6 options as the kernel.", + __func__, eh_type); + } #if defined(INET) || defined(INET6) - if (needs_tcp_csum(m0)) { + if (needs_inner_tcp_csum(m0)) { + tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen); + m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4; + } +#endif + MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0); + m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP | + CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP | + CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | + CSUM_ENCAP_VXLAN; + } + +#if defined(INET) || defined(INET6) + if (needs_outer_tcp_csum(m0)) { tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen); m0->m_pkthdr.l4hlen = tcp->th_off * 4; #ifdef RATELIMIT @@ -2663,7 +2813,7 @@ V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1)); } else set_mbuf_eo_tsclk_tsoff(m0, 0); - } else if (needs_udp_csum(m0)) { + } else if (needs_outer_udp_csum(m0)) { m0->m_pkthdr.l4hlen = sizeof(struct udphdr); #endif } @@ -3618,6 +3768,9 @@ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction", CTLFLAG_RD, &rxq->vlan_extraction, "# of times hardware extracted 802.1Q tag"); + SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum", + CTLFLAG_RD, &rxq->vxlan_rxcsum, + "# of times hardware assisted with inner checksum (VXLAN) "); add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl); @@ -4272,6 +4425,11 @@ "# of frames tx'd using type1 txpkts work requests"); SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD, &txq->raw_wrs, "# of raw work requests (non-packets)"); + SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs", + CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests"); + SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum", + CTLFLAG_RD, &txq->vxlan_txcsum, + "# of times hardware assisted with inner checksums (VXLAN)"); #ifdef KERN_TLS if (sc->flags & KERN_TLS_OK) { @@ -4561,27 +4719,25 @@ KASSERT(gl->sg_nseg == mbuf_nsegs(m), ("%s: nsegs changed 
for mbuf %p from %d to %d", __func__, m, mbuf_nsegs(m), gl->sg_nseg)); - KASSERT(gl->sg_nseg > 0 && - gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS), + KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m), ("%s: %d segments, should have been 1 <= nsegs <= %d", __func__, - gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)); + gl->sg_nseg, max_nsegs_allowed(m))); } /* * len16 for a txpkt WR with a GL. Includes the firmware work request header. */ static inline u_int -txpkt_len16(u_int nsegs, u_int tso) +txpkt_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ - n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) + + n = extra + sizeof(struct fw_eth_tx_pkt_wr) + + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); - if (tso) - n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } @@ -4591,22 +4747,43 @@ * request header. 
*/ static inline u_int -txpkt_vm_len16(u_int nsegs, u_int tso) +txpkt_vm_len16(u_int nsegs, const u_int extra) { u_int n; MPASS(nsegs > 0); nsegs--; /* first segment is part of ulptx_sgl */ - n = sizeof(struct fw_eth_tx_pkt_vm_wr) + + n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) + sizeof(struct cpl_tx_pkt_core) + sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1)); - if (tso) - n += sizeof(struct cpl_tx_pkt_lso_core); return (howmany(n, 16)); } +static inline void +calculate_mbuf_len16(struct adapter *sc, struct mbuf *m) +{ + const int lso = sizeof(struct cpl_tx_pkt_lso_core); + const int tnl_lso = sizeof(struct cpl_tx_tnl_lso); + + if (sc->flags & IS_VF) { + if (needs_tso(m)) + set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso)); + else + set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0)); + return; + } + + if (needs_tso(m)) { + if (needs_vxlan_tso(m)) + set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso)); + else + set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso)); + } else + set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0)); +} + /* * len16 for a txpkts type 0 WR with a GL. Does not include the firmware work * request header. 
@@ -4655,51 +4832,162 @@ csum_to_ctrl(struct adapter *sc, struct mbuf *m) { uint64_t ctrl; - int csum_type; + int csum_type, l2hlen, l3hlen; + int x, y; + static const int csum_types[3][2] = { + {TX_CSUM_TCPIP, TX_CSUM_TCPIP6}, + {TX_CSUM_UDPIP, TX_CSUM_UDPIP6}, + {TX_CSUM_IP, 0} + }; M_ASSERTPKTHDR(m); - if (needs_hwcsum(m) == 0) + if (!needs_hwcsum(m)) return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS); + MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN); + MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip)); + + if (needs_vxlan_csum(m)) { + MPASS(m->m_pkthdr.l4hlen > 0); + MPASS(m->m_pkthdr.l5hlen > 0); + MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN); + MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip)); + + l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen + + m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen + + m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN; + l3hlen = m->m_pkthdr.inner_l3hlen; + } else { + l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN; + l3hlen = m->m_pkthdr.l3hlen; + } + ctrl = 0; - if (needs_l3_csum(m) == 0) + if (!needs_l3_csum(m)) ctrl |= F_TXPKT_IPCSUM_DIS; - switch (m->m_pkthdr.csum_flags & - (CSUM_IP_TCP | CSUM_IP_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP)) { - case CSUM_IP_TCP: - csum_type = TX_CSUM_TCPIP; - break; - case CSUM_IP_UDP: - csum_type = TX_CSUM_UDPIP; - break; - case CSUM_IP6_TCP: - csum_type = TX_CSUM_TCPIP6; - break; - case CSUM_IP6_UDP: - csum_type = TX_CSUM_UDPIP6; - break; - default: - /* needs_hwcsum told us that at least some hwcsum is needed. 
*/ - MPASS(ctrl == 0); - MPASS(m->m_pkthdr.csum_flags & CSUM_IP); - ctrl |= F_TXPKT_L4CSUM_DIS; - csum_type = TX_CSUM_IP; - break; - } - MPASS(m->m_pkthdr.l2hlen > 0); - MPASS(m->m_pkthdr.l3hlen > 0); - ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | - V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen); + if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP | + CSUM_IP6_TCP | CSUM_INNER_IP6_TCP)) + x = 0; /* TCP */ + else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP | + CSUM_IP6_UDP | CSUM_INNER_IP6_UDP)) + x = 1; /* UDP */ + else + x = 2; + + if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP | + CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP)) + y = 0; /* IPv4 */ + else { + MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP | + CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP)); + y = 1; /* IPv6 */ + } + /* + * needs_hwcsum returned true earlier so there must be some kind of + * checksum to calculate. + */ + csum_type = csum_types[x][y]; + MPASS(csum_type != 0); + if (csum_type == TX_CSUM_IP) + ctrl |= F_TXPKT_L4CSUM_DIS; + ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen); if (chip_id(sc) <= CHELSIO_T5) - ctrl |= V_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN); + ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen); else - ctrl |= V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN); + ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen); return (ctrl); } +static inline void * +write_lso_cpl(void *cpl, struct mbuf *m0) +{ + struct cpl_tx_pkt_lso_core *lso; + uint32_t ctrl; + + KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && + m0->m_pkthdr.l4hlen > 0, + ("%s: mbuf %p needs TSO but missing header lengths", + __func__, m0)); + + ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | + F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE | + V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | + V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | + V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); + if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) + ctrl |= 
F_LSO_IPV6; + + lso = cpl; + lso->lso_ctrl = htobe32(ctrl); + lso->ipid_ofst = htobe16(0); + lso->mss = htobe16(m0->m_pkthdr.tso_segsz); + lso->seqno_offset = htobe32(0); + lso->len = htobe32(m0->m_pkthdr.len); + + return (lso + 1); +} + +static void * +write_tnl_lso_cpl(void *cpl, struct mbuf *m0) +{ + struct cpl_tx_tnl_lso *tnl_lso = cpl; + uint32_t ctrl; + + KASSERT(m0->m_pkthdr.inner_l2hlen > 0 && + m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 && + m0->m_pkthdr.inner_l5hlen > 0, + ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths", + __func__, m0)); + KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && + m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0, + ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths", + __func__, m0)); + + /* Outer headers. */ + ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) | + F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST | + V_CPL_TX_TNL_LSO_ETHHDRLENOUT( + (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) | + V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) | + F_CPL_TX_TNL_LSO_IPLENSETOUT; + if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) + ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT; + else { + ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT | + F_CPL_TX_TNL_LSO_IPIDINCOUT; + } + tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl); + tnl_lso->IpIdOffsetOut = 0; + tnl_lso->UdpLenSetOut_to_TnlHdrLen = + htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT | + F_CPL_TX_TNL_LSO_UDPLENSETOUT | + V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen + + m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen + + m0->m_pkthdr.l5hlen) | + V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN)); + tnl_lso->r1 = 0; + + /* Inner headers. 
*/ + ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN( + (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) | + V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) | + V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2); + if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr)) + ctrl |= F_CPL_TX_TNL_LSO_IPV6; + tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl); + tnl_lso->IpIdOffset = 0; + tnl_lso->IpIdSplit_to_Mss = + htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz)); + tnl_lso->TCPSeqOffset = 0; + tnl_lso->EthLenOffset_Size = + htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len)); + + return (tnl_lso + 1); +} + #define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */ /* @@ -4753,29 +5041,7 @@ m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst); if (needs_tso(m0)) { - struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); - - KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && - m0->m_pkthdr.l4hlen > 0, - ("%s: mbuf %p needs TSO but missing header lengths", - __func__, m0)); - - ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | - F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - - ETHER_HDR_LEN) >> 2) | - V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | - V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); - if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) - ctrl |= F_LSO_IPV6; - - lso->lso_ctrl = htobe32(ctrl); - lso->ipid_ofst = htobe16(0); - lso->mss = htobe16(m0->m_pkthdr.tso_segsz); - lso->seqno_offset = htobe32(0); - lso->len = htobe32(pktlen); - - cpl = (void *)(lso + 1); - + cpl = write_lso_cpl(wr + 1, m0); txq->tso_wrs++; } else cpl = (void *)(wr + 1); @@ -4883,9 +5149,12 @@ nsegs = mbuf_nsegs(m0); pktlen = m0->m_pkthdr.len; ctrl = sizeof(struct cpl_tx_pkt_core); - if (needs_tso(m0)) - ctrl += sizeof(struct cpl_tx_pkt_lso_core); - else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) && + if (needs_tso(m0)) { + if (needs_vxlan_tso(m0)) + ctrl += sizeof(struct cpl_tx_tnl_lso); + else + ctrl += sizeof(struct cpl_tx_pkt_lso_core); + } else if 
(!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) && available >= 2) { /* Immediate data. Recalculate len16 and set nsegs to 0. */ ctrl += pktlen; @@ -4907,41 +5176,30 @@ wr->r3 = 0; if (needs_tso(m0)) { - struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1); - - KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 && - m0->m_pkthdr.l4hlen > 0, - ("%s: mbuf %p needs TSO but missing header lengths", - __func__, m0)); - - ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE | - F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - - ETHER_HDR_LEN) >> 2) | - V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) | - V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2); - if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr)) - ctrl |= F_LSO_IPV6; - - lso->lso_ctrl = htobe32(ctrl); - lso->ipid_ofst = htobe16(0); - lso->mss = htobe16(m0->m_pkthdr.tso_segsz); - lso->seqno_offset = htobe32(0); - lso->len = htobe32(pktlen); - - cpl = (void *)(lso + 1); - - txq->tso_wrs++; + if (needs_vxlan_tso(m0)) { + cpl = write_tnl_lso_cpl(wr + 1, m0); + txq->vxlan_tso_wrs++; + } else { + cpl = write_lso_cpl(wr + 1, m0); + txq->tso_wrs++; + } } else cpl = (void *)(wr + 1); /* Checksum offload */ ctrl1 = csum_to_ctrl(sc, m0); - if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) - txq->txcsum++; /* some hardware assistance provided */ + if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { + /* some hardware assistance provided */ + if (needs_vxlan_csum(m0)) + txq->vxlan_txcsum++; + else + txq->txcsum++; + } /* VLAN tag insertion */ if (needs_vlan_insertion(m0)) { - ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); + ctrl1 |= F_TXPKT_VLAN_VLD | + V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag); txq->vlan_insertion++; } @@ -4953,6 +5211,8 @@ /* SGL */ dst = (void *)(cpl + 1); + if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx])) + dst = (caddr_t)&eq->desc[0]; if (nsegs > 0) { write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx); @@ -5198,8 +5458,13 @@ /* 
Checksum offload */ ctrl1 = csum_to_ctrl(sc, m); - if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) - txq->txcsum++; /* some hardware assistance provided */ + if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) { + /* some hardware assistance provided */ + if (needs_vxlan_csum(m)) + txq->vxlan_txcsum++; + else + txq->txcsum++; + } /* VLAN tag insertion */ if (needs_vlan_insertion(m)) { @@ -5958,7 +6223,7 @@ wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) | V_FW_WR_FLOWID(cst->etid)); wr->r3 = 0; - if (needs_udp_csum(m0)) { + if (needs_outer_udp_csum(m0)) { wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG; wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); @@ -5970,7 +6235,7 @@ wr->u.udpseg.plen = htobe32(pktlen - immhdrs); cpl = (void *)(wr + 1); } else { - MPASS(needs_tcp_csum(m0)); + MPASS(needs_outer_tcp_csum(m0)); wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG; wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen; wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen); @@ -6007,7 +6272,7 @@ } /* Checksum offload must be requested for ethofld. 
*/ - MPASS(needs_l4_csum(m0)); + MPASS(needs_outer_l4_csum(m0)); ctrl1 = csum_to_ctrl(cst->adapter, m0); /* VLAN tag insertion */ Index: sys/net/if.h =================================================================== --- sys/net/if.h +++ sys/net/if.h @@ -249,6 +249,8 @@ #define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */ #define IFCAP_TXTLS4 0x8000000 /* can do TLS encryption and segmentation for TCP */ #define IFCAP_TXTLS6 0x10000000 /* can do TLS encryption and segmentation for TCP6 */ +#define IFCAP_VXLAN_HWCSUM 0x20000000 /* can do IFCAP_HWCSUM on VXLANs */ +#define IFCAP_VXLAN_HWTSO 0x40000000 /* can do IFCAP_TSO on VXLANs */ #define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6) Index: sys/net/if_vxlan.h =================================================================== --- sys/net/if_vxlan.h +++ sys/net/if_vxlan.h @@ -143,4 +143,11 @@ char vxlcmd_ifname[IFNAMSIZ]; }; +#ifdef _KERNEL +typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t, + u_int); +EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t); +EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t); +#endif + #endif /* _NET_IF_VXLAN_H_ */ Index: sys/net/if_vxlan.c =================================================================== --- sys/net/if_vxlan.c +++ sys/net/if_vxlan.c @@ -1,6 +1,7 @@ /*- * Copyright (c) 2014, Bryan Venteicher * All rights reserved. + * Copyright (c) 2020, Chelsio Communications. 
* * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -60,6 +61,8 @@ #include #include #include +#include +#include #include #include @@ -70,6 +73,8 @@ #include #include #include +#include +#include #include #include @@ -92,6 +97,7 @@ sizeof(struct udphdr) - \ sizeof(struct vxlan_header) - \ ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN) +#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU) #define VXLAN_SO_MC_MAX_GROUPS 32 @@ -150,6 +156,7 @@ struct vxlan_softc { struct ifnet *vxl_ifp; + int vxl_reqcap; struct vxlan_socket *vxl_sock; uint32_t vxl_vni; union vxlan_sockaddr vxl_src_addr; @@ -297,7 +304,10 @@ static int vxlan_setup_multicast(struct vxlan_softc *); static int vxlan_setup_socket(struct vxlan_softc *); -static void vxlan_setup_interface(struct vxlan_softc *); +#ifdef INET6 +static void vxlan_setup_zero_checksum_port(struct vxlan_softc *); +#endif +static void vxlan_setup_interface_hdrlen(struct vxlan_softc *); static int vxlan_valid_init_config(struct vxlan_softc *); static void vxlan_init_wait(struct vxlan_softc *); static void vxlan_init_complete(struct vxlan_softc *); @@ -350,6 +360,7 @@ static void vxlan_set_default_config(struct vxlan_softc *); static int vxlan_set_user_config(struct vxlan_softc *, struct ifvxlanparam *); +static void vxlan_set_hwcaps(struct vxlan_softc *); static int vxlan_clone_create(struct if_clone *, int, caddr_t); static void vxlan_clone_destroy(struct ifnet *); @@ -1555,9 +1566,31 @@ return (error); } +#ifdef INET6 static void -vxlan_setup_interface(struct vxlan_softc *sc) +vxlan_setup_zero_checksum_port(struct vxlan_softc *sc) { + + if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr)) + return; + + if (V_zero_checksum_port != 0) + return; /* Leave it alone. 
*/ + + MPASS(sc->vxl_src_addr.in6.sin6_port != 0); + MPASS(sc->vxl_dst_addr.in6.sin6_port != 0); + + if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) + return; + + V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port); + printf("rfc6935_port set to %d\n", V_zero_checksum_port); +} +#endif + +static void +vxlan_setup_interface_hdrlen(struct vxlan_softc *sc) +{ struct ifnet *ifp; ifp = sc->vxl_ifp; @@ -1666,11 +1699,13 @@ if (vxlan_valid_init_config(sc) != 0) goto out; - vxlan_setup_interface(sc); - if (vxlan_setup_socket(sc) != 0) goto out; +#ifdef INET6 + vxlan_setup_zero_checksum_port(sc); +#endif + /* Initialize the default forwarding entry. */ vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac, &sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC); @@ -1682,6 +1717,9 @@ VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_UP); + + EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family, + ntohs(sc->vxl_src_addr.in4.sin_port)); out: vxlan_init_complete(sc); } @@ -1738,6 +1776,8 @@ VXLAN_WUNLOCK(sc); if_link_state_change(ifp, LINK_STATE_DOWN); + EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family, + ntohs(sc->vxl_src_addr.in4.sin_port)); if (vso != NULL) { vxlan_socket_remove_softc(vso, sc); @@ -1907,6 +1947,7 @@ VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa); + vxlan_set_hwcaps(sc); error = 0; } else error = EBUSY; @@ -1936,6 +1977,7 @@ VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa); + vxlan_setup_interface_hdrlen(sc); error = 0; } else error = EBUSY; @@ -2063,6 +2105,7 @@ VXLAN_WLOCK(sc); if (vxlan_can_change_config(sc)) { strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ); + vxlan_set_hwcaps(sc); error = 0; } else error = EBUSY; @@ -2284,6 +2327,13 @@ ifp->if_mtu = ifr->ifr_mtu; break; + case SIOCSIFCAP: + VXLAN_WLOCK(sc); + sc->vxl_reqcap = ifr->ifr_reqcap; + 
vxlan_set_hwcaps(sc); + VXLAN_WUNLOCK(sc); + break; + default: error = ether_ioctl(ifp, cmd, data); break; @@ -2335,6 +2385,48 @@ } #endif +/* + * Return the CSUM_INNER_* equivalent of CSUM_* caps. + */ +static uint32_t +csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap) +{ + uint32_t csum_flags = CSUM_ENCAP_VXLAN; + const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP; + + /* + * csum_flags can request either v4 or v6 offload but not both. + * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO) + * so those bits are no good to detect the IP version. Other bits are + * always set with CSUM_TSO and we use those to figure out the IP + * version. + */ + if (csum_flags_in & v4) { + if (csum_flags_in & CSUM_IP) + csum_flags |= CSUM_INNER_IP; + if (csum_flags_in & CSUM_IP_UDP) + csum_flags |= CSUM_INNER_IP_UDP; + if (csum_flags_in & CSUM_IP_TCP) + csum_flags |= CSUM_INNER_IP_TCP; + if (csum_flags_in & CSUM_IP_TSO) + csum_flags |= CSUM_INNER_IP_TSO; + } else { +#ifdef INVARIANTS + const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP; + + MPASS((csum_flags_in & v6) != 0); +#endif + if (csum_flags_in & CSUM_IP6_UDP) + csum_flags |= CSUM_INNER_IP6_UDP; + if (csum_flags_in & CSUM_IP6_TCP) + csum_flags |= CSUM_INNER_IP6_TCP; + if (csum_flags_in & CSUM_IP6_TSO) + csum_flags |= CSUM_INNER_IP6_TSO; + } + + return (csum_flags); +} + static int vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa, struct mbuf *m) @@ -2345,7 +2437,12 @@ struct in_addr srcaddr, dstaddr; uint16_t srcport, dstport; int len, mcast, error; + struct route route, *ro; + struct sockaddr_in *sin; + uint32_t csum_flags; + NET_EPOCH_ASSERT(); + ifp = sc->vxl_ifp; srcaddr = sc->vxl_src_addr.in4.sin_addr; srcport = vxlan_pick_source_port(sc, m); @@ -2376,7 +2473,46 @@ mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 
1 : 0; m->m_flags &= ~(M_MCAST | M_BCAST); - error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL); + m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX; + if (m->m_pkthdr.csum_flags != 0) { + /* + * HW checksum (L3 and/or L4) or TSO has been requested. Look + * up the ifnet for the outbound route and verify that the + * outbound ifnet can perform the requested operation on the + * inner frame. + */ + bzero(&route, sizeof(route)); + ro = &route; + sin = (struct sockaddr_in *)&ro->ro_dst; + sin->sin_family = AF_INET; + sin->sin_len = sizeof(*sin); + sin->sin_addr = ip->ip_dst; + ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE, + 0); + if (ro->ro_nh == NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (EHOSTUNREACH); + } + + csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags, + CSUM_ENCAP_VXLAN); + if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) != + csum_flags) { + printf("m->csum_flags 0x%08x, csum_flags 0x%08x, " + "hwassist 0x%08x, missing 0x%08x\n", + m->m_pkthdr.csum_flags, csum_flags, + (uint32_t)ro->ro_nh->nh_ifp->if_hwassist, + csum_flags & + ~(uint32_t)ro->ro_nh->nh_ifp->if_hwassist); + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENXIO); + } + m->m_pkthdr.csum_flags = csum_flags; + } else + ro = NULL; + error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); @@ -2402,7 +2538,12 @@ const struct in6_addr *srcaddr, *dstaddr; uint16_t srcport, dstport; int len, mcast, error; + struct route_in6 route, *ro; + struct sockaddr_in6 *sin6; + uint32_t csum_flags; + NET_EPOCH_ASSERT(); + ifp = sc->vxl_ifp; srcaddr = &sc->vxl_src_addr.in6.sin6_addr; srcport = vxlan_pick_source_port(sc, m); @@ -2429,22 +2570,56 @@ vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport); - /* - * XXX BMV We need support for RFC6935 before we can send and - * receive IPv6 UDP packets with a zero checksum. 
- */ - { + mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0; + m->m_flags &= ~(M_MCAST | M_BCAST); + + ro = NULL; + m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX; + if (m->m_pkthdr.csum_flags != 0) { + /* + * HW checksum (L3 and/or L4) or TSO has been requested. Look + * up the ifnet for the outbound route and verify that the + * outbound ifnet can perform the requested operation on the + * inner frame. + */ + bzero(&route, sizeof(route)); + ro = &route; + sin6 = (struct sockaddr_in6 *)&ro->ro_dst; + sin6->sin6_family = AF_INET6; + sin6->sin6_len = sizeof(*sin6); + sin6->sin6_addr = ip6->ip6_dst; + ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0, + NHR_NONE, 0); + if (ro->ro_nh == NULL) { + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (EHOSTUNREACH); + } + + csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags, + CSUM_ENCAP_VXLAN); + if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) != + csum_flags) { + printf("m->csum_flags 0x%08x, csum_flags 0x%08x, " + "hwassist 0x%08x, missing 0x%08x\n", + m->m_pkthdr.csum_flags, csum_flags, + (uint32_t)ro->ro_nh->nh_ifp->if_hwassist, + csum_flags & + ~(uint32_t)ro->ro_nh->nh_ifp->if_hwassist); + m_freem(m); + if_inc_counter(ifp, IFCOUNTER_OERRORS, 1); + return (ENXIO); + } + m->m_pkthdr.csum_flags = csum_flags; + } else if (ntohs(dstport) != V_zero_checksum_port) { struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr)); + hdr->uh_sum = in6_cksum_pseudo(ip6, m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0); m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } - - mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 
1 : 0; - m->m_flags &= ~(M_MCAST | M_BCAST); - - error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL); + error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL); if (error == 0) { if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1); if_inc_counter(ifp, IFCOUNTER_OBYTES, len); @@ -2593,8 +2768,26 @@ m_clrprotoflags(m); m->m_pkthdr.rcvif = ifp; M_SETFIB(m, ifp->if_fib); + if (ifp->if_capenable & IFCAP_RXCSUM && + m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) { + uint32_t csum_flags = 0; - error = netisr_queue_src(NETISR_ETHER, 0, m); + if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) + csum_flags |= CSUM_L3_CALC; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID) + csum_flags |= CSUM_L3_VALID; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC) + csum_flags |= CSUM_L4_CALC; + if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID) + csum_flags |= CSUM_L4_VALID; + m->m_pkthdr.csum_flags = csum_flags; + } else { + /* clear everything */ + m->m_pkthdr.csum_flags = 0; + m->m_pkthdr.csum_data = 0; + } + + error = netisr_dispatch(NETISR_ETHER, m); *m0 = NULL; out: @@ -2721,6 +2914,109 @@ return (0); } +/* + * A VXLAN interface inherits the capabilities of the vxlandev or the interface + * hosting the vxlanlocal address. 
+ */ +static void +vxlan_set_hwcaps(struct vxlan_softc *sc) +{ + struct epoch_tracker et; + struct ifnet *p; + struct ifaddr *ifa; + u_long hwa; + int cap, ena; + bool rel; + struct ifnet *ifp = sc->vxl_ifp; + + /* reset caps */ + ifp->if_capabilities &= VXLAN_BASIC_IFCAPS; + ifp->if_capenable &= VXLAN_BASIC_IFCAPS; + ifp->if_hwassist = 0; + + NET_EPOCH_ENTER(et); + CURVNET_SET(ifp->if_vnet); + + rel = false; + p = NULL; + if (sc->vxl_mc_ifname[0] != '\0') { + rel = true; + p = ifunit_ref(sc->vxl_mc_ifname); + } else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) { + if (sc->vxl_src_addr.sa.sa_family == AF_INET) { + struct sockaddr_in in4 = sc->vxl_src_addr.in4; + + in4.sin_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&in4); + if (ifa != NULL) + p = ifa->ifa_ifp; + } else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) { + struct sockaddr_in6 in6 = sc->vxl_src_addr.in6; + + in6.sin6_port = 0; + ifa = ifa_ifwithaddr((struct sockaddr *)&in6); + if (ifa != NULL) + p = ifa->ifa_ifp; + } + } + if (p == NULL) + goto done; + + cap = ena = hwa = 0; + + /* checksum offload */ + if (p->if_capabilities & IFCAP_VXLAN_HWCSUM) + cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); + if (p->if_capenable & IFCAP_VXLAN_HWCSUM) { + ena |= sc->vxl_reqcap & p->if_capenable & + (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6); + if (ena & IFCAP_TXCSUM) { + if (p->if_hwassist & CSUM_INNER_IP) + hwa |= CSUM_IP; + if (p->if_hwassist & CSUM_INNER_IP_UDP) + hwa |= CSUM_IP_UDP; + if (p->if_hwassist & CSUM_INNER_IP_TCP) + hwa |= CSUM_IP_TCP; + } + if (ena & IFCAP_TXCSUM_IPV6) { + if (p->if_hwassist & CSUM_INNER_IP6_UDP) + hwa |= CSUM_IP6_UDP; + if (p->if_hwassist & CSUM_INNER_IP6_TCP) + hwa |= CSUM_IP6_TCP; + } + } + + /* hardware TSO */ + if (p->if_capabilities & IFCAP_VXLAN_HWTSO) { + cap |= p->if_capabilities & IFCAP_TSO; + if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen) + ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen; + else + ifp->if_hw_tsomax = p->if_hw_tsomax; + /* XXX: 
tsomaxsegcount decrement is cxgbe specific */ + ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1; + ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize; + } + if (p->if_capenable & IFCAP_VXLAN_HWTSO) { + ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO; + if (ena & IFCAP_TSO) { + if (p->if_hwassist & CSUM_INNER_IP_TSO) + hwa |= CSUM_IP_TSO; + if (p->if_hwassist & CSUM_INNER_IP6_TSO) + hwa |= CSUM_IP6_TSO; + } + } + + ifp->if_capabilities |= cap; + ifp->if_capenable |= ena; + ifp->if_hwassist |= hwa; + if (rel) + if_rele(p); +done: + CURVNET_RESTORE(); + NET_EPOCH_EXIT(et); +} + static int vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params) { @@ -2764,8 +3060,10 @@ ifp->if_ioctl = vxlan_ioctl; ifp->if_transmit = vxlan_transmit; ifp->if_qflush = vxlan_qflush; - ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; - ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU; + ifp->if_capabilities = VXLAN_BASIC_IFCAPS; + ifp->if_capenable = VXLAN_BASIC_IFCAPS; + sc->vxl_reqcap = -1; + vxlan_set_hwcaps(sc); ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status); ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL); @@ -2775,7 +3073,7 @@ ether_ifattach(ifp, sc->vxl_hwaddr.octet); ifp->if_baudrate = 0; - ifp->if_hdrlen = 0; + vxlan_setup_interface_hdrlen(sc); return (0); Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -769,9 +769,13 @@ /* * If small enough for interface, or the interface will take * care of the fragmentation for us, we can just send directly. + * Note that if_vxlan could have requested TSO even though the outer + * frame is UDP. It is correct to not fragment such datagrams and + * instead just pass them on to the driver. 
*/ if (ip_len <= mtu || - (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) { + (m->m_pkthdr.csum_flags & ifp->if_hwassist & + (CSUM_TSO | CSUM_INNER_TSO)) != 0) { ip->ip_sum = 0; if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) { ip->ip_sum = in_cksum(m, hlen); @@ -785,7 +789,8 @@ * once instead of for every generated packet. */ if (!(flags & IP_FORWARDING) && ia) { - if (m->m_pkthdr.csum_flags & CSUM_TSO) + if (m->m_pkthdr.csum_flags & + (CSUM_TSO | CSUM_INNER_TSO)) counter_u64_add(ia->ia_ifa.ifa_opackets, m->m_pkthdr.len / m->m_pkthdr.tso_segsz); else @@ -809,7 +814,8 @@ } /* Balk when DF bit is set or the interface didn't support TSO. */ - if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) { + if ((ip_off & IP_DF) || + (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) { error = EMSGSIZE; IPSTAT_INC(ips_cantfrag); goto bad; Index: sys/netinet/udp_var.h =================================================================== --- sys/netinet/udp_var.h +++ sys/netinet/udp_var.h @@ -154,6 +154,9 @@ #define V_udp_blackhole VNET(udp_blackhole) #define V_udp_log_in_vain VNET(udp_log_in_vain) +VNET_DECLARE(int, zero_checksum_port); +#define V_zero_checksum_port VNET(zero_checksum_port) + static __inline struct inpcbinfo * udp_get_inpcbinfo(int protocol) { Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -1116,7 +1116,8 @@ */ sw_csum = m->m_pkthdr.csum_flags; if (!hdrsplit) { - tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0; + tso = ((sw_csum & ifp->if_hwassist & + (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 
1 : 0; sw_csum &= ~ifp->if_hwassist; } else tso = 0; Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -124,6 +124,11 @@ #include +VNET_DEFINE(int, zero_checksum_port) = 0; +#define V_zero_checksum_port VNET(zero_checksum_port) +SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW, + &VNET_NAME(zero_checksum_port), 0, + "Zero UDP checksum allowed for traffic to/from this port."); /* * UDP protocol implementation. * Per RFC 768, August, 1980. @@ -268,7 +273,14 @@ } if (uh->uh_sum == 0) { UDPSTAT_INC(udps_nosum); - goto badunlocked; + /* + * dport 0 was rejected earlier so this is OK even if + * zero_checksum_port is 0 (which is its default value). + */ + if (ntohs(uh->uh_dport) == V_zero_checksum_port) + goto skip_checksum; + else + goto badunlocked; } } @@ -288,6 +300,7 @@ goto badunlocked; } +skip_checksum: /* * Construct sockaddr format source address. */ Index: sys/sys/mbuf.h =================================================================== --- sys/sys/mbuf.h +++ sys/sys/mbuf.h @@ -170,7 +170,10 @@ uint8_t l3hlen; /* layer 3 hdr len */ uint8_t l4hlen; /* layer 4 hdr len */ uint8_t l5hlen; /* layer 5 hdr len */ - uint32_t spare; + uint8_t inner_l2hlen; + uint8_t inner_l3hlen; + uint8_t inner_l4hlen; + uint8_t inner_l5hlen; }; }; union { @@ -615,7 +618,13 @@ * Outbound flags that are set by upper protocol layers requesting lower * layers, or ideally the hardware, to perform these offloading tasks. * For outbound packets this field and its flags can be directly tested - * against ifnet if_hwassist. + * against ifnet if_hwassist. Note that the outbound and the inbound flags do + * not collide right now but they could be allowed to (as long as the flags are + * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS + * would also have to split into CSUM_BITS_TX and CSUM_BITS_RX. 
+ * + * CSUM_INNER_* is the same as CSUM_* but it applies to the inner frame. + * The CSUM_ENCAP_* bits identify the outer encapsulation. */ #define CSUM_IP 0x00000001 /* IP header checksum offload */ #define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */ @@ -624,13 +633,28 @@ #define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */ #define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */ +#define CSUM_INNER_IP6_UDP 0x00000040 +#define CSUM_INNER_IP6_TCP 0x00000080 +#define CSUM_INNER_IP6_TSO 0x00000100 #define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */ #define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */ #define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */ #define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */ #define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */ +#define CSUM_INNER_IP 0x00004000 +#define CSUM_INNER_IP_UDP 0x00008000 +#define CSUM_INNER_IP_TCP 0x00010000 +#define CSUM_INNER_IP_TSO 0x00020000 + +#define CSUM_ENCAP_VXLAN 0x00040000 /* VXLAN outer encapsulation */ +#define CSUM_ENCAP_RSVD1 0x00080000 + /* Inbound checksum support where the checksum was verified by hardware. 
*/ +#define CSUM_INNER_L3_CALC 0x00100000 +#define CSUM_INNER_L3_VALID 0x00200000 +#define CSUM_INNER_L4_CALC 0x00400000 +#define CSUM_INNER_L4_VALID 0x00800000 #define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */ #define CSUM_L3_VALID 0x02000000 /* checksum is correct */ #define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */ @@ -641,16 +665,31 @@ #define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */ +#define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \ + CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \ + CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \ + CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \ + CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \ + CSUM_ENCAP_RSVD1 | CSUM_SND_TAG) + +#define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \ + CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \ + CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \ + CSUM_COALESCED) + /* * CSUM flag description for use with printf(9) %b identifier. 
*/ #define CSUM_BITS \ "\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \ - "\6CSUM_IP_ISCSI" \ - "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \ - "\16CSUM_IP6_ISCSI" \ - "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \ - "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" + "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \ + "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \ + "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \ + "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \ + "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \ + "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \ + "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \ + "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG" /* CSUM flags compatibility mappings. */ #define CSUM_IP_CHECKED CSUM_L3_CALC @@ -666,6 +705,7 @@ #define CSUM_UDP CSUM_IP_UDP #define CSUM_SCTP CSUM_IP_SCTP #define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO) +#define CSUM_INNER_TSO (CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO) #define CSUM_UDP_IPV6 CSUM_IP6_UDP #define CSUM_TCP_IPV6 CSUM_IP6_TCP #define CSUM_SCTP_IPV6 CSUM_IP6_SCTP