Page MenuHomeFreeBSD

D25873.id76008.diff
No OneTemporary

D25873.id76008.diff

Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
-.Dd June 4, 2020
+.Dd XXX
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -587,7 +587,7 @@
reception of extended frames, tag processing in hardware,
frame filtering in hardware, checksum offloading, or TSO on VLAN,
respectively.
-Note that this must be issued on a physical interface associated with
+Note that this must be configured on a physical interface associated with
.Xr vlan 4 ,
not on a
.Xr vlan 4
@@ -597,6 +597,21 @@
reception of extended frames, tag processing in hardware,
frame filtering in hardware, or TSO on VLAN,
respectively.
+.It Cm vxlanhwcsum , vxlanhwtso
+If the driver offers user-configurable VXLAN support, enable inner checksum
+offloading (receive and transmit) or TSO on VXLAN, respectively.
+Note that this must be configured on a physical interface associated with
+.Xr vxlan 4 ,
+not on a
+.Xr vxlan 4
+interface itself.
+The physical interface is either the interface specified as the vxlandev
+or the interface hosting the vxlanlocal address.
+The driver will offload as much checksum work and TSO as it can reliably
+support; the exact level of offloading may vary between drivers.
+.It Fl vxlanhwcsum , vxlanhwtso
+If the driver offers user-configurable VXLAN support, disable inner checksum
+offloading (receive and transmit) or TSO on VXLAN, respectively.
.It Cm vnet Ar jail
Move the interface to the
.Xr jail 8 ,
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1344,7 +1344,8 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6" \
+"\36VXLAN_HWCSUM\37VXLAN_HWTSO"
/*
* Print the status of the interface. If an address family was
Index: sbin/ifconfig/ifvxlan.c
===================================================================
--- sbin/ifconfig/ifvxlan.c
+++ sbin/ifconfig/ifvxlan.c
@@ -620,6 +620,11 @@
DEF_CMD("vxlanflush", 0, setvxlan_flush),
DEF_CMD("vxlanflushall", 1, setvxlan_flush),
+
+ DEF_CMD("vxlanhwcsum", IFCAP_VXLAN_HWCSUM, setifcap),
+ DEF_CMD("-vxlanhwcsum", -IFCAP_VXLAN_HWCSUM, setifcap),
+ DEF_CMD("vxlanhwtso", IFCAP_VXLAN_HWTSO, setifcap),
+ DEF_CMD("-vxlanhwtso", -IFCAP_VXLAN_HWTSO, setifcap),
};
static struct afswtch af_vxlan = {
Index: share/man/man4/cxgbe.4
===================================================================
--- share/man/man4/cxgbe.4
+++ share/man/man4/cxgbe.4
@@ -31,7 +31,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd Dec 10, 2019
+.Dd XXX
.Dt CXGBE 4
.Os
.Sh NAME
@@ -61,8 +61,8 @@
the Chelsio Terminator 4, Terminator 5, and Terminator 6 ASICs (T4, T5, and T6).
The driver supports Jumbo Frames, Transmit/Receive checksum offload,
TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN
-tag insertion/extraction, VLAN checksum offload, VLAN TSO, and
-Receive Side Steering (RSS).
+tag insertion/extraction, VLAN checksum offload, VLAN TSO, VXLAN checksum
+offload, VXLAN TSO, and Receive Side Steering (RSS).
For further hardware information and questions related to hardware
requirements, see
.Pa http://www.chelsio.com/ .
Index: share/man/man4/vxlan.4
===================================================================
--- share/man/man4/vxlan.4
+++ share/man/man4/vxlan.4
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd December 31, 2017
+.Dd XXX
.Dt VXLAN 4
.Os
.Sh NAME
@@ -182,6 +182,39 @@
.Nm
interface to allow the encapsulated frame to fit in the
current MTU of the physical network.
+.Sh HARDWARE
+The
+.Nm
+driver supports hardware checksum offload (receive and transmit) and TSO on the
+encapsulated traffic over physical interfaces that support these features.
+The
+.Nm
+interface examines the
+.Cm vxlandev
+interface, if one is specified, or the interface hosting the
+.Cm vxlanlocal
+address, and configures its capabilities based on the hardware offload
+capabilities of that physical interface.
+If multiple physical interfaces will transmit or receive traffic for the
+.Nm
+interface then they all must have the same hardware capabilities.
+The transmit routine of a
+.Nm
+interface may fail with
+.Er ENXIO
+if an outbound physical interface does not support
+an offload that the
+.Nm
+interface is requesting.
+This can happen if there are multiple physical interfaces involved, with
+different hardware capabilities, or an interface capability was disabled after
+the
+.Nm
+interface had already started.
+.Pp
+At present, these devices are capable of generating checksums and performing TSO
+on the inner frames in hardware:
+.Xr cxgbe 4 .
.Sh EXAMPLES
Create a
.Nm
@@ -244,3 +277,7 @@
.Nm
driver was written by
.An Bryan Venteicher Aq bryanv@freebsd.org .
+Support for stateless hardware offloads was added by
+.An Navdeep Parhar Aq np@freebsd.org
+in
+.Fx 13.0 .
Index: share/man/man9/EVENTHANDLER.9
===================================================================
--- share/man/man9/EVENTHANDLER.9
+++ share/man/man9/EVENTHANDLER.9
@@ -23,7 +23,7 @@
.\" SUCH DAMAGE.
.\" $FreeBSD$
.\"
-.Dd October 21, 2018
+.Dd October XXX
.Dt EVENTHANDLER 9
.Os
.Sh NAME
@@ -389,6 +389,10 @@
Callback invoked when a vlan is destroyed.
.It Vt vm_lowmem
Callbacks invoked when virtual memory is low.
+.It Vt vxlan_start
+Callback invoked when a vxlan interface starts.
+.It Vt vxlan_stop
+Callback invoked when a vxlan interface stops.
.It Vt watchdog_list
Callbacks invoked when the system watchdog timer is reinitialized.
.El
Index: sys/dev/cxgbe/adapter.h
===================================================================
--- sys/dev/cxgbe/adapter.h
+++ sys/dev/cxgbe/adapter.h
@@ -119,6 +119,7 @@
TX_SGL_SEGS = 39,
TX_SGL_SEGS_TSO = 38,
TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */
+ TX_SGL_SEGS_VXLAN_TSO = 37,
TX_WR_FLITS = SGE_MAX_WR_LEN / 8
};
@@ -286,6 +287,7 @@
int nvi;
int up_vis;
int uld_vis;
+ bool vxlan_tcam_entry;
struct tx_sched_params *sched_params;
@@ -593,6 +595,8 @@
uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */
uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */
uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */
+ uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */
+ uint64_t vxlan_txcsum;
uint64_t kern_tls_records;
uint64_t kern_tls_short;
@@ -625,6 +629,7 @@
uint64_t rxcsum; /* # of times hardware assisted with checksum */
uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
+ uint64_t vxlan_rxcsum;
/* stats for not-that-common events */
@@ -847,6 +852,11 @@
struct sge sge;
int lro_timeout;
int sc_do_rxcopy;
+
+ int vxlan_port;
+ u_int vxlan_refcount;
+ int rawf_base;
+ int nrawf;
struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */
struct task async_event_task;
Index: sys/dev/cxgbe/common/common.h
===================================================================
--- sys/dev/cxgbe/common/common.h
+++ sys/dev/cxgbe/common/common.h
@@ -247,7 +247,7 @@
uint32_t vlan_pri_map;
uint32_t ingress_config;
uint64_t hash_filter_mask;
- __be16 err_vec_mask;
+ bool rx_pkt_encap;
int8_t fcoe_shift;
int8_t port_shift;
Index: sys/dev/cxgbe/common/t4_hw.c
===================================================================
--- sys/dev/cxgbe/common/t4_hw.c
+++ sys/dev/cxgbe/common/t4_hw.c
@@ -9627,19 +9627,11 @@
read_filter_mode_and_ingress_config(adap, sleep_ok);
- /*
- * Cache a mask of the bits that represent the error vector portion of
- * rx_pkt.err_vec. T6+ can use a compressed error vector to make room
- * for information about outer encapsulation (GENEVE/VXLAN/NVGRE).
- */
- tpp->err_vec_mask = htobe16(0xffff);
if (chip_id(adap) > CHELSIO_T5) {
v = t4_read_reg(adap, A_TP_OUT_CONFIG);
- if (v & F_CRXPKTENC) {
- tpp->err_vec_mask =
- htobe16(V_T6_COMPR_RXERR_VEC(M_T6_COMPR_RXERR_VEC));
- }
- }
+ tpp->rx_pkt_encap = v & F_CRXPKTENC;
+ } else
+ tpp->rx_pkt_encap = false;
return 0;
}
Index: sys/dev/cxgbe/firmware/t6fw_cfg.txt
===================================================================
--- sys/dev/cxgbe/firmware/t6fw_cfg.txt
+++ sys/dev/cxgbe/firmware/t6fw_cfg.txt
@@ -146,7 +146,8 @@
nethctrl = 1024
neq = 2048
nqpcq = 8192
- nexactf = 456
+ nexactf = 454
+ nrawf = 2
cmask = all
pmask = all
ncrypto_lookaside = 16
@@ -272,7 +273,7 @@
[fini]
version = 0x1
- checksum = 0x4528a6ac
+ checksum = 0x82be65fd
#
# $FreeBSD$
#
Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -42,6 +42,7 @@
#include <sys/priv.h>
#include <sys/kernel.h>
#include <sys/bus.h>
+#include <sys/eventhandler.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/queue.h>
@@ -1068,6 +1069,8 @@
TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
#endif
+ refcount_init(&sc->vxlan_refcount, 0);
+
rc = t4_map_bars_0_and_4(sc);
if (rc != 0)
goto done; /* error message displayed already */
@@ -1715,6 +1718,7 @@
struct ifnet *ifp;
struct sbuf *sb;
struct pfil_head_args pa;
+ struct adapter *sc = vi->adapter;
vi->xact_addr_filt = -1;
callout_init(&vi->tick, 1);
@@ -1748,28 +1752,36 @@
ifp->if_capabilities = T4_CAP;
ifp->if_capenable = T4_CAP_ENABLE;
+ ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
+ CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
+ if (chip_id(sc) >= CHELSIO_T6) {
+ ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+ ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+ ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
+ CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN;
+ }
+
#ifdef TCP_OFFLOAD
- if (vi->nofldrxq != 0 && (vi->adapter->flags & KERN_TLS_OK) == 0)
+ if (vi->nofldrxq != 0 && (sc->flags & KERN_TLS_OK) == 0)
ifp->if_capabilities |= IFCAP_TOE;
#endif
#ifdef RATELIMIT
- if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) {
+ if (is_ethoffload(sc) && vi->nofldtxq != 0) {
ifp->if_capabilities |= IFCAP_TXRTLMT;
ifp->if_capenable |= IFCAP_TXRTLMT;
}
#endif
- ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
- CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
ifp->if_hw_tsomax = IP_MAXPACKET;
ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
#ifdef RATELIMIT
- if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0)
+ if (is_ethoffload(sc) && vi->nofldtxq != 0)
ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
#endif
ifp->if_hw_tsomaxsegsize = 65536;
#ifdef KERN_TLS
- if (vi->adapter->flags & KERN_TLS_OK) {
+ if (sc->flags & KERN_TLS_OK) {
ifp->if_capabilities |= IFCAP_TXTLS;
ifp->if_capenable |= IFCAP_TXTLS;
}
@@ -2099,6 +2111,17 @@
if (mask & IFCAP_TXTLS)
ifp->if_capenable ^= (mask & IFCAP_TXTLS);
#endif
+ if (mask & IFCAP_VXLAN_HWCSUM) {
+ ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
+ ifp->if_hwassist ^= CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP |
+ CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP;
+ }
+ if (mask & IFCAP_VXLAN_HWTSO) {
+ ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
+ ifp->if_hwassist ^= CSUM_INNER_IP6_TSO |
+ CSUM_INNER_IP_TSO;
+ }
#ifdef VLAN_CAPABILITIES
VLAN_CAPABILITIES(ifp);
@@ -4410,6 +4433,19 @@
MPASS(sc->tids.hpftid_base == 0);
MPASS(sc->tids.tid_base == sc->tids.nhpftids);
}
+
+ param[0] = FW_PARAM_PFVF(RAWF_START);
+ param[1] = FW_PARAM_PFVF(RAWF_END);
+ rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
+ if (rc != 0) {
+ device_printf(sc->dev,
+ "failed to query rawf parameters: %d.\n", rc);
+ return (rc);
+ }
+ if ((int)val[1] > (int)val[0]) {
+ sc->rawf_base = val[0];
+ sc->nrawf = val[1] - val[0] + 1;
+ }
}
/*
@@ -5141,6 +5177,7 @@
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
+ uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
ASSERT_SYNCHRONIZED_OP(sc);
KASSERT(flags, ("%s: not told what to update.", __func__));
@@ -5214,7 +5251,7 @@
rc = -rc;
for (j = 0; j < ctx.i; j++) {
if_printf(ifp,
- "failed to add mc address"
+ "failed to add mcast address"
" %02x:%02x:%02x:"
"%02x:%02x:%02x rc=%d\n",
ctx.mcaddr[j][0], ctx.mcaddr[j][1],
@@ -5224,14 +5261,36 @@
}
return (rc);
}
+ ctx.del = 0;
} else
NET_EPOCH_EXIT(et);
rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0);
if (rc != 0)
- if_printf(ifp, "failed to set mc address hash: %d", rc);
+ if_printf(ifp, "failed to set mcast address hash: %d\n",
+ rc);
+ if (ctx.del == 0) {
+ /* We clobbered the VXLAN entry if there was one. */
+ pi->vxlan_tcam_entry = false;
+ }
}
+ if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 &&
+ pi->vxlan_tcam_entry == false) {
+ rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac,
+ match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
+ true);
+ if (rc < 0) {
+ rc = -rc;
+ if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n",
+ rc);
+ } else {
+ MPASS(rc == sc->rawf_base + pi->port_id);
+ rc = 0;
+ pi->vxlan_tcam_entry = true;
+ }
+ }
+
return (rc);
}
@@ -10374,6 +10433,7 @@
#endif
rxq->rxcsum = 0;
rxq->vlan_extraction = 0;
+ rxq->vxlan_rxcsum = 0;
rxq->fl.cl_allocated = 0;
rxq->fl.cl_recycled = 0;
@@ -10392,6 +10452,8 @@
txq->txpkts0_pkts = 0;
txq->txpkts1_pkts = 0;
txq->raw_wrs = 0;
+ txq->vxlan_tso_wrs = 0;
+ txq->vxlan_txcsum = 0;
txq->kern_tls_records = 0;
txq->kern_tls_short = 0;
txq->kern_tls_partial = 0;
@@ -11202,6 +11264,116 @@
}
#endif
+static eventhandler_tag vxlan_start_evtag;
+static eventhandler_tag vxlan_stop_evtag;
+
+/*
+ * Argument bundle built by the vxlan_start/vxlan_stop event handlers and
+ * handed to the per-adapter callbacks through t4_iterate().
+ */
+struct vxlan_evargs {
+ /* Interface received by the event handler. */
+ struct ifnet *ifp;
+ /*
+ * Port number received by the event handler (a u_int there, stored
+ * here as 16 bits); presumably the VXLAN UDP port -- TODO confirm.
+ */
+ uint16_t port;
+};
+
+/*
+ * Per-adapter callback for the vxlan_start event (invoked through
+ * t4_iterate()); arg points to a struct vxlan_evargs.
+ *
+ * Does nothing on T5 and earlier chips or when the adapter has no raw
+ * filters (nrawf == 0).  Otherwise:
+ *  - the first start records the port, writes it to A_MPS_RX_VXLAN_TYPE
+ *    together with F_VXLAN_EN, and tries to add a match-all raw MAC filter
+ *    for every port that does not already have its VXLAN TCAM entry;
+ *  - another start for the same port only takes an extra reference;
+ *  - a start for a different port is logged and otherwise ignored.
+ */
+static void
+t4_vxlan_start(struct adapter *sc, void *arg)
+{
+	struct vxlan_evargs *v = arg;
+	struct port_info *pi;
+	uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
+	int i, rc;
+
+	if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
+		return;
+	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0)
+		return;
+
+	if (sc->vxlan_refcount == 0) {
+		sc->vxlan_port = v->port;
+		sc->vxlan_refcount = 1;
+		t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE,
+		    V_VXLAN(v->port) | F_VXLAN_EN);
+		for_each_port(sc, i) {
+			pi = sc->port[i];
+			if (pi->vxlan_tcam_entry == true)
+				continue;
+			/* All-zero address and mask: the entry matches any MAC. */
+			rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid,
+			    match_all_mac, match_all_mac,
+			    sc->rawf_base + pi->port_id, 1, pi->port_id, true);
+			if (rc < 0) {
+				rc = -rc;
+				/*
+				 * Use the name with the unit number so that the
+				 * message identifies the port that failed.
+				 */
+				log(LOG_ERR,
+				    "%s: failed to add VXLAN TCAM entry: %d.\n",
+				    device_get_nameunit(pi->vi[0].dev), rc);
+			} else {
+				MPASS(rc == sc->rawf_base + pi->port_id);
+				rc = 0;
+				pi->vxlan_tcam_entry = true;
+			}
+		}
+	} else if (sc->vxlan_port == v->port) {
+		sc->vxlan_refcount++;
+	} else {
+		log(LOG_ERR, "%s: VXLAN already configured on port %d; "
+		    "ignoring attempt to configure it on port %d\n",
+		    device_get_nameunit(sc->dev), sc->vxlan_port, v->port);
+	}
+	end_synchronized_op(sc, 0);
+}
+
+/*
+ * Per-adapter callback for the vxlan_stop event; arg points to a struct
+ * vxlan_evargs.  Releases one reference on the adapter's VXLAN port and
+ * clears F_VXLAN_EN in A_MPS_RX_VXLAN_TYPE once no references remain.
+ * Stops for a port other than the recorded one are ignored.
+ */
+static void
+t4_vxlan_stop(struct adapter *sc, void *arg)
+{
+	struct vxlan_evargs *ev = arg;
+
+	if (chip_id(sc) <= CHELSIO_T5 || sc->nrawf == 0)
+		return;
+	if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0)
+		return;
+
+	/*
+	 * A VXLAN set up before this driver was loaded yields a stop with no
+	 * matching start, so stops can outnumber starts.  That case is not
+	 * dealt with properly; the refcount is merely kept from underflowing.
+	 */
+	if (sc->vxlan_port == ev->port) {
+		if (sc->vxlan_refcount > 0) {
+			sc->vxlan_refcount--;
+			if (sc->vxlan_refcount == 0) {
+				t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE,
+				    F_VXLAN_EN, 0);
+			}
+		} else {
+			log(LOG_ERR,
+			    "%s: VXLAN operation on port %d was stopped earlier; "
+			    "ignoring attempt to stop it again.\n",
+			    device_get_nameunit(sc->dev), sc->vxlan_port);
+		}
+	}
+
+	end_synchronized_op(sc, 0);
+}
+
+/*
+ * Handler for the vxlan_start event.  Packs the interface and port into a
+ * struct vxlan_evargs and passes it, with t4_vxlan_start as the callback, to
+ * t4_iterate().
+ */
+static void
+t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp,
+    sa_family_t family, u_int port)
+{
+	struct vxlan_evargs evargs = {
+		.ifp = ifp,
+		.port = port,
+	};
+
+	MPASS(family == AF_INET || family == AF_INET6);
+
+	t4_iterate(t4_vxlan_start, &evargs);
+}
+
+/*
+ * Handler for the vxlan_stop event.  Packs the interface and port into a
+ * struct vxlan_evargs and passes it, with t4_vxlan_stop as the callback, to
+ * t4_iterate().
+ */
+static void
+t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family,
+    u_int port)
+{
+	struct vxlan_evargs evargs = {
+		.ifp = ifp,
+		.port = port,
+	};
+
+	MPASS(family == AF_INET || family == AF_INET6);
+
+	t4_iterate(t4_vxlan_stop, &evargs);
+}
+
+
static struct sx mlu; /* mod load unload */
SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
@@ -11245,6 +11417,14 @@
#endif
t4_tracer_modload();
tweak_tunables();
+ vxlan_start_evtag =
+ EVENTHANDLER_REGISTER(vxlan_start,
+ t4_vxlan_start_handler, NULL,
+ EVENTHANDLER_PRI_ANY);
+ vxlan_stop_evtag =
+ EVENTHANDLER_REGISTER(vxlan_stop,
+ t4_vxlan_stop_handler, NULL,
+ EVENTHANDLER_PRI_ANY);
}
sx_xunlock(&mlu);
break;
@@ -11281,6 +11461,10 @@
sx_sunlock(&t4_list_lock);
if (t4_sge_extfree_refs() == 0) {
+ EVENTHANDLER_DEREGISTER(vxlan_start,
+ vxlan_start_evtag);
+ EVENTHANDLER_DEREGISTER(vxlan_stop,
+ vxlan_stop_evtag);
t4_tracer_modunload();
#ifdef KERN_TLS
t6_ktls_modunload();
Index: sys/dev/cxgbe/t4_sge.c
===================================================================
--- sys/dev/cxgbe/t4_sge.c
+++ sys/dev/cxgbe/t4_sge.c
@@ -55,6 +55,7 @@
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>
+#include <net/if_vxlan.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
@@ -266,8 +267,9 @@
static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
static inline void get_pkt_gl(struct mbuf *, struct sglist *);
-static inline u_int txpkt_len16(u_int, u_int);
-static inline u_int txpkt_vm_len16(u_int, u_int);
+static inline u_int txpkt_len16(u_int, const u_int);
+static inline u_int txpkt_vm_len16(u_int, const u_int);
+static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *);
static inline u_int txpkts0_len16(u_int);
static inline u_int txpkts1_len16(void);
static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
@@ -1911,13 +1913,42 @@
#if defined(INET) || defined(INET6)
struct lro_ctrl *lro = &rxq->lro;
#endif
+ uint16_t err_vec, tnl_type, tnlhdr_len;
static const int sw_hashtype[4][2] = {
{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
};
+ static const int sw_csum_flags[2][2] = {
+ {
+ /* IP, inner IP */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L3_CALC | CSUM_L3_VALID |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ /* IP, inner IP6 */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L3_CALC | CSUM_L3_VALID |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ },
+ {
+ /* IP6, inner IP */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+
+ /* IP6, inner IP6 */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ },
+ };
+
MPASS(plen > sc->params.sge.fl_pktshift);
if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
__predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
@@ -1957,23 +1988,73 @@
m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
cpl = (const void *)(&d->rss + 1);
- if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
- if (ifp->if_capenable & IFCAP_RXCSUM &&
- cpl->l2info & htobe32(F_RXF_IP)) {
- m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
- CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ if (sc->params.tp.rx_pkt_encap) {
+ const uint16_t ev = be16toh(cpl->err_vec);
+
+ err_vec = G_T6_COMPR_RXERR_VEC(ev);
+ tnl_type = G_T6_RX_TNL_TYPE(ev);
+ tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
+ } else {
+ err_vec = be16toh(cpl->err_vec);
+ tnl_type = 0;
+ tnlhdr_len = 0;
+ }
+ if (cpl->csum_calc && err_vec == 0) {
+ int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
+
+ /* checksum(s) calculated and found to be correct. */
+
+ MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
+ (cpl->l2info & htobe32(F_RXF_IP6)));
+ m0->m_pkthdr.csum_data = be16toh(cpl->csum);
+ if (tnl_type == 0) {
+ if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
+ m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+ CSUM_L3_VALID | CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ } else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+ m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ }
rxq->rxcsum++;
- } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
- cpl->l2info & htobe32(F_RXF_IP6)) {
- m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
- CSUM_PSEUDO_HDR);
- rxq->rxcsum++;
- }
+ } else {
+ MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
+ if (__predict_false(cpl->ip_frag)) {
+ /*
+ * csum_data is for the inner frame (which is an
+ * IP fragment) and is not 0xffff. There is no
+ * way to pass the inner csum_data to the stack.
+ * We don't want the stack to use the inner
+ * csum_data to validate the outer frame or it
+ * will get rejected. So we fix csum_data here
+ * and let sw do the checksum of inner IP
+ * fragments.
+ *
+ * XXX: Need 32b for csum_data2 in an rx mbuf.
+ * Maybe stuff it into rcv_tstmp?
+ */
+ m0->m_pkthdr.csum_data = 0xffff;
+ if (ipv6) {
+ m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ } else {
+ m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+ CSUM_L3_VALID | CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ }
+ } else {
+ int outer_ipv6;
- if (__predict_false(cpl->ip_frag))
- m0->m_pkthdr.csum_data = be16toh(cpl->csum);
- else
- m0->m_pkthdr.csum_data = 0xffff;
+ MPASS(m0->m_pkthdr.csum_data == 0xffff);
+
+ outer_ipv6 = tnlhdr_len >=
+ sizeof(struct ether_header) +
+ sizeof(struct ip6_hdr);
+ m0->m_pkthdr.csum_flags =
+ sw_csum_flags[outer_ipv6][ipv6];
+ }
+ rxq->vxlan_rxcsum++;
+ }
}
if (cpl->vlan_ex) {
@@ -2001,7 +2082,7 @@
m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
#endif
#if defined(INET) || defined(INET6)
- if (rxq->iq.flags & IQ_LRO_ENABLED &&
+ if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
(M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
if (sort_before_lro(lro)) {
@@ -2172,10 +2253,10 @@
{
M_ASSERTPKTHDR(m);
- KASSERT(m->m_pkthdr.l5hlen > 0,
+ KASSERT(m->m_pkthdr.inner_l5hlen > 0,
("%s: mbuf %p missing information on # of segments.", __func__, m));
- return (m->m_pkthdr.l5hlen);
+ return (m->m_pkthdr.inner_l5hlen);
}
static inline void
@@ -2183,7 +2264,7 @@
{
M_ASSERTPKTHDR(m);
- m->m_pkthdr.l5hlen = nsegs;
+ m->m_pkthdr.inner_l5hlen = nsegs;
}
static inline int
@@ -2309,63 +2390,108 @@
return (m);
}
-static inline int
+static inline bool
needs_hwcsum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
+ CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP |
- CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
+static inline bool
needs_tso(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
+ CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & CSUM_TSO);
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
+/* True when CSUM_ENCAP_VXLAN is set in the packet header's csum_flags. */
+static inline bool
+needs_vxlan_csum(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+
+	return ((m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN) != 0);
+}
+
+/*
+ * True when at least one inner TSO flag (CSUM_INNER_IP_TSO or
+ * CSUM_INNER_IP6_TSO) is set in csum_flags.  CSUM_ENCAP_VXLAN on its own,
+ * with no inner TSO flag, does not count.
+ */
+static inline bool
+needs_vxlan_tso(struct mbuf *m)
+{
+	const uint32_t mask = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO |
+	    CSUM_ENCAP_VXLAN;
+	uint32_t hit;
+
+	M_ASSERTPKTHDR(m);
+
+	hit = m->m_pkthdr.csum_flags & mask;
+	if (hit == 0 || hit == CSUM_ENCAP_VXLAN)
+		return (false);
+
+	return (true);
+}
+
+/*
+ * True when an inner TSO flag is set in csum_flags.  Note that only
+ * CSUM_INNER_IP_TSO and CSUM_INNER_IP6_TSO are tested here.
+ */
+static inline bool
+needs_inner_tcp_csum(struct mbuf *m)
+{
+
+	M_ASSERTPKTHDR(m);
+
+	return ((m->m_pkthdr.csum_flags &
+	    (CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO)) != 0);
+}
+
+static inline bool
needs_l3_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
+ CSUM_INNER_IP_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
-needs_tcp_csum(struct mbuf *m)
+static inline bool
+needs_outer_tcp_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
+ CSUM_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
+
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
#ifdef RATELIMIT
-static inline int
-needs_l4_csum(struct mbuf *m)
+static inline bool
+needs_outer_l4_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
+ CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
- CSUM_TCP_IPV6 | CSUM_TSO));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
-needs_udp_csum(struct mbuf *m)
+static inline bool
+needs_outer_udp_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
+
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
#endif
-static inline int
+static inline bool
needs_vlan_insertion(struct mbuf *m)
{
@@ -2506,6 +2632,23 @@
}
/*
+ * The maximum number of segments that can fit in a WR.
+ */
+static int
+max_nsegs_allowed(struct mbuf *m)
+{
+
+	/* Without TSO the plain limit applies. */
+	if (!needs_tso(m))
+		return (TX_SGL_SEGS);
+
+	/* TSO: the limit depends on whether it is VXLAN TSO. */
+	return (needs_vxlan_tso(m) ? TX_SGL_SEGS_VXLAN_TSO : TX_SGL_SEGS_TSO);
+}
+
+/*
* Analyze the mbuf to determine its tx needs. The mbuf passed in may change:
* a) caller can assume it's been freed if this function returns with an error.
* b) it may get defragged up if the gather list is too long for the hardware.
@@ -2563,7 +2706,7 @@
return (0);
}
#endif
- if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
+ if (nsegs > max_nsegs_allowed(m0)) {
if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
rc = EFBIG;
goto fail;
@@ -2585,18 +2728,15 @@
}
set_mbuf_nsegs(m0, nsegs);
set_mbuf_cflags(m0, cflags);
- if (sc->flags & IS_VF)
- set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
- else
- set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
+ calculate_mbuf_len16(sc, m0);
#ifdef RATELIMIT
/*
* Ethofld is limited to TCP and UDP for now, and only when L4 hw
- * checksumming is enabled. needs_l4_csum happens to check for all the
- * right things.
+ * checksumming is enabled. needs_outer_l4_csum happens to check for
+ * all the right things.
*/
- if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
+ if (__predict_false(needs_eo(cst) && !needs_outer_l4_csum(m0))) {
m_snd_tag_rele(m0->m_pkthdr.snd_tag);
m0->m_pkthdr.snd_tag = NULL;
m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
@@ -2628,21 +2768,27 @@
switch (eh_type) {
#ifdef INET6
case ETHERTYPE_IPV6:
- {
- struct ip6_hdr *ip6 = l3hdr;
-
- MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
-
- m0->m_pkthdr.l3hlen = sizeof(*ip6);
+ m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
break;
- }
#endif
#ifdef INET
case ETHERTYPE_IP:
{
struct ip *ip = l3hdr;
- m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
+ if (needs_vxlan_csum(m0)) {
+ /* Driver will do the outer IP hdr checksum. */
+ ip->ip_sum = 0;
+ if (needs_vxlan_tso(m0)) {
+ const uint16_t ipl = ip->ip_len;
+
+ ip->ip_len = 0;
+ ip->ip_sum = ~in_cksum_hdr(ip);
+ ip->ip_len = ipl;
+ } else
+ ip->ip_sum = in_cksum_hdr(ip);
+ }
+ m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
break;
}
#endif
@@ -2652,8 +2798,59 @@
__func__, eh_type);
}
+ if (needs_vxlan_csum(m0)) {
+ m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
+ m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
+
+ /* Inner headers. */
+ eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
+ sizeof(struct udphdr) + sizeof(struct vxlan_header));
+ eh_type = ntohs(eh->ether_type);
+ if (eh_type == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *evh = (void *)eh;
+
+ eh_type = ntohs(evh->evl_proto);
+ m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
+ } else
+ m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
+ l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
+
+ switch (eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
+ break;
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ struct ip *ip = l3hdr;
+
+ m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
+ break;
+ }
+#endif
+ default:
+ panic("%s: VXLAN hw offload requested with unknown "
+ "ethertype 0x%04x. if_cxgbe must be compiled"
+ " with the same INET/INET6 options as the kernel.",
+ __func__, eh_type);
+ }
#if defined(INET) || defined(INET6)
- if (needs_tcp_csum(m0)) {
+ if (needs_inner_tcp_csum(m0)) {
+ tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
+ m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
+ }
+#endif
+ MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+ m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
+ CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
+ CSUM_ENCAP_VXLAN;
+ }
+
+#if defined(INET) || defined(INET6)
+ if (needs_outer_tcp_csum(m0)) {
tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
m0->m_pkthdr.l4hlen = tcp->th_off * 4;
#ifdef RATELIMIT
@@ -2663,7 +2860,7 @@
V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
} else
set_mbuf_eo_tsclk_tsoff(m0, 0);
- } else if (needs_udp_csum(m0)) {
+ } else if (needs_outer_udp_csum(m0)) {
m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
#endif
}
@@ -3618,6 +3815,9 @@
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
CTLFLAG_RD, &rxq->vlan_extraction,
"# of times hardware extracted 802.1Q tag");
+	/* Read-only per-rxq counter: rxq->vxlan_rxcsum. */
+	SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum",
+	    CTLFLAG_RD, &rxq->vxlan_rxcsum,
+	    "# of times hardware assisted with inner checksums (VXLAN)");
add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
@@ -4272,6 +4472,11 @@
"# of frames tx'd using type1 txpkts work requests");
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
&txq->raw_wrs, "# of raw work requests (non-packets)");
+ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",
+ CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
+ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum",
+ CTLFLAG_RD, &txq->vxlan_txcsum,
+ "# of times hardware assisted with inner checksums (VXLAN)");
#ifdef KERN_TLS
if (sc->flags & KERN_TLS_OK) {
@@ -4561,27 +4766,25 @@
KASSERT(gl->sg_nseg == mbuf_nsegs(m),
("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
mbuf_nsegs(m), gl->sg_nseg));
- KASSERT(gl->sg_nseg > 0 &&
- gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
+ KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m),
("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
- gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
+ gl->sg_nseg, max_nsegs_allowed(m)));
}
/*
* len16 for a txpkt WR with a GL. Includes the firmware work request header.
*/
static inline u_int
-txpkt_len16(u_int nsegs, u_int tso)
+txpkt_len16(u_int nsegs, const u_int extra)
{
u_int n;
MPASS(nsegs > 0);
nsegs--; /* first segment is part of ulptx_sgl */
- n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
+ n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
+ sizeof(struct cpl_tx_pkt_core) +
sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
- if (tso)
- n += sizeof(struct cpl_tx_pkt_lso_core);
return (howmany(n, 16));
}
@@ -4591,22 +4794,43 @@
* request header.
*/
static inline u_int
-txpkt_vm_len16(u_int nsegs, u_int tso)
+txpkt_vm_len16(u_int nsegs, const u_int extra)
{
u_int n;
MPASS(nsegs > 0);
nsegs--; /* first segment is part of ulptx_sgl */
- n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
+ n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
sizeof(struct cpl_tx_pkt_core) +
sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
- if (tso)
- n += sizeof(struct cpl_tx_pkt_lso_core);
return (howmany(n, 16));
}
+/*
+ * Work out the len16 of the tx work request that will carry this mbuf and
+ * store it in the mbuf via set_mbuf_len16().  The result depends on the WR
+ * flavour (VF vs. PF) and on how much room an LSO CPL, if any, needs in
+ * addition to the basic tx_pkt WR.
+ */
+static inline void
+calculate_mbuf_len16(struct adapter *sc, struct mbuf *m)
+{
+	const int is_vf = (sc->flags & IS_VF) != 0;
+	u_int extra = 0;
+
+	if (needs_tso(m)) {
+		/* The tunnel LSO CPL is only ever chosen on the non-VF path. */
+		if (!is_vf && needs_vxlan_tso(m))
+			extra = sizeof(struct cpl_tx_tnl_lso);
+		else
+			extra = sizeof(struct cpl_tx_pkt_lso_core);
+	}
+
+	if (is_vf)
+		set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), extra));
+	else
+		set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), extra));
+}
+
/*
* len16 for a txpkts type 0 WR with a GL. Does not include the firmware work
* request header.
@@ -4655,51 +4879,162 @@
csum_to_ctrl(struct adapter *sc, struct mbuf *m)
{
uint64_t ctrl;
- int csum_type;
+ int csum_type, l2hlen, l3hlen;
+ int x, y;
+ static const int csum_types[3][2] = {
+ {TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
+ {TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
+ {TX_CSUM_IP, 0}
+ };
M_ASSERTPKTHDR(m);
- if (needs_hwcsum(m) == 0)
+ if (!needs_hwcsum(m))
return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
+ MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
+ MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));
+
+ if (needs_vxlan_csum(m)) {
+ MPASS(m->m_pkthdr.l4hlen > 0);
+ MPASS(m->m_pkthdr.l5hlen > 0);
+ MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
+ MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));
+
+ l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
+ m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
+ m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
+ l3hlen = m->m_pkthdr.inner_l3hlen;
+ } else {
+ l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
+ l3hlen = m->m_pkthdr.l3hlen;
+ }
+
ctrl = 0;
- if (needs_l3_csum(m) == 0)
+ if (!needs_l3_csum(m))
ctrl |= F_TXPKT_IPCSUM_DIS;
- switch (m->m_pkthdr.csum_flags &
- (CSUM_IP_TCP | CSUM_IP_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP)) {
- case CSUM_IP_TCP:
- csum_type = TX_CSUM_TCPIP;
- break;
- case CSUM_IP_UDP:
- csum_type = TX_CSUM_UDPIP;
- break;
- case CSUM_IP6_TCP:
- csum_type = TX_CSUM_TCPIP6;
- break;
- case CSUM_IP6_UDP:
- csum_type = TX_CSUM_UDPIP6;
- break;
- default:
- /* needs_hwcsum told us that at least some hwcsum is needed. */
- MPASS(ctrl == 0);
- MPASS(m->m_pkthdr.csum_flags & CSUM_IP);
- ctrl |= F_TXPKT_L4CSUM_DIS;
- csum_type = TX_CSUM_IP;
- break;
- }
- MPASS(m->m_pkthdr.l2hlen > 0);
- MPASS(m->m_pkthdr.l3hlen > 0);
- ctrl |= V_TXPKT_CSUM_TYPE(csum_type) |
- V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
+ if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
+ CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
+ x = 0; /* TCP */
+ else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
+ CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
+ x = 1; /* UDP */
+ else
+ x = 2;
+
+ if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
+ CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
+ y = 0; /* IPv4 */
+ else {
+ MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
+ y = 1; /* IPv6 */
+ }
+ /*
+ * needs_hwcsum returned true earlier so there must be some kind of
+ * checksum to calculate.
+ */
+ csum_type = csum_types[x][y];
+ MPASS(csum_type != 0);
+ if (csum_type == TX_CSUM_IP)
+ ctrl |= F_TXPKT_L4CSUM_DIS;
+ ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
if (chip_id(sc) <= CHELSIO_T5)
- ctrl |= V_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
+ ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
else
- ctrl |= V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
+ ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);
return (ctrl);
}
+/*
+ * Fill in a CPL_TX_PKT_LSO at 'cpl' from the header lengths, MSS and total
+ * length recorded in m0's packet header.  Header lengths are written in units
+ * of 4 bytes.  Returns the address immediately after the LSO CPL.
+ */
+static inline void *
+write_lso_cpl(void *cpl, struct mbuf *m0)
+{
+	struct cpl_tx_pkt_lso_core *lso = cpl;
+	uint32_t ctrl;
+
+	KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+	    m0->m_pkthdr.l4hlen > 0,
+	    ("%s: mbuf %p needs TSO but missing header lengths",
+	    __func__, m0));
+
+	/* The whole packet is described by a single (first and last) slice. */
+	ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO);
+	ctrl |= F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE;
+	ctrl |= V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2);
+	ctrl |= V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2);
+	ctrl |= V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
+	/* An L3 header exactly the size of struct ip6_hdr is taken as IPv6. */
+	if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
+		ctrl |= F_LSO_IPV6;
+
+	lso->lso_ctrl = htobe32(ctrl);
+	lso->ipid_ofst = htobe16(0);
+	lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
+	lso->seqno_offset = htobe32(0);
+	lso->len = htobe32(m0->m_pkthdr.len);
+
+	return (lso + 1);
+}
+
+/*
+ * Fill in a CPL_TX_TNL_LSO (tunnel type VXLAN) at 'cpl' from the outer and
+ * inner header lengths, MSS and total length recorded in m0's packet header.
+ * Both the outer (l2hlen..l5hlen) and the inner (inner_l2hlen..inner_l5hlen)
+ * lengths must already be set; this is asserted.  Returns the address
+ * immediately after the CPL.
+ */
+static void *
+write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
+{
+ struct cpl_tx_tnl_lso *tnl_lso = cpl;
+ uint32_t ctrl;
+
+ KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
+ m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
+ m0->m_pkthdr.inner_l5hlen > 0,
+ ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
+ __func__, m0));
+ KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+ m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
+ ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
+ __func__, m0));
+
+ /* Outer headers. */
+ /* Ethernet and IP header lengths are written in units of 4 bytes. */
+ ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
+ F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
+ V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
+ (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
+ V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
+ F_CPL_TX_TNL_LSO_IPLENSETOUT;
+ /* An outer L3 header the size of struct ip6_hdr is taken as IPv6. */
+ if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
+ ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
+ else {
+ /*
+ * IPv4-only flags; judging by their names these request outer IP
+ * header checksum and IP id handling -- confirm against the CPL
+ * documentation.
+ */
+ ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
+ F_CPL_TX_TNL_LSO_IPIDINCOUT;
+ }
+ tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
+ tnl_lso->IpIdOffsetOut = 0;
+ /* The tunnel header length is the sum of all outer headers (L2..L5). */
+ tnl_lso->UdpLenSetOut_to_TnlHdrLen =
+ htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
+ F_CPL_TX_TNL_LSO_UDPLENSETOUT |
+ V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
+ m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
+ m0->m_pkthdr.l5hlen) |
+ V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
+ tnl_lso->r1 = 0;
+
+ /* Inner headers. */
+ /* Inner Ethernet, IP and TCP header lengths, again in 4 byte units. */
+ ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
+ (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
+ V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
+ V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
+ if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
+ ctrl |= F_CPL_TX_TNL_LSO_IPV6;
+ tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
+ tnl_lso->IpIdOffset = 0;
+ tnl_lso->IpIdSplit_to_Mss =
+ htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
+ tnl_lso->TCPSeqOffset = 0;
+ /* Total length of the packet, outer headers included (m_pkthdr.len). */
+ tnl_lso->EthLenOffset_Size =
+ htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));
+
+ return (tnl_lso + 1);
+}
+
#define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */
/*
@@ -4753,29 +5088,7 @@
m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);
if (needs_tso(m0)) {
- struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
-
- KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
- m0->m_pkthdr.l4hlen > 0,
- ("%s: mbuf %p needs TSO but missing header lengths",
- __func__, m0));
-
- ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
- F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
- ETHER_HDR_LEN) >> 2) |
- V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
- V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
- if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
- ctrl |= F_LSO_IPV6;
-
- lso->lso_ctrl = htobe32(ctrl);
- lso->ipid_ofst = htobe16(0);
- lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
- lso->seqno_offset = htobe32(0);
- lso->len = htobe32(pktlen);
-
- cpl = (void *)(lso + 1);
-
+ cpl = write_lso_cpl(wr + 1, m0);
txq->tso_wrs++;
} else
cpl = (void *)(wr + 1);
@@ -4883,9 +5196,12 @@
nsegs = mbuf_nsegs(m0);
pktlen = m0->m_pkthdr.len;
ctrl = sizeof(struct cpl_tx_pkt_core);
- if (needs_tso(m0))
- ctrl += sizeof(struct cpl_tx_pkt_lso_core);
- else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
+ if (needs_tso(m0)) {
+ if (needs_vxlan_tso(m0))
+ ctrl += sizeof(struct cpl_tx_tnl_lso);
+ else
+ ctrl += sizeof(struct cpl_tx_pkt_lso_core);
+ } else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
available >= 2) {
/* Immediate data. Recalculate len16 and set nsegs to 0. */
ctrl += pktlen;
@@ -4907,41 +5223,30 @@
wr->r3 = 0;
if (needs_tso(m0)) {
- struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
-
- KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
- m0->m_pkthdr.l4hlen > 0,
- ("%s: mbuf %p needs TSO but missing header lengths",
- __func__, m0));
-
- ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
- F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
- ETHER_HDR_LEN) >> 2) |
- V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
- V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
- if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
- ctrl |= F_LSO_IPV6;
-
- lso->lso_ctrl = htobe32(ctrl);
- lso->ipid_ofst = htobe16(0);
- lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
- lso->seqno_offset = htobe32(0);
- lso->len = htobe32(pktlen);
-
- cpl = (void *)(lso + 1);
-
- txq->tso_wrs++;
+ if (needs_vxlan_tso(m0)) {
+ cpl = write_tnl_lso_cpl(wr + 1, m0);
+ txq->vxlan_tso_wrs++;
+ } else {
+ cpl = write_lso_cpl(wr + 1, m0);
+ txq->tso_wrs++;
+ }
} else
cpl = (void *)(wr + 1);
/* Checksum offload */
ctrl1 = csum_to_ctrl(sc, m0);
- if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
- txq->txcsum++; /* some hardware assistance provided */
+ if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
+ /* some hardware assistance provided */
+ if (needs_vxlan_csum(m0))
+ txq->vxlan_txcsum++;
+ else
+ txq->txcsum++;
+ }
/* VLAN tag insertion */
if (needs_vlan_insertion(m0)) {
- ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
+ ctrl1 |= F_TXPKT_VLAN_VLD |
+ V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
txq->vlan_insertion++;
}
@@ -4953,6 +5258,8 @@
/* SGL */
dst = (void *)(cpl + 1);
+ if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
+ dst = (caddr_t)&eq->desc[0];
if (nsegs > 0) {
write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
@@ -5198,8 +5505,13 @@
/* Checksum offload */
ctrl1 = csum_to_ctrl(sc, m);
- if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
- txq->txcsum++; /* some hardware assistance provided */
+ if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
+ /* some hardware assistance provided */
+ if (needs_vxlan_csum(m))
+ txq->vxlan_txcsum++;
+ else
+ txq->txcsum++;
+ }
/* VLAN tag insertion */
if (needs_vlan_insertion(m)) {
@@ -5958,7 +6270,7 @@
wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
V_FW_WR_FLOWID(cst->etid));
wr->r3 = 0;
- if (needs_udp_csum(m0)) {
+ if (needs_outer_udp_csum(m0)) {
wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
@@ -5970,7 +6282,7 @@
wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
cpl = (void *)(wr + 1);
} else {
- MPASS(needs_tcp_csum(m0));
+ MPASS(needs_outer_tcp_csum(m0));
wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
@@ -6007,7 +6319,7 @@
}
/* Checksum offload must be requested for ethofld. */
- MPASS(needs_l4_csum(m0));
+ MPASS(needs_outer_l4_csum(m0));
ctrl1 = csum_to_ctrl(cst->adapter, m0);
/* VLAN tag insertion */
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -249,6 +249,8 @@
#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_TXTLS4 0x8000000 /* can do TLS encryption and segmentation for TCP */
#define IFCAP_TXTLS6 0x10000000 /* can do TLS encryption and segmentation for TCP6 */
+#define IFCAP_VXLAN_HWCSUM 0x20000000 /* can do IFCAP_HWCSUM on VXLANs */
+#define IFCAP_VXLAN_HWTSO 0x40000000 /* can do IFCAP_TSO on VXLANs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Index: sys/net/if_vxlan.h
===================================================================
--- sys/net/if_vxlan.h
+++ sys/net/if_vxlan.h
@@ -143,4 +143,11 @@
char vxlcmd_ifname[IFNAMSIZ];
};
+#ifdef _KERNEL
+typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t,
+ u_int);
+EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t);
+EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t);
+#endif
+
#endif /* _NET_IF_VXLAN_H_ */
Index: sys/net/if_vxlan.c
===================================================================
--- sys/net/if_vxlan.c
+++ sys/net/if_vxlan.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
+ * Copyright (c) 2020, Chelsio Communications.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -60,6 +61,8 @@
#include <net/if_types.h>
#include <net/if_vxlan.h>
#include <net/netisr.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
@@ -70,6 +73,8 @@
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#include <netinet/in_fib.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
@@ -92,6 +97,7 @@
sizeof(struct udphdr) - \
sizeof(struct vxlan_header) - \
ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
+#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
#define VXLAN_SO_MC_MAX_GROUPS 32
@@ -146,10 +152,14 @@
struct vxlan_statistics {
uint32_t ftable_nospace;
uint32_t ftable_lock_upgrade_failed;
+ counter_u64_t txcsum;
+ counter_u64_t tso;
+ counter_u64_t rxcsum;
};
struct vxlan_softc {
struct ifnet *vxl_ifp;
+ int vxl_reqcap;
struct vxlan_socket *vxl_sock;
uint32_t vxl_vni;
union vxlan_sockaddr vxl_src_addr;
@@ -193,6 +203,10 @@
char vxl_mc_ifname[IFNAMSIZ];
LIST_ENTRY(vxlan_softc) vxl_entry;
LIST_ENTRY(vxlan_softc) vxl_ifdetach_list;
+
+ /* For rate limiting errors on the tx fast path. */
+ struct timeval err_time;
+ int err_pps;
};
#define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p))
@@ -297,7 +311,10 @@
static int vxlan_setup_multicast(struct vxlan_softc *);
static int vxlan_setup_socket(struct vxlan_softc *);
-static void vxlan_setup_interface(struct vxlan_softc *);
+#ifdef INET6
+static void vxlan_setup_zero_checksum_port(struct vxlan_softc *);
+#endif
+static void vxlan_setup_interface_hdrlen(struct vxlan_softc *);
static int vxlan_valid_init_config(struct vxlan_softc *);
static void vxlan_init_wait(struct vxlan_softc *);
static void vxlan_init_complete(struct vxlan_softc *);
@@ -347,9 +364,13 @@
static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
const struct sockaddr *);
+static int vxlan_stats_alloc(struct vxlan_softc *);
+static void vxlan_stats_free(struct vxlan_softc *);
static void vxlan_set_default_config(struct vxlan_softc *);
static int vxlan_set_user_config(struct vxlan_softc *,
struct ifvxlanparam *);
+static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
+static void vxlan_set_hwcaps(struct vxlan_softc *);
static int vxlan_clone_create(struct if_clone *, int, caddr_t);
static void vxlan_clone_destroy(struct ifnet *);
@@ -1555,9 +1576,44 @@
return (error);
}
+#ifdef INET6
+/*
+ * Try to make this interface's VXLAN UDP port the zero-checksum port
+ * (V_zero_checksum_port, shown to the user as rfc6935_port) so that IPv6
+ * VXLAN traffic may be exchanged with a zero UDP checksum.
+ *
+ * Nothing is done unless the source address is IPv6.  The port is adopted only
+ * when the source and destination ports agree and no other port has been
+ * configured already; every other outcome is reported with if_printf() and
+ * leaves V_zero_checksum_port untouched.
+ *
+ * V_zero_checksum_port is kept in host byte order while sin6_port is in
+ * network byte order, so every comparison and assignment converts with
+ * ntohs().
+ */
static void
-vxlan_setup_interface(struct vxlan_softc *sc)
+vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
{
+
+	if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
+		return;
+
+	MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
+	MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
+
+	/* Both sides in network byte order, so a direct comparison is fine. */
+	if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
+		if_printf(sc->vxl_ifp, "port %d in src address does not match "
+		    "port %d in dst address, rfc6935_port (%d) not updated.\n",
+		    ntohs(sc->vxl_src_addr.in6.sin6_port),
+		    ntohs(sc->vxl_dst_addr.in6.sin6_port),
+		    V_zero_checksum_port);
+		return;
+	}
+
+	if (V_zero_checksum_port != 0) {
+		/*
+		 * Already configured.  Complain only if it is set to a
+		 * different port; compare in host byte order.
+		 */
+		if (V_zero_checksum_port !=
+		    ntohs(sc->vxl_src_addr.in6.sin6_port)) {
+			if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
+			    "%d, cannot set it to %d.\n", V_zero_checksum_port,
+			    ntohs(sc->vxl_src_addr.in6.sin6_port));
+		}
+		return;
+	}
+
+	V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
+	if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
+	    V_zero_checksum_port);
+}
+#endif
+
+static void
+vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
+{
struct ifnet *ifp;
ifp = sc->vxl_ifp;
@@ -1666,11 +1722,13 @@
if (vxlan_valid_init_config(sc) != 0)
goto out;
- vxlan_setup_interface(sc);
-
if (vxlan_setup_socket(sc) != 0)
goto out;
+#ifdef INET6
+ vxlan_setup_zero_checksum_port(sc);
+#endif
+
/* Initialize the default forwarding entry. */
vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
&sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
@@ -1682,6 +1740,9 @@
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_UP);
+
+ EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
out:
vxlan_init_complete(sc);
}
@@ -1738,6 +1799,8 @@
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_DOWN);
+ EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
if (vso != NULL) {
vxlan_socket_remove_softc(vso, sc);
@@ -1907,6 +1970,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -1936,6 +2000,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
+ vxlan_setup_interface_hdrlen(sc);
error = 0;
} else
error = EBUSY;
@@ -2063,6 +2128,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -2284,6 +2350,14 @@
ifp->if_mtu = ifr->ifr_mtu;
break;
+ case SIOCSIFCAP:
+ VXLAN_WLOCK(sc);
+ error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
+ if (error == 0)
+ vxlan_set_hwcaps(sc);
+ VXLAN_WUNLOCK(sc);
+ break;
+
default:
error = ether_ioctl(ifp, cmd, data);
break;
@@ -2335,6 +2409,48 @@
}
#endif
+/*
+ * Return the CSUM_INNER_* equivalent of CSUM_* caps.
+ *
+ * csum_flags_in holds the CSUM_* tx offload requests made for the frame that
+ * is about to be encapsulated.  encap is the CSUM_ENCAP_* flag identifying the
+ * encapsulation (CSUM_ENCAP_VXLAN for every caller in this file) and is always
+ * included in the result, alongside the inner flag for each request found in
+ * csum_flags_in.
+ */
+static uint32_t
+csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap)
+{
+	/* Start from the caller's encapsulation type rather than assuming one. */
+	uint32_t csum_flags = encap;
+	const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
+
+	/*
+	 * csum_flags can request either v4 or v6 offload but not both.
+	 * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
+	 * so those bits are no good to detect the IP version.  Other bits are
+	 * always set with CSUM_TSO and we use those to figure out the IP
+	 * version.
+	 */
+	if (csum_flags_in & v4) {
+		if (csum_flags_in & CSUM_IP)
+			csum_flags |= CSUM_INNER_IP;
+		if (csum_flags_in & CSUM_IP_UDP)
+			csum_flags |= CSUM_INNER_IP_UDP;
+		if (csum_flags_in & CSUM_IP_TCP)
+			csum_flags |= CSUM_INNER_IP_TCP;
+		if (csum_flags_in & CSUM_IP_TSO)
+			csum_flags |= CSUM_INNER_IP_TSO;
+	} else {
+#ifdef INVARIANTS
+		const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;
+
+		/* No v4 request was found, so there has to be a v6 one. */
+		MPASS((csum_flags_in & v6) != 0);
+#endif
+		if (csum_flags_in & CSUM_IP6_UDP)
+			csum_flags |= CSUM_INNER_IP6_UDP;
+		if (csum_flags_in & CSUM_IP6_TCP)
+			csum_flags |= CSUM_INNER_IP6_TCP;
+		if (csum_flags_in & CSUM_IP6_TSO)
+			csum_flags |= CSUM_INNER_IP6_TSO;
+	}
+
+	return (csum_flags);
+}
+
static int
vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct mbuf *m)
@@ -2345,7 +2461,12 @@
struct in_addr srcaddr, dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route route, *ro;
+ struct sockaddr_in *sin;
+ uint32_t csum_flags;
+ NET_EPOCH_ASSERT();
+
ifp = sc->vxl_ifp;
srcaddr = sc->vxl_src_addr.in4.sin_addr;
srcport = vxlan_pick_source_port(sc, m);
@@ -2376,7 +2497,57 @@
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
- error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL);
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin = (struct sockaddr_in *)&ro->ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+ ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE,
+ 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else
+ ro = NULL;
+ error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2402,7 +2573,12 @@
const struct in6_addr *srcaddr, *dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route_in6 route, *ro;
+ struct sockaddr_in6 *sin6;
+ uint32_t csum_flags;
+ NET_EPOCH_ASSERT();
+
ifp = sc->vxl_ifp;
srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
srcport = vxlan_pick_source_port(sc, m);
@@ -2429,22 +2605,67 @@
vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
- /*
- * XXX BMV We need support for RFC6935 before we can send and
- * receive IPv6 UDP packets with a zero checksum.
- */
- {
+ mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
+ m->m_flags &= ~(M_MCAST | M_BCAST);
+
+ ro = NULL;
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_addr = ip6->ip6_dst;
+ ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0,
+ NHR_NONE, 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else if (ntohs(dstport) != V_zero_checksum_port) {
struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
+
hdr->uh_sum = in6_cksum_pseudo(ip6,
m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
-
- mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
- m->m_flags &= ~(M_MCAST | M_BCAST);
-
- error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL);
+ error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2593,8 +2814,30 @@
m_clrprotoflags(m);
m->m_pkthdr.rcvif = ifp;
M_SETFIB(m, ifp->if_fib);
+ if (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN &&
+ ((ifp->if_capenable & IFCAP_RXCSUM &&
+ m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
+ (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
+ !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
+ uint32_t csum_flags = 0;
- error = netisr_queue_src(NETISR_ETHER, 0, m);
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
+ csum_flags |= CSUM_L3_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
+ csum_flags |= CSUM_L3_VALID;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
+ csum_flags |= CSUM_L4_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
+ csum_flags |= CSUM_L4_VALID;
+ m->m_pkthdr.csum_flags = csum_flags;
+ counter_u64_add(sc->vxl_stats.rxcsum, 1);
+ } else {
+ /* clear everything */
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.csum_data = 0;
+ }
+
+ error = netisr_dispatch(NETISR_ETHER, m);
*m0 = NULL;
out:
@@ -2602,7 +2845,49 @@
return (error);
}
+/*
+ * Allocate the per-interface offload counters (txcsum, tso, rxcsum).  Returns
+ * 0 on success.  If any allocation yields NULL, whatever was allocated so far
+ * is released again and ENOMEM is returned.
+ */
+static int
+vxlan_stats_alloc(struct vxlan_softc *sc)
+{
+	struct vxlan_statistics *st = &sc->vxl_stats;
+
+	/* Short-circuit evaluation stops at the first failed allocation. */
+	if ((st->txcsum = counter_u64_alloc(M_WAITOK)) == NULL ||
+	    (st->tso = counter_u64_alloc(M_WAITOK)) == NULL ||
+	    (st->rxcsum = counter_u64_alloc(M_WAITOK)) == NULL) {
+		vxlan_stats_free(sc);
+		return (ENOMEM);
+	}
+
+	return (0);
+}
+
+/*
+ * Release the offload counters of this interface.  Counters that were never
+ * allocated (still NULL) are skipped, and every freed counter is reset to NULL
+ * so that calling this again is harmless.
+ */
static void
+vxlan_stats_free(struct vxlan_softc *sc)
+{
+	counter_u64_t *ctrs[] = {
+		&sc->vxl_stats.txcsum,
+		&sc->vxl_stats.tso,
+		&sc->vxl_stats.rxcsum,
+	};
+	u_int i;
+
+	for (i = 0; i < sizeof(ctrs) / sizeof(ctrs[0]); i++) {
+		if (*ctrs[i] == NULL)
+			continue;
+		counter_u64_free(*ctrs[i]);
+		*ctrs[i] = NULL;
+	}
+}
+
+static void
vxlan_set_default_config(struct vxlan_softc *sc)
{
@@ -2722,6 +3007,142 @@
}
+/*
+ * Validate a SIOCSIFCAP request and remember it in sc->vxl_reqcap.
+ *
+ * TSO depends on tx checksum offload of the same address family: switching
+ * txcsum/txcsum6 off silently drops tso4/tso6 from the request, while an
+ * attempt to switch TSO on without the matching tx checksum capability is
+ * refused with EAGAIN.  Returns 0 once the request has been recorded.
+ */
static int
+vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
+{
+	/* Bits that differ from what is currently enabled. */
+	const int changed = reqcap ^ ifp->if_capenable;
+
+	/* Disable TSO if tx checksums are disabled. */
+	if ((changed & IFCAP_TXCSUM) != 0 && (reqcap & IFCAP_TXCSUM) == 0 &&
+	    (reqcap & IFCAP_TSO4) != 0) {
+		reqcap &= ~IFCAP_TSO4;
+		if_printf(ifp, "tso4 disabled due to -txcsum.\n");
+	}
+	if ((changed & IFCAP_TXCSUM_IPV6) != 0 &&
+	    (reqcap & IFCAP_TXCSUM_IPV6) == 0 &&
+	    (reqcap & IFCAP_TSO6) != 0) {
+		reqcap &= ~IFCAP_TSO6;
+		if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
+	}
+
+	/* Do not enable TSO if tx checksums are disabled. */
+	if ((changed & IFCAP_TSO4) != 0 && (reqcap & IFCAP_TSO4) != 0 &&
+	    (reqcap & IFCAP_TXCSUM) == 0) {
+		if_printf(ifp, "enable txcsum first.\n");
+		return (EAGAIN);
+	}
+	if ((changed & IFCAP_TSO6) != 0 && (reqcap & IFCAP_TSO6) != 0 &&
+	    (reqcap & IFCAP_TXCSUM_IPV6) == 0) {
+		if_printf(ifp, "enable txcsum6 first.\n");
+		return (EAGAIN);
+	}
+
+	sc->vxl_reqcap = reqcap;
+	return (0);
+}
+
+/*
+ * A VXLAN interface inherits the capabilities of the vxlandev or the interface
+ * hosting the vxlanlocal address.
+ *
+ * The interface's capabilities, enabled capabilities and if_hwassist are first
+ * reset to the basic set and then rebuilt from the parent interface 'p'.  A
+ * capability is advertised when the parent supports it for VXLAN, and enabled
+ * only when it is also enabled on the parent and requested in sc->vxl_reqcap.
+ * If no parent can be determined the interface is left with the basic caps.
+ */
+static void
+vxlan_set_hwcaps(struct vxlan_softc *sc)
+{
+ struct epoch_tracker et;
+ struct ifnet *p;
+ struct ifaddr *ifa;
+ u_long hwa;
+ int cap, ena;
+ bool rel;
+ struct ifnet *ifp = sc->vxl_ifp;
+
+ /* reset caps */
+ ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
+ ifp->if_hwassist = 0;
+
+ NET_EPOCH_ENTER(et);
+ CURVNET_SET(ifp->if_vnet);
+
+ /*
+ * Find the parent: the named interface (vxl_mc_ifname) if one is set,
+ * otherwise the interface that owns the local address.  'rel' records
+ * that p came from ifunit_ref() and has to be released with if_rele().
+ */
+ rel = false;
+ p = NULL;
+ if (sc->vxl_mc_ifname[0] != '\0') {
+ rel = true;
+ p = ifunit_ref(sc->vxl_mc_ifname);
+ } else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
+ if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
+ struct sockaddr_in in4 = sc->vxl_src_addr.in4;
+
+ /* The lookup is done on a copy with the port cleared. */
+ in4.sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ } else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
+ struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
+
+ /* The lookup is done on a copy with the port cleared. */
+ in6.sin6_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ }
+ }
+ if (p == NULL)
+ goto done;
+
+ cap = ena = hwa = 0;
+
+ /* checksum offload */
+ if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
+ cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
+ ena |= sc->vxl_reqcap & p->if_capenable &
+ (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ /*
+ * Each CSUM_INNER_* assist offered by the parent becomes the
+ * corresponding plain CSUM_* assist on the VXLAN interface.
+ */
+ if (ena & IFCAP_TXCSUM) {
+ if (p->if_hwassist & CSUM_INNER_IP)
+ hwa |= CSUM_IP;
+ if (p->if_hwassist & CSUM_INNER_IP_UDP)
+ hwa |= CSUM_IP_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP_TCP)
+ hwa |= CSUM_IP_TCP;
+ }
+ if (ena & IFCAP_TXCSUM_IPV6) {
+ if (p->if_hwassist & CSUM_INNER_IP6_UDP)
+ hwa |= CSUM_IP6_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP6_TCP)
+ hwa |= CSUM_IP6_TCP;
+ }
+ }
+
+ /* hardware TSO */
+ if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
+ cap |= p->if_capabilities & IFCAP_TSO;
+ /* Never exceed IP_MAXPACKET less this interface's if_hdrlen. */
+ if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
+ ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
+ else
+ ifp->if_hw_tsomax = p->if_hw_tsomax;
+ /* XXX: tsomaxsegcount decrement is cxgbe specific */
+ ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
+ ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
+ }
+ if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
+ ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
+ if (ena & IFCAP_TSO) {
+ if (p->if_hwassist & CSUM_INNER_IP_TSO)
+ hwa |= CSUM_IP_TSO;
+ if (p->if_hwassist & CSUM_INNER_IP6_TSO)
+ hwa |= CSUM_IP6_TSO;
+ }
+ }
+
+ ifp->if_capabilities |= cap;
+ ifp->if_capenable |= ena;
+ ifp->if_hwassist |= hwa;
+ /* Drop the reference taken by ifunit_ref(), if that is where p came from. */
+ if (rel)
+ if_rele(p);
+done:
+ CURVNET_RESTORE();
+ NET_EPOCH_EXIT(et);
+}
+
+static int
vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct vxlan_softc *sc;
@@ -2732,6 +3153,9 @@
sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
sc->vxl_unit = unit;
vxlan_set_default_config(sc);
+ error = vxlan_stats_alloc(sc);
+ if (error != 0)
+ goto fail;
if (params != 0) {
error = copyin(params, &vxlp, sizeof(vxlp));
@@ -2764,8 +3188,10 @@
ifp->if_ioctl = vxlan_ioctl;
ifp->if_transmit = vxlan_transmit;
ifp->if_qflush = vxlan_qflush;
- ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
- ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+ ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable = VXLAN_BASIC_IFCAPS;
+ sc->vxl_reqcap = -1;
+ vxlan_set_hwcaps(sc);
ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
@@ -2775,7 +3201,7 @@
ether_ifattach(ifp, sc->vxl_hwaddr.octet);
ifp->if_baudrate = 0;
- ifp->if_hdrlen = 0;
+ vxlan_setup_interface_hdrlen(sc);
return (0);
@@ -2803,6 +3229,7 @@
vxlan_sysctl_destroy(sc);
rm_destroy(&sc->vxl_lock);
+ vxlan_stats_free(sc);
free(sc, M_VXLAN);
}
@@ -3087,6 +3514,15 @@
"ftable_lock_upgrade_failed", CTLFLAG_RD,
&stats->ftable_lock_upgrade_failed, 0,
"Forwarding table update required lock upgrade");
+
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
+ CTLFLAG_RD, &stats->txcsum,
+ "# of times hardware assisted with tx checksum");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
+ CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
+ CTLFLAG_RD, &stats->rxcsum,
+ "# of times hardware assisted with rx checksum");
}
static void
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -769,9 +769,13 @@
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
+ * Note that if_vxlan could have requested TSO even though the outer
+ * frame is UDP. It is correct to not fragment such datagrams and
+ * instead just pass them on to the driver.
*/
if (ip_len <= mtu ||
- (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
ip->ip_sum = 0;
if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
@@ -785,7 +789,8 @@
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
- if (m->m_pkthdr.csum_flags & CSUM_TSO)
+ if (m->m_pkthdr.csum_flags &
+ (CSUM_TSO | CSUM_INNER_TSO))
counter_u64_add(ia->ia_ifa.ifa_opackets,
m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
@@ -809,7 +814,8 @@
}
/* Balk when DF bit is set or the interface didn't support TSO. */
- if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ if ((ip_off & IP_DF) ||
+ (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
Index: sys/netinet/udp_var.h
===================================================================
--- sys/netinet/udp_var.h
+++ sys/netinet/udp_var.h
@@ -154,6 +154,9 @@
#define V_udp_blackhole VNET(udp_blackhole)
#define V_udp_log_in_vain VNET(udp_log_in_vain)
+VNET_DECLARE(int, zero_checksum_port);
+#define V_zero_checksum_port VNET(zero_checksum_port)
+
static __inline struct inpcbinfo *
udp_get_inpcbinfo(int protocol)
{
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -1120,7 +1120,8 @@
*/
sw_csum = m->m_pkthdr.csum_flags;
if (!hdrsplit) {
- tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
+ tso = ((sw_csum & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
sw_csum &= ~ifp->if_hwassist;
} else
tso = 0;
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -124,6 +124,11 @@
#include <security/mac/mac_framework.h>
+VNET_DEFINE(int, zero_checksum_port) = 0;
+#define V_zero_checksum_port VNET(zero_checksum_port)
+SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(zero_checksum_port), 0,
+ "Zero UDP checksum allowed for traffic to/from this port.");
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
@@ -268,7 +273,14 @@
}
if (uh->uh_sum == 0) {
UDPSTAT_INC(udps_nosum);
- goto badunlocked;
+ /*
+ * dport 0 was rejected earlier so this is OK even if
+ * zero_checksum_port is 0 (which is its default value).
+ */
+ if (ntohs(uh->uh_dport) == V_zero_checksum_port)
+ goto skip_checksum;
+ else
+ goto badunlocked;
}
}
@@ -288,6 +300,7 @@
goto badunlocked;
}
+skip_checksum:
/*
* Construct sockaddr format source address.
*/
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -171,7 +171,10 @@
uint8_t l3hlen; /* layer 3 hdr len */
uint8_t l4hlen; /* layer 4 hdr len */
uint8_t l5hlen; /* layer 5 hdr len */
- uint32_t spare;
+ uint8_t inner_l2hlen;
+ uint8_t inner_l3hlen;
+ uint8_t inner_l4hlen;
+ uint8_t inner_l5hlen;
};
};
union {
@@ -616,7 +619,13 @@
* Outbound flags that are set by upper protocol layers requesting lower
* layers, or ideally the hardware, to perform these offloading tasks.
* For outbound packets this field and its flags can be directly tested
- * against ifnet if_hwassist.
+ * against ifnet if_hwassist. Note that the outbound and the inbound flags do
+ * not collide right now but they could be allowed to (as long as the flags are
+ * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS
+ * would also have to be split into CSUM_BITS_TX and CSUM_BITS_RX.
+ *
+ * CSUM_INNER_<x> is the same as CSUM_<x> but it applies to the inner frame.
+ * The CSUM_ENCAP_<x> bits identify the outer encapsulation.
*/
#define CSUM_IP 0x00000001 /* IP header checksum offload */
#define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */
@@ -625,13 +634,28 @@
#define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */
#define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */
+#define CSUM_INNER_IP6_UDP 0x00000040
+#define CSUM_INNER_IP6_TCP 0x00000080
+#define CSUM_INNER_IP6_TSO 0x00000100
#define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */
#define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */
#define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */
#define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */
#define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */
+#define CSUM_INNER_IP 0x00004000
+#define CSUM_INNER_IP_UDP 0x00008000
+#define CSUM_INNER_IP_TCP 0x00010000
+#define CSUM_INNER_IP_TSO 0x00020000
+
+#define CSUM_ENCAP_VXLAN 0x00040000 /* VXLAN outer encapsulation */
+#define CSUM_ENCAP_RSVD1 0x00080000
+
/* Inbound checksum support where the checksum was verified by hardware. */
+#define CSUM_INNER_L3_CALC 0x00100000
+#define CSUM_INNER_L3_VALID 0x00200000
+#define CSUM_INNER_L4_CALC 0x00400000
+#define CSUM_INNER_L4_VALID 0x00800000
#define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */
#define CSUM_L3_VALID 0x02000000 /* checksum is correct */
#define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */
@@ -642,16 +666,31 @@
#define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */
+#define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \
+ CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \
+ CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \
+ CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \
+ CSUM_ENCAP_RSVD1 | CSUM_SND_TAG)
+
+#define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \
+ CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \
+ CSUM_COALESCED)
+
/*
* CSUM flag description for use with printf(9) %b identifier.
*/
#define CSUM_BITS \
"\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
- "\6CSUM_IP_ISCSI" \
- "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
- "\16CSUM_IP6_ISCSI" \
- "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
- "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
+ "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \
+ "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \
+ "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \
+ "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \
+ "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \
+ "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \
+ "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \
+ "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
/* CSUM flags compatibility mappings. */
#define CSUM_IP_CHECKED CSUM_L3_CALC
@@ -667,6 +706,7 @@
#define CSUM_UDP CSUM_IP_UDP
#define CSUM_SCTP CSUM_IP_SCTP
#define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO)
+#define CSUM_INNER_TSO (CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO)
#define CSUM_UDP_IPV6 CSUM_IP6_UDP
#define CSUM_TCP_IPV6 CSUM_IP6_TCP
#define CSUM_SCTP_IPV6 CSUM_IP6_SCTP

File Metadata

Mime Type
text/plain
Expires
Fri, Nov 15, 6:41 PM (1 h, 2 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14644904
Default Alt Text
D25873.id76008.diff (73 KB)

Event Timeline