Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F102610629
D25873.id76008.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
73 KB
Referenced Files
None
Subscribers
None
D25873.id76008.diff
View Options
Index: sbin/ifconfig/ifconfig.8
===================================================================
--- sbin/ifconfig/ifconfig.8
+++ sbin/ifconfig/ifconfig.8
@@ -28,7 +28,7 @@
.\" From: @(#)ifconfig.8 8.3 (Berkeley) 1/5/94
.\" $FreeBSD$
.\"
-.Dd June 4, 2020
+.Dd XXX
.Dt IFCONFIG 8
.Os
.Sh NAME
@@ -587,7 +587,7 @@
reception of extended frames, tag processing in hardware,
frame filtering in hardware, checksum offloading, or TSO on VLAN,
respectively.
-Note that this must be issued on a physical interface associated with
+Note that this must be configured on a physical interface associated with
.Xr vlan 4 ,
not on a
.Xr vlan 4
@@ -597,6 +597,21 @@
reception of extended frames, tag processing in hardware,
frame filtering in hardware, or TSO on VLAN,
respectively.
+.It Cm vxlanhwcsum , vxlanhwtso
+If the driver offers user-configurable VXLAN support, enable inner checksum
+offloading (receive and transmit) or TSO on VXLAN, respectively.
+Note that this must be configured on a physical interface associated with
+.Xr vxlan 4 ,
+not on a
+.Xr vxlan 4
+interface itself.
+The physical interface is either the interface specified as the vxlandev
+or the interface hosting the vxlanlocal address.
+The driver will offload as much checksum work and TSO as it can reliably
+support; the exact level of offloading may vary between drivers.
+.It Fl vxlanhwcsum , vxlanhwtso
+If the driver offers user-configurable VXLAN support, disable checksum
+offloading (receive and transmit) or TSO on VXLAN, respectively.
.It Cm vnet Ar jail
Move the interface to the
.Xr jail 8 ,
Index: sbin/ifconfig/ifconfig.c
===================================================================
--- sbin/ifconfig/ifconfig.c
+++ sbin/ifconfig/ifconfig.c
@@ -1344,7 +1344,8 @@
"\020\1RXCSUM\2TXCSUM\3NETCONS\4VLAN_MTU\5VLAN_HWTAGGING\6JUMBO_MTU\7POLLING" \
"\10VLAN_HWCSUM\11TSO4\12TSO6\13LRO\14WOL_UCAST\15WOL_MCAST\16WOL_MAGIC" \
"\17TOE4\20TOE6\21VLAN_HWFILTER\23VLAN_HWTSO\24LINKSTATE\25NETMAP" \
-"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6"
+"\26RXCSUM_IPV6\27TXCSUM_IPV6\31TXRTLMT\32HWRXTSTMP\33NOMAP\34TXTLS4\35TXTLS6" \
+"\36VXLAN_HWCSUM\37VXLAN_HWTSO"
/*
* Print the status of the interface. If an address family was
Index: sbin/ifconfig/ifvxlan.c
===================================================================
--- sbin/ifconfig/ifvxlan.c
+++ sbin/ifconfig/ifvxlan.c
@@ -620,6 +620,11 @@
DEF_CMD("vxlanflush", 0, setvxlan_flush),
DEF_CMD("vxlanflushall", 1, setvxlan_flush),
+
+ DEF_CMD("vxlanhwcsum", IFCAP_VXLAN_HWCSUM, setifcap),
+ DEF_CMD("-vxlanhwcsum", -IFCAP_VXLAN_HWCSUM, setifcap),
+ DEF_CMD("vxlanhwtso", IFCAP_VXLAN_HWTSO, setifcap),
+ DEF_CMD("-vxlanhwtso", -IFCAP_VXLAN_HWTSO, setifcap),
};
static struct afswtch af_vxlan = {
Index: share/man/man4/cxgbe.4
===================================================================
--- share/man/man4/cxgbe.4
+++ share/man/man4/cxgbe.4
@@ -31,7 +31,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd Dec 10, 2019
+.Dd XXX
.Dt CXGBE 4
.Os
.Sh NAME
@@ -61,8 +61,8 @@
the Chelsio Terminator 4, Terminator 5, and Terminator 6 ASICs (T4, T5, and T6).
The driver supports Jumbo Frames, Transmit/Receive checksum offload,
TCP segmentation offload (TSO), Large Receive Offload (LRO), VLAN
-tag insertion/extraction, VLAN checksum offload, VLAN TSO, and
-Receive Side Steering (RSS).
+tag insertion/extraction, VLAN checksum offload, VLAN TSO, VXLAN checksum
+offload, VXLAN TSO, and Receive Side Steering (RSS).
For further hardware information and questions related to hardware
requirements, see
.Pa http://www.chelsio.com/ .
Index: share/man/man4/vxlan.4
===================================================================
--- share/man/man4/vxlan.4
+++ share/man/man4/vxlan.4
@@ -24,7 +24,7 @@
.\"
.\" $FreeBSD$
.\"
-.Dd December 31, 2017
+.Dd XXX
.Dt VXLAN 4
.Os
.Sh NAME
@@ -182,6 +182,39 @@
.Nm
interface to allow the encapsulated frame to fit in the
current MTU of the physical network.
+.Sh HARDWARE
+The
+.Nm
+driver supports hardware checksum offload (receive and transmit) and TSO on the
+encapsulated traffic over physical interfaces that support these features.
+The
+.Nm
+interface examines the
+.Cm vxlandev
+interface, if one is specified, or the interface hosting the
+.Cm vxlanlocal
+address, and configures its capabilities based on the hardware offload
+capabilities of that physical interface.
+If multiple physical interfaces will transmit or receive traffic for the
+.Nm
+interface, then they all must have the same hardware capabilities.
+The transmit routine of a
+.Nm
+interface may fail with
+.Er ENXIO
+if an outbound physical interface does not support
+an offload that the
+.Nm
+interface is requesting.
+This can happen if there are multiple physical interfaces involved, with
+different hardware capabilities, or an interface capability was disabled after
+the
+.Nm
+interface had already started.
+.Pp
+At present, the following devices are capable of generating checksums and
+performing TSO on the inner frames in hardware:
+.Xr cxgbe 4 .
.Sh EXAMPLES
Create a
.Nm
@@ -244,3 +277,7 @@
.Nm
driver was written by
.An Bryan Venteicher Aq bryanv@freebsd.org .
+Support for stateless hardware offloads was added by
+.An Navdeep Parhar Aq np@freebsd.org
+in
+.Fx 13.0 .
Index: share/man/man9/EVENTHANDLER.9
===================================================================
--- share/man/man9/EVENTHANDLER.9
+++ share/man/man9/EVENTHANDLER.9
@@ -23,7 +23,7 @@
.\" SUCH DAMAGE.
.\" $FreeBSD$
.\"
-.Dd October 21, 2018
+.Dd October XXX
.Dt EVENTHANDLER 9
.Os
.Sh NAME
@@ -389,6 +389,10 @@
Callback invoked when a vlan is destroyed.
.It Vt vm_lowmem
Callbacks invoked when virtual memory is low.
+.It Vt vxlan_start
+Callback invoked when a vxlan interface starts.
+.It Vt vxlan_stop
+Callback invoked when a vxlan interface stops.
.It Vt watchdog_list
Callbacks invoked when the system watchdog timer is reinitialized.
.El
Index: sys/dev/cxgbe/adapter.h
===================================================================
--- sys/dev/cxgbe/adapter.h
+++ sys/dev/cxgbe/adapter.h
@@ -119,6 +119,7 @@
TX_SGL_SEGS = 39,
TX_SGL_SEGS_TSO = 38,
TX_SGL_SEGS_EO_TSO = 30, /* XXX: lower for IPv6. */
+ TX_SGL_SEGS_VXLAN_TSO = 37,
TX_WR_FLITS = SGE_MAX_WR_LEN / 8
};
@@ -286,6 +287,7 @@
int nvi;
int up_vis;
int uld_vis;
+ bool vxlan_tcam_entry;
struct tx_sched_params *sched_params;
@@ -593,6 +595,8 @@
uint64_t txpkts0_pkts; /* # of frames in type0 coalesced tx WRs */
uint64_t txpkts1_pkts; /* # of frames in type1 coalesced tx WRs */
uint64_t raw_wrs; /* # of raw work requests (alloc_wr_mbuf) */
+ uint64_t vxlan_tso_wrs; /* # of VXLAN TSO work requests */
+ uint64_t vxlan_txcsum;
uint64_t kern_tls_records;
uint64_t kern_tls_short;
@@ -625,6 +629,7 @@
uint64_t rxcsum; /* # of times hardware assisted with checksum */
uint64_t vlan_extraction;/* # of times VLAN tag was extracted */
+ uint64_t vxlan_rxcsum;
/* stats for not-that-common events */
@@ -847,6 +852,11 @@
struct sge sge;
int lro_timeout;
int sc_do_rxcopy;
+
+ int vxlan_port;
+ u_int vxlan_refcount;
+ int rawf_base;
+ int nrawf;
struct taskqueue *tq[MAX_NCHAN]; /* General purpose taskqueues */
struct task async_event_task;
Index: sys/dev/cxgbe/common/common.h
===================================================================
--- sys/dev/cxgbe/common/common.h
+++ sys/dev/cxgbe/common/common.h
@@ -247,7 +247,7 @@
uint32_t vlan_pri_map;
uint32_t ingress_config;
uint64_t hash_filter_mask;
- __be16 err_vec_mask;
+ bool rx_pkt_encap;
int8_t fcoe_shift;
int8_t port_shift;
Index: sys/dev/cxgbe/common/t4_hw.c
===================================================================
--- sys/dev/cxgbe/common/t4_hw.c
+++ sys/dev/cxgbe/common/t4_hw.c
@@ -9627,19 +9627,11 @@
read_filter_mode_and_ingress_config(adap, sleep_ok);
- /*
- * Cache a mask of the bits that represent the error vector portion of
- * rx_pkt.err_vec. T6+ can use a compressed error vector to make room
- * for information about outer encapsulation (GENEVE/VXLAN/NVGRE).
- */
- tpp->err_vec_mask = htobe16(0xffff);
if (chip_id(adap) > CHELSIO_T5) {
v = t4_read_reg(adap, A_TP_OUT_CONFIG);
- if (v & F_CRXPKTENC) {
- tpp->err_vec_mask =
- htobe16(V_T6_COMPR_RXERR_VEC(M_T6_COMPR_RXERR_VEC));
- }
- }
+ tpp->rx_pkt_encap = v & F_CRXPKTENC;
+ } else
+ tpp->rx_pkt_encap = false;
return 0;
}
Index: sys/dev/cxgbe/firmware/t6fw_cfg.txt
===================================================================
--- sys/dev/cxgbe/firmware/t6fw_cfg.txt
+++ sys/dev/cxgbe/firmware/t6fw_cfg.txt
@@ -146,7 +146,8 @@
nethctrl = 1024
neq = 2048
nqpcq = 8192
- nexactf = 456
+ nexactf = 454
+ nrawf = 2
cmask = all
pmask = all
ncrypto_lookaside = 16
@@ -272,7 +273,7 @@
[fini]
version = 0x1
- checksum = 0x4528a6ac
+ checksum = 0x82be65fd
#
# $FreeBSD$
#
Index: sys/dev/cxgbe/t4_main.c
===================================================================
--- sys/dev/cxgbe/t4_main.c
+++ sys/dev/cxgbe/t4_main.c
@@ -42,6 +42,7 @@
#include <sys/priv.h>
#include <sys/kernel.h>
#include <sys/bus.h>
+#include <sys/eventhandler.h>
#include <sys/module.h>
#include <sys/malloc.h>
#include <sys/queue.h>
@@ -1068,6 +1069,8 @@
TASK_INIT(&sc->async_event_task, 0, t4_async_event, sc);
#endif
+ refcount_init(&sc->vxlan_refcount, 0);
+
rc = t4_map_bars_0_and_4(sc);
if (rc != 0)
goto done; /* error message displayed already */
@@ -1715,6 +1718,7 @@
struct ifnet *ifp;
struct sbuf *sb;
struct pfil_head_args pa;
+ struct adapter *sc = vi->adapter;
vi->xact_addr_filt = -1;
callout_init(&vi->tick, 1);
@@ -1748,28 +1752,36 @@
ifp->if_capabilities = T4_CAP;
ifp->if_capenable = T4_CAP_ENABLE;
+ ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
+ CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
+ if (chip_id(sc) >= CHELSIO_T6) {
+ ifp->if_capabilities |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+ ifp->if_capenable |= IFCAP_VXLAN_HWCSUM | IFCAP_VXLAN_HWTSO;
+ ifp->if_hwassist |= CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP |
+ CSUM_INNER_IP6_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN;
+ }
+
#ifdef TCP_OFFLOAD
- if (vi->nofldrxq != 0 && (vi->adapter->flags & KERN_TLS_OK) == 0)
+ if (vi->nofldrxq != 0 && (sc->flags & KERN_TLS_OK) == 0)
ifp->if_capabilities |= IFCAP_TOE;
#endif
#ifdef RATELIMIT
- if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0) {
+ if (is_ethoffload(sc) && vi->nofldtxq != 0) {
ifp->if_capabilities |= IFCAP_TXRTLMT;
ifp->if_capenable |= IFCAP_TXRTLMT;
}
#endif
- ifp->if_hwassist = CSUM_TCP | CSUM_UDP | CSUM_IP | CSUM_TSO |
- CSUM_UDP_IPV6 | CSUM_TCP_IPV6;
ifp->if_hw_tsomax = IP_MAXPACKET;
ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_TSO;
#ifdef RATELIMIT
- if (is_ethoffload(vi->adapter) && vi->nofldtxq != 0)
+ if (is_ethoffload(sc) && vi->nofldtxq != 0)
ifp->if_hw_tsomaxsegcount = TX_SGL_SEGS_EO_TSO;
#endif
ifp->if_hw_tsomaxsegsize = 65536;
#ifdef KERN_TLS
- if (vi->adapter->flags & KERN_TLS_OK) {
+ if (sc->flags & KERN_TLS_OK) {
ifp->if_capabilities |= IFCAP_TXTLS;
ifp->if_capenable |= IFCAP_TXTLS;
}
@@ -2099,6 +2111,17 @@
if (mask & IFCAP_TXTLS)
ifp->if_capenable ^= (mask & IFCAP_TXTLS);
#endif
+ if (mask & IFCAP_VXLAN_HWCSUM) {
+ ifp->if_capenable ^= IFCAP_VXLAN_HWCSUM;
+ ifp->if_hwassist ^= CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP |
+ CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP;
+ }
+ if (mask & IFCAP_VXLAN_HWTSO) {
+ ifp->if_capenable ^= IFCAP_VXLAN_HWTSO;
+ ifp->if_hwassist ^= CSUM_INNER_IP6_TSO |
+ CSUM_INNER_IP_TSO;
+ }
#ifdef VLAN_CAPABILITIES
VLAN_CAPABILITIES(ifp);
@@ -4410,6 +4433,19 @@
MPASS(sc->tids.hpftid_base == 0);
MPASS(sc->tids.tid_base == sc->tids.nhpftids);
}
+
+ param[0] = FW_PARAM_PFVF(RAWF_START);
+ param[1] = FW_PARAM_PFVF(RAWF_END);
+ rc = -t4_query_params(sc, sc->mbox, sc->pf, 0, 2, param, val);
+ if (rc != 0) {
+ device_printf(sc->dev,
+ "failed to query rawf parameters: %d.\n", rc);
+ return (rc);
+ }
+ if ((int)val[1] > (int)val[0]) {
+ sc->rawf_base = val[0];
+ sc->nrawf = val[1] - val[0] + 1;
+ }
}
/*
@@ -5141,6 +5177,7 @@
struct port_info *pi = vi->pi;
struct adapter *sc = pi->adapter;
int mtu = -1, promisc = -1, allmulti = -1, vlanex = -1;
+ uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
ASSERT_SYNCHRONIZED_OP(sc);
KASSERT(flags, ("%s: not told what to update.", __func__));
@@ -5214,7 +5251,7 @@
rc = -rc;
for (j = 0; j < ctx.i; j++) {
if_printf(ifp,
- "failed to add mc address"
+ "failed to add mcast address"
" %02x:%02x:%02x:"
"%02x:%02x:%02x rc=%d\n",
ctx.mcaddr[j][0], ctx.mcaddr[j][1],
@@ -5224,14 +5261,36 @@
}
return (rc);
}
+ ctx.del = 0;
} else
NET_EPOCH_EXIT(et);
rc = -t4_set_addr_hash(sc, sc->mbox, vi->viid, 0, ctx.hash, 0);
if (rc != 0)
- if_printf(ifp, "failed to set mc address hash: %d", rc);
+ if_printf(ifp, "failed to set mcast address hash: %d\n",
+ rc);
+ if (ctx.del == 0) {
+ /* We clobbered the VXLAN entry if there was one. */
+ pi->vxlan_tcam_entry = false;
+ }
}
+ if (IS_MAIN_VI(vi) && sc->vxlan_refcount > 0 &&
+ pi->vxlan_tcam_entry == false) {
+ rc = t4_alloc_raw_mac_filt(sc, vi->viid, match_all_mac,
+ match_all_mac, sc->rawf_base + pi->port_id, 1, pi->port_id,
+ true);
+ if (rc < 0) {
+ rc = -rc;
+ if_printf(ifp, "failed to add VXLAN TCAM entry: %d.\n",
+ rc);
+ } else {
+ MPASS(rc == sc->rawf_base + pi->port_id);
+ rc = 0;
+ pi->vxlan_tcam_entry = true;
+ }
+ }
+
return (rc);
}
@@ -10374,6 +10433,7 @@
#endif
rxq->rxcsum = 0;
rxq->vlan_extraction = 0;
+ rxq->vxlan_rxcsum = 0;
rxq->fl.cl_allocated = 0;
rxq->fl.cl_recycled = 0;
@@ -10392,6 +10452,8 @@
txq->txpkts0_pkts = 0;
txq->txpkts1_pkts = 0;
txq->raw_wrs = 0;
+ txq->vxlan_tso_wrs = 0;
+ txq->vxlan_txcsum = 0;
txq->kern_tls_records = 0;
txq->kern_tls_short = 0;
txq->kern_tls_partial = 0;
@@ -11202,6 +11264,116 @@
}
#endif
+static eventhandler_tag vxlan_start_evtag;
+static eventhandler_tag vxlan_stop_evtag;
+
+struct vxlan_evargs {
+ struct ifnet *ifp;
+ uint16_t port;
+};
+
+static void
+t4_vxlan_start(struct adapter *sc, void *arg)
+{
+ struct vxlan_evargs *v = arg;
+ struct port_info *pi;
+ uint8_t match_all_mac[ETHER_ADDR_LEN] = {0};
+ int i, rc;
+
+ if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
+ return;
+ if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxst") != 0)
+ return;
+
+ if (sc->vxlan_refcount == 0) {
+ sc->vxlan_port = v->port;
+ sc->vxlan_refcount = 1;
+ t4_write_reg(sc, A_MPS_RX_VXLAN_TYPE,
+ V_VXLAN(v->port) | F_VXLAN_EN);
+ for_each_port(sc, i) {
+ pi = sc->port[i];
+ if (pi->vxlan_tcam_entry == true)
+ continue;
+ rc = t4_alloc_raw_mac_filt(sc, pi->vi[0].viid,
+ match_all_mac, match_all_mac,
+ sc->rawf_base + pi->port_id, 1, pi->port_id, true);
+ if (rc < 0) {
+ rc = -rc;
+ log(LOG_ERR,
+ "%s: failed to add VXLAN TCAM entry: %d.\n",
+ device_get_name(pi->vi[0].dev), rc);
+ } else {
+ MPASS(rc == sc->rawf_base + pi->port_id);
+ rc = 0;
+ pi->vxlan_tcam_entry = true;
+ }
+ }
+ } else if (sc->vxlan_port == v->port) {
+ sc->vxlan_refcount++;
+ } else {
+ log(LOG_ERR, "%s: VXLAN already configured on port %d; "
+ "ignoring attempt to configure it on port %d\n",
+ device_get_nameunit(sc->dev), sc->vxlan_port, v->port);
+ }
+ end_synchronized_op(sc, 0);
+}
+
+static void
+t4_vxlan_stop(struct adapter *sc, void *arg)
+{
+ struct vxlan_evargs *v = arg;
+
+ if (sc->nrawf == 0 || chip_id(sc) <= CHELSIO_T5)
+ return;
+ if (begin_synchronized_op(sc, NULL, SLEEP_OK | INTR_OK, "t4vxsp") != 0)
+ return;
+
+ /*
+ * VXLANs may have been configured before the driver was loaded so we
+ * may see more stops than starts. This is not handled cleanly but at
+ * least we keep the refcount sane.
+ */
+ if (sc->vxlan_port != v->port)
+ goto done;
+ if (sc->vxlan_refcount == 0) {
+ log(LOG_ERR,
+ "%s: VXLAN operation on port %d was stopped earlier; "
+ "ignoring attempt to stop it again.\n",
+ device_get_nameunit(sc->dev), sc->vxlan_port);
+ } else if (--sc->vxlan_refcount == 0) {
+ t4_set_reg_field(sc, A_MPS_RX_VXLAN_TYPE, F_VXLAN_EN, 0);
+ }
+done:
+ end_synchronized_op(sc, 0);
+}
+
+static void
+t4_vxlan_start_handler(void *arg __unused, struct ifnet *ifp,
+ sa_family_t family, u_int port)
+{
+ struct vxlan_evargs v;
+
+ MPASS(family == AF_INET || family == AF_INET6);
+ v.ifp = ifp;
+ v.port = port;
+
+ t4_iterate(t4_vxlan_start, &v);
+}
+
+static void
+t4_vxlan_stop_handler(void *arg __unused, struct ifnet *ifp, sa_family_t family,
+ u_int port)
+{
+ struct vxlan_evargs v;
+
+ MPASS(family == AF_INET || family == AF_INET6);
+ v.ifp = ifp;
+ v.port = port;
+
+ t4_iterate(t4_vxlan_stop, &v);
+}
+
+
static struct sx mlu; /* mod load unload */
SX_SYSINIT(cxgbe_mlu, &mlu, "cxgbe mod load/unload");
@@ -11245,6 +11417,14 @@
#endif
t4_tracer_modload();
tweak_tunables();
+ vxlan_start_evtag =
+ EVENTHANDLER_REGISTER(vxlan_start,
+ t4_vxlan_start_handler, NULL,
+ EVENTHANDLER_PRI_ANY);
+ vxlan_stop_evtag =
+ EVENTHANDLER_REGISTER(vxlan_stop,
+ t4_vxlan_stop_handler, NULL,
+ EVENTHANDLER_PRI_ANY);
}
sx_xunlock(&mlu);
break;
@@ -11281,6 +11461,10 @@
sx_sunlock(&t4_list_lock);
if (t4_sge_extfree_refs() == 0) {
+ EVENTHANDLER_DEREGISTER(vxlan_start,
+ vxlan_start_evtag);
+ EVENTHANDLER_DEREGISTER(vxlan_stop,
+ vxlan_stop_evtag);
t4_tracer_modunload();
#ifdef KERN_TLS
t6_ktls_modunload();
Index: sys/dev/cxgbe/t4_sge.c
===================================================================
--- sys/dev/cxgbe/t4_sge.c
+++ sys/dev/cxgbe/t4_sge.c
@@ -55,6 +55,7 @@
#include <net/ethernet.h>
#include <net/if.h>
#include <net/if_vlan_var.h>
+#include <net/if_vxlan.h>
#include <netinet/in.h>
#include <netinet/ip.h>
#include <netinet/ip6.h>
@@ -266,8 +267,9 @@
static void add_fl_to_sfl(struct adapter *, struct sge_fl *);
static inline void get_pkt_gl(struct mbuf *, struct sglist *);
-static inline u_int txpkt_len16(u_int, u_int);
-static inline u_int txpkt_vm_len16(u_int, u_int);
+static inline u_int txpkt_len16(u_int, const u_int);
+static inline u_int txpkt_vm_len16(u_int, const u_int);
+static inline void calculate_mbuf_len16(struct adapter *, struct mbuf *);
static inline u_int txpkts0_len16(u_int);
static inline u_int txpkts1_len16(void);
static u_int write_raw_wr(struct sge_txq *, void *, struct mbuf *, u_int);
@@ -1911,13 +1913,42 @@
#if defined(INET) || defined(INET6)
struct lro_ctrl *lro = &rxq->lro;
#endif
+ uint16_t err_vec, tnl_type, tnlhdr_len;
static const int sw_hashtype[4][2] = {
{M_HASHTYPE_NONE, M_HASHTYPE_NONE},
{M_HASHTYPE_RSS_IPV4, M_HASHTYPE_RSS_IPV6},
{M_HASHTYPE_RSS_TCP_IPV4, M_HASHTYPE_RSS_TCP_IPV6},
{M_HASHTYPE_RSS_UDP_IPV4, M_HASHTYPE_RSS_UDP_IPV6},
};
+ static const int sw_csum_flags[2][2] = {
+ {
+ /* IP, inner IP */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L3_CALC | CSUM_L3_VALID |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ /* IP, inner IP6 */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L3_CALC | CSUM_L3_VALID |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ },
+ {
+ /* IP6, inner IP */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+
+ /* IP6, inner IP6 */
+ CSUM_ENCAP_VXLAN |
+ CSUM_L4_CALC | CSUM_L4_VALID |
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID,
+ },
+ };
+
MPASS(plen > sc->params.sge.fl_pktshift);
if (vi->pfil != NULL && PFIL_HOOKED_IN(vi->pfil) &&
__predict_true((fl->flags & FL_BUF_RESUME) == 0)) {
@@ -1957,23 +1988,73 @@
m0->m_pkthdr.flowid = be32toh(d->rss.hash_val);
cpl = (const void *)(&d->rss + 1);
- if (cpl->csum_calc && !(cpl->err_vec & sc->params.tp.err_vec_mask)) {
- if (ifp->if_capenable & IFCAP_RXCSUM &&
- cpl->l2info & htobe32(F_RXF_IP)) {
- m0->m_pkthdr.csum_flags = (CSUM_IP_CHECKED |
- CSUM_IP_VALID | CSUM_DATA_VALID | CSUM_PSEUDO_HDR);
+ if (sc->params.tp.rx_pkt_encap) {
+ const uint16_t ev = be16toh(cpl->err_vec);
+
+ err_vec = G_T6_COMPR_RXERR_VEC(ev);
+ tnl_type = G_T6_RX_TNL_TYPE(ev);
+ tnlhdr_len = G_T6_RX_TNLHDR_LEN(ev);
+ } else {
+ err_vec = be16toh(cpl->err_vec);
+ tnl_type = 0;
+ tnlhdr_len = 0;
+ }
+ if (cpl->csum_calc && err_vec == 0) {
+ int ipv6 = !!(cpl->l2info & htobe32(F_RXF_IP6));
+
+ /* checksum(s) calculated and found to be correct. */
+
+ MPASS((cpl->l2info & htobe32(F_RXF_IP)) ^
+ (cpl->l2info & htobe32(F_RXF_IP6)));
+ m0->m_pkthdr.csum_data = be16toh(cpl->csum);
+ if (tnl_type == 0) {
+ if (!ipv6 && ifp->if_capenable & IFCAP_RXCSUM) {
+ m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+ CSUM_L3_VALID | CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ } else if (ipv6 && ifp->if_capenable & IFCAP_RXCSUM_IPV6) {
+ m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ }
rxq->rxcsum++;
- } else if (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
- cpl->l2info & htobe32(F_RXF_IP6)) {
- m0->m_pkthdr.csum_flags = (CSUM_DATA_VALID_IPV6 |
- CSUM_PSEUDO_HDR);
- rxq->rxcsum++;
- }
+ } else {
+ MPASS(tnl_type == RX_PKT_TNL_TYPE_VXLAN);
+ if (__predict_false(cpl->ip_frag)) {
+ /*
+ * csum_data is for the inner frame (which is an
+ * IP fragment) and is not 0xffff. There is no
+ * way to pass the inner csum_data to the stack.
+ * We don't want the stack to use the inner
+ * csum_data to validate the outer frame or it
+ * will get rejected. So we fix csum_data here
+ * and let sw do the checksum of inner IP
+ * fragments.
+ *
+ * XXX: Need 32b for csum_data2 in an rx mbuf.
+ * Maybe stuff it into rcv_tstmp?
+ */
+ m0->m_pkthdr.csum_data = 0xffff;
+ if (ipv6) {
+ m0->m_pkthdr.csum_flags = CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ } else {
+ m0->m_pkthdr.csum_flags = CSUM_L3_CALC |
+ CSUM_L3_VALID | CSUM_L4_CALC |
+ CSUM_L4_VALID;
+ }
+ } else {
+ int outer_ipv6;
- if (__predict_false(cpl->ip_frag))
- m0->m_pkthdr.csum_data = be16toh(cpl->csum);
- else
- m0->m_pkthdr.csum_data = 0xffff;
+ MPASS(m0->m_pkthdr.csum_data == 0xffff);
+
+ outer_ipv6 = tnlhdr_len >=
+ sizeof(struct ether_header) +
+ sizeof(struct ip6_hdr);
+ m0->m_pkthdr.csum_flags =
+ sw_csum_flags[outer_ipv6][ipv6];
+ }
+ rxq->vxlan_rxcsum++;
+ }
}
if (cpl->vlan_ex) {
@@ -2001,7 +2082,7 @@
m0->m_pkthdr.numa_domain = ifp->if_numa_domain;
#endif
#if defined(INET) || defined(INET6)
- if (rxq->iq.flags & IQ_LRO_ENABLED &&
+ if (rxq->iq.flags & IQ_LRO_ENABLED && tnl_type == 0 &&
(M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV4 ||
M_HASHTYPE_GET(m0) == M_HASHTYPE_RSS_TCP_IPV6)) {
if (sort_before_lro(lro)) {
@@ -2172,10 +2253,10 @@
{
M_ASSERTPKTHDR(m);
- KASSERT(m->m_pkthdr.l5hlen > 0,
+ KASSERT(m->m_pkthdr.inner_l5hlen > 0,
("%s: mbuf %p missing information on # of segments.", __func__, m));
- return (m->m_pkthdr.l5hlen);
+ return (m->m_pkthdr.inner_l5hlen);
}
static inline void
@@ -2183,7 +2264,7 @@
{
M_ASSERTPKTHDR(m);
- m->m_pkthdr.l5hlen = nsegs;
+ m->m_pkthdr.inner_l5hlen = nsegs;
}
static inline int
@@ -2309,63 +2390,108 @@
return (m);
}
-static inline int
+static inline bool
needs_hwcsum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP |
+ CSUM_IP_TSO | CSUM_INNER_IP | CSUM_INNER_IP_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_IP6_UDP |
+ CSUM_IP6_TCP | CSUM_IP6_TSO | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_IP |
- CSUM_TSO | CSUM_UDP_IPV6 | CSUM_TCP_IPV6));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
+static inline bool
needs_tso(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_TSO | CSUM_IP6_TSO |
+ CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & CSUM_TSO);
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
+static inline bool
+needs_vxlan_csum(struct mbuf *m)
+{
+
+ M_ASSERTPKTHDR(m);
+
+ return (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN);
+}
+
+static inline bool
+needs_vxlan_tso(struct mbuf *m)
+{
+ const uint32_t csum_flags = CSUM_ENCAP_VXLAN | CSUM_INNER_IP_TSO |
+ CSUM_INNER_IP6_TSO;
+
+ M_ASSERTPKTHDR(m);
+
+ return ((m->m_pkthdr.csum_flags & csum_flags) != 0 &&
+ (m->m_pkthdr.csum_flags & csum_flags) != CSUM_ENCAP_VXLAN);
+}
+
+static inline bool
+needs_inner_tcp_csum(struct mbuf *m)
+{
+ const uint32_t csum_flags = CSUM_INNER_IP_TSO | CSUM_INNER_IP6_TSO;
+
+ M_ASSERTPKTHDR(m);
+
+ return (m->m_pkthdr.csum_flags & csum_flags);
+}
+
+static inline bool
needs_l3_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP | CSUM_IP_TSO | CSUM_INNER_IP |
+ CSUM_INNER_IP_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_TSO));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
-needs_tcp_csum(struct mbuf *m)
+static inline bool
+needs_outer_tcp_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_TCP | CSUM_IP_TSO | CSUM_IP6_TCP |
+ CSUM_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_TCP_IPV6 | CSUM_TSO));
+
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
#ifdef RATELIMIT
-static inline int
-needs_l4_csum(struct mbuf *m)
+static inline bool
+needs_outer_l4_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_TSO |
+ CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_TSO;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_TCP | CSUM_UDP | CSUM_UDP_IPV6 |
- CSUM_TCP_IPV6 | CSUM_TSO));
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
-static inline int
-needs_udp_csum(struct mbuf *m)
+static inline bool
+needs_outer_udp_csum(struct mbuf *m)
{
+ const uint32_t csum_flags = CSUM_IP_UDP | CSUM_IP6_UDP;
M_ASSERTPKTHDR(m);
- return (m->m_pkthdr.csum_flags & (CSUM_UDP | CSUM_UDP_IPV6));
+
+ return (m->m_pkthdr.csum_flags & csum_flags);
}
#endif
-static inline int
+static inline bool
needs_vlan_insertion(struct mbuf *m)
{
@@ -2506,6 +2632,23 @@
}
/*
+ * The maximum number of segments that can fit in a WR.
+ */
+static int
+max_nsegs_allowed(struct mbuf *m)
+{
+
+ if (needs_tso(m)) {
+ if (needs_vxlan_tso(m))
+ return (TX_SGL_SEGS_VXLAN_TSO);
+ else
+ return (TX_SGL_SEGS_TSO);
+ }
+
+ return (TX_SGL_SEGS);
+}
+
+/*
* Analyze the mbuf to determine its tx needs. The mbuf passed in may change:
* a) caller can assume it's been freed if this function returns with an error.
* b) it may get defragged up if the gather list is too long for the hardware.
@@ -2563,7 +2706,7 @@
return (0);
}
#endif
- if (nsegs > (needs_tso(m0) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS)) {
+ if (nsegs > max_nsegs_allowed(m0)) {
if (defragged++ > 0 || (m = m_defrag(m0, M_NOWAIT)) == NULL) {
rc = EFBIG;
goto fail;
@@ -2585,18 +2728,15 @@
}
set_mbuf_nsegs(m0, nsegs);
set_mbuf_cflags(m0, cflags);
- if (sc->flags & IS_VF)
- set_mbuf_len16(m0, txpkt_vm_len16(nsegs, needs_tso(m0)));
- else
- set_mbuf_len16(m0, txpkt_len16(nsegs, needs_tso(m0)));
+ calculate_mbuf_len16(sc, m0);
#ifdef RATELIMIT
/*
* Ethofld is limited to TCP and UDP for now, and only when L4 hw
- * checksumming is enabled. needs_l4_csum happens to check for all the
- * right things.
+ * checksumming is enabled. needs_outer_l4_csum happens to check for
+ * all the right things.
*/
- if (__predict_false(needs_eo(cst) && !needs_l4_csum(m0))) {
+ if (__predict_false(needs_eo(cst) && !needs_outer_l4_csum(m0))) {
m_snd_tag_rele(m0->m_pkthdr.snd_tag);
m0->m_pkthdr.snd_tag = NULL;
m0->m_pkthdr.csum_flags &= ~CSUM_SND_TAG;
@@ -2628,21 +2768,27 @@
switch (eh_type) {
#ifdef INET6
case ETHERTYPE_IPV6:
- {
- struct ip6_hdr *ip6 = l3hdr;
-
- MPASS(!needs_tso(m0) || ip6->ip6_nxt == IPPROTO_TCP);
-
- m0->m_pkthdr.l3hlen = sizeof(*ip6);
+ m0->m_pkthdr.l3hlen = sizeof(struct ip6_hdr);
break;
- }
#endif
#ifdef INET
case ETHERTYPE_IP:
{
struct ip *ip = l3hdr;
- m0->m_pkthdr.l3hlen = ip->ip_hl * 4;
+ if (needs_vxlan_csum(m0)) {
+ /* Driver will do the outer IP hdr checksum. */
+ ip->ip_sum = 0;
+ if (needs_vxlan_tso(m0)) {
+ const uint16_t ipl = ip->ip_len;
+
+ ip->ip_len = 0;
+ ip->ip_sum = ~in_cksum_hdr(ip);
+ ip->ip_len = ipl;
+ } else
+ ip->ip_sum = in_cksum_hdr(ip);
+ }
+ m0->m_pkthdr.l3hlen = ip->ip_hl << 2;
break;
}
#endif
@@ -2652,8 +2798,59 @@
__func__, eh_type);
}
+ if (needs_vxlan_csum(m0)) {
+ m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
+ m0->m_pkthdr.l5hlen = sizeof(struct vxlan_header);
+
+ /* Inner headers. */
+ eh = m_advance(&m, &offset, m0->m_pkthdr.l3hlen +
+ sizeof(struct udphdr) + sizeof(struct vxlan_header));
+ eh_type = ntohs(eh->ether_type);
+ if (eh_type == ETHERTYPE_VLAN) {
+ struct ether_vlan_header *evh = (void *)eh;
+
+ eh_type = ntohs(evh->evl_proto);
+ m0->m_pkthdr.inner_l2hlen = sizeof(*evh);
+ } else
+ m0->m_pkthdr.inner_l2hlen = sizeof(*eh);
+ l3hdr = m_advance(&m, &offset, m0->m_pkthdr.inner_l2hlen);
+
+ switch (eh_type) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ m0->m_pkthdr.inner_l3hlen = sizeof(struct ip6_hdr);
+ break;
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ struct ip *ip = l3hdr;
+
+ m0->m_pkthdr.inner_l3hlen = ip->ip_hl << 2;
+ break;
+ }
+#endif
+ default:
+ panic("%s: VXLAN hw offload requested with unknown "
+ "ethertype 0x%04x. if_cxgbe must be compiled"
+ " with the same INET/INET6 options as the kernel.",
+ __func__, eh_type);
+ }
#if defined(INET) || defined(INET6)
- if (needs_tcp_csum(m0)) {
+ if (needs_inner_tcp_csum(m0)) {
+ tcp = m_advance(&m, &offset, m0->m_pkthdr.inner_l3hlen);
+ m0->m_pkthdr.inner_l4hlen = tcp->th_off * 4;
+ }
+#endif
+ MPASS((m0->m_pkthdr.csum_flags & CSUM_SND_TAG) == 0);
+ m0->m_pkthdr.csum_flags &= CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_TSO | CSUM_INNER_IP |
+ CSUM_INNER_IP_UDP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO |
+ CSUM_ENCAP_VXLAN;
+ }
+
+#if defined(INET) || defined(INET6)
+ if (needs_outer_tcp_csum(m0)) {
tcp = m_advance(&m, &offset, m0->m_pkthdr.l3hlen);
m0->m_pkthdr.l4hlen = tcp->th_off * 4;
#ifdef RATELIMIT
@@ -2663,7 +2860,7 @@
V_FW_ETH_TX_EO_WR_TSOFF(sizeof(*tcp) / 2 + 1));
} else
set_mbuf_eo_tsclk_tsoff(m0, 0);
- } else if (needs_udp_csum(m0)) {
+ } else if (needs_outer_udp_csum(m0)) {
m0->m_pkthdr.l4hlen = sizeof(struct udphdr);
#endif
}
@@ -3618,6 +3815,9 @@
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vlan_extraction",
CTLFLAG_RD, &rxq->vlan_extraction,
"# of times hardware extracted 802.1Q tag");
+ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_rxcsum",
+ CTLFLAG_RD, &rxq->vxlan_rxcsum,
+ "# of times hardware assisted with inner checksum (VXLAN) ");
add_fl_sysctls(sc, &vi->ctx, oid, &rxq->fl);
@@ -4272,6 +4472,11 @@
"# of frames tx'd using type1 txpkts work requests");
SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "raw_wrs", CTLFLAG_RD,
&txq->raw_wrs, "# of raw work requests (non-packets)");
+ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_tso_wrs",
+ CTLFLAG_RD, &txq->vxlan_tso_wrs, "# of VXLAN TSO work requests");
+ SYSCTL_ADD_UQUAD(&vi->ctx, children, OID_AUTO, "vxlan_txcsum",
+ CTLFLAG_RD, &txq->vxlan_txcsum,
+ "# of times hardware assisted with inner checksums (VXLAN)");
#ifdef KERN_TLS
if (sc->flags & KERN_TLS_OK) {
@@ -4561,27 +4766,25 @@
KASSERT(gl->sg_nseg == mbuf_nsegs(m),
("%s: nsegs changed for mbuf %p from %d to %d", __func__, m,
mbuf_nsegs(m), gl->sg_nseg));
- KASSERT(gl->sg_nseg > 0 &&
- gl->sg_nseg <= (needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS),
+ KASSERT(gl->sg_nseg > 0 && gl->sg_nseg <= max_nsegs_allowed(m),
("%s: %d segments, should have been 1 <= nsegs <= %d", __func__,
- gl->sg_nseg, needs_tso(m) ? TX_SGL_SEGS_TSO : TX_SGL_SEGS));
+ gl->sg_nseg, max_nsegs_allowed(m)));
}
/*
* len16 for a txpkt WR with a GL. Includes the firmware work request header.
*/
static inline u_int
-txpkt_len16(u_int nsegs, u_int tso)
+txpkt_len16(u_int nsegs, const u_int extra)
{
u_int n;
MPASS(nsegs > 0);
nsegs--; /* first segment is part of ulptx_sgl */
- n = sizeof(struct fw_eth_tx_pkt_wr) + sizeof(struct cpl_tx_pkt_core) +
+ n = extra + sizeof(struct fw_eth_tx_pkt_wr) +
+ sizeof(struct cpl_tx_pkt_core) +
sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
- if (tso)
- n += sizeof(struct cpl_tx_pkt_lso_core);
return (howmany(n, 16));
}
@@ -4591,22 +4794,43 @@
* request header.
*/
static inline u_int
-txpkt_vm_len16(u_int nsegs, u_int tso)
+txpkt_vm_len16(u_int nsegs, const u_int extra)
{
u_int n;
MPASS(nsegs > 0);
nsegs--; /* first segment is part of ulptx_sgl */
- n = sizeof(struct fw_eth_tx_pkt_vm_wr) +
+ n = extra + sizeof(struct fw_eth_tx_pkt_vm_wr) +
sizeof(struct cpl_tx_pkt_core) +
sizeof(struct ulptx_sgl) + 8 * ((3 * nsegs) / 2 + (nsegs & 1));
- if (tso)
- n += sizeof(struct cpl_tx_pkt_lso_core);
return (howmany(n, 16));
}
+static inline void
+calculate_mbuf_len16(struct adapter *sc, struct mbuf *m)
+{
+ const int lso = sizeof(struct cpl_tx_pkt_lso_core);
+ const int tnl_lso = sizeof(struct cpl_tx_tnl_lso);
+
+ if (sc->flags & IS_VF) {
+ if (needs_tso(m))
+ set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), lso));
+ else
+ set_mbuf_len16(m, txpkt_vm_len16(mbuf_nsegs(m), 0));
+ return;
+ }
+
+ if (needs_tso(m)) {
+ if (needs_vxlan_tso(m))
+ set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), tnl_lso));
+ else
+ set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), lso));
+ } else
+ set_mbuf_len16(m, txpkt_len16(mbuf_nsegs(m), 0));
+}
+
/*
* len16 for a txpkts type 0 WR with a GL. Does not include the firmware work
* request header.
@@ -4655,51 +4879,162 @@
csum_to_ctrl(struct adapter *sc, struct mbuf *m)
{
uint64_t ctrl;
- int csum_type;
+ int csum_type, l2hlen, l3hlen;
+ int x, y;
+ static const int csum_types[3][2] = {
+ {TX_CSUM_TCPIP, TX_CSUM_TCPIP6},
+ {TX_CSUM_UDPIP, TX_CSUM_UDPIP6},
+ {TX_CSUM_IP, 0}
+ };
M_ASSERTPKTHDR(m);
- if (needs_hwcsum(m) == 0)
+ if (!needs_hwcsum(m))
return (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS);
+ MPASS(m->m_pkthdr.l2hlen >= ETHER_HDR_LEN);
+ MPASS(m->m_pkthdr.l3hlen >= sizeof(struct ip));
+
+ if (needs_vxlan_csum(m)) {
+ MPASS(m->m_pkthdr.l4hlen > 0);
+ MPASS(m->m_pkthdr.l5hlen > 0);
+ MPASS(m->m_pkthdr.inner_l2hlen >= ETHER_HDR_LEN);
+ MPASS(m->m_pkthdr.inner_l3hlen >= sizeof(struct ip));
+
+ l2hlen = m->m_pkthdr.l2hlen + m->m_pkthdr.l3hlen +
+ m->m_pkthdr.l4hlen + m->m_pkthdr.l5hlen +
+ m->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN;
+ l3hlen = m->m_pkthdr.inner_l3hlen;
+ } else {
+ l2hlen = m->m_pkthdr.l2hlen - ETHER_HDR_LEN;
+ l3hlen = m->m_pkthdr.l3hlen;
+ }
+
ctrl = 0;
- if (needs_l3_csum(m) == 0)
+ if (!needs_l3_csum(m))
ctrl |= F_TXPKT_IPCSUM_DIS;
- switch (m->m_pkthdr.csum_flags &
- (CSUM_IP_TCP | CSUM_IP_UDP | CSUM_IP6_TCP | CSUM_IP6_UDP)) {
- case CSUM_IP_TCP:
- csum_type = TX_CSUM_TCPIP;
- break;
- case CSUM_IP_UDP:
- csum_type = TX_CSUM_UDPIP;
- break;
- case CSUM_IP6_TCP:
- csum_type = TX_CSUM_TCPIP6;
- break;
- case CSUM_IP6_UDP:
- csum_type = TX_CSUM_UDPIP6;
- break;
- default:
- /* needs_hwcsum told us that at least some hwcsum is needed. */
- MPASS(ctrl == 0);
- MPASS(m->m_pkthdr.csum_flags & CSUM_IP);
- ctrl |= F_TXPKT_L4CSUM_DIS;
- csum_type = TX_CSUM_IP;
- break;
- }
- MPASS(m->m_pkthdr.l2hlen > 0);
- MPASS(m->m_pkthdr.l3hlen > 0);
- ctrl |= V_TXPKT_CSUM_TYPE(csum_type) |
- V_TXPKT_IPHDR_LEN(m->m_pkthdr.l3hlen);
+ if (m->m_pkthdr.csum_flags & (CSUM_IP_TCP | CSUM_INNER_IP_TCP |
+ CSUM_IP6_TCP | CSUM_INNER_IP6_TCP))
+ x = 0; /* TCP */
+ else if (m->m_pkthdr.csum_flags & (CSUM_IP_UDP | CSUM_INNER_IP_UDP |
+ CSUM_IP6_UDP | CSUM_INNER_IP6_UDP))
+ x = 1; /* UDP */
+ else
+ x = 2;
+
+ if (m->m_pkthdr.csum_flags & (CSUM_IP | CSUM_IP_TCP | CSUM_IP_UDP |
+ CSUM_INNER_IP | CSUM_INNER_IP_TCP | CSUM_INNER_IP_UDP))
+ y = 0; /* IPv4 */
+ else {
+ MPASS(m->m_pkthdr.csum_flags & (CSUM_IP6_TCP | CSUM_IP6_UDP |
+ CSUM_INNER_IP6_TCP | CSUM_INNER_IP6_UDP));
+ y = 1; /* IPv6 */
+ }
+ /*
+ * needs_hwcsum returned true earlier so there must be some kind of
+ * checksum to calculate.
+ */
+ csum_type = csum_types[x][y];
+ MPASS(csum_type != 0);
+ if (csum_type == TX_CSUM_IP)
+ ctrl |= F_TXPKT_L4CSUM_DIS;
+ ctrl |= V_TXPKT_CSUM_TYPE(csum_type) | V_TXPKT_IPHDR_LEN(l3hlen);
if (chip_id(sc) <= CHELSIO_T5)
- ctrl |= V_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
+ ctrl |= V_TXPKT_ETHHDR_LEN(l2hlen);
else
- ctrl |= V_T6_TXPKT_ETHHDR_LEN(m->m_pkthdr.l2hlen - ETHER_HDR_LEN);
+ ctrl |= V_T6_TXPKT_ETHHDR_LEN(l2hlen);
return (ctrl);
}
+static inline void *
+write_lso_cpl(void *cpl, struct mbuf *m0)
+{
+ struct cpl_tx_pkt_lso_core *lso;
+ uint32_t ctrl;
+
+ KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+ m0->m_pkthdr.l4hlen > 0,
+ ("%s: mbuf %p needs TSO but missing header lengths",
+ __func__, m0));
+
+ ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) |
+ F_LSO_FIRST_SLICE | F_LSO_LAST_SLICE |
+ V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
+ V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
+ V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
+ if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
+ ctrl |= F_LSO_IPV6;
+
+ lso = cpl;
+ lso->lso_ctrl = htobe32(ctrl);
+ lso->ipid_ofst = htobe16(0);
+ lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
+ lso->seqno_offset = htobe32(0);
+ lso->len = htobe32(m0->m_pkthdr.len);
+
+ return (lso + 1);
+}
+
+static void *
+write_tnl_lso_cpl(void *cpl, struct mbuf *m0)
+{
+ struct cpl_tx_tnl_lso *tnl_lso = cpl;
+ uint32_t ctrl;
+
+ KASSERT(m0->m_pkthdr.inner_l2hlen > 0 &&
+ m0->m_pkthdr.inner_l3hlen > 0 && m0->m_pkthdr.inner_l4hlen > 0 &&
+ m0->m_pkthdr.inner_l5hlen > 0,
+ ("%s: mbuf %p needs VXLAN_TSO but missing inner header lengths",
+ __func__, m0));
+ KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
+ m0->m_pkthdr.l4hlen > 0 && m0->m_pkthdr.l5hlen > 0,
+ ("%s: mbuf %p needs VXLAN_TSO but missing outer header lengths",
+ __func__, m0));
+
+ /* Outer headers. */
+ ctrl = V_CPL_TX_TNL_LSO_OPCODE(CPL_TX_TNL_LSO) |
+ F_CPL_TX_TNL_LSO_FIRST | F_CPL_TX_TNL_LSO_LAST |
+ V_CPL_TX_TNL_LSO_ETHHDRLENOUT(
+ (m0->m_pkthdr.l2hlen - ETHER_HDR_LEN) >> 2) |
+ V_CPL_TX_TNL_LSO_IPHDRLENOUT(m0->m_pkthdr.l3hlen >> 2) |
+ F_CPL_TX_TNL_LSO_IPLENSETOUT;
+ if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
+ ctrl |= F_CPL_TX_TNL_LSO_IPV6OUT;
+ else {
+ ctrl |= F_CPL_TX_TNL_LSO_IPHDRCHKOUT |
+ F_CPL_TX_TNL_LSO_IPIDINCOUT;
+ }
+ tnl_lso->op_to_IpIdSplitOut = htobe32(ctrl);
+ tnl_lso->IpIdOffsetOut = 0;
+ tnl_lso->UdpLenSetOut_to_TnlHdrLen =
+ htobe16(F_CPL_TX_TNL_LSO_UDPCHKCLROUT |
+ F_CPL_TX_TNL_LSO_UDPLENSETOUT |
+ V_CPL_TX_TNL_LSO_TNLHDRLEN(m0->m_pkthdr.l2hlen +
+ m0->m_pkthdr.l3hlen + m0->m_pkthdr.l4hlen +
+ m0->m_pkthdr.l5hlen) |
+ V_CPL_TX_TNL_LSO_TNLTYPE(TX_TNL_TYPE_VXLAN));
+ tnl_lso->r1 = 0;
+
+ /* Inner headers. */
+ ctrl = V_CPL_TX_TNL_LSO_ETHHDRLEN(
+ (m0->m_pkthdr.inner_l2hlen - ETHER_HDR_LEN) >> 2) |
+ V_CPL_TX_TNL_LSO_IPHDRLEN(m0->m_pkthdr.inner_l3hlen >> 2) |
+ V_CPL_TX_TNL_LSO_TCPHDRLEN(m0->m_pkthdr.inner_l4hlen >> 2);
+ if (m0->m_pkthdr.inner_l3hlen == sizeof(struct ip6_hdr))
+ ctrl |= F_CPL_TX_TNL_LSO_IPV6;
+ tnl_lso->Flow_to_TcpHdrLen = htobe32(ctrl);
+ tnl_lso->IpIdOffset = 0;
+ tnl_lso->IpIdSplit_to_Mss =
+ htobe16(V_CPL_TX_TNL_LSO_MSS(m0->m_pkthdr.tso_segsz));
+ tnl_lso->TCPSeqOffset = 0;
+ tnl_lso->EthLenOffset_Size =
+ htobe32(V_CPL_TX_TNL_LSO_SIZE(m0->m_pkthdr.len));
+
+ return (tnl_lso + 1);
+}
+
#define VM_TX_L2HDR_LEN 16 /* ethmacdst to vlantci */
/*
@@ -4753,29 +5088,7 @@
m_copydata(m0, 0, VM_TX_L2HDR_LEN, wr->ethmacdst);
if (needs_tso(m0)) {
- struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
-
- KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
- m0->m_pkthdr.l4hlen > 0,
- ("%s: mbuf %p needs TSO but missing header lengths",
- __func__, m0));
-
- ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
- F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
- ETHER_HDR_LEN) >> 2) |
- V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
- V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
- if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
- ctrl |= F_LSO_IPV6;
-
- lso->lso_ctrl = htobe32(ctrl);
- lso->ipid_ofst = htobe16(0);
- lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
- lso->seqno_offset = htobe32(0);
- lso->len = htobe32(pktlen);
-
- cpl = (void *)(lso + 1);
-
+ cpl = write_lso_cpl(wr + 1, m0);
txq->tso_wrs++;
} else
cpl = (void *)(wr + 1);
@@ -4883,9 +5196,12 @@
nsegs = mbuf_nsegs(m0);
pktlen = m0->m_pkthdr.len;
ctrl = sizeof(struct cpl_tx_pkt_core);
- if (needs_tso(m0))
- ctrl += sizeof(struct cpl_tx_pkt_lso_core);
- else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
+ if (needs_tso(m0)) {
+ if (needs_vxlan_tso(m0))
+ ctrl += sizeof(struct cpl_tx_tnl_lso);
+ else
+ ctrl += sizeof(struct cpl_tx_pkt_lso_core);
+ } else if (!(mbuf_cflags(m0) & MC_NOMAP) && pktlen <= imm_payload(2) &&
available >= 2) {
/* Immediate data. Recalculate len16 and set nsegs to 0. */
ctrl += pktlen;
@@ -4907,41 +5223,30 @@
wr->r3 = 0;
if (needs_tso(m0)) {
- struct cpl_tx_pkt_lso_core *lso = (void *)(wr + 1);
-
- KASSERT(m0->m_pkthdr.l2hlen > 0 && m0->m_pkthdr.l3hlen > 0 &&
- m0->m_pkthdr.l4hlen > 0,
- ("%s: mbuf %p needs TSO but missing header lengths",
- __func__, m0));
-
- ctrl = V_LSO_OPCODE(CPL_TX_PKT_LSO) | F_LSO_FIRST_SLICE |
- F_LSO_LAST_SLICE | V_LSO_ETHHDR_LEN((m0->m_pkthdr.l2hlen -
- ETHER_HDR_LEN) >> 2) |
- V_LSO_IPHDR_LEN(m0->m_pkthdr.l3hlen >> 2) |
- V_LSO_TCPHDR_LEN(m0->m_pkthdr.l4hlen >> 2);
- if (m0->m_pkthdr.l3hlen == sizeof(struct ip6_hdr))
- ctrl |= F_LSO_IPV6;
-
- lso->lso_ctrl = htobe32(ctrl);
- lso->ipid_ofst = htobe16(0);
- lso->mss = htobe16(m0->m_pkthdr.tso_segsz);
- lso->seqno_offset = htobe32(0);
- lso->len = htobe32(pktlen);
-
- cpl = (void *)(lso + 1);
-
- txq->tso_wrs++;
+ if (needs_vxlan_tso(m0)) {
+ cpl = write_tnl_lso_cpl(wr + 1, m0);
+ txq->vxlan_tso_wrs++;
+ } else {
+ cpl = write_lso_cpl(wr + 1, m0);
+ txq->tso_wrs++;
+ }
} else
cpl = (void *)(wr + 1);
/* Checksum offload */
ctrl1 = csum_to_ctrl(sc, m0);
- if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
- txq->txcsum++; /* some hardware assistance provided */
+ if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
+ /* some hardware assistance provided */
+ if (needs_vxlan_csum(m0))
+ txq->vxlan_txcsum++;
+ else
+ txq->txcsum++;
+ }
/* VLAN tag insertion */
if (needs_vlan_insertion(m0)) {
- ctrl1 |= F_TXPKT_VLAN_VLD | V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
+ ctrl1 |= F_TXPKT_VLAN_VLD |
+ V_TXPKT_VLAN(m0->m_pkthdr.ether_vtag);
txq->vlan_insertion++;
}
@@ -4953,6 +5258,8 @@
/* SGL */
dst = (void *)(cpl + 1);
+ if (__predict_false((uintptr_t)dst == (uintptr_t)&eq->desc[eq->sidx]))
+ dst = (caddr_t)&eq->desc[0];
if (nsegs > 0) {
write_gl_to_txd(txq, m0, &dst, eq->sidx - ndesc < eq->pidx);
@@ -5198,8 +5505,13 @@
/* Checksum offload */
ctrl1 = csum_to_ctrl(sc, m);
- if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS))
- txq->txcsum++; /* some hardware assistance provided */
+ if (ctrl1 != (F_TXPKT_IPCSUM_DIS | F_TXPKT_L4CSUM_DIS)) {
+ /* some hardware assistance provided */
+ if (needs_vxlan_csum(m))
+ txq->vxlan_txcsum++;
+ else
+ txq->txcsum++;
+ }
/* VLAN tag insertion */
if (needs_vlan_insertion(m)) {
@@ -5958,7 +6270,7 @@
wr->equiq_to_len16 = htobe32(V_FW_WR_LEN16(len16) |
V_FW_WR_FLOWID(cst->etid));
wr->r3 = 0;
- if (needs_udp_csum(m0)) {
+ if (needs_outer_udp_csum(m0)) {
wr->u.udpseg.type = FW_ETH_TX_EO_TYPE_UDPSEG;
wr->u.udpseg.ethlen = m0->m_pkthdr.l2hlen;
wr->u.udpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
@@ -5970,7 +6282,7 @@
wr->u.udpseg.plen = htobe32(pktlen - immhdrs);
cpl = (void *)(wr + 1);
} else {
- MPASS(needs_tcp_csum(m0));
+ MPASS(needs_outer_tcp_csum(m0));
wr->u.tcpseg.type = FW_ETH_TX_EO_TYPE_TCPSEG;
wr->u.tcpseg.ethlen = m0->m_pkthdr.l2hlen;
wr->u.tcpseg.iplen = htobe16(m0->m_pkthdr.l3hlen);
@@ -6007,7 +6319,7 @@
}
/* Checksum offload must be requested for ethofld. */
- MPASS(needs_l4_csum(m0));
+ MPASS(needs_outer_l4_csum(m0));
ctrl1 = csum_to_ctrl(cst->adapter, m0);
/* VLAN tag insertion */
Index: sys/net/if.h
===================================================================
--- sys/net/if.h
+++ sys/net/if.h
@@ -249,6 +249,8 @@
#define IFCAP_NOMAP 0x4000000 /* can TX unmapped mbufs */
#define IFCAP_TXTLS4 0x8000000 /* can do TLS encryption and segmentation for TCP */
#define IFCAP_TXTLS6 0x10000000 /* can do TLS encryption and segmentation for TCP6 */
+#define IFCAP_VXLAN_HWCSUM 0x20000000 /* can do IFCAP_HWCSUM on VXLANs */
+#define IFCAP_VXLAN_HWTSO 0x40000000 /* can do IFCAP_TSO on VXLANs */
#define IFCAP_HWCSUM_IPV6 (IFCAP_RXCSUM_IPV6 | IFCAP_TXCSUM_IPV6)
Index: sys/net/if_vxlan.h
===================================================================
--- sys/net/if_vxlan.h
+++ sys/net/if_vxlan.h
@@ -143,4 +143,11 @@
char vxlcmd_ifname[IFNAMSIZ];
};
+#ifdef _KERNEL
+typedef void (*vxlan_event_handler_t)(void *, struct ifnet *, sa_family_t,
+ u_int);
+EVENTHANDLER_DECLARE(vxlan_start, vxlan_event_handler_t);
+EVENTHANDLER_DECLARE(vxlan_stop, vxlan_event_handler_t);
+#endif
+
#endif /* _NET_IF_VXLAN_H_ */
Index: sys/net/if_vxlan.c
===================================================================
--- sys/net/if_vxlan.c
+++ sys/net/if_vxlan.c
@@ -1,6 +1,7 @@
/*-
* Copyright (c) 2014, Bryan Venteicher <bryanv@FreeBSD.org>
* All rights reserved.
+ * Copyright (c) 2020, Chelsio Communications.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -60,6 +61,8 @@
#include <net/if_types.h>
#include <net/if_vxlan.h>
#include <net/netisr.h>
+#include <net/route.h>
+#include <net/route/nhop.h>
#include <netinet/in.h>
#include <netinet/in_systm.h>
@@ -70,6 +73,8 @@
#include <netinet/ip_var.h>
#include <netinet/udp.h>
#include <netinet/udp_var.h>
+#include <netinet/in_fib.h>
+#include <netinet6/in6_fib.h>
#include <netinet6/ip6_var.h>
#include <netinet6/scope6_var.h>
@@ -92,6 +97,7 @@
sizeof(struct udphdr) - \
sizeof(struct vxlan_header) - \
ETHER_HDR_LEN - ETHER_CRC_LEN - ETHER_VLAN_ENCAP_LEN)
+#define VXLAN_BASIC_IFCAPS (IFCAP_LINKSTATE | IFCAP_JUMBO_MTU)
#define VXLAN_SO_MC_MAX_GROUPS 32
@@ -146,10 +152,14 @@
struct vxlan_statistics {
uint32_t ftable_nospace;
uint32_t ftable_lock_upgrade_failed;
+ counter_u64_t txcsum;
+ counter_u64_t tso;
+ counter_u64_t rxcsum;
};
struct vxlan_softc {
struct ifnet *vxl_ifp;
+ int vxl_reqcap;
struct vxlan_socket *vxl_sock;
uint32_t vxl_vni;
union vxlan_sockaddr vxl_src_addr;
@@ -193,6 +203,10 @@
char vxl_mc_ifname[IFNAMSIZ];
LIST_ENTRY(vxlan_softc) vxl_entry;
LIST_ENTRY(vxlan_softc) vxl_ifdetach_list;
+
+ /* For rate limiting errors on the tx fast path. */
+ struct timeval err_time;
+ int err_pps;
};
#define VXLAN_RLOCK(_sc, _p) rm_rlock(&(_sc)->vxl_lock, (_p))
@@ -297,7 +311,10 @@
static int vxlan_setup_multicast(struct vxlan_softc *);
static int vxlan_setup_socket(struct vxlan_softc *);
-static void vxlan_setup_interface(struct vxlan_softc *);
+#ifdef INET6
+static void vxlan_setup_zero_checksum_port(struct vxlan_softc *);
+#endif
+static void vxlan_setup_interface_hdrlen(struct vxlan_softc *);
static int vxlan_valid_init_config(struct vxlan_softc *);
static void vxlan_init_wait(struct vxlan_softc *);
static void vxlan_init_complete(struct vxlan_softc *);
@@ -347,9 +364,13 @@
static int vxlan_input(struct vxlan_socket *, uint32_t, struct mbuf **,
const struct sockaddr *);
+static int vxlan_stats_alloc(struct vxlan_softc *);
+static void vxlan_stats_free(struct vxlan_softc *);
static void vxlan_set_default_config(struct vxlan_softc *);
static int vxlan_set_user_config(struct vxlan_softc *,
struct ifvxlanparam *);
+static int vxlan_set_reqcap(struct vxlan_softc *, struct ifnet *, int);
+static void vxlan_set_hwcaps(struct vxlan_softc *);
static int vxlan_clone_create(struct if_clone *, int, caddr_t);
static void vxlan_clone_destroy(struct ifnet *);
@@ -1555,9 +1576,44 @@
return (error);
}
+#ifdef INET6
static void
-vxlan_setup_interface(struct vxlan_softc *sc)
+vxlan_setup_zero_checksum_port(struct vxlan_softc *sc)
{
+
+ if (!VXLAN_SOCKADDR_IS_IPV6(&sc->vxl_src_addr))
+ return;
+
+ MPASS(sc->vxl_src_addr.in6.sin6_port != 0);
+ MPASS(sc->vxl_dst_addr.in6.sin6_port != 0);
+
+ if (sc->vxl_src_addr.in6.sin6_port != sc->vxl_dst_addr.in6.sin6_port) {
+ if_printf(sc->vxl_ifp, "port %d in src address does not match "
+ "port %d in dst address, rfc6935_port (%d) not updated.\n",
+ ntohs(sc->vxl_src_addr.in6.sin6_port),
+ ntohs(sc->vxl_dst_addr.in6.sin6_port),
+ V_zero_checksum_port);
+ return;
+ }
+
+ if (V_zero_checksum_port != 0) {
+ if (V_zero_checksum_port != sc->vxl_src_addr.in6.sin6_port) {
+ if_printf(sc->vxl_ifp, "rfc6935_port is already set to "
+ "%d, cannot set it to %d.\n", V_zero_checksum_port,
+ ntohs(sc->vxl_src_addr.in6.sin6_port));
+ }
+ return;
+ }
+
+ V_zero_checksum_port = ntohs(sc->vxl_src_addr.in6.sin6_port);
+ if_printf(sc->vxl_ifp, "rfc6935_port set to %d\n",
+ V_zero_checksum_port);
+}
+#endif
+
+static void
+vxlan_setup_interface_hdrlen(struct vxlan_softc *sc)
+{
struct ifnet *ifp;
ifp = sc->vxl_ifp;
@@ -1666,11 +1722,13 @@
if (vxlan_valid_init_config(sc) != 0)
goto out;
- vxlan_setup_interface(sc);
-
if (vxlan_setup_socket(sc) != 0)
goto out;
+#ifdef INET6
+ vxlan_setup_zero_checksum_port(sc);
+#endif
+
/* Initialize the default forwarding entry. */
vxlan_ftable_entry_init(sc, &sc->vxl_default_fe, empty_mac,
&sc->vxl_dst_addr.sa, VXLAN_FE_FLAG_STATIC);
@@ -1682,6 +1740,9 @@
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_UP);
+
+ EVENTHANDLER_INVOKE(vxlan_start, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
out:
vxlan_init_complete(sc);
}
@@ -1738,6 +1799,8 @@
VXLAN_WUNLOCK(sc);
if_link_state_change(ifp, LINK_STATE_DOWN);
+ EVENTHANDLER_INVOKE(vxlan_stop, ifp, sc->vxl_src_addr.in4.sin_family,
+ ntohs(sc->vxl_src_addr.in4.sin_port));
if (vso != NULL) {
vxlan_socket_remove_softc(vso, sc);
@@ -1907,6 +1970,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_src_addr, &vxlsa->sa);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -1936,6 +2000,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
vxlan_sockaddr_in_copy(&sc->vxl_dst_addr, &vxlsa->sa);
+ vxlan_setup_interface_hdrlen(sc);
error = 0;
} else
error = EBUSY;
@@ -2063,6 +2128,7 @@
VXLAN_WLOCK(sc);
if (vxlan_can_change_config(sc)) {
strlcpy(sc->vxl_mc_ifname, cmd->vxlcmd_ifname, IFNAMSIZ);
+ vxlan_set_hwcaps(sc);
error = 0;
} else
error = EBUSY;
@@ -2284,6 +2350,14 @@
ifp->if_mtu = ifr->ifr_mtu;
break;
+ case SIOCSIFCAP:
+ VXLAN_WLOCK(sc);
+ error = vxlan_set_reqcap(sc, ifp, ifr->ifr_reqcap);
+ if (error == 0)
+ vxlan_set_hwcaps(sc);
+ VXLAN_WUNLOCK(sc);
+ break;
+
default:
error = ether_ioctl(ifp, cmd, data);
break;
@@ -2335,6 +2409,48 @@
}
#endif
+/*
+ * Return the CSUM_INNER_* equivalent of CSUM_* caps.
+ */
+static uint32_t
+csum_flags_to_inner_flags(uint32_t csum_flags_in, uint32_t encap)
+{
+ uint32_t csum_flags = encap;
+ const uint32_t v4 = CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP;
+
+ /*
+ * csum_flags can request either v4 or v6 offload but not both.
+ * tcp_output always sets CSUM_TSO (both CSUM_IP_TSO and CSUM_IP6_TSO)
+ * so those bits are no good to detect the IP version. Other bits are
+ * always set with CSUM_TSO and we use those to figure out the IP
+ * version.
+ */
+ if (csum_flags_in & v4) {
+ if (csum_flags_in & CSUM_IP)
+ csum_flags |= CSUM_INNER_IP;
+ if (csum_flags_in & CSUM_IP_UDP)
+ csum_flags |= CSUM_INNER_IP_UDP;
+ if (csum_flags_in & CSUM_IP_TCP)
+ csum_flags |= CSUM_INNER_IP_TCP;
+ if (csum_flags_in & CSUM_IP_TSO)
+ csum_flags |= CSUM_INNER_IP_TSO;
+ } else {
+#ifdef INVARIANTS
+ const uint32_t v6 = CSUM_IP6_UDP | CSUM_IP6_TCP;
+
+ MPASS((csum_flags_in & v6) != 0);
+#endif
+ if (csum_flags_in & CSUM_IP6_UDP)
+ csum_flags |= CSUM_INNER_IP6_UDP;
+ if (csum_flags_in & CSUM_IP6_TCP)
+ csum_flags |= CSUM_INNER_IP6_TCP;
+ if (csum_flags_in & CSUM_IP6_TSO)
+ csum_flags |= CSUM_INNER_IP6_TSO;
+ }
+
+ return (csum_flags);
+}
+
static int
vxlan_encap4(struct vxlan_softc *sc, const union vxlan_sockaddr *fvxlsa,
struct mbuf *m)
@@ -2345,7 +2461,12 @@
struct in_addr srcaddr, dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route route, *ro;
+ struct sockaddr_in *sin;
+ uint32_t csum_flags;
+ NET_EPOCH_ASSERT();
+
ifp = sc->vxl_ifp;
srcaddr = sc->vxl_src_addr.in4.sin_addr;
srcport = vxlan_pick_source_port(sc, m);
@@ -2376,7 +2497,57 @@
mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
m->m_flags &= ~(M_MCAST | M_BCAST);
- error = ip_output(m, NULL, NULL, 0, sc->vxl_im4o, NULL);
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin = (struct sockaddr_in *)&ro->ro_dst;
+ sin->sin_family = AF_INET;
+ sin->sin_len = sizeof(*sin);
+ sin->sin_addr = ip->ip_dst;
+ ro->ro_nh = fib4_lookup(RT_DEFAULT_FIB, ip->ip_dst, 0, NHR_NONE,
+ 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else
+ ro = NULL;
+ error = ip_output(m, NULL, ro, 0, sc->vxl_im4o, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2402,7 +2573,12 @@
const struct in6_addr *srcaddr, *dstaddr;
uint16_t srcport, dstport;
int len, mcast, error;
+ struct route_in6 route, *ro;
+ struct sockaddr_in6 *sin6;
+ uint32_t csum_flags;
+ NET_EPOCH_ASSERT();
+
ifp = sc->vxl_ifp;
srcaddr = &sc->vxl_src_addr.in6.sin6_addr;
srcport = vxlan_pick_source_port(sc, m);
@@ -2429,22 +2605,67 @@
vxlan_encap_header(sc, m, sizeof(struct ip6_hdr), srcport, dstport);
- /*
- * XXX BMV We need support for RFC6935 before we can send and
- * receive IPv6 UDP packets with a zero checksum.
- */
- {
+ mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
+ m->m_flags &= ~(M_MCAST | M_BCAST);
+
+ ro = NULL;
+ m->m_pkthdr.csum_flags &= CSUM_FLAGS_TX;
+ if (m->m_pkthdr.csum_flags != 0) {
+ /*
+ * HW checksum (L3 and/or L4) or TSO has been requested. Look
+ * up the ifnet for the outbound route and verify that the
+ * outbound ifnet can perform the requested operation on the
+ * inner frame.
+ */
+ bzero(&route, sizeof(route));
+ ro = &route;
+ sin6 = (struct sockaddr_in6 *)&ro->ro_dst;
+ sin6->sin6_family = AF_INET6;
+ sin6->sin6_len = sizeof(*sin6);
+ sin6->sin6_addr = ip6->ip6_dst;
+ ro->ro_nh = fib6_lookup(RT_DEFAULT_FIB, &ip6->ip6_dst, 0,
+ NHR_NONE, 0);
+ if (ro->ro_nh == NULL) {
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (EHOSTUNREACH);
+ }
+
+ csum_flags = csum_flags_to_inner_flags(m->m_pkthdr.csum_flags,
+ CSUM_ENCAP_VXLAN);
+ if ((csum_flags & ro->ro_nh->nh_ifp->if_hwassist) !=
+ csum_flags) {
+ if (ppsratecheck(&sc->err_time, &sc->err_pps, 1)) {
+ const struct ifnet *nh_ifp = ro->ro_nh->nh_ifp;
+
+ if_printf(ifp, "interface %s is missing hwcaps "
+ "0x%08x, csum_flags 0x%08x -> 0x%08x, "
+ "hwassist 0x%08x\n", nh_ifp->if_xname,
+ csum_flags & ~(uint32_t)nh_ifp->if_hwassist,
+ m->m_pkthdr.csum_flags, csum_flags,
+ (uint32_t)nh_ifp->if_hwassist);
+ }
+ m_freem(m);
+ if_inc_counter(ifp, IFCOUNTER_OERRORS, 1);
+ return (ENXIO);
+ }
+ m->m_pkthdr.csum_flags = csum_flags;
+ if (csum_flags &
+ (CSUM_INNER_IP | CSUM_INNER_IP_UDP | CSUM_INNER_IP6_UDP |
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP6_TCP)) {
+ counter_u64_add(sc->vxl_stats.txcsum, 1);
+ if (csum_flags & CSUM_INNER_TSO)
+ counter_u64_add(sc->vxl_stats.tso, 1);
+ }
+ } else if (ntohs(dstport) != V_zero_checksum_port) {
struct udphdr *hdr = mtodo(m, sizeof(struct ip6_hdr));
+
hdr->uh_sum = in6_cksum_pseudo(ip6,
m->m_pkthdr.len - sizeof(struct ip6_hdr), IPPROTO_UDP, 0);
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
-
- mcast = (m->m_flags & (M_MCAST | M_BCAST)) ? 1 : 0;
- m->m_flags &= ~(M_MCAST | M_BCAST);
-
- error = ip6_output(m, NULL, NULL, 0, sc->vxl_im6o, NULL, NULL);
+ error = ip6_output(m, NULL, ro, 0, sc->vxl_im6o, NULL, NULL);
if (error == 0) {
if_inc_counter(ifp, IFCOUNTER_OPACKETS, 1);
if_inc_counter(ifp, IFCOUNTER_OBYTES, len);
@@ -2593,8 +2814,30 @@
m_clrprotoflags(m);
m->m_pkthdr.rcvif = ifp;
M_SETFIB(m, ifp->if_fib);
+ if (m->m_pkthdr.csum_flags & CSUM_ENCAP_VXLAN &&
+ ((ifp->if_capenable & IFCAP_RXCSUM &&
+ m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC) ||
+ (ifp->if_capenable & IFCAP_RXCSUM_IPV6 &&
+ !(m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)))) {
+ uint32_t csum_flags = 0;
- error = netisr_queue_src(NETISR_ETHER, 0, m);
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_CALC)
+ csum_flags |= CSUM_L3_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L3_VALID)
+ csum_flags |= CSUM_L3_VALID;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_CALC)
+ csum_flags |= CSUM_L4_CALC;
+ if (m->m_pkthdr.csum_flags & CSUM_INNER_L4_VALID)
+ csum_flags |= CSUM_L4_VALID;
+ m->m_pkthdr.csum_flags = csum_flags;
+ counter_u64_add(sc->vxl_stats.rxcsum, 1);
+ } else {
+ /* clear everything */
+ m->m_pkthdr.csum_flags = 0;
+ m->m_pkthdr.csum_data = 0;
+ }
+
+ error = netisr_dispatch(NETISR_ETHER, m);
*m0 = NULL;
out:
@@ -2602,7 +2845,49 @@
return (error);
}
+static int
+vxlan_stats_alloc(struct vxlan_softc *sc)
+{
+ struct vxlan_statistics *stats = &sc->vxl_stats;
+
+ stats->txcsum = counter_u64_alloc(M_WAITOK);
+ if (stats->txcsum == NULL)
+ goto failed;
+
+ stats->tso = counter_u64_alloc(M_WAITOK);
+ if (stats->tso == NULL)
+ goto failed;
+
+ stats->rxcsum = counter_u64_alloc(M_WAITOK);
+ if (stats->rxcsum == NULL)
+ goto failed;
+
+ return (0);
+failed:
+ vxlan_stats_free(sc);
+ return (ENOMEM);
+}
+
static void
+vxlan_stats_free(struct vxlan_softc *sc)
+{
+ struct vxlan_statistics *stats = &sc->vxl_stats;
+
+ if (stats->txcsum != NULL) {
+ counter_u64_free(stats->txcsum);
+ stats->txcsum = NULL;
+ }
+ if (stats->tso != NULL) {
+ counter_u64_free(stats->tso);
+ stats->tso = NULL;
+ }
+ if (stats->rxcsum != NULL) {
+ counter_u64_free(stats->rxcsum);
+ stats->rxcsum = NULL;
+ }
+}
+
+static void
vxlan_set_default_config(struct vxlan_softc *sc)
{
@@ -2722,6 +3007,142 @@
}
static int
+vxlan_set_reqcap(struct vxlan_softc *sc, struct ifnet *ifp, int reqcap)
+{
+ int mask = reqcap ^ ifp->if_capenable;
+
+ /* Disable TSO if tx checksums are disabled. */
+ if (mask & IFCAP_TXCSUM && !(reqcap & IFCAP_TXCSUM) &&
+ reqcap & IFCAP_TSO4) {
+ reqcap &= ~IFCAP_TSO4;
+ if_printf(ifp, "tso4 disabled due to -txcsum.\n");
+ }
+ if (mask & IFCAP_TXCSUM_IPV6 && !(reqcap & IFCAP_TXCSUM_IPV6) &&
+ reqcap & IFCAP_TSO6) {
+ reqcap &= ~IFCAP_TSO6;
+ if_printf(ifp, "tso6 disabled due to -txcsum6.\n");
+ }
+
+ /* Do not enable TSO if tx checksums are disabled. */
+ if (mask & IFCAP_TSO4 && reqcap & IFCAP_TSO4 &&
+ !(reqcap & IFCAP_TXCSUM)) {
+ if_printf(ifp, "enable txcsum first.\n");
+ return (EAGAIN);
+ }
+ if (mask & IFCAP_TSO6 && reqcap & IFCAP_TSO6 &&
+ !(reqcap & IFCAP_TXCSUM_IPV6)) {
+ if_printf(ifp, "enable txcsum6 first.\n");
+ return (EAGAIN);
+ }
+
+ sc->vxl_reqcap = reqcap;
+ return (0);
+}
+
+/*
+ * A VXLAN interface inherits the capabilities of the vxlandev or the interface
+ * hosting the vxlanlocal address.
+ */
+static void
+vxlan_set_hwcaps(struct vxlan_softc *sc)
+{
+ struct epoch_tracker et;
+ struct ifnet *p;
+ struct ifaddr *ifa;
+ u_long hwa;
+ int cap, ena;
+ bool rel;
+ struct ifnet *ifp = sc->vxl_ifp;
+
+ /* reset caps */
+ ifp->if_capabilities &= VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable &= VXLAN_BASIC_IFCAPS;
+ ifp->if_hwassist = 0;
+
+ NET_EPOCH_ENTER(et);
+ CURVNET_SET(ifp->if_vnet);
+
+ rel = false;
+ p = NULL;
+ if (sc->vxl_mc_ifname[0] != '\0') {
+ rel = true;
+ p = ifunit_ref(sc->vxl_mc_ifname);
+ } else if (vxlan_sockaddr_in_any(&sc->vxl_src_addr) == 0) {
+ if (sc->vxl_src_addr.sa.sa_family == AF_INET) {
+ struct sockaddr_in in4 = sc->vxl_src_addr.in4;
+
+ in4.sin_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in4);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ } else if (sc->vxl_src_addr.sa.sa_family == AF_INET6) {
+ struct sockaddr_in6 in6 = sc->vxl_src_addr.in6;
+
+ in6.sin6_port = 0;
+ ifa = ifa_ifwithaddr((struct sockaddr *)&in6);
+ if (ifa != NULL)
+ p = ifa->ifa_ifp;
+ }
+ }
+ if (p == NULL)
+ goto done;
+
+ cap = ena = hwa = 0;
+
+ /* checksum offload */
+ if (p->if_capabilities & IFCAP_VXLAN_HWCSUM)
+ cap |= p->if_capabilities & (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ if (p->if_capenable & IFCAP_VXLAN_HWCSUM) {
+ ena |= sc->vxl_reqcap & p->if_capenable &
+ (IFCAP_HWCSUM | IFCAP_HWCSUM_IPV6);
+ if (ena & IFCAP_TXCSUM) {
+ if (p->if_hwassist & CSUM_INNER_IP)
+ hwa |= CSUM_IP;
+ if (p->if_hwassist & CSUM_INNER_IP_UDP)
+ hwa |= CSUM_IP_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP_TCP)
+ hwa |= CSUM_IP_TCP;
+ }
+ if (ena & IFCAP_TXCSUM_IPV6) {
+ if (p->if_hwassist & CSUM_INNER_IP6_UDP)
+ hwa |= CSUM_IP6_UDP;
+ if (p->if_hwassist & CSUM_INNER_IP6_TCP)
+ hwa |= CSUM_IP6_TCP;
+ }
+ }
+
+ /* hardware TSO */
+ if (p->if_capabilities & IFCAP_VXLAN_HWTSO) {
+ cap |= p->if_capabilities & IFCAP_TSO;
+ if (p->if_hw_tsomax > IP_MAXPACKET - ifp->if_hdrlen)
+ ifp->if_hw_tsomax = IP_MAXPACKET - ifp->if_hdrlen;
+ else
+ ifp->if_hw_tsomax = p->if_hw_tsomax;
+ /* XXX: tsomaxsegcount decrement is cxgbe specific */
+ ifp->if_hw_tsomaxsegcount = p->if_hw_tsomaxsegcount - 1;
+ ifp->if_hw_tsomaxsegsize = p->if_hw_tsomaxsegsize;
+ }
+ if (p->if_capenable & IFCAP_VXLAN_HWTSO) {
+ ena |= sc->vxl_reqcap & p->if_capenable & IFCAP_TSO;
+ if (ena & IFCAP_TSO) {
+ if (p->if_hwassist & CSUM_INNER_IP_TSO)
+ hwa |= CSUM_IP_TSO;
+ if (p->if_hwassist & CSUM_INNER_IP6_TSO)
+ hwa |= CSUM_IP6_TSO;
+ }
+ }
+
+ ifp->if_capabilities |= cap;
+ ifp->if_capenable |= ena;
+ ifp->if_hwassist |= hwa;
+ if (rel)
+ if_rele(p);
+done:
+ CURVNET_RESTORE();
+ NET_EPOCH_EXIT(et);
+}
+
+static int
vxlan_clone_create(struct if_clone *ifc, int unit, caddr_t params)
{
struct vxlan_softc *sc;
@@ -2732,6 +3153,9 @@
sc = malloc(sizeof(struct vxlan_softc), M_VXLAN, M_WAITOK | M_ZERO);
sc->vxl_unit = unit;
vxlan_set_default_config(sc);
+ error = vxlan_stats_alloc(sc);
+ if (error != 0)
+ goto fail;
if (params != 0) {
error = copyin(params, &vxlp, sizeof(vxlp));
@@ -2764,8 +3188,10 @@
ifp->if_ioctl = vxlan_ioctl;
ifp->if_transmit = vxlan_transmit;
ifp->if_qflush = vxlan_qflush;
- ifp->if_capabilities |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
- ifp->if_capenable |= IFCAP_LINKSTATE | IFCAP_JUMBO_MTU;
+ ifp->if_capabilities = VXLAN_BASIC_IFCAPS;
+ ifp->if_capenable = VXLAN_BASIC_IFCAPS;
+ sc->vxl_reqcap = -1;
+ vxlan_set_hwcaps(sc);
ifmedia_init(&sc->vxl_media, 0, vxlan_media_change, vxlan_media_status);
ifmedia_add(&sc->vxl_media, IFM_ETHER | IFM_AUTO, 0, NULL);
@@ -2775,7 +3201,7 @@
ether_ifattach(ifp, sc->vxl_hwaddr.octet);
ifp->if_baudrate = 0;
- ifp->if_hdrlen = 0;
+ vxlan_setup_interface_hdrlen(sc);
return (0);
@@ -2803,6 +3229,7 @@
vxlan_sysctl_destroy(sc);
rm_destroy(&sc->vxl_lock);
+ vxlan_stats_free(sc);
free(sc, M_VXLAN);
}
@@ -3087,6 +3514,15 @@
"ftable_lock_upgrade_failed", CTLFLAG_RD,
&stats->ftable_lock_upgrade_failed, 0,
"Forwarding table update required lock upgrade");
+
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "txcsum",
+ CTLFLAG_RD, &stats->txcsum,
+ "# of times hardware assisted with tx checksum");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "tso",
+ CTLFLAG_RD, &stats->tso, "# of times hardware assisted with TSO");
+ SYSCTL_ADD_COUNTER_U64(ctx, SYSCTL_CHILDREN(node), OID_AUTO, "rxcsum",
+ CTLFLAG_RD, &stats->rxcsum,
+ "# of times hardware assisted with rx checksum");
}
static void
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -769,9 +769,13 @@
/*
* If small enough for interface, or the interface will take
* care of the fragmentation for us, we can just send directly.
+ * Note that if_vxlan could have requested TSO even though the outer
+ * frame is UDP. It is correct to not fragment such datagrams and
+ * instead just pass them on to the driver.
*/
if (ip_len <= mtu ||
- (m->m_pkthdr.csum_flags & ifp->if_hwassist & CSUM_TSO) != 0) {
+ (m->m_pkthdr.csum_flags & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) {
ip->ip_sum = 0;
if (m->m_pkthdr.csum_flags & CSUM_IP & ~ifp->if_hwassist) {
ip->ip_sum = in_cksum(m, hlen);
@@ -785,7 +789,8 @@
* once instead of for every generated packet.
*/
if (!(flags & IP_FORWARDING) && ia) {
- if (m->m_pkthdr.csum_flags & CSUM_TSO)
+ if (m->m_pkthdr.csum_flags &
+ (CSUM_TSO | CSUM_INNER_TSO))
counter_u64_add(ia->ia_ifa.ifa_opackets,
m->m_pkthdr.len / m->m_pkthdr.tso_segsz);
else
@@ -809,7 +814,8 @@
}
/* Balk when DF bit is set or the interface didn't support TSO. */
- if ((ip_off & IP_DF) || (m->m_pkthdr.csum_flags & CSUM_TSO)) {
+ if ((ip_off & IP_DF) ||
+ (m->m_pkthdr.csum_flags & (CSUM_TSO | CSUM_INNER_TSO))) {
error = EMSGSIZE;
IPSTAT_INC(ips_cantfrag);
goto bad;
Index: sys/netinet/udp_var.h
===================================================================
--- sys/netinet/udp_var.h
+++ sys/netinet/udp_var.h
@@ -154,6 +154,9 @@
#define V_udp_blackhole VNET(udp_blackhole)
#define V_udp_log_in_vain VNET(udp_log_in_vain)
+VNET_DECLARE(int, zero_checksum_port);
+#define V_zero_checksum_port VNET(zero_checksum_port)
+
static __inline struct inpcbinfo *
udp_get_inpcbinfo(int protocol)
{
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -1120,7 +1120,8 @@
*/
sw_csum = m->m_pkthdr.csum_flags;
if (!hdrsplit) {
- tso = ((sw_csum & ifp->if_hwassist & CSUM_TSO) != 0) ? 1 : 0;
+ tso = ((sw_csum & ifp->if_hwassist &
+ (CSUM_TSO | CSUM_INNER_TSO)) != 0) ? 1 : 0;
sw_csum &= ~ifp->if_hwassist;
} else
tso = 0;
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -124,6 +124,11 @@
#include <security/mac/mac_framework.h>
+VNET_DEFINE(int, zero_checksum_port) = 0;
+#define V_zero_checksum_port VNET(zero_checksum_port)
+SYSCTL_INT(_net_inet6_udp6, OID_AUTO, rfc6935_port, CTLFLAG_VNET | CTLFLAG_RW,
+ &VNET_NAME(zero_checksum_port), 0,
+ "Zero UDP checksum allowed for traffic to/from this port.");
/*
* UDP protocol implementation.
* Per RFC 768, August, 1980.
@@ -268,7 +273,14 @@
}
if (uh->uh_sum == 0) {
UDPSTAT_INC(udps_nosum);
- goto badunlocked;
+ /*
+ * dport 0 was rejected earlier so this is OK even if
+ * zero_checksum_port is 0 (which is its default value).
+ */
+ if (ntohs(uh->uh_dport) == V_zero_checksum_port)
+ goto skip_checksum;
+ else
+ goto badunlocked;
}
}
@@ -288,6 +300,7 @@
goto badunlocked;
}
+skip_checksum:
/*
* Construct sockaddr format source address.
*/
Index: sys/sys/mbuf.h
===================================================================
--- sys/sys/mbuf.h
+++ sys/sys/mbuf.h
@@ -171,7 +171,10 @@
uint8_t l3hlen; /* layer 3 hdr len */
uint8_t l4hlen; /* layer 4 hdr len */
uint8_t l5hlen; /* layer 5 hdr len */
- uint32_t spare;
+ uint8_t inner_l2hlen;
+ uint8_t inner_l3hlen;
+ uint8_t inner_l4hlen;
+ uint8_t inner_l5hlen;
};
};
union {
@@ -616,7 +619,13 @@
* Outbound flags that are set by upper protocol layers requesting lower
* layers, or ideally the hardware, to perform these offloading tasks.
* For outbound packets this field and its flags can be directly tested
- * against ifnet if_hwassist.
+ * against ifnet if_hwassist. Note that the outbound and the inbound flags do
+ * not collide right now but they could be allowed to (as long as the flags are
+ * scrubbed appropriately when the direction of an mbuf changes). CSUM_BITS
+ * would also have to be split into CSUM_BITS_TX and CSUM_BITS_RX.
+ *
+ * CSUM_INNER_<x> is the same as CSUM_<x> but it applies to the inner frame.
+ * The CSUM_ENCAP_<x> bits identify the outer encapsulation.
*/
#define CSUM_IP 0x00000001 /* IP header checksum offload */
#define CSUM_IP_UDP 0x00000002 /* UDP checksum offload */
@@ -625,13 +634,28 @@
#define CSUM_IP_TSO 0x00000010 /* TCP segmentation offload */
#define CSUM_IP_ISCSI 0x00000020 /* iSCSI checksum offload */
+#define CSUM_INNER_IP6_UDP 0x00000040
+#define CSUM_INNER_IP6_TCP 0x00000080
+#define CSUM_INNER_IP6_TSO 0x00000100
#define CSUM_IP6_UDP 0x00000200 /* UDP checksum offload */
#define CSUM_IP6_TCP 0x00000400 /* TCP checksum offload */
#define CSUM_IP6_SCTP 0x00000800 /* SCTP checksum offload */
#define CSUM_IP6_TSO 0x00001000 /* TCP segmentation offload */
#define CSUM_IP6_ISCSI 0x00002000 /* iSCSI checksum offload */
+#define CSUM_INNER_IP 0x00004000
+#define CSUM_INNER_IP_UDP 0x00008000
+#define CSUM_INNER_IP_TCP 0x00010000
+#define CSUM_INNER_IP_TSO 0x00020000
+
+#define CSUM_ENCAP_VXLAN 0x00040000 /* VXLAN outer encapsulation */
+#define CSUM_ENCAP_RSVD1 0x00080000
+
/* Inbound checksum support where the checksum was verified by hardware. */
+#define CSUM_INNER_L3_CALC 0x00100000
+#define CSUM_INNER_L3_VALID 0x00200000
+#define CSUM_INNER_L4_CALC 0x00400000
+#define CSUM_INNER_L4_VALID 0x00800000
#define CSUM_L3_CALC 0x01000000 /* calculated layer 3 csum */
#define CSUM_L3_VALID 0x02000000 /* checksum is correct */
#define CSUM_L4_CALC 0x04000000 /* calculated layer 4 csum */
@@ -642,16 +666,31 @@
#define CSUM_SND_TAG 0x80000000 /* Packet header has send tag */
+#define CSUM_FLAGS_TX (CSUM_IP | CSUM_IP_UDP | CSUM_IP_TCP | CSUM_IP_SCTP | \
+ CSUM_IP_TSO | CSUM_IP_ISCSI | CSUM_INNER_IP6_UDP | CSUM_INNER_IP6_TCP | \
+ CSUM_INNER_IP6_TSO | CSUM_IP6_UDP | CSUM_IP6_TCP | CSUM_IP6_SCTP | \
+ CSUM_IP6_TSO | CSUM_IP6_ISCSI | CSUM_INNER_IP | CSUM_INNER_IP_UDP | \
+ CSUM_INNER_IP_TCP | CSUM_INNER_IP_TSO | CSUM_ENCAP_VXLAN | \
+ CSUM_ENCAP_RSVD1 | CSUM_SND_TAG)
+
+#define CSUM_FLAGS_RX (CSUM_INNER_L3_CALC | CSUM_INNER_L3_VALID | \
+ CSUM_INNER_L4_CALC | CSUM_INNER_L4_VALID | CSUM_L3_CALC | CSUM_L3_VALID | \
+ CSUM_L4_CALC | CSUM_L4_VALID | CSUM_L5_CALC | CSUM_L5_VALID | \
+ CSUM_COALESCED)
+
/*
* CSUM flag description for use with printf(9) %b identifier.
*/
#define CSUM_BITS \
"\20\1CSUM_IP\2CSUM_IP_UDP\3CSUM_IP_TCP\4CSUM_IP_SCTP\5CSUM_IP_TSO" \
- "\6CSUM_IP_ISCSI" \
- "\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP\15CSUM_IP6_TSO" \
- "\16CSUM_IP6_ISCSI" \
- "\31CSUM_L3_CALC\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID" \
- "\35CSUM_L5_CALC\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
+ "\6CSUM_IP_ISCSI\7CSUM_INNER_IP6_UDP\10CSUM_INNER_IP6_TCP" \
+ "\11CSUM_INNER_IP6_TSO\12CSUM_IP6_UDP\13CSUM_IP6_TCP\14CSUM_IP6_SCTP" \
+ "\15CSUM_IP6_TSO\16CSUM_IP6_ISCSI\17CSUM_INNER_IP\20CSUM_INNER_IP_UDP" \
+ "\21CSUM_INNER_IP_TCP\22CSUM_INNER_IP_TSO\23CSUM_ENCAP_VXLAN" \
+ "\24CSUM_ENCAP_RSVD1\25CSUM_INNER_L3_CALC\26CSUM_INNER_L3_VALID" \
+ "\27CSUM_INNER_L4_CALC\30CSUM_INNER_L4_VALID\31CSUM_L3_CALC" \
+ "\32CSUM_L3_VALID\33CSUM_L4_CALC\34CSUM_L4_VALID\35CSUM_L5_CALC" \
+ "\36CSUM_L5_VALID\37CSUM_COALESCED\40CSUM_SND_TAG"
/* CSUM flags compatibility mappings. */
#define CSUM_IP_CHECKED CSUM_L3_CALC
@@ -667,6 +706,7 @@
#define CSUM_UDP CSUM_IP_UDP
#define CSUM_SCTP CSUM_IP_SCTP
#define CSUM_TSO (CSUM_IP_TSO|CSUM_IP6_TSO)
+#define CSUM_INNER_TSO (CSUM_INNER_IP_TSO|CSUM_INNER_IP6_TSO)
#define CSUM_UDP_IPV6 CSUM_IP6_UDP
#define CSUM_TCP_IPV6 CSUM_IP6_TCP
#define CSUM_SCTP_IPV6 CSUM_IP6_SCTP
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Nov 15, 6:41 PM (1 h, 2 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
14644904
Default Alt Text
D25873.id76008.diff (73 KB)
Attached To
Mode
D25873: vxlan(4): Support for stateless NIC hardware offloads with VXLAN encapsulated traffic.
Attached
Detach File
Event Timeline
Log In to Comment