Page MenuHomeFreeBSD

D527.id1109.diff
No OneTemporary

D527.id1109.diff

Index: sys/netinet/in_rss.h
===================================================================
--- sys/netinet/in_rss.h
+++ sys/netinet/in_rss.h
@@ -83,6 +83,16 @@
#define RSS_KEYSIZE 40
/*
+ * For RSS hash methods that do a software hash on an mbuf, the packet
+ * direction (ingress / egress) is required.
+ *
+ * The default direction (INGRESS) is "received into the NIC", i.e. the
+ * direction the hardware itself hashes on.
+ */
+#define RSS_HASH_PKT_INGRESS 0
+#define RSS_HASH_PKT_EGRESS 1
+
+/*
* Device driver interfaces to query RSS properties that must be programmed
* into hardware.
*/
@@ -116,4 +126,15 @@
uint32_t *bucket_id);
int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id);
+/*
+ * Functions to calculate a software RSS hash for a given mbuf or
+ * packet detail.
+ */
+int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir,
+ uint32_t *hashval, uint32_t *hashtype);
+int rss_proto_software_hash_v4(struct in_addr src,
+ struct in_addr dst, u_short src_port, u_short dst_port,
+ int proto, int dir, uint32_t *hashval,
+ uint32_t *hashtype);
+
#endif /* !_NETINET_IN_RSS_H_ */
Index: sys/netinet/in_rss.c
===================================================================
--- sys/netinet/in_rss.c
+++ sys/netinet/in_rss.c
@@ -57,6 +57,11 @@
#include <netinet/in_var.h>
#include <netinet/toeplitz.h>
+/* for software rss hash support */
+#include <netinet/ip.h>
+#include <netinet/tcp.h>
+#include <netinet/udp.h>
+
/*-
* Operating system parts of receiver-side scaling (RSS), which allows
* network cards to direct flows to particular receive queues based on hashes
@@ -170,6 +175,8 @@
};
static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN];
+static inline u_int rss_gethashconfig_local(void);
+
static void
rss_init(__unused void *arg)
{
@@ -491,6 +498,188 @@
}
/*
+ * Software RSS hash for an IPv4 flow described by its addresses, ports
+ * and protocol number.
+ *
+ * The protocol code may wish to do a software hash of a given tuple;
+ * which hash is produced depends upon the currently configured RSS hash
+ * types: a 4-tuple for TCP or UDP when the matching type is enabled,
+ * otherwise a 2-tuple when plain IPv4 hashing is enabled.
+ *
+ * dir is RSS_HASH_PKT_INGRESS for in, RSS_HASH_PKT_EGRESS for out.  For
+ * ingress the tuple is hashed as given; for any other value source and
+ * destination (addresses and ports) are swapped before hashing.
+ * proto is the IPv4 protocol type.
+ *
+ * Returns 0 and fills in *hashval / *hashtype on success, or -1 (leaving
+ * both untouched) when no configured hash type applies.
+ */
+int
+rss_proto_software_hash_v4(struct in_addr src, struct in_addr dst,
+    u_short src_port, u_short dst_port, int proto, int dir,
+    uint32_t *hashval, uint32_t *hashtype)
+{
+	const int ingress = (dir == RSS_HASH_PKT_INGRESS);
+	struct in_addr hsrc, hdst;
+	u_short hsport, hdport;
+
+	/* Orient the tuple: anything that is not ingress gets swapped. */
+	hsrc = ingress ? src : dst;
+	hdst = ingress ? dst : src;
+	hsport = ingress ? src_port : dst_port;
+	hdport = ingress ? dst_port : src_port;
+
+	/* TCP: 4-tuple, if TCP/IPv4 hashing is configured. */
+	if (proto == IPPROTO_TCP &&
+	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) != 0) {
+		*hashval = rss_hash_ip4_4tuple(hsrc, hsport, hdst, hdport);
+		*hashtype = M_HASHTYPE_RSS_TCP_IPV4;
+		return (0);
+	}
+
+	/* UDP: 4-tuple, if UDP/IPv4 hashing is configured. */
+	if (proto == IPPROTO_UDP &&
+	    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) != 0) {
+		*hashval = rss_hash_ip4_4tuple(hsrc, hsport, hdst, hdport);
+		*hashtype = M_HASHTYPE_RSS_UDP_IPV4;
+		return (0);
+	}
+
+	/*
+	 * Everything else, including TCP/UDP whose 4-tuple type is not
+	 * enabled and protocols RSS doesn't hash on (like SCTP): 2-tuple.
+	 */
+	if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) != 0) {
+		*hashval = rss_hash_ip4_2tuple(hsrc, hdst);
+		*hashtype = M_HASHTYPE_RSS_IPV4;
+		return (0);
+	}
+
+	/* No configured available hashtypes! */
+	return (-1);
+}
+
+/*
+ * Do a software calculation of the RSS hash for the IPv4 packet at the
+ * front of the given mbuf.
+ *
+ * This is typically used by the input path to recalculate the RSS after
+ * some form of packet processing (e.g. de-capsulation, IP fragment
+ * reassembly).
+ *
+ * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and
+ * RSS_HASH_PKT_EGRESS for outgoing.
+ *
+ * Returns:
+ *    0  a hash was done; *hashval and *hashtype are filled in.
+ *   +1  the mbuf already had a valid RSS flowid for this protocol under
+ *       the configured hash types; *hashval / *hashtype are untouched.
+ *   -1  no hash was done: the first mbuf is too short for the headers
+ *       that must be read, the IP header length is bogus, or no
+ *       suitable hash type is configured.
+ *
+ * This function doesn't modify the mbuf.  It's up to the caller to
+ * assign flowid/flowtype as appropriate.
+ */
+int
+rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
+    uint32_t *hashtype)
+{
+	const struct ip *ip;
+	const struct tcphdr *th;
+	const struct udphdr *uh;
+	uint8_t proto;
+	int iphlen;
+
+	/*
+	 * First, validate that the mbuf we have is long enough
+	 * to have an IPv4 header in it.
+	 */
+	if (m->m_pkthdr.len < (sizeof(struct ip)))
+		return (-1);
+	if (m->m_len < (sizeof(struct ip)))
+		return (-1);
+
+	/* Ok, let's dereference that */
+	ip = mtod(m, const struct ip *);
+	proto = ip->ip_p;
+	iphlen = ip->ip_hl << 2;
+
+	/*
+	 * If the mbuf flowid/flowtype matches the packet type,
+	 * then signal to the owner that it can trust the flowid/flowtype
+	 * details.
+	 *
+	 * A protocol-specific (4-tuple) type is accepted only for its own
+	 * protocol; the plain IPv4 (2-tuple) type is accepted for any
+	 * protocol.  Each type must also be enabled in the RSS config.
+	 */
+	if (m->m_flags & M_FLOWID) {
+		uint32_t flowtype;
+
+		flowtype = M_HASHTYPE_GET(m);
+
+		if (proto == IPPROTO_UDP &&
+		    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) &&
+		    flowtype == M_HASHTYPE_RSS_UDP_IPV4)
+			return (1);
+		if (proto == IPPROTO_TCP &&
+		    (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) &&
+		    flowtype == M_HASHTYPE_RSS_TCP_IPV4)
+			return (1);
+		if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) &&
+		    flowtype == M_HASHTYPE_RSS_IPV4)
+			return (1);
+	}
+
+	/*
+	 * Decode enough information to make a hash decision.
+	 *
+	 * XXX TODO: does the hardware hash on 4-tuple if IP
+	 * options are present?
+	 */
+	if (proto == IPPROTO_TCP) {
+		/*
+		 * ip_hl comes straight from the packet; a value smaller
+		 * than the basic header would place the "TCP header"
+		 * inside the IP header, so refuse to hash on it.
+		 */
+		if ((size_t)iphlen < sizeof(struct ip))
+			return (-1);
+		/* The ports must be in the first mbuf to be readable here. */
+		if (m->m_len < iphlen + sizeof(struct tcphdr))
+			return (-1);
+		th = (const struct tcphdr *)((const char *)ip + iphlen);
+		return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+		    th->th_sport, th->th_dport, proto, dir,
+		    hashval, hashtype));
+	} else if (proto == IPPROTO_UDP) {
+		/* Same header-length sanity checks as for TCP. */
+		if ((size_t)iphlen < sizeof(struct ip))
+			return (-1);
+		if (m->m_len < iphlen + sizeof(struct udphdr))
+			return (-1);
+		uh = (const struct udphdr *)((const char *)ip + iphlen);
+		return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+		    uh->uh_sport, uh->uh_dport, proto, dir,
+		    hashval, hashtype));
+	}
+
+	/*
+	 * Default to 2-tuple hash: no ports, and protocol IPPROTO_IP so
+	 * that neither 4-tuple case is selected.
+	 */
+	return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+	    0, 0, IPPROTO_IP, dir, hashval, hashtype));
+}
+
+/*
* Query the RSS hash algorithm.
*/
u_int
@@ -538,15 +727,10 @@
return (rss_ncpus);
}
-/*
- * Return the supported RSS hash configuration.
- *
- * NICs should query this to determine what to configure in their redirection
- * matching table.
- */
-u_int
-rss_gethashconfig(void)
+static inline u_int
+rss_gethashconfig_local(void)
{
+
/* Return 4-tuple for TCP; 2-tuple for others */
/*
* UDP may fragment more often than TCP and thus we'll end up with
@@ -573,6 +757,19 @@
}
/*
+ * Return the supported RSS hash configuration.
+ *
+ * NICs should query this to determine what to configure in their redirection
+ * matching table.
+ *
+ * This is the externally visible entry point; the value itself comes from
+ * the file-local inline helper rss_gethashconfig_local(), which the
+ * software hash routines in this file also consult.
+ */
+u_int
+rss_gethashconfig(void)
+{
+
+ return (rss_gethashconfig_local());
+}
+
+/*
* XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
* it appearing in debugging output unnecessarily.
*/
Index: sys/netinet/ip_input.c
===================================================================
--- sys/netinet/ip_input.c
+++ sys/netinet/ip_input.c
@@ -37,6 +37,7 @@
#include "opt_ipstealth.h"
#include "opt_ipsec.h"
#include "opt_route.h"
+#include "opt_rss.h"
#include <sys/param.h>
#include <sys/systm.h>
@@ -77,6 +78,7 @@
#ifdef IPSEC
#include <netinet/ip_ipsec.h>
#endif /* IPSEC */
+#include <netinet/in_rss.h>
#include <sys/socketvar.h>
@@ -140,11 +142,21 @@
VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */
+/*
+ * netisr registration for IPv4 input.
+ *
+ * We may need to re-inject packets into the IP stack for further work
+ * (e.g. after fragment reassembly).  When the kernel is built with RSS,
+ * use the CPU policy and query the RSS layer (rss_m2cpuid) for the
+ * relevant CPU ID to use; otherwise keep the flow policy.
+ */
static struct netisr_handler ip_nh = {
.nh_name = "ip",
.nh_handler = ip_input,
.nh_proto = NETISR_IP,
+#ifdef RSS
+ .nh_m2cpuid = rss_m2cpuid,
+ .nh_policy = NETISR_POLICY_CPU,
+#else
.nh_policy = NETISR_POLICY_FLOW,
+#endif
};
extern struct domain inetdomain;
@@ -368,13 +380,18 @@
M_ASSERTPKTHDR(m);
- if (m->m_flags & M_FASTFWD_OURS) {
- m->m_flags &= ~M_FASTFWD_OURS;
+ if (m->m_flags & (M_REINJECT_OURS | M_FASTFWD_OURS)) {
/* Set up some basics that will be used later. */
ip = mtod(m, struct ip *);
hlen = ip->ip_hl << 2;
ip_len = ntohs(ip->ip_len);
- goto ours;
+ if (m->m_flags & M_REINJECT_OURS) {
+ m->m_flags &= ~(M_REINJECT_OURS|M_FASTFWD_OURS);
+ goto reinject_ours;
+ } else {
+ m->m_flags &= ~(M_REINJECT_OURS|M_FASTFWD_OURS);
+ goto ours;
+ }
}
IPSTAT_INC(ips_total);
@@ -463,6 +480,7 @@
} else
m_adj(m, ip_len - m->m_pkthdr.len);
}
+
#ifdef IPSEC
/*
* Bypass packet filtering for packets previously handled by IPsec.
@@ -721,6 +739,8 @@
goto bad;
#endif /* IPSEC */
+reinject_ours:
+
/*
* Switch out to protocol's input routine.
*/
@@ -817,6 +837,9 @@
int i, hlen, next;
u_int8_t ecn, ecn0;
u_short hash;
+#ifdef RSS
+ uint32_t rss_hash, rss_type;
+#endif
/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
@@ -1106,6 +1129,41 @@
m_fixhdr(m);
IPSTAT_INC(ips_reassembled);
IPQ_UNLOCK();
+
+#ifdef RSS
+ /*
+ * Query the RSS layer for the flowid / flowtype for the
+ * mbuf payload.
+ *
+ * For now, just assume we have to calculate a new one.
+ * Later on we should check to see if the assigned flowid matches
+ * what RSS wants for the given IP protocol and if so, just keep it.
+ *
+ * We then queue into the relevant netisr so it can be dispatched
+ * to the correct CPU.
+ *
+ * Note - this may return 1, which means the flowid in the mbuf
+ * is correct for the configured RSS hash types and can be used.
+ */
+ if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) {
+ m->m_pkthdr.flowid = rss_hash;
+ M_HASHTYPE_SET(m, rss_type);
+ m->m_flags |= M_FLOWID;
+ }
+
+ /*
+ * Queue/dispatch for reprocessing.
+ *
+ * Note: this is much slower than just handling the frame in the
+ * current receive context. It's likely worth investigating
+ * why this is.
+ */
+ m->m_flags |= M_REINJECT_OURS;
+ netisr_dispatch(NETISR_IP, m);
+ return (NULL);
+#endif
+
+ /* Handle in-line */
return (m);
dropfrag:
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -145,7 +145,9 @@
if (inp != NULL) {
INP_LOCK_ASSERT(inp);
M_SETFIB(m, inp->inp_inc.inc_fibnum);
- if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
+
+ if (((flags & IP_NODEFAULTFLOWID) == 0) &&
+ inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
m->m_pkthdr.flowid = inp->inp_flowid;
M_HASHTYPE_SET(m, inp->inp_flowtype);
m->m_flags |= M_FLOWID;
Index: sys/netinet/ip_var.h
===================================================================
--- sys/netinet/ip_var.h
+++ sys/netinet/ip_var.h
@@ -161,6 +161,7 @@
#define IP_SENDTOIF 0x8 /* send on specific ifnet */
#define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */
#define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */
+#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */
#ifdef __NO_STRICT_ALIGNMENT
#define IP_HDR_ALIGNED_P(ip) 1
Index: sys/netinet/udp_usrreq.c
===================================================================
--- sys/netinet/udp_usrreq.c
+++ sys/netinet/udp_usrreq.c
@@ -43,6 +43,7 @@
#include "opt_inet.h"
#include "opt_inet6.h"
#include "opt_ipsec.h"
+#include "opt_rss.h"
#include <sys/param.h>
#include <sys/domain.h>
@@ -89,6 +90,7 @@
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udplite.h>
+#include <netinet/in_rss.h>
#ifdef IPSEC
#include <netipsec/ipsec.h>
@@ -206,6 +208,13 @@
udp_init(void)
{
+ /*
+ * For now default to 2-tuple UDP hashing - until the fragment
+ * reassembly code can also update the flowid.
+ *
+ * Once we can calculate the flowid that way and re-establish
+ * a 4-tuple, flip this to 4-tuple.
+ */
in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
"udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
IPI_HASHFIELDS_2TUPLE);
@@ -1395,6 +1404,60 @@
((struct ip *)ui)->ip_tos = tos; /* XXX */
UDPSTAT_INC(udps_opackets);
+ /*
+ * Setup flowid / RSS information for outbound socket.
+ *
+ * Once the UDP code decides to set a flowid some other way,
+ * this allows the flowid to be overridden by userland.
+ *
+ * Remember ip_output() overrides with the inp flowid details
+ * if they exist.
+ *
+ * .. and ip_output() -> flowtable_lookup() also assigns
+ * a flowid too. Ugh.
+ */
+#ifdef RSS
+ {
+ uint32_t hash_val, hash_type;
+ /*
+ * Calculate an appropriate RSS hash for UDP and
+ * UDP Lite.
+ *
+ * The called function will take care of figuring out
+ * whether a 2-tuple or 4-tuple hash is required based
+ * on the currently configured scheme.
+ *
+ * Later on, connected socket values should be
+ * cached in the inpcb and reused, rather than constantly
+ * re-calculating it.
+ *
+ * UDP Lite is a different protocol number and will
+ * likely end up being hashed as a 2-tuple until
+ * RSS / NICs grow UDP Lite protocol awareness.
+ */
+ if (rss_proto_software_hash_v4(laddr, faddr, lport, fport,
+ pr, RSS_HASH_PKT_EGRESS,
+ &hash_val, &hash_type) == 0) {
+ m->m_pkthdr.flowid = hash_val;
+ m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, hash_type);
+ }
+ }
+#endif
+
+ /*
+ * Don't override with the inp cached flowid value.
+ *
+ * Depending upon the kind of send being done, the inp
+ * flowid/flowtype values may actually not be appropriate
+ * for this particular socket send.
+ *
+ * We should either leave the flowid at zero (which is what is
+ * currently done) or set it to some software generated
+ * hash value based on the packet contents.
+ */
+ ipflags |= IP_NODEFAULTFLOWID;
+
if (unlock_udbinfo == UH_WLOCKED)
INP_HASH_WUNLOCK(pcbinfo);
else if (unlock_udbinfo == UH_RLOCKED)
Index: sys/netinet6/in6.h
===================================================================
--- sys/netinet6/in6.h
+++ sys/netinet6/in6.h
@@ -640,6 +640,8 @@
#define M_LOOP M_PROTO6
#define M_AUTHIPDGM M_PROTO7
#define M_RTALERT_MLD M_PROTO8
+#define M_REINJECT_OURS M_PROTO9 /* Re-injected from some
+ * de-encaps / defrag process */
#ifdef _KERNEL
struct cmsghdr;
Index: sys/netinet6/ip6_output.c
===================================================================
--- sys/netinet6/ip6_output.c
+++ sys/netinet6/ip6_output.c
@@ -260,8 +260,14 @@
goto bad;
}
- if (inp != NULL)
+ if (inp != NULL) {
M_SETFIB(m, inp->inp_inc.inc_fibnum);
+ if (((flags & IP_NODEFAULTFLOWID) == 0) &&
+ (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID))) {
+ m->m_pkthdr.flowid = inp->inp_flowid;
+ m->m_flags |= M_FLOWID;
+ }
+ }
finaldst = ip6->ip6_dst;
bzero(&exthdrs, sizeof(exthdrs));
Index: sys/netinet6/udp6_usrreq.c
===================================================================
--- sys/netinet6/udp6_usrreq.c
+++ sys/netinet6/udp6_usrreq.c
@@ -74,6 +74,7 @@
#include "opt_inet6.h"
#include "opt_ipfw.h"
#include "opt_ipsec.h"
+#include "opt_rss.h"
#include <sys/param.h>
#include <sys/jail.h>
@@ -111,6 +112,7 @@
#include <netinet/udp.h>
#include <netinet/udp_var.h>
#include <netinet/udplite.h>
+#include <netinet/in_rss.h>
#include <netinet6/ip6protosw.h>
#include <netinet6/ip6_var.h>
@@ -850,8 +852,28 @@
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
}
+ /*
+ * XXX for now assume UDP is 2-tuple.
+ * Later on this may become configurable as 4-tuple;
+ * we should support that.
+ *
+ * XXX .. and we should likely cache this in the inpcb.
+ */
+#ifdef RSS
+ m->m_pkthdr.flowid = rss_hash_ip6_2tuple(*faddr, *laddr);
+ m->m_flags |= M_FLOWID;
+ M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6);
+#endif
flags = 0;
+ /*
+ * Don't override with the inp cached flowid.
+ *
+ * Until the whole UDP path is vetted, it may actually
+ * be incorrect.
+ */
+ flags |= IP_NODEFAULTFLOWID;
+
UDP_PROBE(send, NULL, inp, ip6, inp, udp6);
UDPSTAT_INC(udps_opackets);
error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,

File Metadata

Mime Type
text/plain
Expires
Fri, Jun 5, 10:42 AM (3 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
33750745
Default Alt Text
D527.id1109.diff (16 KB)

Event Timeline