Index: .arcconfig =================================================================== --- .arcconfig +++ .arcconfig @@ -1,5 +1,5 @@ { "project.name": "S", - "phabricator.uri" : "https://phabric.freebsd.org/", + "phabricator.uri" : "https://reviews.freebsd.org/", "history.immutable" : true } Index: sys/netinet/in_rss.h =================================================================== --- sys/netinet/in_rss.h +++ sys/netinet/in_rss.h @@ -83,6 +83,16 @@ #define RSS_KEYSIZE 40 /* + * For RSS hash methods that do a software hash on an mbuf, the packet + * direction (ingress / egress) is required. + * + * The default direction (INGRESS) is the "receive into the NIC" - ie, + * what the hardware is hashing on. + */ +#define RSS_HASH_PKT_INGRESS 0 +#define RSS_HASH_PKT_EGRESS 1 + +/* * Device driver interfaces to query RSS properties that must be programmed * into hardware. */ @@ -116,4 +126,15 @@ uint32_t *bucket_id); int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id); +/* + * Functions to calculate a software RSS hash for a given mbuf or + * packet detail. 
+ */ +int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, + uint32_t *hashval, uint32_t *hashtype); +int rss_proto_software_hash_v4(struct in_addr src, + struct in_addr dst, u_short src_port, u_short dst_port, + int proto, int dir, uint32_t *hashval, + uint32_t *hashtype); + #endif /* !_NETINET_IN_RSS_H_ */ Index: sys/netinet/in_rss.c =================================================================== --- sys/netinet/in_rss.c +++ sys/netinet/in_rss.c @@ -57,6 +57,11 @@ #include #include +/* for software rss hash support */ +#include +#include +#include + /*- * Operating system parts of receiver-side scaling (RSS), which allows * network cards to direct flows to particular receive queues based on hashes @@ -170,6 +175,8 @@ }; static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; +static inline u_int rss_gethashconfig_local(void); + static void rss_init(__unused void *arg) { @@ -491,6 +498,188 @@ } /* + * Calculate an appropriate IPv4 2-tuple or 4-tuple hash for the given + * IPv4 source/destination addresses, UDP or TCP source/destination ports + * and the protocol type. + * + * The protocol code may wish to do a software hash of the given + * tuple. This depends upon the currently configured RSS hash types. + * + * dir is RSS_HASH_PKT_INGRESS for in, RSS_HASH_PKT_EGRESS for out. + * proto is the IPv4 protocol type. + */ +int +rss_proto_software_hash_v4(struct in_addr src, struct in_addr dst, + u_short src_port, u_short dst_port, int proto, int dir, + uint32_t *hashval, uint32_t *hashtype) +{ + struct in_addr s, d; + u_short sp, dp; + uint32_t hash; + + /* first, assign data appropriately */ + if (dir == RSS_HASH_PKT_INGRESS) { + s = src; + d = dst; + sp = src_port; + dp = dst_port; + } else { + s = dst; + d = src; + sp = dst_port; + dp = src_port; + } + + /* + * Next, choose the hash type depending upon the protocol + * identifier. 
+ */ + if ((proto == IPPROTO_TCP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_TCP_IPV4; + return (0); + } else if ((proto == IPPROTO_UDP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_UDP_IPV4; + return (0); + } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { + /* RSS doesn't hash on other protocols like SCTP; so 2-tuple */ + hash = rss_hash_ip4_2tuple(s, d); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_IPV4; + return (0); + } + + /* No usable hash types are configured! */ + return (-1); +} + +/* + * Do a software calculation of the RSS hash for the given mbuf. + * + * This is typically used by the input path to recalculate the RSS hash after + * some form of packet processing (e.g. decapsulation, IP fragment reassembly). + * + * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and + * RSS_HASH_PKT_EGRESS for outgoing. + * + * Returns 0 if a hash was done, -1 if no hash was done, +1 if + * the mbuf already had a valid RSS flowid. + * + * This function doesn't modify the mbuf. It's up to the caller to + * assign flowid/flowtype as appropriate. + */ +int +rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, + uint32_t *hashtype) +{ + const struct ip *ip; + const struct tcphdr *th; + const struct udphdr *uh; + uint8_t proto; + int iphlen; + + /* + * First, validate that the mbuf we have is long enough + * to have an IPv4 header in it. + */ + + if (m->m_pkthdr.len < (sizeof(struct ip))) + return (-1); + if (m->m_len < (sizeof(struct ip))) + return (-1); + + /* Ok, let's dereference that */ + ip = mtod(m, struct ip *); + proto = ip->ip_p; + iphlen = ip->ip_hl << 2; + + /* + * If the mbuf flowid/flowtype matches the packet type, + * then signal to the caller that it can trust the flowid/flowtype + * details. 
+ */ + if (m->m_flags & M_FLOWID) { + uint32_t flowid, flowtype; + + flowid = m->m_pkthdr.flowid; + flowtype = M_HASHTYPE_GET(m); + + switch (proto) { + case IPPROTO_UDP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && + flowtype == M_HASHTYPE_RSS_UDP_IPV4) { + return (1); + } + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + case IPPROTO_TCP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && + flowtype == M_HASHTYPE_RSS_TCP_IPV4) { + return (1); + } + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + default: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + } + } + + /* + * Decode enough information to make a hash decision. + * + * XXX TODO: does the hardware hash on 4-tuple if IP + * options are present? + */ + if (proto == IPPROTO_TCP) { + if (m->m_len < iphlen + sizeof(struct tcphdr)) + return (-1); + th = (struct tcphdr *)((caddr_t)ip + iphlen); + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + th->th_sport, + th->th_dport, + proto, + dir, + hashval, + hashtype); + } else if (proto == IPPROTO_UDP) { + uh = (struct udphdr *)((caddr_t)ip + iphlen); + if (m->m_len < iphlen + sizeof(struct udphdr)) + return (-1); + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + uh->uh_sport, + uh->uh_dport, + proto, + dir, + hashval, + hashtype); + } else { + /* Default to 2-tuple hash */ + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + 0, /* source port */ + 0, /* destination port */ + 0, /* IPPROTO_IP */ + dir, + hashval, + hashtype); + } +} + +/* * Query the RSS hash algorithm. */ u_int @@ -538,15 +727,10 @@ return (rss_ncpus); } -/* - * Return the supported RSS hash configuration. - * - * NICs should query this to determine what to configure in their redirection - * matching table. 
- */ -u_int -rss_gethashconfig(void) +static inline u_int +rss_gethashconfig_local(void) { + /* Return 4-tuple for TCP; 2-tuple for others */ /* * UDP may fragment more often than TCP and thus we'll end up with @@ -573,6 +757,19 @@ } /* + * Return the supported RSS hash configuration. + * + * NICs should query this to determine what to configure in their redirection + * matching table. + */ +u_int +rss_gethashconfig(void) +{ + + return (rss_gethashconfig_local()); +} + +/* * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want * it appearing in debugging output unnecessarily. */ Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c +++ sys/netinet/ip_input.c @@ -37,6 +37,7 @@ #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" +#include "opt_rss.h" #include #include @@ -77,6 +78,7 @@ #ifdef IPSEC #include #endif /* IPSEC */ +#include #include @@ -140,11 +142,21 @@ VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ +/* + * We may need to re-inject packets into the IP stack for further work. + * In this instance, use the CPU policy and query the RSS layer for the + * relevant CPU ID to use. + */ static struct netisr_handler ip_nh = { .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, +#ifdef RSS + .nh_m2cpuid = rss_m2cpuid, + .nh_policy = NETISR_POLICY_CPU, +#else .nh_policy = NETISR_POLICY_FLOW, +#endif }; extern struct domain inetdomain; @@ -368,13 +380,18 @@ M_ASSERTPKTHDR(m); - if (m->m_flags & M_FASTFWD_OURS) { - m->m_flags &= ~M_FASTFWD_OURS; + if (m->m_flags & (M_REINJECT_OURS | M_FASTFWD_OURS)) { /* Set up some basics that will be used later. 
*/ ip = mtod(m, struct ip *); hlen = ip->ip_hl << 2; ip_len = ntohs(ip->ip_len); - goto ours; + if (m->m_flags & M_REINJECT_OURS) { + m->m_flags &= ~(M_REINJECT_OURS|M_FASTFWD_OURS); + goto reinject_ours; + } else { + m->m_flags &= ~(M_REINJECT_OURS|M_FASTFWD_OURS); + goto ours; + } } IPSTAT_INC(ips_total); @@ -463,6 +480,7 @@ } else m_adj(m, ip_len - m->m_pkthdr.len); } + #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. @@ -721,6 +739,8 @@ goto bad; #endif /* IPSEC */ +reinject_ours: + /* * Switch out to protocol's input routine. */ @@ -817,6 +837,9 @@ int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; +#ifdef RSS + uint32_t rss_hash, rss_type; +#endif /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { @@ -1106,6 +1129,41 @@ m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); + +#ifdef RSS + /* + * Query the RSS layer for the flowid / flowtype for the + * mbuf payload. + * + * For now, just assume we have to calculate a new one. + * Later on we should check to see if the assigned flowid matches + * what RSS wants for the given IP protocol and if so, just keep it. + * + * We then queue into the relevant netisr so it can be dispatched + * to the correct CPU. + * + * Note - this may return 1, which means the flowid in the mbuf + * is correct for the configured RSS hash types and can be used. + */ + if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { + m->m_pkthdr.flowid = rss_hash; + M_HASHTYPE_SET(m, rss_type); + m->m_flags |= M_FLOWID; + } + + /* + * Queue/dispatch for reprocessing. + * + * Note: this is much slower than just handling the frame in the + * current receive context. It's likely worth investigating + * why this is. 
+ */ + m->m_flags |= M_REINJECT_OURS; + netisr_dispatch(NETISR_IP, m); + return (NULL); +#endif + + /* Handle in-line */ return (m); dropfrag: Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -145,7 +145,9 @@ if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + + if (((flags & IP_NODEFAULTFLOWID) == 0) && + inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); m->m_flags |= M_FLOWID; Index: sys/netinet/ip_var.h =================================================================== --- sys/netinet/ip_var.h +++ sys/netinet/ip_var.h @@ -161,6 +161,7 @@ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ +#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -43,6 +43,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -89,6 +90,7 @@ #include #include #include +#include #ifdef IPSEC #include @@ -206,6 +208,13 @@ udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. 
+ */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); @@ -1395,6 +1404,60 @@ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); + /* + * Set up flowid / RSS information for outbound socket. + * + * Once the UDP code decides to set a flowid some other way, + * this allows the flowid to be overridden by userland. + * + * Remember ip_output() overrides with the inp flowid details + * if they exist. + * + * .. and ip_output() -> flowtable_lookup() also assigns + * a flowid. Ugh. + */ +#ifdef RSS + { + uint32_t hash_val, hash_type; + /* + * Calculate an appropriate RSS hash for UDP and + * UDP Lite. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later on, connected socket values should be + * cached in the inpcb and reused, rather than constantly + * re-calculating them. + * + * UDP Lite is a different protocol number and will + * likely end up being hashed as a 2-tuple until + * RSS / NICs grow UDP Lite protocol awareness. + */ + if (rss_proto_software_hash_v4(laddr, faddr, lport, fport, + pr, RSS_HASH_PKT_EGRESS, + &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, hash_type); + } + } +#endif + + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software generated + * hash value based on the packet contents. 
+ */ + ipflags |= IP_NODEFAULTFLOWID; + if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) Index: sys/netinet6/in6.h =================================================================== --- sys/netinet6/in6.h +++ sys/netinet6/in6.h @@ -640,6 +640,8 @@ #define M_LOOP M_PROTO6 #define M_AUTHIPDGM M_PROTO7 #define M_RTALERT_MLD M_PROTO8 +#define M_REINJECT_OURS M_PROTO9 /* Re-injected from some + * de-encaps / defrag process */ #ifdef _KERNEL struct cmsghdr; Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -260,8 +260,14 @@ goto bad; } - if (inp != NULL) + if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); + if (((flags & IP_NODEFAULTFLOWID) == 0) && + (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID))) { + m->m_pkthdr.flowid = inp->inp_flowid; + m->m_flags |= M_FLOWID; + } + } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -74,6 +74,7 @@ #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -111,6 +112,7 @@ #include #include #include +#include #include #include @@ -850,8 +852,28 @@ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } + /* + * XXX for now assume UDP is 2-tuple. + * Later on this may become configurable as 4-tuple; + * we should support that. + * + * XXX .. and we should likely cache this in the inpcb. + */ +#ifdef RSS + m->m_pkthdr.flowid = rss_hash_ip6_2tuple(*faddr, *laddr); + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); +#endif flags = 0; + /* + * Don't override with the inp cached flowid. + * + * Until the whole UDP path is vetted, it may actually + * be incorrect. 
+ */ + flags |= IP_NODEFAULTFLOWID; + UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,