Index: sys/net/netisr.h =================================================================== --- sys/net/netisr.h +++ sys/net/netisr.h @@ -57,6 +57,8 @@ #define NETISR_IPV6 6 #define NETISR_NATM 7 #define NETISR_EPAIR 8 /* if_epair(4) */ +#define NETISR_IP_DIRECT 9 /* direct-dispatch IPv4 */ +#define NETISR_IPV6_DIRECT 10 /* direct-dispatch IPv6 */ /* * Protocol ordering and affinity policy constants. See the detailed Index: sys/netinet/in.h =================================================================== --- sys/netinet/in.h +++ sys/netinet/in.h @@ -492,6 +492,8 @@ #define IP_FLOWID 90 /* get flow id for the given socket/inp */ #define IP_FLOWTYPE 91 /* get flow type (M_HASHTYPE) */ #define IP_RSSBUCKETID 92 /* get RSS flowid -> bucket mapping */ +#define IP_RECVFLOWID 93 /* bool; receive IP flowid/flowtype w/ datagram */ +#define IP_RECVRSSBUCKETID 94 /* bool; receive IP RSS bucket id w/ datagram */ /* * Defaults and limits for options Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -549,6 +549,8 @@ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ +#define INP_RECVFLOWID 0x00000100 /* populate recv datagram with flow info */ +#define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ /* * Flags passed to in_pcblookup*() functions. Index: sys/netinet/in_rss.h =================================================================== --- sys/netinet/in_rss.h +++ sys/netinet/in_rss.h @@ -83,6 +83,16 @@ #define RSS_KEYSIZE 40 /* + * For RSS hash methods that do a software hash on an mbuf, the packet + * direction (ingress / egress) is required. + * + * The default direction (INGRESS) is the "receive into the NIC" - ie, + * what the hardware is hashing on. + */ +#define RSS_HASH_PKT_INGRESS 0 +#define RSS_HASH_PKT_EGRESS 1 + +/* * Device driver interfaces to query RSS properties that must be programmed * into hardware. */ @@ -116,4 +126,17 @@ uint32_t *bucket_id); int rss_m2bucket(struct mbuf *m, uint32_t *bucket_id); +/* + * Functions to calculate a software RSS hash for a given mbuf or + * packet detail. + */ +int rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, + uint32_t *hashval, uint32_t *hashtype); +int rss_proto_software_hash_v4(struct in_addr src, + struct in_addr dst, u_short src_port, u_short dst_port, + int proto, uint32_t *hashval, + uint32_t *hashtype); +struct mbuf * rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, + u_int *cpuid); + #endif /* !_NETINET_IN_RSS_H_ */ Index: sys/netinet/in_rss.c =================================================================== --- sys/netinet/in_rss.c +++ sys/netinet/in_rss.c @@ -57,6 +57,11 @@ #include #include +/* for software rss hash support */ +#include +#include +#include + /*- * Operating system parts of receiver-side scaling (RSS), which allows * network cards to direct flows to particular receive queues based on hashes @@ -170,6 +175,8 @@ }; static struct rss_table_entry rss_table[RSS_TABLE_MAXLEN]; +static inline u_int rss_gethashconfig_local(void); + static void rss_init(__unused void *arg) { @@ -295,6 +302,16 @@ } /* + * XXX + * + * The RSS specification states that the hash type for incoming + * packets is done on the source address/port first, + * then the destination address/port. + * + * The opposite is required for outgoing frames! + */ + +/* * Hash an IPv4 2-tuple. */ uint32_t @@ -491,6 +508,257 @@ } /* + * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given + * IPv4 source/destination address, UDP or TCP source/destination ports + * and the protocol type. + * + * The protocol code may wish to do a software hash of the given + * tuple. This depends upon the currently configured RSS hash types. + * + * This assumes that the packet in question isn't a fragment. + * + * It also assumes the packet source/destination address + * are in "incoming" packet order (ie, source is "far" address.) + */ +int +rss_proto_software_hash_v4(struct in_addr s, struct in_addr d, + u_short sp, u_short dp, int proto, + uint32_t *hashval, uint32_t *hashtype) +{ + uint32_t hash; + + /* + * Next, choose the hash type depending upon the protocol + * identifier. + */ + if ((proto == IPPROTO_TCP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_TCP_IPV4; + return (0); + } else if ((proto == IPPROTO_UDP) && + (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4)) { + hash = rss_hash_ip4_4tuple(s, sp, d, dp); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_UDP_IPV4; + return (0); + } else if (rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) { + /* RSS doesn't hash on other protocols like SCTP; so 2-tuple */ + hash = rss_hash_ip4_2tuple(s, d); + *hashval = hash; + *hashtype = M_HASHTYPE_RSS_IPV4; + return (0); + } + + /* No configured available hashtypes! */ + printf("%s: no available hashtypes!\n", __func__); + return (-1); +} + +/* + * Do a software calculation of the RSS for the given mbuf. + * + * This is typically used by the input path to recalculate the RSS after + * some form of packet processing (eg de-capsulation, IP fragment reassembly.) + * + * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and + * RSS_HASH_PKT_EGRESS for outgoing. + * + * Returns 0 if a hash was done, -1 if no hash was done, +1 if + * the mbuf already had a valid RSS flowid. + * + * This function doesn't modify the mbuf. It's up to the caller to + * assign flowid/flowtype as appropriate. + */ +int +rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval, + uint32_t *hashtype) +{ + const struct ip *ip; + const struct tcphdr *th; + const struct udphdr *uh; + uint8_t proto; + int iphlen; + int is_frag = 0; + + /* + * XXX For now this only handles hashing on incoming mbufs. + */ + if (dir != RSS_HASH_PKT_INGRESS) { + printf("%s: called on EGRESS packet!\n", __func__); + return (-1); + } + + /* + * First, validate that the mbuf we have is long enough + * to have an IPv4 header in it. + */ + if (m->m_pkthdr.len < (sizeof(struct ip))) { + printf("%s: short mbuf pkthdr\n", __func__); + return (-1); + } + if (m->m_len < (sizeof(struct ip))) { + printf("%s: short mbuf len\n", __func__); + return (-1); + } + + /* Ok, let's dereference that */ + ip = mtod(m, struct ip *); + proto = ip->ip_p; + iphlen = ip->ip_hl << 2; + + /* + * If this is a fragment then it shouldn't be four-tuple + * hashed just yet. Once it's reassembled into a full + * frame it should be re-hashed. + */ + if (ip->ip_off & htons(IP_MF | IP_OFFMASK)) + is_frag = 1; + + /* + * If the mbuf flowid/flowtype matches the packet type, + * and we don't support the 4-tuple version of the given protocol, + * then signal to the owner that it can trust the flowid/flowtype + * details. + * + * This is a little picky - eg, if TCPv4 / UDPv4 hashing + * is supported but we got a TCP/UDP frame only 2-tuple hashed, + * then we shouldn't just "trust" the 2-tuple hash. We need + * a 4-tuple hash. + */ + if (m->m_flags & M_FLOWID) { + uint32_t flowid, flowtype; + + flowid = m->m_pkthdr.flowid; + flowtype = M_HASHTYPE_GET(m); + + switch (proto) { + case IPPROTO_UDP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) && + (flowtype == M_HASHTYPE_RSS_UDP_IPV4) && + (is_frag == 0)) { + return (1); + } + /* + * Only allow 2-tuple for UDP frames if we don't also + * support 4-tuple for UDP. + */ + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_UDP_IPV4) == 0) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + case IPPROTO_TCP: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) && + (flowtype == M_HASHTYPE_RSS_TCP_IPV4) && + (is_frag == 0)) { + return (1); + } + /* + * Only allow 2-tuple for TCP frames if we don't also + * support 2-tuple for TCP. + */ + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_TCP_IPV4) == 0) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + default: + if ((rss_gethashconfig_local() & RSS_HASHTYPE_RSS_IPV4) && + flowtype == M_HASHTYPE_RSS_IPV4) { + return (1); + } + break; + } + } + + /* + * Decode enough information to make a hash decision. + * + * XXX TODO: does the hardware hash on 4-tuple if IP + * options are present? + */ + if (proto == IPPROTO_TCP && is_frag == 0) { + if (m->m_len < iphlen + sizeof(struct tcphdr)) { + printf("%s: short TCP frame?\n", __func__); + return (-1); + } + th = (struct tcphdr *)((caddr_t)ip + iphlen); + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + th->th_sport, + th->th_dport, + proto, + hashval, + hashtype); + } else if (proto == IPPROTO_UDP && is_frag == 0) { + uh = (struct udphdr *)((caddr_t)ip + iphlen); + if (m->m_len < iphlen + sizeof(struct udphdr)) { + printf("%s: short UDP frame?\n", __func__); + return (-1); + } + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + uh->uh_sport, + uh->uh_dport, + proto, + hashval, + hashtype); + } else { + /* Default to 2-tuple hash */ + return rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst, + 0, /* source port */ + 0, /* destination port */ + 0, /* IPPROTO_IP */ + hashval, + hashtype); + } +} + +/* + * Similar to rss_m2cpuid, but designed to be used by the IP NETISR + * on incoming frames. + * + * If an existing RSS hash exists and it matches what the configured + * hashing is, then use it. + * + * If there's an existing RSS hash but the desired hash is different, + * or if there's no useful RSS hash, then calculate it via + * the software path. + * + * XXX TODO: definitely want statistics here! + */ +struct mbuf * +rss_soft_m2cpuid(struct mbuf *m, uintptr_t source, u_int *cpuid) +{ + uint32_t hash_val, hash_type; + int ret; + + M_ASSERTPKTHDR(m); + + ret = rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS, + &hash_val, &hash_type); + if (ret > 0) { + //printf("%s: ret=%d, val=0x%08x, type=%d\n", __func__, ret, hash_val, hash_type); + /* mbuf has a valid hash already; don't need to modify it */ + *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); + } else if (ret == 0) { + /* hash was done; update */ + m->m_pkthdr.flowid = hash_val; + M_HASHTYPE_SET(m, hash_type); + m->m_flags |= M_FLOWID; + //printf("%s: ret=%d, val=0x%08x, type=%d\n", __func__, ret, hash_val, hash_type); + *cpuid = rss_hash2cpuid(m->m_pkthdr.flowid, M_HASHTYPE_GET(m)); + } else { /* ret < 0 */ + /* no hash was done */ + //printf("%s: no hash done!\n", __func__); + *cpuid = NETISR_CPUID_NONE; + } + return (m); +} + +/* * Query the RSS hash algorithm. */ u_int @@ -538,15 +806,10 @@ return (rss_ncpus); } -/* - * Return the supported RSS hash configuration. - * - * NICs should query this to determine what to configure in their redirection - * matching table. - */ -u_int -rss_gethashconfig(void) +static inline u_int +rss_gethashconfig_local(void) { + /* Return 4-tuple for TCP; 2-tuple for others */ /* * UDP may fragment more often than TCP and thus we'll end up with @@ -573,6 +836,19 @@ } /* + * Return the supported RSS hash configuration. + * + * NICs should query this to determine what to configure in their redirection + * matching table. + */ +u_int +rss_gethashconfig(void) +{ + + return (rss_gethashconfig_local()); +} + +/* * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want * it appearing in debugging output unnecessarily. */ Index: sys/netinet/in_var.h =================================================================== --- sys/netinet/in_var.h +++ sys/netinet/in_var.h @@ -411,6 +411,7 @@ int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void ip_input(struct mbuf *); +void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_fastforward(struct mbuf *); void *in_domifattach(struct ifnet *); Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c +++ sys/netinet/ip_input.c @@ -37,6 +37,7 @@ #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" +#include "opt_rss.h" #include #include @@ -77,6 +78,7 @@ #ifdef IPSEC #include #endif /* IPSEC */ +#include #include @@ -144,9 +146,33 @@ .nh_name = "ip", .nh_handler = ip_input, .nh_proto = NETISR_IP, +#ifdef RSS + .nh_m2cpuid = rss_soft_m2cpuid, + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_HYBRID, +#else .nh_policy = NETISR_POLICY_FLOW, +#endif }; +#ifdef RSS +/* + * Directly dispatched frames are currently assumed + * to have a flowid already calculated. + * + * It should likely have something that assert it + * actually has valid flow details. + */ +static struct netisr_handler ip_direct_nh = { + .nh_name = "ip_direct", + .nh_handler = ip_direct_input, + .nh_proto = NETISR_IP_DIRECT, + .nh_m2cpuid = rss_m2cpuid, + .nh_policy = NETISR_POLICY_CPU, + .nh_dispatch = NETISR_DISPATCH_HYBRID, +}; +#endif + extern struct domain inetdomain; extern struct protosw inetsw[]; u_char ip_protox[IPPROTO_MAX]; @@ -266,6 +292,46 @@ CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I", "Number of packets dropped from the IP input queue"); +#ifdef RSS +static int +sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS) +{ + int error, qlimit; + + netisr_getqlimit(&ip_direct_nh, &qlimit); + error = sysctl_handle_int(oidp, &qlimit, 0, req); + if (error || !req->newptr) + return (error); + if (qlimit < 1) + return (EINVAL); + return (netisr_setqlimit(&ip_direct_nh, qlimit)); +} +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQMAXLEN, intr_direct_queue_maxlen, + CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I", + "Maximum size of the IP direct input queue"); + +static int +sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS) +{ + u_int64_t qdrops_long; + int error, qdrops; + + netisr_getqdrops(&ip_direct_nh, &qdrops_long); + qdrops = qdrops_long; + error = sysctl_handle_int(oidp, &qdrops, 0, req); + if (error || !req->newptr) + return (error); + if (qdrops != 0) + return (EINVAL); + netisr_clearqdrops(&ip_direct_nh); + return (0); +} + +SYSCTL_PROC(_net_inet_ip, IPCTL_INTRQDROPS, intr_direct_queue_drops, + CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I", + "Number of packets dropped from the IP direct input queue"); +#endif /* RSS */ + /* * IP initialization: fill in IP protocol switch table. * All protocols not implemented in kernel go to raw IP protocol handler. @@ -327,6 +393,9 @@ /* Initialize various other remaining things. */ IPQ_LOCK_INIT(); netisr_register(&ip_nh); +#ifdef RSS + netisr_register(&ip_direct_nh); +#endif } #ifdef VIMAGE @@ -350,6 +419,28 @@ } #endif +#ifdef RSS +/* + * IP direct input routine. + * + * This is called when reinjecting completed fragments where + * all of the previous checking and book-keeping has been done. + */ +void +ip_direct_input(struct mbuf *m) +{ + struct ip *ip; + int hlen; + + ip = mtod(m, struct ip *); + hlen = ip->ip_hl << 2; + + IPSTAT_INC(ips_delivered); + (*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen); + return; +} +#endif + /* * Ip input routine. Checksum and byte swap header. If fragmented * try to reassemble. Process options. Pass to next level. @@ -463,6 +554,7 @@ } else m_adj(m, ip_len - m->m_pkthdr.len); } + #ifdef IPSEC /* * Bypass packet filtering for packets previously handled by IPsec. @@ -817,6 +909,9 @@ int i, hlen, next; u_int8_t ecn, ecn0; u_short hash; +#ifdef RSS + uint32_t rss_hash, rss_type; +#endif /* If maxnipq or maxfragsperpacket are 0, never accept fragments. */ if (V_maxnipq == 0 || V_maxfragsperpacket == 0) { @@ -1106,6 +1201,42 @@ m_fixhdr(m); IPSTAT_INC(ips_reassembled); IPQ_UNLOCK(); + +#ifdef RSS + /* + * Query the RSS layer for the flowid / flowtype for the + * mbuf payload. + * + * For now, just assume we have to calculate a new one. + * Later on we should check to see if the assigned flowid matches + * what RSS wants for the given IP protocol and if so, just keep it. + * + * We then queue into the relevant netisr so it can be dispatched + * to the correct CPU. + * + * Note - this may return 1, which means the flowid in the mbuf + * is correct for the configured RSS hash types and can be used. + */ + if (rss_mbuf_software_hash_v4(m, 0, &rss_hash, &rss_type) == 0) { + m->m_pkthdr.flowid = rss_hash; + M_HASHTYPE_SET(m, rss_type); + m->m_flags |= M_FLOWID; + } +#endif + +#ifdef RSS + /* + * Queue/dispatch for reprocessing. + * + * Note: this is much slower than just handling the frame in the + * current receive context. It's likely worth investigating + * why this is. + */ + netisr_dispatch(NETISR_IP_DIRECT, m); + return (NULL); +#endif + + /* Handle in-line */ return (m); dropfrag: @@ -1662,6 +1793,43 @@ if (*mp) mp = &(*mp)->m_next; } + + if (inp->inp_flags2 & INP_RECVFLOWID) { + uint32_t flowid, flow_type; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + /* + * XXX should handle the failure of one or the + * other - don't populate both? + */ + *mp = sbcreatecontrol((caddr_t) &flowid, + sizeof(uint32_t), IP_FLOWID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + *mp = sbcreatecontrol((caddr_t) &flow_type, + sizeof(uint32_t), IP_FLOWTYPE, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + +#ifdef RSS + if (inp->inp_flags2 & INP_RECVRSSBUCKETID) { + uint32_t flowid, flow_type; + uint32_t rss_bucketid; + + flowid = m->m_pkthdr.flowid; + flow_type = M_HASHTYPE_GET(m); + + if (rss_hash2bucket(flowid, flow_type, &rss_bucketid) == 0) { + *mp = sbcreatecontrol((caddr_t) &rss_bucketid, + sizeof(uint32_t), IP_RSSBUCKETID, IPPROTO_IP); + if (*mp) + mp = &(*mp)->m_next; + } + } +#endif } /* Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -145,7 +145,23 @@ if (inp != NULL) { INP_LOCK_ASSERT(inp); M_SETFIB(m, inp->inp_inc.inc_fibnum); - if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { + + /* + * Force the flowid / flowtype for outbound data to match the + * inp. + * + * For UDP that's going to be a bit special because we may be + * sending with a different source/destination address + * than the initial bind (which may be a global bind.) + * + * I'm not sure what to do about this just yet. + * Maybe the correct thing to do is to do the flowid + * assignment in the callers of ip_output() since they'll + * know if the inp flowid details are supposed to be + * used. + */ + if( ((flags & IP_NODEFAULTFLOWID) == 0) && + inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) { m->m_pkthdr.flowid = inp->inp_flowid; M_HASHTYPE_SET(m, inp->inp_flowtype); m->m_flags |= M_FLOWID; @@ -1016,6 +1032,10 @@ case IP_ONESBCAST: case IP_DONTFRAG: case IP_RECVTOS: + case IP_RECVFLOWID: +#ifdef RSS + case IP_RECVRSSBUCKETID: +#endif error = sooptcopyin(sopt, &optval, sizeof optval, sizeof optval); if (error) @@ -1094,6 +1114,9 @@ case IP_BINDMULTI: OPTSET2(INP_BINDMULTI, optval); break; + case IP_RECVFLOWID: + OPTSET2(INP_RECVFLOWID, optval); + break; #ifdef RSS case IP_RSS_LISTEN_BUCKET: if ((optval >= 0) && @@ -1104,6 +1127,9 @@ error = EINVAL; } break; + case IP_RECVRSSBUCKETID: + OPTSET2(INP_RECVRSSBUCKETID, optval); + break; #endif } break; @@ -1219,8 +1245,10 @@ case IP_BINDMULTI: case IP_FLOWID: case IP_FLOWTYPE: + case IP_RECVFLOWID: #ifdef RSS case IP_RSSBUCKETID: + case IP_RECVRSSBUCKETID: #endif switch (sopt->sopt_name) { @@ -1290,6 +1318,9 @@ case IP_FLOWTYPE: optval = inp->inp_flowtype; break; + case IP_RECVFLOWID: + optval = OPTBIT2(INP_RECVFLOWID); + break; #ifdef RSS case IP_RSSBUCKETID: retval = rss_hash2bucket(inp->inp_flowid, @@ -1300,6 +1331,9 @@ else error = EINVAL; break; + case IP_RECVRSSBUCKETID: + optval = OPTBIT2(INP_RECVRSSBUCKETID); + break; #endif case IP_BINDMULTI: optval = OPTBIT2(INP_BINDMULTI); Index: sys/netinet/ip_var.h =================================================================== --- sys/netinet/ip_var.h +++ sys/netinet/ip_var.h @@ -161,6 +161,7 @@ #define IP_SENDTOIF 0x8 /* send on specific ifnet */ #define IP_ROUTETOIF SO_DONTROUTE /* 0x10 bypass routing tables */ #define IP_ALLOWBROADCAST SO_BROADCAST /* 0x20 can send broadcast packets */ +#define IP_NODEFAULTFLOWID 0x40 /* Don't set the flowid from inp */ #ifdef __NO_STRICT_ALIGNMENT #define IP_HDR_ALIGNED_P(ip) 1 Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -43,6 +43,7 @@ #include "opt_inet.h" #include "opt_inet6.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -89,6 +90,7 @@ #include #include #include +#include #ifdef IPSEC #include @@ -206,6 +208,13 @@ udp_init(void) { + /* + * For now default to 2-tuple UDP hashing - until the fragment + * reassembly code can also update the flowid. + * + * Once we can calculate the flowid that way and re-establish + * a 4-tuple, flip this to 4-tuple. + */ in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE, IPI_HASHFIELDS_2TUPLE); @@ -1084,6 +1093,9 @@ u_char tos; uint8_t pr; uint16_t cscov = 0; + uint32_t flowid = 0; + int flowid_type = 0; + int use_flowid = 0; /* * udp_output() may need to temporarily bind or connect the current @@ -1147,6 +1159,34 @@ tos = *(u_char *)CMSG_DATA(cm); break; + case IP_FLOWID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid = *(uint32_t *) CMSG_DATA(cm); + break; + + case IP_FLOWTYPE: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + flowid_type = *(uint32_t *) CMSG_DATA(cm); + use_flowid = 1; + break; + + case IP_RSSBUCKETID: + if (cm->cmsg_len != CMSG_LEN(sizeof(uint32_t))) { + error = EINVAL; + break; + } + /* + * XXX don't error out for now, but don't + * do anything + */ + break; + default: error = ENOPROTOOPT; break; @@ -1395,6 +1435,63 @@ ((struct ip *)ui)->ip_tos = tos; /* XXX */ UDPSTAT_INC(udps_opackets); + /* + * Setup flowid / RSS information for outbound socket. + * + * Once the UDP code decides to set a flowid some other way, + * this allows the flowid to be overridden by userland. + * + * Remember ip_output() overrides with the inp flowid details + * if they exist. + * + * .. and ip_output() -> flowtable_lookup() also assigns + * a flowid too. Ugh. + */ + if (use_flowid) { + m->m_flags |= M_FLOWID; + m->m_pkthdr.flowid = flowid; + M_HASHTYPE_SET(m, flowid_type); +#ifdef RSS + } else { + uint32_t hash_val, hash_type; + /* + * Calculate an appropriate RSS hash for UDP and + * UDP Lite. + * + * The called function will take care of figuring out + * whether a 2-tuple or 4-tuple hash is required based + * on the currently configured scheme. + * + * Later later on connected socket values should be + * cached in the inpcb and reused, rather than constantly + * re-calculating it. + * + * UDP Lite is a different protocol number and will + * likely end up being hashed as a 2-tuple until + * RSS / NICs grow UDP Lite protocol awareness. + */ + if (rss_proto_software_hash_v4(faddr, laddr, fport, lport, + pr, &hash_val, &hash_type) == 0) { + m->m_pkthdr.flowid = hash_val; + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, hash_type); + } +#endif + } + + /* + * Don't override with the inp cached flowid value. + * + * Depending upon the kind of send being done, the inp + * flowid/flowtype values may actually not be appropriate + * for this particular socket send. + * + * We should either leave the flowid at zero (which is what is + * currently done) or set it to some software generated + * hash value based on the packet contents. + */ + ipflags |= IP_NODEFAULTFLOWID; + if (unlock_udbinfo == UH_WLOCKED) INP_HASH_WUNLOCK(pcbinfo); else if (unlock_udbinfo == UH_RLOCKED) Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -227,6 +227,9 @@ * * ifpp - XXX: just for statistics */ +/* + * XXX TODO: no flowid is assigned for outbound flows? + */ int ip6_output(struct mbuf *m0, struct ip6_pktopts *opt, struct route_in6 *ro, int flags, struct ip6_moptions *im6o, @@ -260,8 +263,14 @@ goto bad; } - if (inp != NULL) + if (inp != NULL) { M_SETFIB(m, inp->inp_inc.inc_fibnum); + if (((flags & IP_NODEFAULTFLOWID) == 0) && + (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID))) { + m->m_pkthdr.flowid = inp->inp_flowid; + m->m_flags |= M_FLOWID; + } + } finaldst = ip6->ip6_dst; bzero(&exthdrs, sizeof(exthdrs)); Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -74,6 +74,7 @@ #include "opt_inet6.h" #include "opt_ipfw.h" #include "opt_ipsec.h" +#include "opt_rss.h" #include #include @@ -111,6 +112,7 @@ #include #include #include +#include #include #include @@ -850,8 +852,28 @@ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); } + /* + * XXX for now assume UDP is 2-tuple. + * Later on this may become configurable as 4-tuple; + * we should support that. + * + * XXX .. and we should likely cache this in the inpcb. + */ +#ifdef RSS + m->m_pkthdr.flowid = rss_hash_ip6_2tuple(*faddr, *laddr); + m->m_flags |= M_FLOWID; + M_HASHTYPE_SET(m, M_HASHTYPE_RSS_IPV6); +#endif flags = 0; + /* + * Don't override with the inp cached flowid. + * + * Until the whole UDP path is vetted, it may actually + * be incorrect. + */ + flags |= IP_NODEFAULTFLOWID; + UDP_PROBE(send, NULL, inp, ip6, inp, udp6); UDPSTAT_INC(udps_opackets); error = ip6_output(m, optp, NULL, flags, inp->in6p_moptions,