Index: sys/netinet/in_rss.h
===================================================================
--- sys/netinet/in_rss.h
+++ sys/netinet/in_rss.h
@@ -83,6 +83,16 @@
 #define	RSS_KEYSIZE	40
 
 /*
+ * For RSS hash methods that do a software hash on an mbuf, the packet
+ * direction (ingress / egress) is required.
+ *
+ * The default direction (INGRESS) is the "receive into the NIC" - i.e.,
+ * what the hardware is hashing on.
+ */
+#define	RSS_HASH_PKT_INGRESS	0
+#define	RSS_HASH_PKT_EGRESS	1
+
+/*
  * Device driver interfaces to query RSS properties that must be programmed
  * into hardware.
  */
@@ -116,4 +126,15 @@
 	    uint32_t *bucket_id);
 int	rss_m2bucket(struct mbuf *m, uint32_t *bucket_id);
 
+/*
+ * Functions to calculate a software RSS hash for a given mbuf or for
+ * explicit packet details; both return 0 when a hash was calculated.
+ */
+int	rss_mbuf_software_hash_v4(const struct mbuf *m, int dir,
+	    uint32_t *hashval, uint32_t *hashtype);
+int	rss_proto_software_hash_v4(struct in_addr src,
+	    struct in_addr dst, u_short src_port, u_short dst_port,
+	    int proto, int dir, uint32_t *hashval,
+	    uint32_t *hashtype);
+
 #endif /* !_NETINET_IN_RSS_H_ */
Index: sys/netinet/in_rss.c
===================================================================
--- sys/netinet/in_rss.c
+++ sys/netinet/in_rss.c
@@ -57,6 +57,11 @@
 #include
 #include
 
+/* For software RSS hash support. */
+#include
+#include
+#include
+
 /*-
  * Operating system parts of receiver-side scaling (RSS), which allows
  * network cards to direct flows to particular receive queues based on hashes
@@ -170,6 +175,8 @@
 };
 static struct rss_table_entry	rss_table[RSS_TABLE_MAXLEN];
 
+static inline u_int rss_gethashconfig_local(void);
+
 static void
 rss_init(__unused void *arg)
 {
@@ -491,6 +498,188 @@
 }
 
 /*
+ * Calculate an appropriate ipv4 2-tuple or 4-tuple given the given
+ * IPv4 source/destination address, UDP or TCP source/destination ports
+ * and the protocol type.
+ *
+ * The protocol code may wish to do a software hash of the given
+ * tuple.
This depends upon the currently configured RSS hash types.
+ *
+ * dir is RSS_HASH_PKT_INGRESS for in, RSS_HASH_PKT_EGRESS for out.
+ * proto is the IPv4 protocol type.
+ */
+int
+rss_proto_software_hash_v4(struct in_addr src, struct in_addr dst,
+    u_short src_port, u_short dst_port, int proto, int dir,
+    uint32_t *hashval, uint32_t *hashtype)
+{
+	struct in_addr saddr, daddr;
+	u_short sport, dport;
+	u_int hashcfg;
+
+	/* Egress tuples are swapped so both directions hash alike. */
+	if (dir != RSS_HASH_PKT_INGRESS) {
+		saddr = dst;
+		daddr = src;
+		sport = dst_port;
+		dport = src_port;
+	} else {
+		saddr = src;
+		daddr = dst;
+		sport = src_port;
+		dport = dst_port;
+	}
+
+	/*
+	 * Pick the hash type from the protocol identifier.  The 4-tuple
+	 * hash is used only for TCP or UDP, and only when the matching
+	 * hash type is configured.
+	 */
+	hashcfg = rss_gethashconfig_local();
+	if (proto == IPPROTO_TCP && (hashcfg & RSS_HASHTYPE_RSS_TCP_IPV4)) {
+		*hashval = rss_hash_ip4_4tuple(saddr, sport, daddr, dport);
+		*hashtype = M_HASHTYPE_RSS_TCP_IPV4;
+		return (0);
+	}
+	if (proto == IPPROTO_UDP && (hashcfg & RSS_HASHTYPE_RSS_UDP_IPV4)) {
+		*hashval = rss_hash_ip4_4tuple(saddr, sport, daddr, dport);
+		*hashtype = M_HASHTYPE_RSS_UDP_IPV4;
+		return (0);
+	}
+
+	/* Everything else (e.g. SCTP) falls back to the 2-tuple hash. */
+	if (hashcfg & RSS_HASHTYPE_RSS_IPV4) {
+		*hashval = rss_hash_ip4_2tuple(saddr, daddr);
+		*hashtype = M_HASHTYPE_RSS_IPV4;
+		return (0);
+	}
+
+	/* No configured available hashtypes! */
+	return (-1);
+}
+
+/*
+ * Do a software calculation of the RSS for the given mbuf.
+ *
+ * This is typically used by the input path to recalculate the RSS after
+ * some form of packet processing (eg de-capsulation, IP fragment reassembly.)
+ *
+ * dir is the packet direction - RSS_HASH_PKT_INGRESS for incoming and
+ * RSS_HASH_PKT_EGRESS for outgoing.
+ *
+ * Returns 0 if a hash was done, -1 if no hash was done, +1 if
+ * the mbuf already had a valid RSS flowid.
+ *
+ * This function doesn't modify the mbuf. It's up to the caller to
+ * assign flowid/flowtype as appropriate.
+ */
+int
+rss_mbuf_software_hash_v4(const struct mbuf *m, int dir, uint32_t *hashval,
+    uint32_t *hashtype)
+{
+	const struct ip *ip;
+	const struct tcphdr *th;
+	const struct udphdr *uh;
+	uint8_t proto;
+	int iphlen;
+
+	/*
+	 * First, validate that the mbuf we have is long enough
+	 * to have an IPv4 header in it.
+	 */
+	if (m->m_pkthdr.len < (sizeof(struct ip)))
+		return (-1);
+	if (m->m_len < (sizeof(struct ip)))
+		return (-1);
+
+	/* Ok, let's dereference that */
+	ip = mtod(m, const struct ip *);
+	proto = ip->ip_p;
+	iphlen = ip->ip_hl << 2;
+
+	/*
+	 * ip_hl comes from the packet itself.  A value below the fixed
+	 * header size would put the transport header pointers computed
+	 * below inside the IP header, so no hash is done for it.
+	 */
+	if (iphlen < (int)sizeof(struct ip))
+		return (-1);
+
+	/* Fragments may lack the L4 header; hash them as a 2-tuple. */
+	if (ip->ip_off & htons(IP_MF | IP_OFFMASK))
+		proto = IPPROTO_IP;
+
+	/*
+	 * If the mbuf flowid/flowtype matches the packet type,
+	 * then signal to the owner that it can trust the flowid/flowtype
+	 * details.
+	 */
+	if (m->m_flags & M_FLOWID) {
+		uint32_t flowtype;
+		u_int hashcfg;
+
+		flowtype = M_HASHTYPE_GET(m);
+		hashcfg = rss_gethashconfig_local();
+
+		switch (proto) {
+		case IPPROTO_UDP:
+			if ((hashcfg & RSS_HASHTYPE_RSS_UDP_IPV4) &&
+			    flowtype == M_HASHTYPE_RSS_UDP_IPV4) {
+				return (1);
+			}
+			if ((hashcfg & RSS_HASHTYPE_RSS_IPV4) &&
+			    flowtype == M_HASHTYPE_RSS_IPV4) {
+				return (1);
+			}
+			break;
+		case IPPROTO_TCP:
+			if ((hashcfg & RSS_HASHTYPE_RSS_TCP_IPV4) &&
+			    flowtype == M_HASHTYPE_RSS_TCP_IPV4) {
+				return (1);
+			}
+			if ((hashcfg & RSS_HASHTYPE_RSS_IPV4) &&
+			    flowtype == M_HASHTYPE_RSS_IPV4) {
+				return (1);
+			}
+			break;
+		default:
+			if ((hashcfg & RSS_HASHTYPE_RSS_IPV4) &&
+			    flowtype == M_HASHTYPE_RSS_IPV4) {
+				return (1);
+			}
+			break;
+		}
+	}
+
+	/*
+	 * Decode enough information to make a hash decision.
+	 *
+	 * XXX TODO: does the hardware hash on 4-tuple if IP
+	 * options are present?
+	 */
+	if (proto == IPPROTO_TCP) {
+		if (m->m_len < iphlen + sizeof(struct tcphdr))
+			return (-1);
+		th = (const struct tcphdr *)((const char *)ip + iphlen);
+		return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+		    th->th_sport, th->th_dport, proto, dir, hashval,
+		    hashtype));
+	} else if (proto == IPPROTO_UDP) {
+		/* Check the length before touching the UDP header. */
+		if (m->m_len < iphlen + sizeof(struct udphdr))
+			return (-1);
+		uh = (const struct udphdr *)((const char *)ip + iphlen);
+		return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+		    uh->uh_sport, uh->uh_dport, proto, dir, hashval,
+		    hashtype));
+	}
+
+	/* Default to 2-tuple hash; the ports are not used by it. */
+	return (rss_proto_software_hash_v4(ip->ip_src, ip->ip_dst,
+	    0, 0, IPPROTO_IP, dir, hashval, hashtype));
+}
+
+/*
  * Query the RSS hash algorithm.
  */
 u_int
@@ -538,15 +727,10 @@
 	return (rss_ncpus);
 }
 
-/*
- * Return the supported RSS hash configuration.
- *
- * NICs should query this to determine what to configure in their redirection
- * matching table.
- */
-u_int
-rss_gethashconfig(void)
+static inline u_int
+rss_gethashconfig_local(void)
 {
+
 	/* Return 4-tuple for TCP; 2-tuple for others */
 	/*
 	 * UDP may fragment more often than TCP and thus we'll end up with
@@ -573,6 +757,19 @@
 }
 
 /*
+ * Return the supported RSS hash configuration.
+ *
+ * NICs should query this to determine what to configure in their redirection
+ * matching table.
+ */
+u_int
+rss_gethashconfig(void)
+{
+
+	return (rss_gethashconfig_local());
+}
+
+/*
  * XXXRW: Confirm that sysctl -a won't dump this keying material, don't want
  * it appearing in debugging output unnecessarily.
*/ Index: sys/netinet/in_var.h =================================================================== --- sys/netinet/in_var.h +++ sys/netinet/in_var.h @@ -411,6 +411,7 @@ int in_addprefix(struct in_ifaddr *, int); int in_scrubprefix(struct in_ifaddr *, u_int); void ip_input(struct mbuf *); +void ip_direct_input(struct mbuf *); void in_ifadown(struct ifaddr *ifa, int); struct mbuf *ip_fastforward(struct mbuf *); void *in_domifattach(struct ifnet *); Index: sys/netinet/ip_input.c =================================================================== --- sys/netinet/ip_input.c +++ sys/netinet/ip_input.c @@ -37,6 +37,7 @@ #include "opt_ipstealth.h" #include "opt_ipsec.h" #include "opt_route.h" +#include "opt_rss.h" #include #include @@ -77,6 +78,7 @@ #ifdef IPSEC #include #endif /* IPSEC */ +#include #include @@ -140,13 +142,33 @@ VNET_DEFINE(struct pfil_head, inet_pfil_hook); /* Packet filter hooks */ +/* + * We may need to re-inject packets into the IP stack for further work. + * In this instance, use the CPU policy and query the RSS layer for the + * relevant CPU ID to use. 
+ */
 static struct netisr_handler ip_nh = {
 	.nh_name = "ip",
 	.nh_handler = ip_input,
 	.nh_proto = NETISR_IP,
+#ifdef RSS
+	.nh_m2cpuid = rss_m2cpuid,
+	.nh_policy = NETISR_POLICY_CPU,
+#else
 	.nh_policy = NETISR_POLICY_FLOW,
+#endif
 };
 
+#ifdef RSS
+static struct netisr_handler ip_direct_nh = {
+	.nh_name = "ip_direct",
+	.nh_handler = ip_direct_input,
+	.nh_proto = NETISR_IP_DIRECT,
+	.nh_m2cpuid = rss_m2cpuid,
+	.nh_policy = NETISR_POLICY_CPU,
+};
+#endif
+
 extern struct domain inetdomain;
 extern struct protosw inetsw[];
 u_char ip_protox[IPPROTO_MAX];
@@ -266,6 +288,54 @@
     CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_queue_drops, "I",
     "Number of packets dropped from the IP input queue");
 
+#ifdef RSS
+/*
+ * Sysctl handler for the queue limit of the "ip_direct" netisr: reports
+ * the current limit and, on write, applies a new one (minimum 1).
+ */
+static int
+sysctl_netinet_intr_direct_queue_maxlen(SYSCTL_HANDLER_ARGS)
+{
+	int error, qlimit;
+
+	netisr_getqlimit(&ip_direct_nh, &qlimit);
+	error = sysctl_handle_int(oidp, &qlimit, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	if (qlimit < 1)
+		return (EINVAL);
+	return (netisr_setqlimit(&ip_direct_nh, qlimit));
+}
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, intr_direct_queue_maxlen,
+    CTLTYPE_INT|CTLFLAG_RW, 0, 0, sysctl_netinet_intr_direct_queue_maxlen, "I",
+    "Maximum size of the IP direct input queue");
+
+/*
+ * Sysctl handler for the drop counter of the "ip_direct" netisr: reports
+ * the count; a write of 0 clears it and any other value is rejected.
+ */
+static int
+sysctl_netinet_intr_direct_queue_drops(SYSCTL_HANDLER_ARGS)
+{
+	u_int64_t qdrops_long;
+	int error, qdrops;
+
+	netisr_getqdrops(&ip_direct_nh, &qdrops_long);
+	qdrops = qdrops_long;
+	error = sysctl_handle_int(oidp, &qdrops, 0, req);
+	if (error || !req->newptr)
+		return (error);
+	if (qdrops != 0)
+		return (EINVAL);
+	netisr_clearqdrops(&ip_direct_nh);
+	return (0);
+}
+
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, intr_direct_queue_drops,
+    CTLTYPE_INT|CTLFLAG_RD, 0, 0, sysctl_netinet_intr_direct_queue_drops, "I",
+    "Number of packets dropped from the IP direct input queue");
+#endif /* RSS */
+
 /*
  * IP initialization: fill in IP protocol switch table.
  * All protocols not implemented in kernel go to raw IP protocol handler.
@@ -327,6 +389,9 @@
 	/* Initialize various other remaining things. */
 	IPQ_LOCK_INIT();
 	netisr_register(&ip_nh);
+#ifdef RSS
+	netisr_register(&ip_direct_nh);
+#endif
 }
 
 #ifdef VIMAGE
@@ -350,6 +415,28 @@
 }
 #endif
 
+#ifdef RSS
+/*
+ * IP direct input routine.
+ *
+ * This is called when reinjecting completed fragments where
+ * all of the previous checking and book-keeping has been done.
+ */
+void
+ip_direct_input(struct mbuf *m)
+{
+	struct ip *ip;
+	int hlen;
+
+	ip = mtod(m, struct ip *);
+	hlen = ip->ip_hl << 2;
+
+	IPSTAT_INC(ips_delivered);
+	(*inetsw[ip_protox[ip->ip_p]].pr_input)(m, hlen);
+	return;
+}
+#endif
+
 /*
  * Ip input routine.  Checksum and byte swap header.  If fragmented
  * try to reassemble.  Process options.  Pass to next level.
@@ -369,11 +456,10 @@
 	M_ASSERTPKTHDR(m);
 
 	if (m->m_flags & M_FASTFWD_OURS) {
-		m->m_flags &= ~M_FASTFWD_OURS;
 		/* Set up some basics that will be used later. */
+		m->m_flags &= ~M_FASTFWD_OURS;
 		ip = mtod(m, struct ip *);
 		hlen = ip->ip_hl << 2;
-		ip_len = ntohs(ip->ip_len);
 		goto ours;
 	}
 
@@ -463,6 +549,7 @@
 		} else
 			m_adj(m, ip_len - m->m_pkthdr.len);
 	}
+
 #ifdef IPSEC
 	/*
 	 * Bypass packet filtering for packets previously handled by IPsec.
@@ -806,6 +893,9 @@
  * mbuf returned for further processing.  Only m_tags attached
  * to the first packet/fragment are preserved.
  * The IP header is *NOT* adjusted out of iplen.
+ *
+ * With RSS, a completed packet is re-hashed in software where needed,
+ * queued to NETISR_IP_DIRECT, and NULL is returned instead of the mbuf.
  */
 struct mbuf *
 ip_reass(struct mbuf *m)
@@ -817,6 +907,9 @@
 	int i, hlen, next;
 	u_int8_t ecn, ecn0;
 	u_short hash;
+#ifdef RSS
+	uint32_t rss_hash, rss_type;
+#endif
 
 	/* If maxnipq or maxfragsperpacket are 0, never accept fragments. */
 	if (V_maxnipq == 0 || V_maxfragsperpacket == 0) {
@@ -1106,6 +1199,36 @@
 	m_fixhdr(m);
 	IPSTAT_INC(ips_reassembled);
 	IPQ_UNLOCK();
+
+#ifdef RSS
+	/*
+	 * Query the RSS layer for the flowid / flowtype of the mbuf payload.
+	 *
+	 * We then queue into the relevant netisr so it can be dispatched
+	 * to the correct CPU.
+	 *
+	 * Note - this may return 1, which means the flowid in the mbuf
+	 * is correct for the configured RSS hash types and can be used.
+	 */
+	if (rss_mbuf_software_hash_v4(m, RSS_HASH_PKT_INGRESS, &rss_hash,
+	    &rss_type) == 0) {
+		m->m_pkthdr.flowid = rss_hash;
+		M_HASHTYPE_SET(m, rss_type);
+		m->m_flags |= M_FLOWID;
+	}
+
+	/*
+	 * Queue/dispatch for reprocessing.
+	 *
+	 * Note: this is much slower than just handling the frame in the
+	 * current receive context.  It's likely worth investigating
+	 * why this is.
+	 */
+	netisr_dispatch(NETISR_IP_DIRECT, m);
+	return (NULL);
+#else
+	/* Handle in-line */
 	return (m);
+#endif
 
 dropfrag:
Index: sys/netinet/ip_output.c
===================================================================
--- sys/netinet/ip_output.c
+++ sys/netinet/ip_output.c
@@ -145,7 +145,12 @@
 	if (inp != NULL) {
 		INP_LOCK_ASSERT(inp);
 		M_SETFIB(m, inp->inp_inc.inc_fibnum);
-		if (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) {
+		/*
+		 * Force the flowid / flowtype for outbound data to match the
+		 * inp.
+		 */
+		if ((flags & IP_NODEFAULTFLOWID) == 0 &&
+		    (inp->inp_flags & (INP_HW_FLOWID|INP_SW_FLOWID)) != 0) {
 			m->m_pkthdr.flowid = inp->inp_flowid;
 			M_HASHTYPE_SET(m, inp->inp_flowtype);
 			m->m_flags |= M_FLOWID;
Index: sys/netinet/ip_var.h
===================================================================
--- sys/netinet/ip_var.h
+++ sys/netinet/ip_var.h
@@ -161,6 +161,7 @@
 #define	IP_SENDTOIF		0x8		/* send on specific ifnet */
 #define	IP_ROUTETOIF		SO_DONTROUTE	/* 0x10 bypass routing tables */
 #define	IP_ALLOWBROADCAST	SO_BROADCAST	/* 0x20 can send broadcast packets */
+#define	IP_NODEFAULTFLOWID	0x40		/* Don't set the flowid from inp */
 
 #ifdef __NO_STRICT_ALIGNMENT
 #define	IP_HDR_ALIGNED_P(ip)	1
Index: sys/netinet/tcp_timewait.c
===================================================================
--- sys/netinet/tcp_timewait.c
+++ sys/netinet/tcp_timewait.c
@@ -646,8 +646,18 @@
 
 	TW_WLOCK(V_tw_lock);
 	TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl);
-	crfree(tw->tw_cred);
-	tw->tw_cred = NULL;
+	/*
+	 * XXX: tw_cred has been seen to be NULL here and the cause is
+	 * not yet understood.  Possibly timewait sessions get created
+	 * without credentials under high connection create/destroy
+	 * rates with SO_REUSEPORT, or sessions are found without
+	 * credentials because they are being recycled.  Until that is
+	 * known, only release the credential when there is one.
+	 */
+	if (tw->tw_cred != NULL) {
+		crfree(tw->tw_cred);
+		tw->tw_cred = NULL;
+	}
 	TW_WUNLOCK(V_tw_lock);
 
 	if (!reuse)
Index: sys/netinet/udp_usrreq.c
===================================================================
--- sys/netinet/udp_usrreq.c
+++ sys/netinet/udp_usrreq.c
@@ -43,6 +43,7 @@
 #include "opt_inet.h"
 #include "opt_inet6.h"
 #include "opt_ipsec.h"
+#include "opt_rss.h"
 
 #include
 #include
@@ -89,6 +90,7 @@
 #include
 #include
 #include
+#include
 
 #ifdef IPSEC
 #include
@@ -206,6 +208,13 @@
 udp_init(void)
 {
 
+	/*
+	 * For now default to 2-tuple UDP hashing - until the fragment
+	 * reassembly code can also update the flowid.
+	 *
+	 * Once we can calculate the flowid that way and re-establish
+	 * a 4-tuple, flip this to 4-tuple.
+	 */
 	in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE,
 	    "udp_inpcb", udp_inpcb_init, NULL, UMA_ZONE_NOFREE,
 	    IPI_HASHFIELDS_2TUPLE);
@@ -1395,6 +1404,64 @@
 	((struct ip *)ui)->ip_tos = tos;		/* XXX */
 	UDPSTAT_INC(udps_opackets);
 
+	/*
+	 * Setup flowid / RSS information for outbound socket.
+	 *
+	 * Once the UDP code decides to set a flowid some other way,
+	 * this allows the flowid to be overridden by userland.
+	 *
+	 * Remember ip_output() overrides with the inp flowid details
+	 * if they exist.
+	 *
+	 * .. and ip_output() -> flowtable_lookup() also assigns
+	 * a flowid too.  Ugh.
+	 */
+	if (use_flowid) {
+		m->m_flags |= M_FLOWID;
+		m->m_pkthdr.flowid = flowid;
+		M_HASHTYPE_SET(m, flowid_type);
+#ifdef RSS
+	} else {
+		uint32_t hash_val, hash_type;
+		/*
+		 * No flowid was supplied: calculate an appropriate RSS
+		 * hash for UDP and UDP Lite.
+		 *
+		 * The called function will take care of figuring out
+		 * whether a 2-tuple or 4-tuple hash is required based
+		 * on the currently configured scheme.
+		 *
+		 * Later on connected socket values should be
+		 * cached in the inpcb and reused, rather than constantly
+		 * re-calculating it.
+		 *
+		 * UDP Lite is a different protocol number and will
+		 * likely end up being hashed as a 2-tuple until
+		 * RSS / NICs grow UDP Lite protocol awareness.
+		 */
+		if (rss_proto_software_hash_v4(laddr, faddr, lport, fport,
+		    pr, RSS_HASH_PKT_EGRESS,
+		    &hash_val, &hash_type) == 0) {
+			m->m_pkthdr.flowid = hash_val;
+			m->m_flags |= M_FLOWID;
+			M_HASHTYPE_SET(m, hash_type);
+		}
+#endif
+	}
+
+	/*
+	 * Don't override with the inp cached flowid value.
+	 *
+	 * Depending upon the kind of send being done, the inp
+	 * flowid/flowtype values may actually not be appropriate
+	 * for this particular socket send.
+	 *
+	 * We should either leave the flowid at zero (which is what is
+	 * currently done) or set it to some software generated
+	 * hash value based on the packet contents.
+	 */
+	ipflags |= IP_NODEFAULTFLOWID;
+
 	if (unlock_udbinfo == UH_WLOCKED)
 		INP_HASH_WUNLOCK(pcbinfo);
 	else if (unlock_udbinfo == UH_RLOCKED)