Index: head/share/man/man4/tcp.4 =================================================================== --- head/share/man/man4/tcp.4 +++ head/share/man/man4/tcp.4 @@ -34,7 +34,7 @@ .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93 .\" $FreeBSD$ .\" -.Dd November 25, 2020 +.Dd December 19, 2020 .Dt TCP 4 .Os .Sh NAME @@ -314,6 +314,21 @@ See .Xr ktls 4 for more details. +.It Dv TCP_REUSPORT_LB_NUMA +Changes NUMA affinity filtering for an established TCP listen +socket. +This option takes a single integer argument which specifies +the NUMA domain to filter on for this listen socket. +The argument can also have the following special values: +.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA" +.It Dv TCP_REUSPORT_LB_NUMA_NODOM +Remove NUMA filtering for this listen socket. +.It Dv TCP_REUSPORT_LB_NUMA_CURDOM +Filter traffic associated with the domain where the calling thread is +currently executing. +This is typically used after a process or thread inherits a listen +socket from its parent, and sets its CPU affinity to a particular core. 
+.El .El .Pp The option level for the Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h +++ head/sys/netinet/in_pcb.h @@ -565,7 +565,7 @@ struct epoch_context il_epoch_ctx; uint16_t il_lport; /* (c) */ u_char il_vflag; /* (c) */ - u_char il_pad; + u_int8_t il_numa_domain; uint32_t il_pad2; union in_dependaddr il_dependladdr; /* (c) */ #define il_laddr il_dependladdr.id46_addr.ia46_addr4 @@ -852,6 +852,7 @@ int in_pcbinshash_mbuf(struct inpcb *, struct mbuf *); int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *, struct ucred *); +int in_pcblbgroup_numa(struct inpcb *, int arg); struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); Index: head/sys/netinet/in_pcb.c =================================================================== --- head/sys/netinet/in_pcb.c +++ head/sys/netinet/in_pcb.c @@ -75,6 +75,7 @@ #endif #include +#include #include #include @@ -150,7 +151,8 @@ static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, - int lookupflags, struct ifnet *ifp); + int lookupflags, struct ifnet *ifp, + uint8_t numa_domain); #define RANGECHK(var, min, max) \ if ((var) < (min)) { (var) = (min); } \ @@ -248,7 +250,8 @@ static struct inpcblbgroup * in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, - uint16_t port, const union in_dependaddr *addr, int size) + uint16_t port, const union in_dependaddr *addr, int size, + uint8_t numa_domain) { struct inpcblbgroup *grp; size_t bytes; @@ -259,6 +262,7 @@ return (NULL); grp->il_vflag = vflag; grp->il_lport = port; + grp->il_numa_domain = numa_domain; grp->il_dependladdr = *addr; grp->il_inpsiz = size; CK_LIST_INSERT_HEAD(hdr, grp, il_list); @@ -290,7 +294,8 @@ int i; grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, - old_grp->il_lport, &old_grp->il_dependladdr, size); + 
old_grp->il_lport, &old_grp->il_dependladdr, size, + old_grp->il_numa_domain); if (grp == NULL) return (NULL); @@ -333,7 +338,7 @@ * Add PCB to load balance group for SO_REUSEPORT_LB option. */ static int -in_pcbinslbgrouphash(struct inpcb *inp) +in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain) { const static struct timeval interval = { 60, 0 }; static struct timeval lastprint; @@ -369,6 +374,7 @@ CK_LIST_FOREACH(grp, hdr, il_list) { if (grp->il_vflag == inp->inp_vflag && grp->il_lport == inp->inp_lport && + grp->il_numa_domain == numa_domain && memcmp(&grp->il_dependladdr, &inp->inp_inc.inc_ie.ie_dependladdr, sizeof(grp->il_dependladdr)) == 0) @@ -378,7 +384,7 @@ /* Create new load balance group. */ grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, - INPCBLBGROUP_SIZMIN); + INPCBLBGROUP_SIZMIN, numa_domain); if (grp == NULL) return (ENOBUFS); } else if (grp->il_inpcnt == grp->il_inpsiz) { @@ -439,6 +445,57 @@ } } +int +in_pcblbgroup_numa(struct inpcb *inp, int arg) +{ + struct inpcbinfo *pcbinfo; + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + int err, i; + uint8_t numa_domain; + + switch (arg) { + case TCP_REUSPORT_LB_NUMA_NODOM: + numa_domain = M_NODOM; + break; + case TCP_REUSPORT_LB_NUMA_CURDOM: + numa_domain = PCPU_GET(domain); + break; + default: + if (arg < 0 || arg >= vm_ndomains) + return (EINVAL); + numa_domain = arg; + } + + err = 0; + pcbinfo = inp->inp_pcbinfo; + INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK(pcbinfo); + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + CK_LIST_FOREACH(grp, hdr, il_list) { + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_numa_domain == numa_domain) { + goto abort_with_hash_wlock; + } + + /* Remove it from the old group. */ + in_pcbremlbgrouphash(inp); + + /* Add it to the new group based on numa domain. 
*/ + err = in_pcbinslbgrouphash(inp, numa_domain); + goto abort_with_hash_wlock; + } + } + err = ENOENT; +abort_with_hash_wlock: + INP_HASH_WUNLOCK(pcbinfo); + return (err); +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -731,14 +788,14 @@ if (lsa->sa_family == AF_INET) { tmpinp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags, - NULL); + NULL, M_NODOM); } #endif #ifdef INET6 if (lsa->sa_family == AF_INET6) { tmpinp = in6_pcblookup_hash_locked(pcbinfo, faddr6, fport, laddr6, lport, lookupflags, - NULL); + NULL, M_NODOM); } #endif } else { @@ -1399,9 +1456,10 @@ if (error) return (error); } + if (lport != 0) { oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, - fport, laddr, lport, 0, NULL); + fport, laddr, lport, 0, NULL, M_NODOM); if (oinp != NULL) { if (oinpp != NULL) *oinpp = oinp; @@ -2019,9 +2077,9 @@ static struct inpcb * in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, - uint16_t fport, int lookupflags) + uint16_t fport, int lookupflags, int numa_domain) { - struct inpcb *local_wild; + struct inpcb *local_wild, *numa_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; @@ -2041,6 +2099,7 @@ * - Load balanced group does not contain IPv4 mapped INET6 wild sockets */ local_wild = NULL; + numa_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) @@ -2051,12 +2110,24 @@ idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % grp->il_inpcnt; - if (grp->il_laddr.s_addr == laddr->s_addr) - return (grp->il_inp[idx]); + if (grp->il_laddr.s_addr == laddr->s_addr) { + if (numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain) { + return (grp->il_inp[idx]); + } else { + numa_wild = grp->il_inp[idx]; + } + } if (grp->il_laddr.s_addr == INADDR_ANY && - (lookupflags & INPLOOKUP_WILDCARD) != 
0) + (lookupflags & INPLOOKUP_WILDCARD) != 0 && + (local_wild == NULL || numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain)) { local_wild = grp->il_inp[idx]; + } } + if (numa_wild != NULL) + return (numa_wild); + return (local_wild); } @@ -2303,7 +2374,7 @@ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; @@ -2348,7 +2419,7 @@ */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, - fport, lookupflags); + fport, lookupflags, numa_domain); if (inp != NULL) return (inp); } @@ -2435,12 +2506,13 @@ static struct inpcb * in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport, struct in_addr laddr, u_int lport, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, + numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); @@ -2507,7 +2579,7 @@ } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, M_NODOM)); } struct inpcb * @@ -2549,7 +2621,7 @@ } #endif return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, m->m_pkthdr.numa_domain)); } #endif /* INET */ @@ -2591,7 +2663,7 @@ */ so_options = inp_so_options(inp); if (so_options & SO_REUSEPORT_LB) { - int ret = in_pcbinslbgrouphash(inp); + int ret = in_pcbinslbgrouphash(inp, M_NODOM); if (ret) { /* pcb lb group malloc fail (ret=ENOBUFS). 
*/ return (ret); Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h +++ head/sys/netinet/tcp.h @@ -196,6 +196,7 @@ #define TCP_PCAP_IN 4096 /* number of input packets to keep */ #define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */ /* Options for Rack and BBR */ +#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */ #define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ #define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */ #define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ @@ -405,5 +406,8 @@ #define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */ #define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */ #define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */ + +#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */ +#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */ #endif /* !_NETINET_TCP_H_ */ Index: head/sys/netinet/tcp_usrreq.c =================================================================== --- head/sys/netinet/tcp_usrreq.c +++ head/sys/netinet/tcp_usrreq.c @@ -2143,6 +2143,16 @@ INP_WUNLOCK(inp); break; + case TCP_REUSPORT_LB_NUMA: + INP_WUNLOCK(inp); + error = sooptcopyin(sopt, &optval, sizeof(optval), + sizeof(optval)); + INP_WLOCK_RECHECK(inp); + if (!error) + error = in_pcblbgroup_numa(inp, optval); + INP_WUNLOCK(inp); + break; + #ifdef KERN_TLS case TCP_TXTLS_ENABLE: INP_WUNLOCK(inp); Index: head/sys/netinet6/in6_pcb.h =================================================================== --- head/sys/netinet6/in6_pcb.h +++ head/sys/netinet6/in6_pcb.h @@ -95,7 +95,7 @@ struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, - u_int lport_arg, int lookupflags, struct ifnet *ifp); + u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t); 
struct inpcb * in6_pcblookup(struct inpcbinfo *, struct in6_addr *, u_int, struct in6_addr *, u_int, int, Index: head/sys/netinet6/in6_pcb.c =================================================================== --- head/sys/netinet6/in6_pcb.c +++ head/sys/netinet6/in6_pcb.c @@ -446,7 +446,7 @@ sin6->sin6_port, IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) ? &laddr6.sin6_addr : &inp->in6p_laddr, - inp->inp_lport, 0, NULL) != NULL) { + inp->inp_lport, 0, NULL, M_NODOM) != NULL) { return (EADDRINUSE); } if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { @@ -903,9 +903,9 @@ static struct inpcb * in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, - uint16_t fport, int lookupflags) + uint16_t fport, int lookupflags, uint8_t numa_domain) { - struct inpcb *local_wild; + struct inpcb *local_wild, *numa_wild; const struct inpcblbgrouphead *hdr; struct inpcblbgroup *grp; uint32_t idx; @@ -925,6 +925,7 @@ * - Load balanced does not contain IPv4 mapped INET6 wild sockets. 
*/ local_wild = NULL; + numa_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET if (!(grp->il_vflag & INP_IPV6)) @@ -935,12 +936,23 @@ idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport, fport) % grp->il_inpcnt; - if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) - return (grp->il_inp[idx]); + if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) { + if (numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain) { + return (grp->il_inp[idx]); + } + else + numa_wild = grp->il_inp[idx]; + } if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) && - (lookupflags & INPLOOKUP_WILDCARD) != 0) + (lookupflags & INPLOOKUP_WILDCARD) != 0 && + (local_wild == NULL || numa_domain == M_NODOM || + grp->il_numa_domain == numa_domain)) { local_wild = grp->il_inp[idx]; + } } + if (numa_wild != NULL) + return (numa_wild); return (local_wild); } @@ -1151,7 +1163,7 @@ struct inpcb * in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr, u_int lport_arg, - int lookupflags, struct ifnet *ifp) + int lookupflags, struct ifnet *ifp, uint8_t numa_domain) { struct inpcbhead *head; struct inpcb *inp, *tmpinp; @@ -1195,7 +1207,7 @@ */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr, - fport, lookupflags); + fport, lookupflags, numa_domain); if (inp != NULL) return (inp); } @@ -1273,12 +1285,13 @@ static struct inpcb * in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr, u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags, - struct ifnet *ifp) + struct ifnet *ifp, uint8_t numa_domain) { struct inpcb *inp; inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, - (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); + (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp, + numa_domain); if (inp != NULL) { if (lookupflags & INPLOOKUP_WLOCKPCB) { INP_WLOCK(inp); @@ -1344,7 +1357,7 @@ } #endif return 
(in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, M_NODOM)); } struct inpcb * @@ -1386,7 +1399,7 @@ } #endif return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport, - lookupflags, ifp)); + lookupflags, ifp, m->m_pkthdr.numa_domain)); } void