Page Menu
Home
FreeBSD
Search
Configure Global Search
Log In
Files
F140591274
D21636.diff
No One
Temporary
Actions
View File
Edit File
Delete File
View Transforms
Subscribe
Mute Notifications
Flag For Later
Award Token
Size
14 KB
Referenced Files
None
Subscribers
None
D21636.diff
View Options
Index: head/share/man/man4/tcp.4
===================================================================
--- head/share/man/man4/tcp.4
+++ head/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd November 25, 2020
+.Dd December 19, 2020
.Dt TCP 4
.Os
.Sh NAME
@@ -314,6 +314,21 @@
See
.Xr ktls 4
for more details.
+.It Dv TCP_REUSPORT_LB_NUMA
+Changes NUMA affinity filtering for an established TCP listen
+socket.
+This option takes a single integer argument which specifies
+the NUMA domain to filter on for this listen socket.
+The argument can also have the following special values:
+.Bl -tag -width "Dv TCP_REUSPORT_LB_NUMA"
+.It Dv TCP_REUSPORT_LB_NUMA_NODOM
+Remove NUMA filtering for this listen socket.
+.It Dv TCP_REUSPORT_LB_NUMA_CURDOM
+Filter traffic associated with the domain where the calling thread is
+currently executing.
+This is typically used after a process or thread inherits a listen
+socket from its parent, and sets its CPU affinity to a particular core.
+.El
.El
.Pp
The option level for the
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -565,7 +565,7 @@
struct epoch_context il_epoch_ctx;
uint16_t il_lport; /* (c) */
u_char il_vflag; /* (c) */
- u_char il_pad;
+ u_int8_t il_numa_domain;
uint32_t il_pad2;
union in_dependaddr il_dependladdr; /* (c) */
#define il_laddr il_dependladdr.id46_addr.ia46_addr4
@@ -852,6 +852,7 @@
int in_pcbinshash_mbuf(struct inpcb *, struct mbuf *);
int in_pcbladdr(struct inpcb *, struct in_addr *, struct in_addr *,
struct ucred *);
+int in_pcblbgroup_numa(struct inpcb *, int arg);
struct inpcb *
in_pcblookup_local(struct inpcbinfo *,
struct in_addr, u_short, int, struct ucred *);
Index: head/sys/netinet/in_pcb.c
===================================================================
--- head/sys/netinet/in_pcb.c
+++ head/sys/netinet/in_pcb.c
@@ -75,6 +75,7 @@
#endif
#include <vm/uma.h>
+#include <vm/vm.h>
#include <net/if.h>
#include <net/if_var.h>
@@ -150,7 +151,8 @@
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in_addr faddr, u_int fport_arg,
struct in_addr laddr, u_int lport_arg,
- int lookupflags, struct ifnet *ifp);
+ int lookupflags, struct ifnet *ifp,
+ uint8_t numa_domain);
#define RANGECHK(var, min, max) \
if ((var) < (min)) { (var) = (min); } \
@@ -248,7 +250,8 @@
static struct inpcblbgroup *
in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag,
- uint16_t port, const union in_dependaddr *addr, int size)
+ uint16_t port, const union in_dependaddr *addr, int size,
+ uint8_t numa_domain)
{
struct inpcblbgroup *grp;
size_t bytes;
@@ -259,6 +262,7 @@
return (NULL);
grp->il_vflag = vflag;
grp->il_lport = port;
+ grp->il_numa_domain = numa_domain;
grp->il_dependladdr = *addr;
grp->il_inpsiz = size;
CK_LIST_INSERT_HEAD(hdr, grp, il_list);
@@ -290,7 +294,8 @@
int i;
grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag,
- old_grp->il_lport, &old_grp->il_dependladdr, size);
+ old_grp->il_lport, &old_grp->il_dependladdr, size,
+ old_grp->il_numa_domain);
if (grp == NULL)
return (NULL);
@@ -333,7 +338,7 @@
* Add PCB to load balance group for SO_REUSEPORT_LB option.
*/
static int
-in_pcbinslbgrouphash(struct inpcb *inp)
+in_pcbinslbgrouphash(struct inpcb *inp, uint8_t numa_domain)
{
const static struct timeval interval = { 60, 0 };
static struct timeval lastprint;
@@ -369,6 +374,7 @@
CK_LIST_FOREACH(grp, hdr, il_list) {
if (grp->il_vflag == inp->inp_vflag &&
grp->il_lport == inp->inp_lport &&
+ grp->il_numa_domain == numa_domain &&
memcmp(&grp->il_dependladdr,
&inp->inp_inc.inc_ie.ie_dependladdr,
sizeof(grp->il_dependladdr)) == 0)
@@ -378,7 +384,7 @@
/* Create new load balance group. */
grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag,
inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr,
- INPCBLBGROUP_SIZMIN);
+ INPCBLBGROUP_SIZMIN, numa_domain);
if (grp == NULL)
return (ENOBUFS);
} else if (grp->il_inpcnt == grp->il_inpsiz) {
@@ -439,6 +445,57 @@
}
}
+/*
+ * Move an inpcb into the load balance group that matches a NUMA domain.
+ *
+ * 'arg' selects the domain:
+ *   TCP_REUSPORT_LB_NUMA_NODOM   - use M_NODOM (no NUMA filtering),
+ *   TCP_REUSPORT_LB_NUMA_CURDOM  - use the domain reported by
+ *                                  PCPU_GET(domain) for the caller,
+ *   otherwise                    - an explicit domain number, which must
+ *                                  lie in [0, vm_ndomains).
+ *
+ * The caller must hold the inpcb write lock; the pcbinfo hash write lock
+ * is taken and released here.
+ *
+ * Returns 0 on success (including when the inpcb is already in a group
+ * for the requested domain), EINVAL for an out-of-range domain, ENOENT
+ * when the inpcb is not a member of any load balance group on its port
+ * hash chain, or the error reported by in_pcbinslbgrouphash() when the
+ * inpcb could not be added to the new group.
+ */
+int
+in_pcblbgroup_numa(struct inpcb *inp, int arg)
+{
+	struct inpcbinfo *pcbinfo;
+	struct inpcblbgrouphead *hdr;
+	struct inpcblbgroup *grp;
+	int err, i;
+	uint8_t numa_domain;
+
+	switch (arg) {
+	case TCP_REUSPORT_LB_NUMA_NODOM:
+		numa_domain = M_NODOM;
+		break;
+	case TCP_REUSPORT_LB_NUMA_CURDOM:
+		numa_domain = PCPU_GET(domain);
+		break;
+	default:
+		/* Reject anything that is not a valid domain index. */
+		if (arg < 0 || arg >= vm_ndomains)
+			return (EINVAL);
+		numa_domain = arg;
+		break;
+	}
+
+	err = 0;
+	pcbinfo = inp->inp_pcbinfo;
+	INP_WLOCK_ASSERT(inp);
+	INP_HASH_WLOCK(pcbinfo);
+	hdr = &pcbinfo->ipi_lbgrouphashbase[
+	    INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)];
+	CK_LIST_FOREACH(grp, hdr, il_list) {
+		for (i = 0; i < grp->il_inpcnt; ++i) {
+			if (grp->il_inp[i] != inp)
+				continue;
+
+			/* Already in a group for this domain: nothing to do. */
+			if (grp->il_numa_domain == numa_domain)
+				goto abort_with_hash_wlock;
+
+			/* Remove it from the old group. */
+			in_pcbremlbgrouphash(inp);
+
+			/*
+			 * Add it to the new group based on numa domain.
+			 * The insert can fail (e.g. ENOBUFS when a new group
+			 * cannot be allocated); report that to the caller
+			 * rather than claiming success, since the inpcb has
+			 * already been taken out of its previous group.
+			 */
+			err = in_pcbinslbgrouphash(inp, numa_domain);
+			goto abort_with_hash_wlock;
+		}
+	}
+	/* The inpcb was not found in any group on this hash chain. */
+	err = ENOENT;
+abort_with_hash_wlock:
+	INP_HASH_WUNLOCK(pcbinfo);
+	return (err);
+}
+
+
/*
* Different protocols initialize their inpcbs differently - giving
* different name to the lock. But they all are disposed the same.
@@ -731,14 +788,14 @@
if (lsa->sa_family == AF_INET) {
tmpinp = in_pcblookup_hash_locked(pcbinfo,
faddr, fport, laddr, lport, lookupflags,
- NULL);
+ NULL, M_NODOM);
}
#endif
#ifdef INET6
if (lsa->sa_family == AF_INET6) {
tmpinp = in6_pcblookup_hash_locked(pcbinfo,
faddr6, fport, laddr6, lport, lookupflags,
- NULL);
+ NULL, M_NODOM);
}
#endif
} else {
@@ -1399,9 +1456,10 @@
if (error)
return (error);
}
+
if (lport != 0) {
oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr,
- fport, laddr, lport, 0, NULL);
+ fport, laddr, lport, 0, NULL, M_NODOM);
if (oinp != NULL) {
if (oinpp != NULL)
*oinpp = oinp;
@@ -2019,9 +2077,9 @@
static struct inpcb *
in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr,
- uint16_t fport, int lookupflags)
+ uint16_t fport, int lookupflags, int numa_domain)
{
- struct inpcb *local_wild;
+ struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
@@ -2041,6 +2099,7 @@
* - Load balanced group does not contain IPv4 mapped INET6 wild sockets
*/
local_wild = NULL;
+ numa_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET6
if (!(grp->il_vflag & INP_IPV4))
@@ -2051,12 +2110,24 @@
idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) %
grp->il_inpcnt;
- if (grp->il_laddr.s_addr == laddr->s_addr)
- return (grp->il_inp[idx]);
+ if (grp->il_laddr.s_addr == laddr->s_addr) {
+ if (numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain) {
+ return (grp->il_inp[idx]);
+ } else {
+ numa_wild = grp->il_inp[idx];
+ }
+ }
if (grp->il_laddr.s_addr == INADDR_ANY &&
- (lookupflags & INPLOOKUP_WILDCARD) != 0)
+ (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
+ (local_wild == NULL || numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
+ }
}
+ if (numa_wild != NULL)
+ return (numa_wild);
+
return (local_wild);
}
@@ -2303,7 +2374,7 @@
static struct inpcb *
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
@@ -2348,7 +2419,7 @@
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr,
- fport, lookupflags);
+ fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
@@ -2435,12 +2506,13 @@
static struct inpcb *
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr,
u_int fport, struct in_addr laddr, u_int lport, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcb *inp;
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
- (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
+ (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
+ numa_domain);
if (inp != NULL) {
if (lookupflags & INPLOOKUP_WLOCKPCB) {
INP_WLOCK(inp);
@@ -2507,7 +2579,7 @@
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, M_NODOM));
}
struct inpcb *
@@ -2549,7 +2621,7 @@
}
#endif
return (in_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, m->m_pkthdr.numa_domain));
}
#endif /* INET */
@@ -2591,7 +2663,7 @@
*/
so_options = inp_so_options(inp);
if (so_options & SO_REUSEPORT_LB) {
- int ret = in_pcbinslbgrouphash(inp);
+ int ret = in_pcbinslbgrouphash(inp, M_NODOM);
if (ret) {
/* pcb lb group malloc fail (ret=ENOBUFS). */
return (ret);
Index: head/sys/netinet/tcp.h
===================================================================
--- head/sys/netinet/tcp.h
+++ head/sys/netinet/tcp.h
@@ -196,6 +196,7 @@
#define TCP_PCAP_IN 4096 /* number of input packets to keep */
#define TCP_FUNCTION_BLK 8192 /* Set the tcp function pointers to the specified stack */
/* Options for Rack and BBR */
+#define TCP_REUSPORT_LB_NUMA 1026 /* set listen socket numa domain */
#define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */
#define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */
#define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */
@@ -405,5 +406,8 @@
#define VOI_TCP_CALCFRWINDIFF 7 /* Congestion avoidance LCWIN - FRWIN */
#define VOI_TCP_GPUT_ND 8 /* Goodput normalised delta */
#define VOI_TCP_ACKLEN 9 /* Average ACKed bytes per ACK */
+
+#define TCP_REUSPORT_LB_NUMA_NODOM (-2) /* remove numa binding */
+#define TCP_REUSPORT_LB_NUMA_CURDOM (-1) /* bind to current domain */
#endif /* !_NETINET_TCP_H_ */
Index: head/sys/netinet/tcp_usrreq.c
===================================================================
--- head/sys/netinet/tcp_usrreq.c
+++ head/sys/netinet/tcp_usrreq.c
@@ -2143,6 +2143,16 @@
INP_WUNLOCK(inp);
break;
+ case TCP_REUSPORT_LB_NUMA:
+ INP_WUNLOCK(inp);
+ error = sooptcopyin(sopt, &optval, sizeof(optval),
+ sizeof(optval));
+ INP_WLOCK_RECHECK(inp);
+ if (!error)
+ error = in_pcblbgroup_numa(inp, optval);
+ INP_WUNLOCK(inp);
+ break;
+
#ifdef KERN_TLS
case TCP_TXTLS_ENABLE:
INP_WUNLOCK(inp);
Index: head/sys/netinet6/in6_pcb.h
===================================================================
--- head/sys/netinet6/in6_pcb.h
+++ head/sys/netinet6/in6_pcb.h
@@ -95,7 +95,7 @@
struct inpcb *
in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo,
struct in6_addr *faddr, u_int fport_arg, struct in6_addr *laddr,
- u_int lport_arg, int lookupflags, struct ifnet *ifp);
+ u_int lport_arg, int lookupflags, struct ifnet *ifp, uint8_t);
struct inpcb *
in6_pcblookup(struct inpcbinfo *, struct in6_addr *,
u_int, struct in6_addr *, u_int, int,
Index: head/sys/netinet6/in6_pcb.c
===================================================================
--- head/sys/netinet6/in6_pcb.c
+++ head/sys/netinet6/in6_pcb.c
@@ -446,7 +446,7 @@
sin6->sin6_port,
IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)
? &laddr6.sin6_addr : &inp->in6p_laddr,
- inp->inp_lport, 0, NULL) != NULL) {
+ inp->inp_lport, 0, NULL, M_NODOM) != NULL) {
return (EADDRINUSE);
}
if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) {
@@ -903,9 +903,9 @@
static struct inpcb *
in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo,
const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr,
- uint16_t fport, int lookupflags)
+ uint16_t fport, int lookupflags, uint8_t numa_domain)
{
- struct inpcb *local_wild;
+ struct inpcb *local_wild, *numa_wild;
const struct inpcblbgrouphead *hdr;
struct inpcblbgroup *grp;
uint32_t idx;
@@ -925,6 +925,7 @@
* - Load balanced does not contain IPv4 mapped INET6 wild sockets.
*/
local_wild = NULL;
+ numa_wild = NULL;
CK_LIST_FOREACH(grp, hdr, il_list) {
#ifdef INET
if (!(grp->il_vflag & INP_IPV6))
@@ -935,12 +936,23 @@
idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport,
fport) % grp->il_inpcnt;
- if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr))
- return (grp->il_inp[idx]);
+ if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) {
+ if (numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain) {
+ return (grp->il_inp[idx]);
+ }
+ else
+ numa_wild = grp->il_inp[idx];
+ }
if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) &&
- (lookupflags & INPLOOKUP_WILDCARD) != 0)
+ (lookupflags & INPLOOKUP_WILDCARD) != 0 &&
+ (local_wild == NULL || numa_domain == M_NODOM ||
+ grp->il_numa_domain == numa_domain)) {
local_wild = grp->il_inp[idx];
+ }
}
+ if (numa_wild != NULL)
+ return (numa_wild);
return (local_wild);
}
@@ -1151,7 +1163,7 @@
struct inpcb *
in6_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
u_int fport_arg, struct in6_addr *laddr, u_int lport_arg,
- int lookupflags, struct ifnet *ifp)
+ int lookupflags, struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcbhead *head;
struct inpcb *inp, *tmpinp;
@@ -1195,7 +1207,7 @@
*/
if ((lookupflags & INPLOOKUP_WILDCARD) != 0) {
inp = in6_pcblookup_lbgroup(pcbinfo, laddr, lport, faddr,
- fport, lookupflags);
+ fport, lookupflags, numa_domain);
if (inp != NULL)
return (inp);
}
@@ -1273,12 +1285,13 @@
static struct inpcb *
in6_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in6_addr *faddr,
u_int fport, struct in6_addr *laddr, u_int lport, int lookupflags,
- struct ifnet *ifp)
+ struct ifnet *ifp, uint8_t numa_domain)
{
struct inpcb *inp;
inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport,
- (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp);
+ (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp,
+ numa_domain);
if (inp != NULL) {
if (lookupflags & INPLOOKUP_WLOCKPCB) {
INP_WLOCK(inp);
@@ -1344,7 +1357,7 @@
}
#endif
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, M_NODOM));
}
struct inpcb *
@@ -1386,7 +1399,7 @@
}
#endif
return (in6_pcblookup_hash(pcbinfo, faddr, fport, laddr, lport,
- lookupflags, ifp));
+ lookupflags, ifp, m->m_pkthdr.numa_domain));
}
void
File Metadata
Details
Attached
Mime Type
text/plain
Expires
Fri, Dec 26, 4:42 PM (10 h, 16 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
27260493
Default Alt Text
D21636.diff (14 KB)
Attached To
Mode
D21636: Filter TCP connections to SO_REUSEPORT_LB listen sockets by NUMA domain
Attached
Detach File
Event Timeline
Log In to Comment