Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -129,6 +129,24 @@ #define inc6_zoneid inc_ie.ie6_zoneid #if defined(_KERNEL) || defined(_WANT_INPCB) + +#define LBSTATE_HASHSIZE 32 /* state buckets per LB group; power of 2 for the mask below */ +#define LBSTATE_HASHMASK (LBSTATE_HASHSIZE - 1) +#define INP_LBSTATE_HASH(g, h) ((g)->il_htbl[(h) & LBSTATE_HASHMASK]) + +VNET_DECLARE(int, udp_lbstate_lifetime); +#define V_udp_lbstate_lifetime VNET(udp_lbstate_lifetime) + +struct inpcb_lbstate { + CK_LIST_ENTRY(inpcb_lbstate) pcbchain; /* link in the inpcb's inp_lbstates list */ + CK_LIST_ENTRY(inpcb_lbstate) grpchain; /* link in a bucket of the group's il_htbl */ + struct in_endpoints ie; /* addresses and ports identifying the flow */ + uint32_t ts; /* time_uptime of the last update */ + struct inpcb *inp; /* inpcb the flow is pinned to */ + struct epoch_context epoch_ctx; /* for deferred free via NET_EPOCH_CALL() */ +}; +CK_LIST_HEAD(inpcb_lbstatehead, inpcb_lbstate); + /* * struct inpcb captures the network layer state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is @@ -294,6 +312,9 @@ struct label *inp_label; /* (i) MAC label */ struct inpcbpolicy *inp_sp; /* (s) for IPSEC */ + struct inpcb_lbstatehead inp_lbstates; /* LB states chain */ + uint32_t inp_lbscnt; /* LB states count */ + /* Protocol-dependent part; options. */ struct { u_char inp_ip_tos; /* (i) type of service proto */ @@ -572,6 +593,16 @@ #define il6_laddr il_dependladdr.id6_addr uint32_t il_inpsiz; /* max count in il_inp[] (h) */ uint32_t il_inpcnt; /* cur count in il_inp[] (h) */ + struct inpcbinfo *il_pcbinfo; /* pcbinfo of the group; lbstate_tick() reads its vnet */ + + /* + * callout, lock and htbl are allocated only when a new load + * balance group is created. On resize they are inherited. 
+ */ + struct callout *il_callout; /* periodic lbstate_tick() timer */ + struct mtx *il_lock; /* mutex taken around il_htbl updates */ + struct inpcb_lbstatehead *il_htbl; /* LBSTATE_HASHSIZE buckets of LB states */ + struct inpcb *il_inp[]; /* (h) */ }; @@ -821,6 +852,8 @@ void in_pcbgroup_update(struct inpcb *); void in_pcbgroup_update_mbuf(struct inpcb *, struct mbuf *); +void in_pcblbstate_update(struct inpcb *, const struct in_addr *, uint16_t, + const struct in_addr *, uint16_t); void in_pcbpurgeif0(struct inpcbinfo *, struct ifnet *); int in_pcballoc(struct socket *, struct inpcbinfo *); int in_pcbbind(struct inpcb *, struct sockaddr *, struct ucred *); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -115,6 +115,24 @@ static struct callout ipport_tick_callout; +#if 0 /* change to 1 to enable the LB state debug printfs */ +#define LBDEBUG(fmt, ...) do { \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#define LBDEBUG1(fmt, ...) do { \ + char _addr[50]; \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#define LBDEBUG2(fmt, ...) do { \ + char _laddr[50], _faddr[50]; \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#else +#define LBDEBUG(fmt, ...) +#define LBDEBUG1(fmt, ...) +#define LBDEBUG2(fmt, ...) +#endif + /* * These configure the range of local port addresses assigned to * "unspecified" outgoing connections/packets/whatever. 
@@ -145,6 +163,7 @@ #define V_ipport_tcplastcount VNET(ipport_tcplastcount) static void in_pcbremlists(struct inpcb *inp); +static void in_pcblbstate_free(epoch_context_t ctx); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, @@ -265,6 +284,53 @@ } static void +lbstate_tick(void *arg) +{ + struct inpcblbgroup *grp; + struct inpcb_lbstate *s, *ts; + int i; + + grp = arg; /* group handed to callout_reset(); callout_init_mtx() means il_lock is held here */ + CURVNET_SET(grp->il_pcbinfo->ipi_vnet); /* V_udp_lbstate_lifetime below is per-vnet */ + for (i = 0; i < LBSTATE_HASHSIZE; i++) { + CK_LIST_FOREACH_SAFE(s, &grp->il_htbl[i], grpchain, ts) { + if (time_uptime - s->ts < V_udp_lbstate_lifetime) + continue; /* not expired yet */ +#if 0 + switch (INP_SOCKAF(s->inp->inp_socket)) { + case AF_INET: + LBDEBUG2("expired state: %s:%u -> %s:%u", + inet_ntop(AF_INET, + &s->ie.ie_laddr, _laddr, sizeof(_laddr)), + ntohs(s->ie.ie_lport), + inet_ntop(AF_INET, + &s->ie.ie_faddr, _faddr, sizeof(_faddr)), + ntohs(s->ie.ie_fport)); + break; + case AF_INET6: + LBDEBUG2("expired state: %s:%u -> %s:%u", + inet_ntop(AF_INET6, + &s->ie.ie6_laddr, _laddr, sizeof(_laddr)), + ntohs(s->ie.ie_lport), + inet_ntop(AF_INET6, + &s->ie.ie6_faddr, _faddr, sizeof(_faddr)), + ntohs(s->ie.ie_fport)); + break; + default: + LBDEBUG("expired state"); + } +#endif + CK_LIST_REMOVE(s, grpchain); /* unlink from the hash bucket */ + CK_LIST_REMOVE(s, pcbchain); /* unlink from the inpcb's list */ + s->inp->inp_lbscnt--; + NET_EPOCH_CALL(in_pcblbstate_free, &s->epoch_ctx); /* deferred free */ + } + } + CURVNET_RESTORE(); + callout_reset(grp->il_callout, hz, lbstate_tick, grp); /* re-arm to run again after hz ticks */ +} + +static void in_pcblbgroup_free_deferred(epoch_context_t ctx) { struct inpcblbgroup *grp; @@ -281,7 +347,46 @@ NET_EPOCH_CALL(in_pcblbgroup_free_deferred, &grp->il_epoch_ctx); } + static struct inpcblbgroup * +in_pcblbgroup_alloc0(struct inpcblbgrouphead *hdr, u_char vflag, + uint16_t port, const union in_dependaddr *addr, int size) +{ + struct inpcblbgroup *grp; + + grp = in_pcblbgroup_alloc(hdr, vflag, port, addr, size); + if (grp == NULL) + return (NULL); + grp->il_lock = 
malloc(sizeof(*grp->il_lock), M_PCB, M_NOWAIT); + if (grp->il_lock == NULL) { + in_pcblbgroup_free(grp); + return (NULL); + } + grp->il_callout = malloc(sizeof(*grp->il_callout), M_PCB, + M_NOWAIT); + if (grp->il_callout == NULL) { + free(grp->il_lock, M_PCB); + in_pcblbgroup_free(grp); + return (NULL); + } + grp->il_htbl = malloc(sizeof(*grp->il_htbl) * LBSTATE_HASHSIZE, + M_PCB, M_NOWAIT | M_ZERO); + if (grp->il_htbl == NULL) { + free(grp->il_callout, M_PCB); + free(grp->il_lock, M_PCB); + in_pcblbgroup_free(grp); + return (NULL); + } + mtx_init(grp->il_lock, "inplbhtbl", NULL, MTX_DEF | MTX_NEW); /* lock memory was allocated without M_ZERO, hence MTX_NEW */ + callout_init_mtx(grp->il_callout, grp->il_lock, 0); /* lbstate_tick() will be called with il_lock held */ + mtx_lock(grp->il_lock); + callout_reset(grp->il_callout, hz, lbstate_tick, grp); /* first expiry pass after hz ticks */ + mtx_unlock(grp->il_lock); + LBDEBUG("new lbgroup for port %u", ntohs(port)); + return (grp); /* NOTE(review): il_pcbinfo is still unset here; the caller assigns it */ +} + +static struct inpcblbgroup * in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, struct inpcblbgroup *old_grp, int size) { @@ -299,7 +404,17 @@ for (i = 0; i < old_grp->il_inpcnt; ++i) grp->il_inp[i] = old_grp->il_inp[i]; + grp->il_pcbinfo = old_grp->il_pcbinfo; grp->il_inpcnt = old_grp->il_inpcnt; + /* Inherit lock, htbl and callout from the old group */ + grp->il_lock = old_grp->il_lock; + grp->il_htbl = old_grp->il_htbl; + grp->il_callout = old_grp->il_callout; + + /* Reschedule the callout so its argument is the new group */ + mtx_lock(grp->il_lock); + callout_reset(grp->il_callout, hz, lbstate_tick, grp); + mtx_unlock(grp->il_lock); in_pcblbgroup_free(old_grp); return (grp); } @@ -375,11 +490,12 @@ } if (grp == NULL) { /* Create new load balance group. 
*/ - grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, + grp = in_pcblbgroup_alloc0(hdr, inp->inp_vflag, inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, INPCBLBGROUP_SIZMIN); if (grp == NULL) return (ENOBUFS); + grp->il_pcbinfo = pcbinfo; /* lbstate_tick() needs this for CURVNET_SET() */ } else if (grp->il_inpcnt == grp->il_inpsiz) { if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { if (ratecheck(&lastprint, &interval)) @@ -403,7 +519,57 @@ return (0); } +static void +in_pcblbstate_free(epoch_context_t ctx) +{ + struct inpcb_lbstate *s; + + s = __containerof(ctx, struct inpcb_lbstate, epoch_ctx); /* recover the state from its embedded epoch context */ + free(s, M_PCB); +} + /* + * Free LB states related to PCB. + */ +static void +in_pcblbstates_destroy(struct inpcblbgroup *grp, struct inpcb *inp) +{ + struct inpcb_lbstate *s; + + mtx_lock(grp->il_lock); + while (!CK_LIST_EMPTY(&inp->inp_lbstates)) { + s = CK_LIST_FIRST(&inp->inp_lbstates); + CK_LIST_REMOVE(s, grpchain); /* unlink from the group's hash bucket */ + CK_LIST_REMOVE(s, pcbchain); /* NOTE(review): inp_lbscnt is not decremented here, unlike lbstate_tick() - confirm intended */ + NET_EPOCH_CALL(in_pcblbstate_free, &s->epoch_ctx); /* deferred free */ + } + mtx_unlock(grp->il_lock); +} + +static void +in_pcblbhtbl_free(struct inpcblbgroup *grp) +{ + struct inpcb_lbstate *s; + int i; + + mtx_lock(grp->il_lock); + callout_stop(grp->il_callout); /* stop the expiry timer before tearing the table down */ + for (i = 0; i < LBSTATE_HASHSIZE; i++) { + while (!CK_LIST_EMPTY(&grp->il_htbl[i])) { + s = CK_LIST_FIRST(&grp->il_htbl[i]); + CK_LIST_REMOVE(s, grpchain); + CK_LIST_REMOVE(s, pcbchain); + NET_EPOCH_CALL(in_pcblbstate_free, &s->epoch_ctx); + } + } + mtx_unlock(grp->il_lock); + mtx_destroy(grp->il_lock); + free(grp->il_callout, M_PCB); + free(grp->il_lock, M_PCB); + free(grp->il_htbl, M_PCB); /* NOTE(review): freed immediately, while the lookup functions walk il_htbl under the net epoch without il_lock - confirm this cannot race */ +} + +/* * Remove PCB from load balance group. */ static void @@ -428,10 +594,12 @@ if (grp->il_inpcnt == 1) { /* We are the last, free this local group. */ + in_pcblbhtbl_free(grp); /* release lock, callout and state table first */ in_pcblbgroup_free(grp); } else { /* Pull up inpcbs, shrink group if possible. 
*/ in_pcblbgroup_reorder(hdr, &grp, i); + in_pcblbstates_destroy(grp, inp); /* drop the states that point at the removed inpcb */ } return; } @@ -2005,31 +2173,28 @@ } #undef INP_LOOKUP_MAPPED_PCB_COST -static struct inpcb * -in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, - const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, - uint16_t fport, int lookupflags) +static struct inpcblbgroup * +in_pcblbgroup_lookup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) { - struct inpcb *local_wild; const struct inpcblbgrouphead *hdr; - struct inpcblbgroup *grp; - uint32_t idx; + struct inpcblbgroup *grp, *grp_local_wild; /* iterator; wildcard-bound fallback */ - INP_HASH_LOCK_ASSERT(pcbinfo); - - hdr = &pcbinfo->ipi_lbgrouphashbase[ - INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; - /* - * Order of socket selection: + * Order of group selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). * * NOTE: * - Load balanced group does not contain jailed sockets - * - Load balanced group does not contain IPv4 mapped INET6 wild sockets + * - Load balanced group does not contain IPv4 mapped INET6 + * wild sockets */ - local_wild = NULL; + INP_HASH_LOCK_ASSERT(pcbinfo); + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; + grp_local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET6 if (!(grp->il_vflag & INP_IPV4)) @@ -2038,15 +2203,123 @@ if (grp->il_lport != lport) continue; - idx = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport) % - grp->il_inpcnt; if (grp->il_laddr.s_addr == laddr->s_addr) - return (grp->il_inp[idx]); + return (grp); /* exact local address match wins immediately */ + if (grp->il_laddr.s_addr == INADDR_ANY && - (lookupflags & INPLOOKUP_WILDCARD) != 0) - local_wild = grp->il_inp[idx]; + (lookupflags & INPLOOKUP_WILDCARD)) + grp_local_wild = grp; /* remember it, keep looking for an exact match */ } - return (local_wild); + return (grp_local_wild); /* NULL when neither kind of group matched */ +} + +static struct inpcb_lbstate * +in_pcblbstate_lookup(struct inpcblbgroup *grp, uint32_t hash, 
+ const struct in_addr *faddr, uint16_t fport) +{ + struct inpcb_lbstate *s; + + NET_EPOCH_ASSERT(); + CK_LIST_FOREACH(s, &INP_LBSTATE_HASH(grp, hash), grpchain) { + if (s->ie.ie_faddr.s_addr == faddr->s_addr && + s->ie.ie_fport == fport && + s->ie.ie_laddr.s_addr == grp->il_laddr.s_addr && + s->ie.ie_lport == grp->il_lport) { + LBDEBUG2("matched state: %s:%u -> %s:%u", + inet_ntop(AF_INET, + &grp->il_laddr, _laddr, sizeof(_laddr)), + ntohs(grp->il_lport), + inet_ntop(AF_INET, + faddr, _faddr, sizeof(_faddr)), + ntohs(fport)); /* ports are printed in host order, like the other LBDEBUG2 messages */ + break; + } + } + return (s); /* NULL when the bucket holds no matching state */ +} + +void +in_pcblbstate_update(struct inpcb *inp, const struct in_addr *laddr, + uint16_t lport, const struct in_addr *faddr, uint16_t fport) +{ + struct inpcblbgroup *grp; + struct inpcb_lbstate *s; + uint32_t hash; + + grp = in_pcblbgroup_lookup(inp->inp_pcbinfo, laddr, lport, faddr, + fport, INPLOOKUP_WILDCARD); + if (grp == NULL) + return; /* no LB group for this local endpoint */ + /* Refresh the timestamp if the state already exists */ + hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport); + NET_EPOCH_ASSERT(); + s = in_pcblbstate_lookup(grp, hash, faddr, fport); + if (s != NULL) { + s->ts = time_uptime; + MPASS(s->inp == inp); /* NOTE(review): assumes only the owning socket ever sends to this peer - confirm */ + return; + } + + /* No state found. Try to allocate one. 
*/ + s = malloc(sizeof(*s), M_PCB, M_ZERO | M_NOWAIT); + if (s == NULL) + return; /* best effort: without a state the flow is just not pinned */ + + s->ie.ie_faddr.s_addr = faddr->s_addr; + s->ie.ie_laddr.s_addr = grp->il_laddr.s_addr; /* key by the group's address: in_pcblbstate_lookup() matches against grp->il_laddr, which is INADDR_ANY for a wildcard group */ + s->ie.ie_fport = fport; + s->ie.ie_lport = grp->il_lport; /* equals lport; the group lookup only returns groups with this port */ + s->ts = time_uptime; + s->inp = inp; + + mtx_lock(grp->il_lock); + if (in_pcblbstate_lookup(grp, hash, faddr, fport) == NULL) { /* re-check under the lock in case another thread inserted it */ + CK_LIST_INSERT_HEAD(&INP_LBSTATE_HASH(grp, hash), + s, grpchain); + CK_LIST_INSERT_HEAD(&inp->inp_lbstates, s, pcbchain); + inp->inp_lbscnt++; + LBDEBUG2("new state: %s:%u -> %s:%u", + inet_ntop(AF_INET, + &grp->il_laddr, _laddr, sizeof(_laddr)), + ntohs(grp->il_lport), + inet_ntop(AF_INET, + faddr, _faddr, sizeof(_faddr)), + ntohs(fport)); + } else { + free(s, M_PCB); /* lost the race; s was never published, so a plain free is fine */ + } + mtx_unlock(grp->il_lock); +} + +static struct inpcb * +in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb_lbstate *s; + struct inpcblbgroup *grp; + struct inpcb *inp; + uint32_t hash; + int i; + + grp = in_pcblbgroup_lookup(pcbinfo, laddr, lport, faddr, + fport, lookupflags); + if (grp == NULL) + return (NULL); + + hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport); + s = in_pcblbstate_lookup(grp, hash, faddr, fport); + if (s == NULL) { + inp = grp->il_inp[hash % grp->il_inpcnt]; /* start from the hash-selected member */ + for (i = 0; i < grp->il_inpcnt; i++) { + if (inp->inp_lbscnt == 0) + break; /* current pick has no pinned flows */ + if (inp->inp_lbscnt > grp->il_inp[i]->inp_lbscnt) + inp = grp->il_inp[i]; /* prefer the member with fewer states */ + } + } else + inp = s->inp; /* known flow: keep delivering to its socket */ + return (inp); } #ifdef PCBGROUP Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -157,6 +157,26 @@ SYSCTL_VNET_PCPUSTAT(_net_inet_udp, UDPCTL_STATS, stats, struct udpstat, udpstat, "UDP statistics (struct udpstat, netinet/udp_var.h)"); +VNET_DEFINE(int, udp_lbstate_lifetime) = 120; + +static int +sysctl_lbstate(SYSCTL_HANDLER_ARGS) +{ + int error; 
+ + error = sysctl_handle_int(oidp, arg1, arg2, req); + if (error == 0) { + if (V_udp_lbstate_lifetime < 5) + V_udp_lbstate_lifetime = 5; /* clamp to a minimum of 5; NOTE(review): the smaller value is already stored by sysctl_handle_int() before this runs */ + } + return (error); +} + +SYSCTL_PROC(_net_inet_udp, OID_AUTO, lbstate_lifetime, + CTLFLAG_VNET | CTLTYPE_INT |CTLFLAG_RW, + &VNET_NAME(udp_lbstate_lifetime), 0, &sysctl_lbstate, "I", + "The life time for load balancing states"); + #ifdef VIMAGE VNET_PCPUSTAT_SYSUNINIT(udpstat); #endif /* VIMAGE */ @@ -1522,6 +1542,11 @@ */ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ + + /* LB states handling: create or refresh the state for this peer */ + if (inp_so_options(inp) & SO_REUSEPORT_LB) { + in_pcblbstate_update(inp, &laddr, lport, &faddr, fport); + } if (pr == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); Index: sys/netinet6/in6_pcb.h =================================================================== --- sys/netinet6/in6_pcb.h +++ sys/netinet6/in6_pcb.h @@ -107,6 +107,8 @@ void in6_pcbnotify(struct inpcbinfo *, struct sockaddr *, u_int, const struct sockaddr *, u_int, int, void *, struct inpcb *(*)(struct inpcb *, int)); +void in6_pcblbstate_update(struct inpcb *, const struct in6_addr *, + uint16_t, const struct in6_addr *, uint16_t); struct inpcb * in6_rtchange(struct inpcb *, int); struct sockaddr * Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c +++ sys/netinet6/in6_pcb.c @@ -113,6 +113,24 @@ #include #include +#if 0 /* change to 1 to enable the LB state debug printfs */ +#define LBDEBUG(fmt, ...) do { \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#define LBDEBUG1(fmt, ...) do { \ + char _addr[50]; \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#define LBDEBUG2(fmt, ...) do { \ + char _laddr[50], _faddr[50]; \ + printf("%s: " fmt "\n", __func__, ## __VA_ARGS__); \ +} while (0) +#else +#define LBDEBUG(fmt, ...) +#define LBDEBUG1(fmt, ...) +#define LBDEBUG2(fmt, ...) 
+#endif /* LB debug macros */ + int in6_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) @@ -887,31 +905,28 @@ return inp; } -static struct inpcb * -in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, - const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, - uint16_t fport, int lookupflags) +static struct inpcblbgroup * +in6_pcblbgroup_lookup(const struct inpcbinfo *pcbinfo, + const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, + uint16_t fport, int lookupflags) { - struct inpcb *local_wild; const struct inpcblbgrouphead *hdr; - struct inpcblbgroup *grp; - uint32_t idx; + struct inpcblbgroup *grp, *grp_local_wild; /* iterator; wildcard-bound fallback */ - INP_HASH_LOCK_ASSERT(pcbinfo); - - hdr = &pcbinfo->ipi_lbgrouphashbase[ - INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; - /* - * Order of socket selection: + * Order of group selection: * 1. non-wild. * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). * * NOTE: - * - Load balanced group does not contain jailed sockets. - * - Load balanced does not contain IPv4 mapped INET6 wild sockets. 
+ * - Load balanced group does not contain jailed sockets + * - Load balanced group does not contain IPv4 mapped INET6 + * wild sockets */ - local_wild = NULL; + INP_HASH_LOCK_ASSERT(pcbinfo); + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBPORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; + grp_local_wild = NULL; CK_LIST_FOREACH(grp, hdr, il_list) { #ifdef INET if (!(grp->il_vflag & INP_IPV6)) @@ -920,15 +935,123 @@ if (grp->il_lport != lport) continue; - idx = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport, - fport) % grp->il_inpcnt; if (IN6_ARE_ADDR_EQUAL(&grp->il6_laddr, laddr)) - return (grp->il_inp[idx]); + return (grp); /* exact local address match wins immediately */ + if (IN6_IS_ADDR_UNSPECIFIED(&grp->il6_laddr) && - (lookupflags & INPLOOKUP_WILDCARD) != 0) - local_wild = grp->il_inp[idx]; + (lookupflags & INPLOOKUP_WILDCARD)) + grp_local_wild = grp; /* remember it, keep looking for an exact match */ } - return (local_wild); + return (grp_local_wild); /* NULL when neither kind of group matched */ +} + +static struct inpcb_lbstate * +in6_pcblbstate_lookup(struct inpcblbgroup *grp, uint32_t hash, + const struct in6_addr *faddr, uint16_t fport) +{ + struct inpcb_lbstate *s; + + NET_EPOCH_ASSERT(); + CK_LIST_FOREACH(s, &INP_LBSTATE_HASH(grp, hash), grpchain) { + if (IN6_ARE_ADDR_EQUAL(&s->ie.ie6_faddr, faddr) && + s->ie.ie_fport == fport && + IN6_ARE_ADDR_EQUAL(&s->ie.ie6_laddr, &grp->il6_laddr) && + s->ie.ie_lport == grp->il_lport) { + LBDEBUG2("matched state: %s:%u -> %s:%u", + inet_ntop(AF_INET6, + &grp->il6_laddr, _laddr, sizeof(_laddr)), + ntohs(grp->il_lport), + inet_ntop(AF_INET6, + faddr, _faddr, sizeof(_faddr)), + ntohs(fport)); /* ports are printed in host order, like the other LBDEBUG2 messages */ + break; + } + } + return (s); /* NULL when the bucket holds no matching state */ +} + +void +in6_pcblbstate_update(struct inpcb *inp, const struct in6_addr *laddr, + uint16_t lport, const struct in6_addr *faddr, uint16_t fport) +{ + struct inpcblbgroup *grp; + struct inpcb_lbstate *s; + uint32_t hash; + + grp = in6_pcblbgroup_lookup(inp->inp_pcbinfo, laddr, lport, faddr, + fport, INPLOOKUP_WILDCARD); + if (grp == NULL) + return; /* no LB group for this local endpoint */ + + /* Refresh the timestamp if the state already exists */ + hash = 
INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport, fport); + s = in6_pcblbstate_lookup(grp, hash, faddr, fport); + if (s != NULL) { + s->ts = time_uptime; + MPASS(s->inp == inp); /* NOTE(review): assumes only the owning socket ever sends to this peer - confirm */ + return; + } + + /* No state found. Try to allocate one. */ + s = malloc(sizeof(*s), M_PCB, M_ZERO | M_NOWAIT); + if (s == NULL) + return; /* best effort: without a state the flow is just not pinned */ + + s->ie.ie6_faddr = *faddr; + s->ie.ie6_laddr = grp->il6_laddr; /* key by the group's address: in6_pcblbstate_lookup() matches against grp->il6_laddr, which is unspecified for a wildcard group */ + s->ie.ie_fport = fport; + s->ie.ie_lport = grp->il_lport; /* equals lport; the group lookup only returns groups with this port */ + s->ts = time_uptime; + s->inp = inp; + + mtx_lock(grp->il_lock); + if (in6_pcblbstate_lookup(grp, hash, faddr, fport) == NULL) { /* re-check under the lock in case another thread inserted it */ + CK_LIST_INSERT_HEAD(&INP_LBSTATE_HASH(grp, hash), + s, grpchain); + CK_LIST_INSERT_HEAD(&inp->inp_lbstates, s, pcbchain); + inp->inp_lbscnt++; + LBDEBUG2("new state: %s:%u -> %s:%u", + inet_ntop(AF_INET6, + &grp->il6_laddr, _laddr, sizeof(_laddr)), + ntohs(grp->il_lport), + inet_ntop(AF_INET6, + faddr, _faddr, sizeof(_faddr)), + ntohs(fport)); + } else { + free(s, M_PCB); /* lost the race; s was never published, so a plain free is fine */ + } + mtx_unlock(grp->il_lock); +} + +static struct inpcb * +in6_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in6_addr *laddr, uint16_t lport, const struct in6_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb_lbstate *s; + struct inpcblbgroup *grp; + struct inpcb *inp; + uint32_t hash; + int i; + + grp = in6_pcblbgroup_lookup(pcbinfo, laddr, lport, faddr, + fport, lookupflags); + if (grp == NULL) + return (NULL); + + hash = INP_PCBLBGROUP_PKTHASH(INP6_PCBHASHKEY(faddr), lport, fport); + s = in6_pcblbstate_lookup(grp, hash, faddr, fport); + if (s == NULL) { + inp = grp->il_inp[hash % grp->il_inpcnt]; /* start from the hash-selected member */ + for (i = 0; i < grp->il_inpcnt; i++) { + if (inp->inp_lbscnt == 0) + break; /* current pick has no pinned flows */ + if (inp->inp_lbscnt > grp->il_inp[i]->inp_lbscnt) + inp = grp->il_inp[i]; /* prefer the member with fewer states */ + } + } else + inp = s->inp; /* known flow: keep delivering to its socket */ + return (inp); } #ifdef PCBGROUP Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -992,6 +992,12 @@ } #endif + 
/* LB states handling: create or refresh the state for this peer */ + /* XXXAE: laddr and faddr have embedded zone id */ + if (inp_so_options(inp) & SO_REUSEPORT_LB) + in6_pcblbstate_update(inp, laddr, inp->inp_lport, + faddr, fport); + UDPSTAT_INC(udps_opackets); if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6);