Index: sys/kern/uipc_debug.c =================================================================== --- sys/kern/uipc_debug.c +++ sys/kern/uipc_debug.c @@ -75,7 +75,7 @@ } static void -db_print_sooptions(short so_options) +db_print_sooptions(int so_options) { int comma; @@ -120,6 +120,10 @@ db_printf("%sSO_REUSEPORT", comma ? ", " : ""); comma = 1; } + if (so_options & SO_REUSEPORT_LB) { + db_printf("%sSO_REUSEPORT_LB", comma ? ", " : ""); + comma = 1; + } if (so_options & SO_TIMESTAMP) { db_printf("%sSO_TIMESTAMP", comma ? ", " : ""); comma = 1; Index: sys/kern/uipc_socket.c =================================================================== --- sys/kern/uipc_socket.c +++ sys/kern/uipc_socket.c @@ -280,7 +280,7 @@ static void socket_hhook_register(int subtype) { - + if (hhook_head_register(HHOOK_TYPE_SOCKET, subtype, &V_socket_hhh[subtype], HHOOK_NOWAIT|HHOOK_HEADISINVNET) != 0) @@ -290,7 +290,7 @@ static void socket_hhook_deregister(int subtype) { - + if (hhook_head_deregister(V_socket_hhh[subtype]) != 0) printf("%s: WARNING: unable to deregister hook\n", __func__); } @@ -448,6 +448,8 @@ static void sodealloc(struct socket *so) { + if(so->inherit) + printf("%s] dealloc inherited socket %p\n", __func__, so); KASSERT(so->so_count == 0, ("sodealloc(): so_count %d", so->so_count)); KASSERT(so->so_pcb == NULL, ("sodealloc(): so_pcb != NULL")); @@ -936,6 +938,9 @@ SOCK_UNLOCK(so); sorele(head); + if(so->inherit) + printf("%s] dequeueing inherited socket %p from socket %p\n", __func__, so, head); + *ret = so; return (0); } @@ -963,6 +968,9 @@ void sofree(struct socket *so) { + if(so->inherit) + printf("%s] inherited socket %p\n", __func__, so); + struct protosw *pr = so->so_proto; SOCK_LOCK_ASSERT(so); @@ -1005,6 +1013,7 @@ TAILQ_REMOVE(&sol->sol_incomp, so, so_list); sol->sol_incqlen--; /* This is guarenteed not to be the last. 
*/ + printf("%s] calling refcount_release\n", __func__); refcount_release(&sol->so_count); so->so_qstate = SQ_NONE; so->so_listen = NULL; @@ -1053,6 +1062,114 @@ } /* + * Let socket in same load balance group (same port and address) + * inherit pending sockets of the closing socket. + * + * "so_inh" will inherit sockets from "so" + */ +void +soinherit(struct socket *so, struct socket *so_inh) +{ + struct thread *td = curthread; + struct proc *p = td->td_proc; + int pid = p->p_pid; + printf("%s] pid %d\n", __func__, pid); + + TAILQ_HEAD(, socket) comp, incomp; + struct socket *sp, *head, *head_inh; + int qlen, incqlen; + + KASSERT(so->so_options & SO_ACCEPTCONN, + ("so does not accept connection")); + KASSERT(so_inh->so_options & SO_ACCEPTCONN, + ("so_inh does not accept connection")); + + // XXX: Do we need to lock head? +restart: + SOCK_LOCK(so); + if ((head = so->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head) == 0)) { + SOCK_UNLOCK(so); + goto restart; + } + +restart_inh: + SOCK_LOCK(so_inh); + if ((head_inh = so_inh->so_listen) != NULL && + __predict_false(SOLISTEN_TRYLOCK(head_inh) == 0)) { + SOCK_UNLOCK(so_inh); + goto restart_inh; + } + + TAILQ_INIT(&comp); + TAILQ_INIT(&incomp); + + /* + * Save completed queue and incompleted queue + */ + TAILQ_CONCAT(&comp, &so->sol_comp, so_list); + qlen = so->sol_qlen; + so->sol_qlen = 0; + + TAILQ_CONCAT(&incomp, &so->sol_incomp, so_list); + incqlen = so->sol_incqlen; + so->sol_incqlen = 0; + + printf("%s] got closing socket qlen %d\n", __func__, qlen); + printf("%s] got closing socket incqlen %d\n", __func__, incqlen); + + /* + * Append the saved completed queue and incompleted + * queue to the socket inherits them. + * + * XXX: + * This may temporarily break the inheriting socket's + * so_qlimit. + */ + TAILQ_FOREACH(sp, &comp, so_list) { + /* XXX: got a problem with negative refcount, is this the correct solution? 
*/ + refcount_acquire(&so_inh->so_count); + sp->so_listen = so_inh; + sp->inherit = 1; // for debugging + crfree(sp->so_cred); + sp->so_cred = crhold(so_inh->so_cred); + // XXX: Something more we need to do here? + printf("%s] listening socket %p is inheriting comp socket %p\n", __func__, so_inh, sp); + } + + TAILQ_FOREACH(sp, &incomp, so_list) { + /* XXX: got a problem with negative refcount, is this the correct solution? */ + refcount_acquire(&so_inh->so_count); + sp->inherit = 1; // for debugging + sp->so_listen = so_inh; + crfree(sp->so_cred); + sp->so_cred = crhold(so_inh->so_cred); + // XXX: Something more we need to do here? + printf("%s] listening socket %p is inheriting incomp socket %p\n", __func__, so_inh, sp); + } + + TAILQ_CONCAT(&so_inh->sol_comp, &comp, so_list); + so_inh->sol_qlen += qlen; + + TAILQ_CONCAT(&so_inh->sol_incomp, &incomp, so_list); + so_inh->sol_incqlen += incqlen; + + SOCK_UNLOCK(so); + if(head != NULL) + SOLISTEN_UNLOCK(head); + + SOCK_UNLOCK(so_inh); + if(head_inh != NULL) { + if(qlen > 0) { + /* "New" connections have arrived */ + solisten_wakeup(head_inh); + } else { + SOLISTEN_UNLOCK(head_inh); + } + } +} + +/* * Close a socket on last file table reference removal. Initiate disconnect * if connected. Free socket when disconnect complete. * @@ -1063,6 +1180,9 @@ int soclose(struct socket *so) { + if(so->inherit) + printf("%s] inherited socket %p\n", __func__, so); + struct accept_queue lqueue; bool listening; int error = 0; @@ -1114,6 +1234,7 @@ sp->so_listen = NULL; SOCK_UNLOCK(sp); /* Guaranteed not to be the last. 
*/ + printf("%s] calling refcount_release\n", __func__); refcount_release(&so->so_count); } } @@ -1192,6 +1313,8 @@ int soconnect(struct socket *so, struct sockaddr *nam, struct thread *td) { + if(so->inherit) + printf("%s] connecting inherited socket %p\n", __func__, so); return (soconnectat(AT_FDCWD, so, nam, td)); } @@ -1247,6 +1370,9 @@ int sodisconnect(struct socket *so) { + if(so->inherit) + printf("%s] disconnecting inherited socket %p\n", __func__, so); + int error; if ((so->so_state & SS_ISCONNECTED) == 0) @@ -1429,6 +1555,8 @@ sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + if(so->inherit) + printf("%s] send_generic on inherited socket %p\n", __func__, so); long space; ssize_t resid; int clen = 0, error, dontroute; @@ -1610,6 +1738,14 @@ sosend(struct socket *so, struct sockaddr *addr, struct uio *uio, struct mbuf *top, struct mbuf *control, int flags, struct thread *td) { + if(so->inherit) { + struct thread *td = curthread; + struct proc *p = td->td_proc; + int pid = p->p_pid; + printf("%s] pid %d\n", __func__, pid); + + printf("%s] send on inherited socket %p\n", __func__, so); + } int error; CURVNET_SET(so->so_vnet); @@ -2547,6 +2683,9 @@ soreceive(struct socket *so, struct sockaddr **psa, struct uio *uio, struct mbuf **mp0, struct mbuf **controlp, int *flagsp) { + if(so->inherit) + printf("%s] receiving on inherited socket %p\n", __func__, so); + int error; CURVNET_SET(so->so_vnet); @@ -2772,6 +2911,7 @@ case SO_BROADCAST: case SO_REUSEADDR: case SO_REUSEPORT: + case SO_REUSEPORT_LB: case SO_OOBINLINE: case SO_TIMESTAMP: case SO_BINTIME: @@ -3021,6 +3161,7 @@ case SO_KEEPALIVE: case SO_REUSEADDR: case SO_REUSEPORT: + case SO_REUSEPORT_LB: case SO_BROADCAST: case SO_OOBINLINE: case SO_ACCEPTCONN: Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -76,6 
+76,11 @@ struct in_addr ia46_addr4; }; +union in_dependaddr { + struct in_addr_4in6 id46_addr; + struct in6_addr id6_addr; +}; + /* * NOTE: ipv6 addrs should be 64-bit aligned, per RFC 2553. in_conninfo has * some extra padding to accomplish this. @@ -86,22 +91,14 @@ u_int16_t ie_fport; /* foreign port */ u_int16_t ie_lport; /* local port */ /* protocol dependent part, local and foreign addr */ - union { - /* foreign host table entry */ - struct in_addr_4in6 ie46_foreign; - struct in6_addr ie6_foreign; - } ie_dependfaddr; - union { - /* local host table entry */ - struct in_addr_4in6 ie46_local; - struct in6_addr ie6_local; - } ie_dependladdr; + union in_dependaddr ie_dependfaddr; /* foreign host table entry */ + union in_dependaddr ie_dependladdr; /* local host table entry */ +#define ie_faddr ie_dependfaddr.id46_addr.ia46_addr4 +#define ie_laddr ie_dependladdr.id46_addr.ia46_addr4 +#define ie6_faddr ie_dependfaddr.id6_addr +#define ie6_laddr ie_dependladdr.id6_addr u_int32_t ie6_zoneid; /* scope zone id */ }; -#define ie_faddr ie_dependfaddr.ie46_foreign.ia46_addr4 -#define ie_laddr ie_dependladdr.ie46_local.ia46_addr4 -#define ie6_faddr ie_dependfaddr.ie6_foreign -#define ie6_laddr ie_dependladdr.ie6_local /* * XXX The defines for inc_* are hacks and should be changed to direct @@ -328,6 +325,21 @@ u_short phd_port; }; +struct inpcblbgroup { + LIST_ENTRY(inpcblbgroup) il_list; + uint16_t il_lport; + u_char il_vflag; + u_char il_pad; + uint32_t il_pad2; + union in_dependaddr il_dependladdr; +#define il_laddr il_dependladdr.id46_addr.ia46_addr4 +#define il6_laddr il_dependladdr.id6_addr + uint32_t il_inpsiz; /* size of il_inp[] */ + uint32_t il_inpcnt; /* # of elem in il_inp[] */ + struct inpcb *il_inp[]; +}; +LIST_HEAD(inpcblbgrouphead, inpcblbgroup); + /*- * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. 
@@ -421,6 +433,13 @@ u_long ipi_wildmask; /* (p) */ /* + * Load balanced group used by the SO_REUSEPORT_LB option, + * hashed by local address and local port. + */ + struct inpcblbgrouphead *ipi_lbgrouphashbase; + u_long ipi_lbgrouphashmask; + + /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ @@ -506,7 +525,7 @@ inp_inpcbtotcpcb(struct inpcb *inp); void inp_4tuple_get(struct inpcb *inp, uint32_t *laddr, uint16_t *lp, uint32_t *faddr, uint16_t *fp); -short inp_so_options(const struct inpcb *inp); +int inp_so_options(const struct inpcb *inp); #endif /* _KERNEL */ @@ -569,6 +588,10 @@ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) #define INP_PCBPORTHASH(lport, mask) \ (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PORTHASH(lport, mask) \ + (ntohs((lport)) & (mask)) +#define INP_PCBLBGROUP_PKTHASH(faddr, lport, fport) \ + ((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) #define INP6_PCBHASHKEY(faddr) ((faddr)->s6_addr32[3]) /* @@ -624,11 +647,11 @@ /* * Flags for inp_flags2. 
*/ -#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ +#define INP_LLE_VALID 0x00000001 /* cached lle is valid */ #define INP_RT_VALID 0x00000002 /* cached rtentry is valid */ #define INP_PCBGROUPWILD 0x00000004 /* in pcbgroup wildcard list */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ -#define INP_FREED 0x00000010 /* inp itself is not valid */ +#define INP_FREED 0x00000010 /* inp itself is not valid */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ @@ -636,6 +659,7 @@ #define INP_RECVRSSBUCKETID 0x00000200 /* populate recv datagram with bucket id */ #define INP_RATE_LIMIT_CHANGED 0x00000400 /* rate limit needs attention */ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ +#define INP_REUSEPORT_LB 0x00001000 /* SO_REUSEPORT_LB option is set */ /* * Flags passed to in_pcblookup*() functions. @@ -739,6 +763,8 @@ in_pcblookup(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *); struct inpcb * + in_pcblookup_lbgroup_last(const struct inpcb *inp); +struct inpcb * in_pcblookup_mbuf(struct inpcbinfo *, struct in_addr, u_int, struct in_addr, u_int, int, struct ifnet *, struct mbuf *); void in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr, Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -102,6 +102,9 @@ #include +#define INPCBLBGROUP_SIZMIN 8 +#define INPCBLBGROUP_SIZMAX 256 + static struct callout ipport_tick_callout; /* @@ -211,6 +214,173 @@ * functions often modify hash chains or addresses in pcbs. 
*/ +static struct inpcblbgroup * +in_pcblbgroup_alloc(struct inpcblbgrouphead *hdr, u_char vflag, + uint16_t port, const union in_dependaddr *addr, int size) +{ + struct inpcblbgroup *grp; + + size_t bytes = __offsetof(struct inpcblbgroup, il_inp[size]); + grp = malloc(bytes, M_PCB, M_WAITOK | M_ZERO); /* NOTE(review): M_WAITOK may sleep, yet in_pcbremlbgrouphash() is called with INP_HASH_WLOCK held and reaches here via in_pcblbgroup_resize() -- confirm; likely needs M_NOWAIT plus a NULL check */ + grp->il_vflag = vflag; + grp->il_lport = port; + grp->il_dependladdr = *addr; + grp->il_inpsiz = size; + LIST_INSERT_HEAD(hdr, grp, il_list); + + return grp; +} + +static void +in_pcblbgroup_free(struct inpcblbgroup *grp) +{ + LIST_REMOVE(grp, il_list); + free(grp, M_PCB); /* must be the malloc type used by in_pcblbgroup_alloc() */ +} + +static struct inpcblbgroup * +in_pcblbgroup_resize(struct inpcblbgrouphead *hdr, + struct inpcblbgroup *old_grp, int size) +{ + struct inpcblbgroup *grp; + int i; + + grp = in_pcblbgroup_alloc(hdr, old_grp->il_vflag, + old_grp->il_lport, &old_grp->il_dependladdr, size); + + KASSERT(old_grp->il_inpcnt < grp->il_inpsiz, + ("invalid new local group size %d and old local group count %d", + grp->il_inpsiz, old_grp->il_inpcnt)); + for (i = 0; i < old_grp->il_inpcnt; ++i) + grp->il_inp[i] = old_grp->il_inp[i]; + grp->il_inpcnt = old_grp->il_inpcnt; + + in_pcblbgroup_free(old_grp); + + return grp; +} + +/* + * Add PCB to lb group (load balance used by SO_REUSEPORT_LB) + */ +static void +in_pcbinslbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo) +{ + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + + uint16_t hashmask = pcbinfo->ipi_lbgrouphashmask; + uint16_t lport = inp->inp_lport; + uint32_t group_index = INP_PCBLBGROUP_PORTHASH(lport, hashmask); + + hdr = &pcbinfo->ipi_lbgrouphashbase[group_index]; + + struct ucred *cred; + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return; + + /* + * don't allow jailed socket to join local group + */ + if (inp->inp_socket != NULL) + cred = inp->inp_socket->so_cred; + else + cred = NULL; + if (cred != NULL && jailed(cred)) + return; + +#ifdef INET6 + /* + * don't allow IPv4 mapped INET6 wild socket + */ + if ((inp->inp_vflag & INP_IPV4) && 
+ inp->inp_laddr.s_addr == INADDR_ANY && + INP_CHECK_SOCKAF(inp->inp_socket, AF_INET6)) + return; +#endif + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + + LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL) { + /* Create new local group */ + grp = in_pcblbgroup_alloc(hdr, inp->inp_vflag, + inp->inp_lport, &inp->inp_inc.inc_ie.ie_dependladdr, + INPCBLBGROUP_SIZMIN); + } else if (grp->il_inpcnt == grp->il_inpsiz) { + if (grp->il_inpsiz >= INPCBLBGROUP_SIZMAX) { + static int limit_logged = 0; + + if (!limit_logged) { + limit_logged = 1; + printf("lb group port %d, " + "limit reached\n", ntohs(grp->il_lport)); + } + return; + } + + /* Expand this local group */ + grp = in_pcblbgroup_resize(hdr, grp, grp->il_inpsiz * 2); + } + + KASSERT(grp->il_inpcnt < grp->il_inpsiz, + ("invalid local group size %d and count %d", + grp->il_inpsiz, grp->il_inpcnt)); + + grp->il_inp[grp->il_inpcnt] = inp; + grp->il_inpcnt++; +} + +static void +in_pcbremlbgrouphash(struct inpcb *inp, struct inpcbinfo *pcbinfo) +{ + struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + + LIST_FOREACH(grp, hdr, il_list) { + int i; + + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] != inp) + continue; + + if (grp->il_inpcnt == 1) { + /* Free this local group */ + in_pcblbgroup_free(grp); + } else { + /* Pull up inpcbs */ + for (; i + 1 < grp->il_inpcnt; ++i) + grp->il_inp[i] = grp->il_inp[i + 1]; + grp->il_inpcnt--; + + if (grp->il_inpsiz > INPCBLBGROUP_SIZMIN && + grp->il_inpcnt <= (grp->il_inpsiz / 4)) { + /* Shrink this local group */ + grp = 
in_pcblbgroup_resize(hdr, grp, + grp->il_inpsiz / 2); + } + } + return; + } + } +} + /* * Different protocols initialize their inpcbs differently - giving * different name to the lock. But they all are disposed the same. @@ -246,6 +416,8 @@ &pcbinfo->ipi_hashmask); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); + pcbinfo->ipi_lbgrouphashbase = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_init(pcbinfo, hashfields, hash_nelements); #endif @@ -269,6 +441,8 @@ hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); + hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, + pcbinfo->ipi_lbgrouphashmask); #ifdef PCBGROUP in_pcbgroup_destroy(pcbinfo); #endif @@ -507,18 +681,20 @@ /* * Return cached socket options. */ -short +int inp_so_options(const struct inpcb *inp) { - short so_options; + int so_options; - so_options = 0; + so_options = 0; - if ((inp->inp_flags2 & INP_REUSEPORT) != 0) - so_options |= SO_REUSEPORT; - if ((inp->inp_flags2 & INP_REUSEADDR) != 0) - so_options |= SO_REUSEADDR; - return (so_options); + if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) + so_options |= SO_REUSEPORT_LB; + if ((inp->inp_flags2 & INP_REUSEPORT) != 0) + so_options |= SO_REUSEPORT; + if ((inp->inp_flags2 & INP_REUSEADDR) != 0) + so_options |= SO_REUSEADDR; + return (so_options); } #endif /* INET || INET6 */ @@ -575,6 +751,12 @@ int error; /* + * XXX Maybe we could let SO_REUSEPORT_LB set SO_REUSEPORT bit here + * so that we don't have to add to the (already messy) code below + */ + int reuseport_lb = (so->so_options & SO_REUSEPORT_LB); + + /* * No state changes, so read locks are sufficient here. 
*/ INP_LOCK_ASSERT(inp); @@ -585,7 +767,7 @@ laddr.s_addr = *laddrp; if (nam != NULL && laddr.s_addr != INADDR_ANY) return (EINVAL); - if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) == 0) + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT|SO_REUSEPORT_LB)) == 0) lookupflags = INPLOOKUP_WILDCARD; if (nam == NULL) { if ((error = prison_local_ip4(cred, &laddr)) != 0) @@ -620,18 +802,23 @@ * and a multicast address is bound on both * new and duplicated sockets. */ + + // XXX: How to deal with SO_REUSEPORT_LB here? + // Added equivalent treatment as SO_REUSEPORT here for now + if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT_LB)) != 0) + reuseport_lb = SO_REUSEADDR|SO_REUSEPORT_LB; if ((so->so_options & (SO_REUSEADDR|SO_REUSEPORT)) != 0) reuseport = SO_REUSEADDR|SO_REUSEPORT; } else if (sin->sin_addr.s_addr != INADDR_ANY) { sin->sin_port = 0; /* yech... */ bzero(&sin->sin_zero, sizeof(sin->sin_zero)); /* - * Is the address a local IP address? + * Is the address a local IP address? * If INP_BINDANY is set, then the socket may be bound * to any endpoint address, local or not. 
*/ if ((inp->inp_flags & INP_BINDANY) == 0 && - ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) + ifa_ifwithaddr_check((struct sockaddr *)sin) == 0) return (EADDRNOTAVAIL); } laddr = sin->sin_addr; @@ -661,7 +848,8 @@ ntohl(t->inp_faddr.s_addr) == INADDR_ANY) && (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || ntohl(t->inp_laddr.s_addr) != INADDR_ANY || - (t->inp_flags2 & INP_REUSEPORT) == 0) && + (t->inp_flags2 & INP_REUSEPORT) || + (t->inp_flags2 & INP_REUSEPORT_LB) == 0) && (inp->inp_cred->cr_uid != t->inp_cred->cr_uid)) return (EADDRINUSE); @@ -686,11 +874,14 @@ */ tw = intotw(t); if (tw == NULL || - (reuseport & tw->tw_so_options) == 0) + ((reuseport & tw->tw_so_options) == 0 && + (reuseport_lb & tw->tw_so_options) == 0)) { return (EADDRINUSE); + } } else if (t && - ((inp->inp_flags2 & INP_BINDMULTI) == 0) && - (reuseport & inp_so_options(t)) == 0) { + ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || @@ -699,7 +890,7 @@ (inp->inp_vflag & INP_IPV6PROTO) == 0 || (t->inp_vflag & INP_IPV6PROTO) == 0) #endif - return (EADDRINUSE); + return (EADDRINUSE); if (t && (! in_pcbbind_check_bindmulti(inp, t))) return (EADDRINUSE); } @@ -816,7 +1007,7 @@ /* * If we found a route, use the address corresponding to * the outgoing interface. - * + * * Otherwise assume faddr is reachable on a directly connected * network and try to find a corresponding interface to take * the source address from. 
@@ -1360,6 +1551,7 @@ struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(inp->inp_pcbinfo); + in_pcbremlbgrouphash(inp, inp->inp_pcbinfo); LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { @@ -1620,6 +1812,100 @@ } #undef INP_LOOKUP_MAPPED_PCB_COST +struct inpcb * +in_pcblookup_lbgroup_last(const struct inpcb *inp) +{ + const struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; + const struct inpcblbgrouphead *hdr; + const struct inpcblbgroup *grp; + int i; + + if (pcbinfo->ipi_lbgrouphashbase == NULL) + return NULL; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(inp->inp_lport, pcbinfo->ipi_lbgrouphashmask)]; + + LIST_FOREACH(grp, hdr, il_list) { + if (grp->il_vflag == inp->inp_vflag && + grp->il_lport == inp->inp_lport && + memcmp(&grp->il_dependladdr, + &inp->inp_inc.inc_ie.ie_dependladdr, + sizeof(grp->il_dependladdr)) == 0) { + break; + } + } + if (grp == NULL || grp->il_inpcnt == 1) + return NULL; + + KASSERT(grp->il_inpcnt >= 2, + ("invalid lbgroup inp count %d", grp->il_inpcnt)); + for (i = 0; i < grp->il_inpcnt; ++i) { + if (grp->il_inp[i] == inp) { + int last = grp->il_inpcnt - 1; + + if (i == last) + last = grp->il_inpcnt - 2; + printf("%s] returning inp at index %d (last)\n", __func__, last); + return grp->il_inp[last]; + } + } + printf("%s] returning NULL\n", __func__); + return NULL; +} + +static struct inpcb * +in_pcblookup_lbgroup(const struct inpcbinfo *pcbinfo, + const struct in_addr *laddr, uint16_t lport, const struct in_addr *faddr, + uint16_t fport, int lookupflags) +{ + struct inpcb *local_wild = NULL; + const struct inpcblbgrouphead *hdr; + struct inpcblbgroup *grp; + struct inpcblbgroup *grp_local_wild; + + hdr = &pcbinfo->ipi_lbgrouphashbase[ + INP_PCBLBGROUP_PORTHASH(lport, pcbinfo->ipi_lbgrouphashmask)]; + + /* + * Order of socket selection: + * 1. non-wild. + * 2. wild (if lookupflags contains INPLOOKUP_WILDCARD). 
+ * + * NOTE: + * - Local group does not contain jailed sockets + * - Local group does not contain IPv4 mapped INET6 wild sockets + */ + LIST_FOREACH(grp, hdr, il_list) { +#ifdef INET6 + if (!(grp->il_vflag & INP_IPV4)) + continue; +#endif + + if (grp->il_lport == lport) { + + uint32_t idx = 0; + int pkt_hash = INP_PCBLBGROUP_PKTHASH(faddr->s_addr, lport, fport); + + idx = pkt_hash % grp->il_inpcnt; + + if (grp->il_laddr.s_addr == laddr->s_addr) { + return grp->il_inp[idx]; + } else { + if (grp->il_laddr.s_addr == INADDR_ANY && + (lookupflags & INPLOOKUP_WILDCARD)) { + local_wild = grp->il_inp[idx]; + grp_local_wild = grp; + } + } + } + } + if (local_wild != NULL) { + return local_wild; + } + return NULL; +} + #ifdef PCBGROUP /* * Lookup PCB in hash list, using pcbgroup tables. @@ -1884,6 +2170,16 @@ return (tmpinp); /* + * Then look in lb group + */ + if (pcbinfo->ipi_lbgrouphashbase != NULL) { + inp = in_pcblookup_lbgroup(pcbinfo, &laddr, lport, &faddr, fport, lookupflags); + if (inp != NULL) { + return inp; + } + } + + /* * Then look for a wildcard match, if requested. */ if ((lookupflags & INPLOOKUP_WILDCARD) != 0) { @@ -2085,6 +2381,7 @@ struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; u_int32_t hashkey_faddr; + int so_options; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); @@ -2105,6 +2402,16 @@ pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; + + /* + * Add entry in lb group + * Only do this if SO_REUSEPORT_LB is set + */ + so_options = inp_so_options(inp); + if(so_options & SO_REUSEPORT_LB) { + in_pcbinslbgrouphash(inp, pcbinfo); + } + /* * Go through port list and look for a head for this lport. */ @@ -2231,6 +2538,10 @@ struct inpcbport *phd = inp->inp_phd; INP_HASH_WLOCK(pcbinfo); + + // XXX Only do if SO_REUSEPORT_LB set? 
+ in_pcbremlbgrouphash(inp, pcbinfo); + LIST_REMOVE(inp, inp_hash); LIST_REMOVE(inp, inp_portlist); if (LIST_FIRST(&phd->phd_pcblist) == NULL) { @@ -2319,7 +2630,7 @@ callout_stop(&ipport_tick_callout); } -/* +/* * The ipport_callout should start running at about the time we attach the * inet or inet6 domains. */ @@ -2333,7 +2644,7 @@ EVENTHANDLER_REGISTER(shutdown_pre_sync, ip_fini, NULL, SHUTDOWN_PRI_DEFAULT); } -SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, +SYSINIT(ipport_tick_init, SI_SUB_PROTO_DOMAIN, SI_ORDER_MIDDLE, ipport_tick_init, NULL); void Index: sys/netinet/ip_output.c =================================================================== --- sys/netinet/ip_output.c +++ sys/netinet/ip_output.c @@ -993,6 +993,15 @@ INP_WUNLOCK(inp); error = 0; break; + case SO_REUSEPORT_LB: + INP_WLOCK(inp); + if ((so->so_options & SO_REUSEPORT_LB) != 0) + inp->inp_flags2 |= INP_REUSEPORT_LB; + else + inp->inp_flags2 &= ~INP_REUSEPORT_LB; + INP_WUNLOCK(inp); + error = 0; + break; case SO_SETFIB: INP_WLOCK(inp); inp->inp_inc.inc_fibnum = so->so_fibnum; Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -334,7 +334,7 @@ } } -void +void cc_conn_init(struct tcpcb *tp) { struct hc_metrics_lite metrics; @@ -437,7 +437,7 @@ EXIT_RECOVERY(tp->t_flags); if (CC_ALGO(tp)->cong_signal == NULL) { /* - * RFC5681 Section 3.1 + * RFC5681 Section 3.1 * ssthresh = max (FlightSize / 2, 2*SMSS) eq (4) */ tp->snd_ssthresh = @@ -1387,9 +1387,11 @@ TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); #ifdef TCP_RFC7413 + printf("%s] inp %p (TCP_RFC7413)\n", __func__, inp); if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) goto tfo_socket_result; #else + /* printf("%s] inp %p\n", __func__, inp); */ syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL); #endif /* @@ -1564,7 +1566,7 @@ #ifdef TCP_RFC7413 int tfo_syn; #endif - + #ifdef 
TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1770,7 +1772,7 @@ th->th_seq == tp->rcv_nxt && (thflags & (TH_SYN|TH_FIN|TH_RST|TH_URG|TH_ACK)) == TH_ACK && tp->snd_nxt == tp->snd_max && - tiwin && tiwin == tp->snd_wnd && + tiwin && tiwin == tp->snd_wnd && ((tp->t_flags & (TF_NEEDSYN|TF_NEEDFIN)) == 0) && LIST_EMPTY(&tp->t_segq) && ((to.to_flags & TOF_TS) == 0 || @@ -1850,7 +1852,7 @@ if (SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - + /* * Let the congestion control algorithm update * congestion control related information. This @@ -1999,7 +2001,7 @@ goto dropwithreset; } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ - if ((tcp_timer_active(tp, TT_DELACK) || + if ((tcp_timer_active(tp, TT_DELACK) || tcp_timer_active(tp, TT_REXMT))) goto drop; } else if (!(thflags & (TH_ACK|TH_FIN|TH_RST))) { @@ -2065,7 +2067,7 @@ tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } - + /* * Received in SYN_SENT[*] state. * Transitions: @@ -2383,14 +2385,14 @@ /* * If last ACK falls within this segment's sequence numbers, * record its timestamp. - * NOTE: + * NOTE: * 1) That the test incorporates suggestions from the latest * proposal of the tcplw@cray.com list (Braden 1993/04/26). * 2) That updating only on newer timestamps interferes with * our earlier PAWS tests, so this check should be solely * predicated on the sequence space of this segment. - * 3) That we modify the segment boundary check to be - * Last.ACK.Sent <= SEG.SEQ + SEG.Len + * 3) That we modify the segment boundary check to be + * Last.ACK.Sent <= SEG.SEQ + SEG.Len * instead of RFC1323's * Last.ACK.Sent < SEG.SEQ + SEG.Len, * This modified check allows us to overcome RFC1323's @@ -2469,7 +2471,7 @@ /* * Account for the ACK of our SYN prior to * regular ACK processing below. 
- */ + */ tp->snd_una++; } /* @@ -2598,10 +2600,10 @@ if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; - + /* * Compute the amount of data in flight first. - * We can inject new data into the pipe iff + * We can inject new data into the pipe iff * we have less than 1/2 the original window's * worth of data in flight. */ Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -245,7 +245,7 @@ tcp_do_segment, tcp_default_ctloutput, NULL, - NULL, + NULL, NULL, NULL, NULL, @@ -305,11 +305,11 @@ find_and_ref_tcp_functions(struct tcp_function_set *fs) { struct tcp_function_block *blk; - - rw_rlock(&tcp_function_lock); + + rw_rlock(&tcp_function_lock); blk = find_tcp_functions_locked(fs); if (blk) - refcount_acquire(&blk->tfb_refcnt); + refcount_acquire(&blk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(blk); } @@ -318,10 +318,10 @@ find_and_ref_tcp_fb(struct tcp_function_block *blk) { struct tcp_function_block *rblk; - - rw_rlock(&tcp_function_lock); + + rw_rlock(&tcp_function_lock); rblk = find_tcp_fb_locked(blk, NULL); - if (rblk) + if (rblk) refcount_acquire(&rblk->tfb_refcnt); rw_runlock(&tcp_function_lock); return(rblk); @@ -343,7 +343,7 @@ strcpy(fs.function_set_name, blk->tfb_tcp_block_name); fs.pcbcnt = blk->tfb_refcnt; } - rw_runlock(&tcp_function_lock); + rw_runlock(&tcp_function_lock); error = sysctl_handle_string(oidp, fs.function_set_name, sizeof(fs.function_set_name), req); @@ -354,8 +354,8 @@ rw_wlock(&tcp_function_lock); blk = find_tcp_functions_locked(&fs); if ((blk == NULL) || - (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { - error = ENOENT; + (blk->tfb_flags & TCP_FUNC_BEING_REMOVED)) { + error = ENOENT; goto done; } tcp_func_set_ptr = blk; @@ -397,7 +397,7 @@ bufsz -= linesz; outsz = linesz; - rw_rlock(&tcp_function_lock); + rw_rlock(&tcp_function_lock); TAILQ_FOREACH(f, &t_functions, tf_next) { alias = (f->tf_name != 
f->tf_fb->tfb_tcp_block_name); linesz = snprintf(cp, bufsz, "%-32s%c %-32s %u\n", @@ -537,7 +537,7 @@ (blk->tfb_tcp_do_segment == NULL) || (blk->tfb_tcp_ctloutput == NULL) || (strlen(blk->tfb_tcp_block_name) == 0)) { - /* + /* * These functions are required and you * need a name. */ @@ -549,7 +549,7 @@ blk->tfb_tcp_timer_active || blk->tfb_tcp_timer_stop) { /* - * If you define one timer function you + * If you define one timer function you * must have them all. */ if ((blk->tfb_tcp_timer_stop_all == NULL) || @@ -651,7 +651,7 @@ { struct tcp_function *f; int error=ENOENT; - + if (strcmp(blk->tfb_tcp_block_name, "default") == 0) { /* You can't un-register the default */ return (EPERM); @@ -665,7 +665,7 @@ if (blk->tfb_refcnt) { /* Still tcb attached, mark it. */ blk->tfb_flags |= TCP_FUNC_BEING_REMOVED; - rw_wunlock(&tcp_function_lock); + rw_wunlock(&tcp_function_lock); return (EBUSY); } while (find_tcp_fb_locked(blk, &f) != NULL) { @@ -1069,7 +1069,7 @@ m = n; } else { /* - * reuse the mbuf. + * reuse the mbuf. * XXX MRT We inherit the FIB, which is lucky. */ m_freem(m->m_next); @@ -1439,6 +1439,9 @@ { struct socket *so = tp->t_inpcb->inp_socket; + if(so->inherit) + printf("%s] inherited socket %p\n", __func__, so); + INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); @@ -1481,12 +1484,12 @@ tcp_timer_stop(tp, TT_2MSL); tcp_timer_stop(tp, TT_DELACK); if (tp->t_fb->tfb_tcp_timer_stop_all) { - /* - * Call the stop-all function of the methods, + /* + * Call the stop-all function of the methods, * this function should call the tcp_timer_stop() * method with each of the function specific timeouts. * That stop will be called via the tfb_tcp_timer_stop() - * which should use the async drain function of the + * which should use the async drain function of the * callout system (see tcp_var.h). 
*/ tp->t_fb->tfb_tcp_timer_stop_all(tp); @@ -1556,7 +1559,7 @@ if (tp->t_flags & TF_TOE) tcp_offload_detach(tp); #endif - + tcp_free_sackholes(tp); #ifdef TCPPCAP @@ -1594,7 +1597,7 @@ { struct inpcb *inp; struct tcpcb *tp; - + tp = (struct tcpcb *)ptp; CURVNET_SET(tp->t_vnet); INP_INFO_RLOCK(&V_tcbinfo); @@ -1633,10 +1636,38 @@ { struct inpcb *inp = tp->t_inpcb; struct socket *so; + struct inpcb *inp_inh = NULL; + int listen = tp->t_state & TCPS_LISTEN; + + printf("%s] inp %p\n", __func__, inp); INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + if (listen) { + /* + * Pending socket inheritance + * + * If this is a listen(2) socket, find another listen(2) + * socket in the same local group, which could inherit + * the syncache and sockets pending on the completed + * and incomplete queues. + * + * NOTE: + * Currently the inheritance can only happen on + * listen(2) sockets with SO_REUSEPORT_LB set. + */ + + // XXX: How to handle this? + // ASSERT_IN_NETISR(0); + + inp_inh = in_pcblookup_lbgroup_last(inp); + if (inp_inh != NULL) + printf("%s] inp %p will inherit from inp %p\n", __func__, inp_inh, inp); + else + printf("%s] there is none that can inherit from inp %p\n", __func__, inp); + } + #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); @@ -1658,7 +1689,33 @@ tcp_state_change(tp, TCPS_CLOSED); KASSERT(inp->inp_socket != NULL, ("tcp_close: inp_socket NULL")); so = inp->inp_socket; + + if(so->inherit) + printf("%s] inherited socket %p\n", __func__, so); + soisdisconnected(so); + + // Socket inherit + if(listen) + { + // XXX What do we do with syncache? + // syncache entries seem to be stored separately from sockets/inps, + // so is there no need to do anything?
+ + // from dflybsd impl: + // syncache_destroy(tp, tp_inh); + + + if(inp_inh == NULL) { + printf("%s] inp_inh is NULL, can't inherit\n", __func__); + } else if(inp_inh->inp_socket == NULL) { + printf("%s] inp_inh->inp_socket is NULL, can't inherit\n", __func__); + } else { + soinherit(so, inp_inh->inp_socket); + } + } + + if (inp->inp_flags & INP_SOCKREF) { KASSERT(so->so_state & SS_PROTOREF, ("tcp_close: !SS_PROTOREF")); @@ -2023,7 +2080,7 @@ if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || - cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || + cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip) notify = tcp_drop_syn_sent; @@ -2159,7 +2216,7 @@ if (cmd == PRC_MSGSIZE) notify = tcp_mtudisc_notify; else if (V_icmp_may_rst && (cmd == PRC_UNREACH_ADMIN_PROHIB || - cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || + cmd == PRC_UNREACH_PORT || cmd == PRC_UNREACH_PROTOCOL || cmd == PRC_TIMXCEED_INTRANS) && ip6 != NULL) notify = tcp_drop_syn_sent; @@ -2437,7 +2494,7 @@ KASSERT(tp != NULL, ("tcp_mtudisc: tp == NULL")); tcp_mss_update(tp, -1, mtuoffer, NULL, NULL); - + so = inp->inp_socket; SOCKBUF_LOCK(&so->so_snd); /* If the mss is larger than the socket buffer, decrease the mss. */ Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -372,6 +372,7 @@ static void syncache_drop(struct syncache *sc, struct syncache_head *sch) { + printf("%s]\n", __func__); SCH_LOCK_ASSERT(sch); @@ -794,7 +795,7 @@ struct sockaddr_in sin; inp->inp_options = (m) ? 
ip_srcroute(m) : NULL; - + if (inp->inp_options == NULL) { inp->inp_options = sc->sc_ipopts; sc->sc_ipopts = NULL; @@ -838,11 +839,11 @@ if (V_functions_inherit_listen_socket_stack && blk != tp->t_fb) { /* * Our parents t_fb was not the default, - * we need to release our ref on tp->t_fb and + * we need to release our ref on tp->t_fb and * pickup one on the new entry. */ struct tcp_function_block *rblk; - + rblk = find_and_ref_tcp_fb(blk); KASSERT(rblk != NULL, ("cannot find blk %p out of syncache?", blk)); @@ -853,7 +854,7 @@ if (tp->t_fb->tfb_tcp_fb_init) { (*tp->t_fb->tfb_tcp_fb_init)(tp); } - } + } tp->snd_wl1 = sc->sc_irs; tp->snd_max = tp->iss + 1; tp->snd_nxt = tp->iss + 1; @@ -1066,7 +1067,7 @@ #endif /* TCP_SIGNATURE */ /* * Pull out the entry to unlock the bucket row. - * + * * NOTE: We must decrease TCPS_SYN_RECEIVED count here, not * tcp_state_change(). The tcpcb is not existent at this * moment. A new one will be allocated via syncache_socket-> @@ -1231,6 +1232,7 @@ struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, void *todctx) { + /* printf("%s] inp %p\n", __func__, inp); */ struct tcpcb *tp; struct socket *so; struct syncache *sc = NULL; @@ -2046,7 +2048,7 @@ } static struct syncache * -syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, +syncookie_lookup(struct in_conninfo *inc, struct syncache_head *sch, struct syncache *sc, struct tcphdr *th, struct tcpopt *to, struct socket *lso) { @@ -2084,7 +2086,7 @@ sc->sc_flags = 0; bcopy(inc, &sc->sc_inc, sizeof(struct in_conninfo)); sc->sc_ipopts = NULL; - + sc->sc_irs = seq; sc->sc_iss = ack; Index: sys/netinet6/ip6_output.c =================================================================== --- sys/netinet6/ip6_output.c +++ sys/netinet6/ip6_output.c @@ -1460,6 +1460,15 @@ INP_WUNLOCK(in6p); error = 0; break; + case SO_REUSEPORT_LB: + INP_WLOCK(in6p); + if ((so->so_options & SO_REUSEPORT_LB) != 0) + in6p->inp_flags2 |= INP_REUSEPORT_LB; + else + in6p->inp_flags2 &= 
~INP_REUSEPORT_LB; + INP_WUNLOCK(in6p); + error = 0; + break; case SO_SETFIB: INP_WLOCK(in6p); in6p->inp_inc.inc_fibnum = so->so_fibnum; Index: sys/sys/socket.h =================================================================== --- sys/sys/socket.h +++ sys/sys/socket.h @@ -145,6 +145,9 @@ #define SO_NO_OFFLOAD 0x4000 /* socket cannot be offloaded */ #define SO_NO_DDP 0x8000 /* disable direct data placement */ +// XXX: so_options was only 16 bits wide; it has been widened to 32 bits so this flag fits. +#define SO_REUSEPORT_LB 0x00010000 /* reuse with load balancing */ + /* * Additional options, not kept in so_options. */ Index: sys/sys/socketvar.h =================================================================== --- sys/sys/socketvar.h +++ sys/sys/socketvar.h @@ -73,12 +73,13 @@ */ TAILQ_HEAD(accept_queue, socket); struct socket { + uint32_t inherit; /* temporarily added for debugging */ struct mtx so_lock; volatile u_int so_count; /* (b / refcount) */ struct selinfo so_rdsel; /* (b/cr) for so_rcv/so_comp */ struct selinfo so_wrsel; /* (b/cs) for so_snd */ short so_type; /* (a) generic type, see socket.h */ - short so_options; /* (b) from socket call, see socket.h */ + int so_options; /* (b) from socket call, see socket.h */ short so_linger; /* time to linger close(2) */ short so_state; /* (b) internal state flags SS_* */ void *so_pcb; /* protocol control block */ @@ -200,12 +201,12 @@ size_t xso_len; /* length of this structure */ struct socket *xso_so; /* makes a convenient handle sometimes */ short so_type; - short so_options; + int so_options; short so_linger; short so_state; caddr_t so_pcb; /* another convenient handle */ - int xso_protocol; - int xso_family; + int xso_protocol; + int xso_family; u_int so_qlen; u_int so_incqlen; u_int so_qlimit; @@ -386,6 +387,7 @@ int sodisconnect(struct socket *so); struct sockaddr *sodupsockaddr(const struct sockaddr *sa, int mflags); void sofree(struct socket *so); +void soinherit(struct socket *so, struct socket *so_inh); void sohasoutofband(struct socket
*so); int solisten(struct socket *so, int backlog, struct thread *td); void solisten_proto(struct socket *so, int backlog); @@ -431,7 +433,6 @@ void solisten_wakeup(struct socket *); int selsocket(struct socket *so, int events, struct timeval *tv, struct thread *td); - /* * Accept filter functions (duh). */