diff --git a/sys/netinet/in_pcb.h b/sys/netinet/in_pcb.h --- a/sys/netinet/in_pcb.h +++ b/sys/netinet/in_pcb.h @@ -45,13 +45,13 @@ #include #include #include +#include #include #ifdef _KERNEL #include #include #include -#include #include #include #include @@ -215,7 +215,8 @@ struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash_exact; /* (w:h/r:e) exact-match hash linkage */ + CK_LIST_ENTRY(inpcb) inp_hash_wild; /* (w:h/r:e) wildcard hash linkage */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -261,11 +262,12 @@ u_char inp_ip_p; /* (c) protocol proto */ u_char inp_ip_minttl; /* (i) minimum TTL or drop */ uint32_t inp_flowid; /* (x) flow id / queue id */ + smr_seq_t inp_smr; /* (i) sequence number at disconnect */ struct m_snd_tag *inp_snd_tag; /* (i) send tag for outgoing mbufs */ uint32_t inp_flowtype; /* (x) M_HASHTYPE value */ /* Local and foreign ports, local and foreign addr. */ - struct in_conninfo inp_inc; /* (i) list for PCB's local port */ + struct in_conninfo inp_inc; /* (i,h) list for PCB's local port */ /* MAC and IPSEC policy information. */ struct label *inp_label; /* (i) MAC label */ @@ -430,10 +432,12 @@ /* * Global hash of inpcbs, hashed by local and foreign addresses and - * port numbers. + * port numbers. The "exact" hash holds PCBs connected to a foreign + * address, and "wild" holds the rest. 
*/ struct mtx ipi_hash_lock; - struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */ + struct inpcbhead *ipi_hash_exact; /* (r:e/w:h) */ + struct inpcbhead *ipi_hash_wild; /* (r:e/w:h) */ u_long ipi_hashmask; /* (c) */ /* @@ -643,7 +647,6 @@ #define IN6P_RTHDRDSTOPTS 0x00200000 /* receive dstoptions before rthdr */ #define IN6P_TCLASS 0x00400000 /* receive traffic class value */ #define IN6P_AUTOFLOWLABEL 0x00800000 /* attach flowlabel automatically */ -/* was INP_TIMEWAIT 0x01000000 */ #define INP_ONESBCAST 0x02000000 /* send all-ones broadcast */ #define INP_DROPPED 0x04000000 /* protocol drop flag */ #define INP_SOCKREF 0x08000000 /* strong socket reference */ @@ -760,6 +763,7 @@ int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); +void in_pcbremhash_locked(struct inpcb *); bool in_pcbrele_rlocked(struct inpcb *); bool in_pcbrele_wlocked(struct inpcb *); diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -58,8 +58,10 @@ #include #include #include +#include #include #include +#include #include #include #include @@ -510,7 +512,9 @@ #endif CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; - pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, + pcbinfo->ipi_hash_exact = hashinit(hash_nelements, M_PCB, + &pcbinfo->ipi_hashmask); + pcbinfo->ipi_hash_wild = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, @@ -532,7 +536,8 @@ KASSERT(pcbinfo->ipi_count == 0, ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); - hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); + hashdestroy(pcbinfo->ipi_hash_exact, M_PCB, pcbinfo->ipi_hashmask); + hashdestroy(pcbinfo->ipi_hash_wild, M_PCB, pcbinfo->ipi_hashmask); hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, pcbinfo->ipi_porthashmask); 
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, @@ -630,6 +635,8 @@ #ifdef INET inp->inp_vflag |= INP_IPV4; #endif + inp->inp_smr = SMR_SEQ_INVALID; + /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. @@ -1010,7 +1017,7 @@ */ int in_pcbconnect(struct inpcb *inp, struct sockaddr_in *sin, struct ucred *cred, - bool rehash) + bool rehash __unused) { u_short lport, fport; in_addr_t laddr, faddr; @@ -1018,6 +1025,8 @@ INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + KASSERT(in_nullhost(inp->inp_faddr), + ("%s: inp is already connected", __func__)); lport = inp->inp_lport; laddr = inp->inp_laddr.s_addr; @@ -1027,28 +1036,26 @@ if (error) return (error); + inp->inp_faddr.s_addr = faddr; + inp->inp_fport = fport; + /* Do the initial binding of the local address if required. */ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { - KASSERT(rehash == true, - ("Rehashing required for unbound inps")); inp->inp_lport = lport; inp->inp_laddr.s_addr = laddr; if (in_pcbinshash(inp) != 0) { - inp->inp_laddr.s_addr = INADDR_ANY; - inp->inp_lport = 0; + inp->inp_laddr.s_addr = inp->inp_faddr.s_addr = + INADDR_ANY; + inp->inp_lport = inp->inp_fport = 0; return (EAGAIN); } - } - - /* Commit the remaining changes. */ - inp->inp_lport = lport; - inp->inp_laddr.s_addr = laddr; - inp->inp_faddr.s_addr = faddr; - inp->inp_fport = fport; - if (rehash) { - in_pcbrehash(inp); } else { - in_pcbinshash(inp); + inp->inp_lport = lport; + inp->inp_laddr.s_addr = laddr; + if ((inp->inp_flags & INP_INHASHLIST) != 0) + in_pcbrehash(inp); + else + in_pcbinshash(inp); } if (anonport) @@ -1402,11 +1409,16 @@ INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + KASSERT(inp->inp_smr == SMR_SEQ_INVALID, + ("%s: inp %p was already disconnected", __func__, inp)); + + in_pcbremhash_locked(inp); + /* See the comment in in_pcbinshash(). 
*/ + inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); inp->inp_laddr.s_addr = INADDR_ANY; inp->inp_faddr.s_addr = INADDR_ANY; inp->inp_fport = 0; - in_pcbrehash(inp); } #endif /* INET */ @@ -1551,11 +1563,11 @@ #define II_LIST_FIRST(ipi, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ - CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) + CK_LIST_FIRST(&(ipi)->ipi_hash_exact[(hash)])) #define II_LIST_NEXT(inp, hash) \ (((hash) == INP_ALL_LIST) ? \ CK_LIST_NEXT((inp), inp_list) : \ - CK_LIST_NEXT((inp), inp_hash)) + CK_LIST_NEXT((inp), inp_hash_exact)) #define II_LOCK_ASSERT(inp, lock) \ rw_assert(&(inp)->inp_lock, \ (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) @@ -1996,9 +2008,9 @@ * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ - head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, + head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_wild) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -2178,9 +2190,9 @@ INP_HASH_LOCK_ASSERT(pcbinfo); match = NULL; - head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&faddr, lport, fport, + head = &pcbinfo->ipi_hash_exact[INP_PCBHASH(&faddr, lport, fport, pcbinfo->ipi_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_exact) { #ifdef INET6 /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV4) == 0) @@ -2214,13 +2226,13 @@ * 3. non-jailed, non-wild. * 4. non-jailed, wild. 
*/ - head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, + head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; local_wild = local_exact = jail_wild = NULL; #ifdef INET6 local_wild_mapped = NULL; #endif - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_wild) { bool injail; #ifdef INET6 @@ -2368,21 +2380,31 @@ struct inpcbporthead *pcbporthash; struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbport *phd; + uint32_t hash; + bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); - KASSERT((inp->inp_flags & INP_INHASHLIST) == 0, ("in_pcbinshash: INP_INHASHLIST")); #ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - pcbhash = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, - inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; - else + if (inp->inp_vflag & INP_IPV6) { + hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, + inp->inp_fport, pcbinfo->ipi_hashmask); + connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); + } else #endif - pcbhash = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, - inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; + { + hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, + inp->inp_fport, pcbinfo->ipi_hashmask); + connected = !in_nullhost(inp->inp_faddr); + } + + if (connected) + pcbhash = &pcbinfo->ipi_hash_exact[hash]; + else + pcbhash = &pcbinfo->ipi_hash_wild[hash]; pcbporthash = &pcbinfo->ipi_porthashbase[ INP_PCBPORTHASH(inp->inp_lport, pcbinfo->ipi_porthashmask)]; @@ -2421,66 +2443,117 @@ } inp->inp_phd = phd; CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); - CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + + /* + * The PCB may have been disconnected in the past. Before we can safely + * make it visible in the hash table, we must wait for all readers which + * may be traversing this PCB to finish. 
+ */ + if (inp->inp_smr != SMR_SEQ_INVALID) { + smr_wait(pcbinfo->ipi_smr, inp->inp_smr); + inp->inp_smr = SMR_SEQ_INVALID; + } + + if (connected) + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); + else + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_wild); inp->inp_flags |= INP_INHASHLIST; return (0); } -static void -in_pcbremhash(struct inpcb *inp) +void +in_pcbremhash_locked(struct inpcb *inp) { struct inpcbport *phd = inp->inp_phd; INP_WLOCK_ASSERT(inp); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); MPASS(inp->inp_flags & INP_INHASHLIST); - INP_HASH_WLOCK(inp->inp_pcbinfo); if ((inp->inp_flags2 & INP_REUSEPORT_LB) != 0) in_pcbremlbgrouphash(inp); - CK_LIST_REMOVE(inp, inp_hash); +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6) { + if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) + CK_LIST_REMOVE(inp, inp_hash_wild); + else + CK_LIST_REMOVE(inp, inp_hash_exact); + } else +#endif + { + if (in_nullhost(inp->inp_faddr)) + CK_LIST_REMOVE(inp, inp_hash_wild); + else + CK_LIST_REMOVE(inp, inp_hash_exact); + } CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); } - INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; } +static void +in_pcbremhash(struct inpcb *inp) +{ + INP_HASH_WLOCK(inp->inp_pcbinfo); + in_pcbremhash_locked(inp); + INP_HASH_WUNLOCK(inp->inp_pcbinfo); +} + /* * Move PCB to the proper hash bucket when { faddr, fport } have been * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. - * - * XXXGL: a race between this function and SMR-protected hash iterator - * will lead to iterator traversing a possibly wrong hash list. However, - * this race should have been here since change from rwlock to epoch. 
*/ void in_pcbrehash(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct inpcbhead *head; + uint32_t hash; + bool connected; INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(pcbinfo); - KASSERT(inp->inp_flags & INP_INHASHLIST, - ("in_pcbrehash: !INP_INHASHLIST")); + ("%s: !INP_INHASHLIST", __func__)); + KASSERT(inp->inp_smr == SMR_SEQ_INVALID, + ("%s: inp was disconnected", __func__)); #ifdef INET6 - if (inp->inp_vflag & INP_IPV6) - head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(&inp->in6p_faddr, - inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; - else + if (inp->inp_vflag & INP_IPV6) { + hash = INP6_PCBHASH(&inp->in6p_faddr, inp->inp_lport, + inp->inp_fport, pcbinfo->ipi_hashmask); + connected = !IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr); + } else #endif - head = &pcbinfo->ipi_hashbase[INP_PCBHASH(&inp->inp_faddr, - inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; + { + hash = INP_PCBHASH(&inp->inp_faddr, inp->inp_lport, + inp->inp_fport, pcbinfo->ipi_hashmask); + connected = !in_nullhost(inp->inp_faddr); + } - CK_LIST_REMOVE(inp, inp_hash); - CK_LIST_INSERT_HEAD(head, inp, inp_hash); + /* + * When rehashing, the caller must ensure that either the new or the old + * foreign address was unspecified. 
+ */ + if (connected) + CK_LIST_REMOVE(inp, inp_hash_wild); + else + CK_LIST_REMOVE(inp, inp_hash_exact); + + if (connected) { + head = &pcbinfo->ipi_hash_exact[hash]; + CK_LIST_INSERT_HEAD(head, inp, inp_hash_exact); + } else { + head = &pcbinfo->ipi_hash_wild[hash]; + CK_LIST_INSERT_HEAD(head, inp, inp_hash_wild); + } } /* diff --git a/sys/netinet/raw_ip.c b/sys/netinet/raw_ip.c --- a/sys/netinet/raw_ip.c +++ b/sys/netinet/raw_ip.c @@ -166,8 +166,8 @@ inp->inp_faddr.s_addr, pcbinfo->ipi_hashmask); } else hash = 0; - pcbhash = &pcbinfo->ipi_hashbase[hash]; - CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); + pcbhash = &pcbinfo->ipi_hash_exact[hash]; + CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash_exact); } static void @@ -177,7 +177,7 @@ INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); - CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_hash_exact); } #endif /* INET */ diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -81,7 +81,9 @@ #include #include #include +#include #include +#include #include #include #include @@ -398,7 +400,7 @@ */ int in6_pcbconnect(struct inpcb *inp, struct sockaddr_in6 *sin6, struct ucred *cred, - bool rehash) + bool rehash __unused) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; struct sockaddr_in6 laddr6; @@ -411,6 +413,8 @@ ("%s: invalid address family for %p", __func__, sin6)); KASSERT(sin6->sin6_len == sizeof(*sin6), ("%s: invalid address length for %p", __func__, sin6)); + KASSERT(IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr), + ("%s: inp is already connected", __func__)); bzero(&laddr6, sizeof(laddr6)); laddr6.sin6_family = AF_INET6; @@ -440,17 +444,6 @@ return (EADDRINUSE); if (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { if (inp->inp_lport == 0) { - /* - * rehash was required to be true in the past for - * this case; retain that convention. 
However, - * we now call in_pcb_lport_dest rather than - * in6_pcbbind; the former does not insert into - * the hash table, the latter does. Change rehash - * to false to do the in_pcbinshash below. - */ - KASSERT(rehash == true, - ("Rehashing required for unbound inps")); - rehash = false; error = in_pcb_lport_dest(inp, (struct sockaddr *) &laddr6, &inp->inp_lport, (struct sockaddr *) sin6, sin6->sin6_port, cred, @@ -468,7 +461,7 @@ inp->inp_flow |= (htonl(ip6_randomflowlabel()) & IPV6_FLOWLABEL_MASK); - if (rehash) { + if ((inp->inp_flags & INP_INHASHLIST) != 0) { in_pcbrehash(inp); } else { in_pcbinshash(inp); @@ -483,13 +476,20 @@ INP_WLOCK_ASSERT(inp); INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); + KASSERT(inp->inp_smr == SMR_SEQ_INVALID, + ("%s: inp %p was already disconnected", __func__, inp)); + + in_pcbremhash_locked(inp); + + /* See the comment in in_pcbinshash(). */ + inp->inp_smr = smr_advance(inp->inp_pcbinfo->ipi_smr); + /* XXX-MJ torn writes are visible to SMR lookup */ memset(&inp->in6p_laddr, 0, sizeof(inp->in6p_laddr)); memset(&inp->in6p_faddr, 0, sizeof(inp->in6p_faddr)); inp->inp_fport = 0; /* clear flowinfo - draft-itojun-ipv6-flowlabel-api-00 */ inp->inp_flow &= ~IPV6_FLOWLABEL_MASK; - in_pcbrehash(inp); } struct sockaddr * @@ -712,9 +712,9 @@ * Look for an unconnected (wildcard foreign addr) PCB that * matches the local address and port we're looking for. */ - head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, + head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_wild) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; @@ -952,9 +952,9 @@ * First look for an exact match. 
*/ match = NULL; - head = &pcbinfo->ipi_hashbase[INP6_PCBHASH(faddr, lport, fport, + head = &pcbinfo->ipi_hash_exact[INP6_PCBHASH(faddr, lport, fport, pcbinfo->ipi_hashmask)]; - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_exact) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) continue; @@ -982,10 +982,10 @@ * 3. non-jailed, non-wild. * 4. non-jailed, wild. */ - head = &pcbinfo->ipi_hashbase[INP_PCBHASH_WILD(lport, + head = &pcbinfo->ipi_hash_wild[INP_PCBHASH_WILD(lport, pcbinfo->ipi_hashmask)]; local_wild = local_exact = jail_wild = NULL; - CK_LIST_FOREACH(inp, head, inp_hash) { + CK_LIST_FOREACH(inp, head, inp_hash_wild) { bool injail; /* XXX inp locking */