Changeset View
Standalone View
sys/netinet/in_pcb.c
Show First 20 Lines • Show All 108 Lines • ▼ Show 20 Lines | |||||
#endif | #endif | ||||
#include <netipsec/ipsec_support.h> | #include <netipsec/ipsec_support.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#define INPCBLBGROUP_SIZMIN 8 | #define INPCBLBGROUP_SIZMIN 8 | ||||
#define INPCBLBGROUP_SIZMAX 256 | #define INPCBLBGROUP_SIZMAX 256 | ||||
#define INP_FREED 0x00000200 /* See in_pcb.h. */ | |||||
static struct callout ipport_tick_callout; | static struct callout ipport_tick_callout; | ||||
/* | /* | ||||
* These configure the range of local port addresses assigned to | * These configure the range of local port addresses assigned to | ||||
* "unspecified" outgoing connections/packets/whatever. | * "unspecified" outgoing connections/packets/whatever. | ||||
*/ | */ | ||||
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ | VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ | ||||
Show All 16 Lines | |||||
VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ | VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ | ||||
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | ||||
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | ||||
VNET_DEFINE(int, ipport_tcpallocs); | VNET_DEFINE(int, ipport_tcpallocs); | ||||
VNET_DEFINE_STATIC(int, ipport_tcplastcount); | VNET_DEFINE_STATIC(int, ipport_tcplastcount); | ||||
#define V_ipport_tcplastcount VNET(ipport_tcplastcount) | #define V_ipport_tcplastcount VNET(ipport_tcplastcount) | ||||
static void in_pcbremlists(struct inpcb *inp); | |||||
#ifdef INET | #ifdef INET | ||||
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | ||||
struct in_addr faddr, u_int fport_arg, | struct in_addr faddr, u_int fport_arg, | ||||
struct in_addr laddr, u_int lport_arg, | struct in_addr laddr, u_int lport_arg, | ||||
int lookupflags, struct ifnet *ifp, | int lookupflags, struct ifnet *ifp, | ||||
uint8_t numa_domain); | uint8_t numa_domain); | ||||
#define RANGECHK(var, min, max) \ | #define RANGECHK(var, min, max) \ | ||||
▲ Show 20 Lines • Show All 352 Lines • ▼ Show 20 Lines | |||||
static void | static void | ||||
inpcb_fini(void *mem, int size) | inpcb_fini(void *mem, int size) | ||||
{ | { | ||||
struct inpcb *inp = mem; | struct inpcb *inp = mem; | ||||
INP_LOCK_DESTROY(inp); | INP_LOCK_DESTROY(inp); | ||||
} | } | ||||
/* Make sure it is safe to use hashinit(9) on CK_LIST. */ | |||||
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); | |||||
/* | /* | ||||
* Initialize an inpcbinfo -- we should be able to reduce the number of | * Initialize an inpcbinfo -- we should be able to reduce the number of | ||||
* arguments in time. | * arguments in time. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, | in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, | ||||
struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, | u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, | ||||
char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) | uma_init inpcbzone_init) | ||||
markj: inpcbzone_name can be const, BTW. | |||||
Done Inline ActionsMore than that can be improved here. I have a patch in the queue that would static-ize most of the struct inpcbinfo initialization. I don't want to pile it into this review. glebius: More than that can be improved here. I have a patch in the queue that would static-ize most of… | |||||
{ | { | ||||
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); | mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); | ||||
mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); | |||||
INP_INFO_LOCK_INIT(pcbinfo, name); | |||||
INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ | |||||
INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); | |||||
#ifdef VIMAGE | #ifdef VIMAGE | ||||
pcbinfo->ipi_vnet = curvnet; | pcbinfo->ipi_vnet = curvnet; | ||||
#endif | #endif | ||||
pcbinfo->ipi_listhead = listhead; | CK_LIST_INIT(&pcbinfo->ipi_listhead); | ||||
CK_LIST_INIT(pcbinfo->ipi_listhead); | |||||
pcbinfo->ipi_count = 0; | pcbinfo->ipi_count = 0; | ||||
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, | pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, | ||||
&pcbinfo->ipi_hashmask); | &pcbinfo->ipi_hashmask); | ||||
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); | |||||
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, | pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, | ||||
&pcbinfo->ipi_porthashmask); | &pcbinfo->ipi_porthashmask); | ||||
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, | pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, | ||||
&pcbinfo->ipi_lbgrouphashmask); | &pcbinfo->ipi_lbgrouphashmask); | ||||
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), | pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), | ||||
NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); | NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, | ||||
UMA_ZONE_SMR); | |||||
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); | uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); | ||||
uma_zone_set_warning(pcbinfo->ipi_zone, | uma_zone_set_warning(pcbinfo->ipi_zone, | ||||
"kern.ipc.maxsockets limit reached"); | "kern.ipc.maxsockets limit reached"); | ||||
pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); | |||||
pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, | |||||
sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | |||||
Done Inline ActionsLooks like there's a missing newline here. markj: Looks like there's a missing newline here. | |||||
uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); | |||||
} | } | ||||
/* | /* | ||||
* Destroy an inpcbinfo. | * Destroy an inpcbinfo. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) | in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) | ||||
{ | { | ||||
KASSERT(pcbinfo->ipi_count == 0, | KASSERT(pcbinfo->ipi_count == 0, | ||||
("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); | ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); | ||||
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); | hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); | ||||
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, | hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, | ||||
pcbinfo->ipi_porthashmask); | pcbinfo->ipi_porthashmask); | ||||
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, | hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, | ||||
pcbinfo->ipi_lbgrouphashmask); | pcbinfo->ipi_lbgrouphashmask); | ||||
uma_zdestroy(pcbinfo->ipi_zone); | uma_zdestroy(pcbinfo->ipi_zone); | ||||
INP_LIST_LOCK_DESTROY(pcbinfo); | mtx_destroy(&pcbinfo->ipi_hash_lock); | ||||
INP_HASH_LOCK_DESTROY(pcbinfo); | mtx_destroy(&pcbinfo->ipi_lock); | ||||
INP_INFO_LOCK_DESTROY(pcbinfo); | |||||
} | } | ||||
/* | /* | ||||
* Allocate a PCB and associate it with the socket. | * Allocate a PCB and associate it with the socket. | ||||
* On success return with the PCB locked. | * On success return with the PCB locked. | ||||
*/ | */ | ||||
int | int | ||||
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) | in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
int error; | int error; | ||||
error = 0; | error = 0; | ||||
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); | inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); | ||||
if (inp == NULL) | if (inp == NULL) | ||||
return (ENOBUFS); | return (ENOBUFS); | ||||
bzero(&inp->inp_start_zero, inp_zero_size); | bzero(&inp->inp_start_zero, inp_zero_size); | ||||
#ifdef NUMA | #ifdef NUMA | ||||
inp->inp_numa_domain = M_NODOM; | inp->inp_numa_domain = M_NODOM; | ||||
#endif | #endif | ||||
inp->inp_pcbinfo = pcbinfo; | inp->inp_pcbinfo = pcbinfo; | ||||
inp->inp_socket = so; | inp->inp_socket = so; | ||||
Show All 15 Lines | #endif | ||||
} | } | ||||
#endif /*IPSEC*/ | #endif /*IPSEC*/ | ||||
#ifdef INET6 | #ifdef INET6 | ||||
if (INP_SOCKAF(so) == AF_INET6) { | if (INP_SOCKAF(so) == AF_INET6) { | ||||
inp->inp_vflag |= INP_IPV6PROTO; | inp->inp_vflag |= INP_IPV6PROTO; | ||||
if (V_ip6_v6only) | if (V_ip6_v6only) | ||||
inp->inp_flags |= IN6P_IPV6_V6ONLY; | inp->inp_flags |= IN6P_IPV6_V6ONLY; | ||||
} | } | ||||
#endif | |||||
INP_WLOCK(inp); | |||||
INP_LIST_WLOCK(pcbinfo); | |||||
CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); | |||||
pcbinfo->ipi_count++; | |||||
so->so_pcb = (caddr_t)inp; | |||||
#ifdef INET6 | |||||
if (V_ip6_auto_flowlabel) | if (V_ip6_auto_flowlabel) | ||||
inp->inp_flags |= IN6P_AUTOFLOWLABEL; | inp->inp_flags |= IN6P_AUTOFLOWLABEL; | ||||
#endif | #endif | ||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ | |||||
/* | /* | ||||
* Routes in inpcb's can cache L2 as well; they are guaranteed | * Routes in inpcb's can cache L2 as well; they are guaranteed | ||||
* to be cleaned up. | * to be cleaned up. | ||||
*/ | */ | ||||
inp->inp_route.ro_flags = RT_LLE_CACHE; | inp->inp_route.ro_flags = RT_LLE_CACHE; | ||||
INP_LIST_WUNLOCK(pcbinfo); | #ifdef TCPHPTS | ||||
/* | |||||
* If using hpts, let's drop a random number in so | |||||
* not all new connections fall on the same CPU. | |||||
*/ | |||||
inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp); | |||||
#endif | |||||
refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ | |||||
INP_WLOCK(inp); | |||||
Not Done Inline ActionsConceptually the INP_WLOCK() should be done before line 619. I know it's not rrs: Conceptually the INP_WLOCK() should be done before line 619. I know it's not
"lookup-able" at… | |||||
Done Inline ActionsThat's a good conceptual question! It should either be locked right after allocation and even before bzero, because bzero already writes to the protected fields, OR it should be locked right before it can be looked up. I'll leave that open for now and see what others think. P.S. I will also ask Drew whether such a change could be not only stylistic but also affect performance. glebius: That's a good conceptual question! It should either be locked right after allocation and **even… | |||||
INP_INFO_WLOCK(pcbinfo); | |||||
pcbinfo->ipi_count++; | |||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); | |||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
so->so_pcb = inp; | |||||
return (0); | |||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) | #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) | ||||
out: | out: | ||||
if (error != 0) { | |||||
crfree(inp->inp_cred); | crfree(inp->inp_cred); | ||||
uma_zfree(pcbinfo->ipi_zone, inp); | uma_zfree_smr(pcbinfo->ipi_zone, inp); | ||||
} | |||||
#endif | |||||
return (error); | return (error); | ||||
#endif | |||||
} | } | ||||
#ifdef INET | #ifdef INET | ||||
int | int | ||||
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) | in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) | ||||
{ | { | ||||
int anonport, error; | int anonport, error; | ||||
▲ Show 20 Lines • Show All 694 Lines • ▼ Show 20 Lines | |||||
* caller can decide to override it. In all other cases, *oinpp | * caller can decide to override it. In all other cases, *oinpp | ||||
* is set to NULL. | * is set to NULL. | ||||
*/ | */ | ||||
int | int | ||||
in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, | in_pcbconnect_setup(struct inpcb *inp, struct sockaddr *nam, | ||||
in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, | in_addr_t *laddrp, u_short *lportp, in_addr_t *faddrp, u_short *fportp, | ||||
struct inpcb **oinpp, struct ucred *cred) | struct inpcb **oinpp, struct ucred *cred) | ||||
{ | { | ||||
struct rm_priotracker in_ifa_tracker; | |||||
struct sockaddr_in *sin = (struct sockaddr_in *)nam; | struct sockaddr_in *sin = (struct sockaddr_in *)nam; | ||||
struct in_ifaddr *ia; | struct in_ifaddr *ia; | ||||
struct inpcb *oinp; | struct inpcb *oinp; | ||||
struct in_addr laddr, faddr; | struct in_addr laddr, faddr; | ||||
u_short lport, fport; | u_short lport, fport; | ||||
int error; | int error; | ||||
KASSERT(sin->sin_family == AF_INET, | KASSERT(sin->sin_family == AF_INET, | ||||
Show All 32 Lines | if (!CK_STAILQ_EMPTY(&V_in_ifaddrhead)) { | ||||
/* | /* | ||||
* If the destination address is INADDR_ANY, | * If the destination address is INADDR_ANY, | ||||
* use the primary local address. | * use the primary local address. | ||||
* If the supplied address is INADDR_BROADCAST, | * If the supplied address is INADDR_BROADCAST, | ||||
* and the primary interface supports broadcast, | * and the primary interface supports broadcast, | ||||
* choose the broadcast address for that interface. | * choose the broadcast address for that interface. | ||||
*/ | */ | ||||
if (faddr.s_addr == INADDR_ANY) { | if (faddr.s_addr == INADDR_ANY) { | ||||
IN_IFADDR_RLOCK(&in_ifa_tracker); | |||||
faddr = | faddr = | ||||
IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; | IA_SIN(CK_STAILQ_FIRST(&V_in_ifaddrhead))->sin_addr; | ||||
IN_IFADDR_RUNLOCK(&in_ifa_tracker); | |||||
if (cred != NULL && | if (cred != NULL && | ||||
(error = prison_get_ip4(cred, &faddr)) != 0) | (error = prison_get_ip4(cred, &faddr)) != 0) | ||||
return (error); | return (error); | ||||
} else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { | } else if (faddr.s_addr == (u_long)INADDR_BROADCAST) { | ||||
IN_IFADDR_RLOCK(&in_ifa_tracker); | |||||
if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & | if (CK_STAILQ_FIRST(&V_in_ifaddrhead)->ia_ifp->if_flags & | ||||
IFF_BROADCAST) | IFF_BROADCAST) | ||||
faddr = satosin(&CK_STAILQ_FIRST( | faddr = satosin(&CK_STAILQ_FIRST( | ||||
&V_in_ifaddrhead)->ia_broadaddr)->sin_addr; | &V_in_ifaddrhead)->ia_broadaddr)->sin_addr; | ||||
IN_IFADDR_RUNLOCK(&in_ifa_tracker); | |||||
} | } | ||||
} | } | ||||
if (laddr.s_addr == INADDR_ANY) { | if (laddr.s_addr == INADDR_ANY) { | ||||
error = in_pcbladdr(inp, &faddr, &laddr, cred); | error = in_pcbladdr(inp, &faddr, &laddr, cred); | ||||
/* | /* | ||||
* If the destination address is multicast and an outgoing | * If the destination address is multicast and an outgoing | ||||
* interface has been set as a multicast option, prefer the | * interface has been set as a multicast option, prefer the | ||||
* address of that interface as our source address. | * address of that interface as our source address. | ||||
*/ | */ | ||||
if (IN_MULTICAST(ntohl(faddr.s_addr)) && | if (IN_MULTICAST(ntohl(faddr.s_addr)) && | ||||
inp->inp_moptions != NULL) { | inp->inp_moptions != NULL) { | ||||
struct ip_moptions *imo; | struct ip_moptions *imo; | ||||
struct ifnet *ifp; | struct ifnet *ifp; | ||||
imo = inp->inp_moptions; | imo = inp->inp_moptions; | ||||
if (imo->imo_multicast_ifp != NULL) { | if (imo->imo_multicast_ifp != NULL) { | ||||
ifp = imo->imo_multicast_ifp; | ifp = imo->imo_multicast_ifp; | ||||
IN_IFADDR_RLOCK(&in_ifa_tracker); | |||||
CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { | CK_STAILQ_FOREACH(ia, &V_in_ifaddrhead, ia_link) { | ||||
if ((ia->ia_ifp == ifp) && | if ((ia->ia_ifp == ifp) && | ||||
(cred == NULL || | (cred == NULL || | ||||
prison_check_ip4(cred, | prison_check_ip4(cred, | ||||
&ia->ia_addr.sin_addr) == 0)) | &ia->ia_addr.sin_addr) == 0)) | ||||
break; | break; | ||||
} | } | ||||
if (ia == NULL) | if (ia == NULL) | ||||
error = EADDRNOTAVAIL; | error = EADDRNOTAVAIL; | ||||
else { | else { | ||||
laddr = ia->ia_addr.sin_addr; | laddr = ia->ia_addr.sin_addr; | ||||
error = 0; | error = 0; | ||||
} | } | ||||
IN_IFADDR_RUNLOCK(&in_ifa_tracker); | |||||
} | } | ||||
} | } | ||||
if (error) | if (error) | ||||
return (error); | return (error); | ||||
} | } | ||||
if (lport != 0) { | if (lport != 0) { | ||||
oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, | oinp = in_pcblookup_hash_locked(inp->inp_pcbinfo, faddr, | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | #ifdef RATELIMIT | ||||
if (inp->inp_snd_tag != NULL) | if (inp->inp_snd_tag != NULL) | ||||
in_pcbdetach_txrtlmt(inp); | in_pcbdetach_txrtlmt(inp); | ||||
#endif | #endif | ||||
inp->inp_socket->so_pcb = NULL; | inp->inp_socket->so_pcb = NULL; | ||||
inp->inp_socket = NULL; | inp->inp_socket = NULL; | ||||
} | } | ||||
/* | /* | ||||
* in_pcbref() bumps the reference count on an inpcb in order to maintain | * inpcb hash lookups are protected by SMR section. | ||||
* stability of an inpcb pointer despite the inpcb lock being released. This | |||||
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | |||||
* but where the inpcb lock may already held. | |||||
* | * | ||||
* in_pcbref() should be used only to provide brief memory stability, and | * Once desired pcb has been found, switching from SMR section to a pcb | ||||
* must always be followed by a call to INP_WLOCK() and in_pcbrele() to | * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK | ||||
* garbage collect the inpcb if it has been in_pcbfree()'d from another | * here because SMR is a critical section. | ||||
* context. Until in_pcbrele() has returned that the inpcb is still valid, | * In 99%+ cases inp_smr_lock() would obtain the lock immediately. | ||||
* lock and rele are the *only* safe operations that may be performed on the | |||||
* inpcb. | |||||
* | |||||
* While the inpcb will not be freed, releasing the inpcb lock means that the | |||||
* connection's state may change, so the caller should be careful to | |||||
* revalidate any cached state on reacquiring the lock. Drop the reference | |||||
* using in_pcbrele(). | |||||
*/ | */ | ||||
void | static inline void | ||||
in_pcbref(struct inpcb *inp) | inp_lock(struct inpcb *inp, const inp_lookup_t lock) | ||||
{ | { | ||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | lock == INPLOOKUP_RLOCKPCB ? | ||||
Done Inline ActionsThese backslashes aren't needed. markj: These backslashes aren't needed. | |||||
rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); | |||||
} | |||||
refcount_acquire(&inp->inp_refcount); | static inline void | ||||
inp_unlock(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
lock == INPLOOKUP_RLOCKPCB ? | |||||
rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); | |||||
} | } | ||||
static inline int | |||||
inp_trylock(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
return (lock == INPLOOKUP_RLOCKPCB ? | |||||
rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); | |||||
} | |||||
static inline bool | |||||
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
return (lock == INPLOOKUP_RLOCKPCB ? | |||||
in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); | |||||
} | |||||
bool | |||||
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); | |||||
SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); | |||||
if (__predict_true(inp_trylock(inp, lock))) { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
check_freed: | |||||
if (__predict_false(inp->inp_flags & INP_FREED)) { | |||||
inp_unlock(inp, lock); | |||||
return (false); | |||||
} else | |||||
return (true); | |||||
} | |||||
if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
inp_lock(inp, lock); | |||||
if (__predict_false(in_pcbrele(inp, lock))) | |||||
return (false); | |||||
else | |||||
goto check_freed; | |||||
} else { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
return (false); | |||||
} | |||||
} | |||||
/* | /* | ||||
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to | * inp_next() - inpcb hash/list traversal iterator | ||||
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we | |||||
* return a flag indicating whether or not the inpcb remains valid. If it is | |||||
* valid, we return with the inpcb lock held. | |||||
* | * | ||||
* Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a | * Requires initialized struct inpcb_iterator for context. | ||||
* reference on an inpcb. Historically more work was done here (actually, in | * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). | ||||
* in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the | * | ||||
* need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely | * - Iterator can have either write-lock or read-lock semantics, that can not | ||||
* about memory stability (and continued use of the write lock). | * be changed later. | ||||
* - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through | |||||
* a single hash slot. Note: only rip_input() does the latter. | |||||
* - Iterator may have optional bool matching function. The matching function | |||||
* will be executed for each inpcb in the SMR context, so it can not acquire | |||||
* locks and can safely access only immutable fields of inpcb. | |||||
* | |||||
* A freshly initialized iterator has a NULL inpcb in its context and that | |||||
* means that inp_next() call would return the very first inpcb on the list | |||||
* locked with desired semantic. In all following calls the context pointer | |||||
* shall hold the current inpcb pointer. The KPI user is not supposed to | |||||
* unlock the current inpcb! Upon end of traversal inp_next() will return NULL | |||||
* and write NULL to its context. After end of traversal an iterator can be | |||||
* reused. | |||||
* | |||||
* List traversals have the following features/constraints: | |||||
* - New entries won't be seen, as they are always added to the head of a list. | |||||
* - Removed entries won't stop traversal as long as they are not added to | |||||
* a different list. This is violated by in_pcbrehash(). | |||||
*/ | */ | ||||
int | #define II_LIST_FIRST(ipi, hash) \ | ||||
in_pcbrele_rlocked(struct inpcb *inp) | (((hash) == INP_ALL_LIST) ? \ | ||||
CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ | |||||
CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) | |||||
#define II_LIST_NEXT(inp, hash) \ | |||||
(((hash) == INP_ALL_LIST) ? \ | |||||
CK_LIST_NEXT((inp), inp_list) : \ | |||||
CK_LIST_NEXT((inp), inp_hash)) | |||||
#define II_LOCK_ASSERT(inp, lock) \ | |||||
rw_assert(&(inp)->inp_lock, \ | |||||
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) | |||||
struct inpcb * | |||||
inp_next(struct inpcb_iterator *ii) | |||||
{ | { | ||||
struct inpcbinfo *pcbinfo; | struct inpcbhead freelist = CK_LIST_HEAD_INITIALIZER(freelist); | ||||
const struct inpcbinfo *ipi = ii->ipi; | |||||
inp_match_t *match = ii->match; | |||||
void *ctx = ii->ctx; | |||||
inp_lookup_t lock = ii->lock; | |||||
int hash = ii->hash; | |||||
struct inpcb *inp; | |||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | if (ii->inp == NULL) { /* First call. */ | ||||
smr_enter(ipi->ipi_smr); | |||||
/* This is unrolled CK_LIST_FOREACH(). */ | |||||
for (inp = II_LIST_FIRST(ipi, hash); | |||||
inp != NULL; | |||||
inp = II_LIST_NEXT(inp, hash)) { | |||||
if (match != NULL && (match)(inp, ctx) == false) | |||||
continue; | |||||
if (__predict_true(inp_smr_lock(inp, lock))) | |||||
break; | |||||
else { | |||||
smr_enter(ipi->ipi_smr); | |||||
MPASS(inp != II_LIST_FIRST(ipi, hash)); | |||||
inp = II_LIST_FIRST(ipi, hash); | |||||
} | |||||
} | |||||
INP_RLOCK_ASSERT(inp); | if (inp == NULL) | ||||
smr_exit(ipi->ipi_smr); | |||||
else | |||||
ii->inp = inp; | |||||
if (refcount_release(&inp->inp_refcount) == 0) { | return (inp); | ||||
} | |||||
/* Not a first call. */ | |||||
smr_enter(ipi->ipi_smr); | |||||
Done Inline ActionsMaybe call this advance or next_locked instead of next. markj: Maybe call this `advance` or `next_locked` instead of `next`. | |||||
restart: | |||||
inp = ii->inp; | |||||
II_LOCK_ASSERT(inp, lock); | |||||
next: | |||||
inp = II_LIST_NEXT(inp, hash); | |||||
if (inp == NULL) { | |||||
smr_exit(ipi->ipi_smr); | |||||
goto found; | |||||
} | |||||
if (match != NULL && (match)(inp, ctx) == false) | |||||
Done Inline ActionsThis line is erroneous. It should have gone with the last update. It will be deleted in the next update. glebius: This line is erroneous. It should have gone with the last update. It will be deleted in the next… | |||||
goto next; | |||||
if (__predict_true(inp_trylock(inp, lock))) { | |||||
if (__predict_false(inp->inp_flags & INP_FREED)) { | |||||
/* | /* | ||||
* If the inpcb has been freed, let the caller know, even if | * Entries are never inserted in middle of a list, thus | ||||
* this isn't the last reference. | * as long as we are in SMR, we can continue traversal. | ||||
* A jump to 'restart' should yield the same result, | |||||
* but could produce unnecessary looping. Could this | |||||
* looping be unbounded? | |||||
*/ | */ | ||||
if (inp->inp_flags2 & INP_FREED) { | inp_unlock(inp, lock); | ||||
INP_RUNLOCK(inp); | goto next; | ||||
return (1); | } else { | ||||
smr_exit(ipi->ipi_smr); | |||||
goto found; | |||||
} | } | ||||
return (0); | |||||
} | } | ||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | |||||
#ifdef TCPHPTS | |||||
if (inp->inp_in_hpts || inp->inp_in_input) { | |||||
struct tcp_hpts_entry *hpts; | |||||
/* | /* | ||||
* We should not be on the hpts at | * Can't obtain lock immediately, thus going hard. Once we exit the | ||||
* this point in any form. we must | * SMR section we can no longer jump to 'next', and our only stable | ||||
* get the lock to be sure. | * anchoring point is ii->inp, which we keep locked for this case, so | ||||
* we jump to 'restart'. | |||||
*/ | */ | ||||
hpts = tcp_hpts_lock(inp); | if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { | ||||
if (inp->inp_in_hpts) | smr_exit(ipi->ipi_smr); | ||||
panic("Hpts:%p inp:%p at free still on hpts", | inp_lock(inp, lock); | ||||
hpts, inp); | if (__predict_true(refcount_release(&inp->inp_refcount) == 0)) { | ||||
mtx_unlock(&hpts->p_mtx); | if (__predict_false(inp->inp_flags & INP_FREED)) { | ||||
hpts = tcp_input_lock(inp); | inp_unlock(inp, lock); | ||||
if (inp->inp_in_input) | smr_enter(ipi->ipi_smr); | ||||
panic("Hpts:%p inp:%p at free still on input hpts", | goto restart; | ||||
hpts, inp); | |||||
mtx_unlock(&hpts->p_mtx); | |||||
} | } | ||||
#endif | goto found; | ||||
INP_RUNLOCK(inp); | |||||
pcbinfo = inp->inp_pcbinfo; | |||||
uma_zfree(pcbinfo->ipi_zone, inp); | |||||
return (1); | |||||
} | } | ||||
/* | |||||
* We have just stolen a pcb from in_pcbrele_(r|w)locked(). | |||||
* It is now our responsibility to free it. | |||||
*/ | |||||
MPASS(inp->inp_flags & INP_FREED); | |||||
inp_unlock(inp, lock); | |||||
if (hash == INP_ALL_LIST) | |||||
CK_LIST_INSERT_HEAD(&freelist, inp, inp_list); | |||||
else | |||||
CK_LIST_INSERT_HEAD(&freelist, inp, inp_hash); | |||||
smr_enter(ipi->ipi_smr); | |||||
goto restart; | |||||
} else | |||||
goto next; | |||||
int | found: | ||||
in_pcbrele_wlocked(struct inpcb *inp) | inp_unlock(ii->inp, lock); | ||||
Done Inline ActionsThere is no guarantee that next is a valid inp at this point. While outside of the SMR section, next could have been freed and reallocated, so it may belong to a different hash bucket, or it may have been moved to the end of the global list (so some elements can be skipped), or the backing slab could have been reallocated for some other purpose (inps are not type-stable), so next is a pointer to random memory. markj: There is no guarantee that `next` is a valid inp at this point. While outside of the SMR… | |||||
Done Inline ActionsThat's a very good catch. Thanks! I will look deeper into here. glebius: That's a very good catch. Thanks! I will look deeper into here. | |||||
{ | ii->inp = inp; | ||||
struct inpcbinfo *pcbinfo; | |||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | if (__predict_false(CK_LIST_FIRST(&freelist) != NULL)) { | ||||
struct inpcb *tmp; | |||||
INP_WLOCK_ASSERT(inp); | if (hash == INP_ALL_LIST) | ||||
CK_LIST_FOREACH_SAFE(inp, &freelist, inp_list, tmp) | |||||
uma_zfree_smr(ipi->ipi_zone, inp); | |||||
else | |||||
CK_LIST_FOREACH_SAFE(inp, &freelist, inp_hash, tmp) | |||||
uma_zfree_smr(ipi->ipi_zone, inp); | |||||
} | |||||
if (refcount_release(&inp->inp_refcount) == 0) { | return (ii->inp); | ||||
/* | |||||
* If the inpcb has been freed, let the caller know, even if | |||||
* this isn't the last reference. | |||||
*/ | |||||
if (inp->inp_flags2 & INP_FREED) { | |||||
INP_WUNLOCK(inp); | |||||
return (1); | |||||
} | } | ||||
return (0); | |||||
} | |||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | |||||
#ifdef TCPHPTS | |||||
if (inp->inp_in_hpts || inp->inp_in_input) { | |||||
struct tcp_hpts_entry *hpts; | |||||
/* | /* | ||||
* We should not be on the hpts at | * in_pcbref() bumps the reference count on an inpcb in order to maintain | ||||
* this point in any form. we must | * stability of an inpcb pointer despite the inpcb lock being released or | ||||
* get the lock to be sure. | * SMR section exited. | ||||
* | |||||
* To free a reference later in_pcbrele_(r|w)locked() must be performed. | |||||
*/ | */ | ||||
hpts = tcp_hpts_lock(inp); | void | ||||
if (inp->inp_in_hpts) | in_pcbref(struct inpcb *inp) | ||||
panic("Hpts:%p inp:%p at free still on hpts", | { | ||||
hpts, inp); | u_int old __diagused; | ||||
mtx_unlock(&hpts->p_mtx); | |||||
hpts = tcp_input_lock(inp); | old = refcount_acquire(&inp->inp_refcount); | ||||
if (inp->inp_in_input) | KASSERT(old > 0, ("%s: refcount 0", __func__)); | ||||
panic("Hpts:%p inp:%p at free still on input hpts", | |||||
hpts, inp); | |||||
mtx_unlock(&hpts->p_mtx); | |||||
} | } | ||||
Done Inline Actionsrefcount_acquire() returns the old value, so you could use that to write a non-racy assertion: u_int old __diagused; old = refcount_acquire(&inp->inp_refcount); KASSERT(old > 0, ...); markj: refcount_acquire() returns the old value, so you could use that to write a non-racy assertion… | |||||
Done Inline ActionsWill do. Thanks! glebius: Will do. Thanks! | |||||
#endif | |||||
INP_WUNLOCK(inp); | |||||
pcbinfo = inp->inp_pcbinfo; | |||||
uma_zfree(pcbinfo->ipi_zone, inp); | |||||
return (1); | |||||
} | |||||
static void | /* | ||||
inpcbport_free(epoch_context_t ctx) | * Drop a refcount on an inpcb elevated using in_pcbref(), potentially | ||||
* freeing the pcb if the reference was the very last one. | |||||
*/ | |||||
bool | |||||
in_pcbrele_rlocked(struct inpcb *inp) | |||||
{ | { | ||||
struct inpcbport *phd; | |||||
phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); | INP_RLOCK_ASSERT(inp); | ||||
free(phd, M_PCB); | |||||
Done Inline ActionsNo need for this assertion, refcount_release() will panic in this case. markj: No need for this assertion, refcount_release() will panic in this case. | |||||
if (refcount_release(&inp->inp_refcount) == 0) | |||||
return (false); | |||||
MPASS(inp->inp_flags & INP_FREED); | |||||
MPASS(inp->inp_socket == NULL); | |||||
MPASS(inp->inp_in_hpts == 0); | |||||
MPASS(inp->inp_in_input == 0); | |||||
INP_RUNLOCK(inp); | |||||
uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); | |||||
return (true); | |||||
} | } | ||||
static void | bool | ||||
in_pcbfree_deferred(epoch_context_t ctx) | in_pcbrele_wlocked(struct inpcb *inp) | ||||
{ | { | ||||
struct inpcb *inp; | |||||
int released __unused; | |||||
inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); | INP_WLOCK_ASSERT(inp); | ||||
INP_WLOCK(inp); | if (refcount_release(&inp->inp_refcount) == 0) | ||||
CURVNET_SET(inp->inp_vnet); | return (false); | ||||
#ifdef INET | |||||
struct ip_moptions *imo = inp->inp_moptions; | MPASS(inp->inp_flags & INP_FREED); | ||||
inp->inp_moptions = NULL; | MPASS(inp->inp_socket == NULL); | ||||
#endif | MPASS(inp->inp_in_hpts == 0); | ||||
/* XXXRW: Do as much as possible here. */ | MPASS(inp->inp_in_input == 0); | ||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) | INP_WUNLOCK(inp); | ||||
Inline comment (not done) — markj: Consider moving the assertions into a subroutine so they aren't duplicated. | |||||
Inline comment (done) — glebius: The HPTS-related ones will go away later, so only two MPASS checks will remain. I'll leave it as is for now, if you don't mind. | |||||
if (inp->inp_sp != NULL) | uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); | ||||
ipsec_delete_pcbpolicy(inp); | return (true); | ||||
#endif | |||||
#ifdef INET6 | |||||
struct ip6_moptions *im6o = NULL; | |||||
if (inp->inp_vflag & INP_IPV6PROTO) { | |||||
ip6_freepcbopts(inp->in6p_outputopts); | |||||
im6o = inp->in6p_moptions; | |||||
inp->in6p_moptions = NULL; | |||||
} | } | ||||
#endif | |||||
if (inp->inp_options) | |||||
(void)m_free(inp->inp_options); | |||||
inp->inp_vflag = 0; | |||||
crfree(inp->inp_cred); | |||||
#ifdef MAC | |||||
mac_inpcb_destroy(inp); | |||||
#endif | |||||
released = in_pcbrele_wlocked(inp); | |||||
MPASS(released); | |||||
#ifdef INET6 | |||||
ip6_freemoptions(im6o); | |||||
#endif | |||||
#ifdef INET | |||||
inp_freemoptions(imo); | |||||
#endif | |||||
CURVNET_RESTORE(); | |||||
} | |||||
/* | /* | ||||
* Unconditionally schedule an inpcb to be freed by decrementing its | * Unconditionally schedule an inpcb to be freed by decrementing its | ||||
* reference count, which should occur only after the inpcb has been detached | * reference count, which should occur only after the inpcb has been detached | ||||
* from its socket. If another thread holds a temporary reference (acquired | * from its socket. If another thread holds a temporary reference (acquired | ||||
* using in_pcbref()) then the free is deferred until that reference is | * using in_pcbref()) then the free is deferred until that reference is | ||||
* released using in_pcbrele(), but the inpcb is still unlocked. Almost all | * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. | ||||
* work, including removal from global lists, is done in this context, where | * Almost all work, including removal from global lists, is done in this | ||||
* the pcbinfo lock is held. | * context, where the pcbinfo lock is held. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbfree(struct inpcb *inp) | in_pcbfree(struct inpcb *inp) | ||||
{ | { | ||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | ||||
#ifdef INET | |||||
struct ip_moptions *imo; | |||||
#endif | |||||
#ifdef INET6 | |||||
struct ip6_moptions *im6o; | |||||
#endif | |||||
INP_WLOCK_ASSERT(inp); | |||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | ||||
KASSERT((inp->inp_flags2 & INP_FREED) == 0, | KASSERT((inp->inp_flags & INP_FREED) == 0, | ||||
("%s: called twice for pcb %p", __func__, inp)); | ("%s: called twice for pcb %p", __func__, inp)); | ||||
if (inp->inp_flags2 & INP_FREED) { | |||||
INP_WUNLOCK(inp); | inp->inp_flags |= INP_FREED; | ||||
return; | INP_INFO_WLOCK(pcbinfo); | ||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
pcbinfo->ipi_count--; | |||||
CK_LIST_REMOVE(inp, inp_list); | |||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
if (inp->inp_flags & INP_INHASHLIST) { | |||||
struct inpcbport *phd = inp->inp_phd; | |||||
INP_HASH_WLOCK(pcbinfo); | |||||
/* XXX: Only do if SO_REUSEPORT_LB set? */ | |||||
in_pcbremlbgrouphash(inp); | |||||
CK_LIST_REMOVE(inp, inp_hash); | |||||
CK_LIST_REMOVE(inp, inp_portlist); | |||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | |||||
CK_LIST_REMOVE(phd, phd_hash); | |||||
uma_zfree_smr(pcbinfo->ipi_portzone, phd); | |||||
} | } | ||||
INP_HASH_WUNLOCK(pcbinfo); | |||||
inp->inp_flags &= ~INP_INHASHLIST; | |||||
} | |||||
INP_WLOCK_ASSERT(inp); | crfree(inp->inp_cred); | ||||
INP_LIST_WLOCK(pcbinfo); | |||||
in_pcbremlists(inp); | |||||
INP_LIST_WUNLOCK(pcbinfo); | |||||
RO_INVALIDATE_CACHE(&inp->inp_route); | RO_INVALIDATE_CACHE(&inp->inp_route); | ||||
/* mark as destruction in progress */ | #ifdef MAC | ||||
inp->inp_flags2 |= INP_FREED; | mac_inpcb_destroy(inp); | ||||
#endif | |||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) | |||||
if (inp->inp_sp != NULL) | |||||
ipsec_delete_pcbpolicy(inp); | |||||
#endif | |||||
#ifdef INET | |||||
if (inp->inp_options) | |||||
(void)m_free(inp->inp_options); | |||||
imo = inp->inp_moptions; | |||||
#endif | |||||
#ifdef INET6 | |||||
if (inp->inp_vflag & INP_IPV6PROTO) { | |||||
ip6_freepcbopts(inp->in6p_outputopts); | |||||
im6o = inp->in6p_moptions; | |||||
} else | |||||
im6o = NULL; | |||||
#endif | |||||
if (__predict_false(in_pcbrele_wlocked(inp) == false)) { | |||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); | |||||
} | } | ||||
#ifdef INET6 | |||||
ip6_freemoptions(im6o); | |||||
#endif | |||||
#ifdef INET | |||||
inp_freemoptions(imo); | |||||
#endif | |||||
} | |||||
/* | /* | ||||
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and | * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and | ||||
* port reservation, and preventing it from being returned by inpcb lookups. | * port reservation, and preventing it from being returned by inpcb lookups. | ||||
* | * | ||||
* It is used by TCP to mark an inpcb as unused and avoid future packet | * It is used by TCP to mark an inpcb as unused and avoid future packet | ||||
* delivery or event notification when a socket remains open but TCP has | * delivery or event notification when a socket remains open but TCP has | ||||
* closed. This might occur as a result of a shutdown()-initiated TCP close | * closed. This might occur as a result of a shutdown()-initiated TCP close | ||||
Show All 23 Lines | if (inp->inp_flags & INP_INHASHLIST) { | ||||
struct inpcbport *phd = inp->inp_phd; | struct inpcbport *phd = inp->inp_phd; | ||||
INP_HASH_WLOCK(inp->inp_pcbinfo); | INP_HASH_WLOCK(inp->inp_pcbinfo); | ||||
in_pcbremlbgrouphash(inp); | in_pcbremlbgrouphash(inp); | ||||
CK_LIST_REMOVE(inp, inp_hash); | CK_LIST_REMOVE(inp, inp_hash); | ||||
CK_LIST_REMOVE(inp, inp_portlist); | CK_LIST_REMOVE(inp, inp_portlist); | ||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | ||||
CK_LIST_REMOVE(phd, phd_hash); | CK_LIST_REMOVE(phd, phd_hash); | ||||
NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); | uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); | ||||
} | } | ||||
INP_HASH_WUNLOCK(inp->inp_pcbinfo); | INP_HASH_WUNLOCK(inp->inp_pcbinfo); | ||||
inp->inp_flags &= ~INP_INHASHLIST; | inp->inp_flags &= ~INP_INHASHLIST; | ||||
} | } | ||||
} | } | ||||
#ifdef INET | #ifdef INET | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | |||||
void | void | ||||
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, | in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, | ||||
struct inpcb *(*notify)(struct inpcb *, int)) | struct inpcb *(*notify)(struct inpcb *, int)) | ||||
{ | { | ||||
struct inpcb *inp, *inp_temp; | struct inpcb *inp, *inp_temp; | ||||
INP_INFO_WLOCK(pcbinfo); | INP_INFO_WLOCK(pcbinfo); | ||||
CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { | CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { | ||||
INP_WLOCK(inp); | INP_WLOCK(inp); | ||||
#ifdef INET6 | #ifdef INET6 | ||||
if ((inp->inp_vflag & INP_IPV4) == 0) { | if ((inp->inp_vflag & INP_IPV4) == 0) { | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
continue; | continue; | ||||
} | } | ||||
#endif | #endif | ||||
if (inp->inp_faddr.s_addr != faddr.s_addr || | if (inp->inp_faddr.s_addr != faddr.s_addr || | ||||
inp->inp_socket == NULL) { | inp->inp_socket == NULL) { | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
continue; | continue; | ||||
} | } | ||||
if ((*notify)(inp, errno)) | if ((*notify)(inp, errno)) | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
} | } | ||||
INP_INFO_WUNLOCK(pcbinfo); | INP_INFO_WUNLOCK(pcbinfo); | ||||
} | } | ||||
static bool | |||||
inp_v4_multi_match(const struct inpcb *inp, void *v __unused) | |||||
{ | |||||
if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) | |||||
return (true); | |||||
else | |||||
return (false); | |||||
} | |||||
void | void | ||||
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) | in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) | ||||
{ | { | ||||
struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, | |||||
inp_v4_multi_match, NULL); | |||||
struct inpcb *inp; | struct inpcb *inp; | ||||
struct in_multi *inm; | struct in_multi *inm; | ||||
struct in_mfilter *imf; | struct in_mfilter *imf; | ||||
struct ip_moptions *imo; | struct ip_moptions *imo; | ||||
INP_INFO_WLOCK(pcbinfo); | IN_MULTI_LOCK_ASSERT(); | ||||
CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { | |||||
INP_WLOCK(inp); | while ((inp = inp_next(&inpi)) != NULL) { | ||||
INP_WLOCK_ASSERT(inp); | |||||
imo = inp->inp_moptions; | imo = inp->inp_moptions; | ||||
if ((inp->inp_vflag & INP_IPV4) && | |||||
imo != NULL) { | |||||
/* | /* | ||||
* Unselect the outgoing interface if it is being | * Unselect the outgoing interface if it is being | ||||
* detached. | * detached. | ||||
*/ | */ | ||||
if (imo->imo_multicast_ifp == ifp) | if (imo->imo_multicast_ifp == ifp) | ||||
imo->imo_multicast_ifp = NULL; | imo->imo_multicast_ifp = NULL; | ||||
/* | /* | ||||
* Drop multicast group membership if we joined | * Drop multicast group membership if we joined | ||||
* through the interface being detached. | * through the interface being detached. | ||||
* | * | ||||
* XXX This can all be deferred to an epoch_call | * XXX This can all be deferred to an epoch_call | ||||
*/ | */ | ||||
restart: | restart: | ||||
IP_MFILTER_FOREACH(imf, &imo->imo_head) { | IP_MFILTER_FOREACH(imf, &imo->imo_head) { | ||||
if ((inm = imf->imf_inm) == NULL) | if ((inm = imf->imf_inm) == NULL) | ||||
continue; | continue; | ||||
if (inm->inm_ifp != ifp) | if (inm->inm_ifp != ifp) | ||||
continue; | continue; | ||||
ip_mfilter_remove(&imo->imo_head, imf); | ip_mfilter_remove(&imo->imo_head, imf); | ||||
IN_MULTI_LOCK_ASSERT(); | |||||
in_leavegroup_locked(inm, NULL); | in_leavegroup_locked(inm, NULL); | ||||
ip_mfilter_free(imf); | ip_mfilter_free(imf); | ||||
goto restart; | goto restart; | ||||
} | } | ||||
} | } | ||||
INP_WUNLOCK(inp); | |||||
} | } | ||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
} | |||||
/* | /* | ||||
* Lookup a PCB based on the local address and port. Caller must hold the | * Lookup a PCB based on the local address and port. Caller must hold the | ||||
* hash lock. No inpcb locks or references are acquired. | * hash lock. No inpcb locks or references are acquired. | ||||
*/ | */ | ||||
#define INP_LOOKUP_MAPPED_PCB_COST 3 | #define INP_LOOKUP_MAPPED_PCB_COST 3 | ||||
struct inpcb * | struct inpcb * | ||||
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, | in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, | ||||
u_short lport, int lookupflags, struct ucred *cred) | u_short lport, int lookupflags, struct ucred *cred) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
#ifdef INET6 | #ifdef INET6 | ||||
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; | int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; | ||||
#else | #else | ||||
int matchwild = 3; | int matchwild = 3; | ||||
#endif | #endif | ||||
int wildcard; | int wildcard; | ||||
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, | KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, | ||||
("%s: invalid lookup flags %d", __func__, lookupflags)); | ("%s: invalid lookup flags %d", __func__, lookupflags)); | ||||
INP_HASH_LOCK_ASSERT(pcbinfo); | INP_HASH_LOCK_ASSERT(pcbinfo); | ||||
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { | if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
/* | /* | ||||
* Look for an unconnected (wildcard foreign addr) PCB that | * Look for an unconnected (wildcard foreign addr) PCB that | ||||
* matches the local address and port we're looking for. | * matches the local address and port we're looking for. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 146 Lines • ▼ Show 20 Lines | #endif | ||||
if (numa_wild != NULL) | if (numa_wild != NULL) | ||||
return (numa_wild); | return (numa_wild); | ||||
return (local_wild); | return (local_wild); | ||||
} | } | ||||
/* | /* | ||||
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes | * Lookup PCB in hash list, using pcbinfo tables. This variation assumes | ||||
* that the caller has locked the hash list, and will not perform any further | * that the caller has either locked the hash list, which usually happens | ||||
* locking or reference operations on either the hash list or the connection. | * for bind(2) operations, or is in an SMR section, which happens when sorting | ||||
* out incoming packets. | |||||
Inline comment (done) — markj: Is this unfinished? | |||||
Inline comment (done) — glebius: Will fix, thanks! | |||||
*/ | */ | ||||
static struct inpcb * | static struct inpcb * | ||||
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, | in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, | ||||
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, | u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, | ||||
struct ifnet *ifp, uint8_t numa_domain) | struct ifnet *ifp, uint8_t numa_domain) | ||||
{ | { | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
struct inpcb *inp, *tmpinp; | struct inpcb *inp, *tmpinp; | ||||
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines | |||||
*/ | */ | ||||
static struct inpcb * | static struct inpcb * | ||||
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, | in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, | ||||
u_int fport, struct in_addr laddr, u_int lport, int lookupflags, | u_int fport, struct in_addr laddr, u_int lport, int lookupflags, | ||||
struct ifnet *ifp, uint8_t numa_domain) | struct ifnet *ifp, uint8_t numa_domain) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
smr_enter(pcbinfo->ipi_smr); | |||||
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, | inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, | ||||
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); | lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); | ||||
if (inp != NULL) { | if (inp != NULL) { | ||||
if (lookupflags & INPLOOKUP_WLOCKPCB) { | if (__predict_false(inp_smr_lock(inp, | ||||
INP_WLOCK(inp); | (lookupflags & INPLOOKUP_LOCKMASK)) == false)) | ||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) { | |||||
INP_RLOCK(inp); | |||||
} else | |||||
panic("%s: locking bug", __func__); | |||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) { | |||||
INP_UNLOCK(inp); | |||||
inp = NULL; | inp = NULL; | ||||
} | } else | ||||
} | smr_exit(pcbinfo->ipi_smr); | ||||
return (inp); | return (inp); | ||||
} | } | ||||
/* | /* | ||||
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf | * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf | ||||
* from which a pre-calculated hash value may be extracted. | * from which a pre-calculated hash value may be extracted. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | #endif | ||||
CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { | CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { | ||||
if (phd->phd_port == inp->inp_lport) | if (phd->phd_port == inp->inp_lport) | ||||
break; | break; | ||||
} | } | ||||
/* | /* | ||||
* If none exists, malloc one and tack it on. | * If none exists, malloc one and tack it on. | ||||
*/ | */ | ||||
if (phd == NULL) { | if (phd == NULL) { | ||||
phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); | phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); | ||||
if (phd == NULL) { | if (phd == NULL) { | ||||
return (ENOBUFS); /* XXX */ | return (ENOBUFS); /* XXX */ | ||||
} | } | ||||
bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); | |||||
phd->phd_port = inp->inp_lport; | phd->phd_port = inp->inp_lport; | ||||
CK_LIST_INIT(&phd->phd_pcblist); | CK_LIST_INIT(&phd->phd_pcblist); | ||||
CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); | CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); | ||||
} | } | ||||
inp->inp_phd = phd; | inp->inp_phd = phd; | ||||
CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); | CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); | ||||
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); | CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); | ||||
inp->inp_flags |= INP_INHASHLIST; | inp->inp_flags |= INP_INHASHLIST; | ||||
return (0); | return (0); | ||||
} | } | ||||
/* | /* | ||||
* Move PCB to the proper hash bucket when { faddr, fport } have been | * Move PCB to the proper hash bucket when { faddr, fport } have been | ||||
* changed. NOTE: This does not handle the case of the lport changing (the | * changed. NOTE: This does not handle the case of the lport changing (the | ||||
* hashed port list would have to be updated as well), so the lport must | * hashed port list would have to be updated as well), so the lport must | ||||
* not change after in_pcbinshash() has been called. | * not change after in_pcbinshash() has been called. | ||||
* | |||||
* XXXGL: a race between this function and an SMR-protected hash iterator | |||||
* will lead to the iterator traversing a possibly wrong hash list. However, | |||||
* this race should have been present ever since the change from rwlock to epoch. | |||||
*/ | */ | ||||
void | void | ||||
in_pcbrehash(struct inpcb *inp) | in_pcbrehash(struct inpcb *inp) | ||||
{ | { | ||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
u_int32_t hashkey_faddr; | u_int32_t hashkey_faddr; | ||||
Show All 13 Lines | #endif | ||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, | head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, | ||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; | inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; | ||||
CK_LIST_REMOVE(inp, inp_hash); | CK_LIST_REMOVE(inp, inp_hash); | ||||
CK_LIST_INSERT_HEAD(head, inp, inp_hash); | CK_LIST_INSERT_HEAD(head, inp, inp_hash); | ||||
} | } | ||||
/* | /* | ||||
* Remove PCB from various lists. | |||||
*/ | |||||
static void | |||||
in_pcbremlists(struct inpcb *inp) | |||||
{ | |||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | |||||
INP_WLOCK_ASSERT(inp); | |||||
INP_LIST_WLOCK_ASSERT(pcbinfo); | |||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
if (inp->inp_flags & INP_INHASHLIST) { | |||||
struct inpcbport *phd = inp->inp_phd; | |||||
INP_HASH_WLOCK(pcbinfo); | |||||
/* XXX: Only do if SO_REUSEPORT_LB set? */ | |||||
in_pcbremlbgrouphash(inp); | |||||
CK_LIST_REMOVE(inp, inp_hash); | |||||
CK_LIST_REMOVE(inp, inp_portlist); | |||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | |||||
CK_LIST_REMOVE(phd, phd_hash); | |||||
NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); | |||||
} | |||||
INP_HASH_WUNLOCK(pcbinfo); | |||||
inp->inp_flags &= ~INP_INHASHLIST; | |||||
} | |||||
CK_LIST_REMOVE(inp, inp_list); | |||||
pcbinfo->ipi_count--; | |||||
} | |||||
/* | |||||
* Check for alternatives when higher level complains | * Check for alternatives when higher level complains | ||||
* about service problems. For now, invalidate cached | * about service problems. For now, invalidate cached | ||||
* routing information. If the route was created dynamically | * routing information. If the route was created dynamically | ||||
* (by a redirect), time to try a default gateway again. | * (by a redirect), time to try a default gateway again. | ||||
*/ | */ | ||||
void | void | ||||
in_losing(struct inpcb *inp) | in_losing(struct inpcb *inp) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines | inp_unlock_assert(struct inpcb *inp) | ||||
INP_UNLOCK_ASSERT(inp); | INP_UNLOCK_ASSERT(inp); | ||||
} | } | ||||
#endif | #endif | ||||
void | void | ||||
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) | inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) | ||||
{ | { | ||||
struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, | |||||
INPLOOKUP_WLOCKPCB); | |||||
struct inpcb *inp; | struct inpcb *inp; | ||||
INP_INFO_WLOCK(&V_tcbinfo); | while ((inp = inp_next(&inpi)) != NULL) | ||||
CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { | |||||
INP_WLOCK(inp); | |||||
func(inp, arg); | func(inp, arg); | ||||
INP_WUNLOCK(inp); | |||||
} | |||||
INP_INFO_WUNLOCK(&V_tcbinfo); | |||||
} | } | ||||
struct socket * | struct socket * | ||||
inp_inpcbtosocket(struct inpcb *inp) | inp_inpcbtosocket(struct inpcb *inp) | ||||
{ | { | ||||
INP_WLOCK_ASSERT(inp); | INP_WLOCK_ASSERT(inp); | ||||
return (inp->inp_socket); | return (inp->inp_socket); | ||||
▲ Show 20 Lines • Show All 634 Lines • Show Last 20 Lines |
inpcbzone_name can be const, BTW.