Changeset View
Changeset View
Standalone View
Standalone View
sys/netinet/in_pcb.c
Show First 20 Lines • Show All 107 Lines • ▼ Show 20 Lines | |||||
#endif | #endif | ||||
#include <netipsec/ipsec_support.h> | #include <netipsec/ipsec_support.h> | ||||
#include <security/mac/mac_framework.h> | #include <security/mac/mac_framework.h> | ||||
#define INPCBLBGROUP_SIZMIN 8 | #define INPCBLBGROUP_SIZMIN 8 | ||||
#define INPCBLBGROUP_SIZMAX 256 | #define INPCBLBGROUP_SIZMAX 256 | ||||
#define INP_FREED 0x00000200 /* See in_pcb.h. */ | |||||
static struct callout ipport_tick_callout; | static struct callout ipport_tick_callout; | ||||
/* | /* | ||||
* These configure the range of local port addresses assigned to | * These configure the range of local port addresses assigned to | ||||
* "unspecified" outgoing connections/packets/whatever. | * "unspecified" outgoing connections/packets/whatever. | ||||
*/ | */ | ||||
VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ | VNET_DEFINE(int, ipport_lowfirstauto) = IPPORT_RESERVED - 1; /* 1023 */ | ||||
Show All 16 Lines | |||||
VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ | VNET_DEFINE(int, ipport_randomcps) = 10; /* user controlled via sysctl */ | ||||
VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | VNET_DEFINE(int, ipport_randomtime) = 45; /* user controlled via sysctl */ | ||||
VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | VNET_DEFINE(int, ipport_stoprandom); /* toggled by ipport_tick */ | ||||
VNET_DEFINE(int, ipport_tcpallocs); | VNET_DEFINE(int, ipport_tcpallocs); | ||||
VNET_DEFINE_STATIC(int, ipport_tcplastcount); | VNET_DEFINE_STATIC(int, ipport_tcplastcount); | ||||
#define V_ipport_tcplastcount VNET(ipport_tcplastcount) | #define V_ipport_tcplastcount VNET(ipport_tcplastcount) | ||||
static void in_pcbremlists(struct inpcb *inp); | |||||
#ifdef INET | #ifdef INET | ||||
static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, | ||||
struct in_addr faddr, u_int fport_arg, | struct in_addr faddr, u_int fport_arg, | ||||
struct in_addr laddr, u_int lport_arg, | struct in_addr laddr, u_int lport_arg, | ||||
int lookupflags, struct ifnet *ifp, | int lookupflags, struct ifnet *ifp, | ||||
uint8_t numa_domain); | uint8_t numa_domain); | ||||
#define RANGECHK(var, min, max) \ | #define RANGECHK(var, min, max) \ | ||||
▲ Show 20 Lines • Show All 352 Lines • ▼ Show 20 Lines | |||||
static void | static void | ||||
inpcb_fini(void *mem, int size) | inpcb_fini(void *mem, int size) | ||||
{ | { | ||||
struct inpcb *inp = mem; | struct inpcb *inp = mem; | ||||
INP_LOCK_DESTROY(inp); | INP_LOCK_DESTROY(inp); | ||||
} | } | ||||
/* Make sure it is safe to use hashinit(9) on CK_LIST. */ | |||||
CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); | |||||
/* | /* | ||||
* Initialize an inpcbinfo -- we should be able to reduce the number of | * Initialize an inpcbinfo -- we should be able to reduce the number of | ||||
* arguments in time. | * arguments in time. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, | in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, | ||||
struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, | u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, | ||||
char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) | uma_init inpcbzone_init) | ||||
{ | { | ||||
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); | mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); | ||||
mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); | |||||
INP_INFO_LOCK_INIT(pcbinfo, name); | |||||
INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? */ | |||||
INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); | |||||
#ifdef VIMAGE | #ifdef VIMAGE | ||||
pcbinfo->ipi_vnet = curvnet; | pcbinfo->ipi_vnet = curvnet; | ||||
#endif | #endif | ||||
pcbinfo->ipi_listhead = listhead; | CK_LIST_INIT(&pcbinfo->ipi_listhead); | ||||
CK_LIST_INIT(pcbinfo->ipi_listhead); | |||||
pcbinfo->ipi_count = 0; | pcbinfo->ipi_count = 0; | ||||
pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, | pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, | ||||
&pcbinfo->ipi_hashmask); | &pcbinfo->ipi_hashmask); | ||||
porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); | |||||
pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, | pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, | ||||
&pcbinfo->ipi_porthashmask); | &pcbinfo->ipi_porthashmask); | ||||
pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, | pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, | ||||
&pcbinfo->ipi_lbgrouphashmask); | &pcbinfo->ipi_lbgrouphashmask); | ||||
pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), | pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), | ||||
NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); | NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, | ||||
UMA_ZONE_SMR); | |||||
uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); | uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); | ||||
uma_zone_set_warning(pcbinfo->ipi_zone, | uma_zone_set_warning(pcbinfo->ipi_zone, | ||||
"kern.ipc.maxsockets limit reached"); | "kern.ipc.maxsockets limit reached"); | ||||
pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); | |||||
pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, | |||||
sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); | |||||
uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); | |||||
} | } | ||||
/* | /* | ||||
* Destroy an inpcbinfo. | * Destroy an inpcbinfo. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) | in_pcbinfo_destroy(struct inpcbinfo *pcbinfo) | ||||
{ | { | ||||
KASSERT(pcbinfo->ipi_count == 0, | KASSERT(pcbinfo->ipi_count == 0, | ||||
("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); | ("%s: ipi_count = %u", __func__, pcbinfo->ipi_count)); | ||||
hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); | hashdestroy(pcbinfo->ipi_hashbase, M_PCB, pcbinfo->ipi_hashmask); | ||||
hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, | hashdestroy(pcbinfo->ipi_porthashbase, M_PCB, | ||||
pcbinfo->ipi_porthashmask); | pcbinfo->ipi_porthashmask); | ||||
hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, | hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, | ||||
pcbinfo->ipi_lbgrouphashmask); | pcbinfo->ipi_lbgrouphashmask); | ||||
uma_zdestroy(pcbinfo->ipi_zone); | uma_zdestroy(pcbinfo->ipi_zone); | ||||
INP_LIST_LOCK_DESTROY(pcbinfo); | mtx_destroy(&pcbinfo->ipi_hash_lock); | ||||
INP_HASH_LOCK_DESTROY(pcbinfo); | mtx_destroy(&pcbinfo->ipi_lock); | ||||
INP_INFO_LOCK_DESTROY(pcbinfo); | |||||
} | } | ||||
/* | /* | ||||
* Allocate a PCB and associate it with the socket. | * Allocate a PCB and associate it with the socket. | ||||
* On success return with the PCB locked. | * On success return with the PCB locked. | ||||
*/ | */ | ||||
int | int | ||||
in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) | in_pcballoc(struct socket *so, struct inpcbinfo *pcbinfo) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
int error; | int error; | ||||
error = 0; | error = 0; | ||||
inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); | inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); | ||||
if (inp == NULL) | if (inp == NULL) | ||||
return (ENOBUFS); | return (ENOBUFS); | ||||
bzero(&inp->inp_start_zero, inp_zero_size); | bzero(&inp->inp_start_zero, inp_zero_size); | ||||
#ifdef NUMA | #ifdef NUMA | ||||
inp->inp_numa_domain = M_NODOM; | inp->inp_numa_domain = M_NODOM; | ||||
#endif | #endif | ||||
inp->inp_pcbinfo = pcbinfo; | inp->inp_pcbinfo = pcbinfo; | ||||
inp->inp_socket = so; | inp->inp_socket = so; | ||||
Show All 15 Lines | #endif | ||||
} | } | ||||
#endif /*IPSEC*/ | #endif /*IPSEC*/ | ||||
#ifdef INET6 | #ifdef INET6 | ||||
if (INP_SOCKAF(so) == AF_INET6) { | if (INP_SOCKAF(so) == AF_INET6) { | ||||
inp->inp_vflag |= INP_IPV6PROTO; | inp->inp_vflag |= INP_IPV6PROTO; | ||||
if (V_ip6_v6only) | if (V_ip6_v6only) | ||||
inp->inp_flags |= IN6P_IPV6_V6ONLY; | inp->inp_flags |= IN6P_IPV6_V6ONLY; | ||||
} | } | ||||
#endif | |||||
INP_WLOCK(inp); | |||||
INP_LIST_WLOCK(pcbinfo); | |||||
CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, inp_list); | |||||
pcbinfo->ipi_count++; | |||||
so->so_pcb = (caddr_t)inp; | |||||
#ifdef INET6 | |||||
if (V_ip6_auto_flowlabel) | if (V_ip6_auto_flowlabel) | ||||
inp->inp_flags |= IN6P_AUTOFLOWLABEL; | inp->inp_flags |= IN6P_AUTOFLOWLABEL; | ||||
#endif | #endif | ||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ | |||||
/* | /* | ||||
* Routes in inpcb's can cache L2 as well; they are guaranteed | * Routes in inpcb's can cache L2 as well; they are guaranteed | ||||
* to be cleaned up. | * to be cleaned up. | ||||
*/ | */ | ||||
inp->inp_route.ro_flags = RT_LLE_CACHE; | inp->inp_route.ro_flags = RT_LLE_CACHE; | ||||
INP_LIST_WUNLOCK(pcbinfo); | #ifdef TCPHPTS | ||||
/* | |||||
* If using hpts, let's drop a random number in so | | | | |
* not all new connections fall on the same CPU. | |||||
*/ | |||||
inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp); | |||||
#endif | |||||
refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ | |||||
INP_WLOCK(inp); | |||||
INP_INFO_WLOCK(pcbinfo); | |||||
pcbinfo->ipi_count++; | |||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); | |||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
so->so_pcb = inp; | |||||
return (0); | |||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) | #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) | ||||
out: | out: | ||||
if (error != 0) { | |||||
crfree(inp->inp_cred); | crfree(inp->inp_cred); | ||||
uma_zfree(pcbinfo->ipi_zone, inp); | uma_zfree_smr(pcbinfo->ipi_zone, inp); | ||||
} | |||||
#endif | |||||
return (error); | return (error); | ||||
#endif | |||||
} | } | ||||
#ifdef INET | #ifdef INET | ||||
int | int | ||||
in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) | in_pcbbind(struct inpcb *inp, struct sockaddr *nam, struct ucred *cred) | ||||
{ | { | ||||
int anonport, error; | int anonport, error; | ||||
▲ Show 20 Lines • Show All 850 Lines • ▼ Show 20 Lines | #ifdef RATELIMIT | ||||
if (inp->inp_snd_tag != NULL) | if (inp->inp_snd_tag != NULL) | ||||
in_pcbdetach_txrtlmt(inp); | in_pcbdetach_txrtlmt(inp); | ||||
#endif | #endif | ||||
inp->inp_socket->so_pcb = NULL; | inp->inp_socket->so_pcb = NULL; | ||||
inp->inp_socket = NULL; | inp->inp_socket = NULL; | ||||
} | } | ||||
/* | /* | ||||
* in_pcbref() bumps the reference count on an inpcb in order to maintain | * inpcb hash lookups are protected by SMR section. | ||||
* stability of an inpcb pointer despite the inpcb lock being released. This | |||||
* is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, | |||||
* but where the inpcb lock may already held. | |||||
* | * | ||||
* in_pcbref() should be used only to provide brief memory stability, and | * Once desired pcb has been found, switching from SMR section to a pcb | ||||
* must always be followed by a call to INP_WLOCK() and in_pcbrele() to | * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK | ||||
* garbage collect the inpcb if it has been in_pcbfree()'d from another | * here because SMR is a critical section. | ||||
* context. Until in_pcbrele() has returned that the inpcb is still valid, | * In 99%+ cases inp_smr_lock() would obtain the lock immediately. | ||||
* lock and rele are the *only* safe operations that may be performed on the | |||||
* inpcb. | |||||
* | |||||
* While the inpcb will not be freed, releasing the inpcb lock means that the | |||||
* connection's state may change, so the caller should be careful to | |||||
* revalidate any cached state on reacquiring the lock. Drop the reference | |||||
* using in_pcbrele(). | |||||
*/ | */ | ||||
void | static inline void | ||||
in_pcbref(struct inpcb *inp) | inp_lock(struct inpcb *inp, const inp_lookup_t lock) | ||||
{ | { | ||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | lock == INPLOOKUP_RLOCKPCB ? | ||||
rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); | |||||
} | |||||
refcount_acquire(&inp->inp_refcount); | static inline void | ||||
inp_unlock(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
lock == INPLOOKUP_RLOCKPCB ? | |||||
rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); | |||||
} | } | ||||
/* | static inline int | ||||
* Drop a refcount on an inpcb elevated using in_pcbref(); because a call to | inp_trylock(struct inpcb *inp, const inp_lookup_t lock) | ||||
* in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we | |||||
* return a flag indicating whether or not the inpcb remains valid. If it is | |||||
* valid, we return with the inpcb lock held. | |||||
* | |||||
* Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a | |||||
* reference on an inpcb. Historically more work was done here (actually, in | |||||
* in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the | |||||
* need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely | |||||
* about memory stability (and continued use of the write lock). | |||||
*/ | |||||
int | |||||
in_pcbrele_rlocked(struct inpcb *inp) | |||||
{ | { | ||||
struct inpcbinfo *pcbinfo; | |||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | return (lock == INPLOOKUP_RLOCKPCB ? | ||||
rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); | |||||
} | |||||
INP_RLOCK_ASSERT(inp); | static inline bool | ||||
in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
if (refcount_release(&inp->inp_refcount) == 0) { | return (lock == INPLOOKUP_RLOCKPCB ? | ||||
in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); | |||||
} | |||||
bool | |||||
inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) | |||||
{ | |||||
MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); | |||||
SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); | |||||
if (__predict_true(inp_trylock(inp, lock))) { | |||||
if (__predict_false(inp->inp_flags & INP_FREED)) { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
inp_unlock(inp, lock); | |||||
return (false); | |||||
} | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
return (true); | |||||
} | |||||
if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
inp_lock(inp, lock); | |||||
if (__predict_false(in_pcbrele(inp, lock))) | |||||
return (false); | |||||
/* | /* | ||||
* If the inpcb has been freed, let the caller know, even if | * An inp acquired through refcount & lock certainly has not gone | | | | |
* this isn't the last reference. | * through uma_zfree(). However, it may have already gone | | | | |
* through in_pcbfree() and have another reference that | | | | |
* prevented its release by our in_pcbrele(). | | | | |
*/ | */ | ||||
if (inp->inp_flags2 & INP_FREED) { | if (__predict_false(inp->inp_flags & INP_FREED)) { | ||||
INP_RUNLOCK(inp); | inp_unlock(inp, lock); | ||||
return (1); | return (false); | ||||
} | } | ||||
return (0); | return (true); | ||||
} else { | |||||
smr_exit(inp->inp_pcbinfo->ipi_smr); | |||||
return (false); | |||||
} | } | ||||
} | |||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | |||||
#ifdef TCPHPTS | |||||
if (inp->inp_in_hpts || inp->inp_in_input) { | |||||
struct tcp_hpts_entry *hpts; | |||||
/* | /* | ||||
* We should not be on the hpts at | * inp_next() - inpcb hash/list traversal iterator | ||||
* this point in any form. we must | * | ||||
* get the lock to be sure. | * Requires initialized struct inpcb_iterator for context. | ||||
* The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). | |||||
* | |||||
* - Iterator can have either write-lock or read-lock semantics, that can not | |||||
* be changed later. | |||||
* - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through | |||||
* a single hash slot. Note: only rip_input() does the latter. | |||||
* - Iterator may have optional bool matching function. The matching function | |||||
* will be executed for each inpcb in the SMR context, so it can not acquire | |||||
* locks and can safely access only immutable fields of inpcb. | |||||
* | |||||
* A freshly initialized iterator has a NULL inpcb in its context, which | | | | |
* means that an inp_next() call will return the very first inpcb on the | | | | |
* list, locked with the desired semantics. In all following calls the | | | | |
* context pointer shall hold the current inpcb pointer. The KPI user is not | | | | |
* supposed to unlock the current inpcb! Upon end of traversal inp_next() | | | | |
* will return NULL and write NULL to its context. After the end of | | | | |
* traversal an iterator can be reused. | | | | |
* | |||||
* List traversals have the following features/constraints: | |||||
* - New entries won't be seen, as they are always added to the head of a list. | |||||
* - Removed entries won't stop traversal as long as they are not added to | |||||
* a different list. This is violated by in_pcbrehash(). | |||||
*/ | */ | ||||
hpts = tcp_hpts_lock(inp); | #define II_LIST_FIRST(ipi, hash) \ | ||||
if (inp->inp_in_hpts) | (((hash) == INP_ALL_LIST) ? \ | ||||
panic("Hpts:%p inp:%p at free still on hpts", | CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ | ||||
hpts, inp); | CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) | ||||
mtx_unlock(&hpts->p_mtx); | #define II_LIST_NEXT(inp, hash) \ | ||||
hpts = tcp_input_lock(inp); | (((hash) == INP_ALL_LIST) ? \ | ||||
if (inp->inp_in_input) | CK_LIST_NEXT((inp), inp_list) : \ | ||||
panic("Hpts:%p inp:%p at free still on input hpts", | CK_LIST_NEXT((inp), inp_hash)) | ||||
hpts, inp); | #define II_LOCK_ASSERT(inp, lock) \ | ||||
mtx_unlock(&hpts->p_mtx); | rw_assert(&(inp)->inp_lock, \ | ||||
(lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) | |||||
struct inpcb * | |||||
inp_next(struct inpcb_iterator *ii) | |||||
{ | |||||
const struct inpcbinfo *ipi = ii->ipi; | |||||
inp_match_t *match = ii->match; | |||||
void *ctx = ii->ctx; | |||||
inp_lookup_t lock = ii->lock; | |||||
int hash = ii->hash; | |||||
struct inpcb *inp; | |||||
if (ii->inp == NULL) { /* First call. */ | |||||
smr_enter(ipi->ipi_smr); | |||||
/* This is unrolled CK_LIST_FOREACH(). */ | |||||
for (inp = II_LIST_FIRST(ipi, hash); | |||||
inp != NULL; | |||||
inp = II_LIST_NEXT(inp, hash)) { | |||||
if (match != NULL && (match)(inp, ctx) == false) | |||||
continue; | |||||
if (__predict_true(inp_smr_lock(inp, lock))) | |||||
break; | |||||
else { | |||||
smr_enter(ipi->ipi_smr); | |||||
MPASS(inp != II_LIST_FIRST(ipi, hash)); | |||||
inp = II_LIST_FIRST(ipi, hash); | |||||
} | } | ||||
#endif | |||||
INP_RUNLOCK(inp); | |||||
pcbinfo = inp->inp_pcbinfo; | |||||
uma_zfree(pcbinfo->ipi_zone, inp); | |||||
return (1); | |||||
} | } | ||||
int | if (inp == NULL) | ||||
in_pcbrele_wlocked(struct inpcb *inp) | smr_exit(ipi->ipi_smr); | ||||
{ | else | ||||
struct inpcbinfo *pcbinfo; | ii->inp = inp; | ||||
KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); | return (inp); | ||||
} | |||||
INP_WLOCK_ASSERT(inp); | /* Not a first call. */ | ||||
smr_enter(ipi->ipi_smr); | |||||
restart: | |||||
inp = ii->inp; | |||||
II_LOCK_ASSERT(inp, lock); | |||||
next: | |||||
inp = II_LIST_NEXT(inp, hash); | |||||
if (inp == NULL) { | |||||
smr_exit(ipi->ipi_smr); | |||||
goto found; | |||||
} | |||||
if (refcount_release(&inp->inp_refcount) == 0) { | if (match != NULL && (match)(inp, ctx) == false) | ||||
goto next; | |||||
if (__predict_true(inp_trylock(inp, lock))) { | |||||
if (__predict_false(inp->inp_flags & INP_FREED)) { | |||||
/* | /* | ||||
* If the inpcb has been freed, let the caller know, even if | * Entries are never inserted in the middle of a list, thus | | | | |
* this isn't the last reference. | * as long as we are in SMR, we can continue traversal. | | | | |
* Jumping to 'restart' should yield the same result, | | | | |
* but could produce unnecessary looping. Could this | | | | |
* looping be unbounded? | | | | |
*/ | */ | ||||
if (inp->inp_flags2 & INP_FREED) { | inp_unlock(inp, lock); | ||||
INP_WUNLOCK(inp); | goto next; | ||||
return (1); | } else { | ||||
smr_exit(ipi->ipi_smr); | |||||
goto found; | |||||
} | } | ||||
return (0); | |||||
} | } | ||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | |||||
#ifdef TCPHPTS | |||||
if (inp->inp_in_hpts || inp->inp_in_input) { | |||||
struct tcp_hpts_entry *hpts; | |||||
/* | /* | ||||
* We should not be on the hpts at | * Can't obtain lock immediately, thus going hard. Once we exit the | ||||
* this point in any form. we must | * SMR section we can no longer jump to 'next', and our only stable | ||||
* get the lock to be sure. | * anchoring point is ii->inp, which we keep locked for this case, so | ||||
* we jump to 'restart'. | |||||
*/ | */ | ||||
hpts = tcp_hpts_lock(inp); | if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { | ||||
if (inp->inp_in_hpts) | smr_exit(ipi->ipi_smr); | ||||
panic("Hpts:%p inp:%p at free still on hpts", | inp_lock(inp, lock); | ||||
hpts, inp); | if (__predict_false(in_pcbrele(inp, lock))) { | ||||
mtx_unlock(&hpts->p_mtx); | smr_enter(ipi->ipi_smr); | ||||
hpts = tcp_input_lock(inp); | goto restart; | ||||
if (inp->inp_in_input) | |||||
panic("Hpts:%p inp:%p at free still on input hpts", | |||||
hpts, inp); | |||||
mtx_unlock(&hpts->p_mtx); | |||||
} | } | ||||
#endif | /* | ||||
INP_WUNLOCK(inp); | * See comment in inp_smr_lock(). | ||||
pcbinfo = inp->inp_pcbinfo; | */ | ||||
uma_zfree(pcbinfo->ipi_zone, inp); | if (__predict_false(inp->inp_flags & INP_FREED)) { | ||||
return (1); | inp_unlock(inp, lock); | ||||
smr_enter(ipi->ipi_smr); | |||||
goto restart; | |||||
} | } | ||||
} else | |||||
goto next; | |||||
static void | found: | ||||
inpcbport_free(epoch_context_t ctx) | inp_unlock(ii->inp, lock); | ||||
ii->inp = inp; | |||||
return (ii->inp); | |||||
} | |||||
/* | |||||
* in_pcbref() bumps the reference count on an inpcb in order to maintain | |||||
* stability of an inpcb pointer despite the inpcb lock being released or | |||||
* SMR section exited. | |||||
* | |||||
* To free a reference later in_pcbrele_(r|w)locked() must be performed. | |||||
*/ | |||||
void | |||||
in_pcbref(struct inpcb *inp) | |||||
{ | { | ||||
struct inpcbport *phd; | u_int old __diagused; | ||||
phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); | old = refcount_acquire(&inp->inp_refcount); | ||||
free(phd, M_PCB); | KASSERT(old > 0, ("%s: refcount 0", __func__)); | ||||
} | } | ||||
static void | /* | ||||
in_pcbfree_deferred(epoch_context_t ctx) | * Drop a refcount on an inpcb elevated using in_pcbref(), potentially | ||||
* freeing the pcb, if the reference was the very last one. | | | | |
*/ | |||||
bool | |||||
in_pcbrele_rlocked(struct inpcb *inp) | |||||
{ | { | ||||
struct inpcb *inp; | |||||
int released __unused; | |||||
inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); | INP_RLOCK_ASSERT(inp); | ||||
INP_WLOCK(inp); | if (refcount_release(&inp->inp_refcount) == 0) | ||||
CURVNET_SET(inp->inp_vnet); | return (false); | ||||
#ifdef INET | |||||
struct ip_moptions *imo = inp->inp_moptions; | MPASS(inp->inp_flags & INP_FREED); | ||||
inp->inp_moptions = NULL; | MPASS(inp->inp_socket == NULL); | ||||
#endif | MPASS(inp->inp_in_hpts == 0); | ||||
/* XXXRW: Do as much as possible here. */ | MPASS(inp->inp_in_input == 0); | ||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) | INP_RUNLOCK(inp); | ||||
if (inp->inp_sp != NULL) | uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); | ||||
ipsec_delete_pcbpolicy(inp); | return (true); | ||||
#endif | |||||
#ifdef INET6 | |||||
struct ip6_moptions *im6o = NULL; | |||||
if (inp->inp_vflag & INP_IPV6PROTO) { | |||||
ip6_freepcbopts(inp->in6p_outputopts); | |||||
im6o = inp->in6p_moptions; | |||||
inp->in6p_moptions = NULL; | |||||
} | } | ||||
#endif | |||||
if (inp->inp_options) | bool | ||||
(void)m_free(inp->inp_options); | in_pcbrele_wlocked(struct inpcb *inp) | ||||
inp->inp_vflag = 0; | { | ||||
crfree(inp->inp_cred); | |||||
#ifdef MAC | INP_WLOCK_ASSERT(inp); | ||||
mac_inpcb_destroy(inp); | |||||
#endif | if (refcount_release(&inp->inp_refcount) == 0) | ||||
released = in_pcbrele_wlocked(inp); | return (false); | ||||
MPASS(released); | |||||
#ifdef INET6 | MPASS(inp->inp_flags & INP_FREED); | ||||
ip6_freemoptions(im6o); | MPASS(inp->inp_socket == NULL); | ||||
#endif | MPASS(inp->inp_in_hpts == 0); | ||||
#ifdef INET | MPASS(inp->inp_in_input == 0); | ||||
inp_freemoptions(imo); | INP_WUNLOCK(inp); | ||||
#endif | uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); | ||||
CURVNET_RESTORE(); | return (true); | ||||
} | } | ||||
/* | /* | ||||
* Unconditionally schedule an inpcb to be freed by decrementing its | * Unconditionally schedule an inpcb to be freed by decrementing its | ||||
* reference count, which should occur only after the inpcb has been detached | * reference count, which should occur only after the inpcb has been detached | ||||
* from its socket. If another thread holds a temporary reference (acquired | * from its socket. If another thread holds a temporary reference (acquired | ||||
* using in_pcbref()) then the free is deferred until that reference is | * using in_pcbref()) then the free is deferred until that reference is | ||||
* released using in_pcbrele(), but the inpcb is still unlocked. Almost all | * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. | ||||
* work, including removal from global lists, is done in this context, where | * Almost all work, including removal from global lists, is done in this | ||||
* the pcbinfo lock is held. | * context, where the pcbinfo lock is held. | ||||
*/ | */ | ||||
void | void | ||||
in_pcbfree(struct inpcb *inp) | in_pcbfree(struct inpcb *inp) | ||||
{ | { | ||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | ||||
#ifdef INET | |||||
struct ip_moptions *imo; | |||||
#endif | |||||
#ifdef INET6 | |||||
struct ip6_moptions *im6o; | |||||
#endif | |||||
INP_WLOCK_ASSERT(inp); | |||||
KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); | ||||
KASSERT((inp->inp_flags2 & INP_FREED) == 0, | KASSERT((inp->inp_flags & INP_FREED) == 0, | ||||
("%s: called twice for pcb %p", __func__, inp)); | ("%s: called twice for pcb %p", __func__, inp)); | ||||
if (inp->inp_flags2 & INP_FREED) { | |||||
INP_WUNLOCK(inp); | inp->inp_flags |= INP_FREED; | ||||
return; | INP_INFO_WLOCK(pcbinfo); | ||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
pcbinfo->ipi_count--; | |||||
CK_LIST_REMOVE(inp, inp_list); | |||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
if (inp->inp_flags & INP_INHASHLIST) { | |||||
struct inpcbport *phd = inp->inp_phd; | |||||
INP_HASH_WLOCK(pcbinfo); | |||||
/* XXX: Only do if SO_REUSEPORT_LB set? */ | |||||
in_pcbremlbgrouphash(inp); | |||||
CK_LIST_REMOVE(inp, inp_hash); | |||||
CK_LIST_REMOVE(inp, inp_portlist); | |||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | |||||
CK_LIST_REMOVE(phd, phd_hash); | |||||
uma_zfree_smr(pcbinfo->ipi_portzone, phd); | |||||
} | } | ||||
INP_HASH_WUNLOCK(pcbinfo); | |||||
inp->inp_flags &= ~INP_INHASHLIST; | |||||
} | |||||
INP_WLOCK_ASSERT(inp); | crfree(inp->inp_cred); | ||||
INP_LIST_WLOCK(pcbinfo); | |||||
in_pcbremlists(inp); | |||||
INP_LIST_WUNLOCK(pcbinfo); | |||||
RO_INVALIDATE_CACHE(&inp->inp_route); | RO_INVALIDATE_CACHE(&inp->inp_route); | ||||
/* mark as destruction in progress */ | #ifdef MAC | ||||
inp->inp_flags2 |= INP_FREED; | mac_inpcb_destroy(inp); | ||||
#endif | |||||
#if defined(IPSEC) || defined(IPSEC_SUPPORT) | |||||
if (inp->inp_sp != NULL) | |||||
ipsec_delete_pcbpolicy(inp); | |||||
#endif | |||||
#ifdef INET | |||||
if (inp->inp_options) | |||||
(void)m_free(inp->inp_options); | |||||
imo = inp->inp_moptions; | |||||
#endif | |||||
#ifdef INET6 | |||||
if (inp->inp_vflag & INP_IPV6PROTO) { | |||||
ip6_freepcbopts(inp->in6p_outputopts); | |||||
im6o = inp->in6p_moptions; | |||||
} else | |||||
im6o = NULL; | |||||
#endif | |||||
if (__predict_false(in_pcbrele_wlocked(inp) == false)) { | |||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); | |||||
} | } | ||||
#ifdef INET6 | |||||
ip6_freemoptions(im6o); | |||||
#endif | |||||
#ifdef INET | |||||
inp_freemoptions(imo); | |||||
#endif | |||||
} | |||||
/* | /* | ||||
* in_pcbdrop() removes an inpcb from hashed lists, releasing its address and | * in_pcbdrop() removes an inpcb from hashed lists, releasing its address and | ||||
* port reservation, and preventing it from being returned by inpcb lookups. | * port reservation, and preventing it from being returned by inpcb lookups. | ||||
* | * | ||||
* It is used by TCP to mark an inpcb as unused and avoid future packet | * It is used by TCP to mark an inpcb as unused and avoid future packet | ||||
* delivery or event notification when a socket remains open but TCP has | * delivery or event notification when a socket remains open but TCP has | ||||
* closed. This might occur as a result of a shutdown()-initiated TCP close | * closed. This might occur as a result of a shutdown()-initiated TCP close | ||||
Show All 23 Lines | if (inp->inp_flags & INP_INHASHLIST) { | ||||
struct inpcbport *phd = inp->inp_phd; | struct inpcbport *phd = inp->inp_phd; | ||||
INP_HASH_WLOCK(inp->inp_pcbinfo); | INP_HASH_WLOCK(inp->inp_pcbinfo); | ||||
in_pcbremlbgrouphash(inp); | in_pcbremlbgrouphash(inp); | ||||
CK_LIST_REMOVE(inp, inp_hash); | CK_LIST_REMOVE(inp, inp_hash); | ||||
CK_LIST_REMOVE(inp, inp_portlist); | CK_LIST_REMOVE(inp, inp_portlist); | ||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | ||||
CK_LIST_REMOVE(phd, phd_hash); | CK_LIST_REMOVE(phd, phd_hash); | ||||
NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); | uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); | ||||
} | } | ||||
INP_HASH_WUNLOCK(inp->inp_pcbinfo); | INP_HASH_WUNLOCK(inp->inp_pcbinfo); | ||||
inp->inp_flags &= ~INP_INHASHLIST; | inp->inp_flags &= ~INP_INHASHLIST; | ||||
} | } | ||||
} | } | ||||
#ifdef INET | #ifdef INET | ||||
/* | /* | ||||
▲ Show 20 Lines • Show All 54 Lines • ▼ Show 20 Lines | |||||
void | void | ||||
in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, | in_pcbnotifyall(struct inpcbinfo *pcbinfo, struct in_addr faddr, int errno, | ||||
struct inpcb *(*notify)(struct inpcb *, int)) | struct inpcb *(*notify)(struct inpcb *, int)) | ||||
{ | { | ||||
struct inpcb *inp, *inp_temp; | struct inpcb *inp, *inp_temp; | ||||
INP_INFO_WLOCK(pcbinfo); | INP_INFO_WLOCK(pcbinfo); | ||||
CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { | CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { | ||||
INP_WLOCK(inp); | INP_WLOCK(inp); | ||||
#ifdef INET6 | #ifdef INET6 | ||||
if ((inp->inp_vflag & INP_IPV4) == 0) { | if ((inp->inp_vflag & INP_IPV4) == 0) { | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
continue; | continue; | ||||
} | } | ||||
#endif | #endif | ||||
if (inp->inp_faddr.s_addr != faddr.s_addr || | if (inp->inp_faddr.s_addr != faddr.s_addr || | ||||
inp->inp_socket == NULL) { | inp->inp_socket == NULL) { | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
continue; | continue; | ||||
} | } | ||||
if ((*notify)(inp, errno)) | if ((*notify)(inp, errno)) | ||||
INP_WUNLOCK(inp); | INP_WUNLOCK(inp); | ||||
} | } | ||||
INP_INFO_WUNLOCK(pcbinfo); | INP_INFO_WUNLOCK(pcbinfo); | ||||
} | } | ||||
static bool | |||||
inp_v4_multi_match(const struct inpcb *inp, void *v __unused) | |||||
{ | |||||
if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) | |||||
return (true); | |||||
else | |||||
return (false); | |||||
} | |||||
/*
 * in_pcbpurgeif0: strip references to interface ifp from the multicast
 * options of the IPv4 pcbs in pcbinfo.
 *
 * For each pcb that has INP_IPV4 set and non-NULL inp_moptions:
 *  - imo_multicast_ifp is cleared when it equals ifp;
 *  - every filter on imo_head whose group (imf_inm) has inm_ifp == ifp is
 *    removed from the list, the group is left via in_leavegroup_locked()
 *    and the filter is freed with ip_mfilter_free().
 *
 * NOTE(review): rows carry two cells separated by '|' that look like two
 * revisions of the function (presumably before / after); rows with a
 * single cell belong to only one of them.  As far as the rows show:
 *  - one revision takes INP_INFO_WLOCK, walks ipi_listhead with
 *    CK_LIST_FOREACH, takes INP_WLOCK on each pcb, tests inp_vflag and imo
 *    inline, asserts the IN_MULTI lock right before leaving a group, and
 *    unlocks the pcb and the info lock itself;
 *  - the other asserts the IN_MULTI lock once up front and walks pcbs with
 *    inp_next() on an iterator built from INP_ITERATOR(pcbinfo,
 *    INPLOOKUP_WLOCKPCB, inp_v4_multi_match, NULL), only asserting that
 *    each returned pcb is write-locked; no explicit unlock appears in it.
 */
void | void | ||||
in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) | in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) | ||||
{ | { | ||||
struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, | |||||
inp_v4_multi_match, NULL); | |||||
struct inpcb *inp; | struct inpcb *inp; | ||||
struct in_multi *inm; | struct in_multi *inm; | ||||
struct in_mfilter *imf; | struct in_mfilter *imf; | ||||
struct ip_moptions *imo; | struct ip_moptions *imo; | ||||
INP_INFO_WLOCK(pcbinfo); | IN_MULTI_LOCK_ASSERT(); | ||||
CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { | |||||
INP_WLOCK(inp); | while ((inp = inp_next(&inpi)) != NULL) { | ||||
INP_WLOCK_ASSERT(inp); | |||||
imo = inp->inp_moptions; | imo = inp->inp_moptions; | ||||
if ((inp->inp_vflag & INP_IPV4) && | |||||
imo != NULL) { | |||||
/* | /* | ||||
* Unselect the outgoing interface if it is being | * Unselect the outgoing interface if it is being | ||||
* detached. | * detached. | ||||
*/ | */ | ||||
if (imo->imo_multicast_ifp == ifp) | if (imo->imo_multicast_ifp == ifp) | ||||
imo->imo_multicast_ifp = NULL; | imo->imo_multicast_ifp = NULL; | ||||
/* | /* | ||||
* Drop multicast group membership if we joined | * Drop multicast group membership if we joined | ||||
* through the interface being detached. | * through the interface being detached. | ||||
* | * | ||||
* XXX This can all be deferred to an epoch_call | * XXX This can all be deferred to an epoch_call | ||||
*/ | */ | ||||
restart: | restart: | ||||
IP_MFILTER_FOREACH(imf, &imo->imo_head) { | IP_MFILTER_FOREACH(imf, &imo->imo_head) { | ||||
if ((inm = imf->imf_inm) == NULL) | if ((inm = imf->imf_inm) == NULL) | ||||
continue; | continue; | ||||
if (inm->inm_ifp != ifp) | if (inm->inm_ifp != ifp) | ||||
continue; | continue; | ||||
ip_mfilter_remove(&imo->imo_head, imf); | ip_mfilter_remove(&imo->imo_head, imf); | ||||
IN_MULTI_LOCK_ASSERT(); | |||||
in_leavegroup_locked(inm, NULL); | in_leavegroup_locked(inm, NULL); | ||||
ip_mfilter_free(imf); | ip_mfilter_free(imf); | ||||
/*
 * A filter was just unlinked and freed, so the scan of imo_head is
 * started over from the restart label rather than continued.
 * NOTE(review): presumably because the iteration cursor is no longer
 * valid after the removal - confirm against IP_MFILTER_FOREACH.
 */
goto restart; | goto restart; | ||||
} | } | ||||
} | } | ||||
INP_WUNLOCK(inp); | |||||
} | } | ||||
INP_INFO_WUNLOCK(pcbinfo); | |||||
} | |||||
/* | /* | ||||
* Lookup a PCB based on the local address and port. Caller must hold the | * Lookup a PCB based on the local address and port. Caller must hold the | ||||
* hash lock. No inpcb locks or references are acquired. | * hash lock. No inpcb locks or references are acquired. | ||||
*/ | */ | ||||
#define INP_LOOKUP_MAPPED_PCB_COST 3 | #define INP_LOOKUP_MAPPED_PCB_COST 3 | ||||
struct inpcb * | struct inpcb * | ||||
in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, | in_pcblookup_local(struct inpcbinfo *pcbinfo, struct in_addr laddr, | ||||
u_short lport, int lookupflags, struct ucred *cred) | u_short lport, int lookupflags, struct ucred *cred) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
#ifdef INET6 | #ifdef INET6 | ||||
int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; | int matchwild = 3 + INP_LOOKUP_MAPPED_PCB_COST; | ||||
#else | #else | ||||
int matchwild = 3; | int matchwild = 3; | ||||
#endif | #endif | ||||
int wildcard; | int wildcard; | ||||
KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, | KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, | ||||
("%s: invalid lookup flags %d", __func__, lookupflags)); | ("%s: invalid lookup flags %d", __func__, lookupflags)); | ||||
INP_HASH_LOCK_ASSERT(pcbinfo); | INP_HASH_LOCK_ASSERT(pcbinfo); | ||||
if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { | if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
/* | /* | ||||
* Look for an unconnected (wildcard foreign addr) PCB that | * Look for an unconnected (wildcard foreign addr) PCB that | ||||
* matches the local address and port we're looking for. | * matches the local address and port we're looking for. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 146 Lines • ▼ Show 20 Lines | #endif | ||||
if (numa_wild != NULL) | if (numa_wild != NULL) | ||||
return (numa_wild); | return (numa_wild); | ||||
return (local_wild); | return (local_wild); | ||||
} | } | ||||
/* | /* | ||||
* Lookup PCB in hash list, using pcbinfo tables. This variation assumes | * Lookup PCB in hash list, using pcbinfo tables. This variation assumes | ||||
* that the caller has locked the hash list, and will not perform any further | * that the caller has either locked the hash list, which usually happens | ||||
* locking or reference operations on either the hash list or the connection. | * for bind(2) operations, or is in SMR section, which happens when sorting | ||||
* out incoming packets. | |||||
*/ | */ | ||||
static struct inpcb * | static struct inpcb * | ||||
in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, | in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, | ||||
u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, | u_int fport_arg, struct in_addr laddr, u_int lport_arg, int lookupflags, | ||||
struct ifnet *ifp, uint8_t numa_domain) | struct ifnet *ifp, uint8_t numa_domain) | ||||
{ | { | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
struct inpcb *inp, *tmpinp; | struct inpcb *inp, *tmpinp; | ||||
▲ Show 20 Lines • Show All 124 Lines • ▼ Show 20 Lines | |||||
*/ | */ | ||||
/*
 * in_pcblookup_hash: look up a pcb for the given 4-tuple through
 * in_pcblookup_hash_locked() and hand it back locked as requested by
 * lookupflags, or return NULL.
 *
 * Only the INPLOOKUP_WILDCARD bit of lookupflags is forwarded to the
 * inner lookup; the lock-related bits are consumed here.
 *
 * NOTE(review): rows carry two cells separated by '|' that look like two
 * revisions of the function (presumably before / after); rows with a
 * single cell belong to only one of them.  As far as the rows show:
 *  - one revision takes INP_WLOCK or INP_RLOCK depending on
 *    INPLOOKUP_WLOCKPCB / INPLOOKUP_RLOCKPCB (panicking if neither is set),
 *    then drops the lock and returns NULL when INP_FREED is set in
 *    inp_flags2;
 *  - the other brackets the lookup with smr_enter(pcbinfo->ipi_smr), locks
 *    through inp_smr_lock(inp, lookupflags & INPLOOKUP_LOCKMASK) and
 *    returns NULL when that reports false; smr_exit() is called here only
 *    on the not-found path, so inp_smr_lock() presumably leaves the SMR
 *    section itself on the found path - confirm against its definition.
 */
static struct inpcb * | static struct inpcb * | ||||
in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, | in_pcblookup_hash(struct inpcbinfo *pcbinfo, struct in_addr faddr, | ||||
u_int fport, struct in_addr laddr, u_int lport, int lookupflags, | u_int fport, struct in_addr laddr, u_int lport, int lookupflags, | ||||
struct ifnet *ifp, uint8_t numa_domain) | struct ifnet *ifp, uint8_t numa_domain) | ||||
{ | { | ||||
struct inpcb *inp; | struct inpcb *inp; | ||||
smr_enter(pcbinfo->ipi_smr); | |||||
inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, | inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, | ||||
lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); | lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); | ||||
if (inp != NULL) { | if (inp != NULL) { | ||||
if (lookupflags & INPLOOKUP_WLOCKPCB) { | if (__predict_false(inp_smr_lock(inp, | ||||
INP_WLOCK(inp); | (lookupflags & INPLOOKUP_LOCKMASK)) == false)) | ||||
} else if (lookupflags & INPLOOKUP_RLOCKPCB) { | |||||
INP_RLOCK(inp); | |||||
} else | |||||
panic("%s: locking bug", __func__); | |||||
if (__predict_false(inp->inp_flags2 & INP_FREED)) { | |||||
INP_UNLOCK(inp); | |||||
inp = NULL; | inp = NULL; | ||||
} | } else | ||||
} | smr_exit(pcbinfo->ipi_smr); | ||||
return (inp); | return (inp); | ||||
} | } | ||||
/* | /* | ||||
* Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf | * Public inpcb lookup routines, accepting a 4-tuple, and optionally, an mbuf | ||||
* from which a pre-calculated hash value may be extracted. | * from which a pre-calculated hash value may be extracted. | ||||
*/ | */ | ||||
▲ Show 20 Lines • Show All 78 Lines • ▼ Show 20 Lines | #endif | ||||
CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { | CK_LIST_FOREACH(phd, pcbporthash, phd_hash) { | ||||
if (phd->phd_port == inp->inp_lport) | if (phd->phd_port == inp->inp_lport) | ||||
break; | break; | ||||
} | } | ||||
/* | /* | ||||
* If none exists, malloc one and tack it on. | * If none exists, malloc one and tack it on. | ||||
*/ | */ | ||||
if (phd == NULL) { | if (phd == NULL) { | ||||
phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); | phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); | ||||
if (phd == NULL) { | if (phd == NULL) { | ||||
return (ENOBUFS); /* XXX */ | return (ENOBUFS); /* XXX */ | ||||
} | } | ||||
bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); | |||||
phd->phd_port = inp->inp_lport; | phd->phd_port = inp->inp_lport; | ||||
CK_LIST_INIT(&phd->phd_pcblist); | CK_LIST_INIT(&phd->phd_pcblist); | ||||
CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); | CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); | ||||
} | } | ||||
inp->inp_phd = phd; | inp->inp_phd = phd; | ||||
CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); | CK_LIST_INSERT_HEAD(&phd->phd_pcblist, inp, inp_portlist); | ||||
CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); | CK_LIST_INSERT_HEAD(pcbhash, inp, inp_hash); | ||||
inp->inp_flags |= INP_INHASHLIST; | inp->inp_flags |= INP_INHASHLIST; | ||||
return (0); | return (0); | ||||
} | } | ||||
/* | /* | ||||
* Move PCB to the proper hash bucket when { faddr, fport } have been | * Move PCB to the proper hash bucket when { faddr, fport } have been | ||||
* changed. NOTE: This does not handle the case of the lport changing (the | * changed. NOTE: This does not handle the case of the lport changing (the | ||||
* hashed port list would have to be updated as well), so the lport must | * hashed port list would have to be updated as well), so the lport must | ||||
* not change after in_pcbinshash() has been called. | * not change after in_pcbinshash() has been called. | ||||
* | |||||
* XXXGL: a race between this function and an SMR-protected hash iterator | |||||
* will lead to the iterator traversing a possibly wrong hash list. However, | |||||
* this race has presumably existed ever since the change from rwlock to epoch. | |||||
*/ | */ | ||||
void | void | ||||
in_pcbrehash(struct inpcb *inp) | in_pcbrehash(struct inpcb *inp) | ||||
{ | { | ||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | ||||
struct inpcbhead *head; | struct inpcbhead *head; | ||||
u_int32_t hashkey_faddr; | u_int32_t hashkey_faddr; | ||||
Show All 13 Lines | #endif | ||||
head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, | head = &pcbinfo->ipi_hashbase[INP_PCBHASH(hashkey_faddr, | ||||
inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; | inp->inp_lport, inp->inp_fport, pcbinfo->ipi_hashmask)]; | ||||
CK_LIST_REMOVE(inp, inp_hash); | CK_LIST_REMOVE(inp, inp_hash); | ||||
CK_LIST_INSERT_HEAD(head, inp, inp_hash); | CK_LIST_INSERT_HEAD(head, inp, inp_hash); | ||||
} | } | ||||
/* | /* | ||||
* Remove PCB from various lists. | |||||
*/ | |||||
/*
 * in_pcbremlists: unlink inp from the lists of its pcbinfo.
 *
 * Requires the pcb write lock and the pcbinfo list write lock (both are
 * asserted).  Steps, in order:
 *  1. bump pcbinfo->ipi_gencnt and record the new value in inp_gencnt;
 *  2. if INP_INHASHLIST is set, under INP_HASH_WLOCK: remove the pcb from
 *     the lbgroup hash, from its hash chain and from its port list; when
 *     the port head's pcb list becomes empty, unlink the port head and
 *     hand it to NET_EPOCH_CALL with inpcbport_free; finally clear
 *     INP_INHASHLIST after dropping the hash lock;
 *  3. remove the pcb from inp_list and decrement ipi_count.
 *
 * NOTE(review): every row of this function has a single cell, so it
 * appears in only one of the two revisions shown elsewhere in this text
 * (presumably it is removed by the change) - confirm before relying on it.
 */
static void | |||||
in_pcbremlists(struct inpcb *inp) | |||||
{ | |||||
struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; | |||||
INP_WLOCK_ASSERT(inp); | |||||
INP_LIST_WLOCK_ASSERT(pcbinfo); | |||||
inp->inp_gencnt = ++pcbinfo->ipi_gencnt; | |||||
if (inp->inp_flags & INP_INHASHLIST) { | |||||
struct inpcbport *phd = inp->inp_phd; | |||||
INP_HASH_WLOCK(pcbinfo); | |||||
/* XXX: Only do if SO_REUSEPORT_LB set? */ | |||||
in_pcbremlbgrouphash(inp); | |||||
CK_LIST_REMOVE(inp, inp_hash); | |||||
CK_LIST_REMOVE(inp, inp_portlist); | |||||
if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { | |||||
CK_LIST_REMOVE(phd, phd_hash); | |||||
NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); | |||||
} | |||||
INP_HASH_WUNLOCK(pcbinfo); | |||||
inp->inp_flags &= ~INP_INHASHLIST; | |||||
} | |||||
CK_LIST_REMOVE(inp, inp_list); | |||||
pcbinfo->ipi_count--; | |||||
} | |||||
/* | |||||
* Check for alternatives when higher level complains | * Check for alternatives when higher level complains | ||||
* about service problems. For now, invalidate cached | * about service problems. For now, invalidate cached | ||||
* routing information. If the route was created dynamically | * routing information. If the route was created dynamically | ||||
* (by a redirect), time to try a default gateway again. | * (by a redirect), time to try a default gateway again. | ||||
*/ | */ | ||||
void | void | ||||
in_losing(struct inpcb *inp) | in_losing(struct inpcb *inp) | ||||
{ | { | ||||
▲ Show 20 Lines • Show All 117 Lines • ▼ Show 20 Lines | inp_unlock_assert(struct inpcb *inp) | ||||
INP_UNLOCK_ASSERT(inp); | INP_UNLOCK_ASSERT(inp); | ||||
} | } | ||||
#endif | #endif | ||||
/*
 * inp_apply_all: call func(inp, arg) on every pcb of V_tcbinfo.
 *
 * func - callback invoked once per pcb.
 * arg  - opaque pointer passed through to func unchanged.
 *
 * Only V_tcbinfo is walked; no other pcbinfo is referenced here.
 *
 * NOTE(review): rows carry two cells separated by '|' that look like two
 * revisions of the function (presumably before / after); rows with a
 * single cell belong to only one of them.  As far as the rows show:
 *  - one revision holds INP_INFO_WLOCK(&V_tcbinfo) around a CK_LIST_FOREACH
 *    over V_tcbinfo.ipi_listhead and wraps each call in INP_WLOCK and
 *    INP_WUNLOCK;
 *  - the other walks an iterator built with INP_ALL_ITERATOR(&V_tcbinfo,
 *    INPLOOKUP_WLOCKPCB) via inp_next() and performs no explicit locking
 *    of its own; presumably inp_next() returns each pcb write-locked and
 *    releases it on the next step - confirm against its definition.
 */
void | void | ||||
inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) | inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) | ||||
{ | { | ||||
struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, | |||||
INPLOOKUP_WLOCKPCB); | |||||
struct inpcb *inp; | struct inpcb *inp; | ||||
INP_INFO_WLOCK(&V_tcbinfo); | while ((inp = inp_next(&inpi)) != NULL) | ||||
CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { | |||||
INP_WLOCK(inp); | |||||
func(inp, arg); | func(inp, arg); | ||||
INP_WUNLOCK(inp); | |||||
} | |||||
INP_INFO_WUNLOCK(&V_tcbinfo); | |||||
} | } | ||||
struct socket * | struct socket * | ||||
inp_inpcbtosocket(struct inpcb *inp) | inp_inpcbtosocket(struct inpcb *inp) | ||||
{ | { | ||||
INP_WLOCK_ASSERT(inp); | INP_WLOCK_ASSERT(inp); | ||||
return (inp->inp_socket); | return (inp->inp_socket); | ||||
▲ Show 20 Lines • Show All 634 Lines • Show Last 20 Lines |