Index: sys/kern/subr_witness.c =================================================================== --- sys/kern/subr_witness.c +++ sys/kern/subr_witness.c @@ -564,15 +564,15 @@ /* * UDP/IP */ - { "udp", &lock_class_mtx_sleep }, { "udpinp", &lock_class_rw }, + { "udp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ - { "tcp", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, + { "tcp", &lock_class_mtx_sleep }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* Index: sys/kern/uipc_ktls.c =================================================================== --- sys/kern/uipc_ktls.c +++ sys/kern/uipc_ktls.c @@ -854,10 +854,6 @@ inp = so->so_pcb; INP_WLOCK(inp); - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_WUNLOCK(inp); return (ECONNRESET); @@ -909,10 +905,6 @@ int error; INP_RLOCK(inp); - if (inp->inp_flags2 & INP_FREED) { - INP_RUNLOCK(inp); - return (ECONNRESET); - } if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { INP_RUNLOCK(inp); return (ECONNRESET); @@ -2716,8 +2708,7 @@ INP_WLOCK(inp); so = inp->inp_socket; MPASS(so != NULL); - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { goto out; } @@ -2729,7 +2720,6 @@ counter_u64_add(ktls_ifnet_disable_ok, 1); /* ktls_set_tx_mode() drops inp wlock, so recheck flags */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) == 0 && - (inp->inp_flags2 & INP_FREED) == 0 && (tp = intotcpcb(inp)) != NULL && tp->t_fb->tfb_hwtls_change != NULL) (*tp->t_fb->tfb_hwtls_change)(tp, 0); Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -49,7 +49,9 @@ #ifdef _KERNEL #include +#include #include +#include #include #include #endif @@ -133,32 +135,19 @@ * struct inpcb captures the network layer 
state for TCP, UDP, and raw IPv4 and * IPv6 sockets. In the case of TCP and UDP, further per-connection state is * hung off of inp_ppcb most of the time. Almost all fields of struct inpcb - * are static after creation or protected by a per-inpcb rwlock, inp_lock. A - * few fields are protected by multiple locks as indicated in the locking notes - * below. For these fields, all of the listed locks must be write-locked for - * any modifications. However, these fields can be safely read while any one of - * the listed locks are read-locked. This model can permit greater concurrency - * for read operations. For example, connections can be looked up while only - * holding a read lock on the global pcblist lock. This is important for - * performance when attempting to find the connection for a packet given its IP - * and port tuple. + * are static after creation or protected by a per-inpcb rwlock, inp_lock. * - * One noteworthy exception is that the global pcbinfo lock follows a different - * set of rules in relation to the inp_list field. Rather than being - * write-locked for modifications and read-locked for list iterations, it must - * be read-locked during modifications and write-locked during list iterations. - * This ensures that the relatively rare global list iterations safely walk a - * stable snapshot of connections while allowing more common list modifications - * to safely grab the pcblist lock just while adding or removing a connection - * from the global list. + * An inpcb database is indexed by addresses/ports hash as well as a list of + * all pcbs that belong to a certain proto. Database lookups or list traversals + * are performed inside an SMR section. Once the desired PCB is found, its own + * lock is to be obtained and the SMR section exited. * * Key: * (b) - Protected by the hpts lock. 
* (c) - Constant after initialization - * (e) - Protected by the net_epoch_prempt epoch + * (e) - Protected by the SMR section * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb - * (l) - Protected by the pcblist lock for the inpcb * (h) - Protected by the pcbhash lock for the inpcb * (s) - Protected by another subsystem's locks * (x) - Undefined locking @@ -219,17 +208,13 @@ * socket has been freed), or there may be close(2)-related races. * * The inp_vflag field is overloaded, and would otherwise ideally be (c). - * - * TODO: Currently only the TCP stack is leveraging the global pcbinfo lock - * read-lock usage during modification, this model can be applied to other - * protocols (especially SCTP). */ struct icmp6_filter; struct inpcbpolicy; struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash; /* (w:h/r:e) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ #define inp_start_zero inp_hpts @@ -311,8 +296,8 @@ int in6p_cksum; short in6p_hops; }; - CK_LIST_ENTRY(inpcb) inp_portlist; /* (i/h) */ - struct inpcbport *inp_phd; /* (i/h) head of this list */ + CK_LIST_ENTRY(inpcb) inp_portlist; /* (r:e/w:h) port list */ + struct inpcbport *inp_phd; /* (r:e/w:h) head of this list */ inp_gen_t inp_gencnt; /* (c) generation count */ void *spare_ptr; /* Spare pointer. 
*/ rt_gen_t inp_rt_cookie; /* generation for route entry */ @@ -320,10 +305,7 @@ struct route inp_route; struct route_in6 inp_route6; }; - CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ - /* (e[r]) for list iteration */ - /* (p[w]/l) for addition/removal */ - struct epoch_context inp_epoch_ctx; + CK_LIST_ENTRY(inpcb) inp_list; /* (r:e/w:p) all PCBs for proto */ }; #endif /* _KERNEL */ @@ -396,80 +378,58 @@ #endif #endif /* _SYS_SOCKETVAR_H_ */ -struct inpcbport { - struct epoch_context phd_epoch_ctx; - CK_LIST_ENTRY(inpcbport) phd_hash; - struct inpcbhead phd_pcblist; - u_short phd_port; -}; - -/*- +#ifdef _KERNEL +/* * Global data structure for each high-level protocol (UDP, TCP, ...) in both * IPv4 and IPv6. Holds inpcb lists and information for managing them. * - * Each pcbinfo is protected by three locks: ipi_lock, ipi_hash_lock and - * ipi_list_lock: - * - ipi_lock covering the global pcb list stability during loop iteration, - * - ipi_hash_lock covering the hashed lookup tables, - * - ipi_list_lock covering mutable global fields (such as the global - * pcb list) - * - * The lock order is: - * - * ipi_lock (before) - * inpcb locks (before) - * ipi_list locks (before) + * The pcbs are protected with SMR section and thus all lists in inpcbinfo + * are CK-lists. Locking is required to insert a pcb into database. Two + * locks are provided: one for the hash and one for the global list of pcbs, + * as well as overall count and generation count. 
* * Locking key: * * (c) Constant or nearly constant after initialisation - * (e) - Protected by the net_epoch_prempt epoch + * (e) Protected by SMR section * (g) Locked by ipi_lock - * (l) Locked by ipi_list_lock - * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock - * (x) Synchronisation properties poorly defined + * (h) Locked by ipi_hash_lock */ struct inpcbinfo { /* * Global lock protecting inpcb list modification */ struct mtx ipi_lock; - - /* - * Global list of inpcbs on the protocol. - */ - struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ - u_int ipi_count; /* (l) */ + struct inpcbhead ipi_listhead; /* (r:e/w:g) */ + u_int ipi_count; /* (g) */ /* * Generation count -- incremented each time a connection is allocated * or freed. */ - u_quad_t ipi_gencnt; /* (l) */ + u_quad_t ipi_gencnt; /* (g) */ /* * Fields associated with port lookup and allocation. */ - u_short ipi_lastport; /* (x) */ - u_short ipi_lastlow; /* (x) */ - u_short ipi_lasthi; /* (x) */ + u_short ipi_lastport; /* (h) */ + u_short ipi_lastlow; /* (h) */ + u_short ipi_lasthi; /* (h) */ /* * UMA zone from which inpcbs are allocated for this protocol. */ - struct uma_zone *ipi_zone; /* (c) */ - - /* - * Global lock protecting modification hash lookup tables. - */ - struct mtx ipi_hash_lock; + uma_zone_t ipi_zone; /* (c) */ + uma_zone_t ipi_portzone; /* (c) */ + smr_t ipi_smr; /* (c) */ /* * Global hash of inpcbs, hashed by local and foreign addresses and * port numbers. */ - struct inpcbhead *ipi_hashbase; /* (h) */ - u_long ipi_hashmask; /* (h) */ + struct mtx ipi_hash_lock; + struct inpcbhead *ipi_hashbase; /* (r:e/w:h) */ + u_long ipi_hashmask; /* (c) */ /* * Global hash of inpcbs, hashed by only local port number. @@ -481,26 +441,15 @@ * Load balance groups used for the SO_REUSEPORT_LB option, * hashed by local port. 
*/ - struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (h) */ + struct inpcblbgrouphead *ipi_lbgrouphashbase; /* (r:e/w:h) */ u_long ipi_lbgrouphashmask; /* (h) */ /* * Pointer to network stack instance */ struct vnet *ipi_vnet; /* (c) */ - - /* - * general use 2 - */ - void *ipi_pspare[2]; - - /* - * Global lock protecting global inpcb list, inpcb count, etc. - */ - struct rwlock ipi_list_lock; }; -#ifdef _KERNEL /* * Load balance groups used for the SO_REUSEPORT_LB socket option. Each group * (or unique address:port combination) can be re-used at most @@ -523,7 +472,7 @@ }; #define INP_LOCK_INIT(inp, d, t) \ - rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) + rw_init_flags(&(inp)->inp_lock, (t), RW_RECURSE | RW_DUPOK) #define INP_LOCK_DESTROY(inp) rw_destroy(&(inp)->inp_lock) #define INP_RLOCK(inp) rw_rlock(&(inp)->inp_lock) #define INP_WLOCK(inp) rw_wlock(&(inp)->inp_lock) @@ -571,51 +520,21 @@ #endif /* _KERNEL */ -#define INP_INFO_LOCK_INIT(ipi, d) \ - mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) -#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ + mtx_owned(&(ipi)->ipi_lock)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) #define INP_INFO_WUNLOCK_ASSERT(ipi) \ - mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) - -#define INP_LIST_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) -#define INP_LIST_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_list_lock) -#define INP_LIST_RLOCK(ipi) rw_rlock(&(ipi)->ipi_list_lock) -#define 
INP_LIST_WLOCK(ipi) rw_wlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_list_lock) -#define INP_LIST_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_list_lock) -#define INP_LIST_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_list_lock) -#define INP_LIST_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_list_lock) -#define INP_LIST_LOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_LOCKED) -#define INP_LIST_RLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_RLOCKED) -#define INP_LIST_WLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_WLOCKED) -#define INP_LIST_UNLOCK_ASSERT(ipi) \ - rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) - -#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) -#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) + mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) + #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) -#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); - -#define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ - MTX_DEF | MTX_DUPOK) -#define INP_GROUP_LOCK_DESTROY(ipg) mtx_destroy(&(ipg)->ipg_lock) - -#define INP_GROUP_LOCK(ipg) mtx_lock(&(ipg)->ipg_lock) -#define INP_GROUP_LOCK_ASSERT(ipg) mtx_assert(&(ipg)->ipg_lock, MA_OWNED) -#define INP_GROUP_UNLOCK(ipg) mtx_unlock(&(ipg)->ipg_lock) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(SMR_ENTERED((ipi)->ipi_smr) || \ + mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, \ + MA_OWNED) #define INP_PCBHASH(faddr, lport, fport, mask) \ (((faddr) ^ ((faddr) >> 16) ^ ntohs((lport) ^ (fport))) & (mask)) @@ -644,7 +563,7 @@ #define INP_ANONPORT 0x00000040 /* port chosen for user 
*/ #define INP_RECVIF 0x00000080 /* receive incoming interface */ #define INP_MTUDISC 0x00000100 /* user can do MTU discovery */ - /* 0x000200 unused: was INP_FAITH */ +/* INP_FREED 0x00000200 private to in_pcb.c */ #define INP_RECVTTL 0x00000400 /* receive incoming IP TTL */ #define INP_DONTFRAG 0x00000800 /* don't fragment packet */ #define INP_BINDANY 0x00001000 /* allow bind to any address */ @@ -682,7 +601,7 @@ #define INP_MBUF_ACKCMP 0x00000002 /* TCP mbuf ack compression ok */ /* 0x00000004 */ #define INP_REUSEPORT 0x00000008 /* SO_REUSEPORT option is set */ -#define INP_FREED 0x00000010 /* inp itself is not valid */ +/* 0x00000010 */ #define INP_REUSEADDR 0x00000020 /* SO_REUSEADDR option is set */ #define INP_BINDMULTI 0x00000040 /* IP_BINDMULTI option is set */ #define INP_RSS_BUCKET_SET 0x00000080 /* IP_RSS_LISTEN_BUCKET is set */ @@ -702,15 +621,19 @@ #define INP_2PCP_BASE INP_2PCP_BIT0 #define INP_2PCP_MASK (INP_2PCP_BIT0 | INP_2PCP_BIT1 | INP_2PCP_BIT2) #define INP_2PCP_SHIFT 18 /* shift PCP field in/out of inp_flags2 */ + /* - * Flags passed to in_pcblookup*() functions. + * Flags passed to in_pcblookup*(), inp_smr_lock() and inp_next(). */ -#define INPLOOKUP_WILDCARD 0x00000001 /* Allow wildcard sockets. */ -#define INPLOOKUP_RLOCKPCB 0x00000002 /* Return inpcb read-locked. */ -#define INPLOOKUP_WLOCKPCB 0x00000004 /* Return inpcb write-locked. */ +typedef enum { + INPLOOKUP_WILDCARD = 0x00000001, /* Allow wildcard sockets. */ + INPLOOKUP_RLOCKPCB = 0x00000002, /* Return inpcb read-locked. */ + INPLOOKUP_WLOCKPCB = 0x00000004, /* Return inpcb write-locked. */ +} inp_lookup_t; #define INPLOOKUP_MASK (INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB | \ INPLOOKUP_WLOCKPCB) +#define INPLOOKUP_LOCKMASK (INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB) #define sotoinpcb(so) ((struct inpcb *)(so)->so_pcb) @@ -718,13 +641,6 @@ #define INP_CHECK_SOCKAF(so, af) (INP_SOCKAF(so) == af) -/* - * Constants for pcbinfo.ipi_hashfields. 
- */ -#define IPI_HASHFIELDS_NONE 0 -#define IPI_HASHFIELDS_2TUPLE 1 -#define IPI_HASHFIELDS_4TUPLE 2 - #ifdef _KERNEL VNET_DECLARE(int, ipport_reservedhigh); VNET_DECLARE(int, ipport_reservedlow); @@ -755,8 +671,8 @@ #define V_ipport_tcpallocs VNET(ipport_tcpallocs) void in_pcbinfo_destroy(struct inpcbinfo *); -void in_pcbinfo_init(struct inpcbinfo *, const char *, struct inpcbhead *, - int, int, char *, uma_init, u_int); +void in_pcbinfo_init(struct inpcbinfo *, const char *, u_int, int, char *, + uma_init); int in_pcbbind_check_bindmulti(const struct inpcb *ni, const struct inpcb *oi); @@ -788,8 +704,37 @@ int, struct inpcb *(*)(struct inpcb *, int)); void in_pcbref(struct inpcb *); void in_pcbrehash(struct inpcb *); -int in_pcbrele_rlocked(struct inpcb *); -int in_pcbrele_wlocked(struct inpcb *); +bool in_pcbrele_rlocked(struct inpcb *); +bool in_pcbrele_wlocked(struct inpcb *); + +typedef bool inp_match_t(const struct inpcb *, void *); +struct inpcb_iterator { + const struct inpcbinfo *ipi; + struct inpcb *inp; + inp_match_t *match; + void *ctx; + int hash; +#define INP_ALL_LIST -1 + const inp_lookup_t lock; +}; + +/* Note: sparse initializers guarantee .inp = NULL. */ +#define INP_ITERATOR(_ipi, _lock, _match, _ctx) \ + { \ + .ipi = (_ipi), \ + .lock = (_lock), \ + .hash = INP_ALL_LIST, \ + .match = (_match), \ + .ctx = (_ctx), \ + } +#define INP_ALL_ITERATOR(_ipi, _lock) \ + { \ + .ipi = (_ipi), \ + .lock = (_lock), \ + .hash = INP_ALL_LIST, \ + } + +struct inpcb *inp_next(struct inpcb_iterator *); void in_losing(struct inpcb *); void in_pcbsetsolabel(struct socket *so); int in_getpeeraddr(struct socket *so, struct sockaddr **nam); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -113,6 +113,7 @@ #define INPCBLBGROUP_SIZMIN 8 #define INPCBLBGROUP_SIZMAX 256 +#define INP_FREED 0x00000200 /* See in_pcb.h. 
*/ static struct callout ipport_tick_callout; @@ -145,7 +146,6 @@ #define V_ipport_tcplastcount VNET(ipport_tcplastcount) -static void in_pcbremlists(struct inpcb *inp); #ifdef INET static struct inpcb *in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, u_int fport_arg, @@ -514,38 +514,43 @@ INP_LOCK_DESTROY(inp); } +/* Make sure it is safe to use hashinit(9) on CK_LIST. */ +CTASSERT(sizeof(struct inpcbhead) == sizeof(LIST_HEAD(, inpcb))); + /* * Initialize an inpcbinfo -- we should be able to reduce the number of * arguments in time. */ void in_pcbinfo_init(struct inpcbinfo *pcbinfo, const char *name, - struct inpcbhead *listhead, int hash_nelements, int porthash_nelements, - char *inpcbzone_name, uma_init inpcbzone_init, u_int hashfields) + u_int hash_nelements, int porthash_nelements, char *inpcbzone_name, + uma_init inpcbzone_init) { - porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); - - INP_INFO_LOCK_INIT(pcbinfo, name); - INP_HASH_LOCK_INIT(pcbinfo, "pcbinfohash"); /* XXXRW: argument? 
*/ - INP_LIST_LOCK_INIT(pcbinfo, "pcbinfolist"); + mtx_init(&pcbinfo->ipi_lock, name, NULL, MTX_DEF); + mtx_init(&pcbinfo->ipi_hash_lock, "pcbinfohash", NULL, MTX_DEF); #ifdef VIMAGE pcbinfo->ipi_vnet = curvnet; #endif - pcbinfo->ipi_listhead = listhead; - CK_LIST_INIT(pcbinfo->ipi_listhead); + CK_LIST_INIT(&pcbinfo->ipi_listhead); pcbinfo->ipi_count = 0; pcbinfo->ipi_hashbase = hashinit(hash_nelements, M_PCB, &pcbinfo->ipi_hashmask); + porthash_nelements = imin(porthash_nelements, IPPORT_MAX + 1); pcbinfo->ipi_porthashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_porthashmask); pcbinfo->ipi_lbgrouphashbase = hashinit(porthash_nelements, M_PCB, &pcbinfo->ipi_lbgrouphashmask); pcbinfo->ipi_zone = uma_zcreate(inpcbzone_name, sizeof(struct inpcb), - NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, 0); + NULL, NULL, inpcbzone_init, inpcb_fini, UMA_ALIGN_PTR, + UMA_ZONE_SMR); uma_zone_set_max(pcbinfo->ipi_zone, maxsockets); uma_zone_set_warning(pcbinfo->ipi_zone, "kern.ipc.maxsockets limit reached"); + pcbinfo->ipi_smr = uma_zone_get_smr(pcbinfo->ipi_zone); + pcbinfo->ipi_portzone = uma_zcreate(inpcbzone_name, + sizeof(struct inpcbport), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); + uma_zone_set_smr(pcbinfo->ipi_portzone, pcbinfo->ipi_smr); } /* @@ -564,9 +569,8 @@ hashdestroy(pcbinfo->ipi_lbgrouphashbase, M_PCB, pcbinfo->ipi_lbgrouphashmask); uma_zdestroy(pcbinfo->ipi_zone); - INP_LIST_LOCK_DESTROY(pcbinfo); - INP_HASH_LOCK_DESTROY(pcbinfo); - INP_INFO_LOCK_DESTROY(pcbinfo); + mtx_destroy(&pcbinfo->ipi_hash_lock); + mtx_destroy(&pcbinfo->ipi_lock); } /* @@ -580,7 +584,7 @@ int error; error = 0; - inp = uma_zalloc(pcbinfo->ipi_zone, M_NOWAIT); + inp = uma_zalloc_smr(pcbinfo->ipi_zone, M_NOWAIT); if (inp == NULL) return (ENOBUFS); bzero(&inp->inp_start_zero, inp_zero_size); @@ -612,33 +616,38 @@ if (V_ip6_v6only) inp->inp_flags |= IN6P_IPV6_V6ONLY; } -#endif - INP_WLOCK(inp); - INP_LIST_WLOCK(pcbinfo); - CK_LIST_INSERT_HEAD(pcbinfo->ipi_listhead, inp, 
inp_list); - pcbinfo->ipi_count++; - so->so_pcb = (caddr_t)inp; -#ifdef INET6 if (V_ip6_auto_flowlabel) inp->inp_flags |= IN6P_AUTOFLOWLABEL; #endif - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - refcount_init(&inp->inp_refcount, 1); /* Reference from inpcbinfo */ - /* * Routes in inpcb's can cache L2 as well; they are guaranteed * to be cleaned up. */ inp->inp_route.ro_flags = RT_LLE_CACHE; - INP_LIST_WUNLOCK(pcbinfo); +#ifdef TCPHPTS + /* + * If using hpts lets drop a random number in so + * not all new connections fall on the same CPU. + */ + inp->inp_hpts_cpu = inp->inp_input_cpu = hpts_random_cpu(inp); +#endif + refcount_init(&inp->inp_refcount, 1); /* Reference from socket. */ + INP_WLOCK(inp); + INP_INFO_WLOCK(pcbinfo); + pcbinfo->ipi_count++; + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + CK_LIST_INSERT_HEAD(&pcbinfo->ipi_listhead, inp, inp_list); + INP_INFO_WUNLOCK(pcbinfo); + so->so_pcb = inp; + + return (0); + #if defined(IPSEC) || defined(IPSEC_SUPPORT) || defined(MAC) out: - if (error != 0) { - crfree(inp->inp_cred); - uma_zfree(pcbinfo->ipi_zone, inp); - } -#endif + crfree(inp->inp_cred); + uma_zfree_smr(pcbinfo->ipi_zone, inp); return (error); +#endif } #ifdef INET @@ -1505,192 +1514,274 @@ } /* - * in_pcbref() bumps the reference count on an inpcb in order to maintain - * stability of an inpcb pointer despite the inpcb lock being released. This - * is used in TCP when the inpcbinfo lock needs to be acquired or upgraded, - * but where the inpcb lock may already held. - * - * in_pcbref() should be used only to provide brief memory stability, and - * must always be followed by a call to INP_WLOCK() and in_pcbrele() to - * garbage collect the inpcb if it has been in_pcbfree()'d from another - * context. Until in_pcbrele() has returned that the inpcb is still valid, - * lock and rele are the *only* safe operations that may be performed on the - * inpcb. + * inpcb hash lookups are protected by SMR section. 
* - * While the inpcb will not be freed, releasing the inpcb lock means that the - * connection's state may change, so the caller should be careful to - * revalidate any cached state on reacquiring the lock. Drop the reference - * using in_pcbrele(). + * Once desired pcb has been found, switching from SMR section to a pcb + * lock is performed with inp_smr_lock(). We can not use INP_(W|R)LOCK + * here because SMR is a critical section. + * In 99%+ cases inp_smr_lock() would obtain the lock immediately. */ -void -in_pcbref(struct inpcb *inp) +static inline void +inp_lock(struct inpcb *inp, const inp_lookup_t lock) { - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + lock == INPLOOKUP_RLOCKPCB ? + rw_rlock(&inp->inp_lock) : rw_wlock(&inp->inp_lock); +} + +static inline void +inp_unlock(struct inpcb *inp, const inp_lookup_t lock) +{ - refcount_acquire(&inp->inp_refcount); + lock == INPLOOKUP_RLOCKPCB ? + rw_runlock(&inp->inp_lock) : rw_wunlock(&inp->inp_lock); } -/* - * Drop a refcount on an inpcb elevated using in_pcbref(); because a call to - * in_pcbfree() may have been made between in_pcbref() and in_pcbrele(), we - * return a flag indicating whether or not the inpcb remains valid. If it is - * valid, we return with the inpcb lock held. - * - * Notice that, unlike in_pcbref(), the inpcb lock must be held to drop a - * reference on an inpcb. Historically more work was done here (actually, in - * in_pcbfree_internal()) but has been moved to in_pcbfree() to avoid the - * need for the pcbinfo lock in in_pcbrele(). Deferring the free is entirely - * about memory stability (and continued use of the write lock). - */ -int -in_pcbrele_rlocked(struct inpcb *inp) +static inline int +inp_trylock(struct inpcb *inp, const inp_lookup_t lock) { - struct inpcbinfo *pcbinfo; - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + return (lock == INPLOOKUP_RLOCKPCB ? 
+ rw_try_rlock(&inp->inp_lock) : rw_try_wlock(&inp->inp_lock)); +} - INP_RLOCK_ASSERT(inp); +static inline bool +in_pcbrele(struct inpcb *inp, const inp_lookup_t lock) +{ - if (refcount_release(&inp->inp_refcount) == 0) { - /* - * If the inpcb has been freed, let the caller know, even if - * this isn't the last reference. - */ - if (inp->inp_flags2 & INP_FREED) { - INP_RUNLOCK(inp); - return (1); + return (lock == INPLOOKUP_RLOCKPCB ? + in_pcbrele_rlocked(inp) : in_pcbrele_wlocked(inp)); +} + +bool +inp_smr_lock(struct inpcb *inp, const inp_lookup_t lock) +{ + + MPASS(lock == INPLOOKUP_RLOCKPCB || lock == INPLOOKUP_WLOCKPCB); + SMR_ASSERT_ENTERED(inp->inp_pcbinfo->ipi_smr); + + if (__predict_true(inp_trylock(inp, lock))) { + if (__predict_false(inp->inp_flags & INP_FREED)) { + smr_exit(inp->inp_pcbinfo->ipi_smr); + inp_unlock(inp, lock); + return (false); } - return (0); + smr_exit(inp->inp_pcbinfo->ipi_smr); + return (true); } - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); -#ifdef TCPHPTS - if (inp->inp_in_hpts || inp->inp_in_input) { - struct tcp_hpts_entry *hpts; + if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { + smr_exit(inp->inp_pcbinfo->ipi_smr); + inp_lock(inp, lock); + if (__predict_false(in_pcbrele(inp, lock))) + return (false); /* - * We should not be on the hpts at - * this point in any form. we must - * get the lock to be sure. + * inp acquired through refcount & lock for sure didn't went + * through uma_zfree(). However, it may have already went + * through in_pcbfree() and has another reference, that + * prevented its release by our in_pcbrele(). 
*/ - hpts = tcp_hpts_lock(inp); - if (inp->inp_in_hpts) - panic("Hpts:%p inp:%p at free still on hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - hpts = tcp_input_lock(inp); - if (inp->inp_in_input) - panic("Hpts:%p inp:%p at free still on input hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); + if (__predict_false(inp->inp_flags & INP_FREED)) { + inp_unlock(inp, lock); + return (false); + } + return (true); + } else { + smr_exit(inp->inp_pcbinfo->ipi_smr); + return (false); } -#endif - INP_RUNLOCK(inp); - pcbinfo = inp->inp_pcbinfo; - uma_zfree(pcbinfo->ipi_zone, inp); - return (1); } -int -in_pcbrele_wlocked(struct inpcb *inp) +/* + * inp_next() - inpcb hash/list traversal iterator + * + * Requires initialized struct inpcb_iterator for context. + * The structure can be initialized with INP_ITERATOR() or INP_ALL_ITERATOR(). + * + * - Iterator can have either write-lock or read-lock semantics, that can not + * be changed later. + * - Iterator can iterate either over all pcbs list (INP_ALL_LIST), or through + * a single hash slot. Note: only rip_input() does the latter. + * - Iterator may have optional bool matching function. The matching function + * will be executed for each inpcb in the SMR context, so it can not acquire + * locks and can safely access only immutable fields of inpcb. + * + * A fresh initialized iterator has NULL inpcb in its context and that + * means that inp_next() call would return the very first inpcb on the list + * locked with desired semantic. In all following calls the context pointer + * shall hold the current inpcb pointer. The KPI user is not supposed to + * unlock the current inpcb! Upon end of traversal inp_next() will return NULL + * and write NULL to its context. After end of traversal an iterator can be + * reused. + * + * List traversals have the following features/constraints: + * - New entries won't be seen, as they are always added to the head of a list. 
+ * - Removed entries won't stop traversal as long as they are not added to + * a different list. This is violated by in_pcbrehash(). + */ +#define II_LIST_FIRST(ipi, hash) \ + (((hash) == INP_ALL_LIST) ? \ + CK_LIST_FIRST(&(ipi)->ipi_listhead) : \ + CK_LIST_FIRST(&(ipi)->ipi_hashbase[(hash)])) +#define II_LIST_NEXT(inp, hash) \ + (((hash) == INP_ALL_LIST) ? \ + CK_LIST_NEXT((inp), inp_list) : \ + CK_LIST_NEXT((inp), inp_hash)) +#define II_LOCK_ASSERT(inp, lock) \ + rw_assert(&(inp)->inp_lock, \ + (lock) == INPLOOKUP_RLOCKPCB ? RA_RLOCKED : RA_WLOCKED ) +struct inpcb * +inp_next(struct inpcb_iterator *ii) { - struct inpcbinfo *pcbinfo; + const struct inpcbinfo *ipi = ii->ipi; + inp_match_t *match = ii->match; + void *ctx = ii->ctx; + inp_lookup_t lock = ii->lock; + int hash = ii->hash; + struct inpcb *inp; - KASSERT(inp->inp_refcount > 0, ("%s: refcount 0", __func__)); + if (ii->inp == NULL) { /* First call. */ + smr_enter(ipi->ipi_smr); + /* This is unrolled CK_LIST_FOREACH(). */ + for (inp = II_LIST_FIRST(ipi, hash); + inp != NULL; + inp = II_LIST_NEXT(inp, hash)) { + if (match != NULL && (match)(inp, ctx) == false) + continue; + if (__predict_true(inp_smr_lock(inp, lock))) + break; + else { + smr_enter(ipi->ipi_smr); + MPASS(inp != II_LIST_FIRST(ipi, hash)); + inp = II_LIST_FIRST(ipi, hash); + } + } - INP_WLOCK_ASSERT(inp); + if (inp == NULL) + smr_exit(ipi->ipi_smr); + else + ii->inp = inp; - if (refcount_release(&inp->inp_refcount) == 0) { - /* - * If the inpcb has been freed, let the caller know, even if - * this isn't the last reference. - */ - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return (1); + return (inp); + } + + /* Not a first call. 
*/ + smr_enter(ipi->ipi_smr); +restart: + inp = ii->inp; + II_LOCK_ASSERT(inp, lock); +next: + inp = II_LIST_NEXT(inp, hash); + if (inp == NULL) { + smr_exit(ipi->ipi_smr); + goto found; + } + + if (match != NULL && (match)(inp, ctx) == false) + goto next; + + if (__predict_true(inp_trylock(inp, lock))) { + if (__predict_false(inp->inp_flags & INP_FREED)) { + /* + * Entries are never inserted in middle of a list, thus + * as long as we are in SMR, we can continue traversal. + * Jump to 'restart' should yield in the same result, + * but could produce unnecessary looping. Could this + * looping be unbound? + */ + inp_unlock(inp, lock); + goto next; + } else { + smr_exit(ipi->ipi_smr); + goto found; } - return (0); } - KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); -#ifdef TCPHPTS - if (inp->inp_in_hpts || inp->inp_in_input) { - struct tcp_hpts_entry *hpts; + /* + * Can't obtain lock immediately, thus going hard. Once we exit the + * SMR section we can no longer jump to 'next', and our only stable + * anchoring point is ii->inp, which we keep locked for this case, so + * we jump to 'restart'. + */ + if (__predict_true(refcount_acquire_if_not_zero(&inp->inp_refcount))) { + smr_exit(ipi->ipi_smr); + inp_lock(inp, lock); + if (__predict_false(in_pcbrele(inp, lock))) { + smr_enter(ipi->ipi_smr); + goto restart; + } /* - * We should not be on the hpts at - * this point in any form. we must - * get the lock to be sure. + * See comment in inp_smr_lock(). 
*/ - hpts = tcp_hpts_lock(inp); - if (inp->inp_in_hpts) - panic("Hpts:%p inp:%p at free still on hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - hpts = tcp_input_lock(inp); - if (inp->inp_in_input) - panic("Hpts:%p inp:%p at free still on input hpts", - hpts, inp); - mtx_unlock(&hpts->p_mtx); - } -#endif - INP_WUNLOCK(inp); - pcbinfo = inp->inp_pcbinfo; - uma_zfree(pcbinfo->ipi_zone, inp); - return (1); + if (__predict_false(inp->inp_flags & INP_FREED)) { + inp_unlock(inp, lock); + smr_enter(ipi->ipi_smr); + goto restart; + } + } else + goto next; + +found: + inp_unlock(ii->inp, lock); + ii->inp = inp; + + return (ii->inp); } -static void -inpcbport_free(epoch_context_t ctx) +/* + * in_pcbref() bumps the reference count on an inpcb in order to maintain + * stability of an inpcb pointer despite the inpcb lock being released or + * SMR section exited. + * + * To free a reference later in_pcbrele_(r|w)locked() must be performed. + */ +void +in_pcbref(struct inpcb *inp) { - struct inpcbport *phd; + u_int old __diagused; - phd = __containerof(ctx, struct inpcbport, phd_epoch_ctx); - free(phd, M_PCB); + old = refcount_acquire(&inp->inp_refcount); + KASSERT(old > 0, ("%s: refcount 0", __func__)); } -static void -in_pcbfree_deferred(epoch_context_t ctx) +/* + * Drop a refcount on an inpcb elevated using in_pcbref(), potentially + * freeing the pcb, if the reference was very last. + */ +bool +in_pcbrele_rlocked(struct inpcb *inp) { - struct inpcb *inp; - int released __unused; - inp = __containerof(ctx, struct inpcb, inp_epoch_ctx); + INP_RLOCK_ASSERT(inp); - INP_WLOCK(inp); - CURVNET_SET(inp->inp_vnet); -#ifdef INET - struct ip_moptions *imo = inp->inp_moptions; - inp->inp_moptions = NULL; -#endif - /* XXXRW: Do as much as possible here. 
*/ -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - if (inp->inp_sp != NULL) - ipsec_delete_pcbpolicy(inp); -#endif -#ifdef INET6 - struct ip6_moptions *im6o = NULL; - if (inp->inp_vflag & INP_IPV6PROTO) { - ip6_freepcbopts(inp->in6p_outputopts); - im6o = inp->in6p_moptions; - inp->in6p_moptions = NULL; - } -#endif - if (inp->inp_options) - (void)m_free(inp->inp_options); - inp->inp_vflag = 0; - crfree(inp->inp_cred); -#ifdef MAC - mac_inpcb_destroy(inp); -#endif - released = in_pcbrele_wlocked(inp); - MPASS(released); -#ifdef INET6 - ip6_freemoptions(im6o); -#endif -#ifdef INET - inp_freemoptions(imo); -#endif - CURVNET_RESTORE(); + if (refcount_release(&inp->inp_refcount) == 0) + return (false); + + MPASS(inp->inp_flags & INP_FREED); + MPASS(inp->inp_socket == NULL); + MPASS(inp->inp_in_hpts == 0); + MPASS(inp->inp_in_input == 0); + INP_RUNLOCK(inp); + uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); + return (true); +} + +bool +in_pcbrele_wlocked(struct inpcb *inp) +{ + + INP_WLOCK_ASSERT(inp); + + if (refcount_release(&inp->inp_refcount) == 0) + return (false); + + MPASS(inp->inp_flags & INP_FREED); + MPASS(inp->inp_socket == NULL); + MPASS(inp->inp_in_hpts == 0); + MPASS(inp->inp_in_input == 0); + INP_WUNLOCK(inp); + uma_zfree_smr(inp->inp_pcbinfo->ipi_zone, inp); + return (true); } /* @@ -1698,32 +1789,81 @@ * reference count, which should occur only after the inpcb has been detached * from its socket. If another thread holds a temporary reference (acquired * using in_pcbref()) then the free is deferred until that reference is - * released using in_pcbrele(), but the inpcb is still unlocked. Almost all - * work, including removal from global lists, is done in this context, where - * the pcbinfo lock is held. + * released using in_pcbrele_(r|w)locked(), but the inpcb is still unlocked. + * Almost all work, including removal from global lists, is done in this + * context, where the pcbinfo lock is held. 
*/ void in_pcbfree(struct inpcb *inp) { struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; +#ifdef INET + struct ip_moptions *imo; +#endif +#ifdef INET6 + struct ip6_moptions *im6o; +#endif + INP_WLOCK_ASSERT(inp); KASSERT(inp->inp_socket == NULL, ("%s: inp_socket != NULL", __func__)); - KASSERT((inp->inp_flags2 & INP_FREED) == 0, + KASSERT((inp->inp_flags & INP_FREED) == 0, ("%s: called twice for pcb %p", __func__, inp)); - if (inp->inp_flags2 & INP_FREED) { - INP_WUNLOCK(inp); - return; + + inp->inp_flags |= INP_FREED; + INP_INFO_WLOCK(pcbinfo); + inp->inp_gencnt = ++pcbinfo->ipi_gencnt; + pcbinfo->ipi_count--; + CK_LIST_REMOVE(inp, inp_list); + INP_INFO_WUNLOCK(pcbinfo); + + if (inp->inp_flags & INP_INHASHLIST) { + struct inpcbport *phd = inp->inp_phd; + + INP_HASH_WLOCK(pcbinfo); + /* XXX: Only do if SO_REUSEPORT_LB set? */ + in_pcbremlbgrouphash(inp); + + CK_LIST_REMOVE(inp, inp_hash); + CK_LIST_REMOVE(inp, inp_portlist); + if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { + CK_LIST_REMOVE(phd, phd_hash); + uma_zfree_smr(pcbinfo->ipi_portzone, phd); + } + INP_HASH_WUNLOCK(pcbinfo); + inp->inp_flags &= ~INP_INHASHLIST; } - INP_WLOCK_ASSERT(inp); - INP_LIST_WLOCK(pcbinfo); - in_pcbremlists(inp); - INP_LIST_WUNLOCK(pcbinfo); + crfree(inp->inp_cred); RO_INVALIDATE_CACHE(&inp->inp_route); - /* mark as destruction in progress */ - inp->inp_flags2 |= INP_FREED; - INP_WUNLOCK(inp); - NET_EPOCH_CALL(in_pcbfree_deferred, &inp->inp_epoch_ctx); +#ifdef MAC + mac_inpcb_destroy(inp); +#endif +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + if (inp->inp_sp != NULL) + ipsec_delete_pcbpolicy(inp); +#endif +#ifdef INET + if (inp->inp_options) + (void)m_free(inp->inp_options); + imo = inp->inp_moptions; +#endif +#ifdef INET6 + if (inp->inp_vflag & INP_IPV6PROTO) { + ip6_freepcbopts(inp->in6p_outputopts); + im6o = inp->in6p_moptions; + } else + im6o = NULL; +#endif + + if (__predict_false(in_pcbrele_wlocked(inp) == false)) { + INP_WUNLOCK(inp); + } +#ifdef INET6 + ip6_freemoptions(im6o); 
+#endif +#ifdef INET + inp_freemoptions(imo); +#endif } /* @@ -1764,7 +1904,7 @@ CK_LIST_REMOVE(inp, inp_portlist); if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { CK_LIST_REMOVE(phd, phd_hash); - NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); + uma_zfree_smr(inp->inp_pcbinfo->ipi_portzone, phd); } INP_HASH_WUNLOCK(inp->inp_pcbinfo); inp->inp_flags &= ~INP_INHASHLIST; @@ -1835,7 +1975,7 @@ struct inpcb *inp, *inp_temp; INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { + CK_LIST_FOREACH_SAFE(inp, &pcbinfo->ipi_listhead, inp_list, inp_temp) { INP_WLOCK(inp); #ifdef INET6 if ((inp->inp_vflag & INP_IPV4) == 0) { @@ -1854,49 +1994,57 @@ INP_INFO_WUNLOCK(pcbinfo); } +static bool +inp_v4_multi_match(const struct inpcb *inp, void *v __unused) +{ + + if ((inp->inp_vflag & INP_IPV4) && inp->inp_moptions != NULL) + return (true); + else + return (false); +} + void in_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, + inp_v4_multi_match, NULL); struct inpcb *inp; struct in_multi *inm; struct in_mfilter *imf; struct ip_moptions *imo; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(inp); + IN_MULTI_LOCK_ASSERT(); + + while ((inp = inp_next(&inpi)) != NULL) { + INP_WLOCK_ASSERT(inp); + imo = inp->inp_moptions; - if ((inp->inp_vflag & INP_IPV4) && - imo != NULL) { - /* - * Unselect the outgoing interface if it is being - * detached. - */ - if (imo->imo_multicast_ifp == ifp) - imo->imo_multicast_ifp = NULL; + /* + * Unselect the outgoing interface if it is being + * detached. + */ + if (imo->imo_multicast_ifp == ifp) + imo->imo_multicast_ifp = NULL; - /* - * Drop multicast group membership if we joined - * through the interface being detached. - * - * XXX This can all be deferred to an epoch_call - */ + /* + * Drop multicast group membership if we joined + * through the interface being detached. 
+ * + * XXX This can all be deferred to an epoch_call + */ restart: - IP_MFILTER_FOREACH(imf, &imo->imo_head) { - if ((inm = imf->imf_inm) == NULL) - continue; - if (inm->inm_ifp != ifp) - continue; - ip_mfilter_remove(&imo->imo_head, imf); - IN_MULTI_LOCK_ASSERT(); - in_leavegroup_locked(inm, NULL); - ip_mfilter_free(imf); - goto restart; - } + IP_MFILTER_FOREACH(imf, &imo->imo_head) { + if ((inm = imf->imf_inm) == NULL) + continue; + if (inm->inm_ifp != ifp) + continue; + ip_mfilter_remove(&imo->imo_head, imf); + in_leavegroup_locked(inm, NULL); + ip_mfilter_free(imf); + goto restart; } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -1918,7 +2066,6 @@ KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - INP_HASH_LOCK_ASSERT(pcbinfo); if ((lookupflags & INPLOOKUP_WILDCARD) == 0) { @@ -2081,8 +2228,9 @@ /* * Lookup PCB in hash list, using pcbinfo tables. This variation assumes - * that the caller has locked the hash list, and will not perform any further - * locking or reference operations on either the hash list or the connection. + * that the caller has either locked the hash list, which usually happens + * for bind(2) operations, or is in SMR section, which happens when sorting + * out incoming packets. 
*/ static struct inpcb * in_pcblookup_hash_locked(struct inpcbinfo *pcbinfo, struct in_addr faddr, @@ -2223,20 +2371,15 @@ { struct inpcb *inp; + smr_enter(pcbinfo->ipi_smr); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - } else if (lookupflags & INPLOOKUP_RLOCKPCB) { - INP_RLOCK(inp); - } else - panic("%s: locking bug", __func__); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_UNLOCK(inp); + if (__predict_false(inp_smr_lock(inp, + (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; - } - } + } else + smr_exit(pcbinfo->ipi_smr); return (inp); } @@ -2331,11 +2474,10 @@ * If none exists, malloc one and tack it on. */ if (phd == NULL) { - phd = malloc(sizeof(struct inpcbport), M_PCB, M_NOWAIT); + phd = uma_zalloc_smr(pcbinfo->ipi_portzone, M_NOWAIT); if (phd == NULL) { return (ENOBUFS); /* XXX */ } - bzero(&phd->phd_epoch_ctx, sizeof(struct epoch_context)); phd->phd_port = inp->inp_lport; CK_LIST_INIT(&phd->phd_pcblist); CK_LIST_INSERT_HEAD(pcbporthash, phd, phd_hash); @@ -2353,6 +2495,10 @@ * changed. NOTE: This does not handle the case of the lport changing (the * hashed port list would have to be updated as well), so the lport must * not change after in_pcbinshash() has been called. + * + * XXXGL: a race between this function and SMR-protected hash iterator + * will lead to iterator traversing a possibly wrong hash list. However, + * this race should have been here since change from rwlock to epoch. */ void in_pcbrehash(struct inpcb *inp) @@ -2381,39 +2527,6 @@ CK_LIST_INSERT_HEAD(head, inp, inp_hash); } -/* - * Remove PCB from various lists. 
- */ -static void -in_pcbremlists(struct inpcb *inp) -{ - struct inpcbinfo *pcbinfo = inp->inp_pcbinfo; - - INP_WLOCK_ASSERT(inp); - INP_LIST_WLOCK_ASSERT(pcbinfo); - - inp->inp_gencnt = ++pcbinfo->ipi_gencnt; - if (inp->inp_flags & INP_INHASHLIST) { - struct inpcbport *phd = inp->inp_phd; - - INP_HASH_WLOCK(pcbinfo); - - /* XXX: Only do if SO_REUSEPORT_LB set? */ - in_pcbremlbgrouphash(inp); - - CK_LIST_REMOVE(inp, inp_hash); - CK_LIST_REMOVE(inp, inp_portlist); - if (CK_LIST_FIRST(&phd->phd_pcblist) == NULL) { - CK_LIST_REMOVE(phd, phd_hash); - NET_EPOCH_CALL(inpcbport_free, &phd->phd_epoch_ctx); - } - INP_HASH_WUNLOCK(pcbinfo); - inp->inp_flags &= ~INP_INHASHLIST; - } - CK_LIST_REMOVE(inp, inp_list); - pcbinfo->ipi_count--; -} - /* * Check for alternatives when higher level complains * about service problems. For now, invalidate cached @@ -2548,15 +2661,12 @@ void inp_apply_all(void (*func)(struct inpcb *, void *), void *arg) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); struct inpcb *inp; - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) func(inp, arg); - INP_WUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_tcbinfo); } struct socket * Index: sys/netinet/in_pcb_var.h =================================================================== --- sys/netinet/in_pcb_var.h +++ sys/netinet/in_pcb_var.h @@ -44,6 +44,7 @@ * Definitions shared between netinet/in_pcb.c and netinet6/in6_pcb.c */ +bool inp_smr_lock(struct inpcb *, const inp_lookup_t); int in_pcb_lport(struct inpcb *, struct in_addr *, u_short *, struct ucred *, int); int in_pcb_lport_dest(struct inpcb *inp, struct sockaddr *lsa, @@ -52,4 +53,10 @@ struct inpcb * in_pcblookup_local(struct inpcbinfo *, struct in_addr, u_short, int, struct ucred *); +struct inpcbport { + struct inpcbhead phd_pcblist; + CK_LIST_ENTRY(inpcbport) phd_hash; + u_short phd_port; +}; + #endif /* 
!_NETINET_IN_PCB_VAR_H_ */ Index: sys/netinet/ip_divert.c =================================================================== --- sys/netinet/ip_divert.c +++ sys/netinet/ip_divert.c @@ -111,10 +111,7 @@ */ /* Internal variables. */ -VNET_DEFINE_STATIC(struct inpcbhead, divcb); VNET_DEFINE_STATIC(struct inpcbinfo, divcbinfo); - -#define V_divcb VNET(divcb) #define V_divcbinfo VNET(divcbinfo) static u_long div_sendspace = DIVSNDQ; /* XXX sysctl ? */ @@ -154,8 +151,7 @@ * allocate one-entry hash lists than it is to check all over the * place for hashbase == NULL. */ - in_pcbinfo_init(&V_divcbinfo, "div", &V_divcb, 1, 1, "divcb", - div_inpcb_init, IPI_HASHFIELDS_NONE); + in_pcbinfo_init(&V_divcbinfo, "div", 1, 1, "divcb", div_inpcb_init); } static void @@ -181,6 +177,14 @@ return (IPPROTO_DONE); } +static bool +div_port_match(const struct inpcb *inp, void *v) +{ + uint16_t nport = *(uint16_t *)v; + + return (inp->inp_lport == nport); +} + /* * Divert a packet by passing it up to the divert socket at port 'port'. * @@ -195,6 +199,8 @@ struct socket *sa; u_int16_t nport; struct sockaddr_in divsrc; + struct inpcb_iterator inpi = INP_ITERATOR(&V_divcbinfo, + INPLOOKUP_RLOCKPCB, div_port_match, &nport); struct m_tag *mtag; NET_EPOCH_ASSERT(); @@ -294,27 +300,20 @@ /* Put packet on socket queue, if any */ sa = NULL; + /* nport is inp_next's context. */ nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); - CK_LIST_FOREACH(inp, &V_divcb, inp_list) { + while ((inp = inp_next(&inpi)) != NULL) { + sa = inp->inp_socket; + SOCKBUF_LOCK(&sa->so_rcv); + if (sbappendaddr_locked(&sa->so_rcv, + (struct sockaddr *)&divsrc, m, NULL) == 0) { + soroverflow_locked(sa); + sa = NULL; /* force mbuf reclaim below */ + } else + sorwakeup_locked(sa); /* XXX why does only one socket match? 
*/ - if (inp->inp_lport == nport) { - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } - sa = inp->inp_socket; - SOCKBUF_LOCK(&sa->so_rcv); - if (sbappendaddr_locked(&sa->so_rcv, - (struct sockaddr *)&divsrc, m, - (struct mbuf *)0) == 0) { - soroverflow_locked(sa); - sa = NULL; /* force mbuf reclaim below */ - } else - sorwakeup_locked(sa); - INP_RUNLOCK(inp); - break; - } + INP_RUNLOCK(inp); + break; } if (sa == NULL) { m_freem(m); @@ -609,14 +608,10 @@ error = soreserve(so, div_sendspace, div_recvspace); if (error) return error; - INP_INFO_WLOCK(&V_divcbinfo); error = in_pcballoc(so, &V_divcbinfo); - if (error) { - INP_INFO_WUNLOCK(&V_divcbinfo); + if (error) return error; - } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&V_divcbinfo); inp->inp_ip_p = proto; inp->inp_vflag |= INP_IPV4; inp->inp_flags |= INP_HDRINCL; @@ -631,11 +626,9 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("div_detach: inp == NULL")); - INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_divcbinfo); } static int @@ -658,13 +651,11 @@ if (nam->sa_len != sizeof(struct sockaddr_in)) return EINVAL; ((struct sockaddr_in *)nam)->sin_addr.s_addr = INADDR_ANY; - INP_INFO_WLOCK(&V_divcbinfo); INP_WLOCK(inp); INP_HASH_WLOCK(&V_divcbinfo); error = in_pcbbind(inp, nam, td->td_ucred); INP_HASH_WUNLOCK(&V_divcbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_divcbinfo); return error; } @@ -703,8 +694,9 @@ static int div_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_divcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -732,21 +724,18 @@ if (error) return error; - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen) { struct 
xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - } else - INP_RUNLOCK(inp); + if (error) { + INP_RUNLOCK(inp); + break; + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* Index: sys/netinet/ip_gre.c =================================================================== --- sys/netinet/ip_gre.c +++ sys/netinet/ip_gre.c @@ -223,25 +223,11 @@ in_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { - struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; in_addr_t dst; - NET_EPOCH_ENTER(et); - /* - * udp_append() holds reference to inp, it is safe to check - * inp_flags2 without INP_RLOCK(). - * If socket was closed before we have entered NET_EPOCH section, - * INP_FREED flag should be set. Otherwise it should be safe to - * make access to ctx data, because gre_so will be freed by - * gre_sofree() via NET_EPOCH_CALL(). - */ - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - NET_EPOCH_EXIT(et); - m_freem(m); - return; - } + NET_EPOCH_ASSERT(); gs = (struct gre_socket *)ctx; dst = ((const struct sockaddr_in *)sa)->sin_addr.s_addr; @@ -251,11 +237,9 @@ } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); - NET_EPOCH_EXIT(et); return; } m_freem(m); - NET_EPOCH_EXIT(et); } static int Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c +++ sys/netinet/raw_ip.c @@ -87,10 +87,7 @@ &VNET_NAME(ip_defttl), 0, "Maximum TTL on IP packets"); -VNET_DEFINE(struct inpcbhead, ripcb); VNET_DEFINE(struct inpcbinfo, ripcbinfo); - -#define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) /* @@ -160,7 +157,7 @@ struct inpcbhead *pcbhash; int hash; - INP_INFO_WLOCK_ASSERT(pcbinfo); + INP_HASH_WLOCK_ASSERT(pcbinfo); INP_WLOCK_ASSERT(inp); if (inp->inp_ip_p != 0 && @@ -178,7 +175,7 @@ rip_delhash(struct inpcb *inp) { - 
INP_INFO_WLOCK_ASSERT(inp->inp_pcbinfo); + INP_HASH_WLOCK_ASSERT(inp->inp_pcbinfo); INP_WLOCK_ASSERT(inp); CK_LIST_REMOVE(inp, inp_hash); @@ -212,8 +209,8 @@ rip_init(void) { - in_pcbinfo_init(&V_ripcbinfo, "rip", &V_ripcb, INP_PCBHASH_RAW_SIZE, - 1, "ripcb", rip_inpcb_init, IPI_HASHFIELDS_NONE); + in_pcbinfo_init(&V_ripcbinfo, "rip", INP_PCBHASH_RAW_SIZE, 1, "ripcb", + rip_inpcb_init); EVENTHANDLER_REGISTER(maxsockets_change, rip_zone_change, NULL, EVENTHANDLER_PRI_ANY); } @@ -230,47 +227,90 @@ #ifdef INET static int -rip_append(struct inpcb *last, struct ip *ip, struct mbuf *n, +rip_append(struct inpcb *inp, struct ip *ip, struct mbuf *m, struct sockaddr_in *ripsrc) { - int policyfail = 0; + struct socket *so = inp->inp_socket; + struct mbuf *n, *opts = NULL; - INP_LOCK_ASSERT(last); + INP_LOCK_ASSERT(inp); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* check AH/ESP integrity. */ - if (IPSEC_ENABLED(ipv4)) { - if (IPSEC_CHECK_POLICY(ipv4, n, last) != 0) - policyfail = 1; - } + if (IPSEC_ENABLED(ipv4) && IPSEC_CHECK_POLICY(ipv4, m, inp) != 0) + return (0); #endif /* IPSEC */ #ifdef MAC - if (!policyfail && mac_inpcb_check_deliver(last, n) != 0) - policyfail = 1; + if (mac_inpcb_check_deliver(inp, m) != 0) + return (0); #endif /* Check the minimum TTL for socket. 
*/ - if (last->inp_ip_minttl && last->inp_ip_minttl > ip->ip_ttl) - policyfail = 1; - if (!policyfail) { - struct mbuf *opts = NULL; - struct socket *so; - - so = last->inp_socket; - if ((last->inp_flags & INP_CONTROLOPTS) || - (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) - ip_savecontrol(last, &opts, ip, n); - SOCKBUF_LOCK(&so->so_rcv); - if (sbappendaddr_locked(&so->so_rcv, - (struct sockaddr *)ripsrc, n, opts) == 0) { - soroverflow_locked(so); - m_freem(n); - if (opts) - m_freem(opts); - } else - sorwakeup_locked(so); - } else + if (inp->inp_ip_minttl && inp->inp_ip_minttl > ip->ip_ttl) + return (0); + + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) + return (0); + + if ((inp->inp_flags & INP_CONTROLOPTS) || + (so->so_options & (SO_TIMESTAMP | SO_BINTIME))) + ip_savecontrol(inp, &opts, ip, n); + SOCKBUF_LOCK(&so->so_rcv); + if (sbappendaddr_locked(&so->so_rcv, + (struct sockaddr *)ripsrc, n, opts) == 0) { + soroverflow_locked(so); m_freem(n); - return (policyfail); + if (opts) + m_freem(opts); + return (0); + } + sorwakeup_locked(so); + + return (1); +} + +struct rip_inp_match_ctx { + struct ip *ip; + int proto; +}; + +static bool +rip_inp_match1(const struct inpcb *inp, void *v) +{ + struct rip_inp_match_ctx *ctx = v; + + if (inp->inp_ip_p != ctx->proto) + return (false); +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); +#endif + if (inp->inp_laddr.s_addr != ctx->ip->ip_dst.s_addr) + return (false); + if (inp->inp_faddr.s_addr != ctx->ip->ip_src.s_addr) + return (false); + return (true); +} + +static bool +rip_inp_match2(const struct inpcb *inp, void *v) +{ + struct rip_inp_match_ctx *ctx = v; + + if (inp->inp_ip_p && inp->inp_ip_p != ctx->proto) + return (false); +#ifdef INET6 + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); +#endif + if (!in_nullhost(inp->inp_laddr) && + !in_hosteq(inp->inp_laddr, ctx->ip->ip_dst)) + return (false); + if (!in_nullhost(inp->inp_faddr) && 
+ !in_hosteq(inp->inp_faddr, ctx->ip->ip_src)) + return (false); + return (true); } /* @@ -280,102 +320,57 @@ int rip_input(struct mbuf **mp, int *offp, int proto) { + struct rip_inp_match_ctx ctx = { + .ip = mtod(*mp, struct ip *), + .proto = proto, + }; + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, rip_inp_match1, &ctx); struct ifnet *ifp; struct mbuf *m = *mp; - struct ip *ip = mtod(m, struct ip *); - struct inpcb *inp, *last; + struct inpcb *inp; struct sockaddr_in ripsrc; - int hash; - - NET_EPOCH_ASSERT(); + int appended; *mp = NULL; + appended = 0; bzero(&ripsrc, sizeof(ripsrc)); ripsrc.sin_len = sizeof(ripsrc); ripsrc.sin_family = AF_INET; - ripsrc.sin_addr = ip->ip_src; - last = NULL; + ripsrc.sin_addr = ctx.ip->ip_src; ifp = m->m_pkthdr.rcvif; - hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, - ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); - CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { - if (inp->inp_ip_p != proto) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_laddr.s_addr != ip->ip_dst.s_addr) - continue; - if (inp->inp_faddr.s_addr != ip->ip_src.s_addr) - continue; - if (last != NULL) { - struct mbuf *n; - - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (n != NULL) - (void) rip_append(last, ip, n, &ripsrc); - /* XXX count dropped packet */ - INP_RUNLOCK(last); - last = NULL; - } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_1; - if (jailed_without_vnet(inp->inp_cred)) { + inpi.hash = INP_PCBHASH_RAW(proto, ctx.ip->ip_src.s_addr, + ctx.ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + if (jailed_without_vnet(inp->inp_cred) && + prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) { /* * XXX: If faddr was bound to multicast group, * jailed raw socket will drop datagram. 
*/ - if (prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - goto skip_1; - } - last = inp; - continue; - skip_1: - INP_RUNLOCK(inp); - } - CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[0], inp_hash) { - if (inp->inp_ip_p && inp->inp_ip_p != proto) - continue; -#ifdef INET6 - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV4) == 0) continue; -#endif - if (!in_nullhost(inp->inp_laddr) && - !in_hosteq(inp->inp_laddr, ip->ip_dst)) - continue; - if (!in_nullhost(inp->inp_faddr) && - !in_hosteq(inp->inp_faddr, ip->ip_src)) - continue; - if (last != NULL) { - struct mbuf *n; - - n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - if (n != NULL) - (void) rip_append(last, ip, n, &ripsrc); - /* XXX count dropped packet */ - INP_RUNLOCK(last); - last = NULL; } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_2; - if (jailed_without_vnet(inp->inp_cred)) { + appended += rip_append(inp, ctx.ip, m, &ripsrc); + } + + inpi.hash = 0; + inpi.match = rip_inp_match2; + MPASS(inpi.inp == NULL); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + if (jailed_without_vnet(inp->inp_cred) && + !IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr)) && + prison_check_ip4(inp->inp_cred, &ctx.ip->ip_dst) != 0) /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ - if (!IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) && - prison_check_ip4(inp->inp_cred, &ip->ip_dst) != 0) - goto skip_2; - } + continue; /* * If this raw socket has multicast state, and we * have received a multicast, check if this socket @@ -383,7 +378,7 @@ * the responsibility of the transport layer. */ if (inp->inp_moptions != NULL && - IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + IN_MULTICAST(ntohl(ctx.ip->ip_dst.s_addr))) { /* * If the incoming datagram is for IGMP, allow it * through unconditionally to the raw socket. 
@@ -405,7 +400,7 @@ bzero(&group, sizeof(struct sockaddr_in)); group.sin_len = sizeof(struct sockaddr_in); group.sin_family = AF_INET; - group.sin_addr = ip->ip_dst; + group.sin_addr = ctx.ip->ip_dst; blocked = imo_multi_filter(inp->inp_moptions, ifp, @@ -415,27 +410,18 @@ if (blocked != MCAST_PASS) { IPSTAT_INC(ips_notmember); - goto skip_2; + continue; } } - last = inp; - continue; - skip_2: - INP_RUNLOCK(inp); - } - if (last != NULL) { - if (rip_append(last, ip, m, &ripsrc) != 0) - IPSTAT_INC(ips_delivered); - INP_RUNLOCK(last); - } else { - if (inetsw[ip_protox[ip->ip_p]].pr_input == rip_input) { - IPSTAT_INC(ips_noproto); - IPSTAT_DEC(ips_delivered); - icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); - } else { - m_freem(m); - } + appended += rip_append(inp, ctx.ip, m, &ripsrc); } + if (appended == 0 && + inetsw[ip_protox[ctx.ip->ip_p]].pr_input == rip_input) { + IPSTAT_INC(ips_noproto); + IPSTAT_DEC(ips_delivered); + icmp_error(m, ICMP_UNREACH, ICMP_UNREACH_PROTOCOL, 0, 0); + } else + m_freem(m); return (IPPROTO_DONE); } @@ -898,18 +884,16 @@ error = soreserve(so, rip_sendspace, rip_recvspace); if (error) return (error); - INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); - if (error) { - INP_INFO_WUNLOCK(&V_ripcbinfo); + if (error) return (error); - } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV4; inp->inp_ip_p = proto; inp->inp_ip_ttl = V_ip_defttl; + INP_HASH_WLOCK(&V_ripcbinfo); rip_inshash(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); + INP_HASH_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); return (0); } @@ -924,9 +908,10 @@ KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("rip_detach: not closed")); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); + INP_HASH_WUNLOCK(&V_ripcbinfo); if (so == V_ip_mrouter && ip_mrouter_done) ip_mrouter_done(); if (ip_rsvp_force_done) @@ -935,7 +920,6 @@ ip_rsvp_done(); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } 
static void @@ -944,16 +928,16 @@ struct inpcbinfo *pcbinfo; pcbinfo = inp->inp_pcbinfo; - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(pcbinfo); rip_delhash(inp); inp->inp_faddr.s_addr = INADDR_ANY; rip_inshash(inp); + INP_HASH_WUNLOCK(pcbinfo); SOCK_LOCK(so); so->so_state &= ~SS_ISCONNECTED; SOCK_UNLOCK(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); } static void @@ -1019,13 +1003,13 @@ ifa_ifwithaddr_check((struct sockaddr *)addr) == 0)) return (EADDRNOTAVAIL); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); inp->inp_laddr = addr->sin_addr; rip_inshash(inp); + INP_HASH_WUNLOCK(&V_ripcbinfo); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1045,14 +1029,14 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("rip_connect: inp == NULL")); - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); + INP_HASH_WLOCK(&V_ripcbinfo); rip_delhash(inp); inp->inp_faddr = addr->sin_addr; rip_inshash(inp); + INP_HASH_WUNLOCK(&V_ripcbinfo); soisconnected(so); INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); return (0); } @@ -1118,8 +1102,9 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -1147,24 +1132,19 @@ if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - } else - INP_RUNLOCK(inp); + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* Index: sys/netinet/tcp_hpts.c =================================================================== --- 
sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -579,28 +579,10 @@ static void tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line) { - int32_t add_freed; int32_t ret; - if (inp->inp_flags2 & INP_FREED) { - /* - * Need to play a special trick so that in_pcbrele_wlocked - * does not return 1 when it really should have returned 0. - */ - add_freed = 1; - inp->inp_flags2 &= ~INP_FREED; - } else { - add_freed = 0; - } -#ifndef INP_REF_DEBUG ret = in_pcbrele_wlocked(inp); -#else - ret = __in_pcbrele_wlocked(inp, line); -#endif KASSERT(ret != 1, ("inpcb:%p release ret 1", inp)); - if (add_freed) { - inp->inp_flags2 |= INP_FREED; - } } static void @@ -1291,8 +1273,7 @@ #ifdef VIMAGE CURVNET_SET(inp->inp_vnet); #endif - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { out: hpts->p_inp = NULL; if (in_pcbrele_wlocked(inp) == 0) { @@ -1593,8 +1574,7 @@ hpts->p_inp = NULL; continue; } - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { out_now: KASSERT(mtx_owned(&hpts->p_mtx) == 0, ("Hpts:%p owns mtx prior-to lock line:%d", Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -244,8 +244,6 @@ &VNET_NAME(tcp_autorcvbuf_max), 0, "Max size of automatic receive buffer"); -VNET_DEFINE(struct inpcbhead, tcb); -#define tcb6 tcb /* for KAME src sync over BSD*'s */ VNET_DEFINE(struct inpcbinfo, tcbinfo); /* Index: sys/netinet/tcp_lro.c =================================================================== --- sys/netinet/tcp_lro.c +++ sys/netinet/tcp_lro.c @@ -1310,8 +1310,7 @@ /* Check if the inp is dead, Jim. 
*/ if (tp == NULL || - (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT)) || - (inp->inp_flags2 & INP_FREED)) { + (inp->inp_flags & (INP_DROPPED | INP_TIMEWAIT))) { INP_WUNLOCK(inp); return (TCP_LRO_CANNOT); } Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1373,6 +1373,8 @@ * to the default stack. */ if (force && blk->tfb_refcnt) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); @@ -1382,22 +1384,14 @@ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inp, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inp); - if (inp->inp_flags & INP_TIMEWAIT) { - INP_WUNLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { + if (inp->inp_flags & INP_TIMEWAIT) continue; - } tp = intotcpcb(inp); - if (tp == NULL || tp->t_fb != blk) { - INP_WUNLOCK(inp); + if (tp == NULL || tp->t_fb != blk) continue; - } tcp_switch_back_to_default(tp); - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -1485,8 +1479,8 @@ "clipped from %d to %d.\n", __func__, oldhashsize, hashsize); } - in_pcbinfo_init(&V_tcbinfo, "tcp", &V_tcb, hashsize, hashsize, - "tcp_inpcb", tcp_inpcb_init, IPI_HASHFIELDS_4TUPLE); + in_pcbinfo_init(&V_tcbinfo, "tcp", hashsize, hashsize, + "tcp_inpcb", tcp_inpcb_init); /* * These have to be type stable for the benefit of the timers. @@ -1596,9 +1590,9 @@ * Sleep to let all tcpcb timers really disappear and cleanup. 
*/ for (;;) { - INP_LIST_RLOCK(&V_tcbinfo); + INP_INFO_WLOCK(&V_tcbinfo); n = V_tcbinfo.ipi_count; - INP_LIST_RUNLOCK(&V_tcbinfo); + INP_INFO_WUNLOCK(&V_tcbinfo); if (n == 0) break; pause("tcpdes", hz / 10); @@ -2283,6 +2277,8 @@ struct inpcb *inp; struct tcpcb *tp; VNET_ITERATOR_DECL(vnet_iter); + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); /* * Check all active control blocks across all network stacks and change @@ -2292,17 +2288,12 @@ VNET_LIST_RLOCK(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - INP_INFO_WLOCK(&V_tcbinfo); /* - * New connections already part way through being initialised - * with the CC algo we're removing will not race with this code - * because the INP_INFO_WLOCK is held during initialisation. We - * therefore don't enter the loop below until the connection - * list has stabilised. + * XXXGL: would new accept(2)d connections use algo being + * unloaded? */ newalgo = CC_DEFAULT_ALGO(); - CK_LIST_FOREACH(inp, &V_tcb, inp_list) { - INP_WLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { /* Important to skip tcptw structs. */ if (!(inp->inp_flags & INP_TIMEWAIT) && (tp = intotcpcb(inp)) != NULL) { @@ -2336,7 +2327,6 @@ * need to try again. 
*/ INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); VNET_LIST_RUNLOCK(); return (err); @@ -2353,9 +2343,7 @@ } } } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK(); @@ -2373,7 +2361,6 @@ struct socket *so = tp->t_inpcb->inp_socket; NET_EPOCH_ASSERT(); - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); if (TCPS_HAVERCVDSYN(tp->t_state)) { @@ -2559,7 +2546,6 @@ struct inpcb *inp = tp->t_inpcb; struct socket *so; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); #ifdef TCP_OFFLOAD @@ -2598,6 +2584,8 @@ void tcp_drain(void) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_WLOCKPCB); VNET_ITERATOR_DECL(vnet_iter); if (!do_tcpdrain) @@ -2617,13 +2605,9 @@ * where we're really low on mbufs, this is potentially * useful. */ - INP_INFO_WLOCK(&V_tcbinfo); - CK_LIST_FOREACH(inpb, V_tcbinfo.ipi_listhead, inp_list) { - INP_WLOCK(inpb); - if (inpb->inp_flags & INP_TIMEWAIT) { - INP_WUNLOCK(inpb); + while ((inpb = inp_next(&inpi)) != NULL) { + if (inpb->inp_flags & INP_TIMEWAIT) continue; - } if ((tcpb = intotcpcb(inpb)) != NULL) { tcp_reass_flush(tcpb); tcp_clean_sackreport(tcpb); @@ -2638,9 +2622,7 @@ } #endif } - INP_WUNLOCK(inpb); } - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); @@ -2659,7 +2641,6 @@ { struct tcpcb *tp; - INP_INFO_LOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -2705,9 +2686,10 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { - struct epoch_tracker et; - struct inpcb *inp; + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_tcbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; + struct inpcb *inp; int error; if (req->newptr != NULL) @@ -2740,11 +2722,7 @@ if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if 
(inp->inp_gencnt <= xig.xig_gen) { int crerr; @@ -2765,17 +2743,15 @@ struct xtcpcb xt; tcp_inptoxtp(inp, &xt); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xt, sizeof xt); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - else + } else continue; } } - INP_RUNLOCK(inp); } - NET_EPOCH_EXIT(et); if (!error) { /* Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -908,7 +908,6 @@ VNET_DECLARE(int, tcp_sendspace); VNET_DECLARE(int, tcp_udp_tunneling_overhead); VNET_DECLARE(int, tcp_udp_tunneling_port); -VNET_DECLARE(struct inpcbhead, tcb); VNET_DECLARE(struct inpcbinfo, tcbinfo); #define V_tcp_do_lrd VNET(tcp_do_lrd) @@ -917,7 +916,6 @@ #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) -#define V_tcb VNET(tcb) #define V_tcbinfo VNET(tcbinfo) #define V_tcp_abc_l_var VNET(tcp_abc_l_var) #define V_tcp_autorcvbuf_max VNET(tcp_autorcvbuf_max) Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -147,9 +147,7 @@ SYSCTL_ULONG(_net_inet_udp, UDPCTL_RECVSPACE, recvspace, CTLFLAG_RW, &udp_recvspace, 0, "Maximum space for incoming UDP datagrams"); -VNET_DEFINE(struct inpcbhead, udb); /* from udp_var.h */ VNET_DEFINE(struct inpcbinfo, udbinfo); -VNET_DEFINE(struct inpcbhead, ulitecb); VNET_DEFINE(struct inpcbinfo, ulitecbinfo); VNET_DEFINE_STATIC(uma_zone_t, udpcb_zone); #define V_udpcb_zone VNET(udpcb_zone) @@ -211,8 +209,8 @@ * Once we can calculate the flowid that way and re-establish * a 4-tuple, flip this to 4-tuple. 
*/ - in_pcbinfo_init(&V_udbinfo, "udp", &V_udb, UDBHASHSIZE, UDBHASHSIZE, - "udp_inpcb", udp_inpcb_init, IPI_HASHFIELDS_2TUPLE); + in_pcbinfo_init(&V_udbinfo, "udp", UDBHASHSIZE, UDBHASHSIZE, + "udp_inpcb", udp_inpcb_init); V_udpcb_zone = uma_zcreate("udpcb", sizeof(struct udpcb), NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); uma_zone_set_max(V_udpcb_zone, maxsockets); @@ -225,9 +223,8 @@ udplite_init(void) { - in_pcbinfo_init(&V_ulitecbinfo, "udplite", &V_ulitecb, UDBHASHSIZE, - UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init, - IPI_HASHFIELDS_2TUPLE); + in_pcbinfo_init(&V_ulitecbinfo, "udplite", UDBHASHSIZE, + UDBHASHSIZE, "udplite_inpcb", udplite_inpcb_init); } /* @@ -393,6 +390,123 @@ return (0); } +static bool +udp_multi_match(const struct inpcb *inp, void *v) +{ + struct ip *ip = v; + struct udphdr *uh = (struct udphdr *)(ip + 1); + + if (inp->inp_lport != uh->uh_dport) + return (false); +#ifdef INET6 + if ((inp->inp_vflag & INP_IPV4) == 0) + return (false); +#endif + if (inp->inp_laddr.s_addr != INADDR_ANY && + inp->inp_laddr.s_addr != ip->ip_dst.s_addr) + return (false); + if (inp->inp_faddr.s_addr != INADDR_ANY && + inp->inp_faddr.s_addr != ip->ip_src.s_addr) + return (false); + if (inp->inp_fport != 0 && + inp->inp_fport != uh->uh_sport) + return (false); + + return (true); +} + +static int +udp_multi_input(struct mbuf *m, int proto, struct sockaddr_in *udp_in) +{ + struct ip *ip = mtod(m, struct ip *); + struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), + INPLOOKUP_RLOCKPCB, udp_multi_match, ip); + struct udphdr *uh = (struct udphdr *)(ip + 1); + struct inpcb *inp; + struct mbuf *n; + int appends = 0; + + MPASS(ip->ip_hl == sizeof(struct ip) >> 2); + + while ((inp = inp_next(&inpi)) != NULL) { + /* + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is held. 
+ */ + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. [RFC3678] + */ + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { + struct ip_moptions *imo; + struct sockaddr_in group; + int blocked; + + imo = inp->inp_moptions; + if (imo == NULL) + continue; + bzero(&group, sizeof(struct sockaddr_in)); + group.sin_len = sizeof(struct sockaddr_in); + group.sin_family = AF_INET; + group.sin_addr = ip->ip_dst; + + blocked = imo_multi_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&group, + (struct sockaddr *)&udp_in[0]); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + IPSTAT_INC(ips_notmember); + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) + UDPSTAT_INC(udps_filtermcast); + continue; + } + } + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ip, inp, uh); + else + UDP_PROBE(receive, NULL, inp, ip, inp, uh); + if (udp_append(inp, ip, n, sizeof(struct ip), udp_in)) { + INP_RUNLOCK(inp); + break; + } else + appends++; + } + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids + * searching through all pcbs in the common case of a + * non-shared port. It assumes that an application + * will never clear these options after setting them. + */ + if ((inp->inp_socket->so_options & + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { + INP_RUNLOCK(inp); + break; + } + } + m_freem(m); + + if (appends == 0) { + /* + * No matching pcb found; discard datagram. (No need + * to send an ICMP Port Unreachable for a broadcast + * or multicast datgram.) 
+ */ + UDPSTAT_INC(udps_noport); + if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) + UDPSTAT_INC(udps_noportmcast); + else + UDPSTAT_INC(udps_noportbcast); + } + + return (IPPROTO_DONE); +} + int udp_input(struct mbuf **mp, int *offp, int proto) { @@ -519,140 +633,15 @@ } } - pcbinfo = udp_get_inpcbinfo(proto); if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr)) || - in_broadcast(ip->ip_dst, ifp)) { - struct inpcb *last; - struct inpcbhead *pcblist; - - NET_EPOCH_ASSERT(); - - pcblist = udp_get_pcblist(proto); - last = NULL; - CK_LIST_FOREACH(inp, pcblist, inp_list) { - if (inp->inp_lport != uh->uh_dport) - continue; -#ifdef INET6 - if ((inp->inp_vflag & INP_IPV4) == 0) - continue; -#endif - if (inp->inp_laddr.s_addr != INADDR_ANY && - inp->inp_laddr.s_addr != ip->ip_dst.s_addr) - continue; - if (inp->inp_faddr.s_addr != INADDR_ANY && - inp->inp_faddr.s_addr != ip->ip_src.s_addr) - continue; - if (inp->inp_fport != 0 && - inp->inp_fport != uh->uh_sport) - continue; - - INP_RLOCK(inp); - - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } - - /* - * XXXRW: Because we weren't holding either the inpcb - * or the hash lock when we checked for a match - * before, we should probably recheck now that the - * inpcb lock is held. - */ - - /* - * Handle socket delivery policy for any-source - * and source-specific multicast. 
[RFC3678] - */ - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) { - struct ip_moptions *imo; - struct sockaddr_in group; - int blocked; - - imo = inp->inp_moptions; - if (imo == NULL) { - INP_RUNLOCK(inp); - continue; - } - bzero(&group, sizeof(struct sockaddr_in)); - group.sin_len = sizeof(struct sockaddr_in); - group.sin_family = AF_INET; - group.sin_addr = ip->ip_dst; - - blocked = imo_multi_filter(imo, ifp, - (struct sockaddr *)&group, - (struct sockaddr *)&udp_in[0]); - if (blocked != MCAST_PASS) { - if (blocked == MCAST_NOTGMEMBER) - IPSTAT_INC(ips_notmember); - if (blocked == MCAST_NOTSMEMBER || - blocked == MCAST_MUTED) - UDPSTAT_INC(udps_filtermcast); - INP_RUNLOCK(inp); - continue; - } - } - if (last != NULL) { - struct mbuf *n; - - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != - NULL) { - if (proto == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip, - last, uh); - else - UDP_PROBE(receive, NULL, last, ip, last, - uh); - if (udp_append(last, ip, n, iphlen, - udp_in)) { - INP_RUNLOCK(inp); - goto badunlocked; - } - } - /* Release PCB lock taken on previous pass. */ - INP_RUNLOCK(last); - } - last = inp; - /* - * Don't look for additional matches if this one does - * not have either the SO_REUSEPORT or SO_REUSEADDR - * socket options set. This heuristic avoids - * searching through all pcbs in the common case of a - * non-shared port. It assumes that an application - * will never clear these options after setting them. - */ - if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) - break; - } + in_broadcast(ip->ip_dst, ifp)) + return (udp_multi_input(m, proto, udp_in)); - if (last == NULL) { - /* - * No matching pcb found; discard datagram. (No need - * to send an ICMP Port Unreachable for a broadcast - * or multicast datgram.) 
- */ - UDPSTAT_INC(udps_noport); - if (IN_MULTICAST(ntohl(ip->ip_dst.s_addr))) - UDPSTAT_INC(udps_noportmcast); - else - UDPSTAT_INC(udps_noportbcast); - goto badunlocked; - } - if (proto == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip, last, uh); - else - UDP_PROBE(receive, NULL, last, ip, last, uh); - if (udp_append(last, ip, m, iphlen, udp_in) == 0) - INP_RUNLOCK(last); - return (IPPROTO_DONE); - } + pcbinfo = udp_get_inpcbinfo(proto); /* * Locate pcb for datagram. - */ - - /* + * * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ if ((m->m_flags & M_IP_NEXTHOP) && @@ -852,8 +841,9 @@ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { + struct inpcb_iterator inpi = INP_ALL_ITERATOR(&V_udbinfo, + INPLOOKUP_RLOCKPCB); struct xinpgen xig; - struct epoch_tracker et; struct inpcb *inp; int error; @@ -881,24 +871,19 @@ if (error) return (error); - NET_EPOCH_ENTER(et); - for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead); - inp != NULL; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_RLOCK(inp); + while ((inp = inp_next(&inpi)) != NULL) { if (inp->inp_gencnt <= xig.xig_gen && cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); - INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); - if (error) + if (error) { + INP_RUNLOCK(inp); break; - } else - INP_RUNLOCK(inp); + } + } } - NET_EPOCH_EXIT(et); if (!error) { /* @@ -1284,15 +1269,16 @@ laddr = inp->inp_laddr; lport = inp->inp_lport; if (src.sin_family == AF_INET) { - INP_HASH_LOCK_ASSERT(pcbinfo); if ((lport == 0) || (laddr.s_addr == INADDR_ANY && src.sin_addr.s_addr == INADDR_ANY)) { error = EINVAL; goto release; } + INP_HASH_WLOCK(pcbinfo); error = in_pcbbind_setup(inp, (struct sockaddr *)&src, &laddr.s_addr, &lport, td->td_ucred); + INP_HASH_WUNLOCK(pcbinfo); if (error) goto release; } @@ -1335,12 +1321,14 @@ inp->inp_lport == 0 || sin->sin_addr.s_addr == INADDR_ANY || sin->sin_addr.s_addr == INADDR_BROADCAST) { - INP_HASH_LOCK_ASSERT(pcbinfo); + 
INP_HASH_WLOCK(pcbinfo); error = in_pcbconnect_setup(inp, addr, &laddr.s_addr, &lport, &faddr.s_addr, &fport, NULL, td->td_ucred); - if (error) + if (error) { + INP_HASH_WUNLOCK(pcbinfo); goto release; + } /* * XXXRW: Why not commit the port if the address is @@ -1357,7 +1345,6 @@ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; - INP_HASH_WLOCK(pcbinfo); error = in_pcbinshash(inp); INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { @@ -1366,7 +1353,8 @@ goto release; } inp->inp_flags |= INP_ANONPORT; - } + } else + INP_HASH_WUNLOCK(pcbinfo); } else { faddr = sin->sin_addr; fport = sin->sin_port; @@ -1560,12 +1548,9 @@ error = soreserve(so, udp_sendspace, udp_recvspace); if (error) return (error); - INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); - if (error) { - INP_INFO_WUNLOCK(pcbinfo); + if (error) return (error); - } inp = sotoinpcb(so); inp->inp_vflag |= INP_IPV4; @@ -1577,12 +1562,10 @@ if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); return (error); } - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); + return (0); } #endif /* INET */ @@ -1718,14 +1701,12 @@ KASSERT(inp != NULL, ("udp_detach: inp == NULL")); KASSERT(inp->inp_faddr.s_addr == INADDR_ANY, ("udp_detach: not disconnected")); - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); inp->inp_ppcb = NULL; in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); } Index: sys/netinet/udp_var.h =================================================================== --- sys/netinet/udp_var.h +++ sys/netinet/udp_var.h @@ -136,13 +136,9 @@ SYSCTL_DECL(_net_inet_udp); extern struct pr_usrreqs udp_usrreqs; -VNET_DECLARE(struct inpcbhead, udb); VNET_DECLARE(struct inpcbinfo, udbinfo); -VNET_DECLARE(struct inpcbhead, ulitecb); VNET_DECLARE(struct inpcbinfo, ulitecbinfo); -#define V_udb VNET(udb) #define V_udbinfo VNET(udbinfo) -#define V_ulitecb VNET(ulitecb) 
#define V_ulitecbinfo VNET(ulitecbinfo) extern u_long udp_sendspace; @@ -165,12 +161,6 @@ return (protocol == IPPROTO_UDP) ? &V_udbinfo : &V_ulitecbinfo; } -static __inline struct inpcbhead * -udp_get_pcblist(int protocol) -{ - return (protocol == IPPROTO_UDP) ? &V_udb : &V_ulitecb; -} - int udp_newudpcb(struct inpcb *); void udp_discardcb(struct udpcb *); Index: sys/netinet6/icmp6.c =================================================================== --- sys/netinet6/icmp6.c +++ sys/netinet6/icmp6.c @@ -124,14 +124,12 @@ #endif /* VIMAGE */ VNET_DECLARE(struct inpcbinfo, ripcbinfo); -VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(int, icmp6errppslim); VNET_DEFINE_STATIC(int, icmp6errpps_count) = 0; VNET_DEFINE_STATIC(struct timeval, icmp6errppslim_last); VNET_DECLARE(int, icmp6_nodeinfo); #define V_ripcbinfo VNET(ripcbinfo) -#define V_ripcb VNET(ripcb) #define V_icmp6errppslim VNET(icmp6errppslim) #define V_icmp6errpps_count VNET(icmp6errpps_count) #define V_icmp6errppslim_last VNET(icmp6errppslim_last) @@ -1875,21 +1873,39 @@ return (copied); } +static bool +icmp6_rip6_match(const struct inpcb *inp, void *v) +{ + struct ip6_hdr *ip6 = v; + + if ((inp->inp_vflag & INP_IPV6) == 0) + return (false); + if (inp->inp_ip_p != IPPROTO_ICMPV6) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) + return (false); + return (true); +} + /* * XXX almost dup'ed code with rip6_input. 
*/ static int icmp6_rip6_input(struct mbuf **mp, int off) { - struct mbuf *m = *mp; + struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, icmp6_rip6_match, ip6); struct inpcb *inp; - struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; struct mbuf *opts = NULL; - - NET_EPOCH_ASSERT(); + int delivered = 0; /* This is assumed to be safe; icmp6_input() does a pullup. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); @@ -1908,125 +1924,64 @@ return (IPPROTO_DONE); } - CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (inp->inp_ip_p != IPPROTO_ICMPV6) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) - continue; - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } + while ((inp = inp_next(&inpi)) != NULL) { if (ICMP6_FILTER_WILLBLOCK(icmp6->icmp6_type, - inp->in6p_icmp6filt)) { - INP_RUNLOCK(inp); + inp->in6p_icmp6filt)) continue; - } - if (last != NULL) { - struct mbuf *n = NULL; - - /* - * Recent network drivers tend to allocate a single - * mbuf cluster, rather than to make a couple of - * mbufs without clusters. Also, since the IPv6 code - * path tries to avoid m_pullup(), it is highly - * probable that we still have an mbuf cluster here - * even though the necessary length can be stored in an - * mbuf's internal buffer. - * Meanwhile, the default size of the receive socket - * buffer for raw sockets is not so large. This means - * the possibility of packet loss is relatively higher - * than before. To avoid this scenario, we copy the - * received data to a separate mbuf that does not use - * a cluster, if possible. 
- * XXX: it is better to copy the data after stripping - * intermediate headers. - */ - if ((m->m_flags & M_EXT) && m->m_next == NULL && - m->m_len <= MHLEN) { - n = m_get(M_NOWAIT, m->m_type); - if (n != NULL) { - if (m_dup_pkthdr(n, m, M_NOWAIT)) { - bcopy(m->m_data, n->m_data, - m->m_len); - n->m_len = m->m_len; - } else { - m_free(n); - n = NULL; - } - } - } - if (n != NULL || - (n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { - if (last->inp_flags & INP_CONTROLOPTS) - ip6_savecontrol(last, n, &opts); - /* strip intermediate headers */ - m_adj(n, off); - SOCKBUF_LOCK(&last->inp_socket->so_rcv); - if (sbappendaddr_locked( - &last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, n, opts) - == 0) { - soroverflow_locked(last->inp_socket); - m_freem(n); - if (opts) { - m_freem(opts); - } - } else - sorwakeup_locked(last->inp_socket); - opts = NULL; - } - INP_RUNLOCK(last); - } - last = inp; - } - if (last != NULL) { - if (last->inp_flags & INP_CONTROLOPTS) - ip6_savecontrol(last, m, &opts); - /* strip intermediate headers */ - m_adj(m, off); - - /* avoid using mbuf clusters if possible (see above) */ + /* + * Recent network drivers tend to allocate a single + * mbuf cluster, rather than to make a couple of + * mbufs without clusters. Also, since the IPv6 code + * path tries to avoid m_pullup(), it is highly + * probable that we still have an mbuf cluster here + * even though the necessary length can be stored in an + * mbuf's internal buffer. + * Meanwhile, the default size of the receive socket + * buffer for raw sockets is not so large. This means + * the possibility of packet loss is relatively higher + * than before. To avoid this scenario, we copy the + * received data to a separate mbuf that does not use + * a cluster, if possible. + * XXX: it is better to copy the data after stripping + * intermediate headers. 
+ */ if ((m->m_flags & M_EXT) && m->m_next == NULL && m->m_len <= MHLEN) { - struct mbuf *n; - n = m_get(M_NOWAIT, m->m_type); if (n != NULL) { if (m_dup_pkthdr(n, m, M_NOWAIT)) { bcopy(m->m_data, n->m_data, m->m_len); n->m_len = m->m_len; - - m_freem(m); - m = n; } else { - m_freem(n); + m_free(n); n = NULL; } } - } - SOCKBUF_LOCK(&last->inp_socket->so_rcv); - if (sbappendaddr_locked(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, m, opts) == 0) { - m_freem(m); + } else + n = m_copym(m, 0, M_COPYALL, M_NOWAIT); + if (n == NULL) + continue; + if (inp->inp_flags & INP_CONTROLOPTS) + ip6_savecontrol(inp, n, &opts); + /* strip intermediate headers */ + m_adj(n, off); + SOCKBUF_LOCK(&inp->inp_socket->so_rcv); + if (sbappendaddr_locked(&inp->inp_socket->so_rcv, + (struct sockaddr *)&fromsa, n, opts) == 0) { + soroverflow_locked(inp->inp_socket); + m_freem(n); if (opts) m_freem(opts); - soroverflow_locked(last->inp_socket); - } else - sorwakeup_locked(last->inp_socket); - INP_RUNLOCK(last); - } else { - m_freem(m); - IP6STAT_DEC(ip6s_delivered); + } else { + sorwakeup_locked(inp->inp_socket); + delivered++; + } + opts = NULL; } + m_freem(m); *mp = NULL; + if (delivered == 0) + IP6STAT_DEC(ip6s_delivered); return (IPPROTO_DONE); } Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c +++ sys/netinet6/in6_pcb.c @@ -673,13 +673,21 @@ * Call the protocol specific routine (if any) to report * any errors for each matching socket. 
*/ +static bool +inp_match6(const struct inpcb *inp, void *v __unused) +{ + + return ((inp->inp_vflag & INP_IPV6) != 0); +} void in6_pcbnotify(struct inpcbinfo *pcbinfo, struct sockaddr *dst, u_int fport_arg, const struct sockaddr *src, u_int lport_arg, int cmd, void *cmdarg, struct inpcb *(*notify)(struct inpcb *, int)) { - struct inpcb *inp, *inp_temp; + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_WLOCKPCB, + inp_match6, NULL); + struct inpcb *inp; struct sockaddr_in6 sa6_src, *sa6_dst; u_short fport = fport_arg, lport = lport_arg; u_int32_t flowinfo; @@ -715,14 +723,8 @@ notify = in6_rtchange; } errno = inet6ctlerrmap[cmd]; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH_SAFE(inp, pcbinfo->ipi_listhead, inp_list, inp_temp) { - INP_WLOCK(inp); - if ((inp->inp_vflag & INP_IPV6) == 0) { - INP_WUNLOCK(inp); - continue; - } - + while ((inp = inp_next(&inpi)) != NULL) { + INP_WLOCK_ASSERT(inp); /* * If the error designates a new path MTU for a destination * and the application (associated with this socket) wanted to @@ -754,18 +756,13 @@ !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &sa6_src.sin6_addr)) || (fport && inp->inp_fport != fport)) { - INP_WUNLOCK(inp); continue; } do_notify: - if (notify) { - if ((*notify)(inp, errno)) - INP_WUNLOCK(inp); - } else - INP_WUNLOCK(inp); + if (notify) + (*notify)(inp, errno); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -866,49 +863,54 @@ } } +static bool +in6_multi_match(const struct inpcb *inp, void *v __unused) +{ + + if ((inp->inp_vflag & INP_IPV6) && inp->in6p_moptions != NULL) + return (true); + else + return (false); +} + void in6_pcbpurgeif0(struct inpcbinfo *pcbinfo, struct ifnet *ifp) { + struct inpcb_iterator inpi = INP_ITERATOR(pcbinfo, INPLOOKUP_RLOCKPCB, + in6_multi_match, NULL); struct inpcb *inp; struct in6_multi *inm; struct in6_mfilter *imf; struct ip6_moptions *im6o; - INP_INFO_WLOCK(pcbinfo); - CK_LIST_FOREACH(inp, pcbinfo->ipi_listhead, inp_list) { - INP_WLOCK(inp); - if (__predict_false(inp->inp_flags2 & 
INP_FREED)) { - INP_WUNLOCK(inp); - continue; - } + IN6_MULTI_LOCK_ASSERT(); + + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + im6o = inp->in6p_moptions; - if ((inp->inp_vflag & INP_IPV6) && im6o != NULL) { - /* - * Unselect the outgoing ifp for multicast if it - * is being detached. - */ - if (im6o->im6o_multicast_ifp == ifp) - im6o->im6o_multicast_ifp = NULL; - /* - * Drop multicast group membership if we joined - * through the interface being detached. - */ + /* + * Unselect the outgoing ifp for multicast if it + * is being detached. + */ + if (im6o->im6o_multicast_ifp == ifp) + im6o->im6o_multicast_ifp = NULL; + /* + * Drop multicast group membership if we joined + * through the interface being detached. + */ restart: - IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { - if ((inm = imf->im6f_in6m) == NULL) - continue; - if (inm->in6m_ifp != ifp) - continue; - ip6_mfilter_remove(&im6o->im6o_head, imf); - IN6_MULTI_LOCK_ASSERT(); - in6_leavegroup_locked(inm, NULL); - ip6_mfilter_free(imf); - goto restart; - } + IP6_MFILTER_FOREACH(imf, &im6o->im6o_head) { + if ((inm = imf->im6f_in6m) == NULL) + continue; + if (inm->in6m_ifp != ifp) + continue; + ip6_mfilter_remove(&im6o->im6o_head, imf); + in6_leavegroup_locked(inm, NULL); + ip6_mfilter_free(imf); + goto restart; } - INP_WUNLOCK(inp); } - INP_INFO_WUNLOCK(pcbinfo); } /* @@ -1124,20 +1126,16 @@ { struct inpcb *inp; + smr_enter(pcbinfo->ipi_smr); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, lookupflags & INPLOOKUP_WILDCARD, ifp, numa_domain); if (inp != NULL) { - if (lookupflags & INPLOOKUP_WLOCKPCB) { - INP_WLOCK(inp); - } else if (lookupflags & INPLOOKUP_RLOCKPCB) { - INP_RLOCK(inp); - } else - panic("%s: locking bug", __func__); - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_UNLOCK(inp); + if (__predict_false(inp_smr_lock(inp, + (lookupflags & INPLOOKUP_LOCKMASK)) == false)) inp = NULL; - } - } + } else + smr_exit(pcbinfo->ipi_smr); + return (inp); } Index: 
sys/netinet6/ip6_gre.c =================================================================== --- sys/netinet6/ip6_gre.c +++ sys/netinet6/ip6_gre.c @@ -216,30 +216,15 @@ in6_gre_udp_input(struct mbuf *m, int off, struct inpcb *inp, const struct sockaddr *sa, void *ctx) { - struct epoch_tracker et; struct gre_socket *gs; struct gre_softc *sc; struct sockaddr_in6 dst; - NET_EPOCH_ENTER(et); - /* - * udp_append() holds reference to inp, it is safe to check - * inp_flags2 without INP_RLOCK(). - * If socket was closed before we have entered NET_EPOCH section, - * INP_FREED flag should be set. Otherwise it should be safe to - * make access to ctx data, because gre_so will be freed by - * gre_sofree() via NET_EPOCH_CALL(). - */ - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - NET_EPOCH_EXIT(et); - m_freem(m); - return; - } + NET_EPOCH_ASSERT(); gs = (struct gre_socket *)ctx; dst = *(const struct sockaddr_in6 *)sa; if (sa6_embedscope(&dst, 0)) { - NET_EPOCH_EXIT(et); m_freem(m); return; } @@ -249,11 +234,9 @@ } if (sc != NULL && (GRE2IFP(sc)->if_flags & IFF_UP) != 0){ gre_input(m, off + sizeof(struct udphdr), IPPROTO_UDP, sc); - NET_EPOCH_EXIT(et); return; } m_freem(m); - NET_EPOCH_EXIT(et); } static int Index: sys/netinet6/raw_ip6.c =================================================================== --- sys/netinet6/raw_ip6.c +++ sys/netinet6/raw_ip6.c @@ -119,9 +119,7 @@ * Raw interface to IP6 protocol. 
*/ -VNET_DECLARE(struct inpcbhead, ripcb); VNET_DECLARE(struct inpcbinfo, ripcbinfo); -#define V_ripcb VNET(ripcb) #define V_ripcbinfo VNET(ripcbinfo) extern u_long rip_sendspace; @@ -153,6 +151,33 @@ int (*ip6_mforward)(struct ip6_hdr *, struct ifnet *, struct mbuf *); int (*mrt6_ioctl)(u_long, caddr_t); +struct rip6_inp_match_ctx { + struct ip6_hdr *ip6; + int proto; +}; + +static bool +rip6_inp_match(const struct inpcb *inp, void *v) +{ + struct rip6_inp_match_ctx *c = v; + struct ip6_hdr *ip6 = c->ip6; + int proto = c->proto; + + /* XXX inp locking */ + if ((inp->inp_vflag & INP_IPV6) == 0) + return (false); + if (inp->inp_ip_p && inp->inp_ip_p != proto) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) + return (false); + + return (true); +} + /* * Setup generic address and protocol structures for raw_input routine, then * pass them along with mbuf chain. 
@@ -161,12 +186,15 @@ rip6_input(struct mbuf **mp, int *offp, int proto) { struct ifnet *ifp; - struct mbuf *m = *mp; + struct mbuf *n, *m = *mp; struct ip6_hdr *ip6 = mtod(m, struct ip6_hdr *); struct inpcb *inp; - struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; + struct rip6_inp_match_ctx ctx = { .ip6 = ip6, .proto = proto }; + struct inpcb_iterator inpi = INP_ITERATOR(&V_ripcbinfo, + INPLOOKUP_RLOCKPCB, rip6_inp_match, &ctx); + int delivered = 0; NET_EPOCH_ASSERT(); @@ -176,70 +204,27 @@ ifp = m->m_pkthdr.rcvif; - CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { - /* XXX inp locking */ - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (inp->inp_ip_p && - inp->inp_ip_p != proto) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ip6->ip6_dst)) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && - !IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ip6->ip6_src)) - continue; - if (last != NULL) { - struct mbuf *n = m_copym(m, 0, M_COPYALL, M_NOWAIT); - + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); #if defined(IPSEC) || defined(IPSEC_SUPPORT) - /* - * Check AH/ESP integrity. - */ - if (IPSEC_ENABLED(ipv6)) { - if (n != NULL && - IPSEC_CHECK_POLICY(ipv6, n, last) != 0) { - m_freem(n); - /* Do not inject data into pcb. */ - n = NULL; - } - } -#endif /* IPSEC */ - if (n) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, n, &opts); - /* strip intermediate headers */ - m_adj(n, *offp); - if (sbappendaddr(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, - n, opts) == 0) { - soroverflow(last->inp_socket); - m_freem(n); - if (opts) - m_freem(opts); - RIP6STAT_INC(rip6s_fullsock); - } else - sorwakeup(last->inp_socket); - opts = NULL; - } - INP_RUNLOCK(last); - last = NULL; + /* + * Check AH/ESP integrity. 
+ */ + if (IPSEC_ENABLED(ipv6) && + IPSEC_CHECK_POLICY(ipv6, m, inp) != 0) { + /* Do not inject data into pcb. */ + continue; } - INP_RLOCK(inp); - if (__predict_false(inp->inp_flags2 & INP_FREED)) - goto skip_2; - if (jailed_without_vnet(inp->inp_cred)) { +#endif /* IPSEC */ + if (jailed_without_vnet(inp->inp_cred) && + !IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && + prison_check_ip6(inp->inp_cred, &ip6->ip6_dst) != 0) /* * Allow raw socket in jail to receive multicast; * assume process had PRIV_NETINET_RAW at attach, * and fall through into normal filter path if so. */ - if (!IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst) && - prison_check_ip6(inp->inp_cred, - &ip6->ip6_dst) != 0) - goto skip_2; - } + continue; if (inp->in6p_cksum != -1) { RIP6STAT_INC(rip6s_isum); if (m->m_pkthdr.len - (*offp + inp->in6p_cksum) < 2 || @@ -251,8 +236,9 @@ * ICMP6 message. Set proto to IPPROTO_NONE * to achieve that. */ + INP_RUNLOCK(inp); proto = IPPROTO_NONE; - goto skip_2; + break; } } /* @@ -298,43 +284,30 @@ } if (blocked != MCAST_PASS) { IP6STAT_INC(ip6s_notmember); - goto skip_2; + continue; } } - last = inp; - continue; -skip_2: - INP_RUNLOCK(inp); - } -#if defined(IPSEC) || defined(IPSEC_SUPPORT) - /* - * Check AH/ESP integrity. - */ - if (IPSEC_ENABLED(ipv6) && last != NULL && - IPSEC_CHECK_POLICY(ipv6, m, last) != 0) { - m_freem(m); - IP6STAT_DEC(ip6s_delivered); - /* Do not inject data into pcb. */ - INP_RUNLOCK(last); - } else -#endif /* IPSEC */ - if (last != NULL) { - if (last->inp_flags & INP_CONTROLOPTS || - last->inp_socket->so_options & SO_TIMESTAMP) - ip6_savecontrol(last, m, &opts); - /* Strip intermediate headers. 
*/ - m_adj(m, *offp); - if (sbappendaddr(&last->inp_socket->so_rcv, - (struct sockaddr *)&fromsa, m, opts) == 0) { - soroverflow(last->inp_socket); - m_freem(m); + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) == NULL) + continue; + if (inp->inp_flags & INP_CONTROLOPTS || + inp->inp_socket->so_options & SO_TIMESTAMP) + ip6_savecontrol(inp, n, &opts); + /* strip intermediate headers */ + m_adj(n, *offp); + if (sbappendaddr(&inp->inp_socket->so_rcv, + (struct sockaddr *)&fromsa, n, opts) == 0) { + soroverflow(inp->inp_socket); + m_freem(n); if (opts) m_freem(opts); RIP6STAT_INC(rip6s_fullsock); - } else - sorwakeup(last->inp_socket); - INP_RUNLOCK(last); - } else { + } else { + sorwakeup(inp->inp_socket); + delivered++; + } + opts = NULL; + } + if (delivered == 0) { RIP6STAT_INC(rip6s_nosock); if (m->m_flags & M_MCAST) RIP6STAT_INC(rip6s_nosockmcast); @@ -345,7 +318,8 @@ ICMP6_PARAMPROB_NEXTHEADER, ip6_get_prevhdr(m, *offp)); IP6STAT_DEC(ip6s_delivered); - } + } else + m_freem(m); return (IPPROTO_DONE); } @@ -678,15 +652,12 @@ filter = malloc(sizeof(struct icmp6_filter), M_PCB, M_NOWAIT); if (filter == NULL) return (ENOMEM); - INP_INFO_WLOCK(&V_ripcbinfo); error = in_pcballoc(so, &V_ripcbinfo); if (error) { - INP_INFO_WUNLOCK(&V_ripcbinfo); free(filter, M_PCB); return (error); } inp = (struct inpcb *)so->so_pcb; - INP_INFO_WUNLOCK(&V_ripcbinfo); inp->inp_vflag |= INP_IPV6; inp->inp_ip_p = (long)proto; inp->in6p_hops = -1; /* use kernel default */ @@ -708,12 +679,10 @@ if (so == V_ip6_mrouter && ip6_mrouter_done) ip6_mrouter_done(); /* xxx: RSVP */ - INP_INFO_WLOCK(&V_ripcbinfo); INP_WLOCK(inp); free(inp->in6p_icmp6filt, M_PCB); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(&V_ripcbinfo); } /* XXXRW: This can't ever be called. 
*/ Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -207,6 +207,137 @@ return (0); } +struct udp6_multi_match_ctx { + struct ip6_hdr *ip6; + struct udphdr *uh; +}; + +static bool +udp6_multi_match(const struct inpcb *inp, void *v) +{ + struct udp6_multi_match_ctx *ctx = v; + + if ((inp->inp_vflag & INP_IPV6) == 0) + return(false); + if (inp->inp_lport != ctx->uh->uh_dport) + return(false); + if (inp->inp_fport != 0 && inp->inp_fport != ctx->uh->uh_sport) + return(false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && + !IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, &ctx->ip6->ip6_dst)) + return (false); + if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr) && + (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, &ctx->ip6->ip6_src) || + inp->inp_fport != ctx->uh->uh_sport)) + return (false); + + return (true); +} + +static int +udp6_multi_input(struct mbuf *m, int off, int proto, + struct sockaddr_in6 *fromsa) +{ + struct udp6_multi_match_ctx ctx; + struct inpcb_iterator inpi = INP_ITERATOR(udp_get_inpcbinfo(proto), + INPLOOKUP_RLOCKPCB, udp6_multi_match, &ctx); + struct inpcb *inp; + struct ip6_moptions *imo; + struct mbuf *n; + int appends = 0; + + /* + * In the event that laddr should be set to the link-local + * address (this happens in RIPng), the multicast address + * specified in the received packet will not match laddr. To + * handle this situation, matching is relaxed if the + * receiving interface is the same as one specified in the + * socket and if the destination multicast address matches + * one of the multicast groups specified in the socket. + */ + + /* + * KAME note: traditionally we dropped udpiphdr from mbuf + * here. We need udphdr for IPsec processing so we do that + * later. 
+ */ + ctx.ip6 = mtod(m, struct ip6_hdr *); + ctx.uh = (struct udphdr *)((char *)ctx.ip6 + off); + while ((inp = inp_next(&inpi)) != NULL) { + INP_RLOCK_ASSERT(inp); + /* + * XXXRW: Because we weren't holding either the inpcb + * or the hash lock when we checked for a match + * before, we should probably recheck now that the + * inpcb lock is (supposed to be) held. + */ + /* + * Handle socket delivery policy for any-source + * and source-specific multicast. [RFC3678] + */ + if ((imo = inp->in6p_moptions) != NULL) { + struct sockaddr_in6 mcaddr; + int blocked; + + bzero(&mcaddr, sizeof(struct sockaddr_in6)); + mcaddr.sin6_len = sizeof(struct sockaddr_in6); + mcaddr.sin6_family = AF_INET6; + mcaddr.sin6_addr = ctx.ip6->ip6_dst; + + blocked = im6o_mc_filter(imo, m->m_pkthdr.rcvif, + (struct sockaddr *)&mcaddr, + (struct sockaddr *)&fromsa[0]); + if (blocked != MCAST_PASS) { + if (blocked == MCAST_NOTGMEMBER) + IP6STAT_INC(ip6s_notmember); + if (blocked == MCAST_NOTSMEMBER || + blocked == MCAST_MUTED) + UDPSTAT_INC(udps_filtermcast); + continue; + } + } + if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != NULL) { + if (proto == IPPROTO_UDPLITE) + UDPLITE_PROBE(receive, NULL, inp, ctx.ip6, + inp, ctx.uh); + else + UDP_PROBE(receive, NULL, inp, ctx.ip6, inp, + ctx.uh); + if (udp6_append(inp, n, off, fromsa)) { + INP_RUNLOCK(inp); + break; + } else + appends++; + } + /* + * Don't look for additional matches if this one does + * not have either the SO_REUSEPORT or SO_REUSEADDR + * socket options set. This heuristic avoids + * searching through all pcbs in the common case of a + * non-shared port. It assumes that an application + * will never clear these options after setting them. + */ + if ((inp->inp_socket->so_options & + (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) { + INP_RUNLOCK(inp); + break; + } + } + m_freem(m); + + if (appends == 0) { + /* + * No matching pcb found; discard datagram. 
(No need + * to send an ICMP Port Unreachable for a broadcast + * or multicast datagram.) + */ + UDPSTAT_INC(udps_noport); + UDPSTAT_INC(udps_noportmcast); + } + + return (IPPROTO_DONE); +} + int udp6_input(struct mbuf **mp, int *offp, int proto) { @@ -311,144 +442,11 @@ fromsa[1].sin6_port = uh->uh_dport; pcbinfo = udp_get_inpcbinfo(nxt); - if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { - struct inpcb *last; - struct inpcbhead *pcblist; - struct ip6_moptions *imo; - - /* - * In the event that laddr should be set to the link-local - * address (this happens in RIPng), the multicast address - * specified in the received packet will not match laddr. To - * handle this situation, matching is relaxed if the - * receiving interface is the same as one specified in the - * socket and if the destination multicast address matches - * one of the multicast groups specified in the socket. - */ - - /* - * KAME note: traditionally we dropped udpiphdr from mbuf - * here. We need udphdr for IPsec processing so we do that - * later. - */ - pcblist = udp_get_pcblist(nxt); - last = NULL; - CK_LIST_FOREACH(inp, pcblist, inp_list) { - if ((inp->inp_vflag & INP_IPV6) == 0) - continue; - if (inp->inp_lport != uh->uh_dport) - continue; - if (inp->inp_fport != 0 && - inp->inp_fport != uh->uh_sport) - continue; - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr)) { - if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_laddr, - &ip6->ip6_dst)) - continue; - } - if (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_faddr)) { - if (!IN6_ARE_ADDR_EQUAL(&inp->in6p_faddr, - &ip6->ip6_src) || - inp->inp_fport != uh->uh_sport) - continue; - } - - INP_RLOCK(inp); - - if (__predict_false(inp->inp_flags2 & INP_FREED)) { - INP_RUNLOCK(inp); - continue; - } - - /* - * XXXRW: Because we weren't holding either the inpcb - * or the hash lock when we checked for a match - * before, we should probably recheck now that the - * inpcb lock is (supposed to be) held. 
- */ - - /* - * Handle socket delivery policy for any-source - * and source-specific multicast. [RFC3678] - */ - imo = inp->in6p_moptions; - if (imo != NULL) { - struct sockaddr_in6 mcaddr; - int blocked; - - bzero(&mcaddr, sizeof(struct sockaddr_in6)); - mcaddr.sin6_len = sizeof(struct sockaddr_in6); - mcaddr.sin6_family = AF_INET6; - mcaddr.sin6_addr = ip6->ip6_dst; - - blocked = im6o_mc_filter(imo, ifp, - (struct sockaddr *)&mcaddr, - (struct sockaddr *)&fromsa[0]); - if (blocked != MCAST_PASS) { - if (blocked == MCAST_NOTGMEMBER) - IP6STAT_INC(ip6s_notmember); - if (blocked == MCAST_NOTSMEMBER || - blocked == MCAST_MUTED) - UDPSTAT_INC(udps_filtermcast); - INP_RUNLOCK(inp); - continue; - } - } - - if (last != NULL) { - struct mbuf *n; - - if ((n = m_copym(m, 0, M_COPYALL, M_NOWAIT)) != - NULL) { - if (nxt == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, - last, ip6, last, uh); - else - UDP_PROBE(receive, NULL, last, - ip6, last, uh); - if (udp6_append(last, n, off, - fromsa)) { - INP_RUNLOCK(inp); - goto badunlocked; - } - } - /* Release PCB lock taken on previous pass. */ - INP_RUNLOCK(last); - } - last = inp; - /* - * Don't look for additional matches if this one does - * not have either the SO_REUSEPORT or SO_REUSEADDR - * socket options set. This heuristic avoids - * searching through all pcbs in the common case of a - * non-shared port. It assumes that an application - * will never clear these options after setting them. - */ - if ((last->inp_socket->so_options & - (SO_REUSEPORT|SO_REUSEPORT_LB|SO_REUSEADDR)) == 0) - break; - } - - if (last == NULL) { - /* - * No matching pcb found; discard datagram. (No need - * to send an ICMP Port Unreachable for a broadcast - * or multicast datgram.) 
- */ - UDPSTAT_INC(udps_noport); - UDPSTAT_INC(udps_noportmcast); - goto badunlocked; - } - - if (nxt == IPPROTO_UDPLITE) - UDPLITE_PROBE(receive, NULL, last, ip6, last, uh); - else - UDP_PROBE(receive, NULL, last, ip6, last, uh); - if (udp6_append(last, m, off, fromsa) == 0) - INP_RUNLOCK(last); + if (IN6_IS_ADDR_MULTICAST(&ip6->ip6_dst)) { *mp = NULL; - return (IPPROTO_DONE); + return (udp6_multi_input(m, off, proto, fromsa)); } + /* * Locate pcb for datagram. */ @@ -1043,12 +1041,9 @@ if (error) return (error); } - INP_INFO_WLOCK(pcbinfo); error = in_pcballoc(so, pcbinfo); - if (error) { - INP_INFO_WUNLOCK(pcbinfo); + if (error) return (error); - } inp = (struct inpcb *)so->so_pcb; inp->inp_vflag |= INP_IPV6; if ((inp->inp_flags & IN6P_IPV6_V6ONLY) == 0) @@ -1067,11 +1062,9 @@ if (error) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); return (error); } INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(pcbinfo); return (0); } @@ -1275,13 +1268,11 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("udp6_detach: inp == NULL")); - INP_INFO_WLOCK(pcbinfo); INP_WLOCK(inp); up = intoudpcb(inp); KASSERT(up != NULL, ("%s: up == NULL", __func__)); in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_WUNLOCK(pcbinfo); udp_discardcb(up); }