Index: sys/dev/cxgbe/cxgbei/cxgbei.c =================================================================== --- sys/dev/cxgbe/cxgbei/cxgbei.c +++ sys/dev/cxgbe/cxgbei/cxgbei.c @@ -412,12 +412,12 @@ SOCKBUF_UNLOCK(sb); INP_WUNLOCK(inp); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); icl_cxgbei_conn_pdu_free(NULL, ip); #ifdef INVARIANTS Index: sys/dev/cxgbe/tom/t4_connect.c =================================================================== --- sys/dev/cxgbe/tom/t4_connect.c +++ sys/dev/cxgbe/tom/t4_connect.c @@ -124,12 +124,12 @@ CURVNET_SET(toep->vnet); if (status != EAGAIN) - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); toe_connect_failed(tod, inp, status); final_cpl_received(toep); /* unlocks inp */ if (status != EAGAIN) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } Index: sys/dev/cxgbe/tom/t4_cpl_io.c =================================================================== --- sys/dev/cxgbe/tom/t4_cpl_io.c +++ sys/dev/cxgbe/tom/t4_cpl_io.c @@ -1214,7 +1214,7 @@ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1260,7 +1260,7 @@ case TCPS_FIN_WAIT_2: tcp_twstart(tp); INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); @@ -1273,7 +1273,7 @@ } done: INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } @@ -1303,7 +1303,7 @@ KASSERT(toep->tid == tid, ("%s: toep tid mismatch", __func__)); CURVNET_SET(toep->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1321,7 +1321,7 @@ tcp_twstart(tp); release: INP_UNLOCK_ASSERT(inp); /* safe, we have a ref on the inp */ - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); INP_WLOCK(inp); @@ -1346,7 +1346,7 @@ } done: INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } @@ -1423,7 +1423,7 @@ inp = toep->inp; CURVNET_SET(toep->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for tcp_close */ + NET_EPOCH_ENTER(et); /* for tcp_close */ INP_WLOCK(inp); tp = intotcpcb(inp); @@ -1457,7 +1457,7 @@ final_cpl_received(toep); done: - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); send_abort_rpl(sc, ofld_txq, tid, CPL_ABORT_NO_RST); return (0); @@ -1572,12 +1572,12 @@ INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); Index: sys/dev/cxgbe/tom/t4_listen.c =================================================================== --- sys/dev/cxgbe/tom/t4_listen.c +++ sys/dev/cxgbe/tom/t4_listen.c @@ -949,7 +949,7 @@ #endif struct toepcb *toep = synqe->toep; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* prevents bad race with accept() */ + NET_EPOCH_ASSERT(); /* prevents bad race with accept() */ INP_WLOCK_ASSERT(inp); KASSERT(synqe->flags & TPF_SYNQE, ("%s: %p not a synq_entry?", __func__, arg)); @@ -1242,12 +1242,12 @@ REJECT_PASS_ACCEPT_REQ(true); /* Don't offload if the 4-tuple is already in use */ - INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for 4-tuple check */ + NET_EPOCH_ENTER(et); /* for 4-tuple check */ if (toe_4tuple_check(&inc, &th, ifp) != 0) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); REJECT_PASS_ACCEPT_REQ(false); } - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); inp = lctx->inp; /* listening socket, not owned by TOE */ INP_WLOCK(inp); @@ -1396,7 +1396,7 @@ ("%s: tid %u (ctx %p) not a synqe", __func__, tid, synqe)); CURVNET_SET(lctx->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); /* for syncache_expand */ + NET_EPOCH_ENTER(et); /* for syncache_expand */ INP_WLOCK(inp); CTR6(KTR_CXGBE, @@ -1412,7 +1412,7 @@ reset: send_reset_synqe(TOEDEV(ifp), synqe); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); } @@ -1471,7 +1471,7 @@ inp = release_synqe(sc, synqe); if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); Index: sys/dev/cxgbe/tom/t4_tls.c =================================================================== --- sys/dev/cxgbe/tom/t4_tls.c +++ sys/dev/cxgbe/tom/t4_tls.c @@ -2125,12 +2125,12 @@ INP_WUNLOCK(inp); CURVNET_SET(toep->vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); tp = tcp_drop(tp, ECONNRESET); if (tp) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return (0); Index: sys/kern/uipc_ktls.c =================================================================== --- sys/kern/uipc_ktls.c +++ sys/kern/uipc_ktls.c @@ -1137,7 +1137,7 @@ * the send tag is fixed or just rely on timers? */ } else { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (!in_pcbrele_wlocked(inp)) { if (!(inp->inp_flags & INP_TIMEWAIT) && @@ -1150,7 +1150,7 @@ } else INP_WUNLOCK(inp); } - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); counter_u64_add(ktls_ifnet_reset_failed, 1); Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -586,6 +586,7 @@ #define INP_TRY_WLOCK(inp) rw_try_wlock(&(inp)->inp_lock) #define INP_RUNLOCK(inp) rw_runlock(&(inp)->inp_lock) #define INP_WUNLOCK(inp) rw_wunlock(&(inp)->inp_lock) +#define INP_UNLOCK(inp) rw_unlock(&(inp)->inp_lock) #define INP_TRY_UPGRADE(inp) rw_try_upgrade(&(inp)->inp_lock) #define INP_DOWNGRADE(inp) rw_downgrade(&(inp)->inp_lock) #define INP_WLOCKED(inp) rw_wowned(&(inp)->inp_lock) @@ -628,19 +629,14 @@ #define INP_INFO_LOCK_INIT(ipi, d) \ mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) #define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK_ET(ipi, et) NET_EPOCH_ENTER((et)) #define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) #define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) #define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT((et)) -#define INP_INFO_RUNLOCK_TP(ipi, tp) NET_EPOCH_EXIT(*(tp)->t_inpcb->inp_et) #define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) #define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_lock)) -#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt)) #define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) #define INP_INFO_WUNLOCK_ASSERT(ipi) \ mtx_assert(&(ipi)->ipi_lock, MA_NOTOWNED) -#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch(net_epoch_preempt) && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) @@ -663,11 +659,7 @@ #define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) #define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) struct epoch_tracker inp_hash_et; epoch_enter_preempt(net_epoch_preempt, &inp_hash_et) -#define INP_HASH_RLOCK_ET(ipi, et) epoch_enter_preempt(net_epoch_preempt, &(et)) #define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT(inp_hash_et) -#define INP_HASH_RUNLOCK_ET(ipi, et) NET_EPOCH_EXIT((et)) #define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) #define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch(net_epoch_preempt) || mtx_owned(&(ipi)->ipi_hash_lock)) #define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -515,7 +515,7 @@ #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { - INP_INFO_RLOCK_ASSERT(pcbinfo); + NET_EPOCH_ASSERT(); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } @@ -2252,12 +2252,10 @@ struct inpcb *inp, *tmpinp; u_short fport = fport_arg, lport = lport_arg; -#ifdef INVARIANTS KASSERT((lookupflags & ~(INPLOOKUP_WILDCARD)) == 0, ("%s: invalid lookup flags %d", __func__, lookupflags)); - if (!mtx_owned(&pcbinfo->ipi_hash_lock)) - MPASS(in_epoch_verbose(net_epoch_preempt, 1)); -#endif + INP_HASH_LOCK_ASSERT(pcbinfo); + /* * First look for an exact match. */ @@ -2384,7 +2382,6 @@ { struct inpcb *inp; - INP_HASH_RLOCK(pcbinfo); inp = in_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { @@ -2411,7 +2408,7 @@ } #endif } - INP_HASH_RUNLOCK(pcbinfo); + return (inp); } @@ -2657,7 +2654,7 @@ #ifdef INVARIANTS if (pcbinfo == &V_tcbinfo) { - INP_INFO_RLOCK_ASSERT(pcbinfo); + NET_EPOCH_ASSERT(); } else { INP_INFO_WLOCK_ASSERT(pcbinfo); } Index: sys/netinet/ip_divert.c =================================================================== --- sys/netinet/ip_divert.c +++ sys/netinet/ip_divert.c @@ -192,7 +192,8 @@ u_int16_t nport; struct sockaddr_in divsrc; struct m_tag *mtag; - struct epoch_tracker et; + + NET_EPOCH_ASSERT(); mtag = m_tag_locate(m, MTAG_IPFW_RULE, 0, NULL); if (mtag == NULL) { @@ -231,7 +232,6 @@ /* Sanity check */ M_ASSERTPKTHDR(m); - NET_EPOCH_ASSERT(); /* Find IP address for receive interface */ ifp = m->m_pkthdr.rcvif; @@ -272,7 +272,6 @@ /* Put packet on socket queue, if any */ sa = NULL; nport = htons((u_int16_t)(((struct ipfw_rule_ref *)(mtag+1))->info)); - INP_INFO_RLOCK_ET(&V_divcbinfo, et); CK_LIST_FOREACH(inp, &V_divcb, inp_list) { /* XXX why does only one socket match? */ if (inp->inp_lport == nport) { @@ -290,7 +289,6 @@ break; } } - INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); if (sa == NULL) { m_freem(m); KMOD_IPSTAT_INC(ips_noproto); @@ -631,71 +629,41 @@ static int div_pcblist(SYSCTL_HANDLER_ARGS) { - int error, i, n; - struct inpcb *inp, **inp_list; - inp_gen_t gencnt; struct xinpgen xig; struct epoch_tracker et; + struct inpcb *inp; + int error; + + if (req->newptr != 0) + return EPERM; - /* - * The process of preparing the TCB list is too time-consuming and - * resource-intensive to repeat twice on every request. - */ if (req->oldptr == 0) { + int n; + n = V_divcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return 0; } - if (req->newptr != 0) - return EPERM; - - /* - * OK, now we're committed to doing something. - */ - INP_INFO_WLOCK(&V_divcbinfo); - gencnt = V_divcbinfo.ipi_gencnt; - n = V_divcbinfo.ipi_count; - INP_INFO_WUNLOCK(&V_divcbinfo); - - error = sysctl_wire_old_buffer(req, - 2 * sizeof(xig) + n*sizeof(struct xinpcb)); - if (error != 0) + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; - xig.xig_count = n; - xig.xig_gen = gencnt; + xig.xig_count = V_divcbinfo.ipi_count; + xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return error; - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return ENOMEM; - - INP_INFO_RLOCK_ET(&V_divcbinfo, et); - for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead), i = 0; inp && i < n; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_WLOCK(inp); - if (inp->inp_gencnt <= gencnt && - cr_canseeinpcb(req->td->td_ucred, inp) == 0) { - in_pcbref(inp); - inp_list[i++] = inp; - } - INP_WUNLOCK(inp); - } - INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; + NET_EPOCH_ENTER(et); + for (inp = CK_LIST_FIRST(V_divcbinfo.ipi_listhead); + inp != NULL; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= xig.xig_gen) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); @@ -704,17 +672,9 @@ } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_divcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_divcbinfo); + NET_EPOCH_EXIT(et); if (!error) { - struct epoch_tracker et; /* * Give the user an updated idea of our state. * If the generation differs from what we told @@ -722,15 +682,13 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_INFO_RLOCK_ET(&V_divcbinfo, et); xig.xig_gen = V_divcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_divcbinfo.ipi_count; - INP_INFO_RUNLOCK_ET(&V_divcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); - return error; + + return (error); } #ifdef SYSCTL_NODE Index: sys/netinet/raw_ip.c =================================================================== --- sys/netinet/raw_ip.c +++ sys/netinet/raw_ip.c @@ -284,9 +284,10 @@ struct ip *ip = mtod(m, struct ip *); struct inpcb *inp, *last; struct sockaddr_in ripsrc; - struct epoch_tracker et; int hash; + NET_EPOCH_ASSERT(); + *mp = NULL; bzero(&ripsrc, sizeof(ripsrc)); @@ -299,7 +300,6 @@ hash = INP_PCBHASH_RAW(proto, ip->ip_src.s_addr, ip->ip_dst.s_addr, V_ripcbinfo.ipi_hashmask); - INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcbinfo.ipi_hashbase[hash], inp_hash) { if (inp->inp_ip_p != proto) continue; @@ -422,7 +422,6 @@ skip_2: INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (rip_append(last, ip, m, &ripsrc) != 0) IPSTAT_INC(ips_delivered); @@ -1068,97 +1067,67 @@ static int rip_pcblist(SYSCTL_HANDLER_ARGS) { - int error, i, n; - struct inpcb *inp, **inp_list; - inp_gen_t gencnt; struct xinpgen xig; struct epoch_tracker et; + struct inpcb *inp; + int error; + + if (req->newptr != 0) + return (EPERM); - /* - * The process of preparing the TCB list is too time-consuming and - * resource-intensive to repeat twice on every request. - */ if (req->oldptr == 0) { + int n; + n = V_ripcbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } - if (req->newptr != 0) - return (EPERM); - - /* - * OK, now we're committed to doing something. - */ - INP_INFO_WLOCK(&V_ripcbinfo); - gencnt = V_ripcbinfo.ipi_gencnt; - n = V_ripcbinfo.ipi_count; - INP_INFO_WUNLOCK(&V_ripcbinfo); + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) + return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; - xig.xig_count = n; - xig.xig_gen = gencnt; + xig.xig_count = V_ripcbinfo.ipi_count; + xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - - INP_INFO_RLOCK_ET(&V_ripcbinfo, et); - for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead), i = 0; inp && i < n; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_WLOCK(inp); - if (inp->inp_gencnt <= gencnt && - cr_canseeinpcb(req->td->td_ucred, inp) == 0) { - in_pcbref(inp); - inp_list[i++] = inp; - } - INP_WUNLOCK(inp); - } - INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; + NET_EPOCH_ENTER(et); + for (inp = CK_LIST_FIRST(V_ripcbinfo.ipi_listhead); + inp != NULL; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= xig.xig_gen && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); + if (error) + break; } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_ripcbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_ripcbinfo); + NET_EPOCH_EXIT(et); if (!error) { - struct epoch_tracker et; /* * Give the user an updated idea of our state. If the * generation differs from what we told her before, she knows * that something happened while we were processing this * request, and it might be necessary to retry. */ - INP_INFO_RLOCK_ET(&V_ripcbinfo, et); xig.xig_gen = V_ripcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_ripcbinfo.ipi_count; - INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); + return (error); } Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -1245,12 +1245,10 @@ int16_t set_cpu; uint32_t did_prefetch = 0; int dropped; - struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); -#ifndef VIMAGE - INP_INFO_RLOCK_ET(&V_tcbinfo, et); -#endif + NET_EPOCH_ASSERT(); + while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); @@ -1266,7 +1264,6 @@ INP_WLOCK(inp); #ifdef VIMAGE CURVNET_SET(inp->inp_vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); #endif if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { @@ -1276,7 +1273,6 @@ INP_WUNLOCK(inp); } #ifdef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); #endif mtx_lock(&hpts->p_mtx); @@ -1296,7 +1292,6 @@ if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); #ifdef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); #endif mtx_lock(&hpts->p_mtx); @@ -1349,22 +1344,16 @@ INP_WUNLOCK(inp); INP_UNLOCK_ASSERT(inp); #ifdef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); #endif mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; } -#ifndef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); -#endif } static void tcp_hptsi(struct tcp_hpts_entry *hpts) { - struct epoch_tracker et; struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; @@ -1378,6 +1367,8 @@ int16_t set_cpu; HPTS_MTX_ASSERT(hpts); + NET_EPOCH_ASSERT(); + /* record previous info for any logging */ hpts->saved_lasttick = hpts->p_lasttick; hpts->saved_curtick = hpts->p_curtick; @@ -1469,9 +1460,6 @@ goto no_one; } HPTS_MTX_ASSERT(hpts); -#ifndef VIMAGE - INP_INFO_RLOCK_ET(&V_tcbinfo, et); -#endif for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there @@ -1586,7 +1574,6 @@ } #ifdef VIMAGE CURVNET_SET(inp->inp_vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); #endif /* Lets do any logging that we might want to */ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { @@ -1658,7 +1645,6 @@ INP_WUNLOCK(inp); skip_pacing: #ifdef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); #endif INP_UNLOCK_ASSERT(inp); @@ -1678,9 +1664,6 @@ hpts->p_runningtick = 0; } } -#ifndef VIMAGE - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); -#endif no_one: HPTS_MTX_ASSERT(hpts); hpts->p_delayed_by = 0; @@ -1820,6 +1803,7 @@ tcp_hpts_thread(void *ctx) { struct tcp_hpts_entry *hpts; + struct epoch_tracker et; struct timeval tv; sbintime_t sb; @@ -1839,7 +1823,9 @@ } hpts->p_hpts_wake_scheduled = 0; hpts->p_hpts_active = 1; + NET_EPOCH_ENTER(et); tcp_hptsi(hpts); + NET_EPOCH_EXIT(et); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -562,7 +562,6 @@ int rstreason = 0; /* For badport_bandlim accounting purposes */ uint8_t iptos; struct m_tag *fwd_tag = NULL; - struct epoch_tracker et; #ifdef INET6 struct ip6_hdr *ip6 = NULL; int isipv6; @@ -571,7 +570,6 @@ #endif /* INET6 */ struct tcpopt to; /* options in this segment */ char *s = NULL; /* address and port logging */ - int ti_locked; #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -582,6 +580,8 @@ short ostate = 0; #endif + NET_EPOCH_ASSERT(); + #ifdef INET6 isipv6 = (mtod(m, struct ip *)->ip_v == 6) ? 1 : 0; #endif @@ -747,19 +747,6 @@ */ drop_hdrlen = off0 + off; - /* - * Locate pcb for segment; if we're likely to add or remove a - * connection then first acquire pcbinfo lock. There are three cases - * where we might discover later we need a write lock despite the - * flags: ACKs moving a connection out of the syncache, ACKs for a - * connection in TIMEWAIT and SYNs not targeting a listening socket. - */ - if ((thflags & (TH_FIN | TH_RST)) != 0) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else - ti_locked = TI_UNLOCKED; - /* * Grab info from PACKET_TAG_IPFORWARD tag prepended to the chain. */ @@ -777,13 +764,6 @@ fwd_tag = m_tag_find(m, PACKET_TAG_IPFORWARD, NULL); findpcb: -#ifdef INVARIANTS - if (ti_locked == TI_RLOCKED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } else { - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - } -#endif #ifdef INET6 if (isipv6 && fwd_tag != NULL) { struct sockaddr_in6 *next_hop6; @@ -943,12 +923,6 @@ * XXXRW: It may be time to rethink timewait locking. */ if (inp->inp_flags & INP_TIMEWAIT) { - if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - if (thflags & TH_SYN) tcp_dooptions(&to, optp, optlen, TO_SYN); /* @@ -956,7 +930,6 @@ */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* @@ -978,27 +951,6 @@ } #endif - /* - * We've identified a valid inpcb, but it could be that we need an - * inpcbinfo write lock but don't hold it. In this case, attempt to - * acquire using the same strategy as the TIMEWAIT case above. If we - * relock, we have to jump back to 'relocked' as the connection might - * now be in TIMEWAIT. - */ -#ifdef INVARIANTS - if ((thflags & (TH_FIN | TH_RST)) != 0) - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); -#endif - if (!((tp->t_state == TCPS_ESTABLISHED && (thflags & TH_SYN) == 0) || - (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && - !IS_FASTOPEN(tp->t_flags)))) { - if (ti_locked == TI_UNLOCKED) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } - #ifdef MAC INP_WLOCK_ASSERT(inp); if (mac_inpcb_check_deliver(inp, m)) @@ -1053,7 +1005,6 @@ */ if ((thflags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* * Parse the TCP options here because * syncookies need access to the reflected @@ -1131,8 +1082,6 @@ TCP_PROBE5(receive, NULL, tp, m, tp, th); tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); } /* @@ -1335,10 +1284,6 @@ * Entry added to syncache and mbuf consumed. * Only the listen socket is unlocked by syncache_add(). */ - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_UNLOCKED; - } INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); return (IPPROTO_DONE); } else if (tp->t_state == TCPS_LISTEN) { @@ -1371,25 +1316,11 @@ * the inpcb, and unlocks pcbinfo. */ tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return (IPPROTO_DONE); dropwithreset: TCP_PROBE5(receive, NULL, tp, m, tp, th); - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_UNLOCKED; - } -#ifdef INVARIANTS - else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropwithreset " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - } -#endif - if (inp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); INP_WUNLOCK(inp); @@ -1402,18 +1333,6 @@ if (m != NULL) TCP_PROBE5(receive, NULL, tp, m, tp, th); - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_UNLOCKED; - } -#ifdef INVARIANTS - else { - KASSERT(ti_locked == TI_UNLOCKED, ("%s: dropunlock " - "ti_locked: %d", __func__, ti_locked)); - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - } -#endif - if (inp != NULL) INP_WUNLOCK(inp); @@ -1502,7 +1421,7 @@ struct mbuf *mfree; struct tcpopt to; int tfo_syn; - + #ifdef TCPDEBUG /* * The size of tcp_saveipgen must be the size of the max ip header, @@ -1517,16 +1436,8 @@ tp->sackhint.last_sack_ack = 0; sack_changed = 0; nsegs = max(1, m->m_pkthdr.lro_nsegs); - /* - * If this is either a state-changing packet or current state isn't - * established, we require a write lock on tcbinfo. Otherwise, we - * allow the tcbinfo to be in either alocked or unlocked, as the - * caller may have unnecessarily acquired a write lock due to a race. - */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tp->t_state != TCPS_ESTABLISHED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } + + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -2048,7 +1959,6 @@ tcp_state_change(tp, TCPS_SYN_RECEIVED); } - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* @@ -2121,7 +2031,6 @@ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); @@ -2164,8 +2073,6 @@ */ if ((thflags & TH_SYN) && tp->t_state != TCPS_SYN_SENT && tp->t_state != TCPS_SYN_RECEIVED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && SEQ_GEQ(th->th_seq, tp->last_ack_sent) && @@ -2289,8 +2196,6 @@ */ if ((so->so_state & SS_NOFDREF) && tp->t_state > TCPS_CLOSE_WAIT && tlen) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - if ((s = tcp_log_addrs(inc, th, NULL, NULL))) { log(LOG_DEBUG, "%s; %s: %s: Received %d bytes of data " "after socket was closed, " @@ -2876,7 +2781,6 @@ */ case TCPS_CLOSING: if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); m_freem(m); return; @@ -2891,7 +2795,6 @@ */ case TCPS_LAST_ACK: if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); goto drop; } @@ -3140,8 +3043,6 @@ * standard timers. */ case TCPS_FIN_WAIT_2: - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - tcp_twstart(tp); return; } Index: sys/netinet/tcp_lro.c =================================================================== --- sys/netinet/tcp_lro.c +++ sys/netinet/tcp_lro.c @@ -884,7 +884,7 @@ */ if ((tcplro_stacks_wanting_mbufq == 0) || (le->m_head->m_flags & M_VLANTAG)) goto skip_lookup; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); switch (le->eh_type) { #ifdef INET6 case ETHERTYPE_IPV6: @@ -903,7 +903,7 @@ break; #endif } - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); if (inp && ((inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || (inp->inp_flags2 & INP_FREED))) { /* We don't want this guy */ Index: sys/netinet/tcp_stacks/bbr.c =================================================================== --- sys/netinet/tcp_stacks/bbr.c +++ sys/netinet/tcp_stacks/bbr.c @@ -8618,7 +8618,6 @@ bbr->rc_timer_first = 1; bbr_timer_cancel(bbr, __LINE__, bbr->r_ctl.rc_rcvtime); - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); tcp_twstart(tp); return (1); @@ -9619,7 +9618,6 @@ struct tcpcb *tp, int32_t * tlen, struct tcphdr *th, struct socket *so) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); if (bbr->rc_allow_data_af_clo == 0) { close_now: tp = tcp_close(tp); @@ -9861,7 +9859,6 @@ return (ret_val); } if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); m_freem(m); return (1); @@ -9974,7 +9971,6 @@ return (ret_val); } if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); ctf_do_drop(m, tp); return (1); Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -5875,7 +5875,6 @@ case TCPS_FIN_WAIT_2: rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); return (1); } @@ -6353,7 +6352,6 @@ tp->t_flags |= (TF_ACKNOW | TF_NEEDSYN); tcp_state_change(tp, TCPS_SYN_RECEIVED); } - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); /* * Advance th->th_seq to correspond to first data byte. If data, @@ -6847,7 +6845,6 @@ { struct tcp_rack *rack; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->rc_allow_data_af_clo == 0) { close_now: @@ -7079,7 +7076,6 @@ return (ret_val); } if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tcp_twstart(tp); m_freem(m); return (1); @@ -7187,7 +7183,6 @@ return (ret_val); } if (ourfinisacked) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); ctf_do_drop(m, tp); return (1); @@ -7650,16 +7645,8 @@ kern_prefetch(rack, &prev_state); prev_state = 0; thflags = th->th_flags; - /* - * If this is either a state-changing packet or current state isn't - * established, we require a read lock on tcbinfo. Otherwise, we - * allow the tcbinfo to be in either locked or unlocked, as the - * caller may have unnecessarily acquired a lock due to a race. - */ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tp->t_state != TCPS_ESTABLISHED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } + + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); Index: sys/netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- sys/netinet/tcp_stacks/rack_bbr_common.c +++ sys/netinet/tcp_stacks/rack_bbr_common.c @@ -253,7 +253,6 @@ */ struct mbuf *m_save; struct ether_header *eh; - struct epoch_tracker et; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ @@ -268,14 +267,8 @@ uint16_t drop_hdrlen; uint8_t iptos, no_vn=0, bpf_req=0; - /* - * This is a bit deceptive, we get the - * "info epoch" which is really the network - * epoch. This covers us on both any INP - * type change but also if the ifp goes - * away it covers us as well. - */ - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ASSERT(); + if (m && m->m_pkthdr.rcvif) ifp = m->m_pkthdr.rcvif; else @@ -445,7 +438,6 @@ } if (no_vn == 0) CURVNET_RESTORE(); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return(retval); } skipped_pkt: @@ -453,7 +445,6 @@ } if (no_vn == 0) CURVNET_RESTORE(); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return(retval); } @@ -680,7 +671,6 @@ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); KASSERT(tp->t_state != TCPS_SYN_SENT, ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", __func__, th, tp)); @@ -732,7 +722,8 @@ void ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + NET_EPOCH_ASSERT(); TCPSTAT_INC(tcps_badsyn); if (V_tcp_insecure_syn && Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -1941,7 +1941,7 @@ tp = (struct tcpcb *)ptp; CURVNET_SET(tp->t_vnet); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); inp = tp->t_inpcb; KASSERT(inp != NULL, ("%s: tp %p tp->t_inpcb == NULL", __func__, tp)); @@ -1961,13 +1961,13 @@ tp->t_inpcb = NULL; uma_zfree(V_tcpcb_zone, tp); if (in_pcbrele_wlocked(inp)) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); return; } } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); CURVNET_RESTORE(); } @@ -2127,17 +2127,17 @@ static int tcp_pcblist(SYSCTL_HANDLER_ARGS) { - int error, i, m, n, pcb_count; - struct inpcb *inp, **inp_list; - inp_gen_t gencnt; - struct xinpgen xig; struct epoch_tracker et; + struct inpcb *inp; + struct xinpgen xig; + int error; + + if (req->newptr != NULL) + return (EPERM); - /* - * The process of preparing the TCB list is too time-consuming and - * resource-intensive to repeat twice on every request. - */ if (req->oldptr == NULL) { + int n; + n = V_tcbinfo.ipi_count + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); n += imax(n / 8, 10); @@ -2145,44 +2145,29 @@ return (0); } - if (req->newptr != NULL) - return (EPERM); - - /* - * OK, now we're committed to doing something. - */ - INP_LIST_RLOCK(&V_tcbinfo); - gencnt = V_tcbinfo.ipi_gencnt; - n = V_tcbinfo.ipi_count; - INP_LIST_RUNLOCK(&V_tcbinfo); - - m = counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); - - error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) - + (n + m) * sizeof(struct xtcpcb)); - if (error != 0) + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; - xig.xig_count = n + m; - xig.xig_gen = gencnt; + xig.xig_count = V_tcbinfo.ipi_count + + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); + xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); - error = syncache_pcblist(req, m, &pcb_count); + error = syncache_pcblist(req); if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - - INP_INFO_WLOCK(&V_tcbinfo); - for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead), i = 0; - inp != NULL && i < n; inp = CK_LIST_NEXT(inp, inp_list)) { - INP_WLOCK(inp); - if (inp->inp_gencnt <= gencnt) { + NET_EPOCH_ENTER(et); + for (inp = CK_LIST_FIRST(V_tcbinfo.ipi_listhead); + inp != NULL; + inp = CK_LIST_NEXT(inp, inp_list)) { + INP_RLOCK(inp); + if (inp->inp_gencnt <= xig.xig_gen) { /* * XXX: This use of cr_cansee(), introduced with * TCP state changes, is not quite right, but for @@ -2197,36 +2182,18 @@ } else error = cr_canseeinpcb(req->td->td_ucred, inp); if (error == 0) { - in_pcbref(inp); - inp_list[i++] = inp; - } - } - INP_WUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_tcbinfo); - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (inp->inp_gencnt <= gencnt) { - struct xtcpcb xt; + struct xtcpcb xt; - tcp_inptoxtp(inp, &xt); - INP_RUNLOCK(inp); - error = SYSCTL_OUT(req, &xt, sizeof xt); + tcp_inptoxtp(inp, &xt); + INP_RUNLOCK(inp); + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) + break; + } } else INP_RUNLOCK(inp); } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); if (!error) { /* @@ -2236,14 +2203,13 @@ * while we were processing this request, and it * might be necessary to retry. */ - INP_LIST_RLOCK(&V_tcbinfo); xig.xig_gen = V_tcbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; - xig.xig_count = V_tcbinfo.ipi_count + pcb_count; - INP_LIST_RUNLOCK(&V_tcbinfo); + xig.xig_count = V_tcbinfo.ipi_count + + counter_u64_fetch(V_tcps_states[TCPS_SYN_RECEIVED]); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); + return (error); } @@ -2257,6 +2223,7 @@ { struct xucred xuc; struct sockaddr_in addrs[2]; + struct epoch_tracker et; struct inpcb *inp; int error; @@ -2266,8 +2233,10 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); + NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_tcbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_RLOCKPCB, NULL); + NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; @@ -2292,6 +2261,7 @@ static int tcp6_getcred(SYSCTL_HANDLER_ARGS) { + struct epoch_tracker et; struct xucred xuc; struct sockaddr_in6 addrs[2]; struct inpcb *inp; @@ -2319,6 +2289,7 @@ return (EINVAL); } + NET_EPOCH_ENTER(et); #ifdef INET if (mapped == 1) inp = in_pcblookup(&V_tcbinfo, @@ -2332,6 +2303,7 @@ &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_RLOCKPCB, NULL); + NET_EPOCH_EXIT(et); if (inp != NULL) { if (inp->inp_socket == NULL) error = ENOENT; @@ -2365,7 +2337,6 @@ struct inpcb *(*notify)(struct inpcb *, int) = tcp_notify; struct icmp *icp; struct in_conninfo inc; - struct epoch_tracker et; tcp_seq icmp_tcp_seq; int mtu; @@ -2397,7 +2368,6 @@ icp = (struct icmp *)((caddr_t)ip - offsetof(struct icmp, icmp_ip)); th = (struct tcphdr *)((caddr_t)ip + (ip->ip_hl << 2)); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in_pcblookup(&V_tcbinfo, faddr, th->th_dport, ip->ip_src, th->th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { @@ -2462,7 +2432,6 @@ out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET */ @@ -2480,7 +2449,6 @@ struct ip6ctlparam *ip6cp = NULL; const struct sockaddr_in6 *sa6_src = NULL; struct in_conninfo inc; - struct epoch_tracker et; struct tcp_ports { uint16_t th_sport; uint16_t th_dport; @@ -2542,7 +2510,6 @@ } bzero(&t_ports, sizeof(struct tcp_ports)); m_copydata(m, off, sizeof(struct tcp_ports), (caddr_t)&t_ports); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); inp = in6_pcblookup(&V_tcbinfo, &ip6->ip6_dst, t_ports.th_dport, &ip6->ip6_src, t_ports.th_sport, INPLOOKUP_WLOCKPCB, NULL); if (inp != NULL && PRC_IS_REDIRECT(cmd)) { @@ -2614,7 +2581,6 @@ out: if (inp != NULL) INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); } #endif /* INET6 */ @@ -2770,7 +2736,7 @@ { struct tcpcb *tp; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) || @@ -3042,7 +3008,7 @@ default: return (EINVAL); } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -3081,7 +3047,7 @@ INP_WUNLOCK(inp); } else error = ESRCH; - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); return (error); } @@ -3157,7 +3123,7 @@ default: return (EINVAL); } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); switch (addrs[0].ss_family) { #ifdef INET6 case AF_INET6: @@ -3173,7 +3139,7 @@ break; #endif } - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); if (inp != NULL) { if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) != 0 || inp->inp_socket == NULL) { Index: sys/netinet/tcp_syncache.h =================================================================== --- sys/netinet/tcp_syncache.h +++ sys/netinet/tcp_syncache.h @@ -48,7 +48,7 @@ void *, void *); void syncache_chkrst(struct in_conninfo *, struct tcphdr *, struct mbuf *); void syncache_badack(struct in_conninfo *); -int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); +int syncache_pcblist(struct sysctl_req *); struct syncache { TAILQ_ENTRY(syncache) sc_hash; Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -771,7 +771,7 @@ int error; char *s; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); /* * Ok, create the full blown connection, and set things up @@ -1091,11 +1091,7 @@ char *s; bool locked; - /* - * Global TCP locks are held because we manipulate the PCB lists - * and create a new socket. - */ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); KASSERT((th->th_flags & (TH_RST|TH_ACK|TH_SYN)) == TH_ACK, ("%s: can handle only ACK", __func__)); @@ -1331,11 +1327,7 @@ struct tcpcb *tp; unsigned int *pending_counter; - /* - * Global TCP locks are held because we manipulate the PCB lists - * and create a new socket. - */ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); pending_counter = intotcpcb(sotoinpcb(*lsop))->t_tfo_pending; *lsop = syncache_socket(sc, *lsop, m); @@ -2460,46 +2452,41 @@ * amount of space the caller allocated for this function to use. */ int -syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported) +syncache_pcblist(struct sysctl_req *req) { struct xtcpcb xt; struct syncache *sc; struct syncache_head *sch; - int count, error, i; + int error, i; + + bzero(&xt, sizeof(xt)); + xt.xt_len = sizeof(xt); + xt.t_state = TCPS_SYN_RECEIVED; + xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; + xt.xt_inp.xi_socket.xso_len = sizeof (struct xsocket); + xt.xt_inp.xi_socket.so_type = SOCK_STREAM; + xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING; - for (count = 0, error = 0, i = 0; i < V_tcp_syncache.hashsize; i++) { + for (i = 0; i < V_tcp_syncache.hashsize; i++) { sch = &V_tcp_syncache.hashbase[i]; SCH_LOCK(sch); TAILQ_FOREACH(sc, &sch->sch_bucket, sc_hash) { - if (count >= max_pcbs) { - SCH_UNLOCK(sch); - goto exit; - } if (cr_cansee(req->td->td_ucred, sc->sc_cred) != 0) continue; - bzero(&xt, sizeof(xt)); - xt.xt_len = sizeof(xt); if (sc->sc_inc.inc_flags & INC_ISIPV6) xt.xt_inp.inp_vflag = INP_IPV6; else xt.xt_inp.inp_vflag = INP_IPV4; bcopy(&sc->sc_inc, &xt.xt_inp.inp_inc, sizeof (struct in_conninfo)); - xt.t_state = TCPS_SYN_RECEIVED; - xt.xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; - xt.xt_inp.xi_socket.xso_len = sizeof (struct xsocket); - xt.xt_inp.xi_socket.so_type = SOCK_STREAM; - xt.xt_inp.xi_socket.so_state = SS_ISCONNECTING; error = SYSCTL_OUT(req, &xt, sizeof xt); if (error) { SCH_UNLOCK(sch); - goto exit; + return (0); } - count++; } SCH_UNLOCK(sch); } -exit: - *pcbs_exported = count; - return error; + + return (0); } Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -339,9 +339,9 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_close(tp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); goto out; } else { @@ -353,9 +353,9 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_close(tp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -478,7 +478,7 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_drop(tp, ETIMEDOUT); #ifdef TCPDEBUG @@ -487,7 +487,7 @@ PRU_SLOWTIMO); #endif TCP_PROBE2(debug__user, tp, PRU_SLOWTIMO); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); out: CURVNET_RESTORE(); @@ -542,9 +542,9 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_drop(tp, ETIMEDOUT); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -559,9 +559,9 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_drop(tp, ETIMEDOUT); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); goto out; } @@ -628,9 +628,9 @@ tcp_inpinfo_lock_del(inp, tp); goto out; } - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); tp = tcp_drop(tp, ETIMEDOUT); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); tcp_inpinfo_lock_del(inp, tp); goto out; } Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -209,10 +209,10 @@ struct tcptw *tw; struct epoch_tracker et; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) tcp_twclose(tw, 0); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); TW_LOCK_DESTROY(V_tw_lock); uma_zdestroy(V_tcptw_zone); @@ -236,7 +236,7 @@ bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* A dropped inp should never transition to TIME_WAIT state. */ @@ -382,7 +382,7 @@ int thflags; tcp_seq seq; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* @@ -488,7 +488,7 @@ inp = tw->tw_inpcb; KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* in_pcbfree() */ + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); tcp_tw_2msl_stop(tw, reuse); @@ -644,7 +644,7 @@ tcp_tw_2msl_reset(struct tcptw *tw, int rearm) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tw->tw_inpcb); TW_WLOCK(V_tw_lock); @@ -662,7 +662,7 @@ struct inpcb *inp; int released __unused; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); TW_WLOCK(V_tw_lock); inp = tw->tw_inpcb; @@ -689,25 +689,8 @@ { struct tcptw *tw; struct inpcb *inp; - struct epoch_tracker et; -#ifdef INVARIANTS - if (reuse) { - /* - * Exclusive pcbinfo lock is not required in reuse case even if - * two inpcb locks can be acquired simultaneously: - * - the inpcb transitioning to TIME_WAIT state in - * tcp_tw_start(), - * - the inpcb closed by tcp_twclose(). - * - * It is because only inpcbs in FIN_WAIT2 or CLOSING states can - * transition in TIME_WAIT state. Then a pcbcb cannot be in - * TIME_WAIT list and transitioning to TIME_WAIT state at same - * time. - */ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } -#endif + NET_EPOCH_ASSERT(); for (;;) { TW_RLOCK(V_tw_lock); @@ -723,12 +706,10 @@ in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); INP_WLOCK(inp); tw = intotw(inp); if (in_pcbrele_wlocked(inp)) { if (__predict_true(tw == NULL)) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); continue; } else { /* This should not happen as in TIMEWAIT @@ -747,7 +728,6 @@ "|| inp last reference) && tw != " "NULL", __func__); #endif - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); break; } } @@ -755,12 +735,10 @@ if (tw == NULL) { /* tcp_twclose() has already been called */ INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); continue; } tcp_twclose(tw, reuse); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (reuse) return tw; } Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -283,7 +283,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_detach: inp == NULL")); if (!INP_INFO_WLOCKED(&V_tcbinfo)) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); rlock = 1; } INP_WLOCK(inp); @@ -291,7 +291,7 @@ ("tcp_usr_detach: inp_socket == NULL")); tcp_detach(so, inp); if (rlock) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); } #ifdef INET @@ -706,7 +706,7 @@ int error = 0; TCPDEBUG0; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_disconnect: inp == NULL")); INP_WLOCK(inp); @@ -723,7 +723,7 @@ TCPDEBUG2(PRU_DISCONNECT); TCP_PROBE2(debug__user, tp, PRU_DISCONNECT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); return (error); } @@ -792,7 +792,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp6_usr_accept: inp == NULL")); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { error = ECONNABORTED; @@ -819,7 +819,7 @@ TCPDEBUG2(PRU_ACCEPT); TCP_PROBE2(debug__user, tp, PRU_ACCEPT); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); if (error == 0) { if (v4) *nam = in6_v4mapsin6_sockaddr(port, &addr); @@ -842,7 +842,7 @@ struct epoch_tracker et; TCPDEBUG0; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("inp == NULL")); INP_WLOCK(inp); @@ -861,7 +861,7 @@ TCPDEBUG2(PRU_SHUTDOWN); TCP_PROBE2(debug__user, tp, PRU_SHUTDOWN); INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); return (error); } @@ -921,10 +921,10 @@ tcp_usr_send(struct socket *so, int flags, struct mbuf *m, struct sockaddr *nam, struct mbuf *control, struct thread *td) { + struct epoch_tracker et; int error = 0; struct inpcb *inp; struct tcpcb *tp = NULL; - struct epoch_tracker net_et; #ifdef INET #ifdef INET6 struct sockaddr_in sin; @@ -940,11 +940,11 @@ TCPDEBUG0; /* - * We require the pcbinfo lock if we will close the socket as part of - * this call. + * We require the pcbinfo "read lock" if we will close the socket + * as part of this call. */ if (flags & PRUS_EOF) - INP_INFO_RLOCK_ET(&V_tcbinfo, net_et); + NET_EPOCH_ENTER(et); inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_send: inp == NULL")); INP_WLOCK(inp); @@ -1135,7 +1135,7 @@ * Close the send side of the connection after * the data is sent. */ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); socantsendmore(so); tcp_usrclosed(tp); } @@ -1231,7 +1231,7 @@ ((flags & PRUS_EOF) ? PRU_SEND_EOF : PRU_SEND)); INP_WUNLOCK(inp); if (flags & PRUS_EOF) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, net_et); + NET_EPOCH_EXIT(et); return (error); } @@ -1275,7 +1275,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_abort: inp == NULL")); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_abort: inp_socket == NULL")); @@ -1301,7 +1301,7 @@ } INP_WUNLOCK(inp); dropped: - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); } /* @@ -1318,7 +1318,7 @@ inp = sotoinpcb(so); KASSERT(inp != NULL, ("tcp_usr_close: inp == NULL")); - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); INP_WLOCK(inp); KASSERT(inp->inp_socket != NULL, ("tcp_usr_close: inp_socket == NULL")); @@ -1342,7 +1342,7 @@ inp->inp_flags |= INP_SOCKREF; } INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); } /* @@ -2317,10 +2317,10 @@ } so->so_rcv.sb_flags |= SB_AUTOSIZE; so->so_snd.sb_flags |= SB_AUTOSIZE; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_ENTER(et); error = in_pcballoc(so, &V_tcbinfo); if (error) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); return (error); } inp = sotoinpcb(so); @@ -2338,12 +2338,12 @@ if (tp == NULL) { in_pcbdetach(inp); in_pcbfree(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); return (ENOBUFS); } tp->t_state = TCPS_CLOSED; INP_WUNLOCK(inp); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + NET_EPOCH_EXIT(et); TCPSTATES_INC(TCPS_CLOSED); return (0); } @@ -2362,7 +2362,7 @@ struct inpcb *inp = tp->t_inpcb; struct socket *so = inp->inp_socket; - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(inp); /* @@ -2401,7 +2401,7 @@ tcp_usrclosed(struct tcpcb *tp) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); INP_WLOCK_ASSERT(tp->t_inpcb); switch (tp->t_state) { Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -360,7 +360,7 @@ struct tcphdr *th, struct socket **lsop) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); return (syncache_expand(inc, to, th, lsop, NULL)); } @@ -390,8 +390,6 @@ INP_WLOCK_ASSERT(inp); if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { - - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); /* for twcheck */ if (!tcp_twcheck(inp, NULL, th, NULL, 0)) return (EADDRINUSE); } else { @@ -529,7 +527,7 @@ (void) tp->t_fb->tfb_tcp_output(tp); } else { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + NET_EPOCH_ASSERT(); tp = tcp_drop(tp, err); if (tp == NULL) INP_WLOCK(inp); /* re-acquire */ Index: sys/netinet/udp_usrreq.c =================================================================== --- sys/netinet/udp_usrreq.c +++ sys/netinet/udp_usrreq.c @@ -399,7 +399,6 @@ struct sockaddr_in udp_in[2]; struct mbuf *m; struct m_tag *fwd_tag; - struct epoch_tracker et; int cscov_partial, iphlen; m = *mp; @@ -529,7 +528,8 @@ struct inpcb *last; struct inpcbhead *pcblist; - INP_INFO_RLOCK_ET(pcbinfo, et); + NET_EPOCH_ASSERT(); + pcblist = udp_get_pcblist(proto); last = NULL; CK_LIST_FOREACH(inp, pcblist, inp_list) { @@ -636,7 +636,6 @@ UDPSTAT_INC(udps_noportbcast); if (inp) INP_RUNLOCK(inp); - INP_INFO_RUNLOCK_ET(pcbinfo, et); goto badunlocked; } if (proto == IPPROTO_UDPLITE) @@ -646,7 +645,6 @@ if (udp_append(last, ip, m, iphlen, udp_in) == 0) INP_RUNLOCK(last); inp_lost: - INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } @@ -854,87 +852,53 @@ static int udp_pcblist(SYSCTL_HANDLER_ARGS) { - int error, i, n; - struct inpcb *inp, **inp_list; - inp_gen_t gencnt; struct xinpgen xig; struct epoch_tracker et; + struct inpcb *inp; + int error; + + if (req->newptr != 0) + return (EPERM); - /* - * The process of preparing the PCB list is too time-consuming and - * resource-intensive to repeat twice on every request. - */ if (req->oldptr == 0) { + int n; + n = V_udbinfo.ipi_count; n += imax(n / 8, 10); req->oldidx = 2 * (sizeof xig) + n * sizeof(struct xinpcb); return (0); } - if (req->newptr != 0) - return (EPERM); - - /* - * OK, now we're committed to doing something. - */ - INP_INFO_RLOCK_ET(&V_udbinfo, et); - gencnt = V_udbinfo.ipi_gencnt; - n = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK_ET(&V_udbinfo, et); - - error = sysctl_wire_old_buffer(req, 2 * (sizeof xig) - + n * sizeof(struct xinpcb)); - if (error != 0) + if ((error = sysctl_wire_old_buffer(req, 0)) != 0) return (error); bzero(&xig, sizeof(xig)); xig.xig_len = sizeof xig; - xig.xig_count = n; - xig.xig_gen = gencnt; + xig.xig_count = V_udbinfo.ipi_count; + xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; error = SYSCTL_OUT(req, &xig, sizeof xig); if (error) return (error); - inp_list = malloc(n * sizeof *inp_list, M_TEMP, M_WAITOK); - if (inp_list == NULL) - return (ENOMEM); - - INP_INFO_RLOCK_ET(&V_udbinfo, et); - for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead), i = 0; inp && i < n; - inp = CK_LIST_NEXT(inp, inp_list)) { - INP_WLOCK(inp); - if (inp->inp_gencnt <= gencnt && - cr_canseeinpcb(req->td->td_ucred, inp) == 0) { - in_pcbref(inp); - inp_list[i++] = inp; - } - INP_WUNLOCK(inp); - } - INP_INFO_RUNLOCK_ET(&V_udbinfo, et); - n = i; - - error = 0; - for (i = 0; i < n; i++) { - inp = inp_list[i]; + NET_EPOCH_ENTER(et); + for (inp = CK_LIST_FIRST(V_udbinfo.ipi_listhead); + inp != NULL; + inp = CK_LIST_NEXT(inp, inp_list)) { INP_RLOCK(inp); - if (inp->inp_gencnt <= gencnt) { + if (inp->inp_gencnt <= xig.xig_gen && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { struct xinpcb xi; in_pcbtoxinpcb(inp, &xi); INP_RUNLOCK(inp); error = SYSCTL_OUT(req, &xi, sizeof xi); + if (error) + break; } else INP_RUNLOCK(inp); } - INP_INFO_WLOCK(&V_udbinfo); - for (i = 0; i < n; i++) { - inp = inp_list[i]; - INP_RLOCK(inp); - if (!in_pcbrele_rlocked(inp)) - INP_RUNLOCK(inp); - } - INP_INFO_WUNLOCK(&V_udbinfo); + NET_EPOCH_EXIT(et); if (!error) { /* @@ -943,14 +907,12 @@ * that something happened while we were processing this * request, and it might be necessary to retry. */ - INP_INFO_RLOCK_ET(&V_udbinfo, et); xig.xig_gen = V_udbinfo.ipi_gencnt; xig.xig_sogen = so_gencnt; xig.xig_count = V_udbinfo.ipi_count; - INP_INFO_RUNLOCK_ET(&V_udbinfo, et); error = SYSCTL_OUT(req, &xig, sizeof xig); } - free(inp_list, M_TEMP); + return (error); } @@ -964,6 +926,7 @@ { struct xucred xuc; struct sockaddr_in addrs[2]; + struct epoch_tracker et; struct inpcb *inp; int error; @@ -973,9 +936,11 @@ error = SYSCTL_IN(req, addrs, sizeof(addrs)); if (error) return (error); + NET_EPOCH_ENTER(et); inp = in_pcblookup(&V_udbinfo, addrs[1].sin_addr, addrs[1].sin_port, addrs[0].sin_addr, addrs[0].sin_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); + NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) @@ -1116,9 +1081,6 @@ } #ifdef INET -#define UH_WLOCKED 2 -#define UH_RLOCKED 1 -#define UH_UNLOCKED 0 static int udp_output(struct inpcb *inp, struct mbuf *m, struct sockaddr *addr, struct mbuf *control, struct thread *td) @@ -1134,19 +1096,12 @@ int error = 0; int ipflags; u_short fport, lport; - int unlock_udbinfo, unlock_inp; u_char tos; uint8_t pr; uint16_t cscov = 0; uint32_t flowid = 0; uint8_t flowtype = M_HASHTYPE_NONE; - /* - * udp_output() may need to temporarily bind or connect the current - * inpcb. As such, we don't know up front whether we will need the - * pcbinfo lock or not. Do any work to decide what is needed up - * front before acquiring any locks. - */ if (len + sizeof(struct udpiphdr) > IP_MAXPACKET) { if (control) m_freem(control); @@ -1156,28 +1111,22 @@ src.sin_family = 0; sin = (struct sockaddr_in *)addr; -retry: + + /* + * udp_output() may need to temporarily bind or connect the current + * inpcb. As such, we don't know up front whether we will need the + * pcbinfo lock or not. Do any work to decide what is needed up + * front before acquiring any locks. + * + * We will need network epoch in either case, to safely lookup into + * pcb hash. + */ if (sin == NULL || - (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { + (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) INP_WLOCK(inp); - /* - * In case we lost a race and another thread bound addr/port - * on the inp we cannot keep the wlock (which still would be - * fine) as further down, based on these values we make - * decisions for the pcbinfo lock. If the locks are not in - * synch the assertions on unlock will fire, hence we go for - * one retry loop. - */ - if (sin != NULL && (inp->inp_laddr.s_addr != INADDR_ANY || - inp->inp_lport != 0)) { - INP_WUNLOCK(inp); - goto retry; - } - unlock_inp = UH_WLOCKED; - } else { + else INP_RLOCK(inp); - unlock_inp = UH_RLOCKED; - } + NET_EPOCH_ENTER(et); tos = inp->inp_ip_tos; if (control != NULL) { /* @@ -1185,13 +1134,9 @@ * stored in a single mbuf. */ if (control->m_next) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); m_freem(control); - m_freem(m); - return (EINVAL); + error = EINVAL; + goto release; } for (; control->m_len > 0; control->m_data += CMSG_ALIGN(cm->cmsg_len), @@ -1262,56 +1207,11 @@ } m_freem(control); } - if (error) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); - m_freem(m); - return (error); - } + if (error) + goto release; - /* - * In the old days, depending on whether or not the application had - * bound or connected the socket, we had to do varying levels of work. - * The optimal case was for a connected UDP socket, as a global lock - * wasn't required at all. - * In order to decide which we need, we required stability of the - * inpcb binding, which we ensured by acquiring a read lock on the - * inpcb. This didn't strictly follow the lock order, so we played - * the trylock and retry game. - * With the re-introduction of the route-cache in some cases, we started - * to acquire an early inp wlock and a possible race during re-lock - * went away. With the introduction of epoch(9) some read locking - * became epoch(9) and the lock-order issues also went away. - * Due to route-cache we may now hold more conservative locks than - * otherwise required and have split up the 2nd case in case 2 and 3 - * in order to keep the udpinfo lock level in sync with the inp one - * for the IP_SENDSRCADDR case below. - */ pr = inp->inp_socket->so_proto->pr_protocol; pcbinfo = udp_get_inpcbinfo(pr); - if (sin != NULL && - (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0)) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else if (sin != NULL && - (sin->sin_addr.s_addr == INADDR_ANY || - sin->sin_addr.s_addr == INADDR_BROADCAST || - inp->inp_laddr.s_addr == INADDR_ANY || - inp->inp_lport == 0)) { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } else if (src.sin_family == AF_INET) { - if (unlock_inp == UH_WLOCKED) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } - } else - unlock_udbinfo = UH_UNLOCKED; /* * If the IP_SENDSRCADDR control message was specified, override the @@ -1387,7 +1287,6 @@ if (inp->inp_laddr.s_addr == INADDR_ANY && inp->inp_lport == 0) { INP_WLOCK_ASSERT(inp); - INP_HASH_WLOCK_ASSERT(pcbinfo); /* * Remember addr if jailed, to prevent * rebinding. @@ -1395,7 +1294,10 @@ if (prison_flag(td->td_ucred, PR_IP4)) inp->inp_laddr = laddr; inp->inp_lport = lport; - if (in_pcbinshash(inp) != 0) { + INP_HASH_WLOCK(pcbinfo); + error = in_pcbinshash(inp); + INP_HASH_WUNLOCK(pcbinfo); + if (error != 0) { inp->inp_lport = 0; error = EAGAIN; goto release; @@ -1560,48 +1462,20 @@ ipflags |= IP_NODEFAULTFLOWID; #endif /* RSS */ - if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(pcbinfo); - else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK_ET(pcbinfo, et); if (pr == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); else UDP_PROBE(send, NULL, inp, &ui->ui_i, inp, &ui->ui_u); error = ip_output(m, inp->inp_options, - (unlock_inp == UH_WLOCKED ? &inp->inp_route : NULL), ipflags, + INP_WLOCKED(inp) ? &inp->inp_route : NULL, ipflags, inp->inp_moptions, inp); - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); return (error); release: - if (unlock_udbinfo == UH_WLOCKED) { - KASSERT(unlock_inp == UH_WLOCKED, - ("%s: excl udbinfo lock %#03x, shared inp lock %#03x, " - "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " - "src fam %#04x", - __func__, unlock_udbinfo, unlock_inp, sin, - (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, - inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); - INP_HASH_WUNLOCK(pcbinfo); - INP_WUNLOCK(inp); - } else if (unlock_udbinfo == UH_RLOCKED) { - KASSERT(unlock_inp == UH_RLOCKED, - ("%s: shared udbinfo lock %#03x, excl inp lock %#03x, " - "sin %p daddr %#010x inp %p laddr %#010x lport %#06x " - "src fam %#04x", - __func__, unlock_udbinfo, unlock_inp, sin, - (sin != NULL) ? sin->sin_addr.s_addr : 0xfefefefe, inp, - inp->inp_laddr.s_addr, inp->inp_lport, src.sin_family)); - INP_HASH_RUNLOCK_ET(pcbinfo, et); - INP_RUNLOCK(inp); - } else if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); m_freem(m); return (error); } Index: sys/netinet6/icmp6.c =================================================================== --- sys/netinet6/icmp6.c +++ sys/netinet6/icmp6.c @@ -1893,9 +1893,10 @@ struct inpcb *last = NULL; struct sockaddr_in6 fromsa; struct icmp6_hdr *icmp6; - struct epoch_tracker et; struct mbuf *opts = NULL; + NET_EPOCH_ASSERT(); + #ifndef PULLDOWN_TEST /* this is assumed to be safe. */ icmp6 = (struct icmp6_hdr *)((caddr_t)ip6 + off); @@ -1920,7 +1921,6 @@ return (IPPROTO_DONE); } - INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { if ((inp->inp_vflag & INP_IPV6) == 0) continue; @@ -2002,7 +2002,6 @@ } last = inp; } - INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); if (last != NULL) { if (last->inp_flags & INP_CONTROLOPTS) ip6_savecontrol(last, m, &opts); Index: sys/netinet6/in6_pcb.c =================================================================== --- sys/netinet6/in6_pcb.c +++ sys/netinet6/in6_pcb.c @@ -1245,7 +1245,6 @@ { struct inpcb *inp; - INP_HASH_RLOCK(pcbinfo); inp = in6_pcblookup_hash_locked(pcbinfo, faddr, fport, laddr, lport, (lookupflags & ~(INPLOOKUP_RLOCKPCB | INPLOOKUP_WLOCKPCB)), ifp); if (inp != NULL) { @@ -1272,7 +1271,6 @@ } #endif } - INP_HASH_RUNLOCK(pcbinfo); return (inp); } Index: sys/netinet6/raw_ip6.c =================================================================== --- sys/netinet6/raw_ip6.c +++ sys/netinet6/raw_ip6.c @@ -165,7 +165,8 @@ struct inpcb *last = NULL; struct mbuf *opts = NULL; struct sockaddr_in6 fromsa; - struct epoch_tracker et; + + NET_EPOCH_ASSERT(); RIP6STAT_INC(rip6s_ipackets); @@ -173,7 +174,6 @@ ifp = m->m_pkthdr.rcvif; - INP_INFO_RLOCK_ET(&V_ripcbinfo, et); CK_LIST_FOREACH(inp, &V_ripcb, inp_list) { /* XXX inp locking */ if ((inp->inp_vflag & INP_IPV6) == 0) @@ -303,7 +303,6 @@ skip_2: INP_RUNLOCK(inp); } - INP_INFO_RUNLOCK_ET(&V_ripcbinfo, et); #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Check AH/ESP integrity. Index: sys/netinet6/udp6_usrreq.c =================================================================== --- sys/netinet6/udp6_usrreq.c +++ sys/netinet6/udp6_usrreq.c @@ -214,12 +214,13 @@ int off = *offp; int cscov_partial; int plen, ulen; - struct epoch_tracker et; struct sockaddr_in6 fromsa[2]; struct m_tag *fwd_tag; uint16_t uh_sum; uint8_t nxt; + NET_EPOCH_ASSERT(); + ifp = m->m_pkthdr.rcvif; #ifndef PULLDOWN_TEST @@ -301,7 +302,6 @@ struct inpcbhead *pcblist; struct ip6_moptions *imo; - INP_INFO_RLOCK_ET(pcbinfo, et); /* * In the event that laddr should be set to the link-local * address (this happens in RIPng), the multicast address @@ -395,7 +395,7 @@ UDP_PROBE(receive, NULL, last, ip6, last, uh); if (udp6_append(last, n, off, fromsa)) - goto inp_lost; + return (IPPROTO_DONE); } INP_RUNLOCK(last); } @@ -422,7 +422,7 @@ */ UDPSTAT_INC(udps_noport); UDPSTAT_INC(udps_noportmcast); - goto badheadlocked; + goto badunlocked; } INP_RLOCK(last); if (__predict_true(last->inp_flags2 & INP_FREED) == 0) { @@ -434,8 +434,6 @@ INP_RUNLOCK(last); } else INP_RUNLOCK(last); - inp_lost: - INP_INFO_RUNLOCK_ET(pcbinfo, et); return (IPPROTO_DONE); } /* @@ -522,8 +520,6 @@ INP_RUNLOCK(inp); return (IPPROTO_DONE); -badheadlocked: - INP_INFO_RUNLOCK_ET(pcbinfo, et); badunlocked: if (m) m_freem(m); @@ -638,6 +634,7 @@ { struct xucred xuc; struct sockaddr_in6 addrs[2]; + struct epoch_tracker et; struct inpcb *inp; int error; @@ -656,9 +653,11 @@ (error = sa6_embedscope(&addrs[1], V_ip6_use_defzone)) != 0) { return (error); } + NET_EPOCH_ENTER(et); inp = in6_pcblookup(&V_udbinfo, &addrs[1].sin6_addr, addrs[1].sin6_port, &addrs[0].sin6_addr, addrs[0].sin6_port, INPLOOKUP_WILDCARD | INPLOOKUP_RLOCKPCB, NULL); + NET_EPOCH_EXIT(et); if (inp != NULL) { INP_RLOCK_ASSERT(inp); if (inp->inp_socket == NULL) @@ -679,14 +678,10 @@ SYSCTL_PROC(_net_inet6_udp6, OID_AUTO, getcred, CTLTYPE_OPAQUE|CTLFLAG_RW, 0, 0, udp6_getcred, "S,xucred", "Get the xucred of a UDP6 connection"); -#define UH_WLOCKED 2 -#define UH_RLOCKED 1 -#define UH_UNLOCKED 0 static int udp6_output(struct socket *so, int flags_arg, struct mbuf *m, struct sockaddr *addr6, struct mbuf *control, struct thread *td) { - struct inpcbinfo *pcbinfo; struct inpcb *inp; struct ip6_hdr *ip6; struct udphdr *udp6; @@ -698,7 +693,7 @@ u_int32_t ulen, plen; uint16_t cscov; u_short fport; - uint8_t nxt, unlock_inp, unlock_udbinfo; + uint8_t nxt; /* addr6 has been validated in udp6_send(). */ sin6 = (struct sockaddr_in6 *)addr6; @@ -741,30 +736,17 @@ * - on connected sockets (sin6 is NULL) for route cache updates, * - when we are not bound to an address and source port (it is * in6_pcbsetport() which will require the write lock). + * + * We check the inp fields before actually locking the inp, so + * here exists a race, and we may WLOCK the inp and end with already + * bound one by other thread. This is fine. */ -retry: if (sin6 == NULL || (IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && - inp->inp_lport == 0)) { + inp->inp_lport == 0)) INP_WLOCK(inp); - /* - * In case we lost a race and another thread bound addr/port - * on the inp we cannot keep the wlock (which still would be - * fine) as further down, based on these values we make - * decisions for the pcbinfo lock. If the locks are not in - * synch the assertions on unlock will fire, hence we go for - * one retry loop. - */ - if (sin6 != NULL && - (!IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || - inp->inp_lport != 0)) { - INP_WUNLOCK(inp); - goto retry; - } - unlock_inp = UH_WLOCKED; - } else { + else INP_RLOCK(inp); - unlock_inp = UH_RLOCKED; - } + nxt = (inp->inp_socket->so_proto->pr_protocol == IPPROTO_UDP) ? IPPROTO_UDP : IPPROTO_UDPLITE; @@ -788,10 +770,7 @@ * potential race in which the factors causing us to * select the UDPv4 output routine are invalidated? */ - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); if (sin6) in6_sin6_2_sin_in_sock((struct sockaddr *)sin6); pru = inetsw[ip_protox[nxt]].pr_usrreqs; @@ -806,21 +785,17 @@ * Given this is either an IPv6-only socket or no INET is * supported we will fail the send if the given destination * address is a v4mapped address. + * + * XXXGL: do we leak m and control? */ - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); return (EINVAL); } if (control) { if ((error = ip6_setpktopts(control, &opt, inp->in6p_outputopts, td->td_ucred, nxt)) != 0) { - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); ip6_clearpktopts(&opt, -1); if (control) m_freem(control); @@ -831,20 +806,7 @@ } else optp = inp->in6p_outputopts; - pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); - if (sin6 != NULL && - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) && inp->inp_lport == 0) { - INP_HASH_WLOCK(pcbinfo); - unlock_udbinfo = UH_WLOCKED; - } else if (sin6 != NULL && - (IN6_IS_ADDR_UNSPECIFIED(&sin6->sin6_addr) || - IN6_IS_ADDR_UNSPECIFIED(&inp->in6p_laddr) || - inp->inp_lport == 0)) { - INP_HASH_RLOCK_ET(pcbinfo, et); - unlock_udbinfo = UH_RLOCKED; - } else - unlock_udbinfo = UH_UNLOCKED; - + NET_EPOCH_ENTER(et); if (sin6) { /* @@ -880,9 +842,14 @@ laddr = &in6a; if (inp->inp_lport == 0) { + struct inpcbinfo *pcbinfo; INP_WLOCK_ASSERT(inp); + + pcbinfo = udp_get_inpcbinfo(so->so_proto->pr_protocol); + INP_HASH_WLOCK(pcbinfo); error = in6_pcbsetport(laddr, inp, td->td_ucred); + INP_HASH_WUNLOCK(pcbinfo); if (error != 0) { /* Undo an address bind that may have occurred. */ inp->in6p_laddr = in6addr_any; @@ -1006,21 +973,15 @@ #endif UDPSTAT_INC(udps_opackets); - if (unlock_udbinfo == UH_WLOCKED) - INP_HASH_WUNLOCK(pcbinfo); - else if (unlock_udbinfo == UH_RLOCKED) - INP_HASH_RUNLOCK_ET(pcbinfo, et); if (nxt == IPPROTO_UDPLITE) UDPLITE_PROBE(send, NULL, inp, ip6, inp, udp6); else UDP_PROBE(send, NULL, inp, ip6, inp, udp6); error = ip6_output(m, optp, - (unlock_inp == UH_WLOCKED) ? &inp->inp_route6 : NULL, flags, + INP_WLOCKED(inp) ? &inp->inp_route6 : NULL, flags, inp->in6p_moptions, NULL, inp); - if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); @@ -1029,22 +990,8 @@ return (error); release: - if (unlock_udbinfo == UH_WLOCKED) { - KASSERT(unlock_inp == UH_WLOCKED, ("%s: excl udbinfo lock, " - "non-excl inp lock: pcbinfo %p %#x inp %p %#x", - __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); - INP_HASH_WUNLOCK(pcbinfo); - INP_WUNLOCK(inp); - } else if (unlock_udbinfo == UH_RLOCKED) { - KASSERT(unlock_inp == UH_RLOCKED, ("%s: non-excl udbinfo lock, " - "excl inp lock: pcbinfo %p %#x inp %p %#x", - __func__, pcbinfo, unlock_udbinfo, inp, unlock_inp)); - INP_HASH_RUNLOCK_ET(pcbinfo, et); - INP_RUNLOCK(inp); - } else if (unlock_inp == UH_WLOCKED) - INP_WUNLOCK(inp); - else - INP_RUNLOCK(inp); + INP_UNLOCK(inp); + NET_EPOCH_EXIT(et); if (control) { ip6_clearpktopts(&opt, -1); m_freem(control);