Index: sys/kern/subr_witness.c =================================================================== --- sys/kern/subr_witness.c +++ sys/kern/subr_witness.c @@ -561,14 +561,14 @@ /* * UDP/IP */ - { "udp", &lock_class_rw }, + { "udp", &lock_class_mtx_sleep }, { "udpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, /* * TCP/IP */ - { "tcp", &lock_class_rw }, + { "tcp", &lock_class_mtx_sleep }, { "tcpinp", &lock_class_rw }, { "so_snd", &lock_class_mtx_sleep }, { NULL, NULL }, Index: sys/netinet/in_pcb.h =================================================================== --- sys/netinet/in_pcb.h +++ sys/netinet/in_pcb.h @@ -51,6 +51,8 @@ #include #include #include +#include +#include #include #endif #include @@ -157,6 +159,7 @@ * Key: * (b) - Protected by the hpts lock. * (c) - Constant after initialization + * (e) - Protected by the net_epoch_prempt epoch * (g) - Protected by the pcbgroup lock * (i) - Protected by the inpcb lock * (p) - Protected by the pcbinfo lock for the inpcb @@ -231,7 +234,7 @@ struct m_snd_tag; struct inpcb { /* Cache line #1 (amd64) */ - CK_LIST_ENTRY(inpcb) inp_hash; /* (h/i) hash list */ + CK_LIST_ENTRY(inpcb) inp_hash; /* [w](h/i) [r](e/i) hash list */ CK_LIST_ENTRY(inpcb) inp_pcbgrouphash; /* (g/i) hash list */ struct rwlock inp_lock; /* Cache line #2 (amd64) */ @@ -324,8 +327,8 @@ struct route_in6 inp_route6; }; CK_LIST_ENTRY(inpcb) inp_list; /* (p/l) list for all PCBs for proto */ - /* (p[w]) for list iteration */ - /* (p[r]/l) for addition/removal */ + /* (e[r]) for list iteration */ + /* (p[w]/l) for addition/removal */ struct epoch_context inp_epoch_ctx; }; #endif /* _KERNEL */ @@ -436,22 +439,23 @@ * Locking key: * * (c) Constant or nearly constant after initialisation + * (e) - Protected by the net_epoch_prempt epoch * (g) Locked by ipi_lock * (l) Locked by ipi_list_lock - * (h) Read using either ipi_hash_lock or inpcb lock; write requires both + * (h) Read using either net_epoch_preempt or inpcb lock; write requires both ipi_hash_lock and inpcb lock * (p) Protected by one or more pcbgroup locks * (x) Synchronisation properties poorly defined */ struct inpcbinfo { /* - * Global lock protecting full inpcb list traversal + * Global lock protecting inpcb modification */ - struct rwlock ipi_lock; + struct mtx ipi_lock; /* * Global list of inpcbs on the protocol. */ - struct inpcbhead *ipi_listhead; /* (g/l) */ + struct inpcbhead *ipi_listhead; /* [r](e) [w](g/l) */ u_int ipi_count; /* (l) */ /* @@ -482,9 +486,9 @@ u_int ipi_hashfields; /* (c) */ /* - * Global lock protecting non-pcbgroup hash lookup tables. + * Global lock protecting modification non-pcbgroup hash lookup tables. */ - struct rwlock ipi_hash_lock; + struct mtx ipi_hash_lock; /* * Global hash of inpcbs, hashed by local and foreign addresses and @@ -626,20 +630,18 @@ #endif /* _KERNEL */ #define INP_INFO_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_lock, (d), RW_RECURSE) -#define INP_INFO_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_lock) -#define INP_INFO_RLOCK(ipi) rw_rlock(&(ipi)->ipi_lock) -#define INP_INFO_WLOCK(ipi) rw_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_RLOCK(ipi) rw_try_rlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_WLOCK(ipi) rw_try_wlock(&(ipi)->ipi_lock) -#define INP_INFO_TRY_UPGRADE(ipi) rw_try_upgrade(&(ipi)->ipi_lock) -#define INP_INFO_WLOCKED(ipi) rw_wowned(&(ipi)->ipi_lock) -#define INP_INFO_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_lock) -#define INP_INFO_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_lock) -#define INP_INFO_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_LOCKED) -#define INP_INFO_RLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_RLOCKED) -#define INP_INFO_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_WLOCKED) -#define INP_INFO_UNLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_lock, RA_UNLOCKED) + mtx_init(&(ipi)->ipi_lock, (d), NULL, MTX_DEF| MTX_RECURSE) +#define INP_INFO_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_lock) +#define INP_INFO_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_INFO_WLOCK(ipi) mtx_lock(&(ipi)->ipi_lock) +#define INP_INFO_TRY_WLOCK(ipi) mtx_trylock(&(ipi)->ipi_lock) +#define INP_INFO_WLOCKED(ipi) mtx_owned(&(ipi)->ipi_lock) +#define INP_INFO_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_INFO_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_lock) +#define INP_INFO_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_lock)) +#define INP_INFO_RLOCK_ASSERT(ipi) MPASS(in_epoch()) +#define INP_INFO_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_lock, MA_OWNED) +#define INP_INFO_UNLOCK_ASSERT(ipi) MPASS(!in_epoch() && !mtx_owned(&(ipi)->ipi_lock)) #define INP_LIST_LOCK_INIT(ipi, d) \ rw_init_flags(&(ipi)->ipi_list_lock, (d), 0) @@ -660,17 +662,14 @@ #define INP_LIST_UNLOCK_ASSERT(ipi) \ rw_assert(&(ipi)->ipi_list_lock, RA_UNLOCKED) -#define INP_HASH_LOCK_INIT(ipi, d) \ - rw_init_flags(&(ipi)->ipi_hash_lock, (d), 0) -#define INP_HASH_LOCK_DESTROY(ipi) rw_destroy(&(ipi)->ipi_hash_lock) -#define INP_HASH_RLOCK(ipi) rw_rlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WLOCK(ipi) rw_wlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_RUNLOCK(ipi) rw_runlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_WUNLOCK(ipi) rw_wunlock(&(ipi)->ipi_hash_lock) -#define INP_HASH_LOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_LOCKED) -#define INP_HASH_WLOCK_ASSERT(ipi) rw_assert(&(ipi)->ipi_hash_lock, \ - RA_WLOCKED) +#define INP_HASH_LOCK_INIT(ipi, d) mtx_init(&(ipi)->ipi_hash_lock, (d), NULL, MTX_DEF) +#define INP_HASH_LOCK_DESTROY(ipi) mtx_destroy(&(ipi)->ipi_hash_lock) +#define INP_HASH_RLOCK(ipi) NET_EPOCH_ENTER() +#define INP_HASH_WLOCK(ipi) mtx_lock(&(ipi)->ipi_hash_lock) +#define INP_HASH_RUNLOCK(ipi) NET_EPOCH_EXIT() +#define INP_HASH_WUNLOCK(ipi) mtx_unlock(&(ipi)->ipi_hash_lock) +#define INP_HASH_LOCK_ASSERT(ipi) MPASS(in_epoch() || mtx_owned(&(ipi)->ipi_hash_lock)) +#define INP_HASH_WLOCK_ASSERT(ipi) mtx_assert(&(ipi)->ipi_hash_lock, MA_OWNED); #define INP_GROUP_LOCK_INIT(ipg, d) mtx_init(&(ipg)->ipg_lock, (d), NULL, \ MTX_DEF | MTX_DUPOK) Index: sys/netinet/in_pcb.c =================================================================== --- sys/netinet/in_pcb.c +++ sys/netinet/in_pcb.c @@ -2209,7 +2209,14 @@ locked = INP_TRY_RLOCK(inp); else panic("%s: locking bug", __func__); - if (!locked) + if (__predict_false(locked && (inp->inp_flags2 & INP_FREED))) { + if (lookupflags & INPLOOKUP_WLOCKPCB) + INP_WUNLOCK(inp); + else + INP_RUNLOCK(inp); + INP_HASH_RUNLOCK(pcbinfo); + return (NULL); + } else if (!locked) in_pcbref(inp); INP_GROUP_UNLOCK(pcbgroup); if (!locked) { Index: sys/netinet/tcp_hpts.c =================================================================== --- sys/netinet/tcp_hpts.c +++ sys/netinet/tcp_hpts.c @@ -185,9 +185,6 @@ static struct tcp_hptsi tcp_pace; -static int -tcp_hptsi_lock_inpinfo(struct inpcb *inp, - struct tcpcb **tp); static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); @@ -499,59 +496,6 @@ 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); -/* - * Try to get the INP_INFO lock. - * - * This function always succeeds in getting the lock. It will clear - * *tpp and return (1) if something critical changed while the inpcb - * was unlocked. Otherwise, it will leave *tpp unchanged and return (0). - * - * This function relies on the fact that the hpts always holds a - * reference on the inpcb while the segment is on the hptsi wheel and - * in the input queue. - * - */ -static int -tcp_hptsi_lock_inpinfo(struct inpcb *inp, struct tcpcb **tpp) -{ - struct tcp_function_block *tfb; - struct tcpcb *tp; - void *ptr; - - /* Try the easy way. */ - if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) - return (0); - - /* - * OK, let's try the hard way. We'll save the function pointer block - * to make sure that doesn't change while we aren't holding the - * lock. - */ - tp = *tpp; - tfb = tp->t_fb; - ptr = tp->t_fb_ptr; - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - INP_WLOCK(inp); - /* If the session went away, return an error. */ - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { - *tpp = NULL; - return (1); - } - /* - * If the function block or stack-specific data block changed, - * report an error. - */ - tp = intotcpcb(inp); - if ((tp->t_fb != tfb) && (tp->t_fb_ptr != ptr)) { - *tpp = NULL; - return (1); - } - return (0); -} - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { @@ -1289,10 +1233,7 @@ (m->m_pkthdr.pace_lock == TI_RLOCKED || tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - if (tcp_hptsi_lock_inpinfo(inp, &tp)) { - CURVNET_RESTORE(); - goto out; - } + INP_INFO_RLOCK(&V_tcbinfo); m = tp->t_in_pkt; } if (in_newts_every_tcb) { @@ -1359,7 +1300,6 @@ */ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { - out_free: while (m) { m_freem(m); m = n; @@ -1376,8 +1316,7 @@ if (ti_locked == TI_UNLOCKED && (tp->t_state != TCPS_ESTABLISHED)) { ti_locked = TI_RLOCKED; - if (tcp_hptsi_lock_inpinfo(inp, &tp)) - goto out_free; + INP_INFO_RLOCK(&V_tcbinfo); } } /** end while(m) */ } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -960,25 +960,10 @@ * * XXXRW: It may be time to rethink timewait locking. */ -relocked: if (inp->inp_flags & INP_TIMEWAIT) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); @@ -1026,23 +1011,8 @@ (tp->t_state == TCPS_LISTEN && (thflags & TH_SYN) && !IS_FASTOPEN(tp->t_flags)))) { if (ti_locked == TI_UNLOCKED) { - if (INP_INFO_TRY_RLOCK(&V_tcbinfo) == 0) { - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - inp = NULL; - goto findpcb; - } else if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - inp = NULL; - goto findpcb; - } - goto relocked; - } else - ti_locked = TI_RLOCKED; + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; } INP_INFO_RLOCK_ASSERT(&V_tcbinfo); } Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -6814,34 +6814,8 @@ * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ - if (ti_locked != TI_RLOCKED && INP_INFO_TRY_RLOCK(&V_tcbinfo)) - ti_locked = TI_RLOCKED; - if (ti_locked != TI_RLOCKED) { - inp = tp->t_inpcb; - tfb = tp->t_fb; - in_pcbref(inp); - INP_WUNLOCK(inp); - INP_INFO_RLOCK(&V_tcbinfo); - ti_locked = TI_RLOCKED; - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) - inp = NULL; - if (inp == NULL || (inp->inp_flags2 & INP_FREED) || - (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED))) { - /* The TCPCB went away. Free the packet. */ - INP_INFO_RUNLOCK(&V_tcbinfo); - if (inp) - INP_WUNLOCK(inp); - m_freem(m); - return; - } - /* If the stack changed, call the correct stack. */ - if (tp->t_fb != tfb) { - tp->t_fb->tfb_tcp_do_segment(m, th, so, tp, - drop_hdrlen, tlen, iptos, ti_locked); - return; - } - } + INP_INFO_RLOCK(); + ti_locked = TI_RLOCKED; tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, ti_locked, 0, &tv); Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -707,54 +707,46 @@ in_pcbref(inp); TW_RUNLOCK(V_tw_lock); - if (INP_INFO_TRY_RLOCK(&V_tcbinfo)) { - - INP_WLOCK(inp); - tw = intotw(inp); - if (in_pcbrele_wlocked(inp)) { - if (__predict_true(tw == NULL)) { - INP_INFO_RUNLOCK(&V_tcbinfo); - continue; - } else { - /* This should not happen as in TIMEWAIT - * state the inp should not be destroyed - * before its tcptw. If INVARIANTS is - * defined panic. - */ + INP_INFO_RLOCK(&V_tcbinfo); + INP_WLOCK(inp); + tw = intotw(inp); + if (in_pcbrele_wlocked(inp)) { + if (__predict_true(tw == NULL)) { + INP_INFO_RUNLOCK(&V_tcbinfo); + continue; + } else { + /* This should not happen as in TIMEWAIT + * state the inp should not be destroyed + * before its tcptw. If INVARIANTS is + * defined panic. + */ #ifdef INVARIANTS - panic("%s: Panic before an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + panic("%s: Panic before an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #else - log(LOG_ERR, "%s: Avoid an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); + log(LOG_ERR, "%s: Avoid an infinite " + "loop: INP_TIMEWAIT && (INP_FREED " + "|| inp last reference) && tw != " + "NULL", __func__); #endif - INP_INFO_RUNLOCK(&V_tcbinfo); - break; - } - } - - if (tw == NULL) { - /* tcp_twclose() has already been called */ - INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); - continue; + break; } + } - tcp_twclose(tw, reuse); + if (tw == NULL) { + /* tcp_twclose() has already been called */ + INP_WUNLOCK(inp); INP_INFO_RUNLOCK(&V_tcbinfo); - if (reuse) - return tw; - } else { - /* INP_INFO lock is busy, continue later. */ - INP_WLOCK(inp); - if (!in_pcbrele_wlocked(inp)) - INP_WUNLOCK(inp); - break; + continue; } + + tcp_twclose(tw, reuse); + INP_INFO_RUNLOCK(&V_tcbinfo); + if (reuse) + return tw; } return NULL;