Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c (revision 277315) +++ sys/netinet/tcp_subr.c (working copy) @@ -790,11 +790,16 @@ V_tcp_mssdflt; /* Set up our timeouts. */ - callout_init(&tp->t_timers->tt_rexmt, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_persist, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_keep, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_2msl, CALLOUT_MPSAFE); - callout_init(&tp->t_timers->tt_delack, CALLOUT_MPSAFE); + callout_init_rw(&tp->t_timers->tt_rexmt, + &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + callout_init_rw(&tp->t_timers->tt_persist, + &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + callout_init_rw(&tp->t_timers->tt_keep, + &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + callout_init_rw(&tp->t_timers->tt_2msl, + &inp->inp_lock, CALLOUT_RETURNUNLOCKED); + callout_init_rw(&tp->t_timers->tt_delack, + &inp->inp_lock, CALLOUT_RETURNUNLOCKED); if (V_tcp_do_rfc1323) tp->t_flags = (TF_REQ_SCALE|TF_REQ_TSTMP); @@ -888,6 +893,32 @@ } /* + * Switch locks to avoid LOR + * Return values: + * 0: No lock switching needed + * 1: Success + * 2: Failure and all locks dropped + */ +int +tcp_switch_locks(struct inpcb *inp) +{ + INP_WLOCK_ASSERT(inp); + + if (rw_wowned(&V_tcbinfo.ipi_lock)) + return (0); + + in_pcbref(inp); + INP_WUNLOCK(inp); + INP_INFO_WLOCK(&V_tcbinfo); + INP_WLOCK(inp); + if (in_pcbrele_wlocked(inp)) { + INP_INFO_WUNLOCK(&V_tcbinfo); + return (2); + } + return (1); +} + +/* * Drop a TCP connection, reporting * the specified error. If connection is synchronized, * then send a RST to peer. @@ -896,10 +927,17 @@ tcp_drop(struct tcpcb *tp, int errno) { struct socket *so = tp->t_inpcb->inp_socket; + struct tcpcb *retval; + int do_unlock; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(tp->t_inpcb); + do_unlock = tcp_switch_locks(tp->t_inpcb); + if (do_unlock == 2) + return (NULL); + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + if (TCPS_HAVERCVDSYN(tp->t_state)) { tcp_state_change(tp, TCPS_CLOSED); (void) tcp_output(tp); @@ -909,9 +947,34 @@ if (errno == ETIMEDOUT && tp->t_softerror) errno = tp->t_softerror; so->so_error = errno; - return (tcp_close(tp)); + retval = tcp_close(tp); + if (do_unlock) + INP_INFO_WUNLOCK(&V_tcbinfo); + return (retval); } +static void +tcp_callout_drain(void *arg) +{ + struct tcpcb *tp = arg; + struct inpcb *inp; + + inp = tp->t_inpcb; + + INP_WLOCK(inp); + if (--(tp->t_calloutref) != 0) { + INP_WUNLOCK(inp); + return; + } + inp->inp_ppcb = NULL; + if (in_pcbrele_wlocked(inp) == 0) + INP_WUNLOCK(inp); + + CC_ALGO(tp) = NULL; + tp->t_inpcb = NULL; + uma_zfree(V_tcpcb_zone, tp); +} + void tcp_discardcb(struct tcpcb *tp) { @@ -924,26 +987,25 @@ INP_WLOCK_ASSERT(inp); /* - * Make sure that all of our timers are stopped before we delete the - * PCB. - * - * XXXRW: Really, we would like to use callout_drain() here in order - * to avoid races experienced in tcp_timer.c where a timer is already - * executing at this point. However, we can't, both because we're - * running in a context where we can't sleep, and also because we - * hold locks required by the timers. What we instead need to do is - * test to see if callout_drain() is required, and if so, defer some - * portion of the remainder of tcp_discardcb() to an asynchronous - * context that can callout_drain() and then continue. Some care - * will be required to ensure that no further processing takes place - * on the tcpcb, even though it hasn't been freed (a flag?). + * Make sure that all of our timers are stopped before we + * delete the PCB: */ - callout_stop(&tp->t_timers->tt_rexmt); - callout_stop(&tp->t_timers->tt_persist); - callout_stop(&tp->t_timers->tt_keep); - callout_stop(&tp->t_timers->tt_2msl); - callout_stop(&tp->t_timers->tt_delack); + tp->t_calloutref = 6; /* set initial value */ + + if (callout_drain_async(&tp->t_timers->tt_rexmt, &tcp_callout_drain, tp) == 0) + tp->t_calloutref--; + if (callout_drain_async(&tp->t_timers->tt_persist, &tcp_callout_drain, tp) == 0) + tp->t_calloutref--; + if (callout_drain_async(&tp->t_timers->tt_keep, &tcp_callout_drain, tp) == 0) + tp->t_calloutref--; + if (callout_drain_async(&tp->t_timers->tt_2msl, &tcp_callout_drain, tp) == 0) + tp->t_calloutref--; + if (callout_drain_async(&tp->t_timers->tt_delack, &tcp_callout_drain, tp) == 0) + tp->t_calloutref--; + /* this ref is taken by the final tcp_callout_drain() */ + in_pcbref(inp); + /* * If we got enough samples through the srtt filter, * save the rtt and rttvar in the routing entry. @@ -1017,10 +1079,7 @@ khelp_destroy_osd(tp->osd); - CC_ALGO(tp) = NULL; - inp->inp_ppcb = NULL; - tp->t_inpcb = NULL; - uma_zfree(V_tcpcb_zone, tp); + tcp_callout_drain(tp); } /* @@ -1032,10 +1091,16 @@ { struct inpcb *inp = tp->t_inpcb; struct socket *so; + int do_unlock; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); INP_WLOCK_ASSERT(inp); + do_unlock = tcp_switch_locks(tp->t_inpcb); + if (do_unlock == 2) + return (NULL); + + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); + #ifdef TCP_OFFLOAD if (tp->t_state == TCPS_LISTEN) tcp_offload_listen_stop(tp); @@ -1054,8 +1119,10 @@ SOCK_LOCK(so); so->so_state &= ~SS_PROTOREF; sofree(so); - return (NULL); + tp = NULL; } + if (do_unlock) + INP_INFO_WUNLOCK(&V_tcbinfo); return (tp); } Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c (revision 277315) +++ sys/netinet/tcp_timer.c (working copy) @@ -273,25 +273,7 @@ CURVNET_SET(tp->t_vnet); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_delack: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - CURVNET_RESTORE(); - return; - } - INP_WLOCK(inp); - if (callout_pending(&tp->t_timers->tt_delack) || - !callout_active(&tp->t_timers->tt_delack)) { - INP_WUNLOCK(inp); - CURVNET_RESTORE(); - return; - } + callout_deactivate(&tp->t_timers->tt_delack); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); @@ -317,37 +299,11 @@ ostate = tp->t_state; #endif - /* - * XXXRW: Does this actually happen? - */ - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_2msl: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - INP_WLOCK(inp); tcp_free_sackholes(tp); - if (callout_pending(&tp->t_timers->tt_2msl) || - !callout_active(&tp->t_timers->tt_2msl)) { - INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } callout_deactivate(&tp->t_timers->tt_2msl); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -383,7 +339,6 @@ #endif if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -399,33 +354,10 @@ ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_keep: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - INP_WLOCK(inp); - if (callout_pending(&tp->t_timers->tt_keep) || - !callout_active(&tp->t_timers->tt_keep)) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } callout_deactivate(&tp->t_timers->tt_keep); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -472,7 +404,6 @@ PRU_SLOWTIMO); #endif INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; @@ -487,7 +418,6 @@ #endif if (tp != NULL) INP_WUNLOCK(tp->t_inpcb); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -502,33 +432,11 @@ ostate = tp->t_state; #endif - INP_INFO_WLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_persist: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - INP_WLOCK(inp); - if (callout_pending(&tp->t_timers->tt_persist) || - !callout_active(&tp->t_timers->tt_persist)) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + callout_deactivate(&tp->t_timers->tt_persist); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -573,7 +481,6 @@ #endif if (tp != NULL) INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -583,7 +490,7 @@ struct tcpcb *tp = xtp; CURVNET_SET(tp->t_vnet); int rexmt; - int headlocked; + int do_unlock = 0; struct inpcb *inp; #ifdef TCPDEBUG int ostate; @@ -590,34 +497,10 @@ ostate = tp->t_state; #endif - - INP_INFO_RLOCK(&V_tcbinfo); inp = tp->t_inpcb; - /* - * XXXRW: While this assert is in fact correct, bugs in the tcpcb - * tear-down mean we need it as a work-around for races between - * timers and tcp_discardcb(). - * - * KASSERT(inp != NULL, ("tcp_timer_rexmt: inp == NULL")); - */ - if (inp == NULL) { - tcp_timer_race++; - INP_INFO_RUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } - INP_WLOCK(inp); - if (callout_pending(&tp->t_timers->tt_rexmt) || - !callout_active(&tp->t_timers->tt_rexmt)) { - INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } callout_deactivate(&tp->t_timers->tt_rexmt); if ((inp->inp_flags & INP_DROPPED) != 0) { INP_WUNLOCK(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); return; } @@ -630,30 +513,18 @@ if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); - in_pcbref(inp); - INP_INFO_RUNLOCK(&V_tcbinfo); - INP_WUNLOCK(inp); - INP_INFO_WLOCK(&V_tcbinfo); - INP_WLOCK(inp); - if (in_pcbrele_wlocked(inp)) { - INP_INFO_WUNLOCK(&V_tcbinfo); + do_unlock = tcp_switch_locks(inp); + if (do_unlock == 2) { CURVNET_RESTORE(); return; } - if (inp->inp_flags & INP_DROPPED) { - INP_WUNLOCK(inp); - INP_INFO_WUNLOCK(&V_tcbinfo); - CURVNET_RESTORE(); - return; - } + if (inp->inp_flags & INP_DROPPED) + goto out; tp = tcp_drop(tp, tp->t_softerror ? tp->t_softerror : ETIMEDOUT); - headlocked = 1; goto out; } - INP_INFO_RUNLOCK(&V_tcbinfo); - headlocked = 0; if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be @@ -843,7 +714,7 @@ #endif if (tp != NULL) INP_WUNLOCK(inp); - if (headlocked) + if (do_unlock) INP_INFO_WUNLOCK(&V_tcbinfo); CURVNET_RESTORE(); } @@ -856,6 +727,8 @@ struct inpcb *inp = tp->t_inpcb; int cpu = inp_to_cpuid(inp); + INP_WLOCK_ASSERT(tp->t_inpcb); + #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) return; Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h (revision 277315) +++ sys/netinet/tcp_var.h (working copy) @@ -153,6 +153,7 @@ u_long max_sndwnd; /* largest window peer has offered */ int t_softerror; /* possible error not yet reported */ + u_char t_calloutref; /* callout drain references */ /* out-of-band data */ char t_oobflags; /* have some */ char t_iobc; /* input character */ @@ -654,6 +655,7 @@ void tcp_twclose(struct tcptw *, int); void tcp_ctlinput(int, struct sockaddr *, void *); int tcp_ctloutput(struct socket *, struct sockopt *); +int tcp_switch_locks(struct inpcb *inp); struct tcpcb * tcp_drop(struct tcpcb *, int); void tcp_drain(void);