Index: sys/netinet/tcp_timer.h =================================================================== --- sys/netinet/tcp_timer.h +++ sys/netinet/tcp_timer.h @@ -178,8 +178,7 @@ void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); struct tcptw * - tcp_tw_2msl_reuse(void); /* XXX temporary? */ -void tcp_tw_2msl_scan(void); + tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -192,7 +192,7 @@ VNET_LIST_RLOCK_NOSLEEP(); VNET_FOREACH(vnet_iter) { CURVNET_SET(vnet_iter); - tcp_tw_2msl_scan(); + (void) tcp_tw_2msl_scan(0); CURVNET_RESTORE(); } VNET_LIST_RUNLOCK_NOSLEEP(); Index: sys/netinet/tcp_timewait.c =================================================================== --- sys/netinet/tcp_timewait.c +++ sys/netinet/tcp_timewait.c @@ -104,7 +104,7 @@ * Rules on tcptw usage: * - a inpcb is always freed _after_ its tcptw * - a tcptw relies on its inpcb reference counting for memory stability - * - a tcptw is valid only under its inpcb locked + * - a tcptw is dereferenceable only while its inpcb is locked */ static VNET_DEFINE(TAILQ_HEAD(, tcptw), twq_2msl); #define V_twq_2msl VNET(twq_2msl) @@ -257,8 +257,11 @@ * Reached limit on total number of TIMEWAIT connections * allowed. Remove a connection from TIMEWAIT queue in LRU * fashion to make room for this connection. + * + * pcbinfo lock is needed here to prevent deadlock as + * two inpcb locks can be acquired simultaneously. */ - tw = tcp_tw_2msl_reuse(); + tw = tcp_tw_2msl_scan(1); if (tw == NULL) { tp = tcp_close(tp); if (tp != NULL) @@ -266,11 +269,11 @@ return; } } - tw->tw_inpcb = inp; /* * The tcptw will hold a reference on its inpcb until tcp_twclose * is called */ + tw->tw_inpcb = inp; in_pcbref(inp); /* Reference from tw */ /* @@ -653,58 +656,28 @@ } struct tcptw * -tcp_tw_2msl_reuse(void) +tcp_tw_2msl_scan(int reuse) { struct tcptw *tw; struct inpcb *inp; - INP_INFO_WLOCK_ASSERT(&V_tcbinfo); - - for (;;) { - TW_RLOCK(V_tw_lock); - tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL) { - TW_RUNLOCK(V_tw_lock); - break; - } - KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", - __func__)); - - inp = tw->tw_inpcb; - in_pcbref(inp); - TW_RUNLOCK(V_tw_lock); - - INP_WLOCK(inp); - tw = intotw(inp); - if (in_pcbrele_wlocked(inp)) { - KASSERT(tw == NULL, ("%s: held last inp reference but " - "tw not NULL", __func__)); - continue; - } - - if (tw == NULL) { - /* tcp_twclose() has already been called */ - INP_WUNLOCK(inp); - continue; - } - - tcp_twclose(tw, 1); - break; +#ifdef INVARIANTS + if (reuse) { + /* + * pcbinfo lock is needed in reuse case to prevent deadlock + * as two inpcb locks can be acquired simultaneously: + * - the inpcb transitioning to TIME_WAIT state in + * tcp_tw_start(), + * - the inpcb closed by tcp_twclose(). + */ + INP_INFO_WLOCK_ASSERT(&V_tcbinfo); } - - return (tw); -} - -void -tcp_tw_2msl_scan(void) -{ - struct tcptw *tw; - struct inpcb *inp; +#endif for (;;) { TW_RLOCK(V_tw_lock); tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || tw->tw_time - ticks > 0) { + if (tw == NULL || (!reuse && tw->tw_time - ticks > 0)) { TW_RUNLOCK(V_tw_lock); break; } @@ -733,8 +706,10 @@ continue; } - tcp_twclose(tw, 0); + tcp_twclose(tw, reuse); INP_INFO_WUNLOCK(&V_tcbinfo); + if (reuse) + return tw; } else { /* INP_INFO lock is busy, continue later. */ INP_WLOCK(inp); @@ -743,4 +718,6 @@ break; } } + + return NULL; }