diff --git a/sys/netinet/in_pcb.c b/sys/netinet/in_pcb.c --- a/sys/netinet/in_pcb.c +++ b/sys/netinet/in_pcb.c @@ -1031,7 +1031,6 @@ laddr = sin->sin_addr; if (lport) { struct inpcb *t; - struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && @@ -1070,24 +1069,9 @@ } t = in_pcblookup_local(pcbinfo, sin->sin_addr, lport, lookupflags, cred); - if (t && (t->inp_flags & INP_TIMEWAIT)) { - /* - * XXXRW: If an incpb has had its timewait - * state recycled, we treat the address as - * being in use (for now). This is better - * than a panic, but not desirable. - */ - tw = intotw(t); - if (tw == NULL || - ((reuseport & tw->tw_so_options) == 0 && - (reuseport_lb & - tw->tw_so_options) == 0)) { - return (EADDRINUSE); - } - } else if (t && - ((inp->inp_flags2 & INP_BINDMULTI) == 0) && - (reuseport & inp_so_options(t)) == 0 && - (reuseport_lb & inp_so_options(t)) == 0) { + if (t && ((inp->inp_flags2 & INP_BINDMULTI) == 0) && + (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { #ifdef INET6 if (ntohl(sin->sin_addr.s_addr) != INADDR_ANY || diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -999,29 +999,29 @@ goto dropunlock; } - /* - * A previous connection in TIMEWAIT state is supposed to catch stray - * or duplicate segments arriving late. If this segment was a - * legitimate new connection attempt, the old INPCB gets removed and - * we can try again to find a listening socket. - */ - if (inp->inp_flags & INP_TIMEWAIT) { + tp = intotcpcb(inp); + switch (tp->t_state) { + case TCPS_TIME_WAIT: + /* + * A previous connection in TIMEWAIT state is supposed to catch + * stray or duplicate segments arriving late. If this segment + * was a legitimate new connection attempt, the old INPCB gets + * removed and we can try again to find a listening socket. + */ tcp_dooptions(&to, optp, optlen, (thflags & TH_SYN) ? TO_SYN : 0); /* - * NB: tcp_twcheck unlocks the INP and frees the mbuf. + * tcp_twcheck unlocks the inp always, and frees the m if fails. */ if (tcp_twcheck(inp, &to, th, m, tlen)) goto findpcb; return (IPPROTO_DONE); - } - /* - * The TCPCB may no longer exist if the connection is winding - * down or it is in the CLOSED state. Either way we drop the - * segment and send an appropriate response. - */ - tp = intotcpcb(inp); - if (tp == NULL || tp->t_state == TCPS_CLOSED) { + case TCPS_CLOSED: + /* + * The TCPCB may no longer exist if the connection is winding + * down or it is in the CLOSED state. Either way we drop the + * segment and send an appropriate response. + */ rstreason = BANDLIM_RST_CLOSEDPORT; goto dropwithreset; } @@ -3030,10 +3030,6 @@ * Starting the timer is contrary to the * specification, but if we don't get a FIN * we'll hang forever. - * - * XXXjl: - * we should release the tp also, and use a - * compressed state. */ if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { soisdisconnected(so); diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -11360,8 +11360,6 @@ INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); - KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", - __func__)); tp->t_rcvtime = ticks; /* diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -14154,8 +14154,7 @@ INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); - KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", - __func__)); + if ((tp->t_state >= TCPS_FIN_WAIT_1) && (tp->t_flags & TF_GPUTINPROG)) { /* diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1485,7 +1485,6 @@ uma_zone_set_max(V_tcpcb_zone, maxsockets); uma_zone_set_warning(V_tcpcb_zone, "kern.ipc.maxsockets limit reached"); - tcp_tw_init(); syncache_init(); tcp_hc_init(); @@ -1647,7 +1646,6 @@ } tcp_hc_destroy(); syncache_destroy(); - tcp_tw_destroy(); in_pcbinfo_destroy(&V_tcbinfo); /* tcp_discardcb() clears the sack_holes up. */ uma_zdestroy(V_sack_hole_zone); @@ -2678,33 +2676,17 @@ return (error); while ((inp = inp_next(&inpi)) != NULL) { - if (inp->inp_gencnt <= xig.xig_gen) { - int crerr; - - /* - * XXX: This use of cr_cansee(), introduced with - * TCP state changes, is not quite right, but for - * now, better than nothing. - */ - if (inp->inp_flags & INP_TIMEWAIT) { - if (intotw(inp) != NULL) - crerr = cr_cansee(req->td->td_ucred, - intotw(inp)->tw_cred); - else - crerr = EINVAL; /* Skip this inp. */ + if (inp->inp_gencnt <= xig.xig_gen && + cr_canseeinpcb(req->td->td_ucred, inp) == 0) { + struct xtcpcb xt; + + tcp_inptoxtp(inp, &xt); + error = SYSCTL_OUT(req, &xt, sizeof xt); + if (error) { + INP_RUNLOCK(inp); + break; } else - crerr = cr_canseeinpcb(req->td->td_ucred, inp); - if (crerr == 0) { - struct xtcpcb xt; - - tcp_inptoxtp(inp, &xt); - error = SYSCTL_OUT(req, &xt, sizeof xt); - if (error) { - INP_RUNLOCK(inp); - break; - } else - continue; - } + continue; } } @@ -3639,7 +3621,6 @@ struct sockaddr_storage addrs[2]; struct inpcb *inp; struct tcpcb *tp; - struct tcptw *tw; #ifdef INET struct sockaddr_in *fin = NULL, *lin = NULL; #endif @@ -3721,19 +3702,7 @@ #endif } if (inp != NULL) { - if (inp->inp_flags & INP_TIMEWAIT) { - /* - * XXXRW: There currently exists a state where an - * inpcb is present, but its timewait state has been - * discarded. For now, don't allow dropping of this - * type of inpcb. - */ - tw = intotw(inp); - if (tw != NULL) - tcp_twclose(tw, 0); - else - INP_WUNLOCK(inp); - } else if ((inp->inp_flags & INP_DROPPED) == 0 && + if ((inp->inp_flags & INP_DROPPED) == 0 && !SOLISTENING(inp->inp_socket)) { tp = intotcpcb(inp); tp = tcp_drop(tp, ECONNABORTED); @@ -4027,56 +3996,49 @@ tcp_inptoxtp(const struct inpcb *inp, struct xtcpcb *xt) { struct tcpcb *tp = intotcpcb(inp); - struct tcptw *tw = intotw(inp); sbintime_t now; bzero(xt, sizeof(*xt)); - if (inp->inp_flags & INP_TIMEWAIT) { - xt->t_state = TCPS_TIME_WAIT; - xt->xt_encaps_port = tw->t_port; - } else { - xt->t_state = tp->t_state; - xt->t_logstate = tp->t_logstate; - xt->t_flags = tp->t_flags; - xt->t_sndzerowin = tp->t_sndzerowin; - xt->t_sndrexmitpack = tp->t_sndrexmitpack; - xt->t_rcvoopack = tp->t_rcvoopack; - xt->t_rcv_wnd = tp->rcv_wnd; - xt->t_snd_wnd = tp->snd_wnd; - xt->t_snd_cwnd = tp->snd_cwnd; - xt->t_snd_ssthresh = tp->snd_ssthresh; - xt->t_dsack_bytes = tp->t_dsack_bytes; - xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; - xt->t_dsack_pack = tp->t_dsack_pack; - xt->t_maxseg = tp->t_maxseg; - xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + - (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; - - now = getsbinuptime(); -#define COPYTIMER(ttt) do { \ - if (callout_active(&tp->t_timers->ttt)) \ - xt->ttt = (tp->t_timers->ttt.c_time - now) / \ - SBT_1MS; \ - else \ - xt->ttt = 0; \ + xt->t_state = tp->t_state; + xt->t_logstate = tp->t_logstate; + xt->t_flags = tp->t_flags; + xt->t_sndzerowin = tp->t_sndzerowin; + xt->t_sndrexmitpack = tp->t_sndrexmitpack; + xt->t_rcvoopack = tp->t_rcvoopack; + xt->t_rcv_wnd = tp->rcv_wnd; + xt->t_snd_wnd = tp->snd_wnd; + xt->t_snd_cwnd = tp->snd_cwnd; + xt->t_snd_ssthresh = tp->snd_ssthresh; + xt->t_dsack_bytes = tp->t_dsack_bytes; + xt->t_dsack_tlp_bytes = tp->t_dsack_tlp_bytes; + xt->t_dsack_pack = tp->t_dsack_pack; + xt->t_maxseg = tp->t_maxseg; + xt->xt_ecn = (tp->t_flags2 & TF2_ECN_PERMIT) ? 1 : 0 + + (tp->t_flags2 & TF2_ACE_PERMIT) ? 2 : 0; + + now = getsbinuptime(); +#define COPYTIMER(ttt) do { \ + if (callout_active(&tp->t_timers->ttt)) \ + xt->ttt = (tp->t_timers->ttt.c_time - now) / \ + SBT_1MS; \ + else \ + xt->ttt = 0; \ } while (0) - COPYTIMER(tt_delack); - COPYTIMER(tt_rexmt); - COPYTIMER(tt_persist); - COPYTIMER(tt_keep); - COPYTIMER(tt_2msl); + COPYTIMER(tt_delack); + COPYTIMER(tt_rexmt); + COPYTIMER(tt_persist); + COPYTIMER(tt_keep); + COPYTIMER(tt_2msl); #undef COPYTIMER - xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; + xt->t_rcvtime = 1000 * (ticks - tp->t_rcvtime) / hz; - xt->xt_encaps_port = tp->t_port; - bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, - TCP_FUNCTION_NAME_LEN_MAX); - bcopy(CC_ALGO(tp)->name, xt->xt_cc, - TCP_CA_NAME_MAX); + xt->xt_encaps_port = tp->t_port; + bcopy(tp->t_fb->tfb_tcp_block_name, xt->xt_stack, + TCP_FUNCTION_NAME_LEN_MAX); + bcopy(CC_ALGO(tp)->name, xt->xt_cc, TCP_CA_NAME_MAX); #ifdef TCP_BLACKBOX - (void)tcp_log_get_id(tp, xt->xt_logid); + (void)tcp_log_get_id(tp, xt->xt_logid); #endif - } xt->xt_len = sizeof(struct xtcpcb); in_pcbtoxinpcb(inp, &xt->xt_inp); diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -229,8 +229,6 @@ void tcp_timer_init(void); void tcp_timer_2msl(void *xtp); -struct tcptw * - tcp_tw_2msl_scan(int reuse); /* XXX temporary? */ void tcp_timer_keep(void *xtp); void tcp_timer_persist(void *xtp); void tcp_timer_rexmt(void *xtp); diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -236,41 +236,6 @@ } } -/* - * Legacy TCP global callout routine called every 500 ms. - * Used to cleanup timewait states, which lack their own callouts. - */ -static struct callout tcpslow_callout; -static void -tcp_slowtimo(void *arg __unused) -{ - struct epoch_tracker et; - VNET_ITERATOR_DECL(vnet_iter); - - NET_EPOCH_ENTER(et); - VNET_LIST_RLOCK_NOSLEEP(); - VNET_FOREACH(vnet_iter) { - CURVNET_SET(vnet_iter); - (void) tcp_tw_2msl_scan(0); - CURVNET_RESTORE(); - } - VNET_LIST_RUNLOCK_NOSLEEP(); - NET_EPOCH_EXIT(et); - - callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10, - tcp_slowtimo, NULL, 0); -} - -static void -tcp_slowtimo_init(void *arg __unused) -{ - - callout_init(&tcpslow_callout, 1); - callout_reset_sbt(&tcpslow_callout, SBT_1MS * 500, SBT_1MS * 10, - tcp_slowtimo, NULL, 0); -} -SYSINIT(tcp_timer, SI_SUB_VNET_DONE, SI_ORDER_ANY, tcp_slowtimo_init, NULL); - int tcp_backoff[TCP_MAXRXTSHIFT + 1] = { 1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 512, 512, 512 }; @@ -387,8 +352,12 @@ * there's no point in hanging onto FIN_WAIT_2 socket. Just close it. * Ignore fact that there were recent incoming segments. */ - if (tcp_fast_finwait2_recycle && tp->t_state == TCPS_FIN_WAIT_2 && - tp->t_inpcb && tp->t_inpcb->inp_socket && + if (tp->t_state == TCPS_TIME_WAIT) { + tcp_timer_close(tp); + CURVNET_RESTORE(); + return; + } else if (tp->t_state == TCPS_FIN_WAIT_2 && + tcp_fast_finwait2_recycle && tp->t_inpcb->inp_socket && (tp->t_inpcb->inp_socket->so_rcv.sb_state & SBS_CANTRCVMORE)) { TCPSTAT_INC(tcps_finwait2_drops); tcp_timer_close(tp); diff --git a/sys/netinet/tcp_timewait.c b/sys/netinet/tcp_timewait.c --- a/sys/netinet/tcp_timewait.c +++ b/sys/netinet/tcp_timewait.c @@ -96,142 +96,26 @@ #include -VNET_DEFINE_STATIC(uma_zone_t, tcptw_zone); -#define V_tcptw_zone VNET(tcptw_zone) -static int maxtcptw; - -/* - * The timed wait queue contains references to each of the TCP sessions - * currently in the TIME_WAIT state. The queue pointers, including the - * queue pointers in each tcptw structure, are protected using the global - * timewait lock, which must be held over queue iteration and modification. - * - * Rules on tcptw usage: - * - a inpcb is always freed _after_ its tcptw - * - a tcptw relies on its inpcb reference counting for memory stability - * - a tcptw is dereferenceable only while its inpcb is locked - */ -VNET_DEFINE_STATIC(TAILQ_HEAD(, tcptw), twq_2msl); -#define V_twq_2msl VNET(twq_2msl) - -/* Global timewait lock */ -VNET_DEFINE_STATIC(struct rwlock, tw_lock); -#define V_tw_lock VNET(tw_lock) - -#define TW_LOCK_INIT(tw, d) rw_init_flags(&(tw), (d), 0) -#define TW_LOCK_DESTROY(tw) rw_destroy(&(tw)) -#define TW_RLOCK(tw) rw_rlock(&(tw)) -#define TW_WLOCK(tw) rw_wlock(&(tw)) -#define TW_RUNLOCK(tw) rw_runlock(&(tw)) -#define TW_WUNLOCK(tw) rw_wunlock(&(tw)) -#define TW_LOCK_ASSERT(tw) rw_assert(&(tw), RA_LOCKED) -#define TW_RLOCK_ASSERT(tw) rw_assert(&(tw), RA_RLOCKED) -#define TW_WLOCK_ASSERT(tw) rw_assert(&(tw), RA_WLOCKED) -#define TW_UNLOCK_ASSERT(tw) rw_assert(&(tw), RA_UNLOCKED) - -static void tcp_tw_2msl_reset(struct tcptw *, int); -static void tcp_tw_2msl_stop(struct tcptw *, int); -static int tcp_twrespond(struct tcptw *, int); - -static int -tcptw_auto_size(void) -{ - int halfrange; - - /* - * Max out at half the ephemeral port range so that TIME_WAIT - * sockets don't tie up too many ephemeral ports. - */ - if (V_ipport_lastauto > V_ipport_firstauto) - halfrange = (V_ipport_lastauto - V_ipport_firstauto) / 2; - else - halfrange = (V_ipport_firstauto - V_ipport_lastauto) / 2; - /* Protect against goofy port ranges smaller than 32. */ - return (imin(imax(halfrange, 32), maxsockets / 5)); -} - -static int -sysctl_maxtcptw(SYSCTL_HANDLER_ARGS) -{ - int error, new; - - if (maxtcptw == 0) - new = tcptw_auto_size(); - else - new = maxtcptw; - error = sysctl_handle_int(oidp, &new, 0, req); - if (error == 0 && req->newptr) - if (new >= 32) { - maxtcptw = new; - uma_zone_set_max(V_tcptw_zone, maxtcptw); - } - return (error); -} - -SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxtcptw, - CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_NEEDGIANT, - &maxtcptw, 0, sysctl_maxtcptw, "IU", - "Maximum number of compressed TCP TIME_WAIT entries"); - VNET_DEFINE_STATIC(bool, nolocaltimewait) = true; #define V_nolocaltimewait VNET(nolocaltimewait) -SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(nolocaltimewait), true, - "Do not create compressed TCP TIME_WAIT entries for local connections"); - -void -tcp_tw_zone_change(void) -{ - - if (maxtcptw == 0) - uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); -} - -void -tcp_tw_init(void) -{ - - V_tcptw_zone = uma_zcreate("tcptw", sizeof(struct tcptw), - NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0); - TUNABLE_INT_FETCH("net.inet.tcp.maxtcptw", &maxtcptw); - if (maxtcptw == 0) - uma_zone_set_max(V_tcptw_zone, tcptw_auto_size()); - else - uma_zone_set_max(V_tcptw_zone, maxtcptw); - TAILQ_INIT(&V_twq_2msl); - TW_LOCK_INIT(V_tw_lock, "tcptw"); -} - -#ifdef VIMAGE -void -tcp_tw_destroy(void) -{ - struct tcptw *tw; - struct epoch_tracker et; - - NET_EPOCH_ENTER(et); - while ((tw = TAILQ_FIRST(&V_twq_2msl)) != NULL) - tcp_twclose(tw, 0); - NET_EPOCH_EXIT(et); - - TW_LOCK_DESTROY(V_tw_lock); - uma_zdestroy(V_tcptw_zone); -} -#endif +SYSCTL_BOOL(_net_inet_tcp, OID_AUTO, nolocaltimewait, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(nolocaltimewait), true, + "Do not create TCP TIME_WAIT state for local connections"); /* * Move a TCP connection into TIME_WAIT state. - * tcbinfo is locked. * inp is locked, and is unlocked before returning. + * + * This function used to free tcpcb and allocate a compressed TCP time-wait + * structure tcptw. This served well for 20 years but is no longer relevant + * on modern machines in the modern internet. However, the function remains + * so that TCP stacks require less modification and we don't burn the bridge + * to go back to using compressed time-wait. */ void tcp_twstart(struct tcpcb *tp) { - struct tcptw twlocal, *tw; struct inpcb *inp = tp->t_inpcb; - struct socket *so; - uint32_t recwin; - bool acknow, local; #ifdef INET6 bool isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; #endif @@ -243,144 +127,44 @@ KASSERT((inp->inp_flags & INP_DROPPED) == 0, ("tcp_twstart: " "(inp->inp_flags & INP_DROPPED) != 0")); - if (V_nolocaltimewait) { + tcp_state_change(tp, TCPS_TIME_WAIT); + soisdisconnected(inp->inp_socket); + + if (tp->t_flags & TF_ACKNOW) + tcp_output(tp); + + if (V_nolocaltimewait && ( #ifdef INET6 - if (isipv6) - local = in6_localaddr(&inp->in6p_faddr); - else + isipv6 ? in6_localaddr(&inp->in6p_faddr) : #endif #ifdef INET - local = in_localip(inp->inp_faddr); + in_localip(inp->inp_faddr) #else - local = false; + false #endif - } else - local = false; - - /* - * For use only by DTrace. We do not reference the state - * after this point so modifying it in place is not a problem. - */ - tcp_state_change(tp, TCPS_TIME_WAIT); - - if (local) - tw = &twlocal; - else - tw = uma_zalloc(V_tcptw_zone, M_NOWAIT); - if (tw == NULL) { - /* - * Reached limit on total number of TIMEWAIT connections - * allowed. Remove a connection from TIMEWAIT queue in LRU - * fashion to make room for this connection. - * If that fails, use on stack tw at least to be able to - * run through tcp_twrespond() and standard tcpcb discard - * routine. - * - * XXX: Check if it possible to always have enough room - * in advance based on guarantees provided by uma_zalloc(). - */ - tw = tcp_tw_2msl_scan(1); - if (tw == NULL) { - tw = &twlocal; - local = true; - } - } - /* - * For !local case the tcptw will hold a reference on its inpcb - * until tcp_twclose is called. - */ - tw->tw_inpcb = inp; - - /* - * Recover last window size sent. - */ - so = inp->inp_socket; - recwin = lmin(lmax(sbspace(&so->so_rcv), 0), - (long)TCP_MAXWIN << tp->rcv_scale); - if (recwin < (so->so_rcv.sb_hiwat / 4) && - recwin < tp->t_maxseg) - recwin = 0; - if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && - recwin < (tp->rcv_adv - tp->rcv_nxt)) - recwin = (tp->rcv_adv - tp->rcv_nxt); - tw->last_win = (u_short)(recwin >> tp->rcv_scale); - - /* - * Set t_recent if timestamps are used on the connection. - */ - if ((tp->t_flags & (TF_REQ_TSTMP|TF_RCVD_TSTMP|TF_NOOPT)) == - (TF_REQ_TSTMP|TF_RCVD_TSTMP)) { - tw->t_recent = tp->ts_recent; - tw->ts_offset = tp->ts_offset; - } else { - tw->t_recent = 0; - tw->ts_offset = 0; - } - - tw->snd_nxt = tp->snd_nxt; - tw->t_port = tp->t_port; - tw->rcv_nxt = tp->rcv_nxt; - tw->tw_time = 0; - tw->tw_flags = tp->t_flags; - -/* XXX - * If this code will - * be used for fin-wait-2 state also, then we may need - * a ts_recent from the last segment. - */ - acknow = tp->t_flags & TF_ACKNOW; - - /* - * First, discard tcpcb state, which includes stopping its timers and - * freeing it. tcp_discardcb() used to also release the inpcb, but - * that work is now done in the caller. - * - * Note: soisdisconnected() call used to be made in tcp_discardcb(), - * and might not be needed here any longer. - */ -#ifdef TCPHPTS - tcp_hpts_remove(inp); -#endif - tcp_discardcb(tp); - soisdisconnected(so); - tw->tw_so_options = so->so_options; - inp->inp_flags |= INP_TIMEWAIT; - if (acknow) - tcp_twrespond(tw, TH_ACK); - if (local) - in_pcbdrop(inp); - else { - in_pcbref(inp); /* Reference from tw */ - tw->tw_cred = crhold(so->so_cred); - inp->inp_ppcb = tw; - TCPSTATES_INC(TCPS_TIME_WAIT); - tcp_tw_2msl_reset(tw, 0); + )) { + if ((tp = tcp_close(tp)) != NULL) + INP_WUNLOCK(inp); + return; } - /* - * If the inpcb owns the sole reference to the socket, then we can - * detach and free the socket as it is not needed in time wait. - */ - if (inp->inp_flags & INP_SOCKREF) { - inp->inp_flags &= ~INP_SOCKREF; - INP_WUNLOCK(inp); - sorele(so); - } else - INP_WUNLOCK(inp); + tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); + INP_WUNLOCK(inp); } /* - * Returns 1 if the TIME_WAIT state was killed and we should start over, - * looking for a pcb in the listen state. Returns 0 otherwise. + * Returns true if the TIME_WAIT state was killed and we should start over, + * looking for a pcb in the listen state. Otherwise returns false and frees + * the mbuf. * * For pure SYN-segments the PCB shall be read-locked and the tcpopt pointer * may be NULL. For the rest write-lock and valid tcpopt. */ -int +bool tcp_twcheck(struct inpcb *inp, struct tcpopt *to, struct tcphdr *th, struct mbuf *m, int tlen) { - struct tcptw *tw; + struct tcpcb *tp = intotcpcb(inp); char *s; int thflags; tcp_seq seq; @@ -388,16 +172,6 @@ NET_EPOCH_ASSERT(); INP_LOCK_ASSERT(inp); - /* - * XXXRW: Time wait state for inpcb has been recycled, but inpcb is - * still present. This is undesirable, but temporarily necessary - * until we work out how to handle inpcb's who's timewait state has - * been removed. - */ - tw = intotw(inp); - if (tw == NULL) - goto drop; - thflags = tcp_get_flags(th); #ifdef INVARIANTS if ((thflags & (TH_SYN | TH_ACK)) == TH_SYN) @@ -459,36 +233,37 @@ * Allow UDP port number changes in this case. */ if (((thflags & (TH_SYN | TH_ACK)) == TH_SYN) && - SEQ_GT(th->th_seq, tw->rcv_nxt)) { + SEQ_GT(th->th_seq, tp->rcv_nxt)) { /* * In case we can't upgrade our lock just pretend we have * lost this packet. */ if (INP_TRY_UPGRADE(inp) == 0) goto drop; - tcp_twclose(tw, 0); + if ((tp = tcp_close(tp)) != NULL) + INP_WUNLOCK(inp); TCPSTAT_INC(tcps_tw_recycles); - return (1); + return (true); } /* * Send RST if UDP port numbers don't match */ - if (tw->t_port != m->m_pkthdr.tcp_tun_port) { + if (tp->t_port != m->m_pkthdr.tcp_tun_port) { if (tcp_get_flags(th) & TH_ACK) { - tcp_respond(NULL, mtod(m, void *), th, m, + tcp_respond(tp, mtod(m, void *), th, m, (tcp_seq)0, th->th_ack, TH_RST); } else { if (tcp_get_flags(th) & TH_SYN) tlen++; if (tcp_get_flags(th) & TH_FIN) tlen++; - tcp_respond(NULL, mtod(m, void *), th, m, + tcp_respond(tp, mtod(m, void *), th, m, th->th_seq+tlen, (tcp_seq)0, TH_RST|TH_ACK); } INP_UNLOCK(inp); TCPSTAT_INC(tcps_tw_resets); - return (0); + return (false); } /* @@ -505,7 +280,7 @@ * the segment, unless the missing timestamps are tolerated. * See section 3.2 of RFC 7323. */ - if (((to->to_flags & TOF_TS) == 0) && (tw->t_recent != 0) && + if (((to->to_flags & TOF_TS) == 0) && (tp->ts_recent != 0) && (V_tcp_tolerate_missing_ts == 0)) { goto drop; } @@ -515,344 +290,25 @@ */ if (thflags & TH_FIN) { seq = th->th_seq + tlen + (thflags & TH_SYN ? 1 : 0); - if (seq + 1 == tw->rcv_nxt) - tcp_tw_2msl_reset(tw, 1); + if (seq + 1 == tp->rcv_nxt) + tcp_timer_activate(tp, TT_2MSL, 2 * V_tcp_msl); } /* * Acknowledge the segment if it has data or is not a duplicate ACK. */ if (thflags != TH_ACK || tlen != 0 || - th->th_seq != tw->rcv_nxt || th->th_ack != tw->snd_nxt) { + th->th_seq != tp->rcv_nxt || th->th_ack != tp->snd_nxt) { TCP_PROBE5(receive, NULL, NULL, m, NULL, th); - tcp_twrespond(tw, TH_ACK); + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + INP_UNLOCK(inp); TCPSTAT_INC(tcps_tw_responds); - goto dropnoprobe; + return (false); } drop: TCP_PROBE5(receive, NULL, NULL, m, NULL, th); -dropnoprobe: INP_UNLOCK(inp); m_freem(m); - return (0); -} - -void -tcp_twclose(struct tcptw *tw, int reuse) -{ - struct socket *so; - struct inpcb *inp; - - /* - * At this point, we are in one of two situations: - * - * (1) We have no socket, just an inpcb<->twtcp pair. We can free - * all state. - * - * (2) We have a socket -- if we own a reference, release it and - * notify the socket layer. - */ - inp = tw->tw_inpcb; - KASSERT((inp->inp_flags & INP_TIMEWAIT), ("tcp_twclose: !timewait")); - KASSERT(intotw(inp) == tw, ("tcp_twclose: inp_ppcb != tw")); - NET_EPOCH_ASSERT(); - INP_WLOCK_ASSERT(inp); - - tcp_tw_2msl_stop(tw, reuse); - inp->inp_ppcb = NULL; - in_pcbdrop(inp); - - so = inp->inp_socket; - if (so != NULL) { - /* - * If there's a socket, handle two cases: first, we own a - * strong reference, which we will now release, or we don't - * in which case another reference exists (XXXRW: think - * about this more), and we don't need to take action. - */ - if (inp->inp_flags & INP_SOCKREF) { - inp->inp_flags &= ~INP_SOCKREF; - INP_WUNLOCK(inp); - sorele(so); - } else { - /* - * If we don't own the only reference, the socket and - * inpcb need to be left around to be handled by - * tcp_usr_detach() later. - */ - INP_WUNLOCK(inp); - } - } else { - /* - * The socket has been already cleaned-up for us, only free the - * inpcb. - */ - in_pcbfree(inp); - } - TCPSTAT_INC(tcps_closed); -} - -static int -tcp_twrespond(struct tcptw *tw, int flags) -{ - struct inpcb *inp = tw->tw_inpcb; -#if defined(INET6) || defined(INET) - struct tcphdr *th = NULL; -#endif - struct mbuf *m; -#ifdef INET - struct ip *ip = NULL; -#endif - u_int hdrlen, optlen, ulen; - int error = 0; /* Keep compiler happy */ - struct tcpopt to; -#ifdef INET6 - struct ip6_hdr *ip6 = NULL; - int isipv6 = inp->inp_inc.inc_flags & INC_ISIPV6; -#endif - struct udphdr *udp = NULL; - hdrlen = 0; /* Keep compiler happy */ - - INP_WLOCK_ASSERT(inp); - - m = m_gethdr(M_NOWAIT, MT_DATA); - if (m == NULL) - return (ENOBUFS); - m->m_data += max_linkhdr; - -#ifdef MAC - mac_inpcb_create_mbuf(inp, m); -#endif - -#ifdef INET6 - if (isipv6) { - hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); - ip6 = mtod(m, struct ip6_hdr *); - if (tw->t_port) { - udp = (struct udphdr *)(ip6 + 1); - hdrlen += sizeof(struct udphdr); - udp->uh_sport = htons(V_tcp_udp_tunneling_port); - udp->uh_dport = tw->t_port; - ulen = (hdrlen - sizeof(struct ip6_hdr)); - th = (struct tcphdr *)(udp + 1); - } else - th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, tw->t_port, ip6, th); - } -#endif -#if defined(INET6) && defined(INET) - else -#endif -#ifdef INET - { - hdrlen = sizeof(struct tcpiphdr); - ip = mtod(m, struct ip *); - if (tw->t_port) { - udp = (struct udphdr *)(ip + 1); - hdrlen += sizeof(struct udphdr); - udp->uh_sport = htons(V_tcp_udp_tunneling_port); - udp->uh_dport = tw->t_port; - ulen = (hdrlen - sizeof(struct ip)); - th = (struct tcphdr *)(udp + 1); - } else - th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, tw->t_port, ip, th); - } -#endif - to.to_flags = 0; - - /* - * Send a timestamp and echo-reply if both our side and our peer - * have sent timestamps in our SYN's and this is not a RST. - */ - if (tw->t_recent && flags == TH_ACK) { - to.to_flags |= TOF_TS; - to.to_tsval = tcp_ts_getticks() + tw->ts_offset; - to.to_tsecr = tw->t_recent; - } -#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) - if (tw->tw_flags & TF_SIGNATURE) - to.to_flags |= TOF_SIGNATURE; -#endif - optlen = tcp_addoptions(&to, (u_char *)(th + 1)); - - if (udp) { - ulen += optlen; - udp->uh_ulen = htons(ulen); - } - m->m_len = hdrlen + optlen; - m->m_pkthdr.len = m->m_len; - - KASSERT(max_linkhdr + m->m_len <= MHLEN, ("tcptw: mbuf too small")); - - th->th_seq = htonl(tw->snd_nxt); - th->th_ack = htonl(tw->rcv_nxt); - th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; - tcp_set_flags(th, flags); - th->th_win = htons(tw->last_win); - -#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) - if (tw->tw_flags & TF_SIGNATURE) { - if (!TCPMD5_ENABLED() || - TCPMD5_OUTPUT(m, th, to.to_signature) != 0) - return (-1); - } -#endif -#ifdef INET6 - if (isipv6) { - if (tw->t_port) { - m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; - m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); - th->th_sum = htons(0); - } else { - m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); - th->th_sum = in6_cksum_pseudo(ip6, - sizeof(struct tcphdr) + optlen, IPPROTO_TCP, 0); - } - ip6->ip6_hlim = in6_selecthlim(inp, NULL); - TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); - error = ip6_output(m, inp->in6p_outputopts, NULL, - (tw->tw_so_options & SO_DONTROUTE), NULL, NULL, inp); - } -#endif -#if defined(INET6) && defined(INET) - else -#endif -#ifdef INET - { - if (tw->t_port) { - m->m_pkthdr.csum_flags = CSUM_UDP; - m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); - udp->uh_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); - th->th_sum = htons(0); - } else { - m->m_pkthdr.csum_flags = CSUM_TCP; - m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); - th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, - htons(sizeof(struct tcphdr) + optlen + IPPROTO_TCP)); - } - ip->ip_len = htons(m->m_pkthdr.len); - if (V_path_mtu_discovery) - ip->ip_off |= htons(IP_DF); - TCP_PROBE5(send, NULL, NULL, ip, NULL, th); - error = ip_output(m, inp->inp_options, NULL, - ((tw->tw_so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), - NULL, inp); - } -#endif - if (flags & TH_ACK) - TCPSTAT_INC(tcps_sndacks); - else - TCPSTAT_INC(tcps_sndctrl); - TCPSTAT_INC(tcps_sndtotal); - return (error); -} - -static void -tcp_tw_2msl_reset(struct tcptw *tw, int rearm) -{ - - NET_EPOCH_ASSERT(); - INP_WLOCK_ASSERT(tw->tw_inpcb); - - TW_WLOCK(V_tw_lock); - if (rearm) - TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); - tw->tw_time = ticks + 2 * V_tcp_msl; - TAILQ_INSERT_TAIL(&V_twq_2msl, tw, tw_2msl); - TW_WUNLOCK(V_tw_lock); -} - -static void -tcp_tw_2msl_stop(struct tcptw *tw, int reuse) -{ - struct ucred *cred; - struct inpcb *inp; - int released __unused; - - NET_EPOCH_ASSERT(); - - TW_WLOCK(V_tw_lock); - inp = tw->tw_inpcb; - tw->tw_inpcb = NULL; - - TAILQ_REMOVE(&V_twq_2msl, tw, tw_2msl); - cred = tw->tw_cred; - tw->tw_cred = NULL; - TW_WUNLOCK(V_tw_lock); - - if (cred != NULL) - crfree(cred); - - released = in_pcbrele_wlocked(inp); - KASSERT(!released, ("%s: inp should not be released here", __func__)); - - if (!reuse) - uma_zfree(V_tcptw_zone, tw); - TCPSTATES_DEC(TCPS_TIME_WAIT); -} - -struct tcptw * -tcp_tw_2msl_scan(int reuse) -{ - struct tcptw *tw; - struct inpcb *inp; - - NET_EPOCH_ASSERT(); - - for (;;) { - TW_RLOCK(V_tw_lock); - tw = TAILQ_FIRST(&V_twq_2msl); - if (tw == NULL || (!reuse && (tw->tw_time - ticks) > 0)) { - TW_RUNLOCK(V_tw_lock); - break; - } - KASSERT(tw->tw_inpcb != NULL, ("%s: tw->tw_inpcb == NULL", - __func__)); - - inp = tw->tw_inpcb; - in_pcbref(inp); - TW_RUNLOCK(V_tw_lock); - - INP_WLOCK(inp); - tw = intotw(inp); - if (in_pcbrele_wlocked(inp)) { - if (__predict_true(tw == NULL)) { - continue; - } else { - /* This should not happen as in TIMEWAIT - * state the inp should not be destroyed - * before its tcptw. If INVARIANTS is - * defined panic. - */ -#ifdef INVARIANTS - panic("%s: Panic before an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); -#else - log(LOG_ERR, "%s: Avoid an infinite " - "loop: INP_TIMEWAIT && (INP_FREED " - "|| inp last reference) && tw != " - "NULL", __func__); -#endif - break; - } - } - - if (tw == NULL) { - /* tcp_twclose() has already been called */ - INP_WUNLOCK(inp); - continue; - } - - tcp_twclose(tw, reuse); - if (reuse) - return tw; - } - - return NULL; + return (false); } diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -631,24 +631,7 @@ struct in_conninfo; #endif /* _NETINET_IN_PCB_H_ */ -struct tcptw { - struct inpcb *tw_inpcb; /* XXX back pointer to internet pcb */ - uint32_t t_port:16, /* UDP port number if TCPoUDP */ - t_unused:16; - tcp_seq snd_nxt; - tcp_seq rcv_nxt; - u_short last_win; /* cached window value */ - short tw_so_options; /* copy of so_options */ - struct ucred *tw_cred; /* user credentials */ - u_int32_t t_recent; - u_int32_t ts_offset; /* our timestamp offset */ - int tw_time; - TAILQ_ENTRY(tcptw) tw_2msl; - u_int tw_flags; /* tcpcb t_flags */ -}; - #define intotcpcb(ip) ((struct tcpcb *)(ip)->inp_ppcb) -#define intotw(ip) ((struct tcptw *)(ip)->inp_ppcb) #define sototcpcb(so) (intotcpcb(sotoinpcb(so))) /* @@ -1083,7 +1066,6 @@ void tcp_discardcb(struct tcpcb *); bool tcp_freecb(struct tcpcb *); void tcp_twstart(struct tcpcb *); -void tcp_twclose(struct tcptw *, int); int tcp_ctloutput(struct socket *, struct sockopt *); void tcp_fini(void *); char *tcp_log_addrs(struct in_conninfo *, struct tcphdr *, const void *, @@ -1176,12 +1158,7 @@ void tcp_state_change(struct tcpcb *, int); void tcp_respond(struct tcpcb *, void *, struct tcphdr *, struct mbuf *, tcp_seq, tcp_seq, int); -void tcp_tw_init(void); -#ifdef VIMAGE -void tcp_tw_destroy(void); -#endif -void tcp_tw_zone_change(void); -int tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, +bool tcp_twcheck(struct inpcb *, struct tcpopt *, struct tcphdr *, struct mbuf *, int); void tcp_setpersist(struct tcpcb *); void tcp_record_dsack(struct tcpcb *tp, tcp_seq start, tcp_seq end, int tlp); diff --git a/sys/netinet/toecore.c b/sys/netinet/toecore.c --- a/sys/netinet/toecore.c +++ b/sys/netinet/toecore.c @@ -386,6 +386,7 @@ toe_4tuple_check(struct in_conninfo *inc, struct tcphdr *th, struct ifnet *ifp) { struct inpcb *inp; + struct tcpcb *tp; if (inc->inc_flags & INC_ISIPV6) { inp = in6_pcblookup(&V_tcbinfo, &inc->inc6_faddr, @@ -398,7 +399,8 @@ if (inp != NULL) { INP_RLOCK_ASSERT(inp); - if ((inp->inp_flags & INP_TIMEWAIT) && th != NULL) { + tp = intotcpcb(inp); + if (tp->t_state == TCPS_TIME_WAIT && th != NULL) { if (!tcp_twcheck(inp, NULL, th, NULL, 0)) return (EADDRINUSE); } else { diff --git a/sys/netinet6/in6_pcb.c b/sys/netinet6/in6_pcb.c --- a/sys/netinet6/in6_pcb.c +++ b/sys/netinet6/in6_pcb.c @@ -242,7 +242,6 @@ } if (lport) { struct inpcb *t; - struct tcptw *tw; /* GROSS */ if (ntohs(lport) <= V_ipport_reservedhigh && @@ -303,20 +302,8 @@ } t = in6_pcblookup_local(pcbinfo, &sin6->sin6_addr, lport, lookupflags, cred); - if (t && (t->inp_flags & INP_TIMEWAIT)) { - /* - * XXXRW: If an incpb has had its timewait - * state recycled, we treat the address as - * being in use (for now). This is better - * than a panic, but not desirable. - */ - tw = intotw(t); - if (tw == NULL || - ((reuseport & tw->tw_so_options) == 0 && - (reuseport_lb & tw->tw_so_options) == 0)) - return (EADDRINUSE); - } else if (t && (reuseport & inp_so_options(t)) == 0 && - (reuseport_lb & inp_so_options(t)) == 0) { + if (t && (reuseport & inp_so_options(t)) == 0 && + (reuseport_lb & inp_so_options(t)) == 0) { return (EADDRINUSE); } #ifdef INET @@ -327,18 +314,7 @@ in6_sin6_2_sin(&sin, sin6); t = in_pcblookup_local(pcbinfo, sin.sin_addr, lport, lookupflags, cred); - if (t && t->inp_flags & INP_TIMEWAIT) { - tw = intotw(t); - if (tw == NULL) - return (EADDRINUSE); - if ((reuseport & tw->tw_so_options) == 0 - && (reuseport_lb & tw->tw_so_options) == 0 - && (ntohl(t->inp_laddr.s_addr) != - INADDR_ANY || ((inp->inp_vflag & - INP_IPV6PROTO) == - (t->inp_vflag & INP_IPV6PROTO)))) - return (EADDRINUSE); - } else if (t && + if (t && (reuseport & inp_so_options(t)) == 0 && (reuseport_lb & inp_so_options(t)) == 0 && (ntohl(t->inp_laddr.s_addr) != INADDR_ANY ||