diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -282,6 +282,7 @@ The boolean option .Dv TCP_NODELAY defeats this algorithm. + .It Dv TCP_MAXSEG By default, a sender- and .No receiver- Ns Tn TCP @@ -291,6 +292,17 @@ .Dv TCP_MAXSEG option allows the user to determine the result of this negotiation, and to reduce it if desired. +.It Dv TCP_MAXUNACKTIME +This +.Xr setsockopt 2 +option accepts an argument of +.Vt "u_int" +to set the per-socket interval, in seconds, in which the connection must +make progress. Progress is defined by more than 1 byte being acknowledged within +the set time period. If a connection fails to make progress, then the +.Tn TCP +stack will terminate the connection with a reset. Note that the default +value for this is zero which indicates no progress checks should be made. .It Dv TCP_NOOPT .Tn TCP usually sends a number of options in each packet, corresponding to diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -1888,11 +1888,21 @@ &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); + /* + * Clear t_acktime if remote side has ACKd + * all data in the socket buffer. + * Otherwise, update t_acktime if we received + * a sufficiently large ACK. 
+ */ + if (sbavail(&so->so_snd) == 0) + tp->t_acktime = 0; + else if (acked > 1) + tp->t_acktime = ticks; if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, - tp->t_rxtcur); + TP_RXTCUR(tp)); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tcp_output(tp); @@ -2091,6 +2101,7 @@ */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { + tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; @@ -2475,6 +2486,7 @@ tp->t_tfo_pending = NULL; } if (tp->t_flags & TF_NEEDFIN) { + tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { @@ -2921,6 +2933,20 @@ tcp_xmit_timer(tp, ticks - tp->t_rtttime); } + SOCKBUF_LOCK(&so->so_snd); + /* + * Clear t_acktime if remote side has ACKd all data in the + * socket buffer and FIN (if applicable). + * Otherwise, update t_acktime if we received a sufficiently + * large ACK. + */ + if ((tp->t_state <= TCPS_CLOSE_WAIT && + acked == sbavail(&so->so_snd)) || + acked > sbavail(&so->so_snd)) + tp->t_acktime = 0; + else if (acked > 1) + tp->t_acktime = ticks; + /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). @@ -2931,14 +2957,16 @@ tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. 
*/ - if (acked == 0) + if (acked == 0) { + SOCKBUF_UNLOCK(&so->so_snd); goto step6; + } /* * Let the congestion control algorithm update congestion @@ -2947,7 +2975,6 @@ */ cc_ack_received(tp, th, nsegs, CC_ACK); - SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -506,7 +506,8 @@ */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && - (off < (int) sbavail(&so->so_snd))) { + (off < (int) sbavail(&so->so_snd)) && + !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; @@ -734,7 +735,7 @@ SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); goto just_return; } /* @@ -1578,6 +1579,12 @@ goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + /* + * Update "made progress" indication if we just + * added new data to an empty socket buffer. + */ + if (tp->snd_una == tp->snd_max) + tp->t_acktime = ticks; tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and @@ -1616,7 +1623,7 @@ tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { @@ -1769,15 +1776,29 @@ { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; + int maxunacktime; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); + /* + * If the state is already closed, don't bother. 
+ */ + if (tp->t_state == TCPS_CLOSED) + return; + /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); + if (TP_MAXUNACKTIME(tp) && tp->t_acktime) { + maxunacktime = tp->t_acktime + TP_MAXUNACKTIME(tp) - ticks; + if (maxunacktime < 1) + maxunacktime = 1; + if (maxunacktime < tt) + tt = maxunacktime; + } tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1030,6 +1030,7 @@ /* * Copy and activate timers. */ + tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime; tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; diff --git a/sys/netinet/tcp_timer.h b/sys/netinet/tcp_timer.h --- a/sys/netinet/tcp_timer.h +++ b/sys/netinet/tcp_timer.h @@ -86,6 +86,7 @@ #define TCPTV_KEEP_IDLE (120*60*hz) /* dflt time before probing */ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ +#define TCPTV_MAXUNACKTIME 0 /* max time without making progress */ #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ @@ -183,6 +184,17 @@ #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) +#define TP_MAXUNACKTIME(tp) \ + ((tp)->t_maxunacktime ? (tp)->t_maxunacktime : tcp_maxunacktime) + +/* + * Obtain the time until the retransmit timer should fire. + * This macro ensures the retransmit timer fires at the earlier of the + * t_rxtcur value or the time the maxunacktime would be exceeded. + */ +#define TP_RXTCUR(tp) \ + ((TP_MAXUNACKTIME(tp) == 0 || (tp)->t_acktime == 0) ? 
(tp)->t_rxtcur : \ + max(1, min((tp)->t_rxtcur, (tp)->t_acktime + TP_MAXUNACKTIME(tp) - ticks))) extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ @@ -191,6 +203,7 @@ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxunacktime; /* max time without making progress */ extern int tcp_maxpersistidle; extern int tcp_rexmit_initial; extern int tcp_rexmit_min; diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -167,6 +167,12 @@ &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); +int tcp_maxunacktime = TCPTV_MAXUNACKTIME; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, + CTLTYPE_INT|CTLFLAG_RW | CTLFLAG_NEEDGIANT, + &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", + "Maximum time (in ms) that a session can linger without making progress"); + VNET_DEFINE(int, tcp_pmtud_blackhole_detect); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, @@ -505,12 +511,38 @@ CURVNET_RESTORE(); } +/* + * Has this session exceeded the maximum time without seeing a substantive + * acknowledgement? If so, return true; otherwise false. + */ +static bool +tcp_maxunacktime_check(struct tcpcb *tp) +{ + + /* Are we tracking this timer for this session? */ + if (TP_MAXUNACKTIME(tp) == 0) + return false; + + /* Do we have a current measurement? */ + if (tp->t_acktime == 0) + return false; + + /* Are we within the acceptable range? */ + if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks)) + return false; + + /* We exceeded the timer. 
*/ + TCPSTAT_INC(tcps_progdrops); + return true; +} + void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; struct epoch_tracker et; + bool progdrop; int outrv; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG @@ -546,11 +578,15 @@ * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. + * Also, drop the connection if we haven't been making + * progress. */ - if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + progdrop = tcp_maxunacktime_check(tp); + if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || - ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { - TCPSTAT_INC(tcps_persistdrop); + ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { + if (!progdrop) + TCPSTAT_INC(tcps_persistdrop); NET_EPOCH_ENTER(et); tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); tp = tcp_drop(tp, ETIMEDOUT); @@ -630,10 +666,15 @@ * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. + * + * If we've either exceeded the maximum number of retransmissions, + * or we've gone long enough without making progress, then drop + * the session. 
*/ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) { + if (tp->t_rxtshift > TCP_MAXRXTSHIFT) + TCPSTAT_INC(tcps_timeoutdrop); tp->t_rxtshift = TCP_MAXRXTSHIFT; - TCPSTAT_INC(tcps_timeoutdrop); NET_EPOCH_ENTER(et); tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tp = tcp_drop(tp, ETIMEDOUT); diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1116,6 +1116,8 @@ } } if (!(flags & PRUS_OOB)) { + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream(&so->so_snd, m, flags); m = NULL; if (nam && tp->t_state < TCPS_SYN_SENT) { @@ -1202,6 +1204,8 @@ * of data past the urgent section. * Otherwise, snd_up should be one lower. */ + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); m = NULL; @@ -2375,7 +2379,7 @@ error = ktls_enable_rx(so, &tls); break; #endif - + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: @@ -2392,6 +2396,10 @@ INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + tp->t_maxunacktime = ui; + break; + case TCP_KEEPIDLE: tp->t_keepidle = ui; /* @@ -2658,11 +2666,15 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + ui = TP_MAXUNACKTIME(tp) / hz; + break; case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; @@ -2834,6 +2846,8 @@ tcp_state_change(tp, TCPS_LAST_ACK); break; } + if (tp->t_acktime == 0) + tp->t_acktime = ticks; if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. 
*/ diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -708,6 +708,7 @@ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ + uint64_t tcps_progdrops; /* drops due to no progress */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ diff --git a/usr.bin/netstat/inet.c b/usr.bin/netstat/inet.c --- a/usr.bin/netstat/inet.c +++ b/usr.bin/netstat/inet.c @@ -757,6 +757,8 @@ "{N:/keepalive probe%s sent}\n"); p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} " "{N:/connection%s dropped by keepalive}\n"); + p(tcps_progdrops, "\t{:connections-dropped-due-to-progress-time/%ju} " + "{N:/connection%s dropped due to exceeding progress time}\n"); p(tcps_predack, "\t{:ack-header-predictions/%ju} " "{N:/correct ACK header prediction%s}\n"); p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} " diff --git a/usr.bin/systat/tcp.c b/usr.bin/systat/tcp.c --- a/usr.bin/systat/tcp.c +++ b/usr.bin/systat/tcp.c @@ -125,8 +125,8 @@ L(5, "- in embryonic state"); R(5, "- ack-only"); L(6, "- on retransmit timeout"); R(6, "- window probes"); L(7, "- by keepalive"); R(7, "- window updates"); - L(8, "- from listen queue"); R(8, "- urgent data only"); - R(9, "- control"); + L(8, "- exceeded progress time"); R(8, "- urgent data only"); + L(9, "- from listen queue"); R(9, "- control"); R(10, "- resends by PMTU discovery"); L(11, "TCP Timers"); R(11, "total packets received"); L(12, "potential rtt updates"); R(12, "- in sequence"); @@ -179,6 +179,7 @@ DO(tcps_keeptimeo); DO(tcps_keepprobe); DO(tcps_keepdrops); + DO(tcps_progdrops); DO(tcps_sndtotal); DO(tcps_sndpack); @@ -248,8 +249,8 @@ L(5, tcps_conndrops); R(5, tcps_sndacks); L(6, tcps_timeoutdrop); R(6, tcps_sndprobe); L(7, tcps_keepdrops); R(7, tcps_sndwinup); - L(8, tcps_listendrop); R(8, tcps_sndurg); - 
R(9, tcps_sndctrl); + L(8, tcps_progdrops); R(8, tcps_sndurg); + L(9, tcps_listendrop); R(9, tcps_sndctrl); R(10, tcps_mturesent); R(11, tcps_rcvtotal); L(12, tcps_segstimed); R(12, tcps_rcvpack);