Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -176,6 +176,7 @@ device */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ +#define TCP_MAXUNACKTIME 68 /* max time without making progress (sec) */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -1874,11 +1874,21 @@ &tcp_savetcp, 0); #endif TCP_PROBE3(debug__input, tp, th, m); + /* + * Clear t_acktime if remote side has ACKd + * all data in the socket buffer. + * Otherwise, update t_acktime if we received + * a sufficiently large ACK. + */ + if (sbavail(&so->so_snd) == 0) + tp->t_acktime = 0; + else if (acked > 1) + tp->t_acktime = ticks; if (tp->snd_una == tp->snd_max) tcp_timer_activate(tp, TT_REXMT, 0); else if (!tcp_timer_active(tp, TT_PERSIST)) tcp_timer_activate(tp, TT_REXMT, - tp->t_rxtcur); + TP_RXTCUR(tp)); sowwakeup(so); if (sbavail(&so->so_snd)) (void) tp->t_fb->tfb_tcp_output(tp); @@ -2073,6 +2083,7 @@ */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { + tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; thflags &= ~TH_SYN; @@ -2452,6 +2463,7 @@ */ tp->t_starttime = ticks; if (tp->t_flags & TF_NEEDFIN) { + tp->t_acktime = ticks; tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; } else { @@ -2818,7 +2830,21 @@ tcp_xmit_timer(tp, ticks - tp->t_rtttime); } + SOCKBUF_LOCK(&so->so_snd); /* + * Clear t_acktime if remote side has ACKd all data in the + * socket buffer and FIN (if applicable). 
+ * Otherwise, update t_acktime if we received a sufficiently + * large ACK. + */ + if ((tp->t_state <= TCPS_CLOSE_WAIT && + acked == sbavail(&so->so_snd)) || + acked > sbavail(&so->so_snd)) + tp->t_acktime = 0; + else if (acked > 1) + tp->t_acktime = ticks; + + /* * If all outstanding data is acked, stop retransmit * timer and remember to restart (more output or persist). * If there is more data to be acked, restart retransmit @@ -2828,14 +2854,16 @@ tcp_timer_activate(tp, TT_REXMT, 0); needoutput = 1; } else if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); /* * If no data (only SYN) was ACK'd, * skip rest of ACK processing. */ - if (acked == 0) + if (acked == 0) { + SOCKBUF_UNLOCK(&so->so_snd); goto step6; + } /* * Let the congestion control algorithm update congestion @@ -2844,7 +2872,6 @@ */ cc_ack_received(tp, th, nsegs, CC_ACK); - SOCKBUF_LOCK(&so->so_snd); if (acked > sbavail(&so->so_snd)) { if (tp->snd_wnd >= sbavail(&so->so_snd)) tp->snd_wnd -= sbavail(&so->so_snd); Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -138,7 +138,8 @@ * non-ACK. 
*/ #define TCP_XMIT_TIMER_ASSERT(tp, len, th_flags) \ - KASSERT(((len) == 0 && ((th_flags) & (TH_SYN | TH_FIN)) == 0) ||\ + KASSERT(((len) == 0 && \ + ((th_flags) & (TH_SYN | TH_FIN | TH_RST)) == 0) || \ tcp_timer_active((tp), TT_REXMT) || \ tcp_timer_active((tp), TT_PERSIST), \ ("neither rexmt nor persist timer is set")) @@ -482,12 +483,12 @@ */ len = 0; if ((sendwin == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && - (off < (int) sbavail(&so->so_snd))) { + (off < (int) sbavail(&so->so_snd)) && + !tcp_timer_active(tp, TT_PERSIST)) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rxtshift = 0; tp->snd_nxt = tp->snd_una; - if (!tcp_timer_active(tp, TT_PERSIST)) - tcp_setpersist(tp); + tcp_setpersist(tp); } } @@ -691,7 +692,7 @@ SEQ_GT(tp->snd_max, tp->snd_una) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); goto just_return; } /* @@ -1483,6 +1484,14 @@ goto timer; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { + /* + * Update "made progress" indication if we just + * sent new data while none was outstanding. + * This resets the time the remote side has to + * ACK the new data. + */ + if (tp->snd_una == tp->snd_max) + tp->t_acktime = ticks; tp->snd_max = tp->snd_nxt; /* * Time this transmission if not a retransmission and @@ -1511,7 +1520,7 @@ tcp_timer_activate(tp, TT_PERSIST, 0); tp->t_rxtshift = 0; } - tcp_timer_activate(tp, TT_REXMT, tp->t_rxtcur); + tcp_timer_activate(tp, TT_REXMT, TP_RXTCUR(tp)); } else if (len == 0 && sbavail(&so->so_snd) && !tcp_timer_active(tp, TT_REXMT) && !tcp_timer_active(tp, TT_PERSIST)) { @@ -1659,15 +1668,29 @@ { int t = ((tp->t_srtt >> 2) + tp->t_rttvar) >> 1; int tt; + int maxunacktime; tp->t_flags &= ~TF_PREVVALID; if (tcp_timer_active(tp, TT_REXMT)) panic("tcp_setpersist: retransmit pending"); /* + * If the state is already closed, don't bother. 
+ */ + if (tp->t_state == TCPS_CLOSED) + return; + + /* * Start/restart persistence timer. */ TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], tcp_persmin, tcp_persmax); + if (TP_MAXUNACKTIME(tp) && tp->t_acktime) { + maxunacktime = tp->t_acktime + TP_MAXUNACKTIME(tp) - ticks; + if (maxunacktime < 1) + maxunacktime = 1; + if (maxunacktime < tt) + tt = maxunacktime; + } tcp_timer_activate(tp, TT_PERSIST, tt); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -919,6 +919,7 @@ /* * Copy and activate timers. */ + tp->t_maxunacktime = sototcpcb(lso)->t_maxunacktime; tp->t_keepinit = sototcpcb(lso)->t_keepinit; tp->t_keepidle = sototcpcb(lso)->t_keepidle; tp->t_keepintvl = sototcpcb(lso)->t_keepintvl; Index: sys/netinet/tcp_timer.h =================================================================== --- sys/netinet/tcp_timer.h +++ sys/netinet/tcp_timer.h @@ -87,6 +87,8 @@ #define TCPTV_KEEPINTVL ( 75*hz) /* default probe interval */ #define TCPTV_KEEPCNT 8 /* max probes before drop */ +#define TCPTV_MAXUNACKTIME 0 /* max time without making progress */ + #define TCPTV_FINWAIT2_TIMEOUT (60*hz) /* FIN_WAIT_2 timeout if no receiver */ /* @@ -181,7 +183,18 @@ #define TP_KEEPINTVL(tp) ((tp)->t_keepintvl ? (tp)->t_keepintvl : tcp_keepintvl) #define TP_KEEPCNT(tp) ((tp)->t_keepcnt ? (tp)->t_keepcnt : tcp_keepcnt) #define TP_MAXIDLE(tp) (TP_KEEPCNT(tp) * TP_KEEPINTVL(tp)) +#define TP_MAXUNACKTIME(tp) \ + ((tp)->t_maxunacktime ? (tp)->t_maxunacktime : tcp_maxunacktime) +/* + * Obtain the time until the retransmit timer should fire. + * This macro ensures the retransmit timer fires at the earlier of the + * t_rxtcur value or the time the maxunacktime would be exceeded. + */ +#define TP_RXTCUR(tp) \ + ((TP_MAXUNACKTIME(tp) == 0 || (tp)->t_acktime == 0) ? 
(tp)->t_rxtcur : \ + max(1, min((tp)->t_rxtcur, (tp)->t_acktime + TP_MAXUNACKTIME(tp) - ticks))) + extern int tcp_persmin; /* minimum persist interval */ extern int tcp_persmax; /* maximum persist interval */ extern int tcp_keepinit; /* time to establish connection */ @@ -189,6 +202,7 @@ extern int tcp_keepintvl; /* time between keepalive probes */ extern int tcp_keepcnt; /* number of keepalives */ extern int tcp_delacktime; /* time before sending a delayed ACK */ +extern int tcp_maxunacktime; /* max time without making progress */ extern int tcp_maxpersistidle; extern int tcp_rexmit_min; extern int tcp_rexmit_slop; Index: sys/netinet/tcp_timer.c =================================================================== --- sys/netinet/tcp_timer.c +++ sys/netinet/tcp_timer.c @@ -69,6 +69,7 @@ #include #include #include +#include #include #include #include @@ -144,6 +145,11 @@ &tcp_rexmit_drop_options, 0, "Drop TCP options from 3rd and later retransmitted SYN"); +int tcp_maxunacktime = TCPTV_MAXUNACKTIME; +SYSCTL_PROC(_net_inet_tcp, OID_AUTO, maxunacktime, CTLTYPE_INT|CTLFLAG_RW, + &tcp_maxunacktime, 0, sysctl_msec_to_ticks, "I", + "Maximum time (in ms) that a session can linger without making progress"); + VNET_DEFINE(int, tcp_pmtud_blackhole_detect); SYSCTL_INT(_net_inet_tcp, OID_AUTO, pmtud_blackhole_detection, CTLFLAG_RW|CTLFLAG_VNET, @@ -533,11 +539,37 @@ CURVNET_RESTORE(); } +/* + * Has this session exceeded the maximum time without seeing a substantive + * acknowledgement? If so, return true; otherwise false. + */ +static bool +tcp_maxunacktime_check(struct tcpcb *tp) +{ + + /* Are we tracking this timer for this session? */ + if (TP_MAXUNACKTIME(tp) == 0) + return false; + + /* Do we have a current measurement? */ + if (tp->t_acktime == 0) + return false; + + /* Are we within the acceptable range? */ + if (TSTMP_GT(TP_MAXUNACKTIME(tp) + tp->t_acktime, (u_int)ticks)) + return false; + + /* We exceeded the timer. 
*/ + TCPSTAT_INC(tcps_progdrops); + return true; +} + void tcp_timer_persist(void *xtp) { struct tcpcb *tp = xtp; struct inpcb *inp; + bool progdrop; CURVNET_SET(tp->t_vnet); #ifdef TCPDEBUG int ostate; @@ -572,11 +604,15 @@ * backoff, drop the connection if the idle time * (no responses to probes) reaches the maximum * backoff that we would use if retransmitting. + * Also, drop the connection if we haven't been making + * progress. */ - if (tp->t_rxtshift == TCP_MAXRXTSHIFT && + progdrop = tcp_maxunacktime_check(tp); + if (progdrop || (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || - ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { - TCPSTAT_INC(tcps_persistdrop); + ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff))) { + if (!progdrop) + TCPSTAT_INC(tcps_persistdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; @@ -654,10 +690,15 @@ * Retransmission timer went off. Message has not * been acked within retransmit interval. Back off * to a longer retransmit interval and retransmit one segment. + * + * If we've either exceeded the maximum number of retransmissions, + * or we've gone long enough without making progress, then drop + * the session. 
*/ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + if (++tp->t_rxtshift > TCP_MAXRXTSHIFT || tcp_maxunacktime_check(tp)) { + if (tp->t_rxtshift > TCP_MAXRXTSHIFT) + TCPSTAT_INC(tcps_timeoutdrop); tp->t_rxtshift = TCP_MAXRXTSHIFT; - TCPSTAT_INC(tcps_timeoutdrop); if (tcp_inpinfo_lock_add(inp)) { tcp_inpinfo_lock_del(inp, tp); goto out; Index: sys/netinet/tcp_usrreq.c =================================================================== --- sys/netinet/tcp_usrreq.c +++ sys/netinet/tcp_usrreq.c @@ -929,6 +929,8 @@ m_freem(control); /* empty control, just free it */ } if (!(flags & PRUS_OOB)) { + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream(&so->so_snd, m, flags); if (nam && tp->t_state < TCPS_SYN_SENT) { /* @@ -991,6 +993,8 @@ * of data past the urgent section. * Otherwise, snd_up should be one lower. */ + if (tp->t_acktime == 0) + tp->t_acktime = ticks; sbappendstream_locked(&so->so_snd, m, flags); SOCKBUF_UNLOCK(&so->so_snd); if (nam && tp->t_state < TCPS_SYN_SENT) { @@ -1722,6 +1726,7 @@ INP_WUNLOCK(inp); break; + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: @@ -1738,6 +1743,10 @@ INP_WLOCK_RECHECK(inp); switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + tp->t_maxunacktime = ui; + break; + case TCP_KEEPIDLE: tp->t_keepidle = ui; /* @@ -1947,11 +1956,15 @@ INP_WUNLOCK(inp); error = sooptcopyout(sopt, buf, len + 1); break; + case TCP_MAXUNACKTIME: case TCP_KEEPIDLE: case TCP_KEEPINTVL: case TCP_KEEPINIT: case TCP_KEEPCNT: switch (sopt->sopt_name) { + case TCP_MAXUNACKTIME: + ui = TP_MAXUNACKTIME(tp) / hz; + break; case TCP_KEEPIDLE: ui = TP_KEEPIDLE(tp) / hz; break; @@ -2150,6 +2163,8 @@ tcp_state_change(tp, TCPS_LAST_ACK); break; } + if (tp->t_acktime == 0) + tp->t_acktime = ticks; if (tp->t_state >= TCPS_FIN_WAIT_2) { soisdisconnected(tp->t_inpcb->inp_socket); /* Prevent the connection hanging in FIN_WAIT_2 forever. 
*/ Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -124,6 +124,8 @@ tcp_seq snd_recover; /* for use in NewReno Fast Recovery */ u_int t_rcvtime; /* inactivity time */ + u_int t_acktime; /* last time we received a "real" ACK */ + u_int t_maxunacktime; /* max time without making progress */ u_int t_starttime; /* time connection was established */ u_int t_rtttime; /* RTT measurement start time */ tcp_seq t_rtseq; /* sequence number being timed */ @@ -495,6 +497,7 @@ uint64_t tcps_keeptimeo; /* keepalive timeouts */ uint64_t tcps_keepprobe; /* keepalive probes sent */ uint64_t tcps_keepdrops; /* connections dropped in keepalive */ + uint64_t tcps_progdrops; /* drops due to no progress */ uint64_t tcps_sndtotal; /* total packets sent */ uint64_t tcps_sndpack; /* data packets sent */ Index: usr.bin/netstat/inet.c =================================================================== --- usr.bin/netstat/inet.c +++ usr.bin/netstat/inet.c @@ -678,6 +678,8 @@ "{N:/keepalive probe%s sent}\n"); p(tcps_keepdrops, "\t\t{:connections-dropped-by-keepalives/%ju} " "{N:/connection%s dropped by keepalive}\n"); + p(tcps_progdrops, "\t{:connections-dropped-due-to-progress-time/%ju} " + "{N:/connection%s dropped due to exceeding progress time}\n"); p(tcps_predack, "\t{:ack-header-predictions/%ju} " "{N:/correct ACK header prediction%s}\n"); p(tcps_preddat, "\t{:data-packet-header-predictions/%ju} " Index: usr.bin/systat/tcp.c =================================================================== --- usr.bin/systat/tcp.c +++ usr.bin/systat/tcp.c @@ -123,8 +123,8 @@ L(5, "- in embryonic state"); R(5, "- ack-only"); L(6, "- on retransmit timeout"); R(6, "- window probes"); L(7, "- by keepalive"); R(7, "- window updates"); - L(8, "- from listen queue"); R(8, "- urgent data only"); - R(9, "- control"); + L(8, "- exceeded progress time"); R(8, "- urgent data only"); + L(9, "- from listen queue"); 
R(9, "- control"); R(10, "- resends by PMTU discovery"); L(11, "TCP Timers"); R(11, "total packets received"); L(12, "potential rtt updates"); R(12, "- in sequence"); @@ -177,6 +177,7 @@ DO(tcps_keeptimeo); DO(tcps_keepprobe); DO(tcps_keepdrops); + DO(tcps_progdrops); DO(tcps_sndtotal); DO(tcps_sndpack); @@ -246,8 +247,8 @@ L(5, tcps_conndrops); R(5, tcps_sndacks); L(6, tcps_timeoutdrop); R(6, tcps_sndprobe); L(7, tcps_keepdrops); R(7, tcps_sndwinup); - L(8, tcps_listendrop); R(8, tcps_sndurg); - R(9, tcps_sndctrl); + L(8, tcps_progdrops); R(8, tcps_sndurg); + L(9, tcps_listendrop); R(9, tcps_sndctrl); R(10, tcps_mturesent); R(11, tcps_rcvtotal); L(12, tcps_segstimed); R(12, tcps_rcvpack);