diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -439,10 +439,7 @@ case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; - if ((tp->t_rxtshift > 1) || - !((tp->t_flags & TF_SACK_PERMIT) && - (!TAILQ_EMPTY(&tp->snd_holes)))) - EXIT_RECOVERY(tp->t_flags); + EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; @@ -480,7 +477,7 @@ tp->t_ccv.curack = th->th_ack; CC_ALGO(tp)->post_recovery(&tp->t_ccv); } - /* XXXLAS: EXIT_RECOVERY ? */ + EXIT_RECOVERY(tp->t_flags); tp->t_bytes_acked = 0; tp->sackhint.delivered_data = 0; tp->sackhint.prr_delivered = 0; @@ -1872,9 +1869,11 @@ * is new data available to be sent * or we need to send an ACK. */ - if (SEQ_GT(tp->snd_una + sbavail(&so->so_snd), - tp->snd_max) || tp->t_flags & TF_ACKNOW) + if ((tp->t_flags & TF_ACKNOW) || + SEQ_GEQ(sbavail(&so->so_snd), + SEQ_SUB(tp->snd_max, tp->snd_una))) { (void) tcp_output(tp); + } goto check_delack; } } else if (th->th_ack == tp->snd_una && @@ -2585,41 +2584,54 @@ */ if (th->th_ack != tp->snd_una || (tcp_is_sack_recovery(tp, &to) && - (sack_changed == SACK_NOCHANGE))) + (sack_changed == SACK_NOCHANGE))) { break; - else if (!tcp_timer_active(tp, TT_REXMT)) + } else + if (!tcp_timer_active(tp, TT_REXMT)) { tp->t_dupacks = 0; - else if (++tp->t_dupacks > tcprexmtthresh || + } else + if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && - IN_FASTRECOVERY(tp->t_flags) && - (tp->t_flags & TF_SACK_PERMIT)) { - tcp_do_prr_ack(tp, th, &to, sack_changed); - } else if (tcp_is_sack_recovery(tp, &to) && IN_FASTRECOVERY(tp->t_flags)) { + tcp_do_prr_ack(tp, th, &to, + sack_changed); + } else + if (tcp_is_sack_recovery(tp, &to) && + IN_FASTRECOVERY(tp->t_flags) && + (tp->snd_nxt == tp->snd_max)) { int awnd; /* - * Compute the amount of data in flight first. 
- * We can inject new data into the pipe iff - * we have less than 1/2 the original window's - * worth of data in flight. + * Compute the amount of data in flight + * first. We can inject new data into + * the pipe iff we have less than + * ssthresh worth of data in flight. */ - if (V_tcp_do_newsack) + if (V_tcp_do_newsack) { awnd = tcp_compute_pipe(tp); - else - awnd = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; - + } else { + awnd = tp->snd_nxt - tp->snd_fack + + tp->sackhint.sack_bytes_rexmit; + } if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += maxseg; + tp->snd_cwnd += imax(maxseg, + tp->sackhint.delivered_data); if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd = + tp->snd_ssthresh; } } else + if (tcp_is_sack_recovery(tp, &to) && + IN_FASTRECOVERY(tp->t_flags) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->snd_cwnd += imax(maxseg, + tp->sackhint.delivered_data); + } else { tp->snd_cwnd += maxseg; + } (void) tcp_output(tp); goto drop; } else if (tp->t_dupacks == tcprexmtthresh || @@ -2661,37 +2673,45 @@ } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); - cc_ack_received(tp, th, nsegs, - CC_DUPACK); + cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* - * snd_ssthresh is already updated by - * cc_cong_signal. + * snd_ssthresh and snd_recover are + * already updated by cc_cong_signal. 
*/ if (tcp_is_sack_recovery(tp, &to)) { /* - * Exclude Limited Transmit + * Include Limited Transmit * segments here */ tp->sackhint.prr_delivered = - maxseg; + imin(tp->snd_max - th->th_ack, + (tp->snd_limited + 1) * maxseg); } else { tp->sackhint.prr_delivered = - imin(tp->snd_max - tp->snd_una, - imin(INT_MAX / 65536, - tp->t_dupacks) * maxseg); + maxseg; } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } + tp->snd_limited = 0; if (tcp_is_sack_recovery(tp, &to)) { - TCPSTAT_INC( - tcps_sack_recovery_episode); - tp->snd_recover = tp->snd_nxt; - tp->snd_cwnd = maxseg; + TCPSTAT_INC(tcps_sack_recovery_episode); + /* + * When entering LR after RTO due to + * Duplicate ACKs, retransmit existing + * holes from the scoreboard. + */ + tcp_resend_sackholes(tp); + /* Avoid inflating cwnd in tcp_output */ + tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tcp_compute_pipe(tp) + + maxseg; (void) tcp_output(tp); + /* Set cwnd to the expected flightsize */ + tp->snd_cwnd = tp->snd_ssthresh; if (SEQ_GT(th->th_ack, tp->snd_una)) goto resume_partialack; goto drop; @@ -2733,7 +2753,8 @@ if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = - (tp->snd_nxt - tp->snd_una) + + SEQ_SUB(tp->snd_nxt, tp->snd_una) - + tcp_sack_adjust(tp) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* @@ -2743,11 +2764,12 @@ */ SOCKBUF_LOCK(&so->so_snd); avail = sbavail(&so->so_snd) - - (tp->snd_nxt - tp->snd_una); + SEQ_SUB(tp->snd_nxt, tp->snd_una); SOCKBUF_UNLOCK(&so->so_snd); - if (avail > 0 || tp->t_flags & TF_ACKNOW) + if (avail > 0 || tp->t_flags & TF_ACKNOW) { (void) tcp_output(tp); - sent = tp->snd_max - oldsndmax; + } + sent = SEQ_SUB(tp->snd_max, oldsndmax); if (sent > maxseg) { KASSERT((tp->t_dupacks == 2 && tp->snd_limited == 0) || @@ -2756,8 +2778,10 @@ ("%s: sent too much", __func__)); tp->snd_limited = 2; - } else if (sent > 0) + } else + if (sent > 0) { ++tp->snd_limited; + } tp->snd_cwnd = oldcwnd; goto drop; } @@ -2779,7 +2803,10 @@ * counted as dupacks here. 
*/ if (tcp_is_sack_recovery(tp, &to) && - (sack_changed != SACK_NOCHANGE)) { + ((sack_changed == SACK_NEWLOSS) || + ((sack_changed != SACK_NOCHANGE) && + ((tp->t_rxtshift == 0) || + (tp->snd_nxt == tp->snd_max))))) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && @@ -2799,23 +2826,20 @@ * If the congestion window was inflated to account * for the other side's cached packets, retract it. */ - if (IN_FASTRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (IN_FASTRECOVERY(tp->t_flags)) { if (tp->t_flags & TF_SACK_PERMIT) if (V_tcp_do_prr && to.to_flags & TOF_SACK) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; tcp_do_prr_ack(tp, th, &to, sack_changed); - tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else tcp_sack_partialack(tp, th); else tcp_newreno_partial_ack(tp, th); } else - cc_post_recovery(tp, th); - } else if (IN_CONGRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (IN_CONGRECOVERY(tp->t_flags)) { if (V_tcp_do_prr) { tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th); tp->snd_fack = th->th_ack; @@ -2826,8 +2850,7 @@ tcp_do_prr_ack(tp, th, &to, SACK_CHANGE); (void) tcp_output(tp); } - } else - cc_post_recovery(tp, th); + } } /* * If we reach this point, ACK is not a duplicate, @@ -2978,12 +3001,11 @@ SEQ_GT(tp->snd_una, tp->snd_recover) && SEQ_LEQ(th->th_ack, tp->snd_recover)) tp->snd_recover = th->th_ack - 1; - /* XXXLAS: Can this be moved up into cc_post_recovery? */ + tp->snd_una = th->th_ack; if (IN_RECOVERY(tp->t_flags) && SEQ_GEQ(th->th_ack, tp->snd_recover)) { - EXIT_RECOVERY(tp->t_flags); + cc_post_recovery(tp, th); } - tp->snd_una = th->th_ack; if (tp->t_flags & TF_SACK_PERMIT) { if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; @@ -3308,9 +3330,9 @@ /* * Return any desired output. 
*/ - if (needoutput || (tp->t_flags & TF_ACKNOW)) + if (needoutput || (tp->t_flags & TF_ACKNOW)) { (void) tcp_output(tp); - + } check_delack: INP_WLOCK_ASSERT(inp); @@ -4013,15 +4035,14 @@ */ if (IN_FASTRECOVERY(tp->t_flags)) { if (tcp_is_sack_recovery(tp, to)) { - tp->snd_cwnd = tp->snd_nxt - tp->snd_recover + - tp->sackhint.sack_bytes_rexmit + - (snd_cnt * maxseg); + tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } else { tp->snd_cwnd = (tp->snd_max - tp->snd_una) + (snd_cnt * maxseg); } - } else if (IN_CONGRECOVERY(tp->t_flags)) + } else if (IN_CONGRECOVERY(tp->t_flags)) { tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); + } tp->snd_cwnd = imax(maxseg, tp->snd_cwnd); } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -264,19 +264,22 @@ } } again: + sendwin = 0; /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. 
*/ if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) - tcp_sack_adjust(tp); + (tp->sackhint.nexthole != NULL) && + !IN_FASTRECOVERY(tp->t_flags)) { + sendwin = tcp_sack_adjust(tp); + } sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); + sendwin = min(tp->snd_wnd, tp->snd_cwnd + sendwin); flags = tcp_outflags[tp->t_state]; /* @@ -293,12 +296,16 @@ sack_bytes_rxmt = 0; len = 0; p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && + if ((tp->t_flags & TF_SACK_PERMIT) && + (IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { uint32_t cwin; - cwin = - imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); + if (IN_FASTRECOVERY(tp->t_flags)) { + cwin = imax(sendwin - tcp_compute_pipe(tp), 0); + } else { + cwin = imax(sendwin - off, 0); + } /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* @@ -317,22 +324,34 @@ goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ - len = ((int32_t)ulmin(cwin, - SEQ_SUB(tp->snd_recover, p->rxmit))); + if (cwin <= (len = SEQ_SUB(tp->snd_recover, p->rxmit))) { + len = cwin; + } else { + sendalot = 1; + } } } else { - len = ((int32_t)ulmin(cwin, - SEQ_SUB(p->end, p->rxmit))); + if (cwin <= (len = SEQ_SUB(p->end, p->rxmit))) { + len = cwin; + } else { + sendalot = 1; + } } if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); sack_rxmit = 1; - sendalot = 1; - TCPSTAT_INC(tcps_sack_rexmits); - TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, tcp_maxseg(tp))); + } else { + /* we could have transmitted from the scoreboard, + * but sendwin (expected flightsize) - pipe didn't + * allow any transmission. + * Bypass recalculating the possible transmission + * length further down by setting sack_rxmit. 
+ * We wouldn't be here if there had been + * nothing in the scoreboard to transmit. + */ + sack_rxmit = 1; } } after_sack_rexmit: @@ -395,35 +414,17 @@ * in which case len is already set. */ if (sack_rxmit == 0) { - if (sack_bytes_rxmt == 0) - len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - - off); - else { - int32_t cwin; - + if ((sack_bytes_rxmt == 0) || + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + len = min(sbavail(&so->so_snd), sendwin) - off; + } else { /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - - off); - /* - * Don't remove this (len > 0) check ! - * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc - * optimization issue - to force gcc to compute - * len above. Without this check, the computation - * of len is bungled by the optimizer. - */ - if (len > 0) { - cwin = tp->snd_cwnd - imax(0, (int32_t) - (tp->snd_nxt - tp->snd_recover)) - - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - len = imin(len, cwin); - } + len = imin(sbavail(&so->so_snd) - off, + sendwin - tcp_compute_pipe(tp)); } } @@ -554,21 +555,19 @@ ipoptlen += ipsec_optlen; #endif - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + if ((len > tp->t_maxseg) && + (tp->t_flags & TF_TSO) && + V_tcp_do_tso && (tp->t_port == 0) && - ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && - ipoptlen == 0 && !(flags & TH_SYN)) + !(tp->t_flags & TF_SIGNATURE) && + (tp->rcv_numsacks == 0) && + (ipoptlen == 0) && + !(flags & TH_SYN)) tso = 1; - if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) - flags &= ~TH_FIN; - } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + - sbused(&so->so_snd))) - flags &= ~TH_FIN; - } + if (SEQ_LT((sack_rxmit ? 
p->rxmit : tp->snd_nxt) + len, + tp->snd_una + sbused(&so->so_snd))) + flags &= ~TH_FIN; recwin = lmin(lmax(sbspace(&so->so_rcv), 0), (long)TCP_MAXWIN << tp->rcv_scale); @@ -612,9 +611,8 @@ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && - (tp->t_flags & TF_NOPUSH) == 0) { + (tp->t_flags & TF_NOPUSH) == 0) goto send; - } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) @@ -1037,6 +1035,10 @@ tp->t_sndrexmitpack++; TCPSTAT_INC(tcps_sndrexmitpack); TCPSTAT_ADD(tcps_sndrexmitbyte, len); + if (sack_rxmit) { + TCPSTAT_INC(tcps_sack_rexmits); + TCPSTAT_ADD(tcps_sack_rexmit_bytes, len); + } #ifdef STATS stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, len); @@ -1633,6 +1635,15 @@ /* Clean up any DSACK's sent */ tcp_clean_dsack_blocks(tp); } + if ((error == 0) && + sack_rxmit && + SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) { + /* + * When transmitting from SACK scoreboard + * after an RTO, pull snd_nxt along. + */ + tp->snd_nxt = SEQ_MIN(p->rxmit, p->end); + } if (error) { /* * We know that the packet was lost, so back out the diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -948,8 +948,17 @@ /* Send one or 2 segments based on how much new data was acked. */ if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2) num_segs = 2; - tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + - (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg); + if (tp->snd_nxt == tp->snd_max) { + tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + + (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg); + } else { + /* + * Since cwnd now is the expected flightsize during + * SACK LR, not deflating cwnd allows the partial + * ACKed amount to be sent. 
+ */ + tp->snd_cwnd += 0; + } if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; tp->t_flags |= TF_ACKNOW; @@ -983,12 +992,13 @@ if (tp->t_flags & TF_SENTFIN) highdata--; highdata = SEQ_MIN(highdata, tp->snd_recover); - if (th->th_ack != highdata) { - tp->snd_fack = th->th_ack; + if (SEQ_LT(th->th_ack, highdata)) { + tp->snd_fack = SEQ_MAX(th->th_ack, tp->snd_fack); if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, - highdata - maxseg), highdata, NULL)) != NULL) - tp->sackhint.hole_bytes += temp->end - - temp->start; + highdata - maxseg), highdata, NULL)) != NULL) { + tp->sackhint.hole_bytes += + temp->end - temp->start; + } } } (void) tcp_output(tp); @@ -1054,34 +1064,43 @@ * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ -void +int tcp_sack_adjust(struct tcpcb *tp) { + int sacked = 0; struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tptoinpcb(tp)); - if (cur == NULL) - return; /* No holes */ - if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) - return; /* We're already beyond any SACKed blocks */ + if (cur == NULL) { + /* No holes */ + return (0); + } + if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) { + /* We're already beyond any SACKed blocks */ + return (tp->sackhint.sacked_bytes); + } /*- * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { - if (SEQ_LT(tp->snd_nxt, cur->end)) - return; - if (SEQ_GEQ(tp->snd_nxt, p->start)) + if (SEQ_LT(tp->snd_nxt, cur->end)) { + return (sacked); + } + sacked += p->start - cur->end; + if (SEQ_GEQ(tp->snd_nxt, p->start)) { cur = p; - else { + } else { tp->snd_nxt = p->start; - return; + return (sacked); } } - if (SEQ_LT(tp->snd_nxt, cur->end)) - return; + if (SEQ_LT(tp->snd_nxt, cur->end)) { + return (sacked); + } tp->snd_nxt = 
tp->snd_fack; + return (tp->sackhint.sacked_bytes); } /* diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1494,7 +1494,7 @@ void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); -void tcp_sack_adjust(struct tcpcb *tp); +int tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *);