diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -439,10 +439,7 @@ case CC_RTO: tp->t_dupacks = 0; tp->t_bytes_acked = 0; - if ((tp->t_rxtshift > 1) || - !((tp->t_flags & TF_SACK_PERMIT) && - (!TAILQ_EMPTY(&tp->snd_holes)))) - EXIT_RECOVERY(tp->t_flags); + EXIT_RECOVERY(tp->t_flags); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; @@ -2612,26 +2609,36 @@ tcp_do_prr_ack(tp, th, &to, sack_changed, &maxseg); } else if (tcp_is_sack_recovery(tp, &to) && - IN_FASTRECOVERY(tp->t_flags)) { + IN_FASTRECOVERY(tp->t_flags) && + (tp->snd_nxt == tp->snd_max)) { int awnd; /* - * Compute the amount of data in flight first. - * We can inject new data into the pipe iff - * we have less than 1/2 the original window's - * worth of data in flight. + * Compute the amount of data in flight + * first. We can inject new data into + * the pipe iff we have less than + * ssthresh worth of data in flight. */ if (V_tcp_do_newsack) { awnd = tcp_compute_pipe(tp); } else { - awnd = (tp->snd_nxt - tp->snd_fack) + - tp->sackhint.sack_bytes_rexmit; + awnd = tp->snd_nxt - tp->snd_fack + + tp->sackhint.sack_bytes_rexmit; } if (awnd < tp->snd_ssthresh) { - tp->snd_cwnd += maxseg; + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); if (tp->snd_cwnd > tp->snd_ssthresh) - tp->snd_cwnd = tp->snd_ssthresh; + tp->snd_cwnd = + tp->snd_ssthresh; } + } else if (tcp_is_sack_recovery(tp, &to) && + IN_FASTRECOVERY(tp->t_flags) && + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + tp->snd_cwnd += imax(maxseg, + imin(2 * maxseg, + tp->sackhint.delivered_data)); } else { tp->snd_cwnd += maxseg; } @@ -2655,14 +2662,13 @@ tcp_seq onxt = tp->snd_nxt; /* - * If we're doing sack, or prr, check - * to see if we're already in sack + * If we're doing sack, check to + * see if we're already in sack * recovery. If we're not doing sack, * check to see if we're in newreno * recovery. 
*/ - if (V_tcp_do_prr || - (tp->t_flags & TF_SACK_PERMIT)) { + if (tcp_is_sack_recovery(tp, &to)) { if (IN_FASTRECOVERY(tp->t_flags)) { tp->t_dupacks = 0; break; @@ -2676,37 +2682,46 @@ } /* Congestion signal before ack. */ cc_cong_signal(tp, th, CC_NDUPACK); - cc_ack_received(tp, th, nsegs, - CC_DUPACK); + cc_ack_received(tp, th, nsegs, CC_DUPACK); tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; if (V_tcp_do_prr) { /* - * snd_ssthresh is already updated by - * cc_cong_signal. + * snd_ssthresh and snd_recover are + * already updated by cc_cong_signal. */ if (tcp_is_sack_recovery(tp, &to)) { /* - * Exclude Limited Transmit + * Include Limited Transmit * segments here */ tp->sackhint.prr_delivered = - maxseg; + imin(tp->snd_max - th->th_ack, + (tp->snd_limited + 1) * maxseg); } else { tp->sackhint.prr_delivered = - imin(tp->snd_max - tp->snd_una, - imin(INT_MAX / 65536, - tp->t_dupacks) * maxseg); + maxseg; } tp->sackhint.recover_fs = max(1, tp->snd_nxt - tp->snd_una); } + tp->snd_limited = 0; if (tcp_is_sack_recovery(tp, &to)) { TCPSTAT_INC(tcps_sack_recovery_episode); - tp->snd_recover = tp->snd_nxt; - tp->snd_cwnd = maxseg; + /* + * When entering LR after RTO due to + * Duplicate ACKs, retransmit existing + * holes from the scoreboard. + */ + tcp_resend_sackholes(tp); + /* Avoid inflating cwnd in tcp_output */ + tp->snd_nxt = tp->snd_max; + tp->snd_cwnd = tcp_compute_pipe(tp) + + maxseg; (void) tcp_output(tp); + /* Set cwnd to the expected flightsize */ + tp->snd_cwnd = tp->snd_ssthresh; if (SEQ_GT(th->th_ack, tp->snd_una)) { goto resume_partialack; } goto drop; @@ -2747,7 +2762,8 @@ if (tp->t_dupacks == 1) tp->snd_limited = 0; tp->snd_cwnd = - (tp->snd_nxt - tp->snd_una) + + SEQ_SUB(tp->snd_nxt, tp->snd_una) - + tcp_sack_adjust(tp) + (tp->t_dupacks - tp->snd_limited) * maxseg; /* @@ -2796,7 +2812,10 @@ * counted as dupacks here.
*/ if (tcp_is_sack_recovery(tp, &to) && - (sack_changed != SACK_NOCHANGE)) { + ((sack_changed == SACK_NEWLOSS) || + ((sack_changed != SACK_NOCHANGE) && + ((tp->t_rxtshift == 0) || + (tp->snd_nxt == tp->snd_max))))) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && @@ -3003,9 +3022,8 @@ SEQ_GEQ(th->th_ack, tp->snd_recover)) { cc_post_recovery(tp, th); } - if (tp->t_flags & TF_SACK_PERMIT) { - if (SEQ_GT(tp->snd_una, tp->snd_recover)) - tp->snd_recover = tp->snd_una; + if (SEQ_GT(tp->snd_una, tp->snd_recover)) { + tp->snd_recover = tp->snd_una; } if (SEQ_LT(tp->snd_nxt, tp->snd_una)) tp->snd_nxt = tp->snd_una; @@ -4040,9 +4058,7 @@ */ if (IN_FASTRECOVERY(tp->t_flags)) { if (tcp_is_sack_recovery(tp, to)) { - tp->snd_cwnd = tp->snd_nxt - tp->snd_recover + - tp->sackhint.sack_bytes_rexmit + - (snd_cnt * maxseg); + tp->snd_cwnd = pipe - del_data + (snd_cnt * maxseg); } else { tp->snd_cwnd = (tp->snd_max - tp->snd_una) + (snd_cnt * maxseg); @@ -4070,17 +4086,19 @@ tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; - tp->snd_nxt = th->th_ack; - /* - * Set snd_cwnd to one segment beyond acknowledged offset. - * (tp->snd_una has not yet been updated when this function is called.) - */ - tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); - tp->t_flags |= TF_ACKNOW; - (void) tcp_output(tp); - tp->snd_cwnd = ocwnd; - if (SEQ_GT(onxt, tp->snd_nxt)) - tp->snd_nxt = onxt; + if (IN_FASTRECOVERY(tp->t_flags)) { + tp->snd_nxt = th->th_ack; + /* + * Set snd_cwnd to one segment beyond acknowledged offset. + * (tp->snd_una has not yet been updated when this function is called.) + */ + tp->snd_cwnd = maxseg + BYTES_THIS_ACK(tp, th); + tp->t_flags |= TF_ACKNOW; + (void) tcp_output(tp); + tp->snd_cwnd = ocwnd; + if (SEQ_GT(onxt, tp->snd_nxt)) + tp->snd_nxt = onxt; + } /* * Partial window deflation. Relies on fact that tp->snd_una * not updated yet. 
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -264,19 +264,22 @@ } } again: + sendwin = 0; /* * If we've recently taken a timeout, snd_max will be greater than * snd_nxt. There may be SACK information that allows us to avoid * resending already delivered data. Adjust snd_nxt accordingly. */ if ((tp->t_flags & TF_SACK_PERMIT) && - SEQ_LT(tp->snd_nxt, tp->snd_max)) - tcp_sack_adjust(tp); + (tp->sackhint.nexthole != NULL) && + !IN_FASTRECOVERY(tp->t_flags)) { + sendwin = tcp_sack_adjust(tp); + } sendalot = 0; tso = 0; mtu = 0; off = tp->snd_nxt - tp->snd_una; - sendwin = min(tp->snd_wnd, tp->snd_cwnd); + sendwin = min(tp->snd_wnd, tp->snd_cwnd + sendwin); flags = tcp_outflags[tp->t_state]; /* @@ -293,12 +296,16 @@ sack_bytes_rxmt = 0; len = 0; p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && + if ((tp->t_flags & TF_SACK_PERMIT) && + (IN_FASTRECOVERY(tp->t_flags) || SEQ_LT(tp->snd_nxt, tp->snd_max)) && (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { uint32_t cwin; - cwin = - imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); + if (IN_FASTRECOVERY(tp->t_flags)) { + cwin = imax(sendwin - tcp_compute_pipe(tp), 0); + } else { + cwin = imax(sendwin - off, 0); + } /* Do not retransmit SACK segments beyond snd_recover */ if (SEQ_GT(p->end, tp->snd_recover)) { /* @@ -317,19 +324,34 @@ goto after_sack_rexmit; } else { /* Can rexmit part of the current hole */ - len = ((int32_t)ulmin(cwin, - SEQ_SUB(tp->snd_recover, p->rxmit))); + if (cwin <= (len = SEQ_SUB(tp->snd_recover, p->rxmit))) { + len = cwin; + } else { + sendalot = 1; + } } } else { - len = ((int32_t)ulmin(cwin, - SEQ_SUB(p->end, p->rxmit))); + if (cwin <= (len = SEQ_SUB(p->end, p->rxmit))) { + len = cwin; + } else { + sendalot = 1; + } } if (len > 0) { off = SEQ_SUB(p->rxmit, tp->snd_una); KASSERT(off >= 0,("%s: sack block to the left of una : %d", __func__, off)); sack_rxmit = 1; - sendalot = 1; 
+ } else { + /* we could have transmitted from the scoreboard, + * but sendwin (expected flightsize) - pipe didn't + * allow any transmission. + * Bypass recalculating the possible transmission + * length further down by setting sack_rxmit. + * Wouldn't be here if there would have been + * nothing in the scoreboard to transmit. + */ + sack_rxmit = 1; } } after_sack_rexmit: @@ -392,35 +414,17 @@ * in which case len is already set. */ if (sack_rxmit == 0) { - if (sack_bytes_rxmt == 0) { - len = ((int32_t)min(sbavail(&so->so_snd), sendwin) - - off); + if ((sack_bytes_rxmt == 0) || + SEQ_LT(tp->snd_nxt, tp->snd_max)) { + len = min(sbavail(&so->so_snd), sendwin) - off; } else { - int32_t cwin; - /* * We are inside of a SACK recovery episode and are * sending new data, having retransmitted all the * data possible in the scoreboard. */ - len = ((int32_t)min(sbavail(&so->so_snd), tp->snd_wnd) - - off); - /* - * Don't remove this (len > 0) check ! - * We explicitly check for len > 0 here (although it - * isn't really necessary), to work around a gcc - * optimization issue - to force gcc to compute - * len above. Without this check, the computation - * of len is bungled by the optimizer. - */ - if (len > 0) { - cwin = tp->snd_cwnd - imax(0, (int32_t) - (tp->snd_nxt - tp->snd_recover)) - - sack_bytes_rxmt; - if (cwin < 0) - cwin = 0; - len = imin(len, cwin); - } + len = imin(sbavail(&so->so_snd) - off, + sendwin - tcp_compute_pipe(tp)); } } @@ -551,16 +555,19 @@ ipoptlen += ipsec_optlen; #endif - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + if ((len > tp->t_maxseg) && + (tp->t_flags & TF_TSO) && + V_tcp_do_tso && (tp->t_port == 0) && - ((tp->t_flags & TF_SIGNATURE) == 0) && - tp->rcv_numsacks == 0 && sack_rxmit == 0 && - ipoptlen == 0 && !(flags & TH_SYN)) + !(tp->t_flags & TF_SIGNATURE) && + (tp->rcv_numsacks == 0) && + (ipoptlen == 0) && + !(flags & TH_SYN)) tso = 1; if (SEQ_LT((sack_rxmit ? 
p->rxmit : tp->snd_nxt) + len, tp->snd_una + sbused(&so->so_snd))) { - flags &= ~TH_FIN; + flags &= ~TH_FIN; } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), @@ -605,9 +612,8 @@ if (!(tp->t_flags & TF_MORETOCOME) && /* normal case */ (idle || (tp->t_flags & TF_NODELAY)) && (uint32_t)len + (uint32_t)off >= sbavail(&so->so_snd) && - (tp->t_flags & TF_NOPUSH) == 0) { + (tp->t_flags & TF_NOPUSH) == 0) goto send; - } if (tp->t_flags & TF_FORCEDATA) /* typ. timeout case */ goto send; if (len >= tp->max_sndwnd / 2 && tp->max_sndwnd > 0) @@ -1633,6 +1639,15 @@ /* Clean up any DSACK's sent */ tcp_clean_dsack_blocks(tp); } + if ((error == 0) && + sack_rxmit && + SEQ_LT(tp->snd_nxt, SEQ_MIN(p->rxmit, p->end))) { + /* + * When transmitting from SACK scoreboard + * after an RTO, pull snd_nxt along. + */ + tp->snd_nxt = SEQ_MIN(p->rxmit, p->end); + } if (error) { /* * We know that the packet was lost, so back out the diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -953,16 +953,16 @@ /* Send one or 2 segments based on how much new data was acked. */ if ((BYTES_THIS_ACK(tp, th) / maxseg) >= 2) num_segs = 2; - if (V_tcp_do_newsack) { - tp->snd_cwnd = imax(tp->snd_nxt - th->th_ack + - tp->sackhint.sack_bytes_rexmit - - tp->sackhint.sacked_bytes - - tp->sackhint.lost_bytes, maxseg) + - num_segs * maxseg; - } else { + if (tp->snd_nxt == tp->snd_max) { tp->snd_cwnd = (tp->sackhint.sack_bytes_rexmit + - imax(0, tp->snd_nxt - tp->snd_recover) + - num_segs * maxseg); + (tp->snd_nxt - tp->snd_recover) + num_segs * maxseg); + } else { + /* + * Since cwnd is now the expected flightsize during + * SACK LR, not deflating cwnd allows the partial + * ACKed amount to be sent.
+ */ + tp->snd_cwnd += 0; } if (tp->snd_cwnd > tp->snd_ssthresh) tp->snd_cwnd = tp->snd_ssthresh; @@ -998,7 +998,7 @@ highdata--; highdata = SEQ_MIN(highdata, tp->snd_recover); if (SEQ_LT(th->th_ack, highdata)) { - tp->snd_fack = th->th_ack; + tp->snd_fack = SEQ_MAX(th->th_ack, tp->snd_fack); if ((temp = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, highdata - maxseg), highdata, NULL)) != NULL) { tp->sackhint.hole_bytes += @@ -1069,40 +1069,43 @@ * should be used to avoid retransmitting SACKed data. This function * traverses the SACK list to see if snd_nxt should be moved forward. */ -void +int tcp_sack_adjust(struct tcpcb *tp) { + int sacked = 0; struct sackhole *p, *cur = TAILQ_FIRST(&tp->snd_holes); INP_WLOCK_ASSERT(tptoinpcb(tp)); if (cur == NULL) { /* No holes */ - return; + return (0); } if (SEQ_GEQ(tp->snd_nxt, tp->snd_fack)) { /* We're already beyond any SACKed blocks */ - return; + return (tp->sackhint.sacked_bytes); } - /*- + /* * Two cases for which we want to advance snd_nxt: * i) snd_nxt lies between end of one hole and beginning of another * ii) snd_nxt lies between end of last hole and snd_fack */ while ((p = TAILQ_NEXT(cur, scblink)) != NULL) { if (SEQ_LT(tp->snd_nxt, cur->end)) { - return; + return (sacked); } + sacked += p->start - cur->end; if (SEQ_GEQ(tp->snd_nxt, p->start)) { cur = p; } else { tp->snd_nxt = p->start; - return; + return (sacked); } } if (SEQ_LT(tp->snd_nxt, cur->end)) { - return; + return (sacked); } tp->snd_nxt = tp->snd_fack; + return (tp->sackhint.sacked_bytes); } /* diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -1485,7 +1485,7 @@ tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); void tcp_clean_sackreport(struct tcpcb *tp); -void tcp_sack_adjust(struct tcpcb *tp); +int tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct 
tcpopt *, sackstatus_t, u_int *);