Index: sys/netinet/tcp_input.c
===================================================================
--- sys/netinet/tcp_input.c
+++ sys/netinet/tcp_input.c
@@ -2467,23 +2467,21 @@
 	}
 	if ((tp->t_flags & TF_SACK_PERMIT) &&
 	    ((to.to_flags & TOF_SACK) ||
-	    !TAILQ_EMPTY(&tp->snd_holes)))
+	    !TAILQ_EMPTY(&tp->snd_holes))) {
 		sack_changed = tcp_sack_doack(tp, &to, th->th_ack);
-	else
+	} else {
 		/*
 		 * Reset the value so that previous (valid) value
 		 * from the last ack with SACK doesn't get used.
 		 */
 		tp->sackhint.sacked_bytes = 0;
-
+	}
 #ifdef TCP_HHOOK
 	/* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */
 	hhook_run_tcp_est_in(tp, th, &to);
 #endif
-
+	u_int maxseg;
 	if (SEQ_LEQ(th->th_ack, tp->snd_una)) {
-		u_int maxseg;
-
 		maxseg = tcp_maxseg(tp);
 		if (tlen == 0 &&
 		    (tiwin == tp->snd_wnd ||
@@ -2573,9 +2571,22 @@
 				tp->snd_cwnd += maxseg;
 				(void) tp->t_fb->tfb_tcp_output(tp);
 				goto drop;
-			} else if (tp->t_dupacks == tcprexmtthresh) {
+			} else if ((tp->t_dupacks == tcprexmtthresh) ||
+				    /*
+				     * Add RFC6675 trigger condition of more
+				     * than (dupthresh-1)*mss sacked data.
+				     * If the count of holes in the
+				     * scoreboard is >= dupthresh, we could
+				     * also enter loss recovery, but don't
+				     * have that value readily available.
+				     */
+				    ((tp->t_flags & TF_SACK_PERMIT) &&
+				    (V_tcp_do_rfc6675_pipe) &&
+				    (tp->sackhint.sacked_bytes >
+				    (tcprexmtthresh - 1) * maxseg))) {
+enter_recovery:
+				tp->t_dupacks = tcprexmtthresh;
 				tcp_seq onxt = tp->snd_nxt;
-
 				/*
 				 * If we're doing sack, check to
 				 * see if we're already in sack
@@ -2607,6 +2618,8 @@
 				tp->sack_newdata = tp->snd_nxt;
 				tp->snd_cwnd = maxseg;
 				(void) tp->t_fb->tfb_tcp_output(tp);
+				if (SEQ_GT(th->th_ack, tp->snd_una))
+					goto resume_partialack;
 				goto drop;
 			}
 			tp->snd_nxt = th->th_ack;
@@ -2684,12 +2697,23 @@
 			tp->t_dupacks = 0;
 			/*
 			 * If this ack also has new SACK info, increment the
-			 * counter as per rfc6675.
+			 * counter as per rfc6675. Start FastRecovery if
+			 * sufficient SACKed bytes were part of this
+			 * partial ACK.
 			 */
-			if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed)
+			if ((V_tcp_do_rfc6675_pipe) &&
+			    (tp->t_flags & TF_SACK_PERMIT) && sack_changed) {
 				tp->t_dupacks++;
+				/* limit overhead by setting maxseg last */
+				if (!IN_FASTRECOVERY(tp->t_flags) &&
+				    (tp->sackhint.sacked_bytes >
+				    ((tcprexmtthresh - 1) *
+				    (maxseg = tcp_maxseg(tp))))) {
+					goto enter_recovery;
+				}
+			}
 		}
-
+resume_partialack:
 		KASSERT(SEQ_GT(th->th_ack, tp->snd_una),
 		    ("%s: th_ack <= snd_una", __func__));
Index: sys/netinet/tcp_sack.c
===================================================================
--- sys/netinet/tcp_sack.c
+++ sys/netinet/tcp_sack.c
@@ -345,9 +345,7 @@
  * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of
  * the sequence space).
  * Returns 1 if incoming ACK has previously unknown SACK information,
- * 0 otherwise. Note: We treat (snd_una, th_ack) as a sack block so any changes
- * to that (i.e. left edge moving) would also be considered a change in SACK
- * information which is slightly different than rfc6675.
+ * 0 otherwise.
  */
 int
 tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack)
 {
@@ -355,16 +353,21 @@
 	struct sackhole *cur, *temp;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 	int i, j, num_sack_blks, sack_changed;
+	int delivered_data, left_edge_delta;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	num_sack_blks = 0;
 	sack_changed = 0;
+	delivered_data = 0;
+	left_edge_delta = 0;
 	/*
 	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
+	 * Account changes to SND.UNA always in delivered data.
 	 */
 	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
+		left_edge_delta = th_ack - tp->snd_una;
 		sack_blocks[num_sack_blks].start = tp->snd_una;
 		sack_blocks[num_sack_blks++].end = th_ack;
 	}
@@ -373,7 +376,6 @@
 	 * received new blocks from the other side.
 	 */
 	if (to->to_flags & TOF_SACK) {
-		tp->sackhint.sacked_bytes = 0;	/* reset */
 		for (i = 0; i < to->to_nsacks; i++) {
 			bcopy((to->to_sacks + i * TCPOLEN_SACK),
 			    &sack, sizeof(sack));
@@ -386,8 +388,6 @@
 			    SEQ_GT(sack.end, tp->snd_una) &&
 			    SEQ_LEQ(sack.end, tp->snd_max)) {
 				sack_blocks[num_sack_blks++] = sack;
-				tp->sackhint.sacked_bytes +=
-				    (sack.end-sack.start);
 			}
 		}
 	}
@@ -412,7 +412,7 @@
 			}
 		}
 	}
-	if (TAILQ_EMPTY(&tp->snd_holes))
+	if (TAILQ_EMPTY(&tp->snd_holes)) {
 		/*
 		 * Empty scoreboard. Need to initialize snd_fack (it may be
 		 * uninitialized or have a bogus value). Scoreboard holes
@@ -421,6 +421,8 @@
 		 * (from the sack blocks received) are created later below (in
 		 * the logic that adds holes to the tail of the scoreboard).
 		 */
 		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
+		tp->sackhint.sacked_bytes = 0;	/* reset */
+	}
 	/*
 	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 	 * SACK holes (snd_holes) are traversed from their tails with just
@@ -444,6 +446,7 @@
 			 */
 			temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
 			if (temp != NULL) {
+				delivered_data += sblkp->end - sblkp->start;
 				tp->snd_fack = sblkp->end;
 				/* Go to the previous sack block. */
 				sblkp--;
@@ -462,10 +465,13 @@
 				sblkp--;
 				if (sblkp >= sack_blocks &&
-				    SEQ_LT(tp->snd_fack, sblkp->end))
+				    SEQ_LT(tp->snd_fack, sblkp->end)) {
+					delivered_data += sblkp->end - tp->snd_fack;
 					tp->snd_fack = sblkp->end;
+				}
 			}
 		} else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
 			/* fack is advanced. */
+			delivered_data += sblkp->end - tp->snd_fack;
 			tp->snd_fack = sblkp->end;
 			sack_changed = 1;
 		}
@@ -499,6 +505,7 @@
 			/* Data acks at least the beginning of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Acks entire hole, so delete hole. */
+				delivered_data += (cur->end - cur->start);
 				temp = cur;
 				cur = TAILQ_PREV(cur, sackhole_head, scblink);
 				tcp_sackhole_remove(tp, temp);
@@ -510,6 +517,7 @@
 				continue;
 			} else {
 				/* Move start of hole forward. */
+				delivered_data += (sblkp->end - cur->start);
 				cur->start = sblkp->end;
 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 			}
@@ -517,6 +525,7 @@
 			/* Data acks at least the end of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Move end of hole backward. */
+				delivered_data += (cur->end - sblkp->start);
 				cur->end = sblkp->start;
 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 			} else {
@@ -536,6 +545,7 @@
 					cur->end = sblkp->start;
 					cur->rxmit = SEQ_MIN(cur->rxmit,
 					    cur->end);
+					delivered_data += (sblkp->end - sblkp->start);
 				}
 			}
 		}
@@ -550,6 +560,18 @@
 		else
 			sblkp--;
 	}
+	tp->sackhint.delivered_data = delivered_data;
+	tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
+	if (!(to->to_flags & TOF_SACK))
+		/*
+		 * If this ACK did not contain any
+		 * SACK blocks, and only moved the
+		 * left edge right, it is a pure
+		 * cumulative ACK. Do not count
+		 * a DupAck for this. Also required
+		 * for RFC6675 rescue retransmission.
+		 */
+		sack_changed = 0;
 	return (sack_changed);
 }
@@ -595,6 +617,31 @@
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
+	/*
+	 * RFC6675 rescue retransmission
+	 * Add a hole between th_ack (snd_una is not yet set) and snd_max,
+	 * if this was a pure cumulative ACK and no data was sent beyond
+	 * the recovery point. Since the data in the socket has not been
+	 * freed at this point, we check if the scoreboard is empty, and
+	 * the ACK delivered some new data, indicating a full ACK.
+	 * Also, if the recovery point is still at snd_max, we are
+	 * probably application limited. However, this inference might
+	 * not always be true. The rescue retransmission may rarely be
+	 * slightly premature compared to RFC6675.
+	 * The corresponding ACK+SACK will cause any further outstanding
+	 * segments to be retransmitted. This addresses a corner case
+	 * when the trailing packets of a window are lost and no further
+	 * data is available for sending.
+	 */
+	if ((V_tcp_do_rfc6675_pipe) &&
+	    SEQ_LT(th->th_ack, tp->snd_recover) &&
+	    (tp->snd_recover == tp->snd_max) &&
+	    TAILQ_EMPTY(&tp->snd_holes) &&
+	    (tp->sackhint.delivered_data > 0)) {
+		struct sackhole *hole;
+		int maxseg = tcp_maxseg(tp);
+		hole = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, tp->snd_max - maxseg), tp->snd_max, NULL);
+	}
 	(void) tp->t_fb->tfb_tcp_output(tp);
 }
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -70,15 +70,15 @@
 struct sackhint {
 	struct sackhole	*nexthole;
-	int		sack_bytes_rexmit;
+	int32_t		sack_bytes_rexmit;
 	tcp_seq		last_sack_ack;	/* Most recent/largest sacked ack */
 
-	int		ispare;		/* explicit pad for 64bit alignment */
-	int		sacked_bytes;	/*
-					 * Total sacked bytes reported by the
+	int32_t		delivered_data;	/* Newly acked data from last SACK */
+
+	int32_t		sacked_bytes;	/* Total sacked bytes reported by the
 					 * receiver via sack option
 					 */
-	uint32_t	_pad1[1];	/* TBD */
+	int32_t		_pad1[1];	/* TBD */
 	uint64_t	_pad[1];	/* TBD */
 };
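A note for reviewers, outside the patch itself: the byte-based trigger added in the tcp_input.c hunks can be read in isolation. The stand-alone C sketch below restates that heuristic only; the function name and its parameters are invented for this example and do not exist anywhere in the FreeBSD tree. It shows the two ways loss recovery can start under the patch: after dupthresh duplicate ACKs as before, or, with SACK in use, as soon as more than (dupthresh - 1) * SMSS bytes have been reported as SACKed.

/*
 * Illustration of the recovery trigger added above. Hypothetical
 * names; only the inequalities mirror the patch.
 */
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

static bool
should_enter_recovery(int dupacks, int dupthresh, bool sack_permit,
    uint32_t sacked_bytes, uint32_t maxseg)
{
	/* Classic trigger: dupthresh duplicate ACKs (RFC 5681). */
	if (dupacks >= dupthresh)
		return (true);
	/*
	 * RFC6675-style trigger from the patch: more than
	 * (dupthresh - 1) * SMSS bytes reported as SACKed.
	 */
	if (sack_permit &&
	    sacked_bytes > (uint32_t)(dupthresh - 1) * maxseg)
		return (true);
	return (false);
}

int
main(void)
{
	/* Two SACKed 1448-byte segments: still below the threshold. */
	printf("%d\n", should_enter_recovery(1, 3, true, 2 * 1448, 1448));
	/* A third SACKed segment crosses (dupthresh - 1) * SMSS. */
	printf("%d\n", should_enter_recovery(1, 3, true, 3 * 1448, 1448));
	return (0);
}

Both new conditions in the patch are additionally gated on V_tcp_do_rfc6675_pipe (the net.inet.tcp.rfc6675_pipe sysctl), so unless that knob is enabled the existing dupack-counting behaviour should be unchanged.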