Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -2467,15 +2467,25 @@ } if ((tp->t_flags & TF_SACK_PERMIT) && ((to.to_flags & TOF_SACK) || - !TAILQ_EMPTY(&tp->snd_holes))) + !TAILQ_EMPTY(&tp->snd_holes))) { sack_changed = tcp_sack_doack(tp, &to, th->th_ack); - else + LOGTCPCBSTATE; + if (TAILQ_EMPTY(&tp->snd_holes) && + !(to.to_flags & TOF_SACK) && + ((tp->snd_max - th->th_ack) == sbavail(&so->so_snd)) && + SEQ_LT(th->th_ack, tp->snd_recover)) { + if (so->so_options & SO_DEBUG) + log(LOG_DEBUG,"rfc6675 rescue retransmission\n"); + } + } else { /* * Reset the value so that previous (valid) value * from the last ack with SACK doesn't get used. */ tp->sackhint.sacked_bytes = 0; - + tp->sackhint.sacked_bytes_old = 0; + LOGTCPCBSTATE; + } #ifdef TCP_HHOOK /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, &to); @@ -2518,7 +2528,7 @@ * window size so do congestion avoidance * (set ssthresh to half the current window * and pull our congestion window back to - * the new ssthresh). + * the new ssthresh). 
* * Dup acks mean that packets have left the * network (they're now cached at the receiver) @@ -2540,9 +2550,17 @@ */ if (th->th_ack != tp->snd_una || ((tp->t_flags & TF_SACK_PERMIT) && - !sack_changed)) + !sack_changed)) { +// log(LOG_DEBUG,"tcp_input:2569 falling through here %u\n", tp->snd_fack); +// if (SEQ_LT(th->th_ack, tp->snd_recover) && +// (tp->t_flags & TF_SACK_PERMIT) && +// TAILQ_EMPTY(&tp->snd_holes) && +// ((tp->snd_max - tp->snd_una) == sbavail(&so->so_snd))) { +// log(LOG_DEBUG,"adding hole for rescue rexmit\n"); +// tcp_sackhole_insert(tp, tp->snd_una, tp->snd_max, NULL); +// } break; - else if (!tcp_timer_active(tp, TT_REXMT)) + } else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; else if (++tp->t_dupacks > tcprexmtthresh || IN_FASTRECOVERY(tp->t_flags)) { @@ -2571,9 +2589,23 @@ } } else tp->snd_cwnd += maxseg; + (void) tp->t_fb->tfb_tcp_output(tp); goto drop; - } else if (tp->t_dupacks == tcprexmtthresh) { + } else if ((tp->t_dupacks == tcprexmtthresh) || + /* + * Add RFC6675 trigger condition of more + * than (dupthresh-1)*mss sacked data. + * If the count of holes in the + * scoreboard is >= dupthresh, we could + * also enter loss recovery, but don't + * have that value readily available. + */ + ((tp->t_flags & TF_SACK_PERMIT) && + (V_tcp_do_rfc6675_pipe) && + (tp->sackhint.sacked_bytes > + (tcprexmtthresh - 1) * maxseg))) { + tp->t_dupacks = tcprexmtthresh; tcp_seq onxt = tp->snd_nxt; /* @@ -2686,8 +2718,10 @@ * If this ack also has new SACK info, increment the * counter as per rfc6675. 
*/ - if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) + if ((tp->t_flags & TF_SACK_PERMIT) && sack_changed) { tp->t_dupacks++; + } + LOGTCPCBSTATE; } KASSERT(SEQ_GT(th->th_ack, tp->snd_una), Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -54,6 +54,7 @@ #include #include #include +#include #include #include @@ -285,43 +286,71 @@ sack_bytes_rxmt = 0; len = 0; p = NULL; - if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags) && - (p = tcp_sack_output(tp, &sack_bytes_rxmt))) { + + LOGTCPCBSTATE2; + + if ((tp->t_flags & TF_SACK_PERMIT) && IN_FASTRECOVERY(tp->t_flags)) { + p = tcp_sack_output(tp, &sack_bytes_rxmt); + /* + * RFC6675 Rescue Retransmission + * when no new data is available, and + * all Scoreboard Holes were retransmitted, + * resend 1 MSS just beneath snd_max + */ uint32_t cwin; cwin = imax(min(tp->snd_wnd, tp->snd_cwnd) - sack_bytes_rxmt, 0); - /* Do not retransmit SACK segments beyond snd_recover */ - if (SEQ_GT(p->end, tp->snd_recover)) { - /* - * (At least) part of sack hole extends beyond - * snd_recover. Check to see if we can rexmit data - * for this hole. - */ - if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { +/* if (V_tcp_do_rfc6675_pipe && (p == NULL) && + SEQ_GT(tp->snd_max, tp->snd_una) && + ((tp->snd_max - tp->snd_una) == sbavail(&so->so_snd))) { +// if (so->so_options & SO_DEBUG) { +// log(LOG_DEBUG,"rfc6675 rescue retransmission"); +// } + len = ((int32_t)ulmin(tp->t_maxseg, cwin)); + tp->snd_nxt = tp->snd_max - len - tp->snd_una; + sendalot = 1; + TCPSTAT_INC(tcps_sack_rescxmits); + TCPSTAT_ADD(tcps_sack_rescxmit_bytes, len); + } +*/ if (p != NULL) { + /* Do not retransmit SACK segments beyond snd_recover */ + if (SEQ_GT(p->end, tp->snd_recover)) { /* - * Can't rexmit any more data for this hole. - * That data will be rexmitted in the next - * sack recovery episode, when snd_recover - * moves past p->rxmit. 
+ * (At least) part of sack hole extends beyond + * snd_recover. Check to see if we can rexmit data + * for this hole. */ - p = NULL; - goto after_sack_rexmit; - } else - /* Can rexmit part of the current hole */ - len = ((int32_t)ulmin(cwin, + if (SEQ_GEQ(p->rxmit, tp->snd_recover)) { + /* + * Can't rexmit any more data for this hole. + * That data will be rexmitted in the next + * sack recovery episode, when snd_recover + * moves past p->rxmit. + */ + p = NULL; + goto after_sack_rexmit; + } else + /* Can rexmit part of the current hole */ + len = ((int32_t)ulmin(cwin, tp->snd_recover - p->rxmit)); - } else - len = ((int32_t)ulmin(cwin, p->end - p->rxmit)); - off = p->rxmit - tp->snd_una; - KASSERT(off >= 0,("%s: sack block to the left of una : %d", - __func__, off)); - if (len > 0) { - sack_rxmit = 1; - sendalot = 1; - TCPSTAT_INC(tcps_sack_rexmits); - TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, tp->t_maxseg)); + } else + len = ((int32_t)ulmin(cwin, p->end - p->rxmit)); + off = p->rxmit - tp->snd_una; + if (off < 0) { + log(LOG_DEBUG,"near panic: una: %u, rxmit: %u, start: %u, end:%u, len: %i\n", + tp->snd_una - tp->iss, p->rxmit - tp->iss, p->start - tp->iss, p->end - tp->iss, len); + off = 0; + } + KASSERT(off >= 0,("%s: sack block to the left of una : %d", + __func__, off)); + if (len > 0) { + sack_rxmit = 1; + sendalot = 1; + TCPSTAT_INC(tcps_sack_rexmits); + TCPSTAT_ADD(tcps_sack_rexmit_bytes, + min(len, tp->t_maxseg)); + } } } after_sack_rexmit: @@ -1405,6 +1434,8 @@ TCP_PROBE5(connect__request, NULL, tp, ip, tp, th); TCP_PROBE5(send, NULL, tp, ip, tp, th); + if (so->so_options & SO_DEBUG) + log(LOG_DEBUG, "tcp_output:1444 hand off to IP\n"); #ifdef TCPPCAP /* Save packet, if requested. 
*/
# NOTE(review): line structure and indentation of this section were
# reconstructed from a whitespace-collapsed copy; apply with `patch -l`.
# Changes relative to the posted patch: braces added around the
# snd_fack update after a failed hole insert (hunk -462), NULL guard on
# the rescue-retransmission debug logging (hunk -595), `if SEQ_LT(...)`
# parenthesized, comment typos fixed; later hunk offsets adjusted.
Index: sys/netinet/tcp_sack.c
===================================================================
--- sys/netinet/tcp_sack.c
+++ sys/netinet/tcp_sack.c
@@ -355,25 +355,33 @@
 	struct sackhole *cur, *temp;
 	struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp;
 	int i, j, num_sack_blks, sack_changed;
+	int delivered_data, left_edge_delta;
 
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 
 	num_sack_blks = 0;
 	sack_changed = 0;
+	delivered_data = 0;
+	left_edge_delta = 0;
 	/*
 	 * If SND.UNA will be advanced by SEG.ACK, and if SACK holes exist,
 	 * treat [SND.UNA, SEG.ACK) as if it is a SACK block.
+	 * Account changes to SND.UNA always in delivered data.
 	 */
-	if (SEQ_LT(tp->snd_una, th_ack) && !TAILQ_EMPTY(&tp->snd_holes)) {
-		sack_blocks[num_sack_blks].start = tp->snd_una;
-		sack_blocks[num_sack_blks++].end = th_ack;
+	if (SEQ_LT(tp->snd_una, th_ack)) {
+//		delivered_data = th_ack - tp->snd_una;
+		left_edge_delta = th_ack - tp->snd_una;
+		if (!TAILQ_EMPTY(&tp->snd_holes)) {
+			sack_blocks[num_sack_blks].start = tp->snd_una;
+			sack_blocks[num_sack_blks++].end = th_ack;
+		}
 	}
 	/*
 	 * Append received valid SACK blocks to sack_blocks[], but only if we
 	 * received new blocks from the other side.
 	 */
 	if (to->to_flags & TOF_SACK) {
-		tp->sackhint.sacked_bytes = 0;	/* reset */
+		tp->sackhint.sacked_bytes_old = 0;	/* reset */
 		for (i = 0; i < to->to_nsacks; i++) {
 			bcopy((to->to_sacks + i * TCPOLEN_SACK),
 			    &sack, sizeof(sack));
@@ -386,8 +394,8 @@
 			    SEQ_GT(sack.end, tp->snd_una) &&
 			    SEQ_LEQ(sack.end, tp->snd_max)) {
 				sack_blocks[num_sack_blks++] = sack;
-				tp->sackhint.sacked_bytes +=
-				    (sack.end-sack.start);
+				tp->sackhint.sacked_bytes_old +=
+				    (sack.end - sack.start);
 			}
 		}
 	}
@@ -412,7 +420,7 @@
 			}
 		}
 	}
-	if (TAILQ_EMPTY(&tp->snd_holes))
+	if (TAILQ_EMPTY(&tp->snd_holes)) {
 		/*
 		 * Empty scoreboard. Need to initialize snd_fack (it may be
 		 * uninitialized or have a bogus value). Scoreboard holes
@@ -421,6 +429,8 @@
 		 * scoreboard).
 		 */
 		tp->snd_fack = SEQ_MAX(tp->snd_una, th_ack);
+		tp->sackhint.sacked_bytes = 0;	/* reset */
+	}
 	/*
 	 * In the while-loop below, incoming SACK blocks (sack_blocks[]) and
 	 * SACK holes (snd_holes) are traversed from their tails with just
@@ -444,6 +454,7 @@
 		 */
 		temp = tcp_sackhole_insert(tp, tp->snd_fack,sblkp->start,NULL);
 		if (temp != NULL) {
+			delivered_data += sblkp->end - sblkp->start;
 			tp->snd_fack = sblkp->end;
 			/* Go to the previous sack block. */
 			sblkp--;
@@ -462,10 +473,13 @@
 				sblkp--;
 			if (sblkp >= sack_blocks &&
-			    SEQ_LT(tp->snd_fack, sblkp->end))
+			    SEQ_LT(tp->snd_fack, sblkp->end)) {
+				delivered_data += sblkp->end - tp->snd_fack;
 				tp->snd_fack = sblkp->end;
+			}
 		}
 	} else if (SEQ_LT(tp->snd_fack, sblkp->end)) {
 		/* fack is advanced. */
+		delivered_data += sblkp->end - tp->snd_fack;
 		tp->snd_fack = sblkp->end;
 		sack_changed = 1;
 	}
@@ -499,6 +513,7 @@
 			/* Data acks at least the beginning of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Acks entire hole, so delete hole. */
+				delivered_data += (cur->end - cur->start);
 				temp = cur;
 				cur = TAILQ_PREV(cur, sackhole_head, scblink);
 				tcp_sackhole_remove(tp, temp);
@@ -510,6 +525,7 @@
 				continue;
 			} else {
 				/* Move start of hole forward. */
+				delivered_data += (sblkp->end - cur->start);
 				cur->start = sblkp->end;
 				cur->rxmit = SEQ_MAX(cur->rxmit, cur->start);
 			}
@@ -517,6 +533,7 @@
 			/* Data acks at least the end of hole. */
 			if (SEQ_GEQ(sblkp->end, cur->end)) {
 				/* Move end of hole backward. */
+				delivered_data += (cur->end - sblkp->start);
 				cur->end = sblkp->start;
 				cur->rxmit = SEQ_MIN(cur->rxmit, cur->end);
 			} else {
@@ -536,6 +553,7 @@
 					cur->end = sblkp->start;
 					cur->rxmit = SEQ_MIN(cur->rxmit,
 					    cur->end);
+					delivered_data += (sblkp->end - sblkp->start);
 				}
 			}
 		}
@@ -550,6 +568,18 @@
 		else
 			sblkp--;
 	}
+	tp->sackhint.delivered_data = delivered_data;
+	tp->sackhint.sacked_bytes += delivered_data - left_edge_delta;
+	if (!(to->to_flags & TOF_SACK))
+		/*
+		 * If this ACK did not contain any
+		 * SACK blocks, and only moved the
+		 * left edge right, it is a pure
+		 * cumulative ACK. Do not count
+		 * DupAck for this. Also required
+		 * for RFC6675 rescue retransmission.
+		 */
+		sack_changed = 0;
 	return (sack_changed);
 }
 
@@ -595,7 +625,49 @@
 	if (tp->snd_cwnd > tp->snd_ssthresh)
 		tp->snd_cwnd = tp->snd_ssthresh;
 	tp->t_flags |= TF_ACKNOW;
+	/*
+	 * RFC6675 rescue retransmission
+	 * Add a hole between th_ack (una is not yet set) and snd_max,
+	 * if this was a pure cumulative ACK and no data was sent beyond
+	 * recovery point. Since the data in the socket has not been freed
+	 * at this point, this may still happen when more new data is ready to
+	 * send. The rescue retransmission may be slightly premature
+	 * compared to RFC6675.
+	 */
+	if ((V_tcp_do_rfc6675_pipe) &&
+	    SEQ_LT(th->th_ack, tp->snd_recover) &&
+	    (tp->snd_recover == tp->snd_max) &&
+	    TAILQ_EMPTY(&tp->snd_holes) &&
+	    (tp->sackhint.delivered_data > 0)) {
+		struct sackhole *hole;
+		int maxseg = tcp_maxseg(tp);
+		hole = tcp_sackhole_insert(tp, SEQ_MAX(th->th_ack, tp->snd_max - maxseg), tp->snd_max, NULL);
+//		if ((tp->snd_max - th->th_ack) > maxseg) { // do this with PRR to avoid bursts.
+			/*
+			 * have to insert lower hole after
+			 * rescue retransmission, for
+			 * sackhint updates to pick this up
+			 */
+//			hole = tcp_sackhole_insert(tp, th->th_ack, tp->snd_max - maxseg, NULL);
+//			log(LOG_DEBUG,"low hole %u - %u <- %u\n", hole->start - tp->iss, hole->end - tp->iss, hole->rxmit - tp->iss);
+//		}
+		/*
+		 * tcp_sackhole_insert() may return NULL (its result is
+		 * NULL-checked where it is used earlier in this patch),
+		 * so only dereference the hole pointers when both are set.
+		 */
+		if (hole != NULL && tp->sackhint.nexthole != NULL) {
+			log(LOG_DEBUG,"high hole %u - %u <- %u\n", tp->sackhint.nexthole->start - tp->iss, tp->sackhint.nexthole->end - tp->iss, tp->sackhint.nexthole->rxmit - tp->iss);
+			log(LOG_DEBUG,"nexthole: %p (%u) hole: %p (%u)\n",
+			    (void *)tp->sackhint.nexthole, tp->sackhint.nexthole->start - tp->iss,
+			    (void *)hole, hole->start - tp->iss);
+		}
+	}
+
 	(void) tp->t_fb->tfb_tcp_output(tp);
+
+	struct socket *so = tp->t_inpcb->inp_socket;
+	LOGTCPCBSTATE2;
 }
 
 #if 0
@@ -649,6 +721,8 @@
 	INP_WLOCK_ASSERT(tp->t_inpcb);
 	*sack_bytes_rexmt = tp->sackhint.sack_bytes_rexmit;
 	hole = tp->sackhint.nexthole;
+	struct socket *so = tp->t_inpcb->inp_socket;
+	LOGTCPCBSTATE2;
 	if (hole == NULL || SEQ_LT(hole->rxmit, hole->end))
 		goto out;
 	while ((hole = TAILQ_NEXT(hole, scblink)) != NULL) {
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -43,6 +43,89 @@
 #include
 #endif
+#define STRIPPATH(s)\
+    (sizeof(s) > 2 && (s)[sizeof(s)-2] == '/' ? (s) + sizeof(s) - 1 : \
+    sizeof(s) > 3 && (s)[sizeof(s)-3] == '/' ? (s) + sizeof(s) - 2 : \
+    sizeof(s) > 4 && (s)[sizeof(s)-4] == '/' ? (s) + sizeof(s) - 3 : \
+    sizeof(s) > 5 && (s)[sizeof(s)-5] == '/' ? (s) + sizeof(s) - 4 : \
+    sizeof(s) > 6 && (s)[sizeof(s)-6] == '/' ? (s) + sizeof(s) - 5 : \
+    sizeof(s) > 7 && (s)[sizeof(s)-7] == '/' ? (s) + sizeof(s) - 6 : \
+    sizeof(s) > 8 && (s)[sizeof(s)-8] == '/' ? (s) + sizeof(s) - 7 : \
+    sizeof(s) > 9 && (s)[sizeof(s)-9] == '/' ? (s) + sizeof(s) - 8 : \
+    sizeof(s) > 10 && (s)[sizeof(s)-10] == '/' ? (s) + sizeof(s) - 9 : \
+    sizeof(s) > 11 && (s)[sizeof(s)-11] == '/' ? 
(s) + sizeof(s) - 10 : \ + sizeof(s) > 12 && (s)[sizeof(s)-12] == '/' ? (s) + sizeof(s) - 11 : \ + sizeof(s) > 13 && (s)[sizeof(s)-13] == '/' ? (s) + sizeof(s) - 12 : \ + sizeof(s) > 14 && (s)[sizeof(s)-14] == '/' ? (s) + sizeof(s) - 13 : \ + sizeof(s) > 15 && (s)[sizeof(s)-15] == '/' ? (s) + sizeof(s) - 14 : (s)) + +#define __JUSTFILE__ STRIPPATH(__FILE__) + +#define LOGTCPCBSTATE do { \ + if (so->so_options & SO_DEBUG) { \ + log(LOG_DEBUG,"%12s:%-4d una:%5u ack:%5u fack:%5u rp:%5u new:%5u max:%5u nxt:%5u " \ + "cwnd:%5u dup:%2d pipe ori:%5u old:%5u new:%5u sack re:%5u old:%5u by:%5u dd:%5u avail:%5u %s %s %s\n", \ + __JUSTFILE__, \ + __LINE__, \ + tp->snd_una - tp->iss, \ + th->th_ack - tp->iss, \ + (tp->snd_fack == 0) ? 0 : tp->snd_fack - tp->iss, \ + tp->snd_recover - tp->iss, \ + (tp->sack_newdata == 0) ? 0 : tp->sack_newdata - tp->iss, \ + tp->snd_max - tp->iss, \ + tp->snd_nxt - tp->iss, \ + tp->snd_cwnd, \ + tp->t_dupacks, \ + tp->snd_nxt - tp->snd_fack + tp->sackhint.sack_bytes_rexmit, \ + tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes_old, \ + tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes, \ + \ + tp->sackhint.sack_bytes_rexmit, \ + tp->sackhint.sacked_bytes_old, \ + tp->sackhint.sacked_bytes, \ + tp->sackhint.delivered_data, \ + \ + sbavail(&so->so_snd), \ + IN_RECOVERY(tp->t_flags) ? "LR" : " ", \ + (to.to_flags & TOF_SACK) ? "sack" : " ", \ + TAILQ_EMPTY(&tp->snd_holes) ? "empty":" " \ + ); \ + } \ + } while (0) + +#define LOGTCPCBSTATE2 do { \ + if (so->so_options & SO_DEBUG) { \ + log(LOG_DEBUG,"%12s:%-4d una:%5u ack:----- fack:%5u rp:%5u new:%5u max:%5u nxt:%5u " \ + "cwnd:%5u dup:%2d pipe ori:%5u old:%5u new:%5u sack re:%5u old:%5u by:%5u dd:%5u avail:%5u %s %s %s\n", \ + __JUSTFILE__, \ + __LINE__, \ + tp->snd_una - tp->iss, \ + \ + (tp->snd_fack == 0) ? 0 : tp->snd_fack - tp->iss, \ + tp->snd_recover - tp->iss, \ + (tp->sack_newdata == 0) ? 
0 : tp->sack_newdata - tp->iss, \ + tp->snd_max - tp->iss, \ + tp->snd_nxt - tp->iss, \ + tp->snd_cwnd, \ + tp->t_dupacks, \ + tp->snd_nxt - tp->snd_fack + tp->sackhint.sack_bytes_rexmit, \ + tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes_old, \ + tp->snd_max - tp->snd_una + tp->sackhint.sack_bytes_rexmit - tp->sackhint.sacked_bytes, \ + \ + tp->sackhint.sack_bytes_rexmit, \ + tp->sackhint.sacked_bytes_old, \ + tp->sackhint.sacked_bytes, \ + tp->sackhint.delivered_data, \ + \ + sbavail(&so->so_snd), \ + IN_RECOVERY(tp->t_flags) ? "LR" : " ", \ + "----", \ + TAILQ_EMPTY(&tp->snd_holes) ? "empty":" " \ + ); \ + } \ + } while (0) + + #if defined(_KERNEL) || defined(_WANT_TCPCB) /* TCP segment queue entry */ struct tseg_qent { @@ -70,15 +153,16 @@ struct sackhint { struct sackhole *nexthole; - int sack_bytes_rexmit; + int32_t sack_bytes_rexmit; tcp_seq last_sack_ack; /* Most recent/largest sacked ack */ - int ispare; /* explicit pad for 64bit alignment */ - int sacked_bytes; /* - * Total sacked bytes reported by the + int32_t delivered_data; /* Newly acked data from last SACK */ + + int32_t sacked_bytes; /* Total sacked bytes reported by the * receiver via sack option */ - uint32_t _pad1[1]; /* TBD */ + int32_t sacked_bytes_old; /* just for demonstration */ +// uint32_t _pad1[1]; /* TBD */ uint64_t _pad[1]; /* TBD */ };