diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -157,11 +157,6 @@ &VNET_NAME(drop_synfin), 0, "Drop TCP packets with SYN+FIN set"); -VNET_DEFINE(int, tcp_do_prr_conservative) = 0; -SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr_conservative, CTLFLAG_VNET | CTLFLAG_RW, - &VNET_NAME(tcp_do_prr_conservative), 0, - "Do conservative Proportional Rate Reduction"); - VNET_DEFINE(int, tcp_do_prr) = 1; SYSCTL_INT(_net_inet_tcp, OID_AUTO, do_prr, CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_prr), 1, @@ -1489,7 +1484,8 @@ struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos) { uint16_t thflags; - int acked, ourfinisacked, needoutput = 0, sack_changed; + int acked, ourfinisacked, needoutput = 0; + sackstatus_t sack_changed; int rstreason, todrop, win, incforsyn = 0; uint32_t tiwin; uint16_t nsegs; @@ -1503,7 +1499,7 @@ thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; - sack_changed = 0; + sack_changed = SACK_NOCHANGE; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); @@ -2542,7 +2538,7 @@ */ if (th->th_ack != tp->snd_una || (tcp_is_sack_recovery(tp, &to) && - !sack_changed)) + (sack_changed == SACK_NOCHANGE))) break; else if (!tcp_timer_active(tp, TT_REXMT)) tp->t_dupacks = 0; @@ -2551,8 +2547,13 @@ cc_ack_received(tp, th, nsegs, CC_DUPACK); if (V_tcp_do_prr && - IN_FASTRECOVERY(tp->t_flags)) { - tcp_do_prr_ack(tp, th, &to); + IN_FASTRECOVERY(tp->t_flags) && + (tp->t_flags & TF_SACK_PERMIT)) { + /* + * While dealing with DupAcks, + * always use PRR-CRB + */ + tcp_do_prr_ack(tp, th, &to, SACK_NEWLOSS); } else if (tcp_is_sack_recovery(tp, &to) && IN_FASTRECOVERY(tp->t_flags)) { int awnd; @@ -2731,7 +2732,7 @@ * counted as dupacks here. */ if (tcp_is_sack_recovery(tp, &to) && - sack_changed) { + (sack_changed != SACK_NOCHANGE)) { tp->t_dupacks++; /* limit overhead by setting maxseg last */ if (!IN_FASTRECOVERY(tp->t_flags) && @@ -2757,7 +2758,7 @@ if (V_tcp_do_prr && to.to_flags & TOF_SACK) { tcp_timer_activate(tp, TT_REXMT, 0); tp->t_rtttime = 0; - tcp_do_prr_ack(tp, th, &to); + tcp_do_prr_ack(tp, th, &to, sack_changed); tp->t_flags |= TF_ACKNOW; (void) tcp_output(tp); } else @@ -2771,7 +2772,11 @@ if (V_tcp_do_prr) { tp->sackhint.delivered_data = BYTES_THIS_ACK(tp, th); tp->snd_fack = th->th_ack; - tcp_do_prr_ack(tp, th, &to); + /* + * During ECN cwnd reduction + * always use PRR-SSRB + */ + tcp_do_prr_ack(tp, th, &to, SACK_CHANGE); (void) tcp_output(tp); } } else @@ -3894,7 +3899,7 @@ } void -tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to) +tcp_do_prr_ack(struct tcpcb *tp, struct tcphdr *th, struct tcpopt *to, sackstatus_t sack_changed) { int snd_cnt = 0, limit = 0, del_data = 0, pipe = 0; int maxseg = tcp_maxseg(tp); @@ -3934,7 +3939,17 @@ tp->snd_ssthresh, tp->sackhint.recover_fs) - tp->sackhint.prr_out; } else { - if (V_tcp_do_prr_conservative || (del_data == 0)) + /* + * PRR 6937bis heuristic: + * - A partial ack without SACK block beneath snd_recover + * indicates further loss. + * - An SACK scoreboard update adding a new hole indicates + * further loss, so be conservative and send at most one + * segment. + * - Prevent ACK splitting attacks, by being conservative + * when no new data is acked. + */ + if ((sack_changed == SACK_NEWLOSS) || (del_data == 0)) limit = tp->sackhint.prr_delivered - tp->sackhint.prr_out; else diff --git a/sys/netinet/tcp_sack.c b/sys/netinet/tcp_sack.c --- a/sys/netinet/tcp_sack.c +++ b/sys/netinet/tcp_sack.c @@ -546,21 +546,23 @@ * Process cumulative ACK and the TCP SACK option to update the scoreboard. * tp->snd_holes is an ordered list of holes (oldest to newest, in terms of * the sequence space). - * Returns 1 if incoming ACK has previously unknown SACK information, - * 0 otherwise. + * Returns SACK_NEWLOSS if incoming ACK indicates ongoing loss (hole split, new hole), + * SACK_CHANGE if incoming ACK has previously unknown SACK information, + * SACK_NOCHANGE otherwise. */ -int +sackstatus_t tcp_sack_doack(struct tcpcb *tp, struct tcpopt *to, tcp_seq th_ack) { struct sackhole *cur, *temp; struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1], *sblkp; - int i, j, num_sack_blks, sack_changed; + int i, j, num_sack_blks; + sackstatus_t sack_changed; int delivered_data, left_edge_delta; INP_WLOCK_ASSERT(tptoinpcb(tp)); num_sack_blks = 0; - sack_changed = 0; + sack_changed = SACK_NOCHANGE; delivered_data = 0; left_edge_delta = 0; /* @@ -579,7 +581,7 @@ if (SEQ_LT(tp->snd_fack, th_ack)) { delivered_data += th_ack - tp->snd_una; tp->snd_fack = th_ack; - sack_changed = 1; + sack_changed = SACK_CHANGE; } } /* @@ -669,7 +671,7 @@ delivered_data += sblkp->end - sblkp->start; tp->snd_fack = sblkp->end; sblkp--; - sack_changed = 1; + sack_changed = SACK_NEWLOSS; } else { /* * Append a new SACK hole at the tail. If the @@ -683,7 +685,7 @@ tp->snd_fack = sblkp->end; /* Go to the previous sack block. */ sblkp--; - sack_changed = 1; + sack_changed = SACK_CHANGE; } else { /* * We failed to add a new hole based on the current @@ -700,7 +702,12 @@ SEQ_LT(tp->snd_fack, sblkp->end)) { delivered_data += sblkp->end - tp->snd_fack; tp->snd_fack = sblkp->end; - sack_changed = 1; + /* + * While the Scoreboard didn't change in + * size, we only ended up here because + * some SACK data had to be dismissed. + */ + sack_changed = SACK_NEWLOSS; } } } @@ -708,7 +715,7 @@ /* fack is advanced. */ delivered_data += sblkp->end - tp->snd_fack; tp->snd_fack = sblkp->end; - sack_changed = 1; + sack_changed = SACK_CHANGE; } cur = TAILQ_LAST(&tp->snd_holes, sackhole_head); /* Last SACK hole. */ /* @@ -736,7 +743,7 @@ (SEQ_MIN(cur->rxmit, cur->end) - cur->start); KASSERT(tp->sackhint.sack_bytes_rexmit >= 0, ("sackhint bytes rtx >= 0")); - sack_changed = 1; + sack_changed = SACK_CHANGE; if (SEQ_LEQ(sblkp->start, cur->start)) { /* Data acks at least the beginning of hole. */ if (SEQ_GEQ(sblkp->end, cur->end)) { @@ -773,6 +780,7 @@ */ temp = tcp_sackhole_insert(tp, sblkp->end, cur->end, cur); + sack_changed = SACK_NEWLOSS; if (temp != NULL) { if (SEQ_GT(cur->rxmit, temp->rxmit)) { temp->rxmit = cur->rxmit; @@ -810,7 +818,7 @@ * DupAck for this. Also required * for RFC6675 rescue retransmission. */ - sack_changed = 0; + sack_changed = SACK_NOCHANGE; tp->sackhint.delivered_data = delivered_data; tp->sackhint.sacked_bytes += delivered_data - left_edge_delta; KASSERT((delivered_data >= 0), ("delivered_data < 0")); diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -304,6 +304,13 @@ struct tcphdr tt_t; }; +/* SACK scoreboard update status */ +typedef enum { + SACK_NOCHANGE = 0, + SACK_CHANGE, + SACK_NEWLOSS +} sackstatus_t; + /* Enable TCP/UDP tunneling port */ #define TCP_TUNNELING_PORT_MIN 0 #define TCP_TUNNELING_PORT_MAX 65535 @@ -1035,7 +1042,6 @@ #define V_tcp_do_lrd VNET(tcp_do_lrd) #define V_tcp_do_prr VNET(tcp_do_prr) -#define V_tcp_do_prr_conservative VNET(tcp_do_prr_conservative) #define V_tcp_do_newcwv VNET(tcp_do_newcwv) #define V_drop_synfin VNET(drop_synfin) #define V_path_mtu_discovery VNET(path_mtu_discovery) @@ -1210,7 +1216,8 @@ uint32_t tcp_new_ts_offset(struct in_conninfo *); tcp_seq tcp_new_isn(struct in_conninfo *); -int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); +sackstatus_t + tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); @@ -1218,7 +1225,7 @@ void tcp_clean_sackreport(struct tcpcb *tp); void tcp_sack_adjust(struct tcpcb *tp); struct sackhole *tcp_sack_output(struct tcpcb *tp, int *sack_bytes_rexmt); -void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *); +void tcp_do_prr_ack(struct tcpcb *, struct tcphdr *, struct tcpopt *, sackstatus_t); void tcp_lost_retransmission(struct tcpcb *, struct tcphdr *); void tcp_sack_partialack(struct tcpcb *, struct tcphdr *); void tcp_free_sackholes(struct tcpcb *tp);