diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h
--- a/sys/netinet/tcp_ecn.h
+++ b/sys/netinet/tcp_ecn.h
@@ -38,6 +38,9 @@
 #include
 #include
 
+/* Backs the tso_cwr_split sysctl; the single definition is in tcp_ecn.c. */
+extern u_int tcp_ecn_tso_cwr_split;
+
 void	 tcp_ecn_input_syn_sent(struct tcpcb *, uint16_t, int);
 void	 tcp_ecn_input_parallel_syn(struct tcpcb *, uint16_t, int);
 int	 tcp_ecn_input_segment(struct tcpcb *, uint16_t, int, int, int);
diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c
--- a/sys/netinet/tcp_ecn.c
+++ b/sys/netinet/tcp_ecn.c
@@ -110,6 +110,11 @@
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
     "Max retries before giving up on ECN");
 
+u_int tcp_ecn_tso_cwr_split = 1;	/* one object, also read by tcp_output.c */
+SYSCTL_UINT(_net_inet_tcp_ecn, OID_AUTO, tso_cwr_split,
+    CTLFLAG_RWTUN, &tcp_ecn_tso_cwr_split, 1,
+    "TSO has RFC3168 ECN support masking the CWR flag");
+
 /*
  * Process incoming SYN,ACK packet
  */
@@ -367,18 +372,12 @@
 		return thflags;
 
 	if (V_tcp_do_ecn == 1) {
 		/* Send a RFC3168 ECN setup packet */
-		if (tp->t_rxtshift >= 1) {
-			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
-				thflags = TH_ECE|TH_CWR;
-		} else
+		if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 			thflags = TH_ECE|TH_CWR;
 	} else if (V_tcp_do_ecn == 3) {
 		/* Send an Accurate ECN setup packet */
-		if (tp->t_rxtshift >= 1) {
-			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
-				thflags = TH_ECE|TH_CWR|TH_AE;
-		} else
+		if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
 			thflags = TH_ECE|TH_CWR|TH_AE;
 	}
 
@@ -401,12 +400,12 @@
 	 * Ignore pure control packets, retransmissions
 	 * and window probes.
*/ - newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && - !rxmit && + newdata = (!rxmit && len > 0 && + SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !((tp->t_flags & TF_FORCEDATA) && len == 1)); /* RFC3168 ECN marking, only new data segments */ - if (newdata) { - if (tp->t_flags2 & TF2_ECN_USE_ECT1) { + if (__predict_true(newdata)) { + if (__predict_false(tp->t_flags2 & TF2_ECN_USE_ECT1)) { ipecn = IPTOS_ECN_ECT1; TCPSTAT_INC(tcps_ecn_sndect1); } else { @@ -443,8 +441,11 @@ *thflags |= TH_CWR; tp->t_flags2 &= ~TF2_ECN_SND_CWR; } - if (tp->t_flags2 & TF2_ECN_SND_ECE) + if (tp->t_flags2 & TF2_ECN_SND_ECE) { *thflags |= TH_ECE; + if (tp->t_state == TCPS_SYN_RECEIVED) + tp->t_flags2 &= ~TF2_ECN_SND_ECE; + } } return ipecn; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -208,6 +208,7 @@ int sack_rxmit, sack_bytes_rxmt; struct sackhole *p; int tso, mtu; + int tcp_ect = IPTOS_ECN_NOTECT; struct tcpopt to; struct udphdr *udp = NULL; struct tcp_log_buffer *lgb; @@ -562,11 +563,12 @@ tso = 1; if (sack_rxmit) { - if (SEQ_LT(p->rxmit + len, tp->snd_una + sbused(&so->so_snd))) + if (SEQ_LT(p->rxmit + len, + tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + - sbused(&so->so_snd))) + if (SEQ_LT(tp->snd_nxt + len, + tp->snd_una + sbused(&so->so_snd))) flags &= ~TH_FIN; } @@ -895,6 +897,50 @@ } hdrlen += sizeof(struct udphdr); } + /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. 
+	 */
+	if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) {
+		flags |= tcp_ecn_output_syn_sent(tp);
+	}
+	/* Also handle parallel SYN for ECN */
+	if ((TCPS_HAVERCVDSYN(tp->t_state)) &&
+	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
+		tcp_ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit);
+	}
+	/*
+	 * Disallow use of TSO while sending segments with the CWR flag set,
+	 * as the support for TSO with ECN is inconsistent and frequently
+	 * broken - either setting CWR on all packets, dropping the TSO
+	 * mbuf entirely, or clearing the CWR bit when it may be inappropriate.
+	 */
+	if (__predict_false(tso && (flags & TH_CWR))) {
+		if (__predict_true(tcp_ecn_tso_cwr_split)) {
+			if (__predict_false((tp->t_flags2 &
+			    (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
+			    (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
+				/*
+				 * AccECN case - split CWR when TSO supports
+				 * RFC3168 CWR handling (clearing the flag on
+				 * all but the first segment).
+				 */
+				tso = 0;
+			}
+		} else {
+			if (__predict_false((tp->t_flags2 &
+			    (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
+			    TF2_ECN_PERMIT)) {
+				/*
+				 * RFC3168 case - split CWR when TSO hw/dev does not
+				 * clear CWR when doing TSO processing.
+				 */
+				tso = 0;
+			}
+		}
+	}
 	/*
 	 * Adjust data length if insertion of options will
 	 * bump the packet length beyond the t_maxseg length.
@@ -1182,32 +1228,17 @@
 	if (flags & TH_FIN && tp->t_flags & TF_SENTFIN &&
 	    tp->snd_nxt == tp->snd_max)
 		tp->snd_nxt--;
-	/*
-	 * If we are starting a connection, send ECN setup
-	 * SYN packet. If we are on a retransmit, we may
-	 * resend those bits a number of times as per
-	 * RFC 3168.
- */ - if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { - flags |= tcp_ecn_output_syn_sent(tp); - } - /* Also handle parallel SYN for ECN */ - if ((TCPS_HAVERCVDSYN(tp->t_state)) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); - if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) - tp->t_flags2 &= ~TF2_ECN_SND_ECE; + if (tcp_ect) { #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << IPV6_FLOWLABEL_LEN); - ip6->ip6_flow |= htonl(ect << IPV6_FLOWLABEL_LEN); + ip6->ip6_flow |= htonl(tcp_ect << IPV6_FLOWLABEL_LEN); } else #endif { ip->ip_tos &= ~IPTOS_ECN_MASK; - ip->ip_tos |= ect; + ip->ip_tos |= tcp_ect; } } diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -18703,6 +18703,7 @@ u_char opt[TCP_MAXOLEN]; uint32_t hdrlen, optlen; int32_t slot, segsiz, max_val, tso = 0, error = 0, ulen = 0; + int32_t tcp_ect = IPTOS_ECN_NOTECT; uint16_t flags; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; @@ -18818,6 +18819,28 @@ } else { tso = 0; } + /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + flags |= tcp_ecn_output_syn_sent(tp); + } + /* Also handle parallel SYN for ECN */ + if ((TCPS_HAVERCVDSYN(tp->t_state)) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { + tcp_ect = tcp_ecn_output_established(tp, &flags, len, true); + } + /* + * Disallow use of TSO while sending segments with the CWR flag set, + * as the support for TSO with ECN is inconsistent and frequently + * broken - either setting CWR on all packets, dropping the TSO + * mbuf entirely, or clearing the CWR bit when it may be inappropriate. 
+ */ + if (flags & TH_CWR) + tso = 0; if ((tso == 0) && (len > segsiz)) len = segsiz; (void)tcp_get_usecs(tv); @@ -18888,22 +18911,17 @@ udp->uh_ulen = htons(ulen); } m->m_pkthdr.rcvif = (struct ifnet *)0; - if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, true); - if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) - tp->t_flags2 &= ~TF2_ECN_SND_ECE; + if (tcp_ect) { #ifdef INET6 if (rack->r_is_v6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); - ip6->ip6_flow |= htonl(ect << 20); + ip6->ip6_flow |= htonl(tcp_ect << 20); } else #endif { ip->ip_tos &= ~IPTOS_ECN_MASK; - ip->ip_tos |= ect; + ip->ip_tos |= tcp_ect; } } if (rack->r_ctl.crte != NULL) { @@ -19306,6 +19324,7 @@ int cnt_thru = 1; #endif int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, ulen = 0; + int32_t tcp_ect = IPTOS_ECN_NOTECT; uint16_t flags; uint32_t s_soff; uint32_t if_hw_tsomaxsegcount = 0, startseq; @@ -19409,6 +19428,28 @@ } else { tso = 0; } + /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + flags |= tcp_ecn_output_syn_sent(tp); + } + /* Also handle parallel SYN for ECN */ + if ((TCPS_HAVERCVDSYN(tp->t_state)) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { + tcp_ect = tcp_ecn_output_established(tp, &flags, len, false); + } + /* + * Disallow use of TSO while sending segments with the CWR flag set, + * as the support for TSO with ECN is inconsistent and frequently + * broken - either setting CWR on all packets, dropping the TSO + * mbuf entirely, or clearing the CWR bit when it may be inappropriate. 
+ */ + if (flags & TH_CWR) + tso = 0; if ((tso == 0) && (len > segsiz)) len = segsiz; (void)tcp_get_usecs(tv); @@ -19463,23 +19504,18 @@ udp->uh_ulen = htons(ulen); } m->m_pkthdr.rcvif = (struct ifnet *)0; - if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, false); - if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) - tp->t_flags2 &= ~TF2_ECN_SND_ECE; + if (tcp_ect) { #ifdef INET6 if (rack->r_is_v6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); - ip6->ip6_flow |= htonl(ect << 20); + ip6->ip6_flow |= htonl(tcp_ect << 20); } else #endif { #ifdef INET ip->ip_tos &= ~IPTOS_ECN_MASK; - ip->ip_tos |= ect; + ip->ip_tos |= tcp_ect; #endif } } @@ -19933,6 +19969,7 @@ volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; int32_t tso, mtu; + int32_t tcp_ect = IPTOS_ECN_NOTECT; struct tcpopt to; int32_t slot = 0; int32_t sup_rack = 0; @@ -21433,6 +21470,28 @@ ipoptlen += ipsec_optlen; #endif + /* + * If we are starting a connection, send ECN setup + * SYN packet. If we are on a retransmit, we may + * resend those bits a number of times as per + * RFC 3168. + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { + flags |= tcp_ecn_output_syn_sent(tp); + } + /* Also handle parallel SYN for ECN */ + if ((TCPS_HAVERCVDSYN(tp->t_state)) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { + tcp_ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); + } + /* + * Disallow use of TSO while sending segments with the CWR flag set, + * as the support for TSO with ECN is inconsistent and frequently + * broken - either setting CWR on all packets, dropping the TSO + * mbuf entirely, or clearing the CWR bit when it may be inappropriate. + */ + if (flags & TH_CWR) + tso = 0; /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. 
Clear the FIN bit because we @@ -21755,32 +21814,18 @@ if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && tp->snd_nxt == tp->snd_max) tp->snd_nxt--; - /* - * If we are starting a connection, send ECN setup SYN packet. If we - * are on a retransmit, we may resend those bits a number of times - * as per RFC 3168. - */ - if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { - flags |= tcp_ecn_output_syn_sent(tp); - } - /* Also handle parallel SYN for ECN */ - if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); - if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) - tp->t_flags2 &= ~TF2_ECN_SND_ECE; + if (tcp_ect) { #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); - ip6->ip6_flow |= htonl(ect << 20); + ip6->ip6_flow |= htonl(tcp_ect << 20); } else #endif { #ifdef INET ip->ip_tos &= ~IPTOS_ECN_MASK; - ip->ip_tos |= ect; + ip->ip_tos |= tcp_ect; #endif } }