Index: share/man/man4/tcp.4
===================================================================
--- share/man/man4/tcp.4
+++ share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd January 8, 2022
+.Dd February 10, 2022
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -705,6 +705,13 @@
 Allow incoming connections to request ECN.
 Outgoing connections will not request ECN.
 (default)
+.It 3
+Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
+Outgoing connections will request Accurate ECN and fall back to
+ECN depending on the capabilities of the server.
+.It 4
+Negotiate on incoming connection for Accurate ECN, ECN, or no ECN.
+Outgoing connections will not request ECN.
 .El
 .It Va ecn.maxretries
 Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a
Index: sys/netinet/tcp_ecn.h
===================================================================
--- sys/netinet/tcp_ecn.h
+++ sys/netinet/tcp_ecn.h
@@ -49,6 +49,7 @@
 void tcp_ecn_syncache_socket(struct tcpcb *, struct syncache *);
 int tcp_ecn_syncache_add(uint16_t, int);
 uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *);
+int tcp_ecn_get_ace(uint16_t);
 
 #endif /* _KERNEL */
 
Index: sys/netinet/tcp_ecn.c
===================================================================
--- sys/netinet/tcp_ecn.c
+++ sys/netinet/tcp_ecn.c
@@ -109,12 +109,86 @@
 void
 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
 {
-	thflags &= (TH_CWR|TH_ECE);
 
-	if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
-	    V_tcp_do_ecn) {
-		tp->t_flags2 |= TF2_ECN_PERMIT;
-		TCPSTAT_INC(tcps_ecn_shs);
+	if (V_tcp_do_ecn == 0)
+		return;
+	if ((V_tcp_do_ecn == 1) ||
+	    (V_tcp_do_ecn == 2)) {
+		/* RFC3168 ECN handling */
+		if ((thflags & (TH_CWR | TH_ECE)) == (0 | TH_ECE)) {
+			tp->t_flags2 |= TF2_ECN_PERMIT;
+			TCPSTAT_INC(tcps_ecn_shs);
+		}
+	} else
+	/* decoding Accurate ECN according to table in section 3.1.1 */
+	if ((V_tcp_do_ecn == 3) ||
+	    (V_tcp_do_ecn == 4)) {
+		/*
+		 * on the SYN,ACK, process the AccECN
+		 * flags indicating the state the SYN
+		 * was delivered.
+		 * Reactions to Path ECN mangling can
+		 * come here.
+		 */
+		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
+		/* non-ECT SYN */
+		case (0|TH_CWR|0):
+			tp->t_flags2 |= TF2_ACE_PERMIT;
+			tp->t_scep = 5;
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_nect);
+			break;
+		/* ECT0 SYN */
+		case (TH_AE|0|0):
+			tp->t_flags2 |= TF2_ACE_PERMIT;
+			tp->t_scep = 5;
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ect0);
+			break;
+		/* ECT1 SYN */
+		case (0|TH_CWR|TH_ECE):
+			tp->t_flags2 |= TF2_ACE_PERMIT;
+			tp->t_scep = 5;
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ect1);
+			break;
+		/* CE SYN */
+		case (TH_AE|TH_CWR|0):
+			tp->t_flags2 |= TF2_ACE_PERMIT;
+			tp->t_scep = 6;
+			/*
+			 * reduce the IW to 2 MSS (to
+			 * account for delayed acks) if
+			 * the SYN,ACK was CE marked
+			 */
+			tp->snd_cwnd = 2 * tcp_maxseg(tp);
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ce);
+			break;
+		default:
+			break;
+		}
+		/*
+		 * Set the AccECN Codepoints on
+		 * the outgoing ACK to the ECN
+		 * state of the SYN,ACK
+		 * according to table 3 in the
+		 * AccECN draft
+		 */
+		switch (iptos & IPTOS_ECN_MASK) {
+		case (IPTOS_ECN_NOTECT):
+			tp->t_rcep = 0b010;
+			break;
+		case (IPTOS_ECN_ECT0):
+			tp->t_rcep = 0b100;
+			break;
+		case (IPTOS_ECN_ECT1):
+			tp->t_rcep = 0b011;
+			break;
+		case (IPTOS_ECN_CE):
+			tp->t_rcep = 0b110;
+			break;
+		}
 	}
 }
 
@@ -128,13 +202,53 @@
 		return;
 	if (V_tcp_do_ecn == 0)
 		return;
-	if ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2)) {
+	if ((V_tcp_do_ecn == 1) ||
+	    (V_tcp_do_ecn == 2)) {
 		/* RFC3168 ECN handling */
 		if ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) {
 			tp->t_flags2 |= TF2_ECN_PERMIT;
 			tp->t_flags2 |= TF2_ECN_SND_ECE;
 			TCPSTAT_INC(tcps_ecn_shs);
 		}
+	} else
+	if ((V_tcp_do_ecn == 3) ||
+	    (V_tcp_do_ecn == 4)) {
+		/* AccECN handling */
+		switch (thflags & (TH_AE | TH_CWR | TH_ECE)) {
+		default:
+		case (0|0|0):
+			break;
+		case (0|TH_CWR|TH_ECE):
+			tp->t_flags2 |= TF2_ECN_PERMIT;
+			tp->t_flags2 |= TF2_ECN_SND_ECE;
+			TCPSTAT_INC(tcps_ecn_shs);
+			break;
+		case (TH_AE|TH_CWR|TH_ECE):
+			tp->t_flags2 |= TF2_ACE_PERMIT;
+			TCPSTAT_INC(tcps_ecn_shs);
+			/*
+			 * Set the AccECN Codepoints on
+			 * the outgoing segment to the ECN
+			 * state of the received segment
+			 * according to table 3 in the
+			 * AccECN draft
+			 */
+			switch (iptos & IPTOS_ECN_MASK) {
+			case (IPTOS_ECN_NOTECT):
+				tp->t_rcep = 0b010;
+				break;
+			case (IPTOS_ECN_ECT0):
+				tp->t_rcep = 0b100;
+				break;
+			case (IPTOS_ECN_ECT1):
+				tp->t_rcep = 0b011;
+				break;
+			case (IPTOS_ECN_CE):
+				tp->t_rcep = 0b110;
+				break;
+			}
+			break;
+		}
 	}
 }
 
@@ -146,7 +260,7 @@
 {
 	int delta_ace = 0;
 
-	if (tp->t_flags2 & TF2_ECN_PERMIT) {
+	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
 		switch (iptos & IPTOS_ECN_MASK) {
 		case IPTOS_ECN_CE:
 			TCPSTAT_INC(tcps_ecn_ce);
@@ -159,15 +273,52 @@
 			break;
 		}
 
-		/* RFC3168 ECN handling */
-		if (thflags & TH_ECE)
-			delta_ace = 1;
-		if (thflags & TH_CWR) {
-			tp->t_flags2 &= ~TF2_ECN_SND_ECE;
-			tp->t_flags |= TF_ACKNOW;
+		if (tp->t_flags2 & TF2_ACE_PERMIT) {
+			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+				tp->t_rcep += 1;
+			if (tp->t_flags2 & TF2_ECN_PERMIT) {
+				delta_ace = (tcp_ecn_get_ace(thflags) + 8 -
+					    (tp->t_scep & 0x07)) & 0x07;
+				tp->t_scep += delta_ace;
+			} else {
+				/*
+				 * process the final ACK of the 3WHS
+				 * see table 3 in draft-ietf-tcpm-accurate-ecn
+				 */
+				switch (tcp_ecn_get_ace(thflags)) {
+				case 0b010:
+					/* nonECT SYN or SYN,ACK */
+					/* Fallthrough */
+				case 0b011:
+					/* ECT1 SYN or SYN,ACK */
+					/* Fallthrough */
+				case 0b100:
+					/* ECT0 SYN or SYN,ACK */
+					tp->t_scep = 5;
+					break;
+				case 0b110:
+					/* CE SYN or SYN,ACK */
+					tp->t_scep = 6;
+					tp->snd_cwnd = 2 * tcp_maxseg(tp);
+					break;
+				default:
+					/* mangled AccECN handshake */
+					tp->t_scep = 5;
+					break;
+				}
+				tp->t_flags2 |= TF2_ECN_PERMIT;
+			}
+		} else {
+			/* RFC3168 ECN handling */
+			if (thflags & TH_ECE)
+				delta_ace = 1;
+			if (thflags & TH_CWR) {
+				tp->t_flags2 &= ~TF2_ECN_SND_ECE;
+				tp->t_flags |= TF_ACKNOW;
+			}
+			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
+				tp->t_flags2 |= TF2_ECN_SND_ECE;
 		}
-		if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
-			tp->t_flags2 |= TF2_ECN_SND_ECE;
 
 		/* Process a packet differently from RFC3168. */
 		cc_ecnpkt_handler_flags(tp, thflags, iptos);
@@ -184,6 +335,8 @@
 {
 	uint16_t thflags = 0;
 
+	if (V_tcp_do_ecn == 0)
+		return thflags;
 	if (V_tcp_do_ecn == 1) {
 		/* Send a RFC3168 ECN setup packet */
 		if (tp->t_rxtshift >= 1) {
@@ -191,6 +344,14 @@
 				thflags = TH_ECE|TH_CWR;
 		} else
 			thflags = TH_ECE|TH_CWR;
+	} else
+	if (V_tcp_do_ecn == 3) {
+		/* Send an Accurate ECN setup packet */
+		if (tp->t_rxtshift >= 1) {
+			if (tp->t_rxtshift <= V_tcp_ecn_maxretries)
+				thflags = TH_ECE|TH_CWR|TH_AE;
+		} else
+			thflags = TH_ECE|TH_CWR|TH_AE;
 	}
 
 	return thflags;
@@ -214,6 +375,7 @@
 	 */
 	newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) &&
 		    !((tp->t_flags & TF_FORCEDATA) && len == 1));
+	/* RFC3168 ECN marking, only new data segments */
 	if (newdata) {
 		ipecn = IPTOS_ECN_ECT0;
 		TCPSTAT_INC(tcps_ecn_ect0);
@@ -221,13 +383,35 @@
 	/*
 	 * Reply with proper ECN notifications.
 	 */
-	if (newdata &&
-	    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
-		*thflags |= TH_CWR;
-		tp->t_flags2 &= ~TF2_ECN_SND_CWR;
+	if (tp->t_flags2 & TF2_ACE_PERMIT) {
+		*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
+		if (tp->t_rcep & 0x01)
+			*thflags |= TH_ECE;
+		if (tp->t_rcep & 0x02)
+			*thflags |= TH_CWR;
+		if (tp->t_rcep & 0x04)
+			*thflags |= TH_AE;
+		if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
+			/*
+			 * here we process the final
+			 * ACK of the 3WHS
+			 */
+			if (tp->t_rcep == 0b110) {
+				tp->t_rcep = 6;
+			} else {
+				tp->t_rcep = 5;
+			}
+			tp->t_flags2 |= TF2_ECN_PERMIT;
+		}
+	} else {
+		if (newdata &&
+		    (tp->t_flags2 & TF2_ECN_SND_CWR)) {
+			*thflags |= TH_CWR;
+			tp->t_flags2 &= ~TF2_ECN_SND_CWR;
+		}
+		if (tp->t_flags2 & TF2_ECN_SND_ECE)
+			*thflags |= TH_ECE;
 	}
-	if (tp->t_flags2 & TF2_ECN_SND_ECE)
-		*thflags |= TH_ECE;
 
 	return ipecn;
 }
@@ -244,6 +428,20 @@
 	case SCF_ECN:
 		tp->t_flags2 |= TF2_ECN_PERMIT;
 		break;
+	case SCF_ACE_N:
+		/* Fallthrough */
+	case SCF_ACE_0:
+		/* Fallthrough */
+	case SCF_ACE_1:
+		tp->t_flags2 |= TF2_ACE_PERMIT;
+		tp->t_scep = 5;
+		tp->t_rcep = 5;
+		break;
+	case SCF_ACE_CE:
+		tp->t_flags2 |= TF2_ACE_PERMIT;
+		tp->t_scep = 6;
+		tp->t_rcep = 6;
+		break;
 	/* undefined SCF codepoint */
 	default:
 		break;
@@ -260,15 +458,54 @@
 {
 	int scflags = 0;
 
-	switch (thflags & (TH_CWR|TH_ECE)) {
+	switch (thflags & (TH_AE|TH_CWR|TH_ECE)) {
 	/* no ECN */
-	case (0|0):
+	case (0|0|0):
 		break;
 	/* legacy ECN */
-	case (TH_CWR|TH_ECE):
+	case (0|TH_CWR|TH_ECE):
 		scflags = SCF_ECN;
 		break;
+	/* Accurate ECN */
+	case (TH_AE|TH_CWR|TH_ECE):
+		if ((V_tcp_do_ecn == 3) ||
+		    (V_tcp_do_ecn == 4)) {
+			switch (iptos & IPTOS_ECN_MASK) {
+			case IPTOS_ECN_CE:
+				scflags = SCF_ACE_CE;
+				break;
+			case IPTOS_ECN_ECT0:
+				scflags = SCF_ACE_0;
+				break;
+			case IPTOS_ECN_ECT1:
+				scflags = SCF_ACE_1;
+				break;
+			case IPTOS_ECN_NOTECT:
+				scflags = SCF_ACE_N;
+				break;
+			}
+		} else
+			scflags = SCF_ECN;
+		break;
+	/* Default Case (section 3.1.2) */
 	default:
+		if ((V_tcp_do_ecn == 3) ||
+		    (V_tcp_do_ecn == 4)) {
+			switch (iptos & IPTOS_ECN_MASK) {
+			case IPTOS_ECN_CE:
+				scflags = SCF_ACE_CE;
+				break;
+			case IPTOS_ECN_ECT0:
+				scflags = SCF_ACE_0;
+				break;
+			case IPTOS_ECN_ECT1:
+				scflags = SCF_ACE_1;
+				break;
+			case IPTOS_ECN_NOTECT:
+				scflags = SCF_ACE_N;
+				break;
+			}
+		}
 		break;
 	}
 	return scflags;
@@ -285,8 +522,28 @@
 	    (sc->sc_flags & SCF_ECN_MASK)) {
 		switch (sc->sc_flags & SCF_ECN_MASK) {
 		case SCF_ECN:
-			thflags |= (0 | TH_ECE);
+			thflags |= (0 | 0 | TH_ECE);
+			TCPSTAT_INC(tcps_ecn_shs);
+			break;
+		case SCF_ACE_N:
+			thflags |= (0 | TH_CWR | 0);
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_nect);
+			break;
+		case SCF_ACE_0:
+			thflags |= (TH_AE | 0 | 0);
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ect0);
+			break;
+		case SCF_ACE_1:
+			thflags |= (0 | TH_ECE | TH_CWR);
 			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ect1);
+			break;
+		case SCF_ACE_CE:
+			thflags |= (TH_AE | TH_CWR | 0);
+			TCPSTAT_INC(tcps_ecn_shs);
+			TCPSTAT_INC(tcps_ace_ce);
 			break;
 		/* undefined SCF codepoint */
 		default:
@@ -295,3 +552,21 @@
 	}
 	return thflags;
 }
+
+/*
+ * Return the 3-bit ACE value encoded in the given TCP header flags:
+ * TH_ECE contributes 1, TH_CWR contributes 2 and TH_AE contributes 4.
+ */
+int
+tcp_ecn_get_ace(uint16_t thflags)
+{
+	int ace = 0;
+
+	if (thflags & TH_ECE)
+		ace += 1;
+	if (thflags & TH_CWR)
+		ace += 2;
+	if (thflags & TH_AE)
+		ace += 4;
+	return ace;
+}
Index: sys/netinet/tcp_output.c
===================================================================
--- sys/netinet/tcp_output.c
+++ sys/netinet/tcp_output.c
@@ -1205,7 +1205,7 @@
 	}
 	/* Also handle parallel SYN for ECN */
 	if ((TCPS_HAVERCVDSYN(tp->t_state)) &&
-	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
+	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -15741,7 +15741,7 @@
 	}
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
-	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
+	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
@@ -16232,7 +16232,7 @@
 	}
 	m->m_pkthdr.rcvif = (struct ifnet *)0;
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
-	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
+	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
@@ -18277,7 +18277,7 @@
 	}
 	/* Also handle parallel SYN for ECN */
 	if (TCPS_HAVERCVDSYN(tp->t_state) &&
-	    (tp->t_flags2 & TF2_ECN_PERMIT)) {
+	    (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) {
 		int ect = tcp_ecn_output_established(tp, &flags, len);
 		if ((tp->t_state == TCPS_SYN_RECEIVED) &&
 		    (tp->t_flags2 & TF2_ECN_SND_ECE))
@@ -20273,7 +20273,7 @@
 		ti->tcpi_snd_wscale = tp->snd_scale;
 		ti->tcpi_rcv_wscale = tp->rcv_scale;
 	}
-	if (tp->t_flags2 & TF2_ECN_PERMIT)
+	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
 		ti->tcpi_options |= TCPI_OPT_ECN;
 	if (tp->t_flags & TF_FASTOPEN)
 		ti->tcpi_options |= TCPI_OPT_TFO;
Index: sys/netinet/tcp_var.h
===================================================================
--- sys/netinet/tcp_var.h
+++ sys/netinet/tcp_var.h
@@ -249,8 +249,8 @@
 	int	t_dupacks;		/* consecutive dup acks recd */
 	int	t_lognum;		/* Number of log entries */
 	int	t_loglimit;		/* Maximum number of log entries */
-	uint32_t r_cep;			/* Number of received CE marked packets */
-	uint32_t s_cep;			/* Synced number of delivered CE packets */
+	uint32_t t_rcep;		/* Number of received CE marked packets */
+	uint32_t t_scep;		/* Synced number of delivered CE packets */
 	int64_t	t_pacing_rate;		/* bytes / sec, -1 => unlimited */
 	struct tcp_log_stailq t_logs;	/* Log buffer */
 	struct tcp_log_id_node *t_lin;