diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -31,7 +31,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 30, 2023 +.Dd January 26, 2024 .Dt TCP 4 .Os .Sh NAME @@ -494,11 +494,28 @@ .It 3 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN. Outgoing connections will request Accurate ECN and fall back to -ECN depending on the capabilities of the server. +ECN depending on the capabilities of the remote host. .It 4 Negotiate on incoming connection for Accurate ECN, ECN, or no ECN. Outgoing connections will not request ECN. .El +.It Va ecn.generalized +Enable sending all segments as ECN capable transport, +including SYN, SYN/ACK, and retransmissions. +This may only be enabled when ECN support itself is also active. +Disabling ECN support will disable this feature automatically. +Settings: +.Bl -tag -compact +.It 0 +Regular RFC3168 operation. +Send only new data segments as ECN capable transport. +(default) +.It 1 +Support generalized ECN (ECN++), and send all segments of an ECN-enabled +session as ECN capable transport. +Control packets sent in reply to segments for non-established or non-listening +ports are marked identically if outgoing sessions would request ECN. +.El .It Va ecn.maxretries Number of retries (SYN or SYN/ACK retransmits) before disabling ECN on a specific connection. 
diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h --- a/sys/netinet/tcp_ecn.h +++ b/sys/netinet/tcp_ecn.h @@ -43,11 +43,11 @@ void tcp_ecn_input_syn_sent(struct tcpcb *, uint16_t, int); void tcp_ecn_input_parallel_syn(struct tcpcb *, uint16_t, int); int tcp_ecn_input_segment(struct tcpcb *, uint16_t, int, int, int); -uint16_t tcp_ecn_output_syn_sent(struct tcpcb *); +int tcp_ecn_output_syn_sent(struct tcpcb *, uint16_t *); int tcp_ecn_output_established(struct tcpcb *, uint16_t *, int, bool); void tcp_ecn_syncache_socket(struct tcpcb *, struct syncache *); int tcp_ecn_syncache_add(uint16_t, int); -uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *); +int tcp_ecn_syncache_respond(uint16_t *, struct syncache *); #endif /* _KERNEL */ diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c --- a/sys/netinet/tcp_ecn.c +++ b/sys/netinet/tcp_ecn.c @@ -104,8 +104,10 @@ "TCP ECN"); VNET_DEFINE(int, tcp_do_ecn) = 2; -SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, - CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0, +static int sysctl_net_inet_tcp_ecn_enable_check(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_net_inet_tcp_ecn, OID_AUTO, enable, + CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_UINT | CTLFLAG_MPSAFE, + &VNET_NAME(tcp_do_ecn), 0, &sysctl_net_inet_tcp_ecn_enable_check, "IU", "TCP ECN support"); VNET_DEFINE(int, tcp_ecn_maxretries) = 1; @@ -113,6 +115,13 @@ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); +VNET_DEFINE(int, tcp_ecn_generalized) = 0; +static int sysctl_net_inet_tcp_ecn_generalized_check(SYSCTL_HANDLER_ARGS); +SYSCTL_PROC(_net_inet_tcp_ecn, OID_AUTO, generalized, + CTLFLAG_VNET | CTLFLAG_RW | CTLTYPE_UINT | CTLFLAG_MPSAFE, + &VNET_NAME(tcp_ecn_generalized), 0, &sysctl_net_inet_tcp_ecn_generalized_check, "IU", + "Send all packets as ECT"); + /* * Process incoming SYN,ACK packet */ @@ -121,6 +130,7 @@ { switch (V_tcp_do_ecn) { case 0: + /* No ECN */ return; case 1: /* FALLTHROUGH */ @@ 
-217,6 +227,11 @@ } break; } + if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { + if (V_tcp_ecn_generalized) { + tp->t_flags2 |= TF2_ECN_PLUSPLUS; + } + } } /* @@ -229,6 +244,7 @@ return; switch (V_tcp_do_ecn) { case 0: + /* No ECN */ return; case 1: /* FALLTHROUGH */ @@ -285,6 +301,11 @@ } break; } + if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { + if (V_tcp_ecn_generalized) { + tp->t_flags2 |= TF2_ECN_PLUSPLUS; + } + } } /* @@ -364,36 +385,61 @@ cc_ecnpkt_handler_flags(tp, thflags, iptos); } - return delta_cep; + return (delta_cep); } /* * Send ECN setup packet header flags */ -uint16_t -tcp_ecn_output_syn_sent(struct tcpcb *tp) +int +tcp_ecn_output_syn_sent(struct tcpcb *tp, uint16_t *thflags) { - uint16_t thflags = 0; - - if (V_tcp_do_ecn == 0) - return thflags; - if (V_tcp_do_ecn == 1) { + switch (V_tcp_do_ecn) { + case 0: + /* No ECN */ + /* FALLTHROUGH */ + case 2: + /* passive RFC3168 */ + /* FALLTHROUGH */ + case 4: + /* passive AccECN */ + /* FALLTHROUGH */ + break; + case 1: /* Send a RFC3168 ECN setup packet */ if (tp->t_rxtshift >= 1) { - if (tp->t_rxtshift <= V_tcp_ecn_maxretries) - thflags = TH_ECE|TH_CWR; - } else - thflags = TH_ECE|TH_CWR; - } else if (V_tcp_do_ecn == 3) { + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) { + *thflags |= TH_ECE|TH_CWR; + } else { + return (IPTOS_ECN_NOTECT); + } + } else { + *thflags |= TH_ECE|TH_CWR; + } + break; + case 3: /* Send an Accurate ECN setup packet */ if (tp->t_rxtshift >= 1) { - if (tp->t_rxtshift <= V_tcp_ecn_maxretries) - thflags = TH_ECE|TH_CWR|TH_AE; - } else - thflags = TH_ECE|TH_CWR|TH_AE; + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) { + *thflags |= TH_ECE|TH_CWR|TH_AE; + } else { + return (IPTOS_ECN_NOTECT); + } + } else { + *thflags |= TH_ECE|TH_CWR|TH_AE; + } + break; } - - return thflags; + if (V_tcp_ecn_generalized) { + if (tp->t_flags2 & TF2_ECN_USE_ECT1) { + TCPSTAT_INC(tcps_ecn_sndect1); + return (IPTOS_ECN_ECT1); + } else { + TCPSTAT_INC(tcps_ecn_sndect0); + return 
(IPTOS_ECN_ECT0); + } + } + return (IPTOS_ECN_NOTECT); } /* @@ -403,7 +449,7 @@ int tcp_ecn_output_established(struct tcpcb *tp, uint16_t *thflags, int len, bool rxmit) { - int ipecn = IPTOS_ECN_NOTECT; + int ect = IPTOS_ECN_NOTECT; bool newdata; /* @@ -415,13 +461,20 @@ newdata = (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max) && !rxmit && !((tp->t_flags & TF_FORCEDATA) && len == 1)); - /* RFC3168 ECN marking, only new data segments */ - if (newdata) { + /* + * RFC3168 ECN marking for new data segments, or + * for all segments as ECN-capable transport + * when ecn.generalized is set. + */ + if (newdata || + tp->t_flags2 & TF2_ECN_PLUSPLUS || + (tp->t_state == TCPS_SYN_SENT && + V_tcp_ecn_generalized)) { if (tp->t_flags2 & TF2_ECN_USE_ECT1) { - ipecn = IPTOS_ECN_ECT1; + ect = IPTOS_ECN_ECT1; TCPSTAT_INC(tcps_ecn_sndect1); } else { - ipecn = IPTOS_ECN_ECT0; + ect = IPTOS_ECN_ECT0; TCPSTAT_INC(tcps_ecn_sndect0); } } @@ -452,7 +505,7 @@ *thflags |= TH_ECE; } - return ipecn; + return (ect); } /* @@ -483,6 +536,10 @@ break; } } + if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { + if (V_tcp_ecn_generalized) + tp->t_flags2 |= TF2_ECN_PLUSPLUS; + } } /* @@ -532,8 +589,9 @@ scflags = SCF_ACE_N; break; } - } else + } else { scflags = SCF_ECN; + } break; /* Default Case (section 3.1.2) */ default: @@ -556,46 +614,63 @@ } break; } - return scflags; + return (scflags); } /* * Set up the ECN information for the from * syncache information. 
*/ -uint16_t -tcp_ecn_syncache_respond(uint16_t thflags, struct syncache *sc) +int +tcp_ecn_syncache_respond(uint16_t *thflags, struct syncache *sc) { - if ((thflags & TH_SYN) && + int ect = IPTOS_ECN_NOTECT; + + if ((*thflags & TH_SYN) && (sc->sc_flags & SCF_ECN_MASK)) { switch (sc->sc_flags & SCF_ECN_MASK) { case SCF_ECN: - thflags |= (0 | 0 | TH_ECE); + *thflags |= (0 | 0 | TH_ECE); TCPSTAT_INC(tcps_ecn_shs); + if ((V_tcp_ecn_generalized && + (*thflags & TH_ACK))) + ect = IPTOS_ECN_ECT0; break; case SCF_ACE_N: - thflags |= (0 | TH_CWR | 0); + *thflags |= (0 | TH_CWR | 0); TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_nect); + if ((V_tcp_ecn_generalized && + (*thflags & TH_ACK))) + ect = IPTOS_ECN_ECT0; break; case SCF_ACE_0: - thflags |= (TH_AE | 0 | 0); + *thflags |= (TH_AE | 0 | 0); TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_ect0); + if ((V_tcp_ecn_generalized && + (*thflags & TH_ACK))) + ect = IPTOS_ECN_ECT0; break; case SCF_ACE_1: - thflags |= (0 | TH_ECE | TH_CWR); + *thflags |= (0 | TH_ECE | TH_CWR); TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_ect1); + if ((V_tcp_ecn_generalized && + (*thflags & TH_ACK))) + ect = IPTOS_ECN_ECT0; break; case SCF_ACE_CE: - thflags |= (TH_AE | TH_CWR | 0); + *thflags |= (TH_AE | TH_CWR | 0); TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_ce); + if ((V_tcp_ecn_generalized && + (*thflags & TH_ACK))) + ect = IPTOS_ECN_ECT0; break; } } - return thflags; + return (ect); } static inline int @@ -610,3 +685,48 @@ *thflags &= ~(TH_AE|TH_CWR|TH_ECE); *thflags |= ((t_rcep << TH_ACE_SHIFT) & (TH_AE|TH_CWR|TH_ECE)); } + +static int +sysctl_net_inet_tcp_ecn_enable_check(SYSCTL_HANDLER_ARGS) +{ + uint32_t new; + int error; + + new = V_tcp_do_ecn; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr != NULL) { + if (new > 4) { + error = EINVAL; + } else { + V_tcp_do_ecn = new; + if (new == 0) { + V_tcp_ecn_generalized = new; + } + } + } + + return (error); +} + +static int 
+sysctl_net_inet_tcp_ecn_generalized_check(SYSCTL_HANDLER_ARGS) +{ + uint32_t new; + int error; + + new = V_tcp_ecn_generalized; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr != NULL) { + if (new > 1) { + error = EINVAL; + } else { + if (!V_tcp_do_ecn && new == 1) { + error = EINVAL; + } else { + V_tcp_ecn_generalized = new; + } + } + } + + return (error); +} diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -414,7 +414,7 @@ stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); #endif - switch(type) { + switch (type) { case CC_NDUPACK: if (!IN_FASTRECOVERY(tp->t_flags)) { tp->snd_recover = tp->snd_max; diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -193,6 +193,7 @@ int32_t len; uint32_t recwin, sendwin; uint16_t flags; + int ect = 0; int off, error = 0; /* Keep compiler happy */ u_int if_hw_tsomaxsegcount = 0; u_int if_hw_tsomaxsegsize = 0; @@ -1185,15 +1186,17 @@ * RFC 3168. 
*/ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { - flags |= tcp_ecn_output_syn_sent(tp); + ect = tcp_ecn_output_syn_sent(tp, &flags); } /* Also handle parallel SYN for ECN */ - if ((TCPS_HAVERCVDSYN(tp->t_state)) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); + if ((tp->t_flags2 & TF2_ECN_PLUSPLUS) || + (TCPS_HAVERCVDSYN(tp->t_state) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)))) { + ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) + (tp->t_flags2 & TF2_ECN_SND_ECE)) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; + } #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << IPV6_FLOWLABEL_LEN); diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -19902,6 +19902,7 @@ struct socket *so; uint32_t recwin; uint32_t sb_offset, s_moff = 0; + uint8_t ect = 0; int32_t len, error = 0; uint16_t flags; struct mbuf *m, *s_mb = NULL; @@ -21761,15 +21762,17 @@ * as per RFC 3168. 
*/ if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn) { - flags |= tcp_ecn_output_syn_sent(tp); + ect |= tcp_ecn_output_syn_sent(tp, &flags); } /* Also handle parallel SYN for ECN */ - if (TCPS_HAVERCVDSYN(tp->t_state) && - (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))) { - int ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); + if ((tp->t_flags2 & TF2_ECN_PLUSPLUS) || + (TCPS_HAVERCVDSYN(tp->t_state) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)))) { + ect = tcp_ecn_output_established(tp, &flags, len, sack_rxmit); if ((tp->t_state == TCPS_SYN_RECEIVED) && - (tp->t_flags2 & TF2_ECN_SND_ECE)) + (tp->t_flags2 & TF2_ECN_SND_ECE)) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; + } #ifdef INET6 if (isipv6) { ip6->ip6_flow &= ~htonl(IPTOS_ECN_MASK << 20); diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -2000,6 +2000,23 @@ optp = (u_char *) (nth + 1); optm = m; } + } else { + /* + * Send out control packets with same IP ECN header + */ + if (V_tcp_ecn_generalized && + ((V_tcp_do_ecn == 1) || + (V_tcp_do_ecn == 3) || + ((tp != NULL) && + (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT))))) { + if ((tp != NULL) && (tp->t_flags2 & TF2_ECN_USE_ECT1)) { + ect = IPTOS_ECN_ECT1; + TCPSTAT_INC(tcps_ecn_sndect1); + } else { + ect = IPTOS_ECN_ECT0; + TCPSTAT_INC(tcps_ecn_sndect0); + } + } } if (incl_opts) { /* Timestamps. 
*/ @@ -2034,12 +2051,10 @@ ip6->ip6_nxt = IPPROTO_TCP; ip6->ip6_plen = htons(tlen - sizeof(*ip6)); } -#endif -#if defined(INET) && defined(INET6) else #endif -#ifdef INET { +#ifdef INET if (uh) { ulen = tlen - sizeof(struct ip); uh->uh_ulen = htons(ulen); @@ -2060,8 +2075,8 @@ } if (V_path_mtu_discovery) ip->ip_off |= htons(IP_DF); - } #endif + } m->m_pkthdr.len = tlen; m->m_pkthdr.rcvif = NULL; #ifdef MAC diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -129,7 +129,7 @@ static void syncache_drop(struct syncache *, struct syncache_head *); static void syncache_free(struct syncache *); static void syncache_insert(struct syncache *, struct syncache_head *); -static int syncache_respond(struct syncache *, const struct mbuf *, int); +static int syncache_respond(struct syncache *, const struct mbuf *, uint16_t); static struct socket *syncache_socket(struct syncache *, struct socket *, struct mbuf *m); static void syncache_timeout(struct syncache *sc, struct syncache_head *sch, @@ -1798,14 +1798,15 @@ * i.e. m0 != NULL, or upon 3WHS ACK timeout, i.e. m0 == NULL. 
*/ static int -syncache_respond(struct syncache *sc, const struct mbuf *m0, int flags) +syncache_respond(struct syncache *sc, const struct mbuf *m0, uint16_t flags) { struct ip *ip = NULL; struct mbuf *m; struct tcphdr *th = NULL; struct udphdr *udp = NULL; int optlen, error = 0; /* Make compiler happy */ - u_int16_t hlen, tlen, mssopt, ulen; + uint16_t hlen, tlen, mssopt, ulen; + int ect; struct tcpopt to; #ifdef INET6 struct ip6_hdr *ip6 = NULL; @@ -1870,12 +1871,10 @@ } ip6->ip6_flow |= htonl(sc->sc_ip_tos << IPV6_FLOWLABEL_LEN); } -#endif -#if defined(INET6) && defined(INET) else #endif -#ifdef INET { +#ifdef INET ip = mtod(m, struct ip *); ip->ip_v = IPVERSION; ip->ip_hl = sizeof(struct ip) >> 2; @@ -1908,8 +1907,8 @@ ulen = (tlen - sizeof(struct ip)); th = (struct tcphdr *)(udp + 1); } - } #endif /* INET */ + } th->th_sport = sc->sc_inc.inc_lport; th->th_dport = sc->sc_inc.inc_fport; @@ -1922,7 +1921,17 @@ th->th_win = htons(sc->sc_wnd); th->th_urp = 0; - flags = tcp_ecn_syncache_respond(flags, sc); + ect = tcp_ecn_syncache_respond(&flags, sc); +#ifdef INET6 + if (sc->sc_inc.inc_flags & INC_ISIPV6) { + ip6->ip6_flow |= htonl(ect << 20); + } else +#endif +#ifdef INET + { + ip->ip_tos |= ect; + } +#endif tcp_set_flags(th, flags); /* Tack on the TCP options. */ @@ -2024,12 +2033,10 @@ TCP_PROBE5(send, NULL, NULL, ip6, NULL, th); error = ip6_output(m, NULL, NULL, 0, NULL, NULL, NULL); } -#endif -#if defined(INET6) && defined(INET) else #endif -#ifdef INET { +#ifdef INET if (sc->sc_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); @@ -2053,8 +2060,8 @@ #endif TCP_PROBE5(send, NULL, NULL, ip, NULL, th); error = ip_output(m, sc->sc_ipopts, NULL, 0, NULL, NULL); - } #endif + } return (error); } diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -3010,6 +3010,10 @@ db_printf("%sTF2_ACE_PERMIT", comma ? 
", " : ""); comma = 1; } + if (t_flags2 & TF2_ECN_PLUSPLUS) { + db_printf("%sTF2_ECN_PLUSPLUS", comma ? ", " : ""); + comma = 1; + } if (t_flags2 & TF2_FBYTES_COMPLETE) { db_printf("%sTF2_FBYTES_COMPLETE", comma ? ", " : ""); comma = 1; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -843,6 +843,7 @@ #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ +#define TF2_ECN_PLUSPLUS 0x00100000 /* ECN++ session */ /* * Structure to hold TCP options that are only used during segment @@ -1273,6 +1274,7 @@ VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); +VNET_DECLARE(int, tcp_ecn_generalized); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, tcp_insecure_syn); @@ -1319,6 +1321,7 @@ #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) +#define V_tcp_ecn_generalized VNET(tcp_ecn_generalized) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn)