diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
 .\"     From: @(#)tcp.4	8.1 (Berkeley) 6/5/93
 .\" $FreeBSD$
 .\"
-.Dd June 6, 2023
+.Dd June 16, 2023
 .Dt TCP 4
 .Os
 .Sh NAME
@@ -520,6 +520,9 @@
 specific connection.
 This is needed to help with connection establishment
 when a broken firewall is in the network path.
+.It Va ecn.option
+Reflect back the number of received bytes with a particular ECN marking
+by using the Accurate ECN TCP option on each outgoing packet.
 .It Va fast_finwait2_recycle
 Recycle
 .Tn TCP
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -107,6 +107,10 @@
 #define	   TCPOLEN_SIGNATURE		18
 #define	TCPOPT_FAST_OPEN	34
 #define	   TCPOLEN_FAST_OPEN_EMPTY	2
+#define	TCPOPT_ACCECN0		0xAC
+#define	TCPOPT_ACCECN1		0xAE
+#define	   TCPOLEN_ACCECN_EMPTY		2
+#define	   TCPOLEN_ACCECN_COUNTER	3
 
 #define	MAX_TCPOPTLEN		40	/* Absolute maximum TCP options len */
 
@@ -417,12 +421,12 @@
 	/* Accurate ECN counters. */
 	u_int32_t	tcpi_delivered_ce;
 	u_int32_t	tcpi_received_ce;		/* # of CE marks received */
-	u_int32_t	__tcpi_delivered_e1_bytes;
-	u_int32_t	__tcpi_delivered_e0_bytes;
-	u_int32_t	__tcpi_delivered_ce_bytes;
-	u_int32_t	__tcpi_received_e1_bytes;
-	u_int32_t	__tcpi_received_e0_bytes;
-	u_int32_t	__tcpi_received_ce_bytes;
+	u_int32_t	tcpi_delivered_e1_bytes;
+	u_int32_t	tcpi_delivered_e0_bytes;
+	u_int32_t	tcpi_delivered_ce_bytes;
+	u_int32_t	tcpi_received_e1_bytes;
+	u_int32_t	tcpi_received_e0_bytes;
+	u_int32_t	tcpi_received_ce_bytes;
 
 	u_int32_t	tcpi_total_tlp;		/* tail loss probes sent */
 	u_int64_t	tcpi_total_tlp_bytes;	/* tail loss probe bytes sent */
diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h
--- a/sys/netinet/tcp_ecn.h
+++ b/sys/netinet/tcp_ecn.h
@@ -51,6 +51,48 @@
 uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *);
 int	 tcp_ecn_get_ace(uint16_t);
 
+/*
+ * Store the low 24 bits of v, most significant byte first, at *p and
+ * advance *p past the three bytes written.
+ */
+static inline void
+hton24(u_char **p, uint32_t v)
+{
+	*(*p)++ = (u_char)(v >> 16);
+	*(*p)++ = (u_char)(v >> 8);
+	*(*p)++ = (u_char)(v);
+}
+
+/*
+ * Read a 24-bit value, most significant byte first, starting at p.
+ */
+static inline uint32_t
+ntoh24(const u_char *p)
+{
+	uint32_t v;
+
+	v = (uint32_t)p[0] << 16;
+	v |= (uint32_t)p[1] << 8;
+	v |= (uint32_t)p[2];
+	return (v);
+}
+
+/*
+ * Fold a 24-bit AccECN option counter received from the peer into the
+ * 32-bit local copy *cnt.  The difference is taken modulo 2^24 so that the
+ * wire counter may wrap; a difference in the upper half of the 24-bit
+ * range is treated as stale (reordered) feedback and ignored.
+ */
+static inline void
+tcp_accecn_sync24(uint32_t *cnt, uint32_t wire)
+{
+	uint32_t delta;
+
+	delta = (wire - *cnt) & 0xFFFFFF;
+	if (delta < 0x800000)
+		*cnt += delta;
+}
+
 #endif /* _KERNEL */
 
 #endif /* _NETINET_TCP_ECN_H_ */
diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c
--- a/sys/netinet/tcp_ecn.c
+++ b/sys/netinet/tcp_ecn.c
@@ -114,13 +114,17 @@
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
     "Max retries before giving up on ECN");
 
+VNET_DEFINE(int, tcp_ecn_option) = 0;
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, option,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_option), 0,
+    "Use AccECN TCP option");
+
 /*
  * Process incoming SYN,ACK packet
  */
 void
 tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
 {
-
 	if (V_tcp_do_ecn == 0)
 		return;
 	if ((V_tcp_do_ecn == 1) ||
@@ -298,11 +302,23 @@
 		TCPSTAT_INC(tcps_ecn_rcvect1);
 		break;
 	}
-
 	if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
 		if (tp->t_flags2 & TF2_ACE_PERMIT) {
-			if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
-				tp->t_rcep += 1;
+			switch (iptos & IPTOS_ECN_MASK) {
+			case IPTOS_ECN_CE:
+				tp->t_flags2 |= TF2_ACO_CE;
+				tp->t_ae.rceb += tlen;
+				tp->t_rcep++;
+				break;
+			case IPTOS_ECN_ECT0:
+				tp->t_flags2 |= TF2_ACO_E0;
+				tp->t_ae.re0b += tlen;
+				break;
+			case IPTOS_ECN_ECT1:
+				tp->t_flags2 |= TF2_ACO_E1;
+				tp->t_ae.re1b += tlen;
+				break;
+			}
 			if (tp->t_flags2 & TF2_ECN_PERMIT) {
 				delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
 				    (tp->t_scep & 7)) & 7;
@@ -450,7 +466,6 @@
 		if (tp->t_flags2 & TF2_ECN_SND_ECE)
 			*thflags |= TH_ECE;
 	}
-
 	return ipecn;
 }
 
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -1003,6 +1003,8 @@
 	}
 
 	tp = intotcpcb(inp);
+	to.to_ae = &tp->t_ae;
+
 	switch (tp->t_state) {
 	case TCPS_TIME_WAIT:
 		/*
@@ -1542,6 +1544,7 @@
 	thflags = tcp_get_flags(th);
 	tp->sackhint.last_sack_ack = 0;
 	sack_changed = 0;
+	to.to_ae = &tp->t_ae;
 	nsegs = max(1, m->m_pkthdr.lro_nsegs);
 
 	NET_EPOCH_ASSERT();
@@ -3522,6 +3525,33 @@
 			to->to_tfo_len = optlen - 2;
 			to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
 			break;
+		case TCPOPT_ACCECN0:
+		case TCPOPT_ACCECN1:
+			/*
+			 * Sync the peer's AccECN byte counters.  Kind 0 maps
+			 * the fields to se0b, sceb, se1b; kind 1 maps them to
+			 * se1b, sceb, se0b.  Trailing fields may be absent.
+			 * Callers must set to_ae (or NULL) beforehand.
+			 */
+			if (to->to_ae == NULL)
+				break;
+			if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+			    1 * TCPOLEN_ACCECN_COUNTER))
+				tcp_accecn_sync24(opt == TCPOPT_ACCECN0 ?
+				    &to->to_ae->se0b : &to->to_ae->se1b,
+				    ntoh24(cp + TCPOLEN_ACCECN_EMPTY));
+			if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+			    2 * TCPOLEN_ACCECN_COUNTER))
+				tcp_accecn_sync24(&to->to_ae->sceb,
+				    ntoh24(cp + TCPOLEN_ACCECN_EMPTY +
+				    1 * TCPOLEN_ACCECN_COUNTER));
+			if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+			    3 * TCPOLEN_ACCECN_COUNTER))
+				tcp_accecn_sync24(opt == TCPOPT_ACCECN0 ?
+				    &to->to_ae->se1b : &to->to_ae->se0b,
+				    ntoh24(cp + TCPOLEN_ACCECN_EMPTY +
+				    2 * TCPOLEN_ACCECN_COUNTER));
+			break;
 		default:
 			continue;
 		}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -880,9 +880,35 @@
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif /* TCP_SIGNATURE */
-
+		/*
+		 * AccECN option
+		 * Don't send on <SYN>, only on <SYN,ACK> or <ACK>
+		 * when doing an AccECN session
+		 */
+		if (V_tcp_ecn_option &&
+		    ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) &&
+		    ((tp->t_flags2 & TF2_ACE_PERMIT) ||
+		    ((flags & TH_SYN) && (flags & TH_ACK)))) {
+			to.to_flags |= TOF_ACCECNOPT;
+			to.to_ae = &tp->t_ae;
+			to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+			    ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+			    ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+			if (flags & TH_SYN)
+				to.to_flags |= TOF_ACCE_SYN;
+			if (tp->t_flags & TF_ACKNOW)
+				to.to_flags |= TOF_ACCE_ACKNOW;
+		}
 		/* Processing the options. */
 		hdrlen += optlen = tcp_addoptions(&to, opt);
+		if (to.to_flags & TOF_ACCECNOPT) {
+			if ((to.to_flags & TOF_ACCE_E0) == 0)
+				tp->t_flags2 &= ~TF2_ACO_E0;
+			if ((to.to_flags & TOF_ACCE_E1) == 0)
+				tp->t_flags2 &= ~TF2_ACO_E1;
+			if ((to.to_flags & TOF_ACCE_CE) == 0)
+				tp->t_flags2 &= ~TF2_ACO_CE;
+		}
 		/*
 		 * If we wanted a TFO option to be added, but it was unable
 		 * to fit, ensure no data is sent.
@@ -1921,6 +1947,83 @@
 			optlen += total_len;
 			break;
 			}
+		case TOF_ACCECNOPT:
+			{
+			int max_len = TCP_MAXOLEN - optlen;
+			int ae_len;
+
+			if (max_len < TCPOLEN_ACCECN_EMPTY) {
+				to->to_flags &= ~TOF_ACCECNOPT;
+				continue;
+			}
+			if (max_len < (TCPOLEN_ACCECN_EMPTY +
+			    1 * TCPOLEN_ACCECN_COUNTER)) {
+				/* Only the empty option fits; SYN only. */
+				if (to->to_flags & TOF_ACCE_SYN) {
+					*optp++ = TCPOPT_ACCECN0;
+					*optp++ = TCPOLEN_ACCECN_EMPTY;
+					optlen += TCPOLEN_ACCECN_EMPTY;
+					continue;
+				} else {
+					to->to_flags &= ~TOF_ACCECNOPT;
+					continue;
+				}
+			}
+			/* Carry as many of the three counters as fit. */
+			if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+			    3 * TCPOLEN_ACCECN_COUNTER))
+				ae_len = TCPOLEN_ACCECN_EMPTY +
+				    3 * TCPOLEN_ACCECN_COUNTER;
+			else if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+			    2 * TCPOLEN_ACCECN_COUNTER))
+				ae_len = TCPOLEN_ACCECN_EMPTY +
+				    2 * TCPOLEN_ACCECN_COUNTER;
+			else
+				ae_len = TCPOLEN_ACCECN_EMPTY +
+				    1 * TCPOLEN_ACCECN_COUNTER;
+			*optp++ = (to->to_flags & TOF_ACCE_E1) ?
+			    TCPOPT_ACCECN1 : TCPOPT_ACCECN0;
+			*optp++ = ae_len;
+			/*
+			 * Account for the option in the returned length;
+			 * the counters written below are covered by ae_len.
+			 */
+			optlen += ae_len;
+			if (to->to_flags & TOF_ACCE_E1) {
+				hton24(&optp, to->to_ae->re1b);
+			} else {
+				hton24(&optp, to->to_ae->re0b);
+				to->to_flags &= ~TOF_ACCE_E0;
+			}
+			if (max_len < (TCPOLEN_ACCECN_EMPTY +
+			    2 * TCPOLEN_ACCECN_COUNTER)) {
+				to->to_flags &= ~TOF_ACCE_E1;
+				continue;
+			}
+			hton24(&optp, to->to_ae->rceb);
+			to->to_flags &= ~TOF_ACCE_CE;
+			if (max_len < (TCPOLEN_ACCECN_EMPTY +
+			    3 * TCPOLEN_ACCECN_COUNTER)) {
+				to->to_flags &= ~TOF_ACCE_E1;
+				continue;
+			}
+			/*
+			 * TCP option sufficient to hold full AccECN option
+			 * but only send changed counters normally,
+			 * full counters on ACKNOW
+			 * NOTE(review): TOF_ACCE_ACKNOW is not tested here;
+			 * every counter that fits is always written.
+			 */
+			if (to->to_flags & TOF_ACCE_E1) {
+				hton24(&optp, to->to_ae->re0b);
+				to->to_flags &= ~TOF_ACCE_E0;
+				to->to_flags &= ~TOF_ACCE_E1;
+				continue;
+			} else {
+				hton24(&optp, to->to_ae->re1b);
+				continue;
+			}
+			}
 		default:
 			panic("%s: unknown TCP option type", __func__);
 			break;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1810,7 +1810,6 @@
 #ifdef INVARIANTS
 	int thflags = tcp_get_flags(th);
 #endif
-
 	KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
 	NET_EPOCH_ASSERT();
 
@@ -2018,9 +2017,26 @@
 		if (tp->t_flags & TF_SIGNATURE)
 			to.to_flags |= TOF_SIGNATURE;
 #endif
+		/* AccECN option */
+		if (V_tcp_ecn_option &&
+		    ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) &&
+		    (tp->t_flags2 & TF2_ACE_PERMIT)) {
+			to.to_flags |= TOF_ACCECNOPT;
+			to.to_ae = &tp->t_ae;
+			to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+			    ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+			    ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+		}
 		/* Add the options. */
 		tlen += optlen = tcp_addoptions(&to, optp);
-
+		if (to.to_flags & TOF_ACCECNOPT) {
+			if ((to.to_flags & TOF_ACCE_E0) == 0)
+				tp->t_flags2 &= ~TF2_ACO_E0;
+			if ((to.to_flags & TOF_ACCE_E1) == 0)
+				tp->t_flags2 &= ~TF2_ACO_E1;
+			if ((to.to_flags & TOF_ACCE_CE) == 0)
+				tp->t_flags2 &= ~TF2_ACO_CE;
+		}
 		/* Update m_len in the correct mbuf. */
 		optm->m_len += optlen;
 	} else
@@ -2335,6 +2351,14 @@
 	tcp_log_tcpcbinit(tp);
 #endif
 	tp->t_pacing_rate = -1;
+	if (V_tcp_do_lrd)
+		tp->t_flags |= TF_LRD;
+	tp->t_ae.re0b = 1;
+	tp->t_ae.re1b = 1;
+	tp->t_ae.rceb = 0;
+	tp->t_ae.se0b = 1;
+	tp->t_ae.se1b = 1;
+	tp->t_ae.sceb = 0;
 	if (tp->t_fb->tfb_tcp_fb_init) {
 		if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
 			refcount_release(&tp->t_fb->tfb_refcnt);
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1825,6 +1825,7 @@
 #ifdef INET6
 	struct ip6_hdr *ip6 = NULL;
 #endif
+	struct accecn ae;
 
 	NET_EPOCH_ASSERT();
 
@@ -1964,6 +1965,20 @@
 				/* don't send cookie again when retransmitting response */
 				sc->sc_tfo_cookie = NULL;
 			}
+			if (V_tcp_ecn_option)
+				to.to_flags |= TOF_ACCE_SYN;
+		}
+		if (V_tcp_ecn_option &&
+		    (sc->sc_flags & SCF_ECN_MASK) &&
+		    ((sc->sc_flags & SCF_ECN_MASK) != SCF_ECN)) {
+			to.to_flags |= TOF_ACCECNOPT;
+			to.to_flags |= TOF_ACCE_E0 |
+			    TOF_ACCE_E1 |
+			    TOF_ACCE_CE;
+			ae.re0b = 1;
+			ae.re1b = 1;
+			ae.rceb = 0;
+			to.to_ae = &ae;
 		}
 		if (sc->sc_flags & SCF_TIMESTAMP) {
 			to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1590,15 +1590,24 @@
 	 * AccECN related counters.
 	 */
 	if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
-	    (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
+	    (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
 		/*
 		 * Internal counter starts at 5 for AccECN
 		 * but 0 for RFC3168 ECN.
 		 */
 		ti->tcpi_delivered_ce = tp->t_scep - 5;
-	else
+		ti->tcpi_received_ce = tp->t_rcep - 5;
+	} else {
 		ti->tcpi_delivered_ce = tp->t_scep;
-	ti->tcpi_received_ce = tp->t_rcep;
+		ti->tcpi_received_ce = tp->t_rcep;
+	}
+	/* The E0/E1 byte counters are initialised to 1, CE to 0. */
+	ti->tcpi_received_e0_bytes = tp->t_ae.re0b - 1;
+	ti->tcpi_received_e1_bytes = tp->t_ae.re1b - 1;
+	ti->tcpi_received_ce_bytes = tp->t_ae.rceb;
+	ti->tcpi_delivered_e0_bytes = tp->t_ae.se0b - 1;
+	ti->tcpi_delivered_e1_bytes = tp->t_ae.se1b - 1;
+	ti->tcpi_delivered_ce_bytes = tp->t_ae.sceb;
 }
 
 /*
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -131,6 +131,15 @@
 	uint32_t prr_out;	/* Bytes sent during IN_RECOVERY */
 };
 
+struct accecn {
+	uint32_t re0b;	/* Number of ECT0 marked data bytes */
+	uint32_t re1b;	/* Number of ECT1 marked data bytes */
+	uint32_t rceb;	/* Number of CE marked data bytes */
+	uint32_t se0b;	/* Synced number of delivered ECT0 bytes */
+	uint32_t se1b;	/* Synced number of delivered ECT1 bytes */
+	uint32_t sceb;	/* Synced number of delivered CE bytes */
+};
+
 #define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
 
 STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
@@ -435,6 +444,7 @@
 	int t_loglimit;			/* Maximum number of log entries */
 	uint32_t t_rcep;		/* Number of received CE marked pkts */
 	uint32_t t_scep;		/* Synced number of delivered CE pkts */
+	struct accecn t_ae;		/* AccECN related byte counters */
 	int64_t	t_pacing_rate;		/* bytes / sec, -1 => unlimited */
 	struct tcp_log_stailq t_logs;	/* Log buffer */
 	struct tcp_log_id_node *t_lin;
@@ -847,7 +857,9 @@
 #define	TF2_MBUF_QUEUE_READY	0x00020000 /* Inputs can be queued */
 #define	TF2_DONT_SACK_QUEUE	0x00040000 /* Don't wake on sack */
 #define	TF2_CANNOT_DO_ECN	0x00080000 /* The stack does not do ECN */
-
+#define	TF2_ACO_E0		0x00100000 /* EE0 counter changed */
+#define	TF2_ACO_E1		0x00200000 /* EE1 counter changed */
+#define	TF2_ACO_CE		0x00400000 /* ECE counter changed */
 /*
  * Structure to hold TCP options that are only used during segment
  * processing (in tcp_input), but not held in the tcpcb.
@@ -858,14 +870,21 @@
  */
 struct tcpopt {
 	u_int32_t	to_flags;	/* which options are present */
-#define	TOF_MSS		0x0001		/* maximum segment size */
-#define	TOF_SCALE	0x0002		/* window scaling */
-#define	TOF_SACKPERM	0x0004		/* SACK permitted */
-#define	TOF_TS		0x0010		/* timestamp */
-#define	TOF_SIGNATURE	0x0040		/* TCP-MD5 signature option (RFC2385) */
-#define	TOF_SACK	0x0080		/* Peer sent SACK option */
-#define	TOF_FASTOPEN	0x0100		/* TCP Fast Open (TFO) cookie */
-#define	TOF_MAXOPT	0x0200
+#define	TOF_MSS		0x00000001	/* maximum segment size */
+#define	TOF_SCALE	0x00000002	/* window scaling */
+#define	TOF_SACKPERM	0x00000004	/* SACK permitted */
+#define	TOF_TS		0x00000010	/* timestamp */
+#define	TOF_SIGNATURE	0x00000040	/* TCP-MD5 signature option (RFC2385) */
+#define	TOF_SACK	0x00000080	/* Peer sent SACK option */
+#define	TOF_FASTOPEN	0x00000100	/* TCP Fast Open (TFO) cookie */
+#define	TOF_ACCECNOPT	0x00000200	/* AccECN Option */
+#define	TOF_MAXOPT	0x00000400
+	/* Keep internal flags above TOF_MAXOPT */
+#define	TOF_ACCE_SYN	0x80000000	/* send empty option */
+#define	TOF_ACCE_CE	0x40000000	/* CE counter changed */
+#define	TOF_ACCE_E0	0x20000000	/* E0 counter changed */
+#define	TOF_ACCE_E1	0x10000000	/* E1 counter changed */
+#define	TOF_ACCE_ACKNOW	0x08000000	/* send full option */
 	u_int32_t	to_tsval;	/* new timestamp */
 	u_int32_t	to_tsecr;	/* reflected timestamp */
 	u_char		*to_sacks;	/* pointer to the first SACK blocks */
@@ -875,7 +894,8 @@
 	u_int8_t	to_wscale;	/* window scaling */
 	u_int8_t	to_nsacks;	/* number of SACK blocks */
 	u_int8_t	to_tfo_len;	/* TFO cookie length */
-	u_int32_t	to_spare;	/* UTO */
+	struct accecn	*to_ae;		/* pointer to AccECN byte counters */
+	u_int32_t	to_spare;	/* UTO */
 };
 
 /*
@@ -1277,6 +1297,7 @@
 VNET_DECLARE(int, tcp_do_sack);
 VNET_DECLARE(int, tcp_do_tso);
 VNET_DECLARE(int, tcp_ecn_maxretries);
+VNET_DECLARE(int, tcp_ecn_option);
 VNET_DECLARE(int, tcp_initcwnd_segments);
 VNET_DECLARE(int, tcp_insecure_rst);
 VNET_DECLARE(int, tcp_insecure_syn);
@@ -1324,6 +1345,7 @@
 #define	V_tcp_do_sack			VNET(tcp_do_sack)
 #define	V_tcp_do_tso			VNET(tcp_do_tso)
 #define	V_tcp_ecn_maxretries		VNET(tcp_ecn_maxretries)
+#define	V_tcp_ecn_option		VNET(tcp_ecn_option)
 #define	V_tcp_initcwnd_segments		VNET(tcp_initcwnd_segments)
 #define	V_tcp_insecure_rst		VNET(tcp_insecure_rst)
 #define	V_tcp_insecure_syn		VNET(tcp_insecure_syn)