diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4 --- a/share/man/man4/tcp.4 +++ b/share/man/man4/tcp.4 @@ -31,7 +31,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd November 30, 2023 +.Dd January 17, 2023 .Dt TCP 4 .Os .Sh NAME @@ -504,6 +504,9 @@ specific connection. This is needed to help with connection establishment when a broken firewall is in the network path. +.It Va ecn.option +Reflect back the number of received bytes with a particular ECN marking +by using the Accurate ECN TCP option on each outgoing packet. .It Va fast_finwait2_recycle Recycle .Tn TCP diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -121,6 +121,10 @@ #define TCPOLEN_SIGNATURE 18 #define TCPOPT_FAST_OPEN 34 #define TCPOLEN_FAST_OPEN_EMPTY 2 +#define TCPOPT_ACCECN0 0xAC +#define TCPOPT_ACCECN1 0xAE +#define TCPOLEN_ACCECN_EMPTY 2 +#define TCPOLEN_ACCECN_COUNTER 3 #define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */ @@ -431,12 +435,12 @@ /* Accurate ECN counters. 
*/ u_int32_t tcpi_delivered_ce; u_int32_t tcpi_received_ce; /* # of CE marks received */ - u_int32_t __tcpi_delivered_e1_bytes; - u_int32_t __tcpi_delivered_e0_bytes; - u_int32_t __tcpi_delivered_ce_bytes; - u_int32_t __tcpi_received_e1_bytes; - u_int32_t __tcpi_received_e0_bytes; - u_int32_t __tcpi_received_ce_bytes; + u_int32_t tcpi_delivered_e1_bytes; + u_int32_t tcpi_delivered_e0_bytes; + u_int32_t tcpi_delivered_ce_bytes; + u_int32_t tcpi_received_e1_bytes; + u_int32_t tcpi_received_e0_bytes; + u_int32_t tcpi_received_ce_bytes; u_int32_t tcpi_total_tlp; /* tail loss probes sent */ u_int64_t tcpi_total_tlp_bytes; /* tail loss probe bytes sent */ diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h --- a/sys/netinet/tcp_ecn.h +++ b/sys/netinet/tcp_ecn.h @@ -48,6 +48,26 @@ uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *); int tcp_ecn_get_ace(uint16_t); +/* Store the low 24 bits of v at *p, most significant byte first, and advance *p by 3. */ +static inline void hton24(u_char **p, uint32_t v) +{ + *(*p)++ = (u_char)(v >> 16); + *(*p)++ = (u_char)(v >> 8); + *(*p)++ = (u_char)(v); +} + +/* Read the 3 bytes at p, most significant byte first, and return them as a 24-bit value. */ +static inline uint32_t ntoh24(const u_char *p) +{ + uint32_t v; + + v = (uint32_t)p[0] << 16; + v |= (uint32_t)p[1] << 8; + v |= (uint32_t)p[2]; + return (v); +} + + #endif /* _KERNEL */ #endif /* _NETINET_TCP_ECN_H_ */ diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c --- a/sys/netinet/tcp_ecn.c +++ b/sys/netinet/tcp_ecn.c @@ -110,13 +110,17 @@ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0, "Max retries before giving up on ECN"); +VNET_DEFINE(int, tcp_ecn_option) = 0; +SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, option, + CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_option), 0, + "Use AccECN TCP option"); + /* * Process incoming SYN,ACK packet */ void tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos) { - if (V_tcp_do_ecn == 0) return; if ((V_tcp_do_ecn == 1) || @@ -149,7 +153,9 @@ case (0|TH_CWR|0): tp->t_flags2 |= TF2_ACE_PERMIT; tp->t_flags2 &= ~TF2_ECN_PERMIT; - tp->t_scep = 5; + if (V_tcp_ecn_option) + 
tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 5; TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_nect); break; @@ -157,7 +163,9 @@ case (TH_AE|0|0): tp->t_flags2 |= TF2_ACE_PERMIT; tp->t_flags2 &= ~TF2_ECN_PERMIT; - tp->t_scep = 5; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 5; TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_ect0); break; @@ -165,7 +173,9 @@ case (0|TH_CWR|TH_ECE): tp->t_flags2 |= TF2_ACE_PERMIT; tp->t_flags2 &= ~TF2_ECN_PERMIT; - tp->t_scep = 5; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 5; TCPSTAT_INC(tcps_ecn_shs); TCPSTAT_INC(tcps_ace_ect1); break; @@ -173,7 +183,9 @@ case (TH_AE|TH_CWR|0): tp->t_flags2 |= TF2_ACE_PERMIT; tp->t_flags2 &= ~TF2_ECN_PERMIT; - tp->t_scep = 6; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 6; /* * reduce the IW to 2 MSS (to * account for delayed acks) if @@ -196,16 +208,16 @@ */ switch (iptos & IPTOS_ECN_MASK) { case (IPTOS_ECN_NOTECT): - tp->t_rcep = 0b010; + tp->t_ae.rcep = 0b010; break; case (IPTOS_ECN_ECT0): - tp->t_rcep = 0b100; + tp->t_ae.rcep = 0b100; break; case (IPTOS_ECN_ECT1): - tp->t_rcep = 0b011; + tp->t_ae.rcep = 0b011; break; case (IPTOS_ECN_CE): - tp->t_rcep = 0b110; + tp->t_ae.rcep = 0b110; break; } } @@ -248,6 +260,8 @@ case (TH_AE|TH_CWR|TH_ECE): tp->t_flags2 |= TF2_ACE_PERMIT; tp->t_flags2 &= ~TF2_ECN_PERMIT; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; TCPSTAT_INC(tcps_ecn_shs); /* * Set the AccECN Codepoints on @@ -258,16 +272,16 @@ */ switch (iptos & IPTOS_ECN_MASK) { case (IPTOS_ECN_NOTECT): - tp->t_rcep = 0b010; + tp->t_ae.rcep = 0b010; break; case (IPTOS_ECN_ECT0): - tp->t_rcep = 0b100; + tp->t_ae.rcep = 0b100; break; case (IPTOS_ECN_ECT1): - tp->t_rcep = 0b011; + tp->t_ae.rcep = 0b011; break; case (IPTOS_ECN_CE): - tp->t_rcep = 0b110; + tp->t_ae.rcep = 0b110; break; } break; @@ -294,18 +308,31 @@ TCPSTAT_INC(tcps_ecn_rcvect1); break; } - if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) 
{ if (tp->t_flags2 & TF2_ACE_PERMIT) { - if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) - tp->t_rcep += 1; + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->t_flags2 |= TF2_ACO_CE; + tp->t_ae.rceb += tlen; + tp->t_ae.rcep++; + break; + case IPTOS_ECN_ECT0: + tp->t_flags2 |= TF2_ACO_E0; + tp->t_ae.re0b += tlen; + break; + case IPTOS_ECN_ECT1: + tp->t_flags2 |= TF2_ACO_E1; + tp->t_ae.re1b += tlen; + break; + } if (tp->t_flags2 & TF2_ECN_PERMIT) { delta_cep = (tcp_ecn_get_ace(thflags) + 8 - - (tp->t_scep & 7)) & 7; + (tp->t_ae.scep & 7)) & 7; if (delta_cep < pkts) delta_cep = pkts - ((pkts - delta_cep) & 7); - tp->t_scep += delta_cep; + tp->t_ae.scep += delta_cep; + tp->t_ae.dcep = delta_cep; } else { /* * process the final ACK of the 3WHS @@ -320,16 +347,16 @@ /* FALLTHROUGH */ case 0b100: /* ECT0 SYN or SYN,ACK */ - tp->t_scep = 5; + tp->t_ae.scep = 5; break; case 0b110: /* CE SYN or SYN,ACK */ - tp->t_scep = 6; + tp->t_ae.scep = 6; tp->snd_cwnd = 2 * tcp_maxseg(tp); break; default: /* mangled AccECN handshake */ - tp->t_scep = 5; + tp->t_ae.scep = 5; break; } tp->t_flags2 |= TF2_ECN_PERMIT; @@ -338,7 +365,7 @@ /* RFC3168 ECN handling */ if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) { delta_cep = 1; - tp->t_scep++; + tp->t_ae.scep++; } if (thflags & TH_CWR) { tp->t_flags2 &= ~TF2_ECN_SND_ECE; @@ -419,21 +446,21 @@ */ if (tp->t_flags2 & TF2_ACE_PERMIT) { *thflags &= ~(TH_AE|TH_CWR|TH_ECE); - if (tp->t_rcep & 0x01) + if (tp->t_ae.rcep & 0x01) *thflags |= TH_ECE; - if (tp->t_rcep & 0x02) + if (tp->t_ae.rcep & 0x02) *thflags |= TH_CWR; - if (tp->t_rcep & 0x04) + if (tp->t_ae.rcep & 0x04) *thflags |= TH_AE; if (!(tp->t_flags2 & TF2_ECN_PERMIT)) { /* * here we process the final * ACK of the 3WHS */ - if (tp->t_rcep == 0b110) { - tp->t_rcep = 6; + if (tp->t_ae.rcep == 0b110) { + tp->t_ae.rcep = 6; } else { - tp->t_rcep = 5; + tp->t_ae.rcep = 5; } tp->t_flags2 |= TF2_ECN_PERMIT; } @@ -446,7 +473,6 @@ if (tp->t_flags2 & TF2_ECN_SND_ECE) *thflags |= TH_ECE; } - 
return ipecn; } @@ -468,13 +494,17 @@ /* FALLTHROUGH */ case SCF_ACE_1: tp->t_flags2 |= TF2_ACE_PERMIT; - tp->t_scep = 5; - tp->t_rcep = 5; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 5; + tp->t_ae.rcep = 5; break; case SCF_ACE_CE: tp->t_flags2 |= TF2_ACE_PERMIT; - tp->t_scep = 6; - tp->t_rcep = 6; + if (V_tcp_ecn_option) + tp->t_flags |= TF_ACCECN_OPT; + tp->t_ae.scep = 6; + tp->t_ae.rcep = 6; break; /* undefined SCF codepoint */ default: diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c --- a/sys/netinet/tcp_input.c +++ b/sys/netinet/tcp_input.c @@ -994,6 +994,8 @@ } tp = intotcpcb(inp); + to.to_ae = &tp->t_ae; + switch (tp->t_state) { case TCPS_TIME_WAIT: /* @@ -1520,7 +1522,7 @@ int acked, ourfinisacked, needoutput = 0; sackstatus_t sack_changed; int rstreason, todrop, win, incforsyn = 0; - uint32_t tiwin; + uint32_t tiwin, old_sceb; uint16_t nsegs; char *s; struct inpcb *inp = tptoinpcb(tp); @@ -1534,6 +1536,7 @@ thflags = tcp_get_flags(th); tp->sackhint.last_sack_ack = 0; sack_changed = SACK_NOCHANGE; + to.to_ae = &tp->t_ae; nsegs = max(1, m->m_pkthdr.lro_nsegs); NET_EPOCH_ASSERT(); @@ -1605,9 +1608,15 @@ /* * Parse options on any incoming segment. */ + old_sceb = tp->t_ae.sceb; tcp_dooptions(&to, (u_char *)(th + 1), (th->th_off << 2) - sizeof(struct tcphdr), (thflags & TH_SYN) ? TO_SYN : 0); + if ((to.to_flags & TOF_ACCE_CE) && + (tp->t_ae.dcep != 0) && + ((tp->t_ae.sceb - old_sceb) == 0)) + tp->t_ae.scep -= tp->t_ae.dcep; + #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if ((tp->t_flags & TF_SIGNATURE) != 0 && @@ -3443,7 +3452,7 @@ void tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags) { - int opt, optlen; + int opt, optlen, tmp; to->to_flags = 0; for (; cnt > 0; cnt -= optlen, cp += optlen) { @@ -3536,6 +3545,48 @@ to->to_tfo_len = optlen - 2; to->to_tfo_cookie = to->to_tfo_len ? 
cp + 2 : NULL; break; + case TCPOPT_ACCECN0: + case TCPOPT_ACCECN1: + to->to_flags |= TOF_ACCECNOPT; + if (optlen >= (TCPOLEN_ACCECN_EMPTY + + 1 * TCPOLEN_ACCECN_COUNTER)) { + tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 0); + if (opt == TCPOPT_ACCECN0) { + to->to_flags |= TOF_ACCE_E0; + tmp = (tmp - (to->to_ae->se0b & 0xFFFFFF)) & 0xFFFFFF; /* delta modulo 2^24: the wire counter wraps */ + if (tmp < 0x800000) /* upper half-range means stale or reordered feedback */ + to->to_ae->se0b += tmp; + } else { + to->to_flags |= TOF_ACCE_E1; + tmp = (tmp - (to->to_ae->se1b & 0xFFFFFF)) & 0xFFFFFF; + if (tmp < 0x800000) + to->to_ae->se1b += tmp; + } + } + if (optlen >= (TCPOLEN_ACCECN_EMPTY + + 2 * TCPOLEN_ACCECN_COUNTER)) { + to->to_flags |= TOF_ACCE_CE; + tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 3); + tmp = (tmp - (to->to_ae->sceb & 0xFFFFFF)) & 0xFFFFFF; + if (tmp < 0x800000) + to->to_ae->sceb += tmp; + } + if (optlen >= (TCPOLEN_ACCECN_EMPTY + + 3 * TCPOLEN_ACCECN_COUNTER)) { + tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 6); + if (opt == TCPOPT_ACCECN0) { + to->to_flags |= TOF_ACCE_E1; + tmp = (tmp - (to->to_ae->se1b & 0xFFFFFF)) & 0xFFFFFF; + if (tmp < 0x800000) + to->to_ae->se1b += tmp; + } else { + to->to_flags |= TOF_ACCE_E0; + tmp = (tmp - (to->to_ae->se0b & 0xFFFFFF)) & 0xFFFFFF; + if (tmp < 0x800000) + to->to_ae->se0b += tmp; + } + } + break; default: continue; } diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c --- a/sys/netinet/tcp_output.c +++ b/sys/netinet/tcp_output.c @@ -596,10 +596,14 @@ * Note: this may not work when tcp headers change * very dynamically in the future. */ - if ((((tp->t_flags & TF_SIGNATURE) ? + if ((min(TCP_MAXOLEN, + (((tp->t_flags & TF_SIGNATURE) ? PADTCPOLEN(TCPOLEN_SIGNATURE) : 0) + ((tp->t_flags & TF_RCVD_TSTMP) ? PADTCPOLEN(TCPOLEN_TIMESTAMP) : 0) + + ((tp->t_flags & TF_ACCECN_OPT) ? 
+ PADTCPOLEN(TCPOLEN_ACCECN_EMPTY + + 3 * TCPOLEN_ACCECN_COUNTER) : 0))) + len) >= tp->t_maxseg) goto send; /* @@ -876,9 +880,32 @@ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif /* TCP_SIGNATURE */ - + /* + * AccECN option + * Don't send on <SYN>, only on <SYN,ACK> or + * when doing an AccECN session + */ + if (tp->t_flags & TF_ACCECN_OPT) { + to.to_flags |= TOF_ACCECNOPT; + to.to_ae = &tp->t_ae; + to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) | + ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) | + ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0); + if (flags & TH_SYN) + to.to_flags |= TOF_ACCE_SYN; + if (tp->t_flags & TF_ACKNOW) + to.to_flags |= TOF_ACCE_ACKNOW; + } /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); + if (to.to_flags & TOF_ACCECNOPT) { + if ((to.to_flags & TOF_ACCE_E0) == 0) + tp->t_flags2 &= ~TF2_ACO_E0; + if ((to.to_flags & TOF_ACCE_E1) == 0) + tp->t_flags2 &= ~TF2_ACO_E1; + if ((to.to_flags & TOF_ACCE_CE) == 0) + tp->t_flags2 &= ~TF2_ACO_CE; + } /* * If we wanted a TFO option to be added, but it was unable * to fit, ensure no data is sent. @@ -1919,6 +1946,78 @@ optlen += total_len; break; } + case TOF_ACCECNOPT: + { + int tmp = 0; + int max_len = TCP_MAXOLEN - optlen; + if (max_len < TCPOLEN_ACCECN_EMPTY) { + to->to_flags &= ~TOF_ACCECNOPT; + continue; + } + if (max_len < (TCPOLEN_ACCECN_EMPTY + + 1 * TCPOLEN_ACCECN_COUNTER)) { + if (to->to_flags & TOF_ACCE_SYN) { + *optp++ = TCPOPT_ACCECN0; + optlen += TCPOLEN_ACCECN_EMPTY; + *optp++ = TCPOLEN_ACCECN_EMPTY; + continue; + } else { + to->to_flags &= ~TOF_ACCECNOPT; + continue; + } + } + *optp++ = (to->to_flags & TOF_ACCE_E1) ? 
+ TCPOPT_ACCECN1 : TCPOPT_ACCECN0; + if (max_len >= (TCPOLEN_ACCECN_EMPTY + + 3 * TCPOLEN_ACCECN_COUNTER)) { + tmp = TCPOLEN_ACCECN_EMPTY + + 3 * TCPOLEN_ACCECN_COUNTER; + } else + if (max_len >= (TCPOLEN_ACCECN_EMPTY + + 2 * TCPOLEN_ACCECN_COUNTER)) { + tmp = TCPOLEN_ACCECN_EMPTY + + 2 * TCPOLEN_ACCECN_COUNTER; + } else + if (max_len >= (TCPOLEN_ACCECN_EMPTY + + 1 * TCPOLEN_ACCECN_COUNTER)) { + tmp = TCPOLEN_ACCECN_EMPTY + + 1 * TCPOLEN_ACCECN_COUNTER; + } + *optp++ = tmp; + optlen += tmp; + if (to->to_flags & TOF_ACCE_E1) { + hton24(&optp, to->to_ae->re1b); + } else { + hton24(&optp, to->to_ae->re0b); + to->to_flags &= ~TOF_ACCE_E0; + } + if (max_len < (TCPOLEN_ACCECN_EMPTY + + 2 * TCPOLEN_ACCECN_COUNTER)) { + to->to_flags &= ~TOF_ACCE_E1; + continue; + } + hton24(&optp, to->to_ae->rceb); + to->to_flags &= ~TOF_ACCE_CE; + if (max_len < (TCPOLEN_ACCECN_EMPTY + + 3 * TCPOLEN_ACCECN_COUNTER)) { + to->to_flags &= ~TOF_ACCE_E1; + continue; + } + /* + * TCP option sufficient to hold full AccECN option + * but only send changed counters normally, + * full counters on ACKNOW + */ + if (to->to_flags & TOF_ACCE_E1) { + hton24(&optp, to->to_ae->re0b); + to->to_flags &= ~TOF_ACCE_E0; + to->to_flags &= ~TOF_ACCE_E1; + continue; + } else { + hton24(&optp, to->to_ae->re1b); + continue; + } + } default: panic("%s: unknown TCP option type", __func__); break; diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -1805,7 +1805,6 @@ #ifdef INVARIANTS int thflags = tcp_get_flags(th); #endif - KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL")); NET_EPOCH_ASSERT(); @@ -2013,9 +2012,24 @@ if (tp->t_flags & TF_SIGNATURE) to.to_flags |= TOF_SIGNATURE; #endif + /* AccECN option */ + if (tp->t_flags & TF_ACCECN_OPT) { + to.to_flags |= TOF_ACCECNOPT; + to.to_ae = &tp->t_ae; + to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) | + ((tp->t_flags2 & TF2_ACO_E1) ? 
TOF_ACCE_E1 : 0) | + ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0); + } /* Add the options. */ tlen += optlen = tcp_addoptions(&to, optp); - + if (to.to_flags & TOF_ACCECNOPT) { + if ((to.to_flags & TOF_ACCE_E0) == 0) + tp->t_flags2 &= ~TF2_ACO_E0; + if ((to.to_flags & TOF_ACCE_E1) == 0) + tp->t_flags2 &= ~TF2_ACO_E1; + if ((to.to_flags & TOF_ACCE_CE) == 0) + tp->t_flags2 &= ~TF2_ACO_CE; + } /* Update m_len in the correct mbuf. */ optm->m_len += optlen; } else @@ -2330,6 +2344,14 @@ tcp_log_tcpcbinit(tp); #endif tp->t_pacing_rate = -1; + if (V_tcp_do_lrd) + tp->t_flags |= TF_LRD; + tp->t_ae.re0b = 1; + tp->t_ae.re1b = 1; + tp->t_ae.rceb = 0; + tp->t_ae.se0b = 1; + tp->t_ae.se1b = 1; + tp->t_ae.sceb = 0; if (tp->t_fb->tfb_tcp_fb_init) { if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) { refcount_release(&tp->t_fb->tfb_refcnt); diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1810,6 +1810,7 @@ #ifdef INET6 struct ip6_hdr *ip6 = NULL; #endif + struct accecn ae; NET_EPOCH_ASSERT(); @@ -1949,6 +1950,20 @@ /* don't send cookie again when retransmitting response */ sc->sc_tfo_cookie = NULL; } + if (V_tcp_ecn_option) + to.to_flags |= TOF_ACCE_SYN; + } + if (V_tcp_ecn_option && + (sc->sc_flags & SCF_ECN_MASK) && + ((sc->sc_flags & SCF_ECN_MASK) != SCF_ECN)) { + to.to_flags |= TOF_ACCECNOPT; + to.to_flags |= TOF_ACCE_E0 | + TOF_ACCE_E1 | + TOF_ACCE_CE; + ae.re0b = 1; + ae.re1b = 1; + ae.rceb = 0; + to.to_ae = &ae; } if (sc->sc_flags & SCF_TIMESTAMP) { to.to_tsval = sc->sc_tsoff + tcp_ts_getticks(); diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c --- a/sys/netinet/tcp_timer.c +++ b/sys/netinet/tcp_timer.c @@ -789,6 +789,15 @@ #endif in_losing(inp); } + /* + * Disable AccECN option when + * retransmitting after multiple + * timeouts. 
+ */ + if ((tp->t_rxtshift >= V_tcp_ecn_maxretries) && + (tp->t_flags2 & TF2_ACE_PERMIT) && + (tp->t_flags & TF_ACCECN_OPT)) + tp->t_flags &= ~TF_ACCECN_OPT; tp->snd_nxt = tp->snd_una; tp->snd_recover = tp->snd_max; /* diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -1614,15 +1614,23 @@ * AccECN related counters. */ if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) == - (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) + (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) { /* * Internal counter starts at 5 for AccECN * but 0 for RFC3168 ECN. */ - ti->tcpi_delivered_ce = tp->t_scep - 5; - else - ti->tcpi_delivered_ce = tp->t_scep; - ti->tcpi_received_ce = tp->t_rcep; + ti->tcpi_delivered_ce = tp->t_ae.scep - 5; + ti->tcpi_received_ce = tp->t_ae.rcep - 5; + } else { + ti->tcpi_delivered_ce = tp->t_ae.scep; + ti->tcpi_received_ce = tp->t_ae.rcep; + } + ti->tcpi_received_e0_bytes = tp->t_ae.re0b - 1; + ti->tcpi_received_e1_bytes = tp->t_ae.re1b - 1; + ti->tcpi_received_ce_bytes = tp->t_ae.rceb; + ti->tcpi_delivered_e0_bytes = tp->t_ae.se0b - 1; + ti->tcpi_delivered_e1_bytes = tp->t_ae.se1b - 1; + ti->tcpi_delivered_ce_bytes = tp->t_ae.sceb; } /* diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -129,6 +129,18 @@ int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */ }; +struct accecn { + uint32_t rcep; /* Number of received CE marked pkts */ + uint32_t scep; /* Synced number of delivered CE pkts */ + uint32_t dcep; /* delta of CE marks for rollback */ + uint32_t re0b; /* Number of ECT0 marked data bytes */ + uint32_t re1b; /* Number of ECT1 marked data bytes */ + uint32_t rceb; /* Number of CE marked data bytes */ + uint32_t se0b; /* Synced number of delivered ECT0 bytes */ + uint32_t se1b; /* Synced number of delivered ECT1 bytes */ + uint32_t sceb; /* Synced number of delivered CE bytes */ +}; + #define SEGQ_EMPTY(tp) 
TAILQ_EMPTY(&(tp)->t_segq) STAILQ_HEAD(tcp_log_stailq, tcp_log_mem); @@ -431,8 +443,7 @@ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ int t_loglimit; /* Maximum number of log entries */ - uint32_t t_rcep; /* Number of received CE marked pkts */ - uint32_t t_scep; /* Synced number of delivered CE pkts */ + struct accecn t_ae; /* AccECN related byte counters */ int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; @@ -798,7 +809,7 @@ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ -#define TF_UNUSED1 0x08000000 /* unused */ +#define TF_ACCECN_OPT 0x08000000 /* AccECN is using TCP options */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ @@ -853,7 +864,9 @@ #define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */ #define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */ #define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */ - +#define TF2_ACO_E0 0x00100000 /* EE0 counter changed */ +#define TF2_ACO_E1 0x00200000 /* EE1 counter changed */ +#define TF2_ACO_CE 0x00400000 /* ECE counter changed */ /* * Structure to hold TCP options that are only used during segment * processing (in tcp_input), but not held in the tcpcb. 
@@ -864,14 +877,21 @@ */ struct tcpopt { u_int32_t to_flags; /* which options are present */ -#define TOF_MSS 0x0001 /* maximum segment size */ -#define TOF_SCALE 0x0002 /* window scaling */ -#define TOF_SACKPERM 0x0004 /* SACK permitted */ -#define TOF_TS 0x0010 /* timestamp */ -#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */ -#define TOF_SACK 0x0080 /* Peer sent SACK option */ -#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */ -#define TOF_MAXOPT 0x0200 +#define TOF_MSS 0x00000001 /* maximum segment size */ +#define TOF_SCALE 0x00000002 /* window scaling */ +#define TOF_SACKPERM 0x00000004 /* SACK permitted */ +#define TOF_TS 0x00000010 /* timestamp */ +#define TOF_SIGNATURE 0x00000040 /* TCP-MD5 signature option (RFC2385) */ +#define TOF_SACK 0x00000080 /* Peer sent SACK option */ +#define TOF_FASTOPEN 0x00000100 /* TCP Fast Open (TFO) cookie */ +#define TOF_ACCECNOPT 0x00000200 /* AccECN Option */ +#define TOF_MAXOPT 0x00000400 + /* Keep internal flags above TOF_MAXOPT */ +#define TOF_ACCE_SYN 0x80000000 /* send empty option */ +#define TOF_ACCE_CE 0x40000000 /* CE counter changed */ +#define TOF_ACCE_E0 0x20000000 /* E0 counter changed */ +#define TOF_ACCE_E1 0x10000000 /* E1 counter changed */ +#define TOF_ACCE_ACKNOW 0x08000000 /* send full option */ u_int32_t to_tsval; /* new timestamp */ u_int32_t to_tsecr; /* reflected timestamp */ u_char *to_sacks; /* pointer to the first SACK blocks */ @@ -881,7 +901,8 @@ u_int8_t to_wscale; /* window scaling */ u_int8_t to_nsacks; /* number of SACK blocks */ u_int8_t to_tfo_len; /* TFO cookie length */ - u_int32_t to_spare; /* UTO */ + struct accecn *to_ae; /* pointer to AccECN byte counters */ + u_int32_t to_spare; /* UTO */ }; /* @@ -1283,6 +1304,7 @@ VNET_DECLARE(int, tcp_do_sack); VNET_DECLARE(int, tcp_do_tso); VNET_DECLARE(int, tcp_ecn_maxretries); +VNET_DECLARE(int, tcp_ecn_option); VNET_DECLARE(int, tcp_initcwnd_segments); VNET_DECLARE(int, tcp_insecure_rst); VNET_DECLARE(int, 
tcp_insecure_syn); @@ -1329,6 +1351,7 @@ #define V_tcp_do_sack VNET(tcp_do_sack) #define V_tcp_do_tso VNET(tcp_do_tso) #define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries) +#define V_tcp_ecn_option VNET(tcp_ecn_option) #define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments) #define V_tcp_insecure_rst VNET(tcp_insecure_rst) #define V_tcp_insecure_syn VNET(tcp_insecure_syn)