Page MenuHomeFreeBSD

D36303.id111278.diff
No OneTemporary

D36303.id111278.diff

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -34,7 +34,7 @@
.\" From: @(#)tcp.4 8.1 (Berkeley) 6/5/93
.\" $FreeBSD$
.\"
-.Dd August 1, 2022
+.Dd September 30, 2022
.Dt TCP 4
.Os
.Sh NAME
@@ -520,6 +520,9 @@
specific connection.
This is needed to help with connection establishment
when a broken firewall is in the network path.
+.It Va ecn.option
+Reflect back the number of received bytes for each ECN marking
+by using the Accurate ECN TCP option on each outgoing packet.
.It Va fast_finwait2_recycle
Recycle
.Tn TCP
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -104,6 +104,10 @@
#define TCPOLEN_SIGNATURE 18
#define TCPOPT_FAST_OPEN 34
#define TCPOLEN_FAST_OPEN_EMPTY 2
+#define TCPOPT_ACCECN0 0xAC /* AccECN option kind; counters sent as E0, CE, E1 */
+#define TCPOPT_ACCECN1 0XAE /* AccECN option kind; counters sent as E1, CE, E0 */
+#define TCPOLEN_ACCECN_EMPTY 2 /* kind and length bytes only, no counters */
+#define TCPOLEN_ACCECN_COUNTER 3 /* bytes occupied by each counter field */
#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h
--- a/sys/netinet/tcp_ecn.h
+++ b/sys/netinet/tcp_ecn.h
@@ -43,7 +43,7 @@
void tcp_ecn_input_syn_sent(struct tcpcb *, uint16_t, int);
void tcp_ecn_input_parallel_syn(struct tcpcb *, uint16_t, int);
-int tcp_ecn_input_segment(struct tcpcb *, uint16_t, int);
+int tcp_ecn_input_segment(struct tcpcb *, uint16_t, int, int);
uint16_t tcp_ecn_output_syn_sent(struct tcpcb *);
int tcp_ecn_output_established(struct tcpcb *, uint16_t *, int, bool);
void tcp_ecn_syncache_socket(struct tcpcb *, struct syncache *);
diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c
--- a/sys/netinet/tcp_ecn.c
+++ b/sys/netinet/tcp_ecn.c
@@ -102,6 +102,24 @@
#include <netinet/tcpip.h>
#include <netinet/tcp_ecn.h>
+static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
+ CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
+ "TCP ECN"); /* parent node for the ECN sysctls defined below */
+
+VNET_DEFINE(int, tcp_do_ecn) = 2; /* 0 skips ECN handling of SYN,ACK; 3 and 4 permit the AccECN option */
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_do_ecn), 0,
+ "TCP ECN support");
+
+VNET_DEFINE(int, tcp_ecn_maxretries) = 1; /* default: a single retry */
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
+ "Max retries before giving up on ECN");
+
+VNET_DEFINE(int, tcp_ecn_option) = 0; /* AccECN TCP option is off by default; nonzero enables it */
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, option,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_option), 0,
+ "Use AccECN TCP option");
/*
* Process incoming SYN,ACK packet
@@ -109,7 +127,6 @@
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{
-
if (V_tcp_do_ecn == 0)
return;
if ((V_tcp_do_ecn == 1) ||
@@ -261,19 +278,25 @@
* TCP ECN processing.
*/
int
-tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int iptos)
+tcp_ecn_input_segment(struct tcpcb *tp, uint16_t thflags, int tlen, int iptos)
{
int delta_ace = 0;
if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
switch (iptos & IPTOS_ECN_MASK) {
case IPTOS_ECN_CE:
+ tp->t_flags2 |= TF2_ACO_CE;
+ tp->t_rceb += tlen;
TCPSTAT_INC(tcps_ecn_ce);
break;
case IPTOS_ECN_ECT0:
+ tp->t_flags2 |= TF2_ACO_E0;
+ tp->t_re0b += tlen;
TCPSTAT_INC(tcps_ecn_ect0);
break;
case IPTOS_ECN_ECT1:
+ tp->t_flags2 |= TF2_ACO_E1;
+ tp->t_re1b += tlen;
TCPSTAT_INC(tcps_ecn_ect1);
break;
}
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -204,20 +204,6 @@
&VNET_NAME(tcp_abc_l_var), 2,
"Cap the max cwnd increment during slow-start to this number of segments");
-static SYSCTL_NODE(_net_inet_tcp, OID_AUTO, ecn,
- CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
- "TCP ECN");
-
-VNET_DEFINE(int, tcp_do_ecn) = 2;
-SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, enable, CTLFLAG_VNET | CTLFLAG_RW,
- &VNET_NAME(tcp_do_ecn), 0,
- "TCP ECN support");
-
-VNET_DEFINE(int, tcp_ecn_maxretries) = 1;
-SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, maxretries, CTLFLAG_VNET | CTLFLAG_RW,
- &VNET_NAME(tcp_ecn_maxretries), 0,
- "Max retries before giving up on ECN");
-
VNET_DEFINE(int, tcp_insecure_syn) = 0;
SYSCTL_INT(_net_inet_tcp, OID_AUTO, insecure_syn, CTLFLAG_VNET | CTLFLAG_RW,
&VNET_NAME(tcp_insecure_syn), 0,
@@ -1630,7 +1616,7 @@
/*
* TCP ECN processing.
*/
- if (tcp_ecn_input_segment(tp, thflags, iptos))
+ if (tcp_ecn_input_segment(tp, thflags, tlen, iptos))
cc_cong_signal(tp, th, CC_ECN);
/*
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -890,9 +890,37 @@
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
-
+ /*
+ * AccECN option
+ * Don't send on <SYN>, only on <SYN,ACK> or
+ * when doing an AccECN session
+ */
+ if (V_tcp_ecn_option &&
+ ((V_tcp_do_ecn == 3) || (V_tcp_do_ecn == 4)) &&
+ ((tp->t_flags2 & TF2_ACE_PERMIT) ||
+ ((flags & TH_SYN) && (flags & TH_ACK)))) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_ee0b = tp->t_re0b;
+ to.to_ee1b = tp->t_re1b;
+ to.to_eceb = tp->t_rceb;
+ to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+ ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+ ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_ACCE_SYN;
+ if (tp->t_flags & TF_ACKNOW)
+ to.to_flags |= TOF_ACCE_ACKNOW;
+ }
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
+ if (to.to_flags & TOF_ACCECNOPT) {
+ if ((to.to_flags & TOF_ACCE_E0) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E0;
+ if ((to.to_flags & TOF_ACCE_E1) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E1;
+ if ((to.to_flags & TOF_ACCE_CE) == 0)
+ tp->t_flags2 &= ~TF2_ACO_CE;
+ }
/*
* If we wanted a TFO option to be added, but it was unable
* to fit, ensure no data is sent.
@@ -1955,6 +1983,128 @@
optlen += total_len;
break;
}
+ case TOF_ACCECNOPT:
+ {
+ int max_len = TCP_MAXOLEN - optlen;
+ if (max_len < TCPOLEN_ACCECN_EMPTY) {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ if (to->to_flags & TOF_ACCE_SYN) {
+ *optp++ = TCPOPT_ACCECN0;
+ optlen += TCPOLEN_ACCECN_EMPTY;
+ *optp++ = TCPOLEN_ACCECN_EMPTY;
+ continue;
+ } else {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ if (to->to_flags & TOF_ACCE_E1) {
+ *optp++ = TCPOPT_ACCECN1;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee1b >> 16);
+ *optp++ = (char)(to->to_ee1b >> 8);
+ *optp++ = (char)(to->to_ee1b);
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ *optp++ = TCPOPT_ACCECN0;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee0b >> 16);
+ *optp++ = (char)(to->to_ee0b >> 8);
+ *optp++ = (char)(to->to_ee0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ continue;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ if (to->to_flags & TOF_ACCE_E1) {
+ *optp++ = TCPOPT_ACCECN1;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee1b >> 16);
+ *optp++ = (char)(to->to_ee1b >> 8);
+ *optp++ = (char)(to->to_ee1b);
+ to->to_flags &= ~TOF_ACCE_E1;
+ *optp++ = (char)(to->to_eceb >> 16);
+ *optp++ = (char)(to->to_eceb >> 8);
+ *optp++ = (char)(to->to_eceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ continue;
+ }
+ *optp++ = TCPOPT_ACCECN0;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee0b >> 16);
+ *optp++ = (char)(to->to_ee0b >> 8);
+ *optp++ = (char)(to->to_ee0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ *optp++ = (char)(to->to_eceb >> 16);
+ *optp++ = (char)(to->to_eceb >> 8);
+ *optp++ = (char)(to->to_eceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ continue;
+ }
+ /*
+ * TCP option space is sufficient to hold the full AccECN
+ * option. Intended: only changed counters normally, full
+ * counters on ACKNOW; for now all three are always sent.
+ */
+ if (to->to_flags & TOF_ACCE_E1) {
+ *optp++ = TCPOPT_ACCECN1;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee1b >> 16);
+ *optp++ = (char)(to->to_ee1b >> 8);
+ *optp++ = (char)(to->to_ee1b);
+ to->to_flags &= ~TOF_ACCE_E1;
+ *optp++ = (char)(to->to_eceb >> 16);
+ *optp++ = (char)(to->to_eceb >> 8);
+ *optp++ = (char)(to->to_eceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ *optp++ = (char)(to->to_ee0b >> 16);
+ *optp++ = (char)(to->to_ee0b >> 8);
+ *optp++ = (char)(to->to_ee0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ continue;
+ } else {
+ *optp++ = TCPOPT_ACCECN0;
+ *optp++ = TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ optlen += TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ *optp++ = (char)(to->to_ee0b >> 16);
+ *optp++ = (char)(to->to_ee0b >> 8);
+ *optp++ = (char)(to->to_ee0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ *optp++ = (char)(to->to_eceb >> 16);
+ *optp++ = (char)(to->to_eceb >> 8);
+ *optp++ = (char)(to->to_eceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ *optp++ = (char)(to->to_ee1b >> 16);
+ *optp++ = (char)(to->to_ee1b >> 8);
+ *optp++ = (char)(to->to_ee1b);
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ }
default:
panic("%s: unknown TCP option type", __func__);
break;
diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c
--- a/sys/netinet/tcp_stacks/rack.c
+++ b/sys/netinet/tcp_stacks/rack.c
@@ -13528,8 +13528,8 @@
rack_cc_after_idle(rack, tp);
}
tp->t_rcvtime = ticks;
- /* Now what about ECN? */
- if (tcp_ecn_input_segment(tp, ae->flags, ae->codepoint))
+ /* Now what about ECN of a chain of pure ACKs? */
+ if (tcp_ecn_input_segment(tp, ae->flags, 0, ae->codepoint))
rack_cong_signal(tp, CC_ECN, ae->ack, __LINE__);
#ifdef TCP_ACCOUNTING
/* Count for the specific type of ack in */
@@ -14319,7 +14319,7 @@
* TCP ECN processing. XXXJTL: If we ever use ECN, we need to move
* this to occur after we've validated the segment.
*/
- if (tcp_ecn_input_segment(tp, thflags, iptos))
+ if (tcp_ecn_input_segment(tp, thflags, tlen, iptos))
rack_cong_signal(tp, CC_ECN, th->th_ack, __LINE__);
/*
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -2316,6 +2316,11 @@
tcp_log_tcpcbinit(tp);
#endif
tp->t_pacing_rate = -1;
+ if (V_tcp_do_lrd)
+ tp->t_flags |= TF_LRD;
+ tp->t_re0b = 1;
+ tp->t_re1b = 1;
+ tp->t_rceb = 0;
if (tp->t_fb->tfb_tcp_fb_init) {
if ((*tp->t_fb->tfb_tcp_fb_init)(tp)) {
refcount_release(&tp->t_fb->tfb_refcnt);
@@ -2328,8 +2333,6 @@
if (V_tcp_perconn_stats_enable == 1)
tp->t_stats = stats_blob_alloc(V_tcp_perconn_stats_dflt_tpl, 0);
#endif
- if (V_tcp_do_lrd)
- tp->t_flags |= TF_LRD;
return (tp); /* XXX */
}
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1958,6 +1958,19 @@
/* don't send cookie again when retransmitting response */
sc->sc_tfo_cookie = NULL;
}
+ if (V_tcp_ecn_option)
+ to.to_flags |= TOF_ACCE_SYN;
+ }
+ if (V_tcp_ecn_option &&
+ (sc->sc_flags & SCF_ECN_MASK) &&
+ ((sc->sc_flags & SCF_ECN_MASK) != SCF_ECN)) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_flags |= TOF_ACCE_E0 |
+ TOF_ACCE_E1 |
+ TOF_ACCE_CE;
+ to.to_ee0b = 1;
+ to.to_ee1b = 1;
+ to.to_eceb = 0;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -251,6 +251,9 @@
int t_loglimit; /* Maximum number of log entries */
uint32_t t_rcep; /* Number of received CE marked pkts */
uint32_t t_scep; /* Synced number of delivered CE pkts */
+ uint32_t t_re0b; /* Number of ECT0 marked data bytes */
+ uint32_t t_re1b; /* Number of ECT1 marked data bytes */
+ uint32_t t_rceb; /* Number of CE marked data bytes */
int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
@@ -570,7 +573,10 @@
#define TF2_ECN_SND_CWR 0x00000040 /* ECN CWR in queue */
#define TF2_ECN_SND_ECE 0x00000080 /* ECN ECE in queue */
#define TF2_ACE_PERMIT 0x00000100 /* Accurate ECN mode */
-#define TF2_FBYTES_COMPLETE 0x00000400 /* We have first bytes in and out */
+#define TF2_ACO_E0 0x00000200 /* EE0 counter changed */
+#define TF2_ACO_E1 0x00000400 /* EE1 counter changed */
+#define TF2_ACO_CE 0x00000800 /* ECE counter changed */
+#define TF2_FBYTES_COMPLETE 0x00001000 /* We have first bytes in and out */
/*
* Structure to hold TCP options that are only used during segment
* processing (in tcp_input), but not held in the tcpcb.
@@ -581,14 +587,21 @@
*/
struct tcpopt {
u_int32_t to_flags; /* which options are present */
-#define TOF_MSS 0x0001 /* maximum segment size */
-#define TOF_SCALE 0x0002 /* window scaling */
-#define TOF_SACKPERM 0x0004 /* SACK permitted */
-#define TOF_TS 0x0010 /* timestamp */
-#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
-#define TOF_SACK 0x0080 /* Peer sent SACK option */
-#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */
-#define TOF_MAXOPT 0x0200
+#define TOF_MSS 0x00000001 /* maximum segment size */
+#define TOF_SCALE 0x00000002 /* window scaling */
+#define TOF_SACKPERM 0x00000004 /* SACK permitted */
+#define TOF_TS 0x00000010 /* timestamp */
+#define TOF_SIGNATURE 0x00000040 /* TCP-MD5 signature option (RFC2385) */
+#define TOF_SACK 0x00000080 /* Peer sent SACK option */
+#define TOF_FASTOPEN 0x00000100 /* TCP Fast Open (TFO) cookie */
+#define TOF_ACCECNOPT 0x00000200 /* AccECN Option */
+#define TOF_MAXOPT 0x00000400
+ /* Keep internal flags above TOF_MAXOPT */
+#define TOF_ACCE_SYN 0x80000000 /* send empty option */
+#define TOF_ACCE_CE 0x40000000 /* CE counter changed */
+#define TOF_ACCE_E0 0x20000000 /* E0 counter changed */
+#define TOF_ACCE_E1 0x10000000 /* E1 counter changed */
+#define TOF_ACCE_ACKNOW 0x08000000 /* send full option */
u_int32_t to_tsval; /* new timestamp */
u_int32_t to_tsecr; /* reflected timestamp */
u_char *to_sacks; /* pointer to the first SACK blocks */
@@ -598,7 +611,10 @@
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
u_int8_t to_tfo_len; /* TFO cookie length */
- u_int32_t to_spare; /* UTO */
+ u_int32_t to_ee0b; /* AccECN E0 marked bytes */
+ u_int32_t to_ee1b; /* AccECN E1 marked bytes */
+ u_int32_t to_eceb; /* AccECN CE marked bytes */
+ u_int32_t to_spare; /* UTO */
};
/*
@@ -1006,6 +1022,7 @@
VNET_DECLARE(int, tcp_do_sack);
VNET_DECLARE(int, tcp_do_tso);
VNET_DECLARE(int, tcp_ecn_maxretries);
+VNET_DECLARE(int, tcp_ecn_option);
VNET_DECLARE(int, tcp_initcwnd_segments);
VNET_DECLARE(int, tcp_insecure_rst);
VNET_DECLARE(int, tcp_insecure_syn);
@@ -1052,6 +1069,7 @@
#define V_tcp_do_sack VNET(tcp_do_sack)
#define V_tcp_do_tso VNET(tcp_do_tso)
#define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries)
+#define V_tcp_ecn_option VNET(tcp_ecn_option)
#define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments)
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
#define V_tcp_insecure_syn VNET(tcp_insecure_syn)

File Metadata

Mime Type
text/plain
Expires
Tue, Jun 23, 1:31 PM (4 h, 41 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
34248387
Default Alt Text
D36303.id111278.diff (15 KB)

Event Timeline