Page MenuHomeFreeBSD

D36303.id132863.diff
No OneTemporary

D36303.id132863.diff

diff --git a/share/man/man4/tcp.4 b/share/man/man4/tcp.4
--- a/share/man/man4/tcp.4
+++ b/share/man/man4/tcp.4
@@ -31,7 +31,7 @@
.\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
.\" SUCH DAMAGE.
.\"
-.Dd November 30, 2023
+.Dd January 17, 2024
.Dt TCP 4
.Os
.Sh NAME
@@ -504,6 +504,9 @@
specific connection.
This is needed to help with connection establishment
when a broken firewall is in the network path.
+.It Va ecn.option
+Reflect back the number of received bytes with a particular ECN marking
+by using the Accurate ECN TCP option on each outgoing packet.
.It Va fast_finwait2_recycle
Recycle
.Tn TCP
diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h
--- a/sys/netinet/tcp.h
+++ b/sys/netinet/tcp.h
@@ -121,6 +121,10 @@
#define TCPOLEN_SIGNATURE 18
#define TCPOPT_FAST_OPEN 34
#define TCPOLEN_FAST_OPEN_EMPTY 2
+#define TCPOPT_ACCECN0 0xAC /* AccECN option, ECT(0) byte counter first */
+#define TCPOPT_ACCECN1 0xAE /* AccECN option, ECT(1) byte counter first */
+#define TCPOLEN_ACCECN_EMPTY 2 /* kind + length bytes, no counters */
+#define TCPOLEN_ACCECN_COUNTER 3 /* one 24-bit byte counter field */
#define MAX_TCPOPTLEN 40 /* Absolute maximum TCP options len */
@@ -431,12 +435,12 @@
/* Accurate ECN counters. */
u_int32_t tcpi_delivered_ce;
u_int32_t tcpi_received_ce; /* # of CE marks received */
- u_int32_t __tcpi_delivered_e1_bytes;
- u_int32_t __tcpi_delivered_e0_bytes;
- u_int32_t __tcpi_delivered_ce_bytes;
- u_int32_t __tcpi_received_e1_bytes;
- u_int32_t __tcpi_received_e0_bytes;
- u_int32_t __tcpi_received_ce_bytes;
+ u_int32_t tcpi_delivered_e1_bytes;
+ u_int32_t tcpi_delivered_e0_bytes;
+ u_int32_t tcpi_delivered_ce_bytes;
+ u_int32_t tcpi_received_e1_bytes;
+ u_int32_t tcpi_received_e0_bytes;
+ u_int32_t tcpi_received_ce_bytes;
u_int32_t tcpi_total_tlp; /* tail loss probes sent */
u_int64_t tcpi_total_tlp_bytes; /* tail loss probe bytes sent */
diff --git a/sys/netinet/tcp_ecn.h b/sys/netinet/tcp_ecn.h
--- a/sys/netinet/tcp_ecn.h
+++ b/sys/netinet/tcp_ecn.h
@@ -48,6 +48,24 @@
uint16_t tcp_ecn_syncache_respond(uint16_t, struct syncache *);
int tcp_ecn_get_ace(uint16_t);
+/*
+ * Store the low 24 bits of v at *p in network (big-endian) byte order
+ * and advance *p past the three bytes written.
+ */
+static inline void hton24(u_char **p, uint32_t v)
+{
+	u_char *out = *p;
+
+	out[0] = (u_char)(v >> 16);
+	out[1] = (u_char)(v >> 8);
+	out[2] = (u_char)v;
+	*p = out + 3;
+}
+
+/*
+ * Read the three bytes at p as a 24-bit value in network (big-endian)
+ * byte order and return it in the low 24 bits of the result.
+ * The buffer is only read, hence the const qualifier; each byte is
+ * widened to uint32_t before shifting so no arithmetic is done in int.
+ */
+static inline uint32_t ntoh24(const u_char *p)
+{
+	return ((uint32_t)p[0] << 16) |
+	    ((uint32_t)p[1] << 8) |
+	    (uint32_t)p[2];
+}
+
+
#endif /* _KERNEL */
#endif /* _NETINET_TCP_ECN_H_ */
diff --git a/sys/netinet/tcp_ecn.c b/sys/netinet/tcp_ecn.c
--- a/sys/netinet/tcp_ecn.c
+++ b/sys/netinet/tcp_ecn.c
@@ -110,13 +110,17 @@
CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_maxretries), 0,
"Max retries before giving up on ECN");
+VNET_DEFINE(int, tcp_ecn_option) = 0; /* AccECN TCP option is off by default */
+SYSCTL_INT(_net_inet_tcp_ecn, OID_AUTO, option,
+ CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(tcp_ecn_option), 0,
+ "Use AccECN TCP option"); /* per-VNET, read/write knob */
+
/*
* Process incoming SYN,ACK packet
*/
void
tcp_ecn_input_syn_sent(struct tcpcb *tp, uint16_t thflags, int iptos)
{
-
if (V_tcp_do_ecn == 0)
return;
if ((V_tcp_do_ecn == 1) ||
@@ -149,7 +153,9 @@
case (0|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_nect);
break;
@@ -157,7 +163,9 @@
case (TH_AE|0|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect0);
break;
@@ -165,7 +173,9 @@
case (0|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
TCPSTAT_INC(tcps_ecn_shs);
TCPSTAT_INC(tcps_ace_ect1);
break;
@@ -173,7 +183,9 @@
case (TH_AE|TH_CWR|0):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
- tp->t_scep = 6;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 6;
/*
* reduce the IW to 2 MSS (to
* account for delayed acks) if
@@ -196,16 +208,16 @@
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
- tp->t_rcep = 0b010;
+ tp->t_ae.rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
- tp->t_rcep = 0b100;
+ tp->t_ae.rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
- tp->t_rcep = 0b011;
+ tp->t_ae.rcep = 0b011;
break;
case (IPTOS_ECN_CE):
- tp->t_rcep = 0b110;
+ tp->t_ae.rcep = 0b110;
break;
}
}
@@ -248,6 +260,8 @@
case (TH_AE|TH_CWR|TH_ECE):
tp->t_flags2 |= TF2_ACE_PERMIT;
tp->t_flags2 &= ~TF2_ECN_PERMIT;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
TCPSTAT_INC(tcps_ecn_shs);
/*
* Set the AccECN Codepoints on
@@ -258,16 +272,16 @@
*/
switch (iptos & IPTOS_ECN_MASK) {
case (IPTOS_ECN_NOTECT):
- tp->t_rcep = 0b010;
+ tp->t_ae.rcep = 0b010;
break;
case (IPTOS_ECN_ECT0):
- tp->t_rcep = 0b100;
+ tp->t_ae.rcep = 0b100;
break;
case (IPTOS_ECN_ECT1):
- tp->t_rcep = 0b011;
+ tp->t_ae.rcep = 0b011;
break;
case (IPTOS_ECN_CE):
- tp->t_rcep = 0b110;
+ tp->t_ae.rcep = 0b110;
break;
}
break;
@@ -294,18 +308,31 @@
TCPSTAT_INC(tcps_ecn_rcvect1);
break;
}
-
if (tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
if (tp->t_flags2 & TF2_ACE_PERMIT) {
- if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE)
- tp->t_rcep += 1;
+ switch (iptos & IPTOS_ECN_MASK) {
+ case IPTOS_ECN_CE:
+ tp->t_flags2 |= TF2_ACO_CE;
+ tp->t_ae.rceb += tlen;
+ tp->t_ae.rcep++;
+ break;
+ case IPTOS_ECN_ECT0:
+ tp->t_flags2 |= TF2_ACO_E0;
+ tp->t_ae.re0b += tlen;
+ break;
+ case IPTOS_ECN_ECT1:
+ tp->t_flags2 |= TF2_ACO_E1;
+ tp->t_ae.re1b += tlen;
+ break;
+ }
if (tp->t_flags2 & TF2_ECN_PERMIT) {
delta_cep = (tcp_ecn_get_ace(thflags) + 8 -
- (tp->t_scep & 7)) & 7;
+ (tp->t_ae.scep & 7)) & 7;
if (delta_cep < pkts)
delta_cep = pkts -
((pkts - delta_cep) & 7);
- tp->t_scep += delta_cep;
+ tp->t_ae.scep += delta_cep;
+ tp->t_ae.dcep = delta_cep;
} else {
/*
* process the final ACK of the 3WHS
@@ -320,16 +347,16 @@
/* FALLTHROUGH */
case 0b100:
/* ECT0 SYN or SYN,ACK */
- tp->t_scep = 5;
+ tp->t_ae.scep = 5;
break;
case 0b110:
/* CE SYN or SYN,ACK */
- tp->t_scep = 6;
+ tp->t_ae.scep = 6;
tp->snd_cwnd = 2 * tcp_maxseg(tp);
break;
default:
/* mangled AccECN handshake */
- tp->t_scep = 5;
+ tp->t_ae.scep = 5;
break;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
@@ -338,7 +365,7 @@
/* RFC3168 ECN handling */
if ((thflags & (TH_SYN | TH_ECE)) == TH_ECE) {
delta_cep = 1;
- tp->t_scep++;
+ tp->t_ae.scep++;
}
if (thflags & TH_CWR) {
tp->t_flags2 &= ~TF2_ECN_SND_ECE;
@@ -419,21 +446,21 @@
*/
if (tp->t_flags2 & TF2_ACE_PERMIT) {
*thflags &= ~(TH_AE|TH_CWR|TH_ECE);
- if (tp->t_rcep & 0x01)
+ if (tp->t_ae.rcep & 0x01)
*thflags |= TH_ECE;
- if (tp->t_rcep & 0x02)
+ if (tp->t_ae.rcep & 0x02)
*thflags |= TH_CWR;
- if (tp->t_rcep & 0x04)
+ if (tp->t_ae.rcep & 0x04)
*thflags |= TH_AE;
if (!(tp->t_flags2 & TF2_ECN_PERMIT)) {
/*
* here we process the final
* ACK of the 3WHS
*/
- if (tp->t_rcep == 0b110) {
- tp->t_rcep = 6;
+ if (tp->t_ae.rcep == 0b110) {
+ tp->t_ae.rcep = 6;
} else {
- tp->t_rcep = 5;
+ tp->t_ae.rcep = 5;
}
tp->t_flags2 |= TF2_ECN_PERMIT;
}
@@ -446,7 +473,6 @@
if (tp->t_flags2 & TF2_ECN_SND_ECE)
*thflags |= TH_ECE;
}
-
return ipecn;
}
@@ -468,13 +494,17 @@
/* FALLTHROUGH */
case SCF_ACE_1:
tp->t_flags2 |= TF2_ACE_PERMIT;
- tp->t_scep = 5;
- tp->t_rcep = 5;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 5;
+ tp->t_ae.rcep = 5;
break;
case SCF_ACE_CE:
tp->t_flags2 |= TF2_ACE_PERMIT;
- tp->t_scep = 6;
- tp->t_rcep = 6;
+ if (V_tcp_ecn_option)
+ tp->t_flags |= TF_ACCECN_OPT;
+ tp->t_ae.scep = 6;
+ tp->t_ae.rcep = 6;
break;
/* undefined SCF codepoint */
default:
diff --git a/sys/netinet/tcp_input.c b/sys/netinet/tcp_input.c
--- a/sys/netinet/tcp_input.c
+++ b/sys/netinet/tcp_input.c
@@ -994,6 +994,8 @@
}
tp = intotcpcb(inp);
+ to.to_ae = &tp->t_ae;
+
switch (tp->t_state) {
case TCPS_TIME_WAIT:
/*
@@ -1520,7 +1522,7 @@
int acked, ourfinisacked, needoutput = 0;
sackstatus_t sack_changed;
int rstreason, todrop, win, incforsyn = 0;
- uint32_t tiwin;
+ uint32_t tiwin, old_sceb;
uint16_t nsegs;
char *s;
struct inpcb *inp = tptoinpcb(tp);
@@ -1534,6 +1536,7 @@
thflags = tcp_get_flags(th);
tp->sackhint.last_sack_ack = 0;
sack_changed = SACK_NOCHANGE;
+ to.to_ae = &tp->t_ae;
nsegs = max(1, m->m_pkthdr.lro_nsegs);
NET_EPOCH_ASSERT();
@@ -1605,9 +1608,15 @@
/*
* Parse options on any incoming segment.
*/
+ old_sceb = tp->t_ae.sceb;
tcp_dooptions(&to, (u_char *)(th + 1),
(th->th_off << 2) - sizeof(struct tcphdr),
(thflags & TH_SYN) ? TO_SYN : 0);
+ if ((to.to_flags & TOF_ACCE_CE) &&
+ (tp->t_ae.dcep != 0) &&
+ ((tp->t_ae.sceb - old_sceb) == 0))
+ tp->t_ae.scep -= tp->t_ae.dcep;
+
#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
if ((tp->t_flags & TF_SIGNATURE) != 0 &&
@@ -3443,7 +3452,7 @@
void
tcp_dooptions(struct tcpopt *to, u_char *cp, int cnt, int flags)
{
- int opt, optlen;
+ int opt, optlen, tmp;
to->to_flags = 0;
for (; cnt > 0; cnt -= optlen, cp += optlen) {
@@ -3536,6 +3545,48 @@
to->to_tfo_len = optlen - 2;
to->to_tfo_cookie = to->to_tfo_len ? cp + 2 : NULL;
break;
+ case TCPOPT_ACCECN0:
+ case TCPOPT_ACCECN1:
+ to->to_flags |= TOF_ACCECNOPT;
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 0);
+ if (opt == TCPOPT_ACCECN0) {
+ to->to_flags |= TOF_ACCE_E0;
+ tmp -= (to->to_ae->se0b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se0b += tmp;
+ } else {
+ to->to_flags |= TOF_ACCE_E1;
+ tmp -= (to->to_ae->se1b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se1b += tmp;
+ }
+ }
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags |= TOF_ACCE_CE;
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 3);
+ tmp -= (to->to_ae->sceb & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->sceb += tmp;
+ }
+ if (optlen >= (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = ntoh24(cp + TCPOLEN_ACCECN_EMPTY + 6);
+ if (opt == TCPOPT_ACCECN0) {
+ to->to_flags |= TOF_ACCE_E1;
+ tmp -= (to->to_ae->se1b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se1b += tmp;
+ } else {
+ to->to_flags |= TOF_ACCE_E0;
+ tmp -= (to->to_ae->se0b & 0xFFFFFF);
+ if (tmp > 0)
+ to->to_ae->se0b += tmp;
+ }
+ }
+ break;
default:
continue;
}
diff --git a/sys/netinet/tcp_output.c b/sys/netinet/tcp_output.c
--- a/sys/netinet/tcp_output.c
+++ b/sys/netinet/tcp_output.c
@@ -596,10 +596,14 @@
* Note: this may not work when tcp headers change
* very dynamically in the future.
*/
- if ((((tp->t_flags & TF_SIGNATURE) ?
+ if ((min(TCP_MAXOLEN,
+ (((tp->t_flags & TF_SIGNATURE) ?
PADTCPOLEN(TCPOLEN_SIGNATURE) : 0) +
((tp->t_flags & TF_RCVD_TSTMP) ?
PADTCPOLEN(TCPOLEN_TIMESTAMP) : 0) +
+ ((tp->t_flags & TF_ACCECN_OPT) ?
+ PADTCPOLEN(TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER) : 0))) +
len) >= tp->t_maxseg)
goto send;
/*
@@ -876,9 +880,32 @@
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif /* TCP_SIGNATURE */
-
+ /*
+ * AccECN option
+ * Don't send on <SYN>, only on <SYN,ACK> or
+ * when doing an AccECN session
+ */
+ if (tp->t_flags & TF_ACCECN_OPT) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_ae = &tp->t_ae;
+ to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+ ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+ ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+ if (flags & TH_SYN)
+ to.to_flags |= TOF_ACCE_SYN;
+ if (tp->t_flags & TF_ACKNOW)
+ to.to_flags |= TOF_ACCE_ACKNOW;
+ }
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
+ if (to.to_flags & TOF_ACCECNOPT) {
+ if ((to.to_flags & TOF_ACCE_E0) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E0;
+ if ((to.to_flags & TOF_ACCE_E1) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E1;
+ if ((to.to_flags & TOF_ACCE_CE) == 0)
+ tp->t_flags2 &= ~TF2_ACO_CE;
+ }
/*
* If we wanted a TFO option to be added, but it was unable
* to fit, ensure no data is sent.
@@ -1919,6 +1946,78 @@
optlen += total_len;
break;
}
+ case TOF_ACCECNOPT:
+ {
+ int tmp = 0;
+ int max_len = TCP_MAXOLEN - optlen;
+ if (max_len < TCPOLEN_ACCECN_EMPTY) {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ if (to->to_flags & TOF_ACCE_SYN) {
+ *optp++ = TCPOPT_ACCECN0;
+ optlen += TCPOLEN_ACCECN_EMPTY;
+ *optp++ = TCPOLEN_ACCECN_EMPTY;
+ continue;
+ } else {
+ to->to_flags &= ~TOF_ACCECNOPT;
+ continue;
+ }
+ }
+ *optp++ = (to->to_flags & TOF_ACCE_E1) ?
+ TCPOPT_ACCECN1 : TCPOPT_ACCECN0;
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER;
+ } else
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER;
+ } else
+ if (max_len >= (TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER)) {
+ tmp = TCPOLEN_ACCECN_EMPTY +
+ 1 * TCPOLEN_ACCECN_COUNTER;
+ }
+ *optp++ = tmp;
+ optlen += tmp;
+ if (to->to_flags & TOF_ACCE_E1) {
+ hton24(&optp, to->to_ae->re1b);
+ } else {
+ hton24(&optp, to->to_ae->re0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ }
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 2 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ hton24(&optp, to->to_ae->rceb);
+ to->to_flags &= ~TOF_ACCE_CE;
+ if (max_len < (TCPOLEN_ACCECN_EMPTY +
+ 3 * TCPOLEN_ACCECN_COUNTER)) {
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ }
+ /*
+ * TCP option sufficient to hold full AccECN option
+ * but only send changed counters normally,
+ * full counters on ACKNOW
+ */
+ if (to->to_flags & TOF_ACCE_E1) {
+ hton24(&optp, to->to_ae->re0b);
+ to->to_flags &= ~TOF_ACCE_E0;
+ to->to_flags &= ~TOF_ACCE_E1;
+ continue;
+ } else {
+ hton24(&optp, to->to_ae->re1b);
+ continue;
+ }
+ }
default:
panic("%s: unknown TCP option type", __func__);
break;
diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c
--- a/sys/netinet/tcp_subr.c
+++ b/sys/netinet/tcp_subr.c
@@ -1805,7 +1805,6 @@
#ifdef INVARIANTS
int thflags = tcp_get_flags(th);
#endif
-
KASSERT(tp != NULL || m != NULL, ("tcp_respond: tp and m both NULL"));
NET_EPOCH_ASSERT();
@@ -2013,9 +2012,24 @@
if (tp->t_flags & TF_SIGNATURE)
to.to_flags |= TOF_SIGNATURE;
#endif
+ /* AccECN option */
+ if (tp->t_flags & TF_ACCECN_OPT) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_ae = &tp->t_ae;
+ to.to_flags |= ((tp->t_flags2 & TF2_ACO_E0) ? TOF_ACCE_E0 : 0) |
+ ((tp->t_flags2 & TF2_ACO_E1) ? TOF_ACCE_E1 : 0) |
+ ((tp->t_flags2 & TF2_ACO_CE) ? TOF_ACCE_CE : 0);
+ }
/* Add the options. */
tlen += optlen = tcp_addoptions(&to, optp);
-
+ if (to.to_flags & TOF_ACCECNOPT) {
+ if ((to.to_flags & TOF_ACCE_E0) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E0;
+ if ((to.to_flags & TOF_ACCE_E1) == 0)
+ tp->t_flags2 &= ~TF2_ACO_E1;
+ if ((to.to_flags & TOF_ACCE_CE) == 0)
+ tp->t_flags2 &= ~TF2_ACO_CE;
+ }
/* Update m_len in the correct mbuf. */
optm->m_len += optlen;
} else
@@ -2330,6 +2344,14 @@
tcp_log_tcpcbinit(tp);
#endif
tp->t_pacing_rate = -1;
+ if (V_tcp_do_lrd)
+ tp->t_flags |= TF_LRD;
+ tp->t_ae.re0b = 1;
+ tp->t_ae.re1b = 1;
+ tp->t_ae.rceb = 0;
+ tp->t_ae.se0b = 1;
+ tp->t_ae.se1b = 1;
+ tp->t_ae.sceb = 0;
if (tp->t_fb->tfb_tcp_fb_init) {
if ((*tp->t_fb->tfb_tcp_fb_init)(tp, &tp->t_fb_ptr)) {
refcount_release(&tp->t_fb->tfb_refcnt);
diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c
--- a/sys/netinet/tcp_syncache.c
+++ b/sys/netinet/tcp_syncache.c
@@ -1810,6 +1810,7 @@
#ifdef INET6
struct ip6_hdr *ip6 = NULL;
#endif
+ struct accecn ae;
NET_EPOCH_ASSERT();
@@ -1949,6 +1950,20 @@
/* don't send cookie again when retransmitting response */
sc->sc_tfo_cookie = NULL;
}
+ if (V_tcp_ecn_option)
+ to.to_flags |= TOF_ACCE_SYN;
+ }
+ if (V_tcp_ecn_option &&
+ (sc->sc_flags & SCF_ECN_MASK) &&
+ ((sc->sc_flags & SCF_ECN_MASK) != SCF_ECN)) {
+ to.to_flags |= TOF_ACCECNOPT;
+ to.to_flags |= TOF_ACCE_E0 |
+ TOF_ACCE_E1 |
+ TOF_ACCE_CE;
+ ae.re0b = 1;
+ ae.re1b = 1;
+ ae.rceb = 0;
+ to.to_ae = &ae;
}
if (sc->sc_flags & SCF_TIMESTAMP) {
to.to_tsval = sc->sc_tsoff + tcp_ts_getticks();
diff --git a/sys/netinet/tcp_timer.c b/sys/netinet/tcp_timer.c
--- a/sys/netinet/tcp_timer.c
+++ b/sys/netinet/tcp_timer.c
@@ -789,6 +789,15 @@
#endif
in_losing(inp);
}
+ /*
+ * Disable AccECN option when
+ * retransmitting after multiple
+ * timeouts.
+ */
+ if ((tp->t_rxtshift >= V_tcp_ecn_maxretries) &&
+ (tp->t_flags2 & TF2_ACE_PERMIT) &&
+ (tp->t_flags & TF_ACCECN_OPT))
+ tp->t_flags &= ~TF_ACCECN_OPT;
tp->snd_nxt = tp->snd_una;
tp->snd_recover = tp->snd_max;
/*
diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c
--- a/sys/netinet/tcp_usrreq.c
+++ b/sys/netinet/tcp_usrreq.c
@@ -1614,15 +1614,23 @@
* AccECN related counters.
*/
if ((tp->t_flags2 & (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) ==
- (TF2_ECN_PERMIT | TF2_ACE_PERMIT))
+ (TF2_ECN_PERMIT | TF2_ACE_PERMIT)) {
/*
* Internal counter starts at 5 for AccECN
* but 0 for RFC3168 ECN.
*/
- ti->tcpi_delivered_ce = tp->t_scep - 5;
- else
- ti->tcpi_delivered_ce = tp->t_scep;
- ti->tcpi_received_ce = tp->t_rcep;
+ ti->tcpi_delivered_ce = tp->t_ae.scep - 5;
+ ti->tcpi_received_ce = tp->t_ae.rcep - 5;
+ } else {
+ ti->tcpi_delivered_ce = tp->t_ae.scep;
+ ti->tcpi_received_ce = tp->t_ae.rcep;
+ }
+ ti->tcpi_received_e0_bytes = tp->t_ae.re0b - 1;
+ ti->tcpi_received_e1_bytes = tp->t_ae.re1b - 1;
+ ti->tcpi_received_ce_bytes = tp->t_ae.rceb;
+ ti->tcpi_delivered_e0_bytes = tp->t_ae.se0b - 1;
+ ti->tcpi_delivered_e1_bytes = tp->t_ae.se1b - 1;
+ ti->tcpi_delivered_ce_bytes = tp->t_ae.sceb;
}
/*
diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h
--- a/sys/netinet/tcp_var.h
+++ b/sys/netinet/tcp_var.h
@@ -129,6 +129,18 @@
int32_t lost_bytes; /* number of rfc6675 IsLost() bytes */
};
+struct accecn {
+ uint32_t rcep; /* Received CE marked pkts; low 3 bits are sent as AE/CWR/ECE */
+ uint32_t scep; /* Synced number of delivered CE pkts, advanced by peer's ACE delta */
+ uint32_t dcep; /* Last delta added to scep, kept so it can be rolled back */
+ uint32_t re0b; /* Received ECT0 marked data bytes (counter starts at 1) */
+ uint32_t re1b; /* Received ECT1 marked data bytes (counter starts at 1) */
+ uint32_t rceb; /* Received CE marked data bytes (counter starts at 0) */
+ uint32_t se0b; /* Synced delivered ECT0 bytes, updated from peer's AccECN option */
+ uint32_t se1b; /* Synced delivered ECT1 bytes, updated from peer's AccECN option */
+ uint32_t sceb; /* Synced delivered CE bytes, updated from peer's AccECN option */
+};
+
#define SEGQ_EMPTY(tp) TAILQ_EMPTY(&(tp)->t_segq)
STAILQ_HEAD(tcp_log_stailq, tcp_log_mem);
@@ -431,8 +443,7 @@
int t_dupacks; /* consecutive dup acks recd */
int t_lognum; /* Number of log entries */
int t_loglimit; /* Maximum number of log entries */
- uint32_t t_rcep; /* Number of received CE marked pkts */
- uint32_t t_scep; /* Synced number of delivered CE pkts */
+ struct accecn t_ae; /* AccECN related byte counters */
int64_t t_pacing_rate; /* bytes / sec, -1 => unlimited */
struct tcp_log_stailq t_logs; /* Log buffer */
struct tcp_log_id_node *t_lin;
@@ -798,7 +809,7 @@
#define TF_TSO 0x01000000 /* TSO enabled on this connection */
#define TF_TOE 0x02000000 /* this connection is offloaded */
#define TF_CLOSED 0x04000000 /* close(2) called on socket */
-#define TF_UNUSED1 0x08000000 /* unused */
+#define TF_ACCECN_OPT 0x08000000 /* AccECN is using TCP options */
#define TF_LRD 0x10000000 /* Lost Retransmission Detection */
#define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */
#define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */
@@ -853,7 +864,9 @@
#define TF2_MBUF_QUEUE_READY 0x00020000 /* Inputs can be queued */
#define TF2_DONT_SACK_QUEUE 0x00040000 /* Don't wake on sack */
#define TF2_CANNOT_DO_ECN 0x00080000 /* The stack does not do ECN */
-
+#define TF2_ACO_E0 0x00100000 /* ECT0 byte counter changed, not yet sent in AccECN option */
+#define TF2_ACO_E1 0x00200000 /* ECT1 byte counter changed, not yet sent in AccECN option */
+#define TF2_ACO_CE 0x00400000 /* CE byte counter changed, not yet sent in AccECN option */
/*
* Structure to hold TCP options that are only used during segment
* processing (in tcp_input), but not held in the tcpcb.
@@ -864,14 +877,21 @@
*/
struct tcpopt {
u_int32_t to_flags; /* which options are present */
-#define TOF_MSS 0x0001 /* maximum segment size */
-#define TOF_SCALE 0x0002 /* window scaling */
-#define TOF_SACKPERM 0x0004 /* SACK permitted */
-#define TOF_TS 0x0010 /* timestamp */
-#define TOF_SIGNATURE 0x0040 /* TCP-MD5 signature option (RFC2385) */
-#define TOF_SACK 0x0080 /* Peer sent SACK option */
-#define TOF_FASTOPEN 0x0100 /* TCP Fast Open (TFO) cookie */
-#define TOF_MAXOPT 0x0200
+#define TOF_MSS 0x00000001 /* maximum segment size */
+#define TOF_SCALE 0x00000002 /* window scaling */
+#define TOF_SACKPERM 0x00000004 /* SACK permitted */
+#define TOF_TS 0x00000010 /* timestamp */
+#define TOF_SIGNATURE 0x00000040 /* TCP-MD5 signature option (RFC2385) */
+#define TOF_SACK 0x00000080 /* Peer sent SACK option */
+#define TOF_FASTOPEN 0x00000100 /* TCP Fast Open (TFO) cookie */
+#define TOF_ACCECNOPT 0x00000200 /* AccECN Option */
+#define TOF_MAXOPT 0x00000400
+ /* Keep internal flags above TOF_MAXOPT */
+#define TOF_ACCE_SYN 0x80000000 /* send empty option */
+#define TOF_ACCE_CE 0x40000000 /* CE counter changed */
+#define TOF_ACCE_E0 0x20000000 /* E0 counter changed */
+#define TOF_ACCE_E1 0x10000000 /* E1 counter changed */
+#define TOF_ACCE_ACKNOW 0x08000000 /* send full option */
u_int32_t to_tsval; /* new timestamp */
u_int32_t to_tsecr; /* reflected timestamp */
u_char *to_sacks; /* pointer to the first SACK blocks */
@@ -881,7 +901,8 @@
u_int8_t to_wscale; /* window scaling */
u_int8_t to_nsacks; /* number of SACK blocks */
u_int8_t to_tfo_len; /* TFO cookie length */
- u_int32_t to_spare; /* UTO */
+ struct accecn *to_ae; /* pointer to AccECN byte counters */
+ u_int32_t to_spare; /* UTO */
};
/*
@@ -1283,6 +1304,7 @@
VNET_DECLARE(int, tcp_do_sack);
VNET_DECLARE(int, tcp_do_tso);
VNET_DECLARE(int, tcp_ecn_maxretries);
+VNET_DECLARE(int, tcp_ecn_option);
VNET_DECLARE(int, tcp_initcwnd_segments);
VNET_DECLARE(int, tcp_insecure_rst);
VNET_DECLARE(int, tcp_insecure_syn);
@@ -1329,6 +1351,7 @@
#define V_tcp_do_sack VNET(tcp_do_sack)
#define V_tcp_do_tso VNET(tcp_do_tso)
#define V_tcp_ecn_maxretries VNET(tcp_ecn_maxretries)
+#define V_tcp_ecn_option VNET(tcp_ecn_option)
#define V_tcp_initcwnd_segments VNET(tcp_initcwnd_segments)
#define V_tcp_insecure_rst VNET(tcp_insecure_rst)
#define V_tcp_insecure_syn VNET(tcp_insecure_syn)

File Metadata

Mime Type
text/plain
Expires
Sun, Feb 1, 4:27 AM (20 h, 40 m)
Storage Engine
blob
Storage Format
Raw Data
Storage Handle
28274031
Default Alt Text
D36303.id132863.diff (23 KB)

Event Timeline