Index: sys/dev/cxgbe/tom/t4_listen.c =================================================================== --- sys/dev/cxgbe/tom/t4_listen.c +++ sys/dev/cxgbe/tom/t4_listen.c @@ -1097,7 +1097,7 @@ static void pass_accept_req_to_protohdrs(struct adapter *sc, const struct mbuf *m, - struct in_conninfo *inc, struct tcphdr *th) + struct in_conninfo *inc, struct tcphdr *th, uint8_t *iptos) { const struct cpl_pass_accept_req *cpl = mtod(m, const void *); const struct ether_header *eh; @@ -1114,6 +1114,17 @@ tcp = (const void *)(l3hdr + G_IP_HDR_LEN(hlen)); } + /* Extract the IPv4 TOS / IPv6 traffic-class byte (DiffServ + ECN bits) for AccECN; skipped when iptos is NULL. */ + if (iptos) { + if (((struct ip *)l3hdr)->ip_v == IPVERSION) { + const struct ip *ip = (const void *)l3hdr; + *iptos = ip->ip_tos; + } else { + const struct ip6_hdr *ip6 = (const void *)l3hdr; + *iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; /* bits 20-27 of the host-order flow word */ + } + } + if (inc) { bzero(inc, sizeof(*inc)); inc->inc_fport = tcp->th_sport; @@ -1254,6 +1265,7 @@ struct synq_entry *synqe = NULL; int reject_reason, v, ntids; uint16_t vid, l2info; + uint8_t iptos; struct epoch_tracker et; #ifdef INVARIANTS unsigned int opcode = G_CPL_OPCODE(be32toh(OPCODE_TID(cpl))); @@ -1317,7 +1329,7 @@ if (lctx->vnet != ifp->if_vnet) REJECT_PASS_ACCEPT_REQ(true); - pass_accept_req_to_protohdrs(sc, m, &inc, &th); + pass_accept_req_to_protohdrs(sc, m, &inc, &th, &iptos); if (inc.inc_flags & INC_ISIPV6) { /* Don't offload if the ifcap isn't enabled */ @@ -1390,7 +1402,7 @@ * syncache_add. Note that syncache_add releases the pcb lock.
*/ t4opt_to_tcpopt(&cpl->tcpopt, &to); - toe_syncache_add(&inc, &to, &th, inp, tod, synqe); + toe_syncache_add(&inc, &to, &th, inp, tod, synqe, iptos); if (atomic_load_int(&synqe->ok_to_respond) > 0) { uint64_t opt0; @@ -1471,9 +1483,10 @@ struct tcphdr *th, struct tcpopt *to) { uint16_t tcp_opt = be16toh(cpl->tcp_opt); + uint8_t iptos; /* start off with the original SYN */ - pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th); + pass_accept_req_to_protohdrs(sc, synqe->syn, inc, th, &iptos); /* modify parts to make it look like the ACK to our SYN|ACK */ th->th_flags = TH_ACK; Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -71,6 +71,8 @@ #define TH_URG 0x20 #define TH_ECE 0x40 #define TH_CWR 0x80 +#define TH_AE 0x100 /* AccECN AE flag: ninth flag bit, carried in th_x2 (as TH_AE >> 8) rather than in th_flags */ + #define TH_FLAGS (TH_FIN|TH_SYN|TH_RST|TH_PUSH|TH_ACK|TH_URG|TH_ECE|TH_CWR) #define PRINT_TH_FLAGS "\20\1FIN\2SYN\3RST\4PUSH\5ACK\6URG\7ECE\10CWR" Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -1325,7 +1325,7 @@ #endif TCP_PROBE3(debug__input, tp, th, m); tcp_dooptions(&to, optp, optlen, TO_SYN); - if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL)) + if (syncache_add(&inc, &to, th, inp, &so, m, NULL, NULL, iptos)) goto tfo_socket_result; /* @@ -1576,11 +1576,8 @@ * TCP ECN processing.
*/ if (tp->t_flags & TF_ECN_PERMIT) { - if (thflags & TH_CWR) - tp->t_flags &= ~TF_ECN_SND_ECE; switch (iptos & IPTOS_ECN_MASK) { case IPTOS_ECN_CE: - tp->t_flags |= TF_ECN_SND_ECE; TCPSTAT_INC(tcps_ecn_ce); break; case IPTOS_ECN_ECT0: @@ -1591,11 +1588,29 @@ break; } + char d_ace = 0; /* ACE delta; stays 0 unless TF_ACE_PERMIT is set, so the later test never reads an indeterminate value */ + + if (tp->t_flags & TF_ACE_PERMIT) { + d_ace = (tcp_get_ace(th) + 8 - (tp->s_cep & 0x07)) & 0x07; /* modulo-8 distance from the low 3 bits of s_cep to the received ACE value */ + tp->s_cep += d_ace; + if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + tp->r_cep += 1; + } else { + if (thflags & TH_CWR) + tp->t_flags &= ~TF_ECN_SND_ECE; + if ((iptos & IPTOS_ECN_MASK) == IPTOS_ECN_CE) + tp->t_flags |= TF_ECN_SND_ECE; + } + /* Process a packet differently from RFC3168. */ cc_ecnpkt_handler(tp, th, iptos); - /* Congestion experienced. */ - if (thflags & TH_ECE) { + /* Congestion experienced. + * With ACE, process a cong signal with ACE changed, + * for legacy ECN, whenever ECE is received + */ + if ((!(tp->t_flags & TF_ACE_PERMIT) && (thflags & TH_ECE)) || + ((tp->t_flags & TF_ACE_PERMIT) && (d_ace != 0))) { cc_cong_signal(tp, th, CC_ECN); } } @@ -2013,6 +2028,49 @@ TCPSTAT_INC(tcps_ecn_shs); } + /* decoding Accurate ECN according to table in section 3.1.1 */ + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + int xflags; + xflags = ((th->th_x2 << 8) | thflags) & (TH_AE|TH_CWR|TH_ECE); + switch (xflags) { + /* non-ECT SYN */ + case (0|TH_CWR|0): + tp->t_flags |= (TF_ACE_PERMIT|TF_ECN_PERMIT); + tp->s_cep = 5; + tp->r_cep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_nect); + break; + /* ECT1 SYN */ + case (0|TH_CWR|TH_ECE): + tp->t_flags |= (TF_ACE_PERMIT|TF_ECN_PERMIT); + tp->s_cep = 5; + tp->r_cep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect1); + break; + /* ECT0 SYN */ + case (TH_AE|0|0): + tp->t_flags |= (TF_ACE_PERMIT|TF_ECN_PERMIT); + tp->s_cep = 5; + tp->r_cep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect0); + break; + /* CE SYN */ + case (TH_AE|TH_CWR|0): + tp->t_flags |= (TF_ACE_PERMIT|TF_ECN_PERMIT); + tp->s_cep = 
6; + tp->r_cep = 5; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ce); /* CE SYN arm: count it as CE, not as non-ECT */ + break; + default: + break; + } + } + /* * Received in SYN_SENT[*] state. * Transitions: Index: sys/netinet/tcp_output.c =================================================================== --- sys/netinet/tcp_output.c +++ sys/netinet/tcp_output.c @@ -1110,6 +1110,16 @@ } else flags |= TH_ECE|TH_CWR; } + /* + * Send an Accurate ECN setup SYN packet + */ + if (tp->t_state == TCPS_SYN_SENT && V_tcp_do_ecn == 3) { + if (tp->t_rxtshift >= 1) { + if (tp->t_rxtshift <= V_tcp_ecn_maxretries) + flags |= TH_ECE|TH_CWR|TH_AE; + } else + flags |= TH_ECE|TH_CWR|TH_AE; + } if (tp->t_state == TCPS_ESTABLISHED && (tp->t_flags & TF_ECN_PERMIT)) { @@ -1132,14 +1142,29 @@ /* * Reply with proper ECN notifications. */ - if (tp->t_flags & TF_ECN_SND_CWR) { - flags |= TH_CWR; - tp->t_flags &= ~TF_ECN_SND_CWR; - } - if (tp->t_flags & TF_ECN_SND_ECE) - flags |= TH_ECE; + if (tp->t_flags & TF_ACE_PERMIT) { /* encode the low 3 bits of r_cep as the ACE field: ECE = bit 0, CWR = bit 1, AE = bit 2, mirroring tcp_get_ace() */ + if (tp->r_cep & 0x01) + flags |= TH_ECE; + else + flags &= ~TH_ECE; + if (tp->r_cep & 0x02) + flags |= TH_CWR; + else + flags &= ~TH_CWR; + if (tp->r_cep & 0x04) + flags |= TH_AE; + else + flags &= ~TH_AE; + } else { /* legacy (non-AccECN) ECN: both tests below belong to this branch only */ + if (tp->t_flags & TF_ECN_SND_CWR) { + flags |= TH_CWR; + tp->t_flags &= ~TF_ECN_SND_CWR; + } + if (tp->t_flags & TF_ECN_SND_ECE) + flags |= TH_ECE; + } } - + /* * If we are doing retransmissions, then snd_nxt will * not reflect the first unsent octet. For ACK only @@ -1169,7 +1194,8 @@ bcopy(opt, th + 1, optlen); th->th_off = (sizeof (struct tcphdr) + optlen) >> 2; } - th->th_flags = flags; + th->th_flags = (flags & 0x00FF); + th->th_x2 = (flags & 0x0100) >> 8; /* * Calculate receive window. Don't shrink window, * but avoid silly window syndrome.
Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -3244,3 +3244,17 @@ if (inp->inp_socket == NULL) xt->xt_inp.xi_socket.xso_protocol = IPPROTO_TCP; } + +/* Return the 3-bit ACE value of a TCP header: ECE contributes 1, CWR 2, and the AE bit (kept in th_x2) 4. */ +int +tcp_get_ace(struct tcphdr *th) +{ + int ace = 0; + if (th->th_flags & TH_ECE) + ace += 1; + if (th->th_flags & TH_CWR) + ace += 2; + if (th->th_x2 & (TH_AE >> 8)) + ace += 4; + return ace; +} Index: sys/netinet/tcp_syncache.h =================================================================== --- sys/netinet/tcp_syncache.h +++ sys/netinet/tcp_syncache.h @@ -45,7 +45,7 @@ struct tcphdr *, struct socket **, struct mbuf *); int syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct inpcb *, struct socket **, struct mbuf *, - void *, void *); + void *, void *, uint8_t); void syncache_chkrst(struct in_conninfo *, struct tcphdr *, struct mbuf *); void syncache_badack(struct in_conninfo *); int syncache_pcblist(struct sysctl_req *req, int max_pcbs, int *pcbs_exported); @@ -90,6 +90,10 @@ #define SCF_SIGNATURE 0x20 /* send MD5 digests */ #define SCF_SACK 0x80 /* send SACK option */ #define SCF_ECN 0x100 /* send ECN setup packet */ +#define SCF_ACE_N 0x200 /* send ACE non-ECT setup */ +#define SCF_ACE_0 0x400 /* send ACE ECT0 setup */ +#define SCF_ACE_1 0x800 /* send ACE ECT1 setup */ +#define SCF_ACE_CE 0x1000 /* send ACE CE setup */ struct syncache_head { struct mtx sch_mtx; Index: sys/netinet/tcp_syncache.c =================================================================== --- sys/netinet/tcp_syncache.c +++ sys/netinet/tcp_syncache.c @@ -1309,7 +1309,7 @@ int syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, struct inpcb *inp, struct socket **lsop, struct mbuf *m, void *tod, - void *todctx) + void *todctx, uint8_t tos) { struct tcpcb *tp; struct socket *so; @@ -1612,8 +1612,62 @@ sc->sc_peer_mss = to->to_mss; /* peer mss may be zero */ if 
(ltflags & TF_NOOPT) sc->sc_flags |= SCF_NOOPT; - if ((th->th_flags & (TH_ECE|TH_CWR)) && V_tcp_do_ecn) - sc->sc_flags |= SCF_ECN; + /* ECN handshake: classify the SYN by its AE/CWR/ECE flag bits */ + if (V_tcp_do_ecn) { + int xflags; + xflags = ((th->th_x2 << 8) | th->th_flags) & (TH_AE|TH_CWR|TH_ECE); + switch (xflags) { + /* no ECN */ + case (0|0|0): + break; + /* legacy ECN */ + case (0|TH_CWR|TH_ECE): + sc->sc_flags |= SCF_ECN; + break; + /* Accurate ECN: record the ECN codepoint found in tos */ + case (TH_AE|TH_CWR|TH_ECE): + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + + switch (tos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + sc->sc_flags |= SCF_ACE_CE; + break; + case IPTOS_ECN_ECT0: + sc->sc_flags |= SCF_ACE_0; + break; + case IPTOS_ECN_ECT1: + sc->sc_flags |= SCF_ACE_1; + break; + case IPTOS_ECN_NOTECT: + sc->sc_flags |= SCF_ACE_N; + break; + } + } else /* AccECN not enabled: treat the SYN as legacy ECN */ + sc->sc_flags |= SCF_ECN; + break; + /* Any other flag combination (section 3.1.2): handled like an AccECN SYN when tcp_do_ecn is 3 or 4, otherwise no ECN flag is set */ + default: + if ((V_tcp_do_ecn == 3) || + (V_tcp_do_ecn == 4)) { + switch (tos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + sc->sc_flags |= SCF_ACE_CE; + break; + case IPTOS_ECN_ECT0: + sc->sc_flags |= SCF_ACE_0; + break; + case IPTOS_ECN_ECT1: + sc->sc_flags |= SCF_ACE_1; + break; + case IPTOS_ECN_NOTECT: + sc->sc_flags |= SCF_ACE_N; + break; + } + } + break; + } + } if (V_tcp_syncookies) sc->sc_iss = syncookie_generate(sch, sc); @@ -1787,6 +1841,28 @@ TCPSTAT_INC(tcps_ecn_shs); } + if ((flags & TH_SYN) && (sc->sc_flags & SCF_ACE_N)) { + th->th_flags |= TH_CWR; + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_nect); + } + if ((flags & TH_SYN) && (sc->sc_flags & SCF_ACE_0)) { + th->th_x2 |= (TH_AE >> 8); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect0); + } + if ((flags & TH_SYN) && (sc->sc_flags & SCF_ACE_1)) { + th->th_flags |= (TH_ECE | TH_CWR); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ect1); + } + if ((flags & TH_SYN) && (sc->sc_flags & SCF_ACE_CE)) { + th->th_flags |= TH_CWR; + th->th_x2 |= (TH_AE >> 8); + TCPSTAT_INC(tcps_ecn_shs); + TCPSTAT_INC(tcps_ace_ce); + } + /* Tack on the TCP
options. */ if ((sc->sc_flags & SCF_NOOPT) == 0) { to.to_flags = 0; Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -205,6 +205,8 @@ u_int t_keepcnt; /* number of keepalives before close */ int t_dupacks; /* consecutive dup acks recd */ int t_lognum; /* Number of log entries */ + uint32_t r_cep; /* Number of received CE marked packets */ + uint32_t s_cep; /* Synced number of delivered CE packets */ struct tcp_log_stailq t_logs; /* Log buffer */ struct tcp_log_id_node *t_lin; struct tcp_log_id_bucket *t_lib; @@ -336,6 +338,7 @@ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ #define TF_FASTOPEN 0x80000000 /* TCP Fast Open indication */ +#define TF_ACE_PERMIT 0x100000000 /* Accurate ECN mode; NOTE(review): this is bit 32, so it needs t_flags to be wider than 32 bits - confirm */ #define IN_FASTRECOVERY(t_flags) (t_flags & TF_FASTRECOVERY) #define ENTER_FASTRECOVERY(t_flags) t_flags |= TF_FASTRECOVERY @@ -610,7 +613,12 @@ uint64_t tcps_pmtud_blackhole_activated_min_mss; /* BH at min MSS Count */ uint64_t tcps_pmtud_blackhole_failed; /* Black Hole Failure Count */ - uint64_t _pad[12]; /* 6 UTO, 6 TBD */ + /* Accurate ECN Handshake stats */ + uint64_t tcps_ace_nect; /* ACE SYN packet with Non-ECT */ + uint64_t tcps_ace_ect1; /* ACE SYN packet with ECT1 */ + uint64_t tcps_ace_ect0; /* ACE SYN packet with ECT0 */ + uint64_t tcps_ace_ce; /* ACE SYN packet with CE */ + uint64_t _pad[8]; /* 8 spare slots left after the 4 ACE counters; presumably 6 UTO, 2 TBD */ }; #define tcps_rcvmemdrop tcps_rcvreassfull /* compat */ @@ -946,6 +954,7 @@ tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb); +int tcp_get_ace(struct tcphdr *th); /* 3-bit ACE value built from the ECE, CWR and AE header bits */ static inline void tcp_fields_to_host(struct tcphdr *th) Index: sys/netinet/toecore.h =================================================================== --- sys/netinet/toecore.h +++ sys/netinet/toecore.h @@ -130,7 +130,7 @@ void 
toe_connect_failed(struct toedev *, struct inpcb *, int); void toe_syncache_add(struct in_conninfo *, struct tcpopt *, struct tcphdr *, - struct inpcb *, void *, void *); + struct inpcb *, void *, void *, uint8_t); /* trailing uint8_t: IP TOS byte */ int toe_syncache_expand(struct in_conninfo *, struct tcpopt *, struct tcphdr *, struct socket **); Index: sys/netinet/toecore.c =================================================================== --- sys/netinet/toecore.c +++ sys/netinet/toecore.c @@ -337,13 +337,13 @@ void toe_syncache_add(struct in_conninfo *inc, struct tcpopt *to, struct tcphdr *th, - struct inpcb *inp, void *tod, void *todctx) + struct inpcb *inp, void *tod, void *todctx, uint8_t tos) { struct socket *lso = inp->inp_socket; INP_WLOCK_ASSERT(inp); - syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx); + syncache_add(inc, to, th, inp, &lso, NULL, tod, todctx, tos); /* tos is handed to syncache_add unchanged */ } int Index: usr.bin/netstat/inet.c =================================================================== --- usr.bin/netstat/inet.c +++ usr.bin/netstat/inet.c @@ -763,6 +763,15 @@ p(tcps_ecn_rcwnd, "\t{:congestion-reductions/%ju} " "{N:/time%s ECN reduced the congestion window}\n"); + p(tcps_ace_nect, "\t{:ace-nonect-syn/%ju} " + "{N:/ACE SYN packet%s with Non-ECT}\n"); /* first of four ACE SYN counters, one per IP-ECN codepoint */ + p(tcps_ace_ect0, "\t{:ace-ect0-syn/%ju} " + "{N:/ACE SYN packet%s with ECT0}\n"); + p(tcps_ace_ect1, "\t{:ace-ect1-syn/%ju} " + "{N:/ACE SYN packet%s with ECT1}\n"); + p(tcps_ace_ce, "\t{:ace-ce-syn/%ju} " + "{N:/ACE SYN packet%s with CE}\n"); + xo_close_container("ecn"); xo_open_container("tcp-signature"); p(tcps_sig_rcvgoodsig, "\t{:received-good-signature/%ju} "