Index: sys/netinet/cc/cc.h =================================================================== --- sys/netinet/cc/cc.h +++ sys/netinet/cc/cc.h @@ -91,15 +91,20 @@ struct sctp_nets *sctp; } ccvc; uint16_t nsegs; /* # segments coalesced into current chain. */ + uint8_t labc; /* Dont use system abc use passed in */ }; /* cc_var flags. */ #define CCF_ABC_SENTAWND 0x0001 /* ABC counted cwnd worth of bytes? */ #define CCF_CWND_LIMITED 0x0002 /* Are we currently cwnd limited? */ -#define CCF_UNUSED1 0x0004 /* unused */ +#define CCF_USE_LOCAL_ABC 0x0004 /* Dont use the system l_abc val */ #define CCF_ACKNOW 0x0008 /* Will this ack be sent now? */ #define CCF_IPHDR_CE 0x0010 /* Does this packet set CE bit? */ #define CCF_TCPHDR_CWR 0x0020 /* Does this packet set CWR bit? */ +#define CCF_MAX_CWND 0x0040 /* Have we reached maximum cwnd? */ +#define CCF_CHG_MAX_CWND 0x0080 /* Cubic max_cwnd changed, for K */ +#define CCF_USR_IWND 0x0100 /* User specified initial window */ +#define CCF_USR_IWND_INIT_NSEG 0x0200 /* Convert segs to bytes on conn init */ /* ACK types passed to the ack_received() hook. */ #define CC_ACK 0x0001 /* Regular in sequence ACK. */ Index: sys/netinet/cc/cc_newreno.h =================================================================== --- sys/netinet/cc/cc_newreno.h +++ sys/netinet/cc/cc_newreno.h @@ -31,12 +31,17 @@ #define CCALGONAME_NEWRENO "newreno" +struct newreno { + uint32_t beta; + uint32_t beta_ecn; + uint32_t newreno_flags; +}; + struct cc_newreno_opts { - int name; + int name; uint32_t val; }; -#define CC_NEWRENO_BETA 1 -#define CC_NEWRENO_BETA_ECN 2 - +#define CC_NEWRENO_BETA 1 /* Beta for normal DUP-ACK/Sack recovery */ +#define CC_NEWRENO_BETA_ECN 2 /* ECN Beta for Abe */ #endif /* _CC_NEWRENO_H */ Index: sys/netinet/cc/cc_newreno.c =================================================================== --- sys/netinet/cc/cc_newreno.c +++ sys/netinet/cc/cc_newreno.c @@ -86,8 +86,8 @@ static void newreno_post_recovery(struct cc_var *ccv); static int newreno_ctl_output(struct cc_var *ccv, struct sockopt *sopt, void *buf); -VNET_DEFINE_STATIC(uint32_t, newreno_beta) = 50; -VNET_DEFINE_STATIC(uint32_t, newreno_beta_ecn) = 80; +VNET_DEFINE(uint32_t, newreno_beta) = 50; +VNET_DEFINE(uint32_t, newreno_beta_ecn) = 80; #define V_newreno_beta VNET(newreno_beta) #define V_newreno_beta_ecn VNET(newreno_beta_ecn) @@ -101,11 +101,6 @@ .ctl_output = newreno_ctl_output, }; -struct newreno { - uint32_t beta; - uint32_t beta_ecn; -}; - static inline struct newreno * newreno_malloc(struct cc_var *ccv) { @@ -182,9 +177,15 @@ * XXXLAS: Find a way to signal SS after RTO that * doesn't rely on tcpcb vars. */ + uint16_t abc_val; + + if (ccv->flags & CCF_USE_LOCAL_ABC) + abc_val = ccv->labc; + else + abc_val = V_tcp_abc_l_var; if (CCV(ccv, snd_nxt) == CCV(ccv, snd_max)) incr = min(ccv->bytes_this_ack, - ccv->nsegs * V_tcp_abc_l_var * + ccv->nsegs * abc_val * CCV(ccv, t_maxseg)); else incr = min(ccv->bytes_this_ack, CCV(ccv, t_maxseg)); @@ -237,11 +238,19 @@ u_int mss; cwin = CCV(ccv, snd_cwnd); - mss = tcp_maxseg(ccv->ccvc.tcp); + mss = tcp_fixed_maxseg(ccv->ccvc.tcp); nreno = ccv->cc_data; beta = (nreno == NULL) ? V_newreno_beta : nreno->beta; beta_ecn = (nreno == NULL) ? V_newreno_beta_ecn : nreno->beta_ecn; - if (V_cc_do_abe && type == CC_ECN) + + /* + * Note that we only change the backoff for ECN if the + * global sysctl V_cc_do_abe is set the stack itself + * has set a flag in our newreno_flags (due to pacing) telling + * us to use the lower valued back-off. 
+ */ + if (V_cc_do_abe || + (nreno && (nreno->newreno_flags & CC_NEWRENO_BETA_ECN) && (type == CC_ECN))) factor = beta_ecn; else factor = beta; @@ -260,8 +269,7 @@ V_cc_do_abe && V_cc_abe_frlossreduce)) { CCV(ccv, snd_ssthresh) = ((uint64_t)CCV(ccv, snd_ssthresh) * - (uint64_t)beta) / - (100ULL * (uint64_t)beta_ecn); + (uint64_t)beta) / (uint64_t)beta_ecn; } if (!IN_CONGRECOVERY(CCV(ccv, t_flags))) CCV(ccv, snd_ssthresh) = cwin; @@ -344,7 +352,7 @@ nreno->beta = opt->val; break; case CC_NEWRENO_BETA_ECN: - if (!V_cc_do_abe) + if ((!V_cc_do_abe) && ((nreno->newreno_flags & CC_NEWRENO_BETA_ECN) == 0)) return (EACCES); nreno->beta_ecn = opt->val; break; Index: sys/netinet/tcp.h =================================================================== --- sys/netinet/tcp.h +++ sys/netinet/tcp.h @@ -181,13 +181,24 @@ #define TCP_TXTLS_MODE 40 /* Transmit TLS mode */ #define TCP_RXTLS_ENABLE 41 /* TLS framing and encryption for receive */ #define TCP_RXTLS_MODE 42 /* Receive TLS mode */ +#define TCP_IWND_NB 43 /* Override initial window (units: bytes) */ +#define TCP_IWND_NSEG 44 /* Override initial window (units: MSS segs) */ +#define TCP_LOGID_CNT 46 /* get number of connections with the same ID */ +#define TCP_LOG_TAG 47 /* configure tag for grouping logs */ +#define TCP_USER_LOG 48 /* userspace log event */ #define TCP_CONGESTION 64 /* get/set congestion control algorithm */ #define TCP_CCALGOOPT 65 /* get/set cc algorithm specific options */ +#define TCP_MAXUNACKTIME 68 /* maximum time without making progress (sec) */ +#define TCP_MAXPEAKRATE 69 /* maximum peak rate allowed (kbps) */ +#define TCP_IDLE_REDUCE 70 /* Reduce cwnd on idle input */ #define TCP_REMOTE_UDP_ENCAPS_PORT 71 /* Enable TCP over UDP tunneling via the specified port */ #define TCP_DELACK 72 /* socket option for delayed ack */ #define TCP_FIN_IS_RST 73 /* A fin from the peer is treated has a RST */ #define TCP_LOG_LIMIT 74 /* Limit to number of records in tcp-log */ #define TCP_SHARED_CWND_ALLOWED 75 /* Use of a shared cwnd is allowed */ +#define TCP_PROC_ACCOUNTING 76 /* Do accounting on tcp cpu usage and counts */ +#define TCP_USE_CMP_ACKS 77 /* The transport can handle the Compressed mbuf acks */ +#define TCP_PERF_INFO 78 /* retrieve accounting counters */ #define TCP_KEEPINIT 128 /* N, time to establish connection */ #define TCP_KEEPIDLE 256 /* L,N,X start keeplives after this period */ #define TCP_KEEPINTVL 512 /* L,N interval between keepalives */ @@ -201,7 +212,7 @@ #define TCP_RACK_MBUF_QUEUE 1050 /* Do we allow mbuf queuing if supported */ #define TCP_RACK_PROP 1051 /* RACK proportional rate reduction (bool) */ #define TCP_RACK_TLP_REDUCE 1052 /* RACK TLP cwnd reduction (bool) */ -#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacing reduction factor (divisor) */ +#define TCP_RACK_PACE_REDUCE 1053 /* RACK Pacingv reduction factor (divisor) */ #define TCP_RACK_PACE_MAX_SEG 1054 /* Max TSO size we will send */ #define TCP_RACK_PACE_ALWAYS 1055 /* Use the always pace method */ #define TCP_RACK_PROP_RATE 1056 /* The proportional reduction rate */ @@ -284,6 +295,16 @@ #define TCP_RACK_PACE_TO_FILL 1127 /* If we are not in recovery, always pace to fill the cwnd in 1 RTT */ #define TCP_SHARED_CWND_TIME_LIMIT 1128 /* we should limit to low time values the scwnd life */ #define TCP_RACK_PROFILE 1129 /* Select a profile that sets multiple options */ +#define TCP_HDWR_RATE_CAP 1130 /* Allow hardware rates to cap pacing rate */ +#define TCP_PACING_RATE_CAP 1131 /* Highest rate allowed in pacing in bytes per second (uint64_t) */ +#define 
TCP_HDWR_UP_ONLY 1132 /* Allow the pacing rate to climb but not descend (with the exception of fill-cw */ +#define TCP_RACK_ABC_VAL 1133 /* Set a local ABC value different then the system default */ +#define TCP_REC_ABC_VAL 1134 /* Do we use the ABC value for recovery or the override one from sysctl */ +#define TCP_RACK_MEASURE_CNT 1135 /* How many measurements are required in GP pacing */ +#define TCP_DEFER_OPTIONS 1136 /* Defer options until the proper number of measurements occur, does not defer TCP_RACK_MEASURE_CNT */ +#define TCP_FAST_RSM_HACK 1137 /* Do we do the broken thing where we don't twiddle the TLP bits properly in fast_rsm_output? */ +#define TCP_RACK_PACING_BETA 1138 /* Changing the beta for pacing */ +#define TCP_RACK_PACING_BETA_ECN 1139 /* Changing the beta for ecn with pacing */ /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -295,6 +316,7 @@ #define TCPI_OPT_WSCALE 0x04 #define TCPI_OPT_ECN 0x08 #define TCPI_OPT_TOE 0x10 +#define TCPI_OPT_TFO 0x20 /* Maximum length of log ID. */ #define TCP_LOG_ID_LEN 64 Index: sys/netinet/tcp_accounting.h =================================================================== --- /dev/null +++ sys/netinet/tcp_accounting.h @@ -0,0 +1,39 @@ +#ifndef __tcp_accounting_h__ +#define __tcp_accounting_h__ +/* + * Return values from tcp_do_ack_accounting + * and indexs to the into the tcp_proc_time[] + * array. + */ +#define ACK_BEHIND 0 +#define ACK_SACK 1 +#define ACK_CUMACK 2 +#define ACK_CUMACK_SACK 3 +#define ACK_DUPACK 4 +#define ACK_RWND 5 +/* Added values for tracking output too */ +#define SND_BLOCKED 6 +#define SND_LIMITED 7 +#define SND_OUT_DATA 8 +#define SND_OUT_ACK 9 +#define SND_OUT_FAIL 10 +/* We also count in the counts array two added (MSS sent and ACKS In) */ +#define CNT_OF_MSS_OUT 11 +#define CNT_OF_ACKS_IN 12 + +/* for the tcpcb we add two more cycle counters */ +#define CYC_HANDLE_MAP 11 +#define CYC_HANDLE_ACK 12 + +/* Should the tp->xxx array's be alloc'ed? 
*/ +/* #define TCP_NUM_PROC_COUNTERS 11 defined in tcp_var.h */ +/* #define TCP_NUM_CNT_COUNTERS 13 defined in tcp_var.h */ + +#ifdef _KERNEL +#ifdef TCP_ACCOUNTING +extern counter_u64_t tcp_cnt_counters[TCP_NUM_CNT_COUNTERS]; +extern counter_u64_t tcp_proc_time[TCP_NUM_PROC_COUNTERS]; +#endif +#endif + +#endif Index: sys/netinet/tcp_input.c =================================================================== --- sys/netinet/tcp_input.c +++ sys/netinet/tcp_input.c @@ -526,7 +526,7 @@ (V_tcp_delack_enabled || (tp->t_flags & TF_NEEDSYN))) void inline -cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) +cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos) { INP_WLOCK_ASSERT(tp->t_inpcb); @@ -544,7 +544,7 @@ break; } - if (th->th_flags & TH_CWR) + if (flags & TH_CWR) tp->ccv->flags |= CCF_TCPHDR_CWR; else tp->ccv->flags &= ~CCF_TCPHDR_CWR; @@ -558,6 +558,12 @@ } } +void inline +cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos) +{ + cc_ecnpkt_handler_flags(tp, th->th_flags, iptos); +} + /* * TCP input handling is split into multiple parts: * tcp6_input is a thin wrapper around tcp_input for the extended Index: sys/netinet/tcp_log_buf.h =================================================================== --- sys/netinet/tcp_log_buf.h +++ sys/netinet/tcp_log_buf.h @@ -174,7 +174,7 @@ TCP_LOG_IN = 1, /* Incoming packet 1 */ TCP_LOG_OUT, /* Transmit (without other event) 2 */ TCP_LOG_RTO, /* Retransmit timeout 3 */ - TCP_LOG_TF_ACK, /* Transmit due to TF_ACK 4 */ + TCP_LOG_SB_WAKE, /* Awaken socket buffer 4 */ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER, /* Detected reorder 7 */ @@ -200,7 +200,7 @@ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ - BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ + TCP_LOG_MAPCHG, /* Map Changes to the sendmap 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ @@ -232,7 +232,9 @@ TCP_LOG_USER_EVENT, /* User space event data 59 */ TCP_LOG_SENDFILE, /* sendfile() logging for TCP connections 60 */ TCP_LOG_HTTP_T, /* logging of http request tracking 61 */ - TCP_LOG_END /* End (keep at end) 62 */ + TCP_LOG_ACCOUNTING, /* Log of TCP Accounting data 62 */ + TCP_LOG_FSB, /* FSB information 63 */ + TCP_LOG_END /* End (keep at end) 64 */ }; enum tcp_log_states { Index: sys/netinet/tcp_ratelimit.h =================================================================== --- sys/netinet/tcp_ratelimit.h +++ sys/netinet/tcp_ratelimit.h @@ -43,7 +43,9 @@ struct tcp_hwrate_limit_table { const struct tcp_rate_set *ptbl; /* Pointer to parent table */ struct m_snd_tag *tag; /* Send tag if needed (chelsio) */ - uint64_t rate; /* Rate we get in Bytes per second (Bps) */ + long rate; /* Rate we get in Bytes per second (Bps) */ + long using; /* How many flows are using this hdwr rate. 
*/ + long rs_num_enobufs; uint32_t time_between; /* Time-Gap between packets at this rate */ uint32_t flags; }; Index: sys/netinet/tcp_ratelimit.c =================================================================== --- sys/netinet/tcp_ratelimit.c +++ sys/netinet/tcp_ratelimit.c @@ -367,11 +367,22 @@ OID_AUTO, "pacetime", CTLFLAG_RD, &rs->rs_rlt[i].time_between, 0, "Time hardware inserts between 1500 byte sends"); - SYSCTL_ADD_U64(&rs->sysctl_ctx, + SYSCTL_ADD_LONG(&rs->sysctl_ctx, SYSCTL_CHILDREN(rl_rate_num), OID_AUTO, "rate", CTLFLAG_RD, - &rs->rs_rlt[i].rate, 0, + &rs->rs_rlt[i].rate, "Rate in bytes per second"); + SYSCTL_ADD_LONG(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "using", CTLFLAG_RD, + &rs->rs_rlt[i].using, + "Number of flows using"); + SYSCTL_ADD_LONG(&rs->sysctl_ctx, + SYSCTL_CHILDREN(rl_rate_num), + OID_AUTO, "enobufs", CTLFLAG_RD, + &rs->rs_rlt[i].rs_num_enobufs, + "Number of enobufs logged on this rate"); + } } #endif @@ -667,6 +678,8 @@ */ rs->rs_rlt[i].ptbl = rs; rs->rs_rlt[i].tag = NULL; + rs->rs_rlt[i].using = 0; + rs->rs_rlt[i].rs_num_enobufs = 0; /* * Calculate the time between. */ @@ -1063,16 +1076,28 @@ static void rl_increment_using(const struct tcp_hwrate_limit_table *rte) { + struct tcp_hwrate_limit_table *decon_rte; + + decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); + atomic_add_long(&decon_rte->using, 1); } static void rl_decrement_using(const struct tcp_hwrate_limit_table *rte) { + struct tcp_hwrate_limit_table *decon_rte; + + decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); + atomic_subtract_long(&decon_rte->using, 1); } void tcp_rl_log_enobuf(const struct tcp_hwrate_limit_table *rte) { + struct tcp_hwrate_limit_table *decon_rte; + + decon_rte = __DECONST(struct tcp_hwrate_limit_table *, rte); + atomic_add_long(&decon_rte->rs_num_enobufs, 1); } /* Index: sys/netinet/tcp_sack.c =================================================================== --- sys/netinet/tcp_sack.c +++ sys/netinet/tcp_sack.c @@ -156,6 +156,17 @@ &VNET_NAME(tcp_sack_globalholes), 0, "Global number of TCP SACK holes currently allocated"); +int +tcp_dsack_block_exists(struct tcpcb *tp) +{ + /* Return true if a DSACK block exists */ + if (tp->rcv_numsacks == 0) + return (0); + if (SEQ_LEQ(tp->sackblks[0].end, tp->rcv_nxt)) + return(1); + return (0); +} + /* * This function will find overlaps with the currently stored sackblocks * and add any overlap as a dsack block upfront Index: sys/netinet/tcp_stacks/bbr.c =================================================================== --- sys/netinet/tcp_stacks/bbr.c +++ sys/netinet/tcp_stacks/bbr.c @@ -3930,6 +3930,9 @@ struct tcp_bbr *bbr; INP_WLOCK_ASSERT(tp->t_inpcb); +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); +#endif bbr = (struct tcp_bbr *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: @@ -4403,6 +4406,7 @@ nrsm->r_start = start; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm-> r_rtt_not_allowed = rsm->r_rtt_not_allowed; nrsm->r_flags = rsm->r_flags; /* We don't transfer forward the SYN flag */ nrsm->r_flags &= ~BBR_HAS_SYN; @@ -6429,65 +6433,6 @@ bbr->r_ctl.bbr_smallest_srtt_this_state = rtt; } -static void -bbr_earlier_retran(struct tcpcb *tp, struct tcp_bbr *bbr, struct bbr_sendmap *rsm, - uint32_t t, uint32_t cts, int ack_type) -{ - /* - * For this RSM, we acknowledged the data from a previous - * transmission, not the last one we made. This means we did a false - * retransmit. 
- */ - if (rsm->r_flags & BBR_HAS_FIN) { - /* - * The sending of the FIN often is multiple sent when we - * have everything outstanding ack'd. We ignore this case - * since its over now. - */ - return; - } - if (rsm->r_flags & BBR_TLP) { - /* - * We expect TLP's to have this occur often - */ - bbr->rc_tlp_rtx_out = 0; - return; - } - if (ack_type != BBR_CUM_ACKED) { - /* - * If it was not a cum-ack we - * don't really know for sure since - * the timestamp could be from some - * other transmission. - */ - return; - } - - if (rsm->r_flags & BBR_WAS_SACKPASS) { - /* - * We retransmitted based on a sack and the earlier - * retransmission ack'd it - re-ordering is occuring. - */ - BBR_STAT_INC(bbr_reorder_seen); - bbr->r_ctl.rc_reorder_ts = cts; - } - /* Back down the loss count */ - if (rsm->r_flags & BBR_MARKED_LOST) { - bbr->r_ctl.rc_lost -= rsm->r_end - rsm->r_start; - bbr->r_ctl.rc_lost_bytes -= rsm->r_end - rsm->r_start; - rsm->r_flags &= ~BBR_MARKED_LOST; - if (SEQ_GT(bbr->r_ctl.rc_lt_lost, bbr->r_ctl.rc_lost)) - /* LT sampling also needs adjustment */ - bbr->r_ctl.rc_lt_lost = bbr->r_ctl.rc_lost; - } - /***** RRS HERE ************************/ - /* Do we need to do this??? */ - /* bbr_reset_lt_bw_sampling(bbr, cts); */ - /***** RRS HERE ************************/ - BBR_STAT_INC(bbr_badfr); - BBR_STAT_ADD(bbr_badfr_bytes, (rsm->r_end - rsm->r_start)); -} - static void bbr_set_reduced_rtt(struct tcp_bbr *bbr, uint32_t cts, uint32_t line) { @@ -6869,6 +6814,10 @@ /* Already done */ return (0); } + if (rsm->r_rtt_not_allowed) { + /* Not allowed */ + return (0); + } if (rsm->r_rtr_cnt == 1) { /* * Only one transmit. Hopefully the normal case. @@ -6926,7 +6875,7 @@ rsm->r_tim_lastsent[i], ack_type, to); if ((i + 1) < rsm->r_rtr_cnt) { /* Likely */ - bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); + return (0); } else if (rsm->r_flags & BBR_TLP) { bbr->rc_tlp_rtx_out = 0; } @@ -6974,7 +6923,7 @@ t = 1; bbr_update_bbr_info(bbr, rsm, t, cts, to->to_tsecr, uts, BBR_RTT_BY_EARLIER_RET, rsm->r_tim_lastsent[i], ack_type, to); - bbr_earlier_retran(tp, bbr, rsm, t, cts, ack_type); + return (0); } else { /* * Too many prior transmissions, just @@ -10207,7 +10156,7 @@ tp->t_fb_ptr = NULL; return (ENOMEM); } - rsm->r_flags = BBR_OVERMAX; + rsm->r_rtt_not_allowed = 1; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; @@ -10320,6 +10269,10 @@ counter_u64_add(bbr_flows_whdwr_pacing, -1); else counter_u64_add(bbr_flows_nohdwr_pacing, -1); + if (bbr->r_ctl.crte != NULL) { + tcp_rel_pacing_rate(bbr->r_ctl.crte, tp); + bbr->r_ctl.crte = NULL; + } rsm = TAILQ_FIRST(&bbr->r_ctl.rc_map); while (rsm) { TAILQ_REMOVE(&bbr->r_ctl.rc_map, rsm, r_next); @@ -13463,15 +13416,6 @@ th->th_seq = htonl(tp->snd_max); bbr_seq = tp->snd_max; } - } else if (flags & TH_RST) { - /* - * For a Reset send the last cum ack in sequence - * (this like any other choice may still generate a - * challenge ack, if a ack-update packet is in - * flight). 
- */ - th->th_seq = htonl(tp->snd_una); - bbr_seq = tp->snd_una; } else { /* * len == 0 and not persist we use snd_max, sending @@ -14536,9 +14480,9 @@ } else { bbr->bbr_hdw_pace_ena = 0; #ifdef RATELIMIT - if (bbr->bbr_hdrw_pacing) { - bbr->bbr_hdrw_pacing = 0; - in_pcbdetach_txrtlmt(bbr->rc_inp); + if (bbr->r_ctl.crte != NULL) { + tcp_rel_pacing_rate(bbr->r_ctl.crte, tp); + bbr->r_ctl.crte = NULL; } #endif } Index: sys/netinet/tcp_stacks/rack.c =================================================================== --- sys/netinet/tcp_stacks/rack.c +++ sys/netinet/tcp_stacks/rack.c @@ -63,7 +63,10 @@ #include #include #include - +#ifdef TCP_ACCOUNTING +#include +#include +#endif #include #include @@ -91,8 +94,10 @@ #include #include #include +#include #include #include +#include #include #include #ifdef NETFLIX_SHARED_CWND @@ -133,6 +138,15 @@ #define TICKS2SBT(__t) (tick_sbt * ((sbintime_t)(__t))) #endif +VNET_DECLARE(uint32_t, newreno_beta); +VNET_DECLARE(uint32_t, newreno_beta_ecn); +#define V_newreno_beta VNET(newreno_beta) +#define V_newreno_beta_ecn VNET(newreno_beta_ecn) + + +MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block"); +MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options"); + struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; @@ -175,30 +189,51 @@ static int32_t rack_tlp_limit = 2; /* No more than 2 TLPs w-out new data */ static int32_t rack_tlp_use_greater = 1; static int32_t rack_reorder_thresh = 2; -static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 +static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 * - 60 seconds */ +static uint8_t rack_req_measurements = 1; /* Attack threshold detections */ static uint32_t rack_highest_sack_thresh_seen = 0; static uint32_t rack_highest_move_thresh_seen = 0; - -static int32_t rack_pkt_delay = 1; -static int32_t rack_early_recovery = 1; +static int32_t rack_enable_hw_pacing = 0; /* Due to CCSP keep it off by default */ +static int32_t rack_hw_pace_extra_slots = 2; /* 2 extra MSS time betweens */ +static int32_t rack_hw_rate_caps = 1; /* 1; */ +static int32_t rack_hw_rate_min = 0; /* 1500000;*/ +static int32_t rack_hw_rate_to_low = 0; /* 1200000; */ +static int32_t rack_hw_up_only = 1; +static int32_t rack_stats_gets_ms_rtt = 1; +static int32_t rack_prr_addbackmax = 2; + +static int32_t rack_pkt_delay = 1000; static int32_t rack_send_a_lot_in_prr = 1; -static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ +static int32_t rack_min_to = 1000; /* Number of microsecond min timeout */ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; -static int32_t rack_enable_shared_cwnd = 0; +static int32_t rack_enable_shared_cwnd = 1; +static int32_t rack_use_cmp_acks = 1; +static int32_t rack_use_fsb = 1; +static int32_t rack_use_rfo = 1; +static int32_t rack_use_rsm_rfo = 1; +static int32_t rack_max_abc_post_recovery = 2; +static int32_t rack_client_low_buf = 0; +#ifdef TCP_ACCOUNTING +static int32_t rack_tcp_accounting = 0; +#endif static int32_t rack_limits_scwnd = 1; static int32_t rack_enable_mqueue_for_nonpaced = 0; static int32_t rack_disable_prr = 0; static int32_t use_rack_rr = 1; static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? 
*/ -static int32_t rack_persist_min = 250; /* 250ms */ -static int32_t rack_persist_max = 2000; /* 2 Second */ -static int32_t rack_sack_not_required = 0; /* set to one to allow non-sack to use rack */ -static int32_t rack_default_init_window = 0; /* Use system default */ +static int32_t rack_persist_min = 250000; /* 250usec */ +static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ +static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ +static int32_t rack_default_init_window = 0; /* Use system default */ static int32_t rack_limit_time_with_srtt = 0; -static int32_t rack_hw_pace_adjust = 0; +static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ +static int32_t rack_enobuf_hw_boost_mult = 2; /* How many times the hw rate we boost slot using time_between */ +static int32_t rack_enobuf_hw_max = 12000; /* 12 ms in usecs */ +static int32_t rack_enobuf_hw_min = 10000; /* 10 ms in usecs */ +static int32_t rack_hw_rwnd_factor = 2; /* How many max_segs the rwnd must be before we hold off sending */ /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -209,23 +244,21 @@ static uint32_t rack_goal_bdp = 2; static uint32_t rack_min_srtts = 1; static uint32_t rack_min_measure_usec = 0; -static int32_t rack_tlp_min = 10; -static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ -static int32_t rack_rto_max = 4000; /* 4 seconds */ +static int32_t rack_tlp_min = 10000; /* 10ms */ +static int32_t rack_rto_min = 30000; /* 30,000 usec same as main freebsd */ +static int32_t rack_rto_max = 4000000; /* 4 seconds in usec's */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; static int32_t rack_pace_every_seg = 0; -static int32_t rack_delayed_ack_time = 200; /* 200ms */ +static int32_t rack_delayed_ack_time = 40000; /* 40ms in usecs */ static int32_t rack_slot_reduction = 4; static int32_t rack_wma_divisor = 8; /* For WMA calculation */ static int32_t rack_cwnd_block_ends_measure = 0; static int32_t rack_rwnd_block_ends_measure = 0; +static int32_t rack_def_profile = 0; static int32_t rack_lower_cwnd_at_tlp = 0; -static int32_t rack_use_proportional_reduce = 0; -static int32_t rack_proportional_rate = 10; -static int32_t rack_tlp_max_resend = 2; static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; @@ -247,13 +280,13 @@ static uint32_t rack_probertt_use_min_rtt_exit = 0; static uint32_t rack_probe_rtt_sets_cwnd = 0; static uint32_t rack_probe_rtt_safety_val = 2000000; /* No more than 2 sec in probe-rtt */ -static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in us */ +static uint32_t rack_time_between_probertt = 9600000; /* 9.6 sec in usecs */ static uint32_t rack_probertt_gpsrtt_cnt_mul = 0; /* How many srtt periods does probe-rtt last top fraction */ -static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ -static uint32_t rack_min_probertt_hold = 200000; /* Equal to delayed ack time */ +static uint32_t rack_probertt_gpsrtt_cnt_div = 0; /* How many srtt periods does probe-rtt last bottom fraction */ +static uint32_t rack_min_probertt_hold = 40000; /* Equal to delayed ack time */ static uint32_t rack_probertt_filter_life = 10000000; static uint32_t rack_probertt_lower_within = 10; -static uint32_t rack_min_rtt_movement = 250; /* Must move at least 250 useconds 
to count as a lowering */ +static uint32_t rack_min_rtt_movement = 250000; /* Must move at least 250ms (in microseconds) to count as a lowering */ static int32_t rack_pace_one_seg = 0; /* Shall we pace for less than 1.4Meg 1MSS at a time */ static int32_t rack_probertt_clear_is = 1; static int32_t rack_max_drain_hbp = 1; /* Extra drain times gpsrtt for highly buffered paths */ @@ -264,7 +297,7 @@ /* Timely information */ /* Combine these two gives the range of 'no change' to bw */ -/* ie the up/down provide the upper and lower bound */ +/* ie the up/down provide the upper and lower bound */ static int32_t rack_gp_per_bw_mul_up = 2; /* 2% */ static int32_t rack_gp_per_bw_mul_down = 4; /* 4% */ static int32_t rack_gp_rtt_maxmul = 3; /* 3 x maxmin */ @@ -286,6 +319,7 @@ static int32_t rack_timely_no_stopping = 0; static int32_t rack_down_raise_thresh = 100; static int32_t rack_req_segs = 1; +static uint64_t rack_bw_rate_cap = 0; /* Weird delayed ack mode */ static int32_t rack_use_imac_dack = 0; @@ -301,9 +335,14 @@ counter_u64_t rack_calc_zero; counter_u64_t rack_calc_nonzero; counter_u64_t rack_saw_enobuf; +counter_u64_t rack_saw_enobuf_hw; counter_u64_t rack_saw_enetunreach; counter_u64_t rack_per_timer_hole; - +counter_u64_t rack_large_ackcmp; +counter_u64_t rack_small_ackcmp; +#ifdef INVARIANTS +counter_u64_t rack_adjust_map_bw; +#endif /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; counter_u64_t rack_tlp_newdata; @@ -313,6 +352,7 @@ counter_u64_t rack_to_tot; counter_u64_t rack_to_arm_rack; counter_u64_t rack_to_arm_tlp; +counter_u64_t rack_hot_alloc; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; @@ -320,6 +360,17 @@ counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; +#define MAX_NUM_OF_CNTS 13 +counter_u64_t rack_proc_comp_ack[MAX_NUM_OF_CNTS]; +counter_u64_t rack_multi_single_eq; +counter_u64_t rack_proc_non_comp_ack; + +counter_u64_t rack_fto_send; +counter_u64_t rack_fto_rsm_send; +counter_u64_t rack_nfto_resend; +counter_u64_t rack_non_fto_send; +counter_u64_t rack_extended_rfo; + counter_u64_t rack_sack_proc_all; counter_u64_t rack_sack_proc_short; counter_u64_t rack_sack_proc_restart; @@ -342,6 +393,10 @@ counter_u64_t rack_collapsed_win; counter_u64_t rack_tlp_does_nada; counter_u64_t rack_try_scwnd; +counter_u64_t rack_hw_pace_init_fail; +counter_u64_t rack_hw_pace_lost; +counter_u64_t rack_sbsndptr_right; +counter_u64_t rack_sbsndptr_wrong; /* Temp CPU counters */ counter_u64_t rack_find_high; @@ -350,6 +405,17 @@ counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; + +#define RACK_REXMTVAL(tp) max(rack_rto_min, ((tp)->t_srtt + ((tp)->t_rttvar << 2))) + +#define RACK_TCPT_RANGESET(tv, value, tvmin, tvmax) do { \ + (tv) = (value) + TICKS_2_USEC(tcp_rexmit_slop); \ + if ((u_long)(tv) < (u_long)(tvmin)) \ + (tv) = (tvmin); \ + if ((u_long)(tv) > (u_long)(tvmax)) \ + (tv) = (tvmax); \ +} while (0) + static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); @@ -363,7 +429,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, - struct tcphdr *th, uint16_t nsegs, uint16_t type, int32_t recovery); + uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery); static struct rack_sendmap *rack_alloc(struct tcp_rack *rack); static struct rack_sendmap *rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type); @@ -371,24 +437,21 
@@ rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); static void -rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, - uint32_t type); +rack_cong_signal(struct tcpcb *tp, + uint32_t type, uint32_t ack); static void rack_counter_destroy(void); static int rack_ctloutput(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp); static int32_t rack_ctor(void *mem, int32_t size, void *arg, int32_t how); static void -rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line); +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override); static void rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos); static void rack_dtor(void *mem, int32_t size, void *arg); static void -rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, - uint32_t t, uint32_t cts); -static void rack_log_alt_to_to_cancel(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, @@ -416,11 +479,12 @@ static void rack_init_sysctls(void); static void rack_log_ack(struct tcpcb *tp, struct tcpopt *to, - struct tcphdr *th); + struct tcphdr *th, int entered_rec, int dup_ack_struck); static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, - uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, - uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts); + uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t ts, + struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff); + static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); @@ -431,7 +495,7 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, int *moved_two); -static void rack_post_recovery(struct tcpcb *tp, struct tcphdr *th); +static void rack_post_recovery(struct tcpcb *tp, uint32_t th_seq); static void rack_remxt_tmr(struct tcpcb *tp); static int rack_set_sockopt(struct socket *so, struct sockopt *sopt, @@ -446,10 +510,10 @@ static void rack_timer_stop(struct tcpcb *tp, uint32_t timer_type); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t ts, int32_t * lenp); + struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t ts); + struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); @@ -496,15 +560,182 @@ static void tcp_rack_xmit_timer(struct tcp_rack *rack, int32_t rtt, uint32_t len, uint32_t us_tim, int confidence, struct rack_sendmap *rsm, uint16_t rtrcnt); static void - tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); + tcp_rack_partialack(struct tcpcb *tp); +static int +rack_set_profile(struct tcp_rack *rack, int prof); +static void +rack_apply_deferred_options(struct tcp_rack *rack); int32_t rack_clear_counter=0; +static void +rack_set_cc_pacing(struct tcp_rack *rack) +{ + struct sockopt sopt; + struct cc_newreno_opts opt; + struct newreno old, *ptr; + struct tcpcb *tp; + int error; + + if (rack->rc_pacing_cc_set) + return; + + tp = rack->rc_tp; + if (tp->cc_algo == NULL) { + /* Tcb is leaving */ + 
printf("No cc algorithm?\n"); + return; + } + rack->rc_pacing_cc_set = 1; + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { + /* Not new-reno we can't play games with beta! */ + printf("cc_algo:%s is not NEWRENO:%s\n", + tp->cc_algo->name, CCALGONAME_NEWRENO); + goto out; + } + ptr = ((struct newreno *)tp->ccv->cc_data); + if (CC_ALGO(tp)->ctl_output == NULL) { + /* Huh, why does new_reno no longer have a set function? */ + printf("no ctl_output for algo:%s\n", tp->cc_algo->name); + goto out; + } + if (ptr == NULL) { + /* Just the default values */ + old.beta = V_newreno_beta_ecn; + old.beta_ecn = V_newreno_beta_ecn; + old.newreno_flags = 0; + } else { + old.beta = ptr->beta; + old.beta_ecn = ptr->beta_ecn; + old.newreno_flags = ptr->newreno_flags; + } + sopt.sopt_valsize = sizeof(struct cc_newreno_opts); + sopt.sopt_dir = SOPT_SET; + opt.name = CC_NEWRENO_BETA; + opt.val = rack->r_ctl.rc_saved_beta.beta; + error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); + if (error) { + printf("Error returned by ctl_output %d\n", error); + goto out; + } + /* + * Hack alert we need to set in our newreno_flags + * so that Abe behavior is also applied. + */ + ((struct newreno *)tp->ccv->cc_data)->newreno_flags = CC_NEWRENO_BETA_ECN; + opt.name = CC_NEWRENO_BETA_ECN; + opt.val = rack->r_ctl.rc_saved_beta.beta_ecn; + error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); + if (error) { + printf("Error returned by ctl_output %d\n", error); + goto out; + } + /* Save off the original values for restoral */ + memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); +out: + if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + ptr = ((struct newreno *)tp->ccv->cc_data); + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + if (ptr) { + log.u_bbr.flex1 = ptr->beta; + log.u_bbr.flex2 = ptr->beta_ecn; + log.u_bbr.flex3 = ptr->newreno_flags; + } + log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; + log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; + log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags; + log.u_bbr.flex7 = rack->gp_ready; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->use_fixed_rate; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->rc_pacing_cc_set; + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex8 = 3; + tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, error, + 0, &log, false, NULL, NULL, 0, &tv); + } +} + +static void +rack_undo_cc_pacing(struct tcp_rack *rack) +{ + struct newreno old, *ptr; + struct tcpcb *tp; + + if (rack->rc_pacing_cc_set == 0) + return; + tp = rack->rc_tp; + rack->rc_pacing_cc_set = 0; + if (tp->cc_algo == NULL) + /* Tcb is leaving */ + return; + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { + /* Not new-reno nothing to do! */ + return; + } + ptr = ((struct newreno *)tp->ccv->cc_data); + if (ptr == NULL) { + /* + * This happens at rack_fini() if the + * cc module gets freed on us. In that + * case we loose our "new" settings but + * thats ok, since the tcb is going away anyway. 
+ */ + return; + } + /* Grab out our set values */ + memcpy(&old, ptr, sizeof(struct newreno)); + /* Copy back in the original values */ + memcpy(ptr, &rack->r_ctl.rc_saved_beta, sizeof(struct newreno)); + /* Now save back the values we had set in (for when pacing is restored) */ + memcpy(&rack->r_ctl.rc_saved_beta, &old, sizeof(struct newreno)); + if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + ptr = ((struct newreno *)tp->ccv->cc_data); + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = ptr->beta; + log.u_bbr.flex2 = ptr->beta_ecn; + log.u_bbr.flex3 = ptr->newreno_flags; + log.u_bbr.flex4 = rack->r_ctl.rc_saved_beta.beta; + log.u_bbr.flex5 = rack->r_ctl.rc_saved_beta.beta_ecn; + log.u_bbr.flex6 = rack->r_ctl.rc_saved_beta.newreno_flags; + log.u_bbr.flex7 = rack->gp_ready; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->use_fixed_rate; + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->rc_pacing_cc_set; + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex8 = 4; + tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } +} + +#ifdef NETFLIX_PEAKRATE +static inline void +rack_update_peakrate_thr(struct tcpcb *tp) +{ + /* Keep in mind that t_maxpeakrate is in B/s. */ + uint64_t peak; + peak = uqmax((tp->t_maxseg * 2), + (((uint64_t)tp->t_maxpeakrate * (uint64_t)(tp->t_srtt)) / (uint64_t)HPTS_USEC_IN_SEC)); + tp->t_peakrate_thr = (uint32_t)uqmin(peak, UINT32_MAX); +} +#endif + static int sysctl_rack_clear(SYSCTL_HANDLER_ARGS) { uint32_t stat; int32_t error; + int i; error = SYSCTL_OUT(req, &rack_clear_counter, sizeof(uint32_t)); if (error || req->newptr == NULL) @@ -536,17 +767,37 @@ counter_u64_zero(rack_calc_nonzero); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); + counter_u64_zero(rack_saw_enobuf_hw); counter_u64_zero(rack_saw_enetunreach); counter_u64_zero(rack_per_timer_hole); + counter_u64_zero(rack_large_ackcmp); + counter_u64_zero(rack_small_ackcmp); +#ifdef INVARIANTS + counter_u64_zero(rack_adjust_map_bw); +#endif counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); + counter_u64_zero(rack_fto_send); + counter_u64_zero(rack_fto_rsm_send); + counter_u64_zero(rack_extended_rfo); + counter_u64_zero(rack_hw_pace_init_fail); + counter_u64_zero(rack_hw_pace_lost); + counter_u64_zero(rack_sbsndptr_wrong); + counter_u64_zero(rack_sbsndptr_right); + counter_u64_zero(rack_non_fto_send); + counter_u64_zero(rack_nfto_resend); counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); + for (i = 0; i < MAX_NUM_OF_CNTS; i++) { + counter_u64_zero(rack_proc_comp_ack[i]); + } + counter_u64_zero(rack_multi_single_eq); + counter_u64_zero(rack_proc_non_comp_ack); counter_u64_zero(rack_find_high); counter_u64_zero(rack_sack_attacks_detected); counter_u64_zero(rack_sack_attacks_reversed); @@ -574,6 +825,7 @@ static void rack_init_sysctls(void) { + int i; struct sysctl_oid *rack_counters; struct sysctl_oid *rack_attack; struct sysctl_oid *rack_pacing; @@ -583,6 +835,7 @@ struct sysctl_oid *rack_misc; struct sysctl_oid *rack_measure; struct sysctl_oid *rack_probertt; + struct sysctl_oid *rack_hw_pacing; rack_attack = 
SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -740,11 +993,6 @@ OID_AUTO, "init_win", CTLFLAG_RW, &rack_default_init_window, 0, "Do we have a rack initial window 0 = system default"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "hw_pacing_adjust", CTLFLAG_RW, - &rack_hw_pace_adjust, 0, - "What percentage do we raise the MSS by (11 = 1.1%)"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "gp_per_ss", CTLFLAG_RW, @@ -775,7 +1023,73 @@ OID_AUTO, "use_pacing", CTLFLAG_RW, &rack_pace_every_seg, 0, "If set we use pacing, if clear we use only the original burst mitigation"); - + SYSCTL_ADD_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "rate_cap", CTLFLAG_RW, + &rack_bw_rate_cap, 0, + "If set we apply this value to the absolute rate cap used by pacing"); + SYSCTL_ADD_U8(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "req_measure_cnt", CTLFLAG_RW, + &rack_req_measurements, 1, + "If doing dynamic pacing, how many measurements must be in before we start pacing?"); + /* Hardware pacing */ + rack_hw_pacing = SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "hdwr_pacing", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "Pacing related Controls"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "rwnd_factor", CTLFLAG_RW, + &rack_hw_rwnd_factor, 2, + "How many times does snd_wnd need to be bigger than pace_max_seg so we will hold off and get more acks?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "pace_enobuf_mult", CTLFLAG_RW, + &rack_enobuf_hw_boost_mult, 2, + "By how many time_betweens should we boost the pacing time if we see a ENOBUFS?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "pace_enobuf_max", CTLFLAG_RW, + &rack_enobuf_hw_max, 2, + "What is the max boost the pacing time if we see a ENOBUFS?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "pace_enobuf_min", CTLFLAG_RW, + &rack_enobuf_hw_min, 2, + "What is the min boost the pacing time if we see a ENOBUFS?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "enable", CTLFLAG_RW, + &rack_enable_hw_pacing, 0, + "Should RACK attempt to use hw pacing?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "rate_cap", CTLFLAG_RW, + &rack_hw_rate_caps, 1, + "Does the highest hardware pacing rate cap the rate we will send at??"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "rate_min", CTLFLAG_RW, + &rack_hw_rate_min, 0, + "Do we need a minimum estimate of this many bytes per second in order to engage hw pacing?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "rate_to_low", CTLFLAG_RW, + &rack_hw_rate_to_low, 0, + "If we fall below this rate, dis-engage hw pacing?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "up_only", CTLFLAG_RW, + &rack_hw_up_only, 1, + "Do we allow hw pacing to lower the rate selected?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_hw_pacing), + OID_AUTO, "extra_mss_precise", CTLFLAG_RW, + &rack_hw_pace_extra_slots, 2, + "If the rates between software and hardware match precisely how many extra time_betweens do we get?"); rack_timely = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, @@ -901,6 +1215,11 @@ OID_AUTO, 
"use_rrr", CTLFLAG_RW, &use_rack_rr, 1, "Do we use Rack Rapid Recovery"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_tlp), + OID_AUTO, "post_rec_labc", CTLFLAG_RW, + &rack_max_abc_post_recovery, 2, + "Since we do early recovery, do we override the l_abc to a value, if so what?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "nonrxt_use_cr", CTLFLAG_RW, @@ -924,8 +1243,8 @@ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlpminto", CTLFLAG_RW, - &rack_tlp_min, 10, - "TLP minimum timeout per the specification (10ms)"); + &rack_tlp_min, 10000, + "TLP minimum timeout per the specification (in microseconds)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "send_oldest", CTLFLAG_RW, @@ -936,11 +1255,6 @@ OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_tlp), - OID_AUTO, "tlp_retry", CTLFLAG_RW, - &rack_tlp_max_resend, 2, - "How many times does TLP retry a single segment or multiple with no ACK"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, @@ -959,13 +1273,13 @@ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "reorder_fade", CTLFLAG_RW, - &rack_reorder_fade, 0, - "Does reorder detection fade, if so how many ms (0 means never)"); + &rack_reorder_fade, 60000000, + "Does reorder detection fade, if so how many microseconds (0 means never)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "pktdelay", CTLFLAG_RW, - &rack_pkt_delay, 1, - "Extra RACK time (in ms) besides reordering thresh"); + &rack_pkt_delay, 1000, + "Extra RACK time (in microseconds) besides reordering thresh"); /* Timer related controls */ rack_timers = SYSCTL_ADD_NODE(&rack_sysctl_ctx, @@ -977,33 +1291,33 @@ SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, - &rack_persist_min, 250, - "What is the minimum time in milliseconds between persists"); + &rack_persist_min, 250000, + "What is the minimum time in microseconds between persists"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmax", CTLFLAG_RW, - &rack_persist_max, 2000, - "What is the largest delay in milliseconds between persists"); + &rack_persist_max, 2000000, + "What is the largest delay in microseconds between persists"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "delayed_ack", CTLFLAG_RW, - &rack_delayed_ack_time, 200, - "Delayed ack time (200ms)"); + &rack_delayed_ack_time, 40000, + "Delayed ack time (40ms in microseconds)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "minrto", CTLFLAG_RW, - &rack_rto_min, 0, - "Minimum RTO in ms -- set with caution below 1000 due to TLP"); + &rack_rto_min, 30000, + "Minimum RTO in microseconds -- set with caution below 1000 due to TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "maxrto", CTLFLAG_RW, - &rack_rto_max, 0, - "Maxiumum RTO in ms -- should be at least as large as min_rto"); + &rack_rto_max, 4000000, + "Maxiumum RTO in microseconds -- should be at least as large as min_rto"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "minto", CTLFLAG_RW, - &rack_min_to, 1, - "Minimum rack timeout in milliseconds"); + &rack_min_to, 1000, + "Minimum rack timeout in microseconds"); /* Measure controls */ rack_measure = 
SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1053,10 +1367,57 @@ "misc", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Misc related controls"); +#ifdef TCP_ACCOUNTING + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "tcp_acct", CTLFLAG_RW, + &rack_tcp_accounting, 0, + "Should we turn on TCP accounting for all rack sessions?"); +#endif + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "prr_addback_max", CTLFLAG_RW, + &rack_prr_addbackmax, 2, + "What is the maximum number of MSS we allow to be added back if prr can't send all its data?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "stats_gets_ms", CTLFLAG_RW, + &rack_stats_gets_ms_rtt, 1, + "What do we feed the stats framework (1 = ms_rtt, 0 = us_rtt, 2 = ms_rtt from hdwr, > 2 usec rtt from hdwr)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "clientlowbuf", CTLFLAG_RW, + &rack_client_low_buf, 0, + "Client low buffer level (below this we are more aggressive in DGP exiting recovery (0 = off)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "defprofile", CTLFLAG_RW, + &rack_def_profile, 0, + "Should RACK use a default profile (0=no, num == profile num)?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "cmpack", CTLFLAG_RW, + &rack_use_cmp_acks, 1, + "Should RACK have LRO send compressed acks"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "fsb", CTLFLAG_RW, + &rack_use_fsb, 1, + "Should RACK use the fast send block?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rfo", CTLFLAG_RW, + &rack_use_rfo, 1, + "Should RACK use rack_fast_output()?"); + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_misc), + OID_AUTO, "rsmrfo", CTLFLAG_RW, + &rack_use_rsm_rfo, 1, + "Should RACK use rack_fast_rsm_output()?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "shared_cwnd", CTLFLAG_RW, - &rack_enable_shared_cwnd, 0, + &rack_enable_shared_cwnd, 1, "Should RACK try to use the shared cwnd on connections where allowed"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), @@ -1091,18 +1452,8 @@ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "no_sack_needed", CTLFLAG_RW, - &rack_sack_not_required, 0, + &rack_sack_not_required, 1, "Do we allow rack to run on connections not supporting SACK"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "recovery_loss_prop", CTLFLAG_RW, - &rack_use_proportional_reduce, 0, - "Should we proportionaly reduce cwnd based on the number of losses "); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "recovery_prop", CTLFLAG_RW, - &rack_proportional_rate, 10, - "What percent reduction per loss"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), OID_AUTO, "prr_sendalot", CTLFLAG_RW, @@ -1110,9 +1461,9 @@ "Send a lot in prr"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "earlyrecovery", CTLFLAG_RW, - &rack_early_recovery, 1, - "Do we do early recovery with rack"); + OID_AUTO, "autoscale", CTLFLAG_RW, + &rack_autosndbuf_inc, 20, + "What percentage should rack scale up its snd buffer by?"); /* Sack Attacker detection stuff */ SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), @@ -1179,6 +1530,46 @@ &rack_sack_used_prev_merge, "Total number of times we used the prev merge"); /* Counters */ + rack_fto_send = 
counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "fto_send", CTLFLAG_RD, + &rack_fto_send, "Total number of rack_fast_output sends"); + rack_fto_rsm_send = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "fto_rsm_send", CTLFLAG_RD, + &rack_fto_rsm_send, "Total number of rack_fast_rsm_output sends"); + rack_nfto_resend = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "nfto_resend", CTLFLAG_RD, + &rack_nfto_resend, "Total number of rack_output retransmissions"); + rack_non_fto_send = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "nfto_send", CTLFLAG_RD, + &rack_non_fto_send, "Total number of rack_output first sends"); + rack_extended_rfo = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "rfo_extended", CTLFLAG_RD, + &rack_extended_rfo, "Total number of times we extended rfo"); + + rack_hw_pace_init_fail = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "hwpace_init_fail", CTLFLAG_RD, + &rack_hw_pace_init_fail, "Total number of times we failed to initialize hw pacing"); + rack_hw_pace_lost = counter_u64_alloc(M_WAITOK); + + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "hwpace_lost", CTLFLAG_RD, + &rack_hw_pace_lost, "Total number of times we failed to initialize hw pacing"); + + + rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1296,13 +1687,25 @@ SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, - "Total number of times a segment did not cause hptsi"); + "Total number of times a sends returned enobuf for non-hdwr paced connections"); + rack_saw_enobuf_hw = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "saw_enobufs_hw", CTLFLAG_RD, + &rack_saw_enobuf_hw, + "Total number of times a send returned enobuf for hdwr paced connections"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, - "Total number of times a segment did not cause hptsi"); + "Total number of times a send received a enetunreachable"); + rack_hot_alloc = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "alloc_hot", CTLFLAG_RD, + &rack_hot_alloc, + "Total allocations from the top of our list"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1339,6 +1742,51 @@ OID_AUTO, "split_limited", CTLFLAG_RD, &rack_split_limited, "Split allocations dropped due to limit"); + + for (i = 0; i < MAX_NUM_OF_CNTS; i++) { + char name[32]; + sprintf(name, "cmp_ack_cnt_%d", i); + rack_proc_comp_ack[i] = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, name, CTLFLAG_RD, + &rack_proc_comp_ack[i], + "Number of compressed acks we processed"); + } + rack_large_ackcmp = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + 
OID_AUTO, "cmp_large_mbufs", CTLFLAG_RD, + &rack_large_ackcmp, + "Number of TCP connections with large mbuf's for compressed acks"); + rack_small_ackcmp = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "cmp_small_mbufs", CTLFLAG_RD, + &rack_small_ackcmp, + "Number of TCP connections with small mbuf's for compressed acks"); +#ifdef INVARIANTS + rack_adjust_map_bw = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "map_adjust_req", CTLFLAG_RD, + &rack_adjust_map_bw, + "Number of times we hit the case where the sb went up and down on a sendmap entry"); +#endif + rack_multi_single_eq = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "cmp_ack_equiv", CTLFLAG_RD, + &rack_multi_single_eq, + "Number of compressed acks total represented"); + rack_proc_non_comp_ack = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "cmp_ack_not", CTLFLAG_RD, + &rack_proc_non_comp_ack, + "Number of non compresseds acks that we processed"); + + rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1424,6 +1872,18 @@ OID_AUTO, "timer_hole", CTLFLAG_RD, &rack_per_timer_hole, "Total persists start in timer hole"); + + rack_sbsndptr_wrong = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "sndptr_wrong", CTLFLAG_RD, + &rack_sbsndptr_wrong, "Total number of times the saved sbsndptr was incorret"); + rack_sbsndptr_right = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "sndptr_right", CTLFLAG_RD, + &rack_sbsndptr_right, "Total number of times the saved sbsndptr was corret"); + COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, @@ -1491,16 +1951,16 @@ * Nothing set by the user, use the system stack * default. 
*/ - return(tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); + return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); } win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; - return(win); + return (win); } static uint64_t rack_get_fixed_pacing_bw(struct tcp_rack *rack) { - if (IN_RECOVERY(rack->rc_tp->t_flags)) + if (IN_FASTRECOVERY(rack->rc_tp->t_flags)) return (rack->r_ctl.rc_fixed_pacing_rate_rec); else if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) return (rack->r_ctl.rc_fixed_pacing_rate_ss); @@ -1546,19 +2006,21 @@ } /* Ok lets get the initial TCP win (not racks) */ bw = tcp_compute_initwnd(tcp_maxseg(rack->rc_tp)); - srtt = ((uint64_t)TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); + srtt = (uint64_t)rack->rc_tp->t_srtt; bw *= (uint64_t)USECS_IN_SECOND; bw /= srtt; + if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) + bw = rack->r_ctl.bw_rate_cap; return (bw); } else { uint64_t bw; - if(rack->r_ctl.num_avg >= RACK_REQ_AVG) { + if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { /* Averaging is done, we can return the value */ bw = rack->r_ctl.gp_bw; } else { /* Still doing initial average must calculate */ - bw = rack->r_ctl.gp_bw / rack->r_ctl.num_avg; + bw = rack->r_ctl.gp_bw / rack->r_ctl.num_measurements; } #ifdef NETFLIX_PEAKRATE if ((rack->rc_tp->t_maxpeakrate) && @@ -1569,6 +2031,8 @@ return (rack->rc_tp->t_maxpeakrate); } #endif + if (rack->r_ctl.bw_rate_cap && (bw > rack->r_ctl.bw_rate_cap)) + bw = rack->r_ctl.bw_rate_cap; return (bw); } } @@ -1579,26 +2043,26 @@ if (rack->use_fixed_rate) { return (100); } else if (rack->in_probe_rtt && (rsm == NULL)) - return(rack->r_ctl.rack_per_of_gp_probertt); - else if ((IN_RECOVERY(rack->rc_tp->t_flags) && + return (rack->r_ctl.rack_per_of_gp_probertt); + else if ((IN_FASTRECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.rack_per_of_gp_rec)) { if (rsm) { /* a retransmission always use the recovery rate */ - return(rack->r_ctl.rack_per_of_gp_rec); + return (rack->r_ctl.rack_per_of_gp_rec); } else if (rack->rack_rec_nonrxt_use_cr) { /* Directed to use the configured rate */ goto configured_rate; } else if (rack->rack_no_prr && (rack->r_ctl.rack_per_of_gp_rec > 100)) { /* No PRR, lets just use the b/w estimate only */ - return(100); + return (100); } else { /* * Here we may have a non-retransmit but we * have no overrides, so just use the recovery * rate (prr is in effect). 
*/ - return(rack->r_ctl.rack_per_of_gp_rec); + return (rack->r_ctl.rack_per_of_gp_rec); } } configured_rate: @@ -1606,16 +2070,64 @@ if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) return (rack->r_ctl.rack_per_of_gp_ss); else - return(rack->r_ctl.rack_per_of_gp_ca); + return (rack->r_ctl.rack_per_of_gp_ca); +} + +static void +rack_log_hdwr_pacing(struct tcp_rack *rack, + uint64_t rate, uint64_t hw_rate, int line, + int error, uint16_t mod) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + const struct ifnet *ifp; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); + log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); + if (rack->r_ctl.crte) { + ifp = rack->r_ctl.crte->ptbl->rs_ifp; + } else if (rack->rc_inp->inp_route.ro_nh && + rack->rc_inp->inp_route.ro_nh->nh_ifp) { + ifp = rack->rc_inp->inp_route.ro_nh->nh_ifp; + } else + ifp = NULL; + if (ifp) { + log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); + log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); + } + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.bw_inuse = rate; + log.u_bbr.flex5 = line; + log.u_bbr.flex6 = error; + log.u_bbr.flex7 = mod; + log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex8 = rack->use_fixed_rate; + log.u_bbr.flex8 <<= 1; + log.u_bbr.flex8 |= rack->rack_hdrw_pacing; + log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + log.u_bbr.delRate = rack->r_ctl.crte_prev_rate; + if (rack->r_ctl.crte) + log.u_bbr.cur_del_rate = rack->r_ctl.crte->rate; + else + log.u_bbr.cur_del_rate = 0; + log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HDWR_PACE, 0, + 0, &log, false, &tv); + } } static uint64_t -rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm) +rack_get_output_bw(struct tcp_rack *rack, uint64_t bw, struct rack_sendmap *rsm, int *capped) { /* * We allow rack_per_of_gp_xx to dictate our bw rate we want. */ - uint64_t bw_est; + uint64_t bw_est, high_rate; uint64_t gain; gain = (uint64_t)rack_get_output_gain(rack, rsm); @@ -1624,6 +2136,43 @@ /* Never fall below the minimum (def 64kbps) */ if (bw_est < RACK_MIN_BW) bw_est = RACK_MIN_BW; + if (rack->r_rack_hw_rate_caps) { + /* Rate caps are in place */ + if (rack->r_ctl.crte != NULL) { + /* We have a hdwr rate already */ + high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); + if (bw_est >= high_rate) { + /* We are capping bw at the highest rate table entry */ + rack_log_hdwr_pacing(rack, + bw_est, high_rate, __LINE__, + 0, 3); + bw_est = high_rate; + if (capped) + *capped = 1; + } + } else if ((rack->rack_hdrw_pacing == 0) && + (rack->rack_hdw_pace_ena) && + (rack->rack_attempt_hdwr_pace == 0) && + (rack->rc_inp->inp_route.ro_nh != NULL) && + (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { + /* + * Special case, we have not yet attempted hardware + * pacing, and yet we may, when we do, find out if we are + * above the highest rate. We need to know the maxbw for the interface + * in question (if it supports ratelimiting). We get back + * a 0, if the interface is not found in the RL lists. + */ + high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); + if (high_rate) { + /* Yep, we have a rate is it above this rate? 
*/ + if (bw_est > high_rate) { + bw_est = high_rate; + if (capped) + *capped = 1; + } + } + } + } return (bw_est); } @@ -1658,6 +2207,9 @@ log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1674,8 +2226,8 @@ struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); - log.u_bbr.flex2 = to * 1000; + log.u_bbr.flex1 = rack->rc_tp->t_srtt; + log.u_bbr.flex2 = to; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; @@ -1690,6 +2242,11 @@ log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; + log.u_bbr.lost = rack_rto_min; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1721,6 +2278,9 @@ log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1729,6 +2289,56 @@ } } +static void +rack_log_map_chg(struct tcpcb *tp, struct tcp_rack *rack, + struct rack_sendmap *prev, + struct rack_sendmap *rsm, + struct rack_sendmap *next, + int flag, uint32_t th_ack, int line) +{ + if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex8 = flag; + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.cur_del_rate = (uint64_t)prev; + log.u_bbr.delRate = (uint64_t)rsm; + log.u_bbr.rttProp = (uint64_t)next; + log.u_bbr.flex7 = 0; + if (prev) { + log.u_bbr.flex1 = prev->r_start; + log.u_bbr.flex2 = prev->r_end; + log.u_bbr.flex7 |= 0x4; + } + if (rsm) { + log.u_bbr.flex3 = rsm->r_start; + log.u_bbr.flex4 = rsm->r_end; + log.u_bbr.flex7 |= 0x2; + } + if (next) { + log.u_bbr.flex5 = next->r_start; + log.u_bbr.flex6 = next->r_end; + log.u_bbr.flex7 |= 0x1; + } + log.u_bbr.applimited = line; + log.u_bbr.pkts_out = th_ack; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + if (rack->rack_no_prr) + log.u_bbr.lost = 0; + else + log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_LOG_MAPCHG, 0, + 0, &log, false, &tv); + } +} + static void rack_log_rtt_upd(struct tcpcb *tp, struct tcp_rack *rack, uint32_t t, uint32_t len, struct rack_sendmap *rsm, int conf) @@ -1741,30 +2351,28 @@ log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = 
t; log.u_bbr.flex2 = len; - log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC; - log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest * HPTS_USEC_IN_MSEC; - log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest * HPTS_USEC_IN_MSEC; - log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; + log.u_bbr.flex3 = rack->r_ctl.rc_rack_min_rtt; + log.u_bbr.flex4 = rack->r_ctl.rack_rs.rs_rtt_lowest; + log.u_bbr.flex5 = rack->r_ctl.rack_rs.rs_rtt_highest; + log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_us_rtrcnt; log.u_bbr.flex7 = conf; - log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot * (uint64_t)HPTS_USEC_IN_MSEC; + log.u_bbr.rttProp = (uint64_t)rack->r_ctl.rack_rs.rs_rtt_tot; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; - if (rack->rack_no_prr) - log.u_bbr.pkts_out = 0; - else - log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtt; + log.u_bbr.delivered = rack->r_ctl.rack_rs.rs_us_rtrcnt; log.u_bbr.pkts_out = rack->r_ctl.rack_rs.rs_flags; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); if (rsm) { log.u_bbr.pkt_epoch = rsm->r_start; log.u_bbr.lost = rsm->r_end; log.u_bbr.cwnd_gain = rsm->r_rtr_cnt; + log.u_bbr.pacing_gain = rsm->r_flags; } else { /* Its a SYN */ log.u_bbr.pkt_epoch = rack->rc_tp->iss; log.u_bbr.lost = 0; log.u_bbr.cwnd_gain = 0; + log.u_bbr.pacing_gain = 0; } /* Write out general bits of interest rrs here */ log.u_bbr.use_lt_bw = rack->rc_highly_buffered; @@ -1787,11 +2395,17 @@ log.u_bbr.lt_epoch = rack->r_ctl.rc_time_probertt_entered; log.u_bbr.cur_del_rate = rack->r_ctl.rc_lower_rtt_us_cts; log.u_bbr.delRate = rack->r_ctl.rc_gp_srtt; + log.u_bbr.bw_inuse = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + log.u_bbr.bw_inuse <<= 32; + if (rsm) + log.u_bbr.bw_inuse |= ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]); TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, 0, &log, false, &tv); + + } } @@ -1809,14 +2423,38 @@ /* Convert our ms to a microsecond */ memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = rtt * 1000; + log.u_bbr.flex1 = rtt; log.u_bbr.flex2 = rack->r_ctl.ack_count; log.u_bbr.flex3 = rack->r_ctl.sack_count; log.u_bbr.flex4 = rack->r_ctl.sack_noextra_move; log.u_bbr.flex5 = rack->r_ctl.sack_moved_extra; + log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; + log.u_bbr.flex7 = 1; log.u_bbr.flex8 = rack->sack_attack_disable; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; + /* + * We capture in delRate the upper 32 bits as + * the confidence level we had declared, and the + * lower 32 bits as the actual RTT using the arrival + * timestamp. 
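+	 * A sketch of the packing, so a log post-processor can split the
+	 * two back out:  delRate = ((uint64_t)confidence << 32) | rs_us_rtt.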
+ */ + log.u_bbr.delRate = rack->r_ctl.rack_rs.confidence; + log.u_bbr.delRate <<= 32; + log.u_bbr.delRate |= rack->r_ctl.rack_rs.rs_us_rtt; + /* Lets capture all the things that make up t_rtxcur */ + log.u_bbr.applimited = rack_rto_min; + log.u_bbr.epoch = rack_rto_max; + log.u_bbr.lt_epoch = rtt; + log.u_bbr.lost = rack_rto_min; + log.u_bbr.pkt_epoch = TICKS_2_USEC(tcp_rexmit_slop); + log.u_bbr.rttProp = RACK_REXMTVAL(rack->rc_tp); + log.u_bbr.bw_inuse = rack->r_ctl.act_rcv_time.tv_sec; + log.u_bbr.bw_inuse *= HPTS_USEC_IN_SEC; + log.u_bbr.bw_inuse += rack->r_ctl.act_rcv_time.tv_usec; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1825,24 +2463,52 @@ } } -static inline void -rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) +static void +rack_log_rtt_sample_calc(struct tcp_rack *rack, uint32_t rtt, uint32_t send_time, uint32_t ack_time, int where) { - if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; struct timeval tv; - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; - log.u_bbr.ininput = rack->rc_inp->inp_in_input; - log.u_bbr.flex1 = line; - log.u_bbr.flex2 = tick; - log.u_bbr.flex3 = tp->t_maxunacktime; - log.u_bbr.flex4 = tp->t_acktime; - log.u_bbr.flex8 = event; + /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = rtt; + log.u_bbr.flex2 = send_time; + log.u_bbr.flex3 = ack_time; + log.u_bbr.flex4 = where; + log.u_bbr.flex7 = 2; log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - TCP_LOG_EVENTP(tp, NULL, + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_LOG_RTT, 0, + 0, &log, false, &tv); + } +} + + + +static inline void +rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line) +{ + if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = line; + log.u_bbr.flex2 = tick; + log.u_bbr.flex3 = tp->t_maxunacktime; + log.u_bbr.flex4 = tp->t_acktime; + log.u_bbr.flex8 = event; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; + TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, @@ -1868,6 +2534,9 @@ log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1877,7 +2546,7 @@ } static void -rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, int32_t nxt_pkt, int32_t did_out, int way_out) +rack_log_doseg_done(struct tcp_rack *rack, uint32_t cts, 
int32_t nxt_pkt, int32_t did_out, int way_out, int nsegs) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; @@ -1892,12 +2561,23 @@ log.u_bbr.flex5 = 0; else log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex6 = nsegs; log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; - log.u_bbr.flex7 = rack->r_wanted_output; + log.u_bbr.flex7 = rack->rc_ack_can_sendout_data; /* Do we have ack-can-send set */ + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->r_fast_output; /* is fast output primed */ + log.u_bbr.flex7 <<= 1; + log.u_bbr.flex7 |= rack->r_wanted_output; /* Do we want output */ log.u_bbr.flex8 = rack->rc_in_persist; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_might_revert; + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1907,7 +2587,7 @@ } static void -rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) +rack_log_type_pacing_sizes(struct tcpcb *tp, struct tcp_rack *rack, uint32_t arg1, uint32_t arg2, uint32_t arg3, uint8_t frm) { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; @@ -1918,13 +2598,16 @@ cts = tcp_get_usecs(&tv); log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; - log.u_bbr.flex4 = len; - log.u_bbr.flex5 = orig_len; - log.u_bbr.flex6 = rack->r_ctl.rc_sacked; - log.u_bbr.flex7 = mod; + log.u_bbr.flex4 = arg1; + log.u_bbr.flex5 = arg2; + log.u_bbr.flex6 = arg3; log.u_bbr.flex8 = frm; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.applimited = rack->r_ctl.rc_sacked; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(tp, NULL, &tp->t_inpcb->inp_socket->so_rcv, &tp->t_inpcb->inp_socket->so_snd, @@ -1956,6 +2639,9 @@ log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -1988,6 +2674,9 @@ log.u_bbr.applimited = rack->r_ctl.rc_hpts_flags; log.u_bbr.timeStamp = us_cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2020,7 +2709,7 @@ log.u_bbr.flex5 = flex5; log.u_bbr.flex6 = flex6; log.u_bbr.flex7 = flex7; - log.u_bbr.flex8 = mod; + log.u_bbr.flex8 = mod; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2046,6 +2735,9 @@ log.u_bbr.flex6 = 0; else log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; + 
log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; + log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; + log.u_bbr.pacing_gain = rack->r_must_retran; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, @@ -2077,6 +2769,9 @@ log.u_bbr.pkts_out = orig_cwnd; log.u_bbr.timeStamp = tcp_get_usecs(&tv); log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_might_revert; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2121,6 +2816,15 @@ static void rack_counter_destroy(void) { + int i; + + counter_u64_free(rack_fto_send); + counter_u64_free(rack_fto_rsm_send); + counter_u64_free(rack_nfto_resend); + counter_u64_free(rack_hw_pace_init_fail); + counter_u64_free(rack_hw_pace_lost); + counter_u64_free(rack_non_fto_send); + counter_u64_free(rack_extended_rfo); counter_u64_free(rack_ack_total); counter_u64_free(rack_express_sack); counter_u64_free(rack_sack_total); @@ -2150,13 +2854,20 @@ counter_u64_free(rack_paced_segments); counter_u64_free(rack_unpaced_segments); counter_u64_free(rack_saw_enobuf); + counter_u64_free(rack_saw_enobuf_hw); counter_u64_free(rack_saw_enetunreach); + counter_u64_free(rack_hot_alloc); counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); counter_u64_free(rack_to_alloc_limited); counter_u64_free(rack_alloc_limited_conns); counter_u64_free(rack_split_limited); + for (i = 0; i < MAX_NUM_OF_CNTS; i++) { + counter_u64_free(rack_proc_comp_ack[i]); + } + counter_u64_free(rack_multi_single_eq); + counter_u64_free(rack_proc_non_comp_ack); counter_u64_free(rack_sack_proc_all); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_sack_proc_short); @@ -2171,6 +2882,11 @@ counter_u64_free(rack_tlp_does_nada); counter_u64_free(rack_try_scwnd); counter_u64_free(rack_per_timer_hole); + counter_u64_free(rack_large_ackcmp); + counter_u64_free(rack_small_ackcmp); +#ifdef INVARIANTS + counter_u64_free(rack_adjust_map_bw); +#endif COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); } @@ -2180,12 +2896,33 @@ { struct rack_sendmap *rsm; + /* + * First get the top of the list it in + * theory is the "hottest" rsm we have, + * possibly just freed by ack processing. + */ + if (rack->rc_free_cnt > rack_free_cache) { + rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); + counter_u64_add(rack_hot_alloc, 1); + rack->rc_free_cnt--; + return (rsm); + } + /* + * Once we get under our free cache we probably + * no longer have a "hot" one available. Lets + * get one from UMA. + */ rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { rack->r_ctl.rc_num_maps_alloced++; counter_u64_add(rack_to_alloc, 1); return (rsm); } + /* + * Dig in to our aux rsm's (the last two) since + * UMA failed to get us one. 
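+	 * (These aux entries live on the same rc_free list as the "hot"
+	 * ones above; rack_free_trim(), further below, keeps that list
+	 * bounded at rack_free_cache entries.)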
+ */ if (rack->rc_free_cnt) { counter_u64_add(rack_to_alloc_emerg, 1); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); @@ -2274,17 +3011,29 @@ rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_sacklast == rsm) rack->r_ctl.rc_sacklast = NULL; - if (rack->rc_free_cnt < rack_free_cache) { - memset(rsm, 0, sizeof(struct rack_sendmap)); - TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); - rsm->r_limit_type = 0; - rack->rc_free_cnt++; - return; + memset(rsm, 0, sizeof(struct rack_sendmap)); + TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); + rack->rc_free_cnt++; +} + +static void +rack_free_trim(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + /* + * Free up all the tail entries until + * we get our list down to the limit. + */ + while (rack->rc_free_cnt > rack_free_cache) { + rsm = TAILQ_LAST(&rack->r_ctl.rc_free, rack_head); + TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_tnext); + rack->rc_free_cnt--; + uma_zfree(rack_zone, rsm); } - rack->r_ctl.rc_num_maps_alloced--; - uma_zfree(rack_zone, rsm); } + static uint32_t rack_get_measure_window(struct tcpcb *tp, struct tcp_rack *rack) { @@ -2330,11 +3079,11 @@ * goal. */ bw = rack_get_bw(rack); - srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + srtt = (uint64_t)tp->t_srtt; len = bw * srtt; len /= (uint64_t)HPTS_USEC_IN_SEC; len *= max(1, rack_goal_bdp); - /* Now we need to round up to the nearest MSS */ + /* Now we need to round up to the nearest MSS */ len = roundup(len, segsiz); if (rack_min_measure_usec) { /* Now calculate our min length for this b/w */ @@ -2655,7 +3404,7 @@ { /* * norm_grad = rtt_diff / minrtt; - * new_per = curper * (1 - B * norm_grad) + * new_per = curper * (1 - B * norm_grad) * * B = rack_gp_decrease_per (default 10%) * rtt_dif = input var current rtt-diff @@ -2694,8 +3443,8 @@ highrttthresh = get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack_gp_rtt_maxmul; - perf = (((uint64_t)curper * ((uint64_t)1000000 - - ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - + perf = (((uint64_t)curper * ((uint64_t)1000000 - + ((uint64_t)rack_gp_decrease_per * ((uint64_t)1000000 - ((uint64_t)highrttthresh * (uint64_t)1000000) / (uint64_t)rtt)) / 100)) /(uint64_t)1000000); return (perf); @@ -2708,7 +3457,7 @@ uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; if (rack->rc_gp_incr) { - /* Turn off increment counting */ + /* Turn off increment counting */ rack->rc_gp_incr = 0; rack->rc_gp_timely_inc_cnt = 0; } @@ -2774,7 +3523,7 @@ if (rack_per_lower_bound > rack->r_ctl.rack_per_of_gp_ss) rack->r_ctl.rack_per_of_gp_ss = rack_per_lower_bound; logged |= 4; - } else if (rack->rc_gp_saw_ca) { + } else if (rack->rc_gp_saw_ca) { /* Sent in CA */ if (timely_says == 2) { new_per = rack_decrease_highrtt(rack, rack->r_ctl.rack_per_of_gp_ca, rtt); @@ -3051,7 +3800,7 @@ /* Set to entry gp rtt */ rack_set_prtt_target(rack, segsiz, rack->r_ctl.rc_entry_gp_rtt); - } else { + } else { uint64_t sum; uint32_t setval; @@ -3197,12 +3946,12 @@ endtime += rack_min_probertt_hold; endtime += rack->r_ctl.rc_time_probertt_starts; if (TSTMP_GEQ(us_cts, endtime)) { - /* yes, exit probertt */ + /* yes, exit probertt */ rack_exit_probertt(rack, us_cts); - } + } - } else if((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { - /* Go into probertt, its been too long since we went lower */ + } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { + /* Go into probertt, its been too long since we went lower */ rack_enter_probertt(rack, us_cts); } } @@ -3303,7 +4052,7 @@ */ 
goto use_timely; } - } else if ((timely_says != 2) && + } else if ((timely_says != 2) && !losses && (last_bw_est > up_bnd)) { /* @@ -3335,11 +4084,11 @@ } rack->rc_gp_bwred = 0; rack->rc_gp_timely_dec_cnt = 0; - /* You get a set number of pushes if timely is trying to reduce */ + /* You get a set number of pushes if timely is trying to reduce */ if ((rack->rc_gp_incr < rack_timely_max_push_rise) || (timely_says == 0)) { rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); } else { - /* Log it stays the same */ + /* Log it stays the same */ rack_log_timely(rack, 0, last_bw_est, up_bnd, 0, __LINE__, 12); } @@ -3366,7 +4115,7 @@ rack->rc_gp_timely_inc_cnt = 0; } else rack_decrease_bw_mul(rack, timely_says, rtt, rtt_diff); - } else { + } else { rack->rc_gp_bwred = 0; rack->rc_gp_timely_dec_cnt = 0; rack_increase_bw_mul(rack, timely_says, cur_bw, last_bw_est, 0); @@ -3445,6 +4194,8 @@ uint64_t tim, bytes_ps, ltim, stim, utim; uint32_t segsiz, bytes, reqbytes, us_cts; int32_t gput, new_rtt_diff, timely_says; + uint64_t resid_bw, subpart = 0, addpart = 0, srtt; + int did_add = 0; us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); @@ -3453,7 +4204,7 @@ else tim = 0; - if (TSTMP_GT(rack->r_ctl.rc_gp_cumack_ts, rack->r_ctl.rc_gp_output_ts)) + if (rack->r_ctl.rc_gp_cumack_ts > rack->r_ctl.rc_gp_output_ts) stim = rack->r_ctl.rc_gp_cumack_ts - rack->r_ctl.rc_gp_output_ts; else stim = 0; @@ -3468,8 +4219,8 @@ utim = max(tim, 1); else utim = max(stim, 1); - /* Lets validate utim */ - ltim = max(1, (utim/HPTS_USEC_IN_MSEC)); + /* Lets get a msec time ltim too for the old stuff */ + ltim = max(1, (utim / HPTS_USEC_IN_MSEC)); gput = (((uint64_t) (th_ack - tp->gput_seq)) << 3) / ltim; reqbytes = min(rc_init_window(rack), (MIN_GP_WIN * segsiz)); if ((tim == 0) && (stim == 0)) { @@ -3602,14 +4353,14 @@ 11, __LINE__, NULL); bytes_ps = rack->r_ctl.last_max_bw; } - /* We store gp for b/w in bytes per second */ + /* We store gp for b/w in bytes per second */ if (rack->rc_gp_filled == 0) { /* Initial measurment */ if (bytes_ps) { rack->r_ctl.gp_bw = bytes_ps; rack->rc_gp_filled = 1; - rack->r_ctl.num_avg = 1; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + rack->r_ctl.num_measurements = 1; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); } else { rack_log_pacing_delay_calc(rack, bytes_ps, reqbytes, rack->r_ctl.rc_app_limited_cnt, @@ -3629,14 +4380,17 @@ rack->r_ctl.rc_hpts_flags = 0; rack->r_ctl.rc_last_output_to = 0; } - } else if (rack->r_ctl.num_avg < RACK_REQ_AVG) { + did_add = 2; + } else if (rack->r_ctl.num_measurements < RACK_REQ_AVG) { /* Still a small number run an average */ rack->r_ctl.gp_bw += bytes_ps; - rack->r_ctl.num_avg++; - if (rack->r_ctl.num_avg >= RACK_REQ_AVG) { + addpart = rack->r_ctl.num_measurements; + rack->r_ctl.num_measurements++; + if (rack->r_ctl.num_measurements >= RACK_REQ_AVG) { /* We have collected enought to move forward */ - rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_avg; + rack->r_ctl.gp_bw /= (uint64_t)rack->r_ctl.num_measurements; } + did_add = 3; } else { /* * We want to take 1/wma of the goodput and add in to 7/8th @@ -3650,15 +4404,16 @@ * other hand if we get a measurement over 1ms with a * 10ms rtt we only want to take a much smaller portion. 
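	 * Roughly, the update below works out to
	 *   gp_bw = (gp_bw - subpart) + addpart,
	 * where subpart is the slice of the old estimate being retired and
	 * addpart the matching slice of the new measurement, both scaled by
	 * how much of an srtt the measurement interval (utim) covered.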
*/ - uint64_t resid_bw, subpart, addpart, srtt; - - srtt = ((uint64_t)TICKS_2_USEC(tp->t_srtt) >> TCP_RTT_SHIFT); + if (rack->r_ctl.num_measurements < 0xff) { + rack->r_ctl.num_measurements++; + } + srtt = (uint64_t)tp->t_srtt; if (srtt == 0) { /* * Strange why did t_srtt go back to zero? */ if (rack->r_ctl.rc_rack_min_rtt) - srtt = (rack->r_ctl.rc_rack_min_rtt * HPTS_USEC_IN_MSEC); + srtt = rack->r_ctl.rc_rack_min_rtt; else srtt = HPTS_USEC_IN_MSEC; } @@ -3704,6 +4459,7 @@ } resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; + did_add = 1; } else { if ((utim / srtt) <= 1) { /* @@ -3734,11 +4490,22 @@ * if its larger, all others we just * add in. */ + did_add = 1; resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; } } } + if ((rack->gp_ready == 0) && + (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { + /* We have enough measurements now */ + rack->gp_ready = 1; + rack_set_cc_pacing(rack); + if (rack->defer_options) + rack_apply_deferred_options(rack); + } + rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, + rack_get_bw(rack), 22, did_add, NULL); /* We do not update any multipliers if we are in or have seen a probe-rtt */ if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) rack_update_multiplier(rack, timely_says, bytes_ps, @@ -3831,17 +4598,20 @@ tp->gput_seq = rsm->r_start; } if (rsm->r_flags & RACK_ACKED) - tp->gput_ts = rsm->r_ack_arrival; + tp->gput_ts = (uint32_t)rsm->r_ack_arrival; else rack->app_limited_needs_set = 1; - rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; } else { /* * If we don't find the rsm due to some * send-limit set the current time, which * basically disables the send-limit. */ - rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); + struct timeval tv; + + microuptime(&tv); + rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); } rack_log_pacing_delay_calc(rack, tp->gput_seq, @@ -3858,12 +4628,16 @@ * CC wrapper hook functions */ static void -rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, struct tcphdr *th, uint16_t nsegs, +rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, uint16_t type, int32_t recovery) { + uint32_t prior_cwnd, acked; + struct tcp_log_buffer *lgb = NULL; + uint8_t labc_to_use; + INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; - tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); + acked = tp->ccv->bytes_this_ack = (th_ack - tp->snd_una); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; @@ -3872,18 +4646,14 @@ tp->ccv->bytes_this_ack = max; } } - if (rack->r_ctl.cwnd_to_use <= tp->snd_wnd) - tp->ccv->flags |= CCF_CWND_LIMITED; - else - tp->ccv->flags &= ~CCF_CWND_LIMITED; #ifdef STATS stats_voi_update_abs_s32(tp->t_stats, VOI_TCP_CALCFRWINDIFF, ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); #endif if ((tp->t_flags & TF_GPUTINPROG) && - rack_enough_for_measurement(tp, rack, th->th_ack)) { + rack_enough_for_measurement(tp, rack, th_ack)) { /* Measure the Goodput */ - rack_do_goodput_measurement(tp, rack, th->th_ack, __LINE__); + rack_do_goodput_measurement(tp, rack, th_ack, __LINE__); #ifdef NETFLIX_PEAKRATE if ((type == CC_ACK) && (tp->t_maxpeakrate)) { @@ -3893,12 +4663,19 @@ * it will only be used if pace_always is off i.e * we don't do this for paced flows. 
*/ - tcp_update_peakrate_thr(tp); + rack_update_peakrate_thr(tp); } #endif } - if (rack->r_ctl.cwnd_to_use > tp->snd_ssthresh) { - tp->t_bytes_acked += tp->ccv->bytes_this_ack; + /* Which way our we limited, if not cwnd limited no advance in CA */ + if (tp->snd_cwnd <= tp->snd_wnd) + tp->ccv->flags |= CCF_CWND_LIMITED; + else + tp->ccv->flags &= ~CCF_CWND_LIMITED; + if (tp->snd_cwnd > tp->snd_ssthresh) { + tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, + nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); + /* For the setting of a window past use the actual scwnd we are using */ if (tp->t_bytes_acked >= rack->r_ctl.cwnd_to_use) { tp->t_bytes_acked -= rack->r_ctl.cwnd_to_use; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -3907,11 +4684,61 @@ tp->ccv->flags &= ~CCF_ABC_SENTAWND; tp->t_bytes_acked = 0; } + prior_cwnd = tp->snd_cwnd; + if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || + (rack_client_low_buf && (rack->client_bufferlvl < rack_client_low_buf))) + labc_to_use = rack->rc_labc; + else + labc_to_use = rack_max_abc_post_recovery; + if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = th_ack; + log.u_bbr.flex2 = tp->ccv->flags; + log.u_bbr.flex3 = tp->ccv->bytes_this_ack; + log.u_bbr.flex4 = tp->ccv->nsegs; + log.u_bbr.flex5 = labc_to_use; + log.u_bbr.flex6 = prior_cwnd; + log.u_bbr.flex7 = V_tcp_do_newsack; + log.u_bbr.flex8 = 1; + lgb = tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } if (CC_ALGO(tp)->ack_received != NULL) { /* XXXLAS: Find a way to live without this */ - tp->ccv->curack = th->th_ack; + tp->ccv->curack = th_ack; + tp->ccv->labc = labc_to_use; + tp->ccv->flags |= CCF_USE_LOCAL_ABC; CC_ALGO(tp)->ack_received(tp->ccv, type); } + if (lgb) { + lgb->tlb_stackinfo.u_bbr.flex6 = tp->snd_cwnd; + } + if (rack->r_must_retran) { + if (SEQ_GEQ(th_ack, rack->r_ctl.rc_snd_max_at_rto)) { + /* + * We now are beyond the rxt point so lets disable + * the flag. + */ + rack->r_ctl.rc_out_at_rto = 0; + rack->r_must_retran = 0; + } else if ((prior_cwnd + ctf_fixed_maxseg(tp)) <= tp->snd_cwnd) { + /* + * Only decrement the rc_out_at_rto if the cwnd advances + * at least a whole segment. Otherwise next time the peer + * acks, we won't be able to send this generaly happens + * when we are in Congestion Avoidance. 
+ */ + if (acked <= rack->r_ctl.rc_out_at_rto){ + rack->r_ctl.rc_out_at_rto -= acked; + } else { + rack->r_ctl.rc_out_at_rto = 0; + } + } + } #ifdef STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_LCWIN, rack->r_ctl.cwnd_to_use); #endif @@ -3929,7 +4756,7 @@ } static void -tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th) +tcp_rack_partialack(struct tcpcb *tp) { struct tcp_rack *rack; @@ -3948,7 +4775,7 @@ } static void -rack_post_recovery(struct tcpcb *tp, struct tcphdr *th) +rack_post_recovery(struct tcpcb *tp, uint32_t th_ack) { struct tcp_rack *rack; uint32_t orig_cwnd; @@ -3956,21 +4783,57 @@ orig_cwnd = tp->snd_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); rack = (struct tcp_rack *)tp->t_fb_ptr; - if (rack->rc_not_backing_off == 0) { - /* only alert CC if we alerted when we entered */ - if (CC_ALGO(tp)->post_recovery != NULL) { - tp->ccv->curack = th->th_ack; - CC_ALGO(tp)->post_recovery(tp->ccv); - } - if (tp->snd_cwnd > tp->snd_ssthresh) { - /* Drop us down to the ssthresh (1/2 cwnd at loss) */ + /* only alert CC if we alerted when we entered */ + if (CC_ALGO(tp)->post_recovery != NULL) { + tp->ccv->curack = th_ack; + CC_ALGO(tp)->post_recovery(tp->ccv); + if (tp->snd_cwnd < tp->snd_ssthresh) { + /* + * Rack has burst control and pacing + * so lets not set this any lower than + * snd_ssthresh per RFC-6582 (option 2). + */ tp->snd_cwnd = tp->snd_ssthresh; } } + if (rack_verbose_logging && (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = th_ack; + log.u_bbr.flex2 = tp->ccv->flags; + log.u_bbr.flex3 = tp->ccv->bytes_this_ack; + log.u_bbr.flex4 = tp->ccv->nsegs; + log.u_bbr.flex5 = V_tcp_abc_l_var; + log.u_bbr.flex6 = orig_cwnd; + log.u_bbr.flex7 = V_tcp_do_newsack; + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex8 = 2; + tcp_log_event_(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } if ((rack->rack_no_prr == 0) && + (rack->no_prr_addback == 0) && (rack->r_ctl.rc_prr_sndcnt > 0)) { - /* Suck the next prr cnt back into cwnd */ - tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; + /* + * Suck the next prr cnt back into cwnd, but + * only do that if we are not application limited. + */ + if (ctf_outstanding(tp) <= sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { + /* + * We are allowed to add back to the cwnd the amount we did + * not get out if: + * a) no_prr_addback is off. + * b) we are not app limited + * c) we are doing prr + * + * d) it is bounded by rack_prr_addbackmax (if addback is 0, then none). 
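+	 * An illustrative bound only: with a hypothetical
+	 * rack_prr_addbackmax of 2 and a 1448-byte maxseg, at most 2896
+	 * bytes of the leftover rc_prr_sndcnt would be added back here.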
+ */ + tp->snd_cwnd += min((ctf_fixed_maxseg(tp) * rack_prr_addbackmax), + rack->r_ctl.rc_prr_sndcnt); + } rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 1, 0); } @@ -3980,12 +4843,21 @@ } static void -rack_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type) +rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack) { struct tcp_rack *rack; + uint32_t ssthresh_enter, cwnd_enter, in_rec_at_entry, orig_cwnd; INP_WLOCK_ASSERT(tp->t_inpcb); - +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_CSIG, type); +#endif + if (IN_RECOVERY(tp->t_flags) == 0) { + in_rec_at_entry = 0; + ssthresh_enter = tp->snd_ssthresh; + cwnd_enter = tp->snd_cwnd; + } else + in_rec_at_entry = 1; rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: @@ -3996,7 +4868,7 @@ rack->r_ctl.rc_prr_out = 0; if (rack->rack_no_prr == 0) { rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 2, 0); + rack_log_to_prr(rack, 2, in_rec_at_entry); } rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; @@ -4010,7 +4882,7 @@ * Allow ECN reaction on ACK to CWR, if * that data segment was also CE marked. */ - SEQ_GEQ(th->th_ack, tp->snd_recover)) { + SEQ_GEQ(ack, tp->snd_recover)) { EXIT_CONGRECOVERY(tp->t_flags); KMOD_TCPSTAT_INC(tcps_ecn_rcwnd); tp->snd_recover = tp->snd_max + 1; @@ -4024,7 +4896,9 @@ EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); + orig_cwnd = tp->snd_cwnd; tp->snd_cwnd = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 16, orig_cwnd); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; @@ -4046,46 +4920,19 @@ tp->t_badrxtwin = 0; break; } - /* - * If we are below our max rtt, don't - * signal the CC control to change things. - * instead set it up so that we are in - * recovery but not going to back off. - */ - - if (rack->rc_highly_buffered) { - /* - * Do we use the higher rtt for - * our threshold to not backoff (like CDG)? 
- */ - uint32_t rtt_mul, rtt_div; - - if (rack_use_max_for_nobackoff) { - rtt_mul = (rack_gp_rtt_maxmul - 1); - rtt_div = 1; - } else { - rtt_mul = rack_gp_rtt_minmul; - rtt_div = max(rack_gp_rtt_mindiv , 1); - } - if (rack->r_ctl.rc_gp_srtt <= (rack->r_ctl.rc_lowest_us_rtt + - ((rack->r_ctl.rc_lowest_us_rtt * rtt_mul) / - rtt_div))) { - /* below our min threshold */ - rack->rc_not_backing_off = 1; - ENTER_RECOVERY(rack->rc_tp->t_flags); - rack_log_rtt_shrinks(rack, 0, - rtt_mul, - rtt_div, - RACK_RTTS_NOBACKOFF); - return; - } - } - rack->rc_not_backing_off = 0; - if (CC_ALGO(tp)->cong_signal != NULL) { - if (th != NULL) - tp->ccv->curack = th->th_ack; + if ((CC_ALGO(tp)->cong_signal != NULL) && + (type != CC_RTO)){ + tp->ccv->curack = ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } + if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { + rack_log_to_prr(rack, 15, cwnd_enter); + rack->r_ctl.dsack_byte_cnt = 0; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; + rack->r_ctl.rc_ssthresh_at_erec = ssthresh_enter; + rack->r_ent_rec_ns = 1; + } } static inline void @@ -4131,7 +4978,7 @@ */ #define DELAY_ACK(tp, tlen) \ (((tp->t_flags & TF_RXWIN0SENT) == 0) && \ - ((tp->t_flags & TF_DELACK) == 0) && \ + ((tp->t_flags & TF_DELACK) == 0) && \ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) @@ -4238,8 +5085,8 @@ thresh += 1; } /* We don't let the rack timeout be above a RTO */ - if (thresh > TICKS_2_MSEC(rack->rc_tp->t_rxtcur)) { - thresh = TICKS_2_MSEC(rack->rc_tp->t_rxtcur); + if (thresh > rack->rc_tp->t_rxtcur) { + thresh = rack->rc_tp->t_rxtcur; } /* And we don't want it above the RTO max either */ if (thresh > rack_rto_max) { @@ -4263,7 +5110,7 @@ else thresh = (srtt * 2); - /* Get the previous sent packet, if any */ + /* Get the previous sent packet, if any */ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; @@ -4293,12 +5140,12 @@ counter_u64_add(rack_used_tlpmethod, 1); idx = rsm->r_rtr_cnt - 1; nidx = prsm->r_rtr_cnt - 1; - if (TSTMP_GEQ(rsm->r_tim_lastsent[nidx], prsm->r_tim_lastsent[idx])) { + if (rsm->r_tim_lastsent[nidx] >= prsm->r_tim_lastsent[idx]) { /* Yes it was sent later (or at the same time) */ inter_gap = rsm->r_tim_lastsent[idx] - prsm->r_tim_lastsent[nidx]; } thresh += inter_gap; - } else if (len <= segsiz) { + } else if (len <= segsiz) { /* * Possibly compensate for delayed-ack. */ @@ -4322,9 +5169,9 @@ thresh = alt_thresh; } } - /* Not above an RTO */ - if (thresh > TICKS_2_MSEC(tp->t_rxtcur)) { - thresh = TICKS_2_MSEC(tp->t_rxtcur); + /* Not above an RTO */ + if (thresh > tp->t_rxtcur) { + thresh = tp->t_rxtcur; } /* Not above a RTO max */ if (thresh > rack_rto_max) { @@ -4349,10 +5196,10 @@ * yet set. 
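	 * (Preference order below: the measured rack RTT, then the
	 * RACK_INITIAL_RTO constant when no srtt exists yet, then t_srtt
	 * itself, which this stack now carries in microseconds.)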
*/ if (rack->rc_rack_rtt) - return(rack->rc_rack_rtt); + return (rack->rc_rack_rtt); else if (tp->t_srtt == 0) - return(RACK_INITIAL_RTO); - return (TICKS_2_MSEC(tp->t_srtt >> TCP_RTT_SHIFT)); + return (RACK_INITIAL_RTO); + return (tp->t_srtt); } static struct rack_sendmap * @@ -4384,10 +5231,10 @@ idx = rsm->r_rtr_cnt - 1; srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, tsused); - if (TSTMP_LT(tsused, rsm->r_tim_lastsent[idx])) { + if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { return (NULL); } - if ((tsused - rsm->r_tim_lastsent[idx]) < thresh) { + if ((tsused - ((uint32_t)rsm->r_tim_lastsent[idx])) < thresh) { return (NULL); } /* Ok if we reach here we are over-due and this guy can be sent */ @@ -4400,7 +5247,7 @@ rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; } - rack_cong_signal(tp, NULL, CC_NDUPACK); + rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); return (rsm); } @@ -4411,8 +5258,8 @@ int32_t tt; uint32_t ret_val; - t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); - TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], + t = (tp->t_srtt + (tp->t_rttvar << 2)); + RACK_TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], rack_persist_min, rack_persist_max); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; @@ -4446,32 +5293,65 @@ } rack->rc_on_min_to = 0; if ((tp->t_state < TCPS_ESTABLISHED) || - ((tp->t_flags & TF_SACK_PERMIT) == 0)) + ((tp->t_flags & TF_SACK_PERMIT) == 0)) { goto activate_rxt; + } rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if ((rsm == NULL) || sup_rack) { - /* Nothing on the send map */ + /* Nothing on the send map or no rack */ activate_rxt: time_since_sent = 0; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm) { + /* + * Should we discount the RTX timer any? + * + * We want to discount it the smallest amount. + * If a timer (Rack/TLP or RXT) has gone off more + * recently thats the discount we want to use (now - timer time). + * If the retransmit of the oldest packet was more recent then + * we want to use that (now - oldest-packet-last_transmit_time). + * + */ idx = rsm->r_rtr_cnt - 1; - if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) - tstmp_touse = rsm->r_tim_lastsent[idx]; + if (TSTMP_GEQ(rack->r_ctl.rc_tlp_rxt_last_time, ((uint32_t)rsm->r_tim_lastsent[idx]))) + tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; else - tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; } if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; - to = TICKS_2_MSEC(tp->t_rxtcur); + to = tp->t_rxtcur; if (to > time_since_sent) to -= time_since_sent; else to = rack->r_ctl.rc_min_to; if (to == 0) to = 1; + /* Special case for KEEPINIT */ + if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && + (TP_KEEPINIT(tp) != 0) && + rsm) { + /* + * We have to put a ceiling on the rxt timer + * of the keep-init timeout. 
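+	 * Roughly, the code below does
+	 *   to = min(to, TICKS_2_USEC(TP_KEEPINIT(tp)) - time already
+	 *                spent waiting since the first transmit),
+	 * never letting that ceiling drop below 1.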
+ */ + uint32_t max_time, red; + + max_time = TICKS_2_USEC(TP_KEEPINIT(tp)); + if (TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) { + red = (cts - (uint32_t)rsm->r_tim_lastsent[0]); + if (red < max_time) + max_time -= red; + else + max_time = 1; + } + /* Reduce timeout to the keep value if needed */ + if (max_time < to) + to = max_time; + } return (to); } return (0); @@ -4505,7 +5385,7 @@ goto activate_rxt; } if ((rack->use_rack_rr == 0) && - (IN_RECOVERY(tp->t_flags)) && + (IN_FASTRECOVERY(tp->t_flags)) && (rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* @@ -4521,7 +5401,7 @@ srtt = rack_grab_rtt(tp, rack); thresh = rack_calc_thresh_rack(rack, srtt, cts); idx = rsm->r_rtr_cnt - 1; - exp = rsm->r_tim_lastsent[idx] + thresh; + exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; if (SEQ_GEQ(exp, cts)) { to = exp - cts; if (to < rack->r_ctl.rc_min_to) { @@ -4557,16 +5437,25 @@ } idx = rsm->r_rtr_cnt - 1; time_since_sent = 0; - if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) - tstmp_touse = rsm->r_tim_lastsent[idx]; + if (TSTMP_GEQ(((uint32_t)rsm->r_tim_lastsent[idx]), rack->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = (uint32_t)rsm->r_tim_lastsent[idx]; else - tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + tstmp_touse = (uint32_t)rack->r_ctl.rc_tlp_rxt_last_time; if (TSTMP_GT(cts, tstmp_touse)) time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; if (tp->t_srtt) { - srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); - srtt = TICKS_2_MSEC(srtt_cur); + if ((rack->rc_srtt_measure_made == 0) && + (tp->t_srtt == 1)) { + /* + * If another stack as run and set srtt to 1, + * then the srtt was 0, so lets use the initial. + */ + srtt = RACK_INITIAL_RTO; + } else { + srtt_cur = tp->t_srtt; + srtt = srtt_cur; + } } else srtt = RACK_INITIAL_RTO; /* @@ -4574,23 +5463,29 @@ * rack RTT has spiked we want to use * the last RTT not the smoothed one. */ - if (rack_tlp_use_greater && (srtt < rack_grab_rtt(tp, rack))) + if (rack_tlp_use_greater && + tp->t_srtt && + (srtt < rack_grab_rtt(tp, rack))) { srtt = rack_grab_rtt(tp, rack); + } thresh = rack_calc_thresh_tlp(tp, rack, rsm, srtt); - if (thresh > time_since_sent) + if (thresh > time_since_sent) { to = thresh - time_since_sent; - else { + } else { to = rack->r_ctl.rc_min_to; rack_log_alt_to_to_cancel(rack, thresh, /* flex1 */ time_since_sent, /* flex2 */ tstmp_touse, /* flex3 */ rack->r_ctl.rc_tlp_rxt_last_time, /* flex4 */ - rsm->r_tim_lastsent[idx], + (uint32_t)rsm->r_tim_lastsent[idx], srtt, idx, 99); } - if (to > TCPTV_REXMTMAX) { + if (to < rack_tlp_min) { + to = rack_tlp_min; + } + if (to > TICKS_2_USEC(TCPTV_REXMTMAX)) { /* * If the TLP time works out to larger than the max * RTO lets not do TLP.. just RTO. @@ -4630,6 +5525,8 @@ rack->r_ctl.rc_went_idle_time = 1; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); rack->rc_in_persist = 1; } } @@ -4637,9 +5534,9 @@ static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { - if (rack->rc_inp->inp_in_hpts) { + if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack->r_ctl.rc_hpts_flags = 0; + rack->r_ctl.rc_hpts_flags = 0; } #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { @@ -4665,7 +5562,7 @@ extra /= (uint64_t)rack_probertt_gpsrtt_cnt_div; idle_min += (uint32_t)extra; } - if (time_idle >= idle_min) { + if (time_idle >= idle_min) { /* Yes, we count it as a probe-rtt. 
*/ uint32_t us_cts; @@ -4683,7 +5580,9 @@ rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_rxtshift = 0; - rack->r_ctl.rc_agg_delayed = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); + rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; @@ -4705,7 +5604,7 @@ log.u_bbr.flex6 = diag->need_new_to; log.u_bbr.flex7 = diag->p_hpts_active; log.u_bbr.flex8 = diag->p_on_min_sleep; - /* Hijack other fields as needed */ + /* Hijack other fields as needed */ log.u_bbr.epoch = diag->have_slept; log.u_bbr.lt_epoch = diag->yet_to_sleep; log.u_bbr.pkts_out = diag->co_ret; @@ -4728,6 +5627,27 @@ } +static void +rack_log_wakeup(struct tcpcb *tp, struct tcp_rack *rack, struct sockbuf *sb, uint32_t len, int type) +{ + if (rack_verbose_logging && rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = sb->sb_flags; + log.u_bbr.flex2 = len; + log.u_bbr.flex3 = sb->sb_state; + log.u_bbr.flex8 = type; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + TCP_LOG_SB_WAKE, 0, + len, &log, false, &tv); + } +} + static void rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t slot, uint32_t tot_len_this_send, int sup_rack) @@ -4737,6 +5657,7 @@ struct timeval tv; uint32_t delayed_ack = 0; uint32_t hpts_timeout; + uint32_t entry_slot = slot; uint8_t stopped; uint32_t left = 0; uint32_t us_cts; @@ -4758,11 +5679,16 @@ rack->r_ctl.rc_hpts_flags = 0; us_cts = tcp_get_usecs(&tv); /* Now early/late accounting */ - if (rack->r_early) { + rack_log_pacing_delay_calc(rack, entry_slot, slot, 0, 0, 0, 26, __LINE__, NULL); + if (rack->r_early && (rack->rc_ack_can_sendout_data == 0)) { /* * We have a early carry over set, * we can always add more time so we * can always make this compensation. + * + * Note if ack's are allowed to wake us do not + * penalize the next timer for being awoke + * by an ack aka the rc_agg_early (non-paced mode). */ slot += rack->r_ctl.rc_agg_early; rack->r_early = 0; @@ -4825,7 +5751,7 @@ } #endif if (tp->t_flags & TF_DELACK) { - delayed_ack = TICKS_2_MSEC(tcp_delacktime); + delayed_ack = TICKS_2_USEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || @@ -4848,10 +5774,16 @@ */ if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* Get the established keep-alive time */ - hpts_timeout = TP_KEEPIDLE(tp); + hpts_timeout = TICKS_2_USEC(TP_KEEPIDLE(tp)); } else { - /* Get the initial setup keep-alive time */ - hpts_timeout = TP_KEEPINIT(tp); + /* + * Get the initial setup keep-alive time, + * note that this is probably not going to + * happen, since rack will be running a rxt timer + * if a SYN of some sort is outstanding. It is + * actually handled in rack_timeout_rxt(). + */ + hpts_timeout = TICKS_2_USEC(TP_KEEPINIT(tp)); } rack->r_ctl.rc_hpts_flags |= PACE_TMR_KEEP; if (rack->in_probe_rtt) { @@ -4862,7 +5794,7 @@ * This will get us out of probe-rtt and update * our min-rtt. 
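	 * (With the timer path now microsecond based,
	 * rack_min_probertt_hold is used directly below instead of being
	 * divided down to msec.)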
*/ - hpts_timeout = (rack_min_probertt_hold / HPTS_USEC_IN_MSEC); + hpts_timeout = rack_min_probertt_hold; } } } @@ -4889,7 +5821,9 @@ hpts_timeout = 0x7ffffffe; rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } - if ((rack->rc_gp_filled == 0) && + rack_log_pacing_delay_calc(rack, entry_slot, slot, hpts_timeout, 0, 0, 27, __LINE__, NULL); + if ((rack->gp_ready == 0) && + (rack->use_fixed_rate == 0) && (hpts_timeout < slot) && (rack->r_ctl.rc_hpts_flags & (PACE_TMR_TLP|PACE_TMR_RXT))) { /* @@ -4903,29 +5837,72 @@ slot = hpts_timeout; } rack->r_ctl.last_pacing_time = slot; + /** + * Turn off all the flags for queuing by default. The + * flags have important meanings to what happens when + * LRO interacts with the transport. Most likely (by default now) + * mbuf_queueing and ack compression are on. So the transport + * has a couple of flags that control what happens (if those + * are not on then these flags won't have any effect since it + * won't go through the queuing LRO path). + * + * INP_MBUF_QUEUE_READY - This flags says that I am busy + * pacing output, so don't disturb. But + * it also means LRO can wake me if there + * is a SACK arrival. + * + * INP_DONT_SACK_QUEUE - This flag is used in conjunction + * with the above flag (QUEUE_READY) and + * when present it says don't even wake me + * if a SACK arrives. + * + * The idea behind these flags is that if we are pacing we + * set the MBUF_QUEUE_READY and only get woken up if + * a SACK arrives (which could change things) or if + * our pacing timer expires. If, however, we have a rack + * timer running, then we don't even want a sack to wake + * us since the rack timer has to expire before we can send. + * + * Other cases should usually have none of the flags set + * so LRO can call into us. + */ + inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); if (slot) { rack->r_ctl.rc_last_output_to = us_cts + slot; - if (rack->rc_always_pace || rack->r_mbuf_queue) { - if ((rack->rc_gp_filled == 0) || - rack->pacing_longer_than_rtt) { - inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); - } else { - inp->inp_flags2 |= INP_MBUF_QUEUE_READY; - if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && - (rack->r_rr_config != 3)) - inp->inp_flags2 |= INP_DONT_SACK_QUEUE; - else - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; - } + /* + * A pacing timer (slot) is being set, in + * such a case we cannot send (we are blocked by + * the timer). So lets tell LRO that it should not + * wake us unless there is a SACK. Note this only + * will be effective if mbuf queueing is on or + * compressed acks are being processed. + */ + inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + /* + * But wait if we have a Rack timer running + * even a SACK should not disturb us (with + * the exception of r_rr_config 3). + */ + if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) && + (rack->r_rr_config != 3)) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + if (rack->rc_ack_can_sendout_data) { + /* + * Ahh but wait, this is that special case + * where the pacing timer can be disturbed + * backout the changes (used for non-paced + * burst limiting). + */ + inp->inp_flags2 &= ~(INP_DONT_SACK_QUEUE|INP_MBUF_QUEUE_READY); } if ((rack->use_rack_rr) && (rack->r_rr_config < 2) && - ((hpts_timeout) && ((hpts_timeout * HPTS_USEC_IN_MSEC) < slot))) { + ((hpts_timeout) && (hpts_timeout < slot))) { /* * Arrange for the hpts to kick back in after the * t-o if the t-o does not cause a send. 
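	 * (slot and hpts_timeout are both microsecond values at this
	 * point, hence the HPTS_USEC_TO_SLOTS conversion below.)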
*/ - (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); @@ -4936,21 +5913,15 @@ rack_log_to_start(rack, cts, hpts_timeout, slot, 1); } } else if (hpts_timeout) { - if (rack->rc_always_pace || rack->r_mbuf_queue) { - if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { - /* For a rack timer, don't wake us */ - inp->inp_flags2 |= INP_MBUF_QUEUE_READY; - if (rack->r_rr_config != 3) - inp->inp_flags2 |= INP_DONT_SACK_QUEUE; - else - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; - } else { - /* All other timers wake us up */ - inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; - inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; - } - } - (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout), + /* + * With respect to inp_flags2 here, lets let any new acks wake + * us up here. Since we are not pacing (no pacing timer), output + * can happen so we should let it. If its a Rack timer, then any inbound + * packet probably won't change the sending (we will be blocked) + * but it may change the prr stats so letting it in (the set defaults + * at the start of this block) are good enough. + */ + (void)tcp_hpts_insert_diag(tp->t_inpcb, HPTS_USEC_TO_SLOTS(hpts_timeout), __LINE__, &diag); rack_log_hpts_diag(rack, us_cts, &diag, &tv); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); @@ -4986,12 +5957,10 @@ * settings. */ struct rack_sendmap *rsm; - int32_t recovery; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); @@ -4999,9 +5968,8 @@ rsm = rack_check_recovery_mode(tp, cts); rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); if (rsm) { - uint32_t rtt; - rack->r_ctl.rc_resend = rsm; + rack->r_timer_override = 1; if (rack->use_rack_rr) { /* * Don't accumulate extra pacing delay @@ -5011,36 +5979,8 @@ * time (in other words we get the min pacing * time versus rrr pacing time). */ - rack->r_timer_override = 1; rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; } - rtt = rack->rc_rack_rtt; - if (rtt == 0) - rtt = 1; - if (rack->rack_no_prr == 0) { - if ((recovery == 0) && - (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { - /* - * The rack-timeout that enter's us into recovery - * will force out one MSS and set us up so that we - * can do one more send in 2*rtt (transitioning the - * rack timeout into a rack-tlp). - */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack->r_timer_override = 1; - rack_log_to_prr(rack, 3, 0); - } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && - rack->use_rack_rr) { - /* - * When a rack timer goes, if the rack rr is - * on, arrange it so we can send a full segment - * overriding prr (though we pay a price for this - * for future new sends). - */ - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 4, 0); - } - } } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RACK; if (rsm == NULL) { @@ -5052,6 +5992,50 @@ return (0); } +static void +rack_adjust_orig_mlen(struct rack_sendmap *rsm) +{ + if (rsm->m->m_len > rsm->orig_m_len) { + /* + * Mbuf grew, caused by sbcompress, our offset does + * not change. + */ + rsm->orig_m_len = rsm->m->m_len; + } else if (rsm->m->m_len < rsm->orig_m_len) { + /* + * Mbuf shrank, trimmed off the top by an ack, our + * offset changes. 
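+	 * For example (a sketch): if orig_m_len was 4096 and an ack
+	 * trimmed this mbuf down to 1024, soff moves back by 3072 so
+	 * (rsm->m, rsm->soff) still points at the same sequence byte.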
+ */ + rsm->soff -= (rsm->orig_m_len - rsm->m->m_len); + rsm->orig_m_len = rsm->m->m_len; + } +} + +static void +rack_setup_offset_for_rsm(struct rack_sendmap *src_rsm, struct rack_sendmap *rsm) +{ + struct mbuf *m; + uint32_t soff; + + if (src_rsm->orig_m_len != src_rsm->m->m_len) { + /* Fix up the orig_m_len and possibly the mbuf offset */ + rack_adjust_orig_mlen(src_rsm); + } + m = src_rsm->m; + soff = src_rsm->soff + (src_rsm->r_end - src_rsm->r_start); + while (soff >= m->m_len) { + /* Move out past this mbuf */ + soff -= m->m_len; + m = m->m_next; + KASSERT((m != NULL), + ("rsm:%p nrsm:%p hit at soff:%u null m", + src_rsm, rsm, soff)); + } + rsm->m = m; + rsm->soff = soff; + rsm->orig_m_len = m->m_len; +} + static __inline void rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, struct rack_sendmap *rsm, uint32_t start) @@ -5063,13 +6047,23 @@ nrsm->r_rtr_cnt = rsm->r_rtr_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_dupack = rsm->r_dupack; - nrsm->usec_orig_send = rsm->usec_orig_send; + nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; nrsm->r_rtr_bytes = 0; rsm->r_end = nrsm->r_start; nrsm->r_just_ret = rsm->r_just_ret; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; } + /* + * Now we need to find nrsm's new location in the mbuf chain + * we basically calculate a new offset, which is soff + + * how much is left in original rsm. Then we walk out the mbuf + * chain to find the righ postion, it may be the same mbuf + * or maybe not. + */ + KASSERT((rsm->m != NULL), + ("rsm:%p nrsm:%p rack:%p -- rsm->m is NULL?", rsm, nrsm, rack)); + rack_setup_offset_for_rsm(rsm, nrsm); } static struct rack_sendmap * @@ -5089,6 +6083,8 @@ */ struct rack_sendmap *rm; + rack_log_map_chg(rack->rc_tp, rack, NULL, + l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); l_rsm->r_end = r_rsm->r_end; if (l_rsm->r_dupack < r_rsm->r_dupack) l_rsm->r_dupack = r_rsm->r_dupack; @@ -5132,7 +6128,7 @@ l_rsm->r_limit_type = 0; } rack_free(rack, r_rsm); - return(l_rsm); + return (l_rsm); } /* @@ -5152,7 +6148,7 @@ struct rack_sendmap *rsm = NULL; struct rack_sendmap *insret; struct socket *so; - uint32_t amm, old_prr_snd = 0; + uint32_t amm; uint32_t out, avail; int collapsed_win = 0; @@ -5173,6 +6169,8 @@ * need to figure out how to force a full MSS segment out. 
*/ rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); @@ -5204,10 +6202,9 @@ /* not enough to fill a MTU */ goto need_retran; } - if (IN_RECOVERY(tp->t_flags)) { + if (IN_FASTRECOVERY(tp->t_flags)) { /* Unlikely */ if (rack->rack_no_prr == 0) { - old_prr_snd = rack->r_ctl.rc_prr_sndcnt; if (out + amm <= tp->snd_wnd) { rack->r_ctl.rc_prr_sndcnt = amm; rack_log_to_prr(rack, 4, 0); @@ -5286,6 +6283,7 @@ } rack_clone_rsm(rack, nrsm, rsm, (rsm->r_end - ctf_fixed_maxseg(tp))); + rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); #ifdef INVARIANTS if (insret != NULL) { @@ -5374,7 +6372,7 @@ */ if (tp->t_rxtshift == TCP_MAXRXTSHIFT && (ticks - tp->t_rcvtime >= tcp_maxpersistidle || - ticks - tp->t_rcvtime >= TCP_REXMTVAL(tp) * tcp_totbackoff)) { + TICKS_2_USEC(ticks - tp->t_rcvtime) >= RACK_REXMTVAL(tp) * tcp_totbackoff)) { KMOD_TCPSTAT_INC(tcps_persistdrop); retval = 1; tcp_log_end_status(tp, TCP_EI_STATUS_PERSIST_MAX); @@ -5495,52 +6493,74 @@ */ struct rack_sendmap *rsm, *trsm = NULL; struct tcp_rack *rack; - int32_t cnt = 0; rack = (struct tcp_rack *)tp->t_fb_ptr; - rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); + rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. + * * However, if we do that we would burst out * all that data 1ms apart. This would be unwise, * so for now we will just let the normal rxt timer * and tlp timer take care of it. + * + * Also we really need to stick them back in sequence + * order. This way we send in the proper order and any + * sacks that come floating in will "re-ack" the data. + * To do this we zap the tmap with an INIT and then + * walk through and place every rsm in the RB tree + * back in its seq ordered place. 
*/ + TAILQ_INIT(&rack->r_ctl.rc_tmap); RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { - if (rsm->r_flags & RACK_ACKED) { - cnt++; - rsm->r_dupack = 0; - rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); - if (rsm->r_in_tmap == 0) { - /* We must re-add it back to the tlist */ - if (trsm == NULL) { - TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); - } else { - TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); - } - rsm->r_in_tmap = 1; - } + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + /* We must re-add it back to the tlist */ + if (trsm == NULL) { + TAILQ_INSERT_HEAD(&rack->r_ctl.rc_tmap, rsm, r_tnext); + } else { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } + rsm->r_in_tmap = 1; trsm = rsm; if (rsm->r_flags & RACK_ACKED) rsm->r_flags |= RACK_WAS_ACKED; rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ + rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; rack->r_ctl.rc_sacked = 0; + rack->r_ctl.rc_sacklast = NULL; rack->r_ctl.rc_agg_delayed = 0; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; rack->r_late = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_resend = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rack->r_ctl.rc_resend != NULL) + rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 6, 0); rack->r_timer_override = 1; + if ((((tp->t_flags & TF_SACK_PERMIT) == 0) +#ifdef NETFLIX_EXP_DETECTION + || (rack->sack_attack_disable != 0) +#endif + ) && ((tp->t_flags & TF_SENTFIN) == 0)) { + /* + * For non-sack customers new data + * needs to go out as retransmits until + * we retransmit up to snd_max. + */ + rack->r_must_retran = 1; + rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, + rack->r_ctl.rc_sacked); + } + rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; } static void @@ -5590,11 +6610,44 @@ return (1); } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_RXT; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; + if (IN_FASTRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASFRECOVERY; + else + tp->t_flags &= ~TF_WASFRECOVERY; + if (IN_CONGRECOVERY(tp->t_flags)) + tp->t_flags |= TF_WASCRECOVERY; + else + tp->t_flags &= ~TF_WASCRECOVERY; if (TCPS_HAVEESTABLISHED(tp->t_state) && (tp->snd_una == tp->snd_max)) { /* Nothing outstanding .. nothing to do */ return (0); } + /* + * Rack can only run one timer at a time, so we cannot + * run a KEEPINIT (gating SYN sending) and a retransmit + * timer for the SYN. So if we are in a front state and + * have a KEEPINIT timer we need to check the first transmit + * against now to see if we have exceeded the KEEPINIT time + * (if one is set). + */ + if ((TCPS_HAVEESTABLISHED(tp->t_state) == 0) && + (TP_KEEPINIT(tp) != 0)) { + struct rack_sendmap *rsm; + + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm) { + /* Ok we have something outstanding to test keepinit with */ + if ((TSTMP_GT(cts, (uint32_t)rsm->r_tim_lastsent[0])) && + ((cts - (uint32_t)rsm->r_tim_lastsent[0]) >= TICKS_2_USEC(TP_KEEPINIT(tp)))) { + /* We have exceeded the KEEPINIT time */ + tcp_log_end_status(tp, TCP_EI_STATUS_KEEP_MAX); + goto drop_it; + } + } + } /* * Retransmission timer went off. Message has not been acked within * retransmit interval. 
Back off to a longer retransmit interval @@ -5612,10 +6665,11 @@ tp->t_rxtshift++; } if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { + tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); +drop_it: tp->t_rxtshift = TCP_MAXRXTSHIFT; KMOD_TCPSTAT_INC(tcps_timeoutdrop); retval = 1; - tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN); tcp_set_inp_to_drop(rack->rc_inp, (tp->t_softerror ? (uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; @@ -5639,27 +6693,19 @@ tp->snd_cwnd_prev = tp->snd_cwnd; tp->snd_ssthresh_prev = tp->snd_ssthresh; tp->snd_recover_prev = tp->snd_recover; - if (IN_FASTRECOVERY(tp->t_flags)) - tp->t_flags |= TF_WASFRECOVERY; - else - tp->t_flags &= ~TF_WASFRECOVERY; - if (IN_CONGRECOVERY(tp->t_flags)) - tp->t_flags |= TF_WASCRECOVERY; - else - tp->t_flags &= ~TF_WASCRECOVERY; - tp->t_badrxtwin = ticks + (tp->t_srtt >> (TCP_RTT_SHIFT + 1)); + tp->t_badrxtwin = ticks + (USEC_2_TICKS(tp->t_srtt)/2); tp->t_flags |= TF_PREVVALID; - } else + } else if ((tp->t_flags & TF_RCVD_TSTMP) == 0) tp->t_flags &= ~TF_PREVVALID; KMOD_TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); + rexmt = RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]; else - rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; - TCPT_RANGESET(tp->t_rxtcur, rexmt, - max(MSEC_2_TICKS(rack_rto_min), rexmt), - MSEC_2_TICKS(rack_rto_max)); + rexmt = max(rack_rto_min, (tp->t_srtt + (tp->t_rttvar << 2))) * tcp_backoff[tp->t_rxtshift]; + + RACK_TCPT_RANGESET(tp->t_rxtcur, rexmt, + max(rack_rto_min, rexmt), rack_rto_max); /* * We enter the path for PLMTUD if connection is established or, if * connection is FIN_WAIT_1 status, reason for the last is that if @@ -5759,7 +6805,17 @@ } } /* - * If we backed off this far, our srtt estimate is probably bogus. + * Disable RFC1323 and SACK if we haven't got any response to + * our third SYN to work-around some broken terminal servers + * (most of which have hopefully been retired) that have bad VJ + * header compression code which trashes TCP segments containing + * unknown-to-them TCP options. + */ + if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && + (tp->t_rxtshift == 3)) + tp->t_flags &= ~(TF_REQ_SCALE|TF_REQ_TSTMP|TF_SACK_PERMIT); + /* + * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit * times until then. 
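
The retransmit backoff above now works directly in microseconds. A stand-alone sketch of the arithmetic follows, assuming the classic srtt + 4*rttvar RTO, a doubling backoff table, and hypothetical min/max bounds; the constants and helper name are mine, not the kernel macros.

#include <stdint.h>

/* hypothetical stand-ins for rack_rto_min/rack_rto_max (microseconds) */
#define RTO_MIN_US	   30000	/* 30 ms */
#define RTO_MAX_US	30000000	/* 30 s  */

static const int backoff[] = { 1, 2, 4, 8, 16, 32, 64, 64, 64, 64, 64, 64, 64 };

/* Compute the next retransmit timeout, all values in microseconds. */
static uint32_t
rack_style_rto(uint32_t srtt_us, uint32_t rttvar_us, int rxtshift)
{
	uint64_t rexmt;

	rexmt = srtt_us + 4 * (uint64_t)rttvar_us;
	if (rexmt < RTO_MIN_US)
		rexmt = RTO_MIN_US;	/* floor before applying the backoff */
	rexmt *= backoff[rxtshift];

	/* RANGESET-style clamp */
	if (rexmt < RTO_MIN_US)
		rexmt = RTO_MIN_US;
	if (rexmt > RTO_MAX_US)
		rexmt = RTO_MAX_US;
	return ((uint32_t)rexmt);
}
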
@@ -5771,14 +6827,14 @@ else #endif in_losing(tp->t_inpcb); - tp->t_rttvar += (tp->t_srtt >> TCP_RTT_SHIFT); + tp->t_rttvar += tp->t_srtt; tp->t_srtt = 0; } sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); tp->snd_recover = tp->snd_max; tp->t_flags |= TF_ACKNOW; tp->t_rtttime = 0; - rack_cong_signal(tp, NULL, CC_RTO); + rack_cong_signal(tp, CC_RTO, tp->snd_una); out: return (retval); } @@ -5848,12 +6904,14 @@ ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { rack->r_ctl.rc_tlp_rxt_last_time = cts; + rack->r_fast_output = 0; ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { rack->r_ctl.rc_tlp_rxt_last_time = cts; + rack->r_fast_output = 0; ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); @@ -5949,9 +7007,10 @@ static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t ts) + struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag) { int32_t idx; + uint16_t stripped_flags; rsm->r_rtr_cnt++; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); @@ -5966,6 +7025,7 @@ } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; + stripped_flags = rsm->r_flags & ~(RACK_SENT_SP|RACK_SENT_FP); if (rsm->r_flags & RACK_ACKED) { /* Problably MTU discovery messing with us */ rsm->r_flags &= ~RACK_ACKED; @@ -5986,7 +7046,7 @@ static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint32_t ts, int32_t *lenp) + struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag) { /* * We (re-)transmitted starting at rsm->r_start for some length @@ -6003,7 +7063,7 @@ * We retransmitted the whole piece or more than the whole * slopping into the next rsm. */ - rack_update_rsm(tp, rack, rsm, ts); + rack_update_rsm(tp, rack, rsm, ts, add_flag); if (c_end == rsm->r_end) { *lenp = 0; return (0); @@ -6051,15 +7111,17 @@ nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); - rack_update_rsm(tp, rack, rsm, ts); + rack_update_rsm(tp, rack, rsm, ts, add_flag); + /* Log a split of rsm into rsm and nrsm */ + rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); *lenp = 0; return (0); } static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, - uint32_t seq_out, uint8_t th_flags, int32_t err, uint32_t ts, - uint8_t pass, struct rack_sendmap *hintrsm, uint32_t us_cts) + uint32_t seq_out, uint8_t th_flags, int32_t err, uint64_t cts, + struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, uint32_t s_moff) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *insret, fe; @@ -6103,21 +7165,6 @@ } rack = (struct tcp_rack *)tp->t_fb_ptr; snd_una = tp->snd_una; - if (SEQ_LEQ((seq_out + len), snd_una)) { - /* Are sending an old segment to induce an ack (keep-alive)? */ - return; - } - if (SEQ_LT(seq_out, snd_una)) { - /* huh? should we panic? */ - uint32_t end; - - end = seq_out + len; - seq_out = snd_una; - if (SEQ_GEQ(end, seq_out)) - len = end - seq_out; - else - len = 0; - } snd_max = tp->snd_max; if (th_flags & (TH_SYN | TH_FIN)) { /* @@ -6138,12 +7185,27 @@ snd_max = tp->snd_nxt; } } + if (SEQ_LEQ((seq_out + len), snd_una)) { + /* Are sending an old segment to induce an ack (keep-alive)? */ + return; + } + if (SEQ_LT(seq_out, snd_una)) { + /* huh? should we panic? 
*/ + uint32_t end; + + end = seq_out + len; + seq_out = snd_una; + if (SEQ_GEQ(end, seq_out)) + len = end - seq_out; + else + len = 0; + } if (len == 0) { /* We don't log zero window probes */ return; } - rack->r_ctl.rc_time_last_sent = ts; - if (IN_RECOVERY(tp->t_flags)) { + rack->r_ctl.rc_time_last_sent = cts; + if (IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_prr_out += len; } /* First question is it a retransmission or new? */ @@ -6159,26 +7221,58 @@ return; } if (th_flags & TH_FIN) { - rsm->r_flags = RACK_HAS_FIN; + rsm->r_flags = RACK_HAS_FIN|add_flag; } else { - rsm->r_flags = 0; + rsm->r_flags = add_flag; } - rsm->r_tim_lastsent[0] = ts; + rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; - rsm->usec_orig_send = us_cts; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ - rsm->r_flags |= RACK_HAS_SIN; - rsm->r_start = seq_out + 1; - rsm->r_end = rsm->r_start + (len - 1); - } else { - /* Normal case */ - rsm->r_start = seq_out; - rsm->r_end = rsm->r_start + len; + rsm->r_flags |= RACK_HAS_SYN; } + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; rsm->r_dupack = 0; + /* + * save off the mbuf location that + * sndmbuf_noadv returned (which is + * where we started copying from).. + */ + rsm->m = s_mb; + rsm->soff = s_moff; + /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ + if (rsm->m) { + if (rsm->m->m_len <= rsm->soff) { + /* + * XXXrrs Question, will this happen? + * + * If sbsndptr is set at the correct place + * then s_moff should always be somewhere + * within rsm->m. But if the sbsndptr was + * off then that won't be true. If it occurs + * we need to walkout to the correct location. + */ + struct mbuf *lm; + + lm = rsm->m; + while (lm->m_len <= rsm->soff) { + rsm->soff -= lm->m_len; + lm = lm->m_next; + KASSERT(lm != NULL, ("%s rack:%p lm goes null orig_off:%u origmb:%p rsm->soff:%u", + __func__, rack, s_moff, s_mb, rsm->soff)); + } + rsm->m = lm; + counter_u64_add(rack_sbsndptr_wrong, 1); + } else + counter_u64_add(rack_sbsndptr_right, 1); + rsm->orig_m_len = rsm->m->m_len; + } else + rsm->orig_m_len = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + /* Log a new rsm */ + rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_NEW, 0, __LINE__); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (insret != NULL) { @@ -6194,7 +7288,7 @@ * * If this is true mark it so. 
*/ - if ((IN_RECOVERY(tp->t_flags) == 0) && + if ((IN_FASTRECOVERY(tp->t_flags) == 0) && (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) == ctf_fixed_maxseg(tp))) { struct rack_sendmap *prsm; @@ -6217,7 +7311,7 @@ rsm = NULL; } if ((rsm) && (rsm->r_start == seq_out)) { - seq_out = rack_update_entry(tp, rack, rsm, ts, &len); + seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); if (len == 0) { return; } else { @@ -6230,7 +7324,7 @@ rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); if (rsm) { if (rsm->r_start == seq_out) { - seq_out = rack_update_entry(tp, rack, rsm, ts, &len); + seq_out = rack_update_entry(tp, rack, rsm, cts, &len, add_flag); if (len == 0) { return; } else { @@ -6245,7 +7339,7 @@ */ nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { - rack_update_rsm(tp, rack, rsm, ts); + rack_update_rsm(tp, rack, rsm, cts, add_flag); return; } /* @@ -6254,6 +7348,7 @@ */ rack_clone_rsm(rack, nrsm, rsm, seq_out); insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, nrsm); + rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SPLIT, 0, __LINE__); #ifdef INVARIANTS if (insret != NULL) { panic("Insert in rb tree of %p fails ret:%p rack:%p rsm:%p", @@ -6265,7 +7360,7 @@ nrsm->r_in_tmap = 1; } rsm->r_flags &= (~RACK_HAS_FIN); - seq_out = rack_update_entry(tp, rack, nrsm, ts, &len); + seq_out = rack_update_entry(tp, rack, nrsm, cts, &len, add_flag); if (len == 0) { return; } else if (len > 0) @@ -6281,15 +7376,15 @@ } else if (SEQ_LT(seq_out, tp->snd_max)) { #ifdef INVARIANTS printf("seq_out:%u len:%d snd_una:%u snd_max:%u -- but rsm not found?\n", - seq_out, len, tp->snd_una, tp->snd_max); + seq_out, len, tp->snd_una, tp->snd_max); printf("Starting Dump of all rack entries\n"); RB_FOREACH(rsm, rack_rb_tree_head, &rack->r_ctl.rc_mtree) { printf("rsm:%p start:%u end:%u\n", - rsm, rsm->r_start, rsm->r_end); + rsm, rsm->r_start, rsm->r_end); } printf("Dump complete\n"); panic("seq_out not found rack:%p tp:%p", - rack, tp); + rack, tp); #endif } else { #ifdef INVARIANTS @@ -6298,7 +7393,7 @@ * flag) */ panic("seq_out:%u(%d) is beyond snd_max:%u tp:%p", - seq_out, len, tp->snd_max, tp); + seq_out, len, tp->snd_max, tp); #endif } } @@ -6456,39 +7551,32 @@ rack->r_ctl.rc_lowest_us_rtt = 1; } } - rack_log_rtt_sample(rack, rtt); o_srtt = tp->t_srtt; o_var = tp->t_rttvar; rack = (struct tcp_rack *)tp->t_fb_ptr; if (tp->t_srtt != 0) { /* - * srtt is stored as fixed point with 5 bits after the - * binary point (i.e., scaled by 8). The following magic is - * equivalent to the smoothing algorithm in rfc793 with an - * alpha of .875 (srtt = rtt/8 + srtt*7/8 in fixed point). - * Adjust rtt to origin 0. + * We keep a simple srtt in microseconds, like our rtt + * measurement. We don't need to do any tricks with shifting + * etc. Instead we just add in 1/8th of the new measurement + * and subtract out 1/8 of the old srtt. We do the same with + * the variance after finding the absolute value of the + * difference between this sample and the current srtt. */ - delta = ((rtt - 1) << TCP_DELTA_SHIFT) - - (tp->t_srtt >> (TCP_RTT_SHIFT - TCP_DELTA_SHIFT)); - - tp->t_srtt += delta; + delta = tp->t_srtt - rtt; + /* Take off 1/8th of the current sRTT */ + tp->t_srtt -= (tp->t_srtt >> 3); + /* Add in 1/8th of the new RTT just measured */ + tp->t_srtt += (rtt >> 3); if (tp->t_srtt <= 0) tp->t_srtt = 1; - - /* - * We accumulate a smoothed rtt variance (actually, a - * smoothed mean difference), then set the retransmit timer - * to smoothed rtt + 4 times the smoothed variance. 
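
With the fixed-point shifting gone, the new smoothing is just two 1/8-gain filters running on microsecond values. A stand-alone sketch of the same arithmetic, with a hypothetical struct and helper name rather than the kernel code:

#include <stdint.h>
#include <stdlib.h>

struct rtt_est {		/* both fields in microseconds */
	uint32_t srtt;
	uint32_t rttvar;
};

static void
rtt_sample(struct rtt_est *e, uint32_t rtt_us)
{
	int32_t delta;

	if (e->srtt == 0) {
		/* First sample: srtt = rtt, rttvar = rtt/2 (first RTO ~ 3*rtt). */
		e->srtt = rtt_us;
		e->rttvar = rtt_us >> 1;
		return;
	}
	delta = (int32_t)(e->srtt - rtt_us);	/* taken before srtt is updated */
	e->srtt -= e->srtt >> 3;		/* drop 1/8 of the old srtt   */
	e->srtt += rtt_us >> 3;			/* add 1/8 of the new sample  */
	if (e->srtt == 0)
		e->srtt = 1;
	e->rttvar -= e->rttvar >> 3;		/* same 1/8 gain on |delta|   */
	e->rttvar += abs(delta) >> 3;
	if (e->rttvar == 0)
		e->rttvar = 1;
}

In microseconds this converges to the same 7/8 smoothing factor the old scaled representation implemented, just without the TCP_RTT_SHIFT bookkeeping.
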
rttvar - * is stored as fixed point with 4 bits after the binary - * point (scaled by 16). The following is equivalent to - * rfc793 smoothing with an alpha of .75 (rttvar = - * rttvar*3/4 + |delta| / 4). This replaces rfc793's - * wired-in beta. - */ + /* Now lets make the absolute value of the variance */ if (delta < 0) delta = -delta; - delta -= tp->t_rttvar >> (TCP_RTTVAR_SHIFT - TCP_DELTA_SHIFT); - tp->t_rttvar += delta; + /* Subtract out 1/8th */ + tp->t_rttvar -= (tp->t_rttvar >> 3); + /* Add in 1/8th of the new variance we just saw */ + tp->t_rttvar += (delta >> 3); if (tp->t_rttvar <= 0) tp->t_rttvar = 1; if (tp->t_rttbest > tp->t_srtt + tp->t_rttvar) @@ -6499,17 +7587,37 @@ * variance to half the rtt (so our first retransmit happens * at 3*rtt). */ - tp->t_srtt = rtt << TCP_RTT_SHIFT; - tp->t_rttvar = rtt << (TCP_RTTVAR_SHIFT - 1); + tp->t_srtt = rtt; + tp->t_rttvar = rtt >> 1; tp->t_rttbest = tp->t_srtt + tp->t_rttvar; } + rack->rc_srtt_measure_made = 1; KMOD_TCPSTAT_INC(tcps_rttupdated); tp->t_rttupdated++; #ifdef STATS - stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); -#endif - tp->t_rxtshift = 0; + if (rack_stats_gets_ms_rtt == 0) { + /* Send in the microsecond rtt used for rxt timeout purposes */ + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rtt)); + } else if (rack_stats_gets_ms_rtt == 1) { + /* Send in the millisecond rtt used for rxt timeout purposes */ + int32_t ms_rtt; + + /* Round up */ + ms_rtt = (rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); + } else if (rack_stats_gets_ms_rtt == 2) { + /* Send in the millisecond rtt has close to the path RTT as we can get */ + int32_t ms_rtt; + + /* Round up */ + ms_rtt = (rack->r_ctl.rack_rs.rs_us_rtt + HPTS_USEC_IN_MSEC - 1) / HPTS_USEC_IN_MSEC; + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, ms_rtt)); + } else { + /* Send in the microsecond rtt has close to the path RTT as we can get */ + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); + } +#endif /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 @@ -6520,63 +7628,13 @@ * statistical, we have to test that we don't drop below the minimum * feasible timer (which is 2 ticks). */ - TCPT_RANGESET(tp->t_rxtcur, TCP_REXMTVAL(tp), - max(MSEC_2_TICKS(rack_rto_min), rtt + 2), MSEC_2_TICKS(rack_rto_max)); + tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + max(rack_rto_min, rtt + 2), rack_rto_max); + rack_log_rtt_sample(rack, rtt); tp->t_softerror = 0; } -static void -rack_earlier_retran(struct tcpcb *tp, struct rack_sendmap *rsm, - uint32_t t, uint32_t cts) -{ - /* - * For this RSM, we acknowledged the data from a previous - * transmission, not the last one we made. This means we did a false - * retransmit. - */ - struct tcp_rack *rack; - - if (rsm->r_flags & RACK_HAS_FIN) { - /* - * The sending of the FIN often is multiple sent when we - * have everything outstanding ack'd. We ignore this case - * since its over now. - */ - return; - } - if (rsm->r_flags & RACK_TLP) { - /* - * We expect TLP's to have this occur. - */ - return; - } - rack = (struct tcp_rack *)tp->t_fb_ptr; - /* should we undo cc changes and exit recovery? 
*/ - if (IN_RECOVERY(tp->t_flags)) { - if (rack->r_ctl.rc_rsm_start == rsm->r_start) { - /* - * Undo what we ratched down and exit recovery if - * possible - */ - EXIT_RECOVERY(tp->t_flags); - tp->snd_recover = tp->snd_una; - if (rack->r_ctl.rc_cwnd_at > tp->snd_cwnd) - tp->snd_cwnd = rack->r_ctl.rc_cwnd_at; - if (rack->r_ctl.rc_ssthresh_at > tp->snd_ssthresh) - tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at; - } - } - if (rsm->r_flags & RACK_WAS_SACKPASS) { - /* - * We retransmitted based on a sack and the earlier - * retransmission ack'd it - re-ordering is occuring. - */ - counter_u64_add(rack_reorder_seen, 1); - rack->r_ctl.rc_reorder_ts = cts; - } - counter_u64_add(rack_badfr, 1); - counter_u64_add(rack_badfr_bytes, (rsm->r_end - rsm->r_start)); -} static void rack_apply_updated_usrtt(struct tcp_rack *rack, uint32_t us_rtt, uint32_t us_cts) @@ -6632,25 +7690,33 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack) { - int32_t i; + int32_t i, all; uint32_t t, len_acked; if ((rsm->r_flags & RACK_ACKED) || (rsm->r_flags & RACK_WAS_ACKED)) /* Already done */ return (0); - + if (rsm->r_no_rtt_allowed) { + /* Not allowed */ + return (0); + } if (ack_type == CUM_ACKED) { - if (SEQ_GT(th_ack, rsm->r_end)) + if (SEQ_GT(th_ack, rsm->r_end)) { len_acked = rsm->r_end - rsm->r_start; - else + all = 1; + } else { len_acked = th_ack - rsm->r_start; - } else + all = 0; + } + } else { len_acked = rsm->r_end - rsm->r_start; + all = 0; + } if (rsm->r_rtr_cnt == 1) { uint32_t us_rtt; - t = cts - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + t = cts - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; if ((int)t <= 0) t = 1; if (!tp->t_rttlow || tp->t_rttlow > t) @@ -6662,43 +7728,72 @@ rack->r_ctl.rc_rack_min_rtt = 1; } } - us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - rsm->usec_orig_send; + if (TSTMP_GT(tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time), rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) + us_rtt = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + else + us_rtt = tcp_get_usecs(NULL) - (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; if (us_rtt == 0) us_rtt = 1; rack_apply_updated_usrtt(rack, us_rtt, tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time)); - if (ack_type == SACKED) + if (ack_type == SACKED) { + rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 1); tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, 2 , rsm, rsm->r_rtr_cnt); - else { + } else { /* - * For cum-ack we are only confident if what - * is being acked is included in a measurement. - * Otherwise it could be an idle period that - * includes Delayed-ack time. + * We need to setup what our confidence + * is in this ack. + * + * If the rsm was app limited and it is + * less than a mss in length (the end + * of the send) then we have a gap. If we + * were app limited but say we were sending + * multiple MSS's then we are more confident + * int it. + * + * When we are not app-limited then we see if + * the rsm is being included in the current + * measurement, we tell this by the app_limited_needs_set + * flag. + * + * Note that being cwnd blocked is not applimited + * as well as the pacing delay between packets which + * are sending only 1 or 2 MSS's also will show up + * in the RTT. We probably need to examine this algorithm + * a bit more and enhance it to account for the delay + * between rsm's. 
We could do that by saving off the + * pacing delay of each rsm (in an rsm) and then + * factoring that in somehow though for now I am + * not sure how :) */ + int calc_conf = 0; + + if (rsm->r_flags & RACK_APP_LIMITED) { + if (all && (len_acked <= ctf_fixed_maxseg(tp))) + calc_conf = 0; + else + calc_conf = 1; + } else if (rack->app_limited_needs_set == 0) { + calc_conf = 1; + } else { + calc_conf = 0; + } + rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)], cts, 2); tcp_rack_xmit_timer(rack, t + 1, len_acked, us_rtt, - (rack->app_limited_needs_set ? 0 : 1), rsm, rsm->r_rtr_cnt); + calc_conf, rsm, rsm->r_rtr_cnt); } if ((rsm->r_flags & RACK_TLP) && - (!IN_RECOVERY(tp->t_flags))) { + (!IN_FASTRECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { rack->r_ctl.rc_rsm_start = tp->snd_max; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; - rack_cong_signal(tp, NULL, CC_NDUPACK); - /* - * When we enter recovery we need to assure - * we send one packet. - */ - if (rack->rack_no_prr == 0) { - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 7, 0); - } + rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); } } - if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ - rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } return (1); @@ -6709,8 +7804,10 @@ * so we need to clear these to avoid incorrect handling. */ tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); tp->t_softerror = 0; - if ((to->to_flags & TOF_TS) && + if (to && (to->to_flags & TOF_TS) && (ack_type == CUM_ACKED) && (to->to_tsecr) && ((rsm->r_flags & RACK_OVERMAX) == 0)) { @@ -6719,13 +7816,18 @@ * must be coming from a previous transmission. */ for (i = 0; i < rsm->r_rtr_cnt; i++) { - if (rsm->r_tim_lastsent[i] == to->to_tsecr) { - t = cts - rsm->r_tim_lastsent[i]; + if (rack_ts_to_msec(rsm->r_tim_lastsent[i]) == to->to_tsecr) { + t = cts - (uint32_t)rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if ((i + 1) < rsm->r_rtr_cnt) { - /* Likely */ - rack_earlier_retran(tp, rsm, t, cts); + /* + * The peer ack'd from our previous + * transmission. We have a spurious + * retransmission and thus we dont + * want to update our rack_rtt. 
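
In other words, the echoed timestamp is matched against each recorded transmit time, and a hit on anything but the most recent transmission marks the sample as spurious. A stand-alone sketch of that matching, assuming millisecond-granularity echoes (the real code compares to_tsecr against rack_ts_to_msec(rsm->r_tim_lastsent[i])):

#include <stdint.h>

/* Return the index of the transmission the echoed timestamp matches, or -1. */
static int
match_tsecr(const uint64_t *sent_us, int nsent, uint32_t tsecr)
{
	for (int i = 0; i < nsent; i++) {
		/* timestamps are echoed at millisecond granularity */
		if ((uint32_t)(sent_us[i] / 1000) == tsecr)
			return (i);
	}
	return (-1);
}

A cum-ack whose tsecr matches index i < nsent - 1 was generated by an earlier copy of the data, so the retransmission was spurious and the sample should not feed the RACK RTT.
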
+ */ + return (0); } if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; @@ -6736,12 +7838,13 @@ } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, - rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { + (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ - rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; + rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]; rack->rc_rack_rtt = t; } - tcp_rack_xmit_timer(rack, t + 1, len_acked, (t * HPTS_USEC_IN_MSEC), 0, rsm, + rack_log_rtt_sample_calc(rack, t, (uint32_t)rsm->r_tim_lastsent[i], cts, 3); + tcp_rack_xmit_timer(rack, t + 1, len_acked, t, 0, rsm, rsm->r_rtr_cnt); return (1); } @@ -6756,19 +7859,20 @@ */ ts_not_found: i = rsm->r_rtr_cnt - 1; - t = cts - rsm->r_tim_lastsent[i]; + t = cts - (uint32_t)rsm->r_tim_lastsent[i]; if ((int)t <= 0) t = 1; if (rack->r_ctl.rc_rack_min_rtt && SEQ_LT(t, rack->r_ctl.rc_rack_min_rtt)) { /* * We retransmitted and the ack came back in less * than the smallest rtt we have observed. We most - * likey did an improper retransmit as outlined in - * 4.2 Step 3 point 2 in the rack-draft. + * likely did an improper retransmit as outlined in + * 6.2 Step 2 point 2 in the rack-draft so we + * don't want to update our rack_rtt. We in + * theory (in future) might want to think about reverting our + * cwnd state but we won't for now. */ - i = rsm->r_rtr_cnt - 2; - t = cts - rsm->r_tim_lastsent[i]; - rack_earlier_retran(tp, rsm, t, cts); + return (0); } else if (rack->r_ctl.rc_rack_min_rtt) { /* * We retransmitted it and the retransmit did the @@ -6781,9 +7885,9 @@ rack->r_ctl.rc_rack_min_rtt = 1; } } - if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[i])) { + if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, (uint32_t)rsm->r_tim_lastsent[i])) { /* New more recent rack_tmit_time */ - rack->r_ctl.rc_rack_tmit_time = rsm->r_tim_lastsent[i]; + rack->r_ctl.rc_rack_tmit_time = (uint32_t)rsm->r_tim_lastsent[i]; rack->rc_rack_rtt = t; } return (1); @@ -6877,7 +7981,7 @@ * measurement not starts. */ tp->gput_seq = rsm->r_start; - rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; } if ((use_which == RACK_USE_END) && SEQ_GEQ(rsm->r_end, tp->gput_seq)) { @@ -6893,7 +7997,7 @@ * in our measurement. */ tp->gput_seq = rsm->r_end; - rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; } if (use_which == RACK_USE_END_OR_THACK) { /* @@ -6906,7 +8010,7 @@ tp->gput_seq = th_ack; else tp->gput_seq = rsm->r_end; - rack->r_ctl.rc_gp_output_ts = rsm->usec_orig_send; + rack->r_ctl.rc_gp_output_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; } if (SEQ_GT(tp->gput_seq, tp->gput_ack)) { /* @@ -6933,15 +8037,25 @@ ((tp->gput_ack - tp->gput_seq) < max(rc_init_window(rack), (MIN_GP_WIN * ctf_fixed_maxseg(tp))))) { - /* - * There is no sense of continuing this measurement - * because its too small to gain us anything we - * trust. Skip it and that way we can start a new - * measurement quicker. - */ - rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, - 0, 0, 0, 6, __LINE__, NULL); - tp->t_flags &= ~TF_GPUTINPROG; + uint32_t ideal_amount; + + ideal_amount = rack_get_measure_window(tp, rack); + if (ideal_amount > sbavail(&tp->t_inpcb->inp_socket->so_snd)) { + /* + * There is no sense of continuing this measurement + * because its too small to gain us anything we + * trust. 
Skip it and that way we can start a new + * measurement quicker. + */ + tp->t_flags &= ~TF_GPUTINPROG; + rack_log_pacing_delay_calc(rack, tp->gput_ack, tp->gput_seq, + 0, 0, 0, 6, __LINE__, NULL); + } else { + /* + * Reset the window further out. + */ + tp->gput_ack = tp->gput_seq + ideal_amount; + } } } } @@ -7021,6 +8135,10 @@ /* Now adjust our tree blocks */ rsm->r_end = start; next->r_start = start; + /* Now we must adjust back where next->m is */ + rack_setup_offset_for_rsm(rsm, next); + + /* We don't need to adjust rsm, it did not change */ /* Clear out the dup ack count of the remainder */ rsm->r_dupack = 0; rsm->r_just_ret = 0; @@ -7062,6 +8180,7 @@ /* Done with block */ goto out; } + rack_log_map_chg(tp, rack, &stack_map, rsm, next, MAP_SACK_M1, end, __LINE__); counter_u64_add(rack_sack_used_next_merge, 1); /* Postion for the next block */ start = next->r_end; @@ -7108,6 +8227,7 @@ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; } + rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M2, end, __LINE__); rsm->r_flags &= (~RACK_HAS_FIN); /* Position us to point to the new nrsm that starts the sack blk */ rsm = nrsm; @@ -7161,19 +8281,20 @@ } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); - rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; } + rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_SACK_M3, end, __LINE__); } else { counter_u64_add(rack_sack_skipped_acked, 1); moved++; } if (end == rsm->r_end) { - /* This block only - done, setup for next */ + /* This block only - done, setup for next */ goto out; } /* @@ -7221,6 +8342,11 @@ nrsm->r_end = end; rsm->r_dupack = 0; rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + /* + * Now that the rsm has had its start moved forward + * lets go ahead and get its new place in the world. 
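
For the call just below, a tiny worked example of what the offset walk ends up doing, with illustrative numbers only: if the leading piece starts at offset 1200 of an mbuf chain whose m_len values are 1448, 1448, ... and covers 1448 bytes, the trailing piece starts at 1200 + 1448 = 2648, which walks one mbuf forward and leaves offset 2648 - 1448 = 1200 in the second mbuf.

#include <stdint.h>

/* Walk a chain of segment lengths to find (index, offset) for byte 'soff'. */
static void
walk_chain(const uint32_t *m_len, int nmbufs, uint32_t soff,
    int *idx, uint32_t *off)
{
	int i = 0;

	while (i < nmbufs && soff >= m_len[i]) {
		soff -= m_len[i];
		i++;
	}
	*idx = i;
	*off = soff;
}
/* walk_chain((uint32_t[]){1448, 1448}, 2, 1200 + 1448, &i, &o) -> i = 1, o = 1200 */
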
+ */ + rack_setup_offset_for_rsm(prev, rsm); /* * Now nrsm is our new little piece * that is acked (which was merged @@ -7236,6 +8362,7 @@ counter_u64_add(rack_reorder_seen, 1); rack->r_ctl.rc_reorder_ts = cts; } + rack_log_map_chg(tp, rack, prev, &stack_map, rsm, MAP_SACK_M4, end, __LINE__); rsm = prev; counter_u64_add(rack_sack_used_prev_merge, 1); } else { @@ -7297,9 +8424,10 @@ } if (rack->app_limited_needs_set) rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); - rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; rsm->r_flags &= ~RACK_TLP; + rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; @@ -7448,218 +8576,497 @@ } static void -rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th) +rack_process_to_cumack(struct tcpcb *tp, struct tcp_rack *rack, register uint32_t th_ack, uint32_t cts, struct tcpopt *to) { - uint32_t changed, entered_recovery = 0; - struct tcp_rack *rack; struct rack_sendmap *rsm, *rm; - struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; - register uint32_t th_ack; - int32_t i, j, k, num_sack_blks = 0; - uint32_t cts, acked, ack_point, sack_changed = 0; - int loop_start = 0, moved_two = 0; - uint32_t tsused; - - INP_WLOCK_ASSERT(tp->t_inpcb); - if (th->th_flags & TH_RST) { - /* We don't log resets */ - return; - } - rack = (struct tcp_rack *)tp->t_fb_ptr; - cts = tcp_ts_getticks(); - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - changed = 0; - th_ack = th->th_ack; - if (rack->sack_attack_disable == 0) - rack_do_decay(rack); - if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { - /* - * You only get credit for - * MSS and greater (and you get extra - * credit for larger cum-ack moves). - */ - int ac; - ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); - rack->r_ctl.ack_count += ac; - counter_u64_add(rack_ack_total, ac); - } - if (rack->r_ctl.ack_count > 0xfff00000) { - /* - * reduce the number to keep us under - * a uint32_t. - */ - rack->r_ctl.ack_count /= 2; - rack->r_ctl.sack_count /= 2; - } - if (SEQ_GT(th_ack, tp->snd_una)) { - rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); - tp->t_acktime = ticks; - } - if (rsm && SEQ_GT(th_ack, rsm->r_start)) - changed = th_ack - rsm->r_start; - if (changed) { - /* - * The ACK point is advancing to th_ack, we must drop off - * the packets in the rack log and calculate any eligble - * RTT's. - */ - rack->r_wanted_output = 1; + /* + * The ACK point is advancing to th_ack, we must drop off + * the packets in the rack log and calculate any eligble + * RTT's. + */ + rack->r_wanted_output = 1; more: - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - if (rsm == NULL) { - if ((th_ack - 1) == tp->iss) { - /* - * For the SYN incoming case we will not - * have called tcp_output for the sending of - * the SYN, so there will be no map. All - * other cases should probably be a panic. - */ - goto proc_sack; - } - if (tp->t_flags & TF_SENTFIN) { - /* if we send a FIN we will not hav a map */ - goto proc_sack; - } + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm == NULL) { + if ((th_ack - 1) == tp->iss) { + /* + * For the SYN incoming case we will not + * have called tcp_output for the sending of + * the SYN, so there will be no map. All + * other cases should probably be a panic. 
+ */ + return; + } + if (tp->t_flags & TF_SENTFIN) { + /* if we sent a FIN we often will not have map */ + return; + } #ifdef INVARIANTS - panic("No rack map tp:%p for th:%p state:%d rack:%p snd_una:%u snd_max:%u snd_nxt:%u chg:%d\n", - tp, - th, tp->t_state, rack, - tp->snd_una, tp->snd_max, tp->snd_nxt, changed); + panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", + tp, + tp->t_state, th_ack, rack, + tp->snd_una, tp->snd_max, tp->snd_nxt); #endif - goto proc_sack; - } - if (SEQ_LT(th_ack, rsm->r_start)) { - /* Huh map is missing this */ + return; + } + if (SEQ_LT(th_ack, rsm->r_start)) { + /* Huh map is missing this */ #ifdef INVARIANTS - printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", - rsm->r_start, - th_ack, tp->t_state, rack->r_state); -#endif - goto proc_sack; - } - rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); - /* Now do we consume the whole thing? */ - if (SEQ_GEQ(th_ack, rsm->r_end)) { - /* Its all consumed. */ - uint32_t left; - uint8_t newly_acked; - - rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; - rsm->r_rtr_bytes = 0; - /* Record the time of highest cumack sent */ - rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; - rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); + printf("Rack map starts at r_start:%u for th_ack:%u huh? ts:%d rs:%d\n", + rsm->r_start, + th_ack, tp->t_state, rack->r_state); +#endif + return; + } + rack_update_rtt(tp, rack, rsm, to, cts, CUM_ACKED, th_ack); + /* Now do we consume the whole thing? */ + if (SEQ_GEQ(th_ack, rsm->r_end)) { + /* Its all consumed. */ + uint32_t left; + uint8_t newly_acked; + + rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); + rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; + rsm->r_rtr_bytes = 0; + /* Record the time of highest cumack sent */ + rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + rm = RB_REMOVE(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS - if (rm != rsm) { - panic("removing head in rack:%p rsm:%p rm:%p", - rack, rsm, rm); - } + if (rm != rsm) { + panic("removing head in rack:%p rsm:%p rm:%p", + rack, rsm, rm); + } #endif - if (rsm->r_in_tmap) { - TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); - rsm->r_in_tmap = 0; - } - newly_acked = 1; - if (rsm->r_flags & RACK_ACKED) { - /* - * It was acked on the scoreboard -- remove - * it from total - */ - rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); - newly_acked = 0; - } else if (rsm->r_flags & RACK_SACK_PASSED) { + if (rsm->r_in_tmap) { + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); + rsm->r_in_tmap = 0; + } + newly_acked = 1; + if (rsm->r_flags & RACK_ACKED) { + /* + * It was acked on the scoreboard -- remove + * it from total + */ + rack->r_ctl.rc_sacked -= (rsm->r_end - rsm->r_start); + newly_acked = 0; + } else if (rsm->r_flags & RACK_SACK_PASSED) { + /* + * There are segments ACKED on the + * scoreboard further up. We are seeing + * reordering. + */ + rsm->r_flags &= ~RACK_SACK_PASSED; + counter_u64_add(rack_reorder_seen, 1); + rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); + rsm->r_flags |= RACK_ACKED; + rack->r_ctl.rc_reorder_ts = cts; + if (rack->r_ent_rec_ns) { /* - * There are segments ACKED on the - * scoreboard further up. We are seeing - * reordering. + * We have sent no more, and we saw an sack + * then ack arrive. 
*/ - rsm->r_flags &= ~RACK_SACK_PASSED; - counter_u64_add(rack_reorder_seen, 1); - rsm->r_ack_arrival = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); - rsm->r_flags |= RACK_ACKED; - rack->r_ctl.rc_reorder_ts = cts; - } - left = th_ack - rsm->r_end; - if (rack->app_limited_needs_set && newly_acked) - rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); - /* Free back to zone */ - rack_free(rack, rsm); - if (left) { - goto more; + rack->r_might_revert = 1; } - goto proc_sack; } - if (rsm->r_flags & RACK_ACKED) { + if ((rsm->r_flags & RACK_TO_REXT) && + (tp->t_flags & TF_RCVD_TSTMP) && + (to->to_flags & TOF_TS) && + (tp->t_flags & TF_PREVVALID)) { /* - * It was acked on the scoreboard -- remove it from - * total for the part being cum-acked. + * We can use the timestamp to see + * if this retransmission was from the + * first transmit. If so we made a mistake. */ - rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); + tp->t_flags &= ~TF_PREVVALID; + if (to->to_tsecr == rack_ts_to_msec(rsm->r_tim_lastsent[0])) { + /* The first transmit is what this ack is for */ + rack_cong_signal(tp, CC_RTO_ERR, th_ack); + } } - /* - * Clear the dup ack count for - * the piece that remains. - */ - rsm->r_dupack = 0; - rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); - if (rsm->r_rtr_bytes) { + left = th_ack - rsm->r_end; + if (rack->app_limited_needs_set && newly_acked) + rack_need_set_test(tp, rack, rsm, th_ack, __LINE__, RACK_USE_END_OR_THACK); + /* Free back to zone */ + rack_free(rack, rsm); + if (left) { + goto more; + } + /* Check for reneging */ + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { /* - * It was retransmitted adjust the - * sack holes for what was acked. + * The peer has moved snd_una up to + * the edge of this send, i.e. one + * that it had previously acked. The only + * way that can be true if the peer threw + * away data (space issues) that it had + * previously sacked (else it would have + * given us snd_una up to (rsm->r_end). + * We need to undo the acked markings here. + * + * Note we have to look to make sure th_ack is + * our rsm->r_start in case we get an old ack + * where th_ack is behind snd_una. */ - int ack_am; - - ack_am = (th_ack - rsm->r_start); - if (ack_am >= rsm->r_rtr_bytes) { - rack->r_ctl.rc_holes_rxt -= ack_am; - rsm->r_rtr_bytes -= ack_am; - } + rack_peer_reneges(rack, rsm, th_ack); } + return; + } + if (rsm->r_flags & RACK_ACKED) { /* - * Update where the piece starts and record - * the time of send of highest cumack sent. + * It was acked on the scoreboard -- remove it from + * total for the part being cum-acked. */ - rack->r_ctl.rc_gp_cumack_ts = rsm->usec_orig_send; - rsm->r_start = th_ack; - if (rack->app_limited_needs_set) - rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); + rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); } -proc_sack: - /* Check for reneging */ - rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - if (rsm && (rsm->r_flags & RACK_ACKED) && (th_ack == rsm->r_start)) { - /* - * The peer has moved snd_una up to - * the edge of this send, i.e. one - * that it had previously acked. The only - * way that can be true if the peer threw - * away data (space issues) that it had - * previously sacked (else it would have - * given us snd_una up to (rsm->r_end). - * We need to undo the acked markings here. 
- * - * Note we have to look to make sure th_ack is - * our rsm->r_start in case we get an old ack - * where th_ack is behind snd_una. + /* + * Clear the dup ack count for + * the piece that remains. + */ + rsm->r_dupack = 0; + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + if (rsm->r_rtr_bytes) { + /* + * It was retransmitted adjust the + * sack holes for what was acked. */ - rack_peer_reneges(rack, rsm, th->th_ack); - } - if ((to->to_flags & TOF_SACK) == 0) { - /* We are done nothing left */ - goto out; + int ack_am; + + ack_am = (th_ack - rsm->r_start); + if (ack_am >= rsm->r_rtr_bytes) { + rack->r_ctl.rc_holes_rxt -= ack_am; + rsm->r_rtr_bytes -= ack_am; + } } - /* Sack block processing */ - if (SEQ_GT(th_ack, tp->snd_una)) - ack_point = th_ack; - else - ack_point = tp->snd_una; - for (i = 0; i < to->to_nsacks; i++) { - bcopy((to->to_sacks + i * TCPOLEN_SACK), - &sack, sizeof(sack)); - sack.start = ntohl(sack.start); - sack.end = ntohl(sack.end); - if (SEQ_GT(sack.end, sack.start) && + /* + * Update where the piece starts and record + * the time of send of highest cumack sent. + */ + rack->r_ctl.rc_gp_cumack_ts = rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)]; + rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_TRIM_HEAD, th_ack, __LINE__); + /* Now we need to move our offset forward too */ + if (rsm->orig_m_len != rsm->m->m_len) { + /* Fix up the orig_m_len and possibly the mbuf offset */ + rack_adjust_orig_mlen(rsm); + } + rsm->soff += (th_ack - rsm->r_start); + rsm->r_start = th_ack; + /* Now do we need to move the mbuf fwd too? */ + while (rsm->soff >= rsm->m->m_len) { + rsm->soff -= rsm->m->m_len; + rsm->m = rsm->m->m_next; + KASSERT((rsm->m != NULL), + (" nrsm:%p hit at soff:%u null m", + rsm, rsm->soff)); + } + rsm->orig_m_len = rsm->m->m_len; + if (rack->app_limited_needs_set) + rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_BEG); +} + +static void +rack_handle_might_revert(struct tcpcb *tp, struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + int sack_pass_fnd = 0; + + if (rack->r_might_revert) { + /* + * Ok we have reordering, have not sent anything, we + * might want to revert the congestion state if nothing + * further has SACK_PASSED on it. Lets check. + * + * We also get here when we have DSACKs come in for + * all the data that we FR'd. Note that a rxt or tlp + * timer clears this from happening. + */ + + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { + if (rsm->r_flags & RACK_SACK_PASSED) { + sack_pass_fnd = 1; + break; + } + } + if (sack_pass_fnd == 0) { + /* + * We went into recovery + * incorrectly due to reordering! + */ + int orig_cwnd; + + rack->r_ent_rec_ns = 0; + orig_cwnd = tp->snd_cwnd; + tp->snd_cwnd = rack->r_ctl.rc_cwnd_at_erec; + tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; + tp->snd_recover = tp->snd_una; + rack_log_to_prr(rack, 14, orig_cwnd); + EXIT_RECOVERY(tp->t_flags); + } + rack->r_might_revert = 0; + } +} + +#ifdef NETFLIX_EXP_DETECTION +static void +rack_do_detection(struct tcpcb *tp, struct tcp_rack *rack, uint32_t bytes_this_ack, uint32_t segsiz) +{ + if ((rack->do_detection || tcp_force_detection) && + tcp_sack_to_ack_thresh && + tcp_sack_to_move_thresh && + ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { + /* + * We have thresholds set to find + * possible attackers and disable sack. + * Check them. 
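
The ratio test that follows works in parts-per-thousand so it stays in integer math. A stand-alone sketch with made-up counters and my own helper name: 900 SACK blocks against 300 qualifying acks gives an ackratio of 3000, and 80 "extra move" SACKs out of 100 total moves gives a moveratio of 800; only if both exceed their sysctl thresholds is SACK processing disabled and the cwnd clamped to the flight size.

#include <stdbool.h>
#include <stdint.h>

/* Parts-per-thousand ratio test mirroring the check below (names are mine). */
static bool
looks_like_sack_attack(uint64_t sacks, uint64_t acks,
    uint64_t moved_extra, uint64_t no_extra_move,
    uint64_t ack_thresh, uint64_t move_thresh)
{
	uint64_t ackratio, moveratio, movetotal;

	ackratio = acks ? (sacks * 1000) / acks : 1000;
	movetotal = moved_extra + no_extra_move;
	moveratio = movetotal ? (moved_extra * 1000) / movetotal : 0;
	/* e.g. 900 sacks / 300 acks -> 3000; 80 of 100 moves -> 800 */
	return (ackratio > ack_thresh && moveratio > move_thresh);
}
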
+ */ + uint64_t ackratio, moveratio, movetotal; + + /* Log detecting */ + rack_log_sad(rack, 1); + ackratio = (uint64_t)(rack->r_ctl.sack_count); + ackratio *= (uint64_t)(1000); + if (rack->r_ctl.ack_count) + ackratio /= (uint64_t)(rack->r_ctl.ack_count); + else { + /* We really should not hit here */ + ackratio = 1000; + } + if ((rack->sack_attack_disable == 0) && + (ackratio > rack_highest_sack_thresh_seen)) + rack_highest_sack_thresh_seen = (uint32_t)ackratio; + movetotal = rack->r_ctl.sack_moved_extra; + movetotal += rack->r_ctl.sack_noextra_move; + moveratio = rack->r_ctl.sack_moved_extra; + moveratio *= (uint64_t)1000; + if (movetotal) + moveratio /= movetotal; + else { + /* No moves, thats pretty good */ + moveratio = 0; + } + if ((rack->sack_attack_disable == 0) && + (moveratio > rack_highest_move_thresh_seen)) + rack_highest_move_thresh_seen = (uint32_t)moveratio; + if (rack->sack_attack_disable == 0) { + if ((ackratio > tcp_sack_to_ack_thresh) && + (moveratio > tcp_sack_to_move_thresh)) { + /* Disable sack processing */ + rack->sack_attack_disable = 1; + if (rack->r_rep_attack == 0) { + rack->r_rep_attack = 1; + counter_u64_add(rack_sack_attacks_detected, 1); + } + if (tcp_attack_on_turns_on_logging) { + /* + * Turn on logging, used for debugging + * false positives. + */ + rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; + } + /* Clamp the cwnd at flight size */ + rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; + rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + rack_log_sad(rack, 2); + } + } else { + /* We are sack-disabled check for false positives */ + if ((ackratio <= tcp_restoral_thresh) || + (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { + rack->sack_attack_disable = 0; + rack_log_sad(rack, 3); + /* Restart counting */ + rack->r_ctl.sack_count = 0; + rack->r_ctl.sack_moved_extra = 0; + rack->r_ctl.sack_noextra_move = 1; + rack->r_ctl.ack_count = max(1, + (bytes_this_ack / segsiz)); + + if (rack->r_rep_reverse == 0) { + rack->r_rep_reverse = 1; + counter_u64_add(rack_sack_attacks_reversed, 1); + } + /* Restore the cwnd */ + if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) + rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; + } + } + } +} +#endif + +static void +rack_note_dsack(struct tcp_rack *rack, tcp_seq start, tcp_seq end) +{ + + uint32_t am; + + if (SEQ_GT(end, start)) + am = end - start; + else + am = 0; + /* + * We keep track of how many DSACK blocks we get + * after a recovery incident. + */ + rack->r_ctl.dsack_byte_cnt += am; + if (!IN_FASTRECOVERY(rack->rc_tp->t_flags) && + rack->r_ctl.retran_during_recovery && + (rack->r_ctl.dsack_byte_cnt >= rack->r_ctl.retran_during_recovery)) { + /* + * False recovery most likely culprit is reordering. If + * nothing else is missing we need to revert. + */ + rack->r_might_revert = 1; + rack_handle_might_revert(rack->rc_tp, rack); + rack->r_might_revert = 0; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; + } +} + +static void +rack_update_prr(struct tcpcb *tp, struct tcp_rack *rack, uint32_t changed, tcp_seq th_ack) +{ + /* Deal with changed and PRR here (in recovery only) */ + uint32_t pipe, snd_una; + + rack->r_ctl.rc_prr_delivered += changed; + + if (sbavail(&rack->rc_inp->inp_socket->so_snd) <= (tp->snd_max - tp->snd_una)) { + /* + * It is all outstanding, we are application limited + * and thus we don't need more room to send anything. + * Note we use tp->snd_una here and not th_ack because + * the data as yet not been cut from the sb. 
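
A quick worked example of the computation that follows, with made-up numbers: a flight size at recovery entry of 20 segments, an ssthresh of 10 segments and 2 segments newly delivered gives sndcnt = delivered * ssthresh / fs, roughly one segment sent for every two delivered, minus what already went out during recovery; once pipe falls to or below ssthresh the second branch instead tops sndcnt up toward ssthresh - pipe, bounded by what was actually delivered plus one MSS. The helper below is a stand-alone sketch of that math, not the kernel code.

#include <stdint.h>

static uint32_t
prr_sndcnt(uint32_t delivered, uint32_t out, uint32_t pipe,
    uint32_t ssthresh, uint32_t recovery_fs, uint32_t changed, uint32_t mss)
{
	if (pipe > ssthresh) {
		/* Proportional: send ~ssthresh/recovery_fs of what was delivered. */
		int64_t sndcnt = (int64_t)delivered * ssthresh;

		sndcnt = recovery_fs ? sndcnt / recovery_fs : 0;
		sndcnt++;
		return (sndcnt > out ? (uint32_t)(sndcnt - out) : 0);
	} else {
		/* Slow-start-like top-up toward ssthresh, bounded by delivery. */
		uint32_t limit = (delivered > out) ? delivered - out : 0;

		if (changed > limit)
			limit = changed;
		limit += mss;
		if (ssthresh > pipe)
			return (ssthresh - pipe < limit ? ssthresh - pipe : limit);
		return (0);
	}
}
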
+ */ + rack->r_ctl.rc_prr_sndcnt = 0; + return; + } + /* Compute prr_sndcnt */ + if (SEQ_GT(tp->snd_una, th_ack)) { + snd_una = tp->snd_una; + } else { + snd_una = th_ack; + } + pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; + if (pipe > tp->snd_ssthresh) { + long sndcnt; + + sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; + if (rack->r_ctl.rc_prr_recovery_fs > 0) + sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; + else { + rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 9, 0); + sndcnt = 0; + } + sndcnt++; + if (sndcnt > (long)rack->r_ctl.rc_prr_out) + sndcnt -= rack->r_ctl.rc_prr_out; + else + sndcnt = 0; + rack->r_ctl.rc_prr_sndcnt = sndcnt; + rack_log_to_prr(rack, 10, 0); + } else { + uint32_t limit; + + if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) + limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); + else + limit = 0; + if (changed > limit) + limit = changed; + limit += ctf_fixed_maxseg(tp); + if (tp->snd_ssthresh > pipe) { + rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); + rack_log_to_prr(rack, 11, 0); + } else { + rack->r_ctl.rc_prr_sndcnt = min(0, limit); + rack_log_to_prr(rack, 12, 0); + } + } +} + +static void +rack_log_ack(struct tcpcb *tp, struct tcpopt *to, struct tcphdr *th, int entered_recovery, int dup_ack_struck) +{ + uint32_t changed; + struct tcp_rack *rack; + struct rack_sendmap *rsm; + struct sackblk sack, sack_blocks[TCP_MAX_SACK + 1]; + register uint32_t th_ack; + int32_t i, j, k, num_sack_blks = 0; + uint32_t cts, acked, ack_point, sack_changed = 0; + int loop_start = 0, moved_two = 0; + uint32_t tsused; + + + INP_WLOCK_ASSERT(tp->t_inpcb); + if (th->th_flags & TH_RST) { + /* We don't log resets */ + return; + } + rack = (struct tcp_rack *)tp->t_fb_ptr; + cts = tcp_get_usecs(NULL); + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + changed = 0; + th_ack = th->th_ack; + if (rack->sack_attack_disable == 0) + rack_do_decay(rack); + if (BYTES_THIS_ACK(tp, th) >= ctf_fixed_maxseg(rack->rc_tp)) { + /* + * You only get credit for + * MSS and greater (and you get extra + * credit for larger cum-ack moves). + */ + int ac; + + ac = BYTES_THIS_ACK(tp, th) / ctf_fixed_maxseg(rack->rc_tp); + rack->r_ctl.ack_count += ac; + counter_u64_add(rack_ack_total, ac); + } + if (rack->r_ctl.ack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + if (SEQ_GT(th_ack, tp->snd_una)) { + rack_log_progress_event(rack, tp, ticks, PROGRESS_UPDATE, __LINE__); + tp->t_acktime = ticks; + } + if (rsm && SEQ_GT(th_ack, rsm->r_start)) + changed = th_ack - rsm->r_start; + if (changed) { + rack_process_to_cumack(tp, rack, th_ack, cts, to); + } + if ((to->to_flags & TOF_SACK) == 0) { + /* We are done nothing left and no sack. */ + rack_handle_might_revert(tp, rack); + /* + * For cases where we struck a dup-ack + * with no SACK, add to the changes so + * PRR will work right. 
+ */ + if (dup_ack_struck && (changed == 0)) { + changed += ctf_fixed_maxseg(rack->rc_tp); + } + goto out; + } + /* Sack block processing */ + if (SEQ_GT(th_ack, tp->snd_una)) + ack_point = th_ack; + else + ack_point = tp->snd_una; + for (i = 0; i < to->to_nsacks; i++) { + bcopy((to->to_sacks + i * TCPOLEN_SACK), + &sack, sizeof(sack)); + sack.start = ntohl(sack.start); + sack.end = ntohl(sack.end); + if (SEQ_GT(sack.end, sack.start) && SEQ_GT(sack.start, ack_point) && SEQ_LT(sack.start, tp->snd_max) && SEQ_GT(sack.end, ack_point) && @@ -7674,6 +9081,7 @@ */ tcp_record_dsack(sack.start, sack.end); #endif + rack_note_dsack(rack, sack.start, sack.end); } } /* @@ -7683,7 +9091,7 @@ num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); - if (num_sack_blks == 0) { + if (num_sack_blks == 0) { /* Nothing to sack (DSACKs?) */ goto out_with_totals; } @@ -7883,116 +9291,36 @@ } out: #ifdef NETFLIX_EXP_DETECTION - if ((rack->do_detection || tcp_force_detection) && - tcp_sack_to_ack_thresh && - tcp_sack_to_move_thresh && - ((rack->r_ctl.rc_num_maps_alloced > tcp_map_minimum) || rack->sack_attack_disable)) { + rack_do_detection(tp, rack, BYTES_THIS_ACK(tp, th), ctf_fixed_maxseg(rack->rc_tp)); +#endif + if (changed) { + /* Something changed cancel the rack timer */ + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + } + tsused = tcp_get_usecs(NULL); + rsm = tcp_rack_output(tp, rack, tsused); + if ((!IN_FASTRECOVERY(tp->t_flags)) && + rsm) { + /* Enter recovery */ + rack->r_ctl.rc_rsm_start = rsm->r_start; + rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; + rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; + entered_recovery = 1; + rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); /* - * We have thresholds set to find - * possible attackers and disable sack. - * Check them. + * When we enter recovery we need to assure we send + * one packet. */ - uint64_t ackratio, moveratio, movetotal; - - /* Log detecting */ - rack_log_sad(rack, 1); - ackratio = (uint64_t)(rack->r_ctl.sack_count); - ackratio *= (uint64_t)(1000); - if (rack->r_ctl.ack_count) - ackratio /= (uint64_t)(rack->r_ctl.ack_count); - else { - /* We really should not hit here */ - ackratio = 1000; - } - if ((rack->sack_attack_disable == 0) && - (ackratio > rack_highest_sack_thresh_seen)) - rack_highest_sack_thresh_seen = (uint32_t)ackratio; - movetotal = rack->r_ctl.sack_moved_extra; - movetotal += rack->r_ctl.sack_noextra_move; - moveratio = rack->r_ctl.sack_moved_extra; - moveratio *= (uint64_t)1000; - if (movetotal) - moveratio /= movetotal; - else { - /* No moves, thats pretty good */ - moveratio = 0; - } - if ((rack->sack_attack_disable == 0) && - (moveratio > rack_highest_move_thresh_seen)) - rack_highest_move_thresh_seen = (uint32_t)moveratio; - if (rack->sack_attack_disable == 0) { - if ((ackratio > tcp_sack_to_ack_thresh) && - (moveratio > tcp_sack_to_move_thresh)) { - /* Disable sack processing */ - rack->sack_attack_disable = 1; - if (rack->r_rep_attack == 0) { - rack->r_rep_attack = 1; - counter_u64_add(rack_sack_attacks_detected, 1); - } - if (tcp_attack_on_turns_on_logging) { - /* - * Turn on logging, used for debugging - * false positives. 
- */ - rack->rc_tp->t_logstate = tcp_attack_on_turns_on_logging; - } - /* Clamp the cwnd at flight size */ - rack->r_ctl.rc_saved_cwnd = rack->rc_tp->snd_cwnd; - rack->rc_tp->snd_cwnd = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - rack_log_sad(rack, 2); - } - } else { - /* We are sack-disabled check for false positives */ - if ((ackratio <= tcp_restoral_thresh) || - (rack->r_ctl.rc_num_maps_alloced < tcp_map_minimum)) { - rack->sack_attack_disable = 0; - rack_log_sad(rack, 3); - /* Restart counting */ - rack->r_ctl.sack_count = 0; - rack->r_ctl.sack_moved_extra = 0; - rack->r_ctl.sack_noextra_move = 1; - rack->r_ctl.ack_count = max(1, - (BYTES_THIS_ACK(tp, th)/ctf_fixed_maxseg(rack->rc_tp))); - - if (rack->r_rep_reverse == 0) { - rack->r_rep_reverse = 1; - counter_u64_add(rack_sack_attacks_reversed, 1); - } - /* Restore the cwnd */ - if (rack->r_ctl.rc_saved_cwnd > rack->rc_tp->snd_cwnd) - rack->rc_tp->snd_cwnd = rack->r_ctl.rc_saved_cwnd; - } - } - } -#endif - if (changed) { - /* Something changed cancel the rack timer */ - rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - } - tsused = tcp_ts_getticks(); - rsm = tcp_rack_output(tp, rack, tsused); - if ((!IN_RECOVERY(tp->t_flags)) && - rsm) { - /* Enter recovery */ - rack->r_ctl.rc_rsm_start = rsm->r_start; - rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; - rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; - entered_recovery = 1; - rack_cong_signal(tp, NULL, CC_NDUPACK); - /* - * When we enter recovery we need to assure we send - * one packet. - */ - if (rack->rack_no_prr == 0) { - rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 8, 0); + if (rack->rack_no_prr == 0) { + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 8, 0); } rack->r_timer_override = 1; rack->r_early = 0; rack->r_ctl.rc_agg_early = 0; - } else if (IN_RECOVERY(tp->t_flags) && + } else if (IN_FASTRECOVERY(tp->t_flags) && rsm && - (rack->r_rr_config == 3)) { + (rack->r_rr_config == 3)) { /* * Assure we can output and we get no * remembered pace time except the retransmit. 
@@ -8001,56 +9329,10 @@ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack->r_ctl.rc_resend = rsm; } - if (IN_RECOVERY(tp->t_flags) && + if (IN_FASTRECOVERY(tp->t_flags) && (rack->rack_no_prr == 0) && (entered_recovery == 0)) { - /* Deal with PRR here (in recovery only) */ - uint32_t pipe, snd_una; - - rack->r_ctl.rc_prr_delivered += changed; - /* Compute prr_sndcnt */ - if (SEQ_GT(tp->snd_una, th_ack)) { - snd_una = tp->snd_una; - } else { - snd_una = th_ack; - } - pipe = ((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt; - if (pipe > tp->snd_ssthresh) { - long sndcnt; - - sndcnt = rack->r_ctl.rc_prr_delivered * tp->snd_ssthresh; - if (rack->r_ctl.rc_prr_recovery_fs > 0) - sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; - else { - rack->r_ctl.rc_prr_sndcnt = 0; - rack_log_to_prr(rack, 9, 0); - sndcnt = 0; - } - sndcnt++; - if (sndcnt > (long)rack->r_ctl.rc_prr_out) - sndcnt -= rack->r_ctl.rc_prr_out; - else - sndcnt = 0; - rack->r_ctl.rc_prr_sndcnt = sndcnt; - rack_log_to_prr(rack, 10, 0); - } else { - uint32_t limit; - - if (rack->r_ctl.rc_prr_delivered > rack->r_ctl.rc_prr_out) - limit = (rack->r_ctl.rc_prr_delivered - rack->r_ctl.rc_prr_out); - else - limit = 0; - if (changed > limit) - limit = changed; - limit += ctf_fixed_maxseg(tp); - if (tp->snd_ssthresh > pipe) { - rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); - rack_log_to_prr(rack, 11, 0); - } else { - rack->r_ctl.rc_prr_sndcnt = min(0, limit); - rack_log_to_prr(rack, 12, 0); - } - } + rack_update_prr(tp, rack, changed, th_ack); if ((rsm && (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) && ((rack->rc_inp->inp_in_hpts == 0) && ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0)))) { @@ -8071,12 +9353,32 @@ struct rack_sendmap *rsm; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + while (rsm && (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { + rsm = TAILQ_NEXT(rsm, r_tnext); + } if (rsm && (rsm->r_dupack < 0xff)) { rsm->r_dupack++; if (rsm->r_dupack >= DUP_ACK_THRESHOLD) { - rack->r_wanted_output = 1; - rack->r_timer_override = 1; - rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); + struct timeval tv; + uint32_t cts; + /* + * Here we see if we need to retransmit. For + * a SACK type connection if enough time has passed + * we will get a return of the rsm. For a non-sack + * connection we will get the rsm returned if the + * dupack value is 3 or more. + */ + cts = tcp_get_usecs(&tv); + rack->r_ctl.rc_resend = tcp_rack_output(rack->rc_tp, rack, cts); + if (rack->r_ctl.rc_resend != NULL) { + if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { + rack_cong_signal(rack->rc_tp, CC_NDUPACK, + rack->rc_tp->snd_una); + } + rack->r_wanted_output = 1; + rack->r_timer_override = 1; + rack_log_retran_reason(rack, rsm, __LINE__, 1, 3); + } } else { rack_log_retran_reason(rack, rsm, __LINE__, 0, 3); } @@ -8118,6 +9420,14 @@ uint64_t bw, calc_bw, rtt; rtt = rack->r_ctl.rack_rs.rs_us_rtt; + if (rtt == 0) { + /* no us sample is there a ms one? 
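+ * If so we fall back to rs_rtt_lowest below, otherwise we
+ * take the no_measurement path and skip the estimate.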
*/ + if (rack->r_ctl.rack_rs.rs_rtt_lowest) { + rtt = rack->r_ctl.rack_rs.rs_rtt_lowest; + } else { + goto no_measurement; + } + } bw = acked; calc_bw = bw * 1000000; calc_bw /= rtt; @@ -8145,15 +9455,25 @@ rack->r_ctl.rc_rtt_diff = 0; rack->r_ctl.gp_bw = calc_bw; rack->rc_gp_filled = 1; - rack->r_ctl.num_avg = RACK_REQ_AVG; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + if (rack->r_ctl.num_measurements < RACK_REQ_AVG) + rack->r_ctl.num_measurements = RACK_REQ_AVG; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); } else if (calc_bw > rack->r_ctl.gp_bw) { rack->r_ctl.rc_rtt_diff = 0; - rack->r_ctl.num_avg = RACK_REQ_AVG; + if (rack->r_ctl.num_measurements < RACK_REQ_AVG) + rack->r_ctl.num_measurements = RACK_REQ_AVG; rack->r_ctl.gp_bw = calc_bw; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); } else rack_increase_bw_mul(rack, -1, 0, 0, 1); + if ((rack->gp_ready == 0) && + (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { + /* We have enough measurements now */ + rack->gp_ready = 1; + rack_set_cc_pacing(rack); + if (rack->defer_options) + rack_apply_deferred_options(rack); + } /* * For acks over 1mss we do a extra boost to simulate * where we would get 2 acks (we want 110 for the mul). @@ -8162,12 +9482,12 @@ rack_increase_bw_mul(rack, -1, 0, 0, 1); } else { /* - * Huh, this should not be, settle - * for just an old increase. + * zero rtt possibly?, settle for just an old increase. */ +no_measurement: rack_increase_bw_mul(rack, -1, 0, 0, 1); } - } else if ((IN_RECOVERY(tp->t_flags) == 0) && + } else if ((IN_FASTRECOVERY(tp->t_flags) == 0) && (sbavail(&so->so_snd) > max((segsiz * (4 + rack_req_segs)), minseg)) && (rack->r_ctl.cwnd_to_use > max((segsiz * (rack_req_segs + 2)), minseg)) && @@ -8188,6 +9508,114 @@ } } + + +static void +rack_gain_for_fastoutput(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t acked_amount) +{ + /* + * The fast output path is enabled and we + * have moved the cumack forward. Lets see if + * we can expand forward the fast path length by + * that amount. What we would ideally like to + * do is increase the number of bytes in the + * fast path block (left_to_send) by the + * acked amount. However we have to gate that + * by two factors: + * 1) The amount outstanding and the rwnd of the peer + * (i.e. we don't want to exceed the rwnd of the peer). + * + * 2) The amount of data left in the socket buffer (i.e. + * we can't send beyond what is in the buffer). + * + * Note that this does not take into account any increase + * in the cwnd. We will only extend the fast path by + * what was acked. 
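+ *
+ * In short: left_to_send grows by acked_amount only while
+ * (left_to_send + acked_amount) stays within
+ * min(sbavail - outstanding, snd_wnd - outstanding),
+ * where outstanding is (snd_max - snd_una).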
+ */ + uint32_t new_total, gating_val; + + new_total = acked_amount + rack->r_ctl.fsb.left_to_send; + gating_val = min((sbavail(&so->so_snd) - (tp->snd_max - tp->snd_una)), + (tp->snd_wnd - (tp->snd_max - tp->snd_una))); + if (new_total <= gating_val) { + /* We can increase left_to_send by the acked amount */ + counter_u64_add(rack_extended_rfo, 1); + rack->r_ctl.fsb.left_to_send = new_total; + KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(&rack->rc_inp->inp_socket->so_snd) - (tp->snd_max - tp->snd_una))), + ("rack:%p left_to_send:%u sbavail:%u out:%u", + rack, rack->r_ctl.fsb.left_to_send, + sbavail(&rack->rc_inp->inp_socket->so_snd), + (tp->snd_max - tp->snd_una))); + + } +} + +static void +rack_adjust_sendmap(struct tcp_rack *rack, struct sockbuf *sb, tcp_seq snd_una) +{ + /* + * Here any sendmap entry that points to the + * beginning mbuf must be adjusted to the correct + * offset. This must be called with: + * 1) The socket buffer locked + * 2) snd_una adjusted to its new postion. + * + * Note that (2) implies rack_ack_received has also + * been called. + * + * We grab the first mbuf in the socket buffer and + * then go through the front of the sendmap, recalculating + * the stored offset for any sendmap entry that has + * that mbuf. We must use the sb functions to do this + * since its possible an add was done has well as + * the subtraction we may have just completed. This should + * not be a penalty though, since we just referenced the sb + * to go in and trim off the mbufs that we freed (of course + * there will be a penalty for the sendmap references though). + */ + struct mbuf *m; + struct rack_sendmap *rsm; + + SOCKBUF_LOCK_ASSERT(sb); + m = sb->sb_mb; + rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if ((rsm == NULL) || (m == NULL)) { + /* Nothing outstanding */ + return; + } + while (rsm->m == m) { + /* one to adjust */ +#ifdef INVARIANTS + struct mbuf *tm; + uint32_t soff; + + tm = sbsndmbuf(sb, (rsm->r_start - snd_una), &soff); + if (rsm->orig_m_len != m->m_len) { + rack_adjust_orig_mlen(rsm); + } + if (rsm->soff != soff) { + /* + * This is not a fatal error, we anticipate it + * might happen (the else code), so we count it here + * so that under invariant we can see that it really + * does happen. + */ + counter_u64_add(rack_adjust_map_bw, 1); + } + rsm->m = tm; + rsm->soff = soff; + rsm->orig_m_len = rsm->m->m_len; +#else + rsm->m = sbsndmbuf(sb, (rsm->r_start - snd_una), &rsm->soff); + rsm->orig_m_len = rsm->m->m_len; +#endif + rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, + rsm); + if (rsm == NULL) + break; + } +} + /* * Return value of 1, we do not need to call rack_process_data(). * return value of 0, rack_process_data can be called. 
@@ -8198,7 +9626,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, - int32_t * ofia, int32_t thflags, int32_t * ret_val) + int32_t * ofia, int32_t thflags, int32_t *ret_val) { int32_t ourfinisacked = 0; int32_t nsegs, acked_amount; @@ -8210,29 +9638,41 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { - ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt); rack->r_wanted_output = 1; return (1); } - if (rack->rc_gp_filled && + if (rack->gp_ready && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { under_pacing = 1; } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { - if (rack->rc_in_persist) + int in_rec, dup_ack_struck = 0; + + in_rec = IN_FASTRECOVERY(tp->t_flags); + if (rack->rc_in_persist) { tp->t_rxtshift = 0; - if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); + } + if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd)) { rack_strike_dupack(rack); - rack_log_ack(tp, to, th); + dup_ack_struck = 1; + } + rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), dup_ack_struck); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { /* * Old ack, behind (or duplicate to) the last one rcv'd - * Note: Should mark reordering is occuring! We should also - * look for sack blocks arriving e.g. ack 1, 4-4 then ack 1, - * 3-3, 4-4 would be reording. As well as ack 1, 3-3 ack 3 + * Note: We mark reordering is occuring if its + * less than and we have not closed our window. */ + if (SEQ_LT(th->th_ack, tp->snd_una) && (sbspace(&so->so_rcv) > ctf_fixed_maxseg(tp))) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + } return (0); } /* @@ -8267,15 +9707,18 @@ * retransmit in the first place. Recover our original cwnd and * ssthresh, and proceed to transmit where we left off. */ - if (tp->t_flags & TF_PREVVALID) { + if ((tp->t_flags & TF_PREVVALID) && + ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) - rack_cong_signal(tp, th, CC_RTO_ERR); + rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); } if (acked) { /* assure we are not backed off */ tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); rack->rc_tlp_in_progress = 0; rack->r_ctl.rc_tlp_cnt_out = 0; /* @@ -8311,15 +9754,13 @@ *ofia = ourfinisacked; return (0); } - if (rack->r_ctl.rc_early_recovery) { - if (IN_RECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover) && - (SEQ_LT(th->th_ack, tp->snd_max))) { - tcp_rack_partialack(tp, th); - } else { - rack_post_recovery(tp, th); - recovery = 1; - } + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { + tcp_rack_partialack(tp); + } else { + rack_post_recovery(tp, th->th_ack); + recovery = 1; } } /* @@ -8327,7 +9768,7 @@ * related information. This typically means increasing the * congestion window. 
*/ - rack_ack_received(tp, rack, th, nsegs, CC_ACK, recovery); + rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; @@ -8344,20 +9785,13 @@ */ ourfinisacked = 1; } + tp->snd_una = th->th_ack; + if (acked_amount && sbavail(&so->so_snd)) + rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); SOCKBUF_UNLOCK(&so->so_snd); tp->t_flags |= TF_WAKESOW; m_freem(mfree); - if (rack->r_ctl.rc_early_recovery == 0) { - if (IN_RECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover) && - (SEQ_LT(th->th_ack, tp->snd_max))) { - tcp_rack_partialack(tp, th); - } else { - rack_post_recovery(tp, th); - } - } - } - tp->snd_una = th->th_ack; if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; @@ -8374,7 +9808,10 @@ } if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ + tp->t_flags &= ~TF_PREVVALID; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); @@ -8389,7 +9826,7 @@ (tp->t_flags2 & TF2_DROP_AF_DATA)) { /* * The socket was gone and the - * peer sent data, time to + * peer sent data (now or in the past), time to * reset him. */ *ret_val = 1; @@ -8457,6 +9894,7 @@ nrsm, insret, rack, rsm); } #endif + rack_log_map_chg(rack->rc_tp, rack, NULL, rsm, nrsm, MAP_SPLIT, max_seq, __LINE__); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); nrsm->r_in_tmap = 1; @@ -8471,8 +9909,8 @@ counter_u64_add(rack_collapsed_win, 1); RB_FOREACH_FROM(nrsm, rack_rb_tree_head, rsm) { nrsm->r_flags |= RACK_RWND_COLLAPSED; - rack->rc_has_collapsed = 1; } + rack->rc_has_collapsed = 1; } static void @@ -8514,6 +9952,40 @@ } } } + +static void +rack_validate_fo_sendwin_up(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* + * If fast output is in progress, lets validate that + * the new window did not shrink on us and make it + * so fast output should end. + */ + if (rack->r_fast_output) { + uint32_t out; + + /* + * Calculate what we will send if left as is + * and compare that to our send window. + */ + out = ctf_outstanding(tp); + if ((out + rack->r_ctl.fsb.left_to_send) > tp->snd_wnd) { + /* ok we have an issue */ + if (out >= tp->snd_wnd) { + /* Turn off fast output the window is met or collapsed */ + rack->r_fast_output = 0; + } else { + /* we have some room left */ + rack->r_ctl.fsb.left_to_send = tp->snd_wnd - out; + if (rack->r_ctl.fsb.left_to_send < ctf_fixed_maxseg(tp)) { + /* If not at least 1 full segment never mind */ + rack->r_fast_output = 0; + } + } + } + } +} + /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still @@ -8544,6 +10016,7 @@ tp->snd_wl2 == th->th_ack && tiwin > tp->snd_wnd) KMOD_TCPSTAT_INC(tcps_rcvwinupd); tp->snd_wnd = tiwin; + rack_validate_fo_sendwin_up(tp, rack); tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; if (tp->snd_wnd > tp->max_sndwnd) @@ -8552,6 +10025,7 @@ } else if (thflags & TH_ACK) { if ((tp->snd_wl2 == th->th_ack) && (tiwin < tp->snd_wnd)) { tp->snd_wnd = tiwin; + rack_validate_fo_sendwin_up(tp, rack); tp->snd_wl1 = th->th_seq; tp->snd_wl2 = th->th_ack; } @@ -8583,7 +10057,6 @@ * nothing is outstanding, and there is * data to send. Enter persists. 
*/ - tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); } if (tp->t_flags2 & TF2_DROP_AF_DATA) { @@ -8665,6 +10138,8 @@ appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); + + rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS @@ -8680,47 +10155,48 @@ * trimming from the head. */ tcp_seq temp = save_start; + thflags = tcp_reass(tp, th, &temp, &tlen, m); tp->t_flags |= TF_ACKNOW; } if ((tp->t_flags & TF_SACK_PERMIT) && (save_tlen > 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { - if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { - /* - * DSACK actually handled in the fastpath - * above. - */ + if ((tlen == 0) && (SEQ_LT(save_start, save_rnxt))) { + /* + * DSACK actually handled in the fastpath + * above. + */ RACK_OPTS_INC(tcp_sack_path_1); - tcp_update_sack_list(tp, save_start, - save_start + save_tlen); - } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { - if ((tp->rcv_numsacks >= 1) && - (tp->sackblks[0].end == save_start)) { - /* - * Partial overlap, recorded at todrop - * above. - */ + tcp_update_sack_list(tp, save_start, + save_start + save_tlen); + } else if ((tlen > 0) && SEQ_GT(tp->rcv_nxt, save_rnxt)) { + if ((tp->rcv_numsacks >= 1) && + (tp->sackblks[0].end == save_start)) { + /* + * Partial overlap, recorded at todrop + * above. + */ RACK_OPTS_INC(tcp_sack_path_2a); - tcp_update_sack_list(tp, - tp->sackblks[0].start, - tp->sackblks[0].end); - } else { + tcp_update_sack_list(tp, + tp->sackblks[0].start, + tp->sackblks[0].end); + } else { RACK_OPTS_INC(tcp_sack_path_2b); - tcp_update_dsack_list(tp, save_start, - save_start + save_tlen); - } - } else if (tlen >= save_tlen) { - /* Update of sackblks. */ + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } + } else if (tlen >= save_tlen) { + /* Update of sackblks. */ RACK_OPTS_INC(tcp_sack_path_3); - tcp_update_dsack_list(tp, save_start, - save_start + save_tlen); - } else if (tlen > 0) { + tcp_update_dsack_list(tp, save_start, + save_start + save_tlen); + } else if (tlen > 0) { RACK_OPTS_INC(tcp_sack_path_4); - tcp_update_dsack_list(tp, save_start, - save_start + tlen); - } - } + tcp_update_dsack_list(tp, save_start, + save_start + tlen); + } + } } else { m_freem(m); thflags &= ~TH_FIN; @@ -8926,6 +10402,7 @@ sbappendstream_locked(&so->so_rcv, m, 0); ctf_calc_rwin(so, tp); } + rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); SOCKBUF_UNLOCK(&so->so_rcv); tp->t_flags |= TF_WAKESOR; #ifdef NETFLIX_SB_LIMITS @@ -9000,15 +10477,16 @@ return (0); } /* Ok if we reach here, we can process a fast-ack */ - if (rack->rc_gp_filled && + if (rack->gp_ready && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { under_pacing = 1; } nsegs = max(1, m->m_pkthdr.lro_nsegs); - rack_log_ack(tp, to, th); + rack_log_ack(tp, to, th, 0, 0); /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; + rack_validate_fo_sendwin_up(tp, rack); tp->snd_wl1 = th->th_seq; if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; @@ -9032,7 +10510,6 @@ * nothing is outstanding, and there is * data to send. Enter persists. */ - tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); } /* @@ -9053,11 +10530,12 @@ /* * "bad retransmit" recovery. 
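 * (This heuristic is only used when the peer did not send
 * timestamps, i.e. TF_RCVD_TSTMP is clear.)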
*/ - if (tp->t_flags & TF_PREVVALID) { + if ((tp->t_flags & TF_PREVVALID) && + ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { tp->t_flags &= ~TF_PREVVALID; if (tp->t_rxtshift == 1 && (int)(ticks - tp->t_badrxtwin) < 0) - rack_cong_signal(tp, th, CC_RTO_ERR); + rack_cong_signal(tp, CC_RTO_ERR, th->th_ack); } /* * Recalculate the transmit timer / rtt. @@ -9072,13 +10550,25 @@ /* Run HHOOK_TCP_ESTABLISHED_IN helper hooks. */ hhook_run_tcp_est_in(tp, th, to); #endif - KMOD_TCPSTAT_ADD(tcps_rcvackpack, nsegs); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); - sbdrop(&so->so_snd, acked); if (acked) { - /* assure we are not backed off */ + struct mbuf *mfree; + + rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, 0); + SOCKBUF_LOCK(&so->so_snd); + mfree = sbcut_locked(&so->so_snd, acked); + tp->snd_una = th->th_ack; + /* Note we want to hold the sb lock through the sendmap adjust */ + rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + /* Wake up the socket if we have room to write more */ + rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); + SOCKBUF_UNLOCK(&so->so_snd); + tp->t_flags |= TF_WAKESOW; + m_freem(mfree); tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); rack->rc_tlp_in_progress = 0; rack->r_ctl.rc_tlp_cnt_out = 0; /* @@ -9096,9 +10586,6 @@ * related information. This typically means increasing the * congestion window. */ - rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); - - tp->snd_una = th->th_ack; if (tp->snd_wnd < ctf_outstanding(tp)) { /* The peer collapsed the window */ rack_collapsed_window(rack); @@ -9135,6 +10622,9 @@ rack_check_bottom_drag(tp, rack, so, acked); } if (tp->snd_una == tp->snd_max) { + tp->t_flags &= ~TF_PREVVALID; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; @@ -9143,8 +10633,8 @@ tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } - /* Wake up the socket if we have room to write more */ - tp->t_flags |= TF_WAKESOW; + if (acked && rack->r_fast_output) + rack_gain_for_fastoutput(rack, tp, so, (uint32_t)acked); if (sbavail(&so->so_snd)) { rack->r_wanted_output = 1; } @@ -9312,12 +10802,14 @@ if (thflags & TH_ACK) { /* For syn-sent we need to possibly update the rtt */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { - uint32_t t; + uint32_t t, mcts; - t = tcp_ts_getticks() - to->to_tsecr; + mcts = tcp_ts_getticks(); + t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; - tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); + rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 4); + tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) @@ -9427,7 +10919,9 @@ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -9452,6 +10946,7 @@ tp->ts_recent = to->to_tsval; } tp->snd_wnd = tiwin; + rack_validate_fo_sendwin_up(tp, rack); /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue 
data for later @@ -9513,12 +11008,14 @@ tp->snd_wl1 = th->th_seq - 1; /* For syn-recv we need to possibly update the rtt */ if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { - uint32_t t; + uint32_t t, mcts; - t = tcp_ts_getticks() - to->to_tsecr; + mcts = tcp_ts_getticks(); + t = (mcts - to->to_tsecr) * HPTS_USEC_IN_MSEC; if (!tp->t_rttlow || tp->t_rttlow > t) tp->t_rttlow = t; - tcp_rack_xmit_timer(rack, t + 1, 1, (t * HPTS_USEC_IN_MSEC), 0, NULL, 2); + rack_log_rtt_sample_calc(rack, t, (to->to_tsecr * 1000), (mcts * 1000), 5); + tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { @@ -9622,7 +11119,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -9658,7 +11157,7 @@ } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -9695,7 +11194,9 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || (tp->t_fin_is_rst && (thflags & TH_FIN))) @@ -9717,7 +11218,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -9819,7 +11322,9 @@ { int32_t ret_val = 0; int32_t ourfinisacked = 0; + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || @@ -9842,7 +11347,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -9942,7 +11449,9 @@ { int32_t ret_val = 0; int32_t ourfinisacked = 0; + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || @@ -9965,7 +11474,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -10008,7 +11519,7 @@ tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output= 1; + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output = 1; return (ret_val); } else { ctf_do_drop(m, NULL); @@ -10051,7 +11562,9 @@ { int32_t ret_val = 0; int32_t ourfinisacked = 0; + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); if ((thflags & TH_RST) || @@ 
-10074,7 +11587,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -10160,7 +11675,9 @@ { int32_t ret_val = 0; int32_t ourfinisacked = 0; + struct tcp_rack *rack; + rack = (struct tcp_rack *)tp->t_fb_ptr; ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ @@ -10184,7 +11701,9 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, + &rack->r_ctl.challenge_ack_ts, + &rack->r_ctl.challenge_ack_cnt)) { return (ret_val); } /* @@ -10263,12 +11782,14 @@ } static void -rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line) +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack, uint32_t line, uint64_t *fill_override) { uint64_t bw_est, rate_wanted; int chged = 0; - uint32_t user_max; + uint32_t user_max, orig_min, orig_max; + orig_min = rack->r_ctl.rc_pace_min_segs; + orig_max = rack->r_ctl.rc_pace_max_segs; user_max = ctf_fixed_maxseg(tp) * rack->rc_user_set_max_segs; if (ctf_fixed_maxseg(tp) != rack->r_ctl.rc_pace_min_segs) chged = 1; @@ -10282,7 +11803,7 @@ } else if (rack->use_fixed_rate) { bw_est = rack_get_bw(rack); if ((rack->r_ctl.crte == NULL) || - (bw_est != rack->r_ctl.crte->rate)) { + (bw_est != rack->r_ctl.crte->rate)) { rack->r_ctl.rc_pace_max_segs = user_max; } else { /* We are pacing right at the hardware rate */ @@ -10305,7 +11826,10 @@ bw_est = rack_get_bw(rack); orig = rack->r_ctl.rc_pace_max_segs; - rate_wanted = rack_get_output_bw(rack, bw_est, NULL); + if (fill_override) + rate_wanted = *fill_override; + else + rate_wanted = rack_get_output_bw(rack, bw_est, NULL, NULL); if (rate_wanted) { /* We have something */ rack->r_ctl.rc_pace_max_segs = rack_get_pacing_len(rack, @@ -10330,7 +11854,78 @@ rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; } if (chged) - rack_log_type_hrdwtso(tp, rack, 0, rack->rc_inp->inp_socket->so_snd.sb_flags, line, 2); + rack_log_type_pacing_sizes(tp, rack, orig_min, orig_max, line, 2); +} + + +static void +rack_init_fsb_block(struct tcpcb *tp, struct tcp_rack *rack) +{ +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; +#endif +#ifdef INET + struct ip *ip = NULL; +#endif + struct udphdr *udp = NULL; + + /* Ok lets fill in the fast block, it can only be used with no IP options! 
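+ * The block holds a prebuilt IPv4/IPv6 header, an optional UDP
+ * encapsulation header and the TCP header template that the
+ * fast send path reuses on every transmit.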
*/ +#ifdef INET6 + if (rack->r_is_v6) { + rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip6 + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + rack->r_ctl.fsb.udp = udp; + rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); + } else + { + rack->r_ctl.fsb.th = (struct tcphdr *)(ip6 + 1); + rack->r_ctl.fsb.udp = NULL; + } + tcpip_fillheaders(rack->rc_inp, + tp->t_port, + ip6, rack->r_ctl.fsb.th); + } else +#endif /* INET6 */ + { + rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); + ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip + sizeof(struct ip)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + rack->r_ctl.fsb.udp = udp; + rack->r_ctl.fsb.th = (struct tcphdr *)(udp + 1); + } else + { + rack->r_ctl.fsb.udp = NULL; + rack->r_ctl.fsb.th = (struct tcphdr *)(ip + 1); + } + tcpip_fillheaders(rack->rc_inp, + tp->t_port, + ip, rack->r_ctl.fsb.th); + } + rack->r_fsb_inited = 1; +} + +static int +rack_init_fsb(struct tcpcb *tp, struct tcp_rack *rack) +{ + /* Allocate the larger of spaces V6 if available else just V4 */ + rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct tcpiphdr); +#ifdef INET6 + rack->r_ctl.fsb.tcp_ip_hdr_len = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); +#endif + rack->r_ctl.fsb.tcp_ip_hdr = malloc(rack->r_ctl.fsb.tcp_ip_hdr_len, + M_TCPFSB, M_NOWAIT|M_ZERO); + if (rack->r_ctl.fsb.tcp_ip_hdr == NULL) { + return (ENOMEM); + } + rack->r_fsb_inited = 0; + return (0); } static int @@ -10339,6 +11934,7 @@ struct tcp_rack *rack = NULL; struct rack_sendmap *insret; uint32_t iwin, snt, us_cts; + int err; tp->t_fb_ptr = uma_zalloc(rack_pcb_zone, M_NOWAIT); if (tp->t_fb_ptr == NULL) { @@ -10357,11 +11953,27 @@ TAILQ_INIT(&rack->r_ctl.rc_free); TAILQ_INIT(&rack->r_ctl.rc_tmap); rack->rc_tp = tp; - if (tp->t_inpcb) { - rack->rc_inp = tp->t_inpcb; - } + rack->rc_inp = tp->t_inpcb; + /* Set the flag */ + rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); + /* + * Save off the default values, socket options will poke + * at these if pacing is not on or we have not yet + * reached where pacing is on (gp_ready/fixed enabled). + * When they get set into the CC module (when gp_ready + * is enabled or we enable fixed) then we will set these + * values into the CC and place in here the old values + * so we have a restoral. Then we will set the flag + * rc_pacing_cc_set. That way whenever we turn off pacing + * or switch off this stack, we will know to go restore + * the saved values. 
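+ * (rack_set_cc_pacing() pushes these into the CC module and
+ * rack_undo_cc_pacing() restores the originals.)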
+ */ + rack->r_ctl.rc_saved_beta.beta = V_newreno_beta_ecn; + rack->r_ctl.rc_saved_beta.beta_ecn = V_newreno_beta_ecn; + /* We want abe like behavior as well */ + rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; @@ -10371,38 +11983,50 @@ tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; +#ifdef TCP_ACCOUNTING + if (rack_tcp_accounting) { + tp->t_flags2 |= TF2_TCP_ACCOUNTING; + } +#endif if (rack_enable_shared_cwnd) rack->rack_enable_scwnd = 1; rack->rc_user_set_max_segs = rack_hptsi_segments; rack->rc_force_max_seg = 0; if (rack_use_imac_dack) rack->rc_dack_mode = 1; + TAILQ_INIT(&rack->r_ctl.opt_list); rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; - rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; - rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; - rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; rack->r_ctl.rc_highest_us_rtt = 0; + rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; + if (rack_use_cmp_acks) + rack->r_use_cmp_ack = 1; if (rack_disable_prr) rack->rack_no_prr = 1; if (rack_gp_no_rec_chg) rack->rc_gp_no_rec_chg = 1; - rack->rc_always_pace = rack_pace_every_seg; - if (rack_enable_mqueue_for_nonpaced) + if (rack_pace_every_seg && tcp_can_enable_pacing()) { + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + } else + rack->rc_always_pace = 0; + if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) rack->r_mbuf_queue = 1; else rack->r_mbuf_queue = 0; - if (rack->r_mbuf_queue || rack->rc_always_pace) + if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; - rack_set_pace_segments(tp, rack, __LINE__); + rack_set_pace_segments(tp, rack, __LINE__, NULL); if (rack_limits_scwnd) - rack->r_limit_scw = 1; + rack->r_limit_scw = 1; else - rack->r_limit_scw = 0; + rack->r_limit_scw = 0; + rack->rc_labc = V_tcp_abc_l_var; rack->r_ctl.rc_high_rwnd = tp->snd_wnd; rack->r_ctl.cwnd_to_use = tp->snd_cwnd; rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; @@ -10415,6 +12039,8 @@ rack->r_running_early = 0; rack->rc_init_win = rack_default_init_window; rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; + if (rack_hw_up_only) + rack->r_up_only = 1; if (rack_do_dyn_mul) { /* When dynamic adjustment is on CA needs to start at 100% */ rack->rc_gp_dyn_mul = 1; @@ -10430,7 +12056,17 @@ us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; + rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); rack->r_ctl.rc_time_probertt_starts = 0; + /* We require at least one measurement, even if the sysctl is 0 */ + if (rack_req_measurements) + rack->r_ctl.req_measurements = rack_req_measurements; + else + rack->r_ctl.req_measurements = 1; + if (rack_enable_hw_pacing) + rack->rack_hdw_pace_ena = 1; + if (rack_hw_rate_caps) + rack->r_rack_hw_rate_caps = 1; /* Do we force on detection? 
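 * (Only when NETFLIX_EXP_DETECTION is compiled in; controlled
 * by tcp_force_detection.)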
*/ #ifdef NETFLIX_EXP_DETECTION if (tcp_force_detection) @@ -10440,6 +12076,12 @@ rack->do_detection = 0; if (rack_non_rxt_use_cr) rack->rack_rec_nonrxt_use_cr = 1; + err = rack_init_fsb(tp, rack); + if (err) { + uma_zfree(rack_pcb_zone, tp->t_fb_ptr); + tp->t_fb_ptr = NULL; + return (err); + } if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -10450,19 +12092,36 @@ tp->t_fb_ptr = NULL; return (ENOMEM); } - rsm->r_flags = RACK_OVERMAX; - rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; + rsm->r_no_rtt_allowed = 1; + rsm->r_tim_lastsent[0] = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; - rsm->r_start = tp->snd_una; if (tp->t_flags & TF_SENTFIN) { rsm->r_end = tp->snd_max - 1; rsm->r_flags |= RACK_HAS_FIN; } else { rsm->r_end = tp->snd_max; } - rsm->usec_orig_send = us_cts; + if (tp->snd_una == tp->iss) { + /* The data space is one beyond snd_una */ + rsm->r_flags |= RACK_HAS_SYN; + rsm->r_start = tp->iss; + rsm->r_end = rsm->r_start + (tp->snd_max - tp->snd_una); + } else + rsm->r_start = tp->snd_una; rsm->r_dupack = 0; + if (rack->rc_inp->inp_socket->so_snd.sb_mb != NULL) { + rsm->m = sbsndmbuf(&rack->rc_inp->inp_socket->so_snd, 0, &rsm->soff); + rsm->orig_m_len = rsm->m->m_len; + } else { + /* + * This can happen if we have a stand-alone FIN or + * SYN. + */ + rsm->m = NULL; + rsm->orig_m_len = 0; + rsm->soff = 0; + } insret = RB_INSERT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, rsm); #ifdef INVARIANTS if (insret != NULL) { @@ -10473,8 +12132,61 @@ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } - /* Cancel the GP measurement in progress */ - tp->t_flags &= ~TF_GPUTINPROG; + /* + * Timers in Rack are kept in microseconds so lets + * convert any initial incoming variables + * from ticks into usecs. Note that we + * also change the values of t_srtt and t_rttvar, if + * they are non-zero. They are kept with a 5 + * bit decimal so we have to carefully convert + * these to get the full precision. + */ + if (tp->t_srtt > 1) { + uint32_t val, frac; + + val = tp->t_srtt >> TCP_RTT_SHIFT; + frac = tp->t_srtt & 0x1f; + tp->t_srtt = TICKS_2_USEC(val); + /* + * frac is the fractional part of the srtt (if any) + * but its in ticks and every bit represents + * 1/32nd of a hz. + */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); + } else { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); + } + tp->t_srtt += frac; + } + } + if (tp->t_rttvar) { + uint32_t val, frac; + + val = tp->t_rttvar >> TCP_RTTVAR_SHIFT; + frac = tp->t_rttvar & 0x1f; + tp->t_rttvar = TICKS_2_USEC(val); + /* + * frac is the fractional part of the srtt (if any) + * but its in ticks and every bit represents + * 1/32nd of a hz. 
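+ * For example with hz=1000 a tick is 1000 usec, so each
+ * fraction bit is worth 1000/32 usec and frac == 16 converts
+ * to 16 * 1000 / 32 = 500 usec.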
+ */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_MSEC) / (uint64_t)TCP_RTT_SCALE); + } else { + frac = (((uint64_t)frac * (uint64_t)HPTS_USEC_IN_SEC) / ((uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE)); + } + tp->t_rttvar += frac; + } + } + tp->t_rxtcur = TICKS_2_USEC(tp->t_rxtcur); + tp->t_rttlow = TICKS_2_USEC(tp->t_rttlow); + if (rack_def_profile) + rack_set_profile(rack, rack_def_profile); + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; if (SEQ_GT(tp->snd_max, tp->iss)) snt = tp->snd_max - tp->iss; else @@ -10510,7 +12222,8 @@ tp->snd_ssthresh = 0xffffffff; } rack_stop_all_timers(tp); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); + /* Lets setup the fsb block */ + rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); rack_log_rtt_shrinks(rack, us_cts, 0, __LINE__, RACK_RTTS_INIT); return (0); @@ -10555,14 +12268,55 @@ return (EINVAL); } + static void rack_fini(struct tcpcb *tp, int32_t tcb_is_purged) { + int ack_cmp = 0; + if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm, *rm; rack = (struct tcp_rack *)tp->t_fb_ptr; + if (tp->t_in_pkt) { + /* + * Since we are switching we need to process any + * inbound packets in case a compressed ack is + * in queue or the new stack does not support + * mbuf queuing. These packets in theory should + * have been handled by the old stack anyway. + */ + if ((rack->rc_inp->inp_flags & (INP_DROPPED|INP_TIMEWAIT)) || + (rack->rc_inp->inp_flags2 & INP_FREED)) { + /* Kill all the packets */ + struct mbuf *save, *m; + + m = tp->t_in_pkt; + tp->t_in_pkt = NULL; + tp->t_tail_pkt = NULL; + while (m) { + save = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + m = save; + } + } else { + /* Process all the packets */ + ctf_do_queued_segments(rack->rc_inp->inp_socket, rack->rc_tp, 0); + } + if ((tp->t_inpcb) && + (tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP)) + ack_cmp = 1; + if (ack_cmp) { + /* Total if we used large or small (if ack-cmp was used). */ + if (rack->rc_inp->inp_flags2 & INP_MBUF_L_ACKS) + counter_u64_add(rack_large_ackcmp, 1); + else + counter_u64_add(rack_small_ackcmp, 1); + } + } + tp->t_flags &= ~TF_FORCEDATA; #ifdef NETFLIX_SHARED_CWND if (rack->r_ctl.rc_scw) { uint32_t limit; @@ -10577,12 +12331,74 @@ rack->r_ctl.rc_scw = NULL; } #endif + if (rack->r_ctl.fsb.tcp_ip_hdr) { + free(rack->r_ctl.fsb.tcp_ip_hdr, M_TCPFSB); + rack->r_ctl.fsb.tcp_ip_hdr = NULL; + rack->r_ctl.fsb.th = NULL; + } + /* Convert back to ticks, with */ + if (tp->t_srtt > 1) { + uint32_t val, frac; + + val = USEC_2_TICKS(tp->t_srtt); + frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); + tp->t_srtt = val << TCP_RTT_SHIFT; + /* + * frac is the fractional part here is left + * over from converting to hz and shifting. + * We need to convert this to the 5 bit + * remainder. + */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); + } else { + frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); + } + tp->t_srtt += frac; + } + } + if (tp->t_rttvar) { + uint32_t val, frac; + + val = USEC_2_TICKS(tp->t_rttvar); + frac = tp->t_srtt % (HPTS_USEC_IN_SEC / hz); + tp->t_rttvar = val << TCP_RTTVAR_SHIFT; + /* + * frac is the fractional part here is left + * over from converting to hz and shifting. + * We need to convert this to the 5 bit + * remainder. 
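+ * With hz=1000 a 500 usec leftover maps back to
+ * 500 * 32 / 1000 = 16 in the 5 bit fraction.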
+ */ + if (frac) { + if (hz == 1000) { + frac = (((uint64_t)frac * (uint64_t)TCP_RTT_SCALE) / (uint64_t)HPTS_USEC_IN_MSEC); + } else { + frac = (((uint64_t)frac * (uint64_t)(hz) * (uint64_t)TCP_RTT_SCALE) /(uint64_t)HPTS_USEC_IN_SEC); + } + tp->t_rttvar += frac; + } + } + tp->t_rxtcur = USEC_2_TICKS(tp->t_rxtcur); + tp->t_rttlow = USEC_2_TICKS(tp->t_rttlow); + if (rack->rc_always_pace) { + tcp_decrement_paced_conn(); + rack_undo_cc_pacing(rack); + rack->rc_always_pace = 0; + } + /* Clean up any options if they were not applied */ + while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { + struct deferred_opt_list *dol; + + dol = TAILQ_FIRST(&rack->r_ctl.opt_list); + TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); + free(dol, M_TCPDO); + } /* rack does not use force data but other stacks may clear it */ - tp->t_flags &= ~TF_FORCEDATA; - if (tp->t_inpcb) { - tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; - tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; - tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + if (rack->r_ctl.crte != NULL) { + tcp_rel_pacing_rate(rack->r_ctl.crte, tp); + rack->rack_hdrw_pacing = 0; + rack->r_ctl.crte = NULL; } #ifdef TCP_BLACKBOX tcp_log_flowend(tp); @@ -10607,8 +12423,15 @@ uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } - /* Cancel the GP measurement in progress */ - tp->t_flags &= ~TF_GPUTINPROG; + if (tp->t_inpcb) { + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + tp->t_inpcb->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_ACKCMP; + /* Cancel the GP measurement in progress */ + tp->t_flags &= ~TF_GPUTINPROG; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_L_ACKS; + } /* Make sure snd_nxt is correctly set */ tp->snd_nxt = tp->snd_max; } @@ -10626,27 +12449,32 @@ rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: - rack_set_pace_segments(tp, rack, __LINE__); + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; case TCPS_CLOSE_WAIT: + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_CLOSE_WAIT; rack->r_substate = rack_do_close_wait; break; case TCPS_FIN_WAIT_1: + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_FIN_WAIT_1; rack->r_substate = rack_do_fin_wait_1; break; case TCPS_CLOSING: + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_CLOSING; rack->r_substate = rack_do_closing; break; case TCPS_LAST_ACK: + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_LAST_ACK; rack->r_substate = rack_do_lastack; break; case TCPS_FIN_WAIT_2: + rack_set_pace_segments(tp, rack, __LINE__, NULL); rack->r_state = TCPS_FIN_WAIT_2; rack->r_substate = rack_do_fin_wait_2; break; @@ -10656,6 +12484,9 @@ default: break; }; + if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + } static void @@ -10738,63 +12569,67 @@ tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); + rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); } -static int -rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, - int32_t nxt_pkt, struct timeval *tv) -{ - int32_t thflags, retval, did_out = 0; - int32_t way_out = 0; - uint32_t cts; - uint32_t tiwin; - struct 
timespec ts; - struct tcpopt to; - struct tcp_rack *rack; - struct rack_sendmap *rsm; - int32_t prev_state = 0; - uint32_t us_cts; - /* - * tv passed from common code is from either M_TSTMP_LRO or - * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. The - * rack_pacing stack assumes tv always refers to 'now', so we overwrite - * tv here to guarantee that. - */ - if (m->m_flags & M_TSTMP_LRO) - tcp_get_usecs(tv); - cts = tcp_tv_to_mssectick(tv); - rack = (struct tcp_rack *)tp->t_fb_ptr; +static void +rack_do_win_updates(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tiwin, uint32_t seq, uint32_t ack, uint32_t cts, uint32_t high_seq) +{ + tp->snd_wnd = tiwin; + rack_validate_fo_sendwin_up(tp, rack); + tp->snd_wl1 = seq; + tp->snd_wl2 = ack; + if (tp->snd_wnd > tp->max_sndwnd) + tp->max_sndwnd = tp->snd_wnd; + if (tp->snd_wnd < (tp->snd_max - high_seq)) { + /* The peer collapsed the window */ + rack_collapsed_window(rack); + } else if (rack->rc_has_collapsed) + rack_un_collapse_window(rack); + /* Do we exit persists? */ + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) { + rack_exit_persist(tp, rack, cts); + } + /* Do we enter persists? */ + if ((rack->rc_in_persist == 0) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* + * Here the rwnd is less than + * the pacing size, we are established, + * nothing is outstanding, and there is + * data to send. Enter persists. + */ + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } +} - if ((m->m_flags & M_TSTMP) || - (m->m_flags & M_TSTMP_LRO)) { - mbuf_tstmp2timespec(m, &ts); - rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; - rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; - } else - rack->r_ctl.act_rcv_time = *tv; - kern_prefetch(rack, &prev_state); - prev_state = 0; - thflags = th->th_flags; +static void +rack_log_input_packet(struct tcpcb *tp, struct tcp_rack *rack, struct tcp_ackent *ae, int ackval, uint32_t high_seq) +{ - NET_EPOCH_ASSERT(); - INP_WLOCK_ASSERT(tp->t_inpcb); - KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", - __func__)); - KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", - __func__)); if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; struct timeval ltv; + char tcp_hdr_buf[60]; + struct tcphdr *th; + struct timespec ts; + uint32_t orig_snd_una; + uint8_t xx = 0; + #ifdef NETFLIX_HTTP_LOGGING struct http_sendfile_track *http_req; - if (SEQ_GT(th->th_ack, tp->snd_una)) { - http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); + if (SEQ_GT(ae->ack, tp->snd_una)) { + http_req = tcp_http_find_req_for_seq(tp, (ae->ack-1)); } else { - http_req = tcp_http_find_req_for_seq(tp, th->th_ack); + http_req = tcp_http_find_req_for_seq(tp, ae->ack); } #endif memset(&log.u_bbr, 0, sizeof(log.u_bbr)); @@ -10804,27 +12639,37 @@ log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; else log.u_bbr.flex1 = 0; + log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_might_revert; log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; - log.u_bbr.flex3 = m->m_flags; + log.u_bbr.inflight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); + 
log.u_bbr.pkts_out = tp->t_maxseg; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; - if (m->m_flags & M_TSTMP) { + log.u_bbr.flex7 = 1; + log.u_bbr.lost = ae->flags; + log.u_bbr.cwnd_gain = ackval; + log.u_bbr.pacing_gain = 0x2; + if (ae->flags & TSTMP_HDWR) { /* Record the hardware timestamp if present */ - mbuf_tstmp2timespec(m, &ts); + log.u_bbr.flex3 = M_TSTMP; + ts.tv_sec = ae->timestamp / 1000000000; + ts.tv_nsec = ae->timestamp % 1000000000; ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); - } else if (m->m_flags & M_TSTMP_LRO) { + } else if (ae->flags & TSTMP_LRO) { /* Record the LRO the arrival timestamp */ - mbuf_tstmp2timespec(m, &ts); + log.u_bbr.flex3 = M_TSTMP_LRO; + ts.tv_sec = ae->timestamp / 1000000000; + ts.tv_nsec = ae->timestamp % 1000000000; ltv.tv_sec = ts.tv_sec; ltv.tv_usec = ts.tv_nsec / 1000; log.u_bbr.flex5 = tcp_tv_to_usectick(<v); } log.u_bbr.timeStamp = tcp_get_usecs(<v); /* Log the rcv time */ - log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; + log.u_bbr.delRate = ae->timestamp; #ifdef NETFLIX_HTTP_LOGGING log.u_bbr.applimited = tp->t_http_closed; log.u_bbr.applimited <<= 8; @@ -10852,900 +12697,1961 @@ } } #endif - TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, - tlen, &log, true, <v); - } - if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { - way_out = 4; - retval = 0; - goto done_with_input; - } - /* - * If a segment with the ACK-bit set arrives in the SYN-SENT state - * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. - */ - if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && - (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); - ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return(1); + memset(tcp_hdr_buf, 0, sizeof(tcp_hdr_buf)); + th = (struct tcphdr *)tcp_hdr_buf; + th->th_seq = ae->seq; + th->th_ack = ae->ack; + th->th_win = ae->win; + /* Now fill in the ports */ + th->th_sport = tp->t_inpcb->inp_fport; + th->th_dport = tp->t_inpcb->inp_lport; + th->th_flags = ae->flags & 0xff; + /* Now do we have a timestamp option? */ + if (ae->flags & HAS_TSTMP) { + u_char *cp; + uint32_t val; + + th->th_off = ((sizeof(struct tcphdr) + TCPOLEN_TSTAMP_APPA) >> 2); + cp = (u_char *)(th + 1); + *cp = TCPOPT_NOP; + cp++; + *cp = TCPOPT_NOP; + cp++; + *cp = TCPOPT_TIMESTAMP; + cp++; + *cp = TCPOLEN_TIMESTAMP; + cp++; + val = htonl(ae->ts_value); + bcopy((char *)&val, + (char *)cp, sizeof(uint32_t)); + val = htonl(ae->ts_echo); + bcopy((char *)&val, + (char *)(cp + 4), sizeof(uint32_t)); + } else + th->th_off = (sizeof(struct tcphdr) >> 2); + + /* + * For sane logging we need to play a little trick. + * If the ack were fully processed we would have moved + * snd_una to high_seq, but since compressed acks are + * processed in two phases, at this point (logging) snd_una + * won't be advanced. So we would see multiple acks showing + * the advancement. We can prevent that by "pretending" that + * snd_una was advanced and then un-advancing it so that the + * logging code has the right value for tlb_snd_una. + */ + if (tp->snd_una != high_seq) { + orig_snd_una = tp->snd_una; + tp->snd_una = high_seq; + xx = 1; + } else + xx = 0; + TCP_LOG_EVENTP(tp, th, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, TCP_LOG_IN, 0, + 0, &log, true, <v); + if (xx) { + tp->snd_una = orig_snd_una; + } } - /* - * Parse options on any incoming segment. 
- */ - tcp_dooptions(&to, (u_char *)(th + 1), - (th->th_off << 2) - sizeof(struct tcphdr), - (thflags & TH_SYN) ? TO_SYN : 0); +} +static int +rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) +{ /* - * If timestamps were negotiated during SYN/ACK and a - * segment without a timestamp is received, silently drop - * the segment, unless it is a RST segment or missing timestamps are - * tolerated. - * See section 3.2 of RFC 7323. + * Handle a "special" compressed ack mbuf. Each incoming + * ack has only four possible dispositions: + * + * A) It moves the cum-ack forward + * B) It is behind the cum-ack. + * C) It is a window-update ack. + * D) It is a dup-ack. + * + * Note that we can have between 1 -> TCP_COMP_ACK_ENTRIES + * in the incoming mbuf. We also need to still pay attention + * to nxt_pkt since there may be another packet after this + * one. */ - if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && - ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { - way_out = 5; - retval = 0; - goto done_with_input; - } +#ifdef TCP_ACCOUNTING + uint64_t ts_val; + uint64_t rdstc; +#endif + int segsiz; + struct timespec ts; + struct tcp_rack *rack; + struct tcp_ackent *ae; + uint32_t tiwin, us_cts, cts, acked, acked_amount, high_seq, win_seq, the_win, win_upd_ack; + int cnt, i, did_out, ourfinisacked = 0; + int win_up_req = 0; + struct tcpopt to_holder, *to = NULL; + int nsegs = 0; + int under_pacing = 1; + int recovery = 0; + int idx; +#ifdef TCP_ACCOUNTING + sched_pin(); +#endif + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->gp_ready && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) + under_pacing = 0; + else + under_pacing = 1; - /* - * Segment received on connection. Reset idle time and keep-alive - * timer. XXX: This should be done after segment validation to - * ignore broken/spoofed segs. - */ - if (tp->t_idle_reduce && - (tp->snd_max == tp->snd_una) && - ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { - counter_u64_add(rack_input_idle_reduces, 1); - rack_cc_after_idle(rack, tp); + if (rack->r_state != tp->t_state) + rack_set_state(tp, rack); + to = &to_holder; + to->to_flags = 0; + KASSERT((m->m_len >= sizeof(struct tcp_ackent)), + ("tp:%p m_cmpack:%p with invalid len:%u", tp, m, m->m_len)); + cnt = m->m_len / sizeof(struct tcp_ackent); + idx = cnt / 5; + if (idx >= MAX_NUM_OF_CNTS) + idx = MAX_NUM_OF_CNTS - 1; + counter_u64_add(rack_proc_comp_ack[idx], 1); + counter_u64_add(rack_multi_single_eq, cnt); + high_seq = tp->snd_una; + the_win = tp->snd_wnd; + win_seq = tp->snd_wl1; + win_upd_ack = tp->snd_wl2; + cts = us_cts = tcp_tv_to_usectick(tv); + segsiz = ctf_fixed_maxseg(tp); + if ((rack->rc_gp_dyn_mul) && + (rack->use_fixed_rate == 0) && + (rack->rc_always_pace)) { + /* Check in on probertt */ + rack_check_probe_rtt(rack, us_cts); } - tp->t_rcvtime = ticks; - /* - * Unscale the window into a 32-bit value. For the SYN_SENT state - * the scale is zero. - */ - tiwin = th->th_win << tp->snd_scale; -#ifdef STATS - stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); + for (i = 0; i < cnt; i++) { +#ifdef TCP_ACCOUNTING + ts_val = get_cyclecount(); #endif - if (tiwin > rack->r_ctl.rc_high_rwnd) - rack->r_ctl.rc_high_rwnd = tiwin; - /* - * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move - * this to occur after we've validated the segment. 
- */ - if (tp->t_flags2 & TF2_ECN_PERMIT) { - if (thflags & TH_CWR) { - tp->t_flags2 &= ~TF2_ECN_SND_ECE; - tp->t_flags |= TF_ACKNOW; - } - switch (iptos & IPTOS_ECN_MASK) { - case IPTOS_ECN_CE: - tp->t_flags2 |= TF2_ECN_SND_ECE; - KMOD_TCPSTAT_INC(tcps_ecn_ce); - break; - case IPTOS_ECN_ECT0: - KMOD_TCPSTAT_INC(tcps_ecn_ect0); - break; - case IPTOS_ECN_ECT1: - KMOD_TCPSTAT_INC(tcps_ecn_ect1); - break; + rack_clear_rate_sample(rack); + ae = ((mtod(m, struct tcp_ackent *)) + i); + /* Setup the window */ + tiwin = ae->win << tp->snd_scale; + /* figure out the type of ack */ + if (SEQ_LT(ae->ack, high_seq)) { + /* Case B*/ + ae->ack_val_set = ACK_BEHIND; + } else if (SEQ_GT(ae->ack, high_seq)) { + /* Case A */ + ae->ack_val_set = ACK_CUMACK; + } else if (tiwin == the_win) { + /* Case D */ + ae->ack_val_set = ACK_DUPACK; + } else { + /* Case C */ + ae->ack_val_set = ACK_RWND; + } + rack_log_input_packet(tp, rack, ae, ae->ack_val_set, high_seq); + /* Validate timestamp */ + if (ae->flags & HAS_TSTMP) { + /* Setup for a timestamp */ + to->to_flags = TOF_TS; + ae->ts_echo -= tp->ts_offset; + to->to_tsecr = ae->ts_echo; + to->to_tsval = ae->ts_value; + /* + * If echoed timestamp is later than the current time, fall back to + * non RFC1323 RTT calculation. Normalize timestamp if syncookies + * were used when this connection was established. + */ + if (TSTMP_GT(ae->ts_echo, cts)) + ae->ts_echo = 0; + if (tp->ts_recent && + TSTMP_LT(ae->ts_value, tp->ts_recent)) { + if (ctf_ts_check_ac(tp, (ae->flags & 0xff))) { +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ae->ack_val_set] , + (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); + } + } +#endif + continue; + } + } + if (SEQ_LEQ(ae->seq, tp->last_ack_sent) && + SEQ_LEQ(tp->last_ack_sent, ae->seq)) { + tp->ts_recent_age = tcp_ts_getticks(); + tp->ts_recent = ae->ts_value; + } + } else { + /* Setup for a no options */ + to->to_flags = 0; + } + /* Update the rcv time and perform idle reduction possibly */ + if (tp->t_idle_reduce && + (tp->snd_max == tp->snd_una) && + ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { + counter_u64_add(rack_input_idle_reduces, 1); + rack_cc_after_idle(rack, tp); } + tp->t_rcvtime = ticks; + /* Now what about ECN? */ + if (tp->t_flags2 & TF2_ECN_PERMIT) { + if (ae->flags & TH_CWR) { + tp->t_flags2 &= ~TF2_ECN_SND_ECE; + tp->t_flags |= TF_ACKNOW; + } + switch (ae->codepoint & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->t_flags2 |= TF2_ECN_SND_ECE; + KMOD_TCPSTAT_INC(tcps_ecn_ce); + break; + case IPTOS_ECN_ECT0: + KMOD_TCPSTAT_INC(tcps_ecn_ect0); + break; + case IPTOS_ECN_ECT1: + KMOD_TCPSTAT_INC(tcps_ecn_ect1); + break; + } - /* Process a packet differently from RFC3168. */ - cc_ecnpkt_handler(tp, th, iptos); - - /* Congestion experienced. */ - if (thflags & TH_ECE) { - rack_cong_signal(tp, th, CC_ECN); + /* Process a packet differently from RFC3168. */ + cc_ecnpkt_handler_flags(tp, ae->flags, ae->codepoint); + /* Congestion experienced. */ + if (ae->flags & TH_ECE) { + rack_cong_signal(tp, CC_ECN, ae->ack); + } } - } - - /* - * If echoed timestamp is later than the current time, fall back to - * non RFC1323 RTT calculation. Normalize timestamp if syncookies - * were used when this connection was established. 
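Each entry lands in exactly one of the four dispositions listed in the function comment, decided purely from the ack value and the advertised window. A minimal restatement, with SEQ_LT()/SEQ_GT() expanded into the usual serial-number compares:

#include <stdint.h>

#define SEQ_LT_(a, b)   ((int32_t)((a) - (b)) < 0)
#define SEQ_GT_(a, b)   ((int32_t)((a) - (b)) > 0)

enum ack_disposition {
        ACK_IS_BEHIND,  /* B: older than the current cum-ack point */
        ACK_IS_CUMACK,  /* A: advances the cum-ack point */
        ACK_IS_DUPACK,  /* D: same ack, unchanged window */
        ACK_IS_RWND     /* C: same ack, window changed */
};

static enum ack_disposition
classify_ack(uint32_t ack, uint32_t high_seq, uint32_t tiwin, uint32_t the_win)
{
        if (SEQ_LT_(ack, high_seq))
                return (ACK_IS_BEHIND);
        if (SEQ_GT_(ack, high_seq))
                return (ACK_IS_CUMACK);
        if (tiwin == the_win)
                return (ACK_IS_DUPACK);
        return (ACK_IS_RWND);
}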
- */ - if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { - to.to_tsecr -= tp->ts_offset; - if (TSTMP_GT(to.to_tsecr, cts)) - to.to_tsecr = 0; - } - - /* - * If its the first time in we need to take care of options and - * verify we can do SACK for rack! - */ - if (rack->r_state == 0) { - /* Should be init'd by rack_init() */ - KASSERT(rack->rc_inp != NULL, - ("%s: rack->rc_inp unexpectedly NULL", __func__)); - if (rack->rc_inp == NULL) { - rack->rc_inp = tp->t_inpcb; +#ifdef TCP_ACCOUNTING + /* Count for the specific type of ack in */ + counter_u64_add(tcp_cnt_counters[ae->ack_val_set], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[ae->ack_val_set]++; } - +#endif /* - * Process options only when we get SYN/ACK back. The SYN - * case for incoming connections is handled in tcp_syncache. - * According to RFC1323 the window field in a SYN (i.e., a - * or ) segment itself is never scaled. XXX - * this is traditional behavior, may need to be cleaned up. + * Note how we could move up these in the determination + * above, but we don't so that way the timestamp checks (and ECN) + * is done first before we do any processing on the ACK. + * The non-compressed path through the code has this + * weakness (noted by @jtl) that it actually does some + * processing before verifying the timestamp information. + * We don't take that path here which is why we set + * the ack_val_set first, do the timestamp and ecn + * processing, and then look at what we have setup. */ - if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { - /* Handle parallel SYN for ECN */ - if (!(thflags & TH_ACK) && - ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && - ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { - tp->t_flags2 |= TF2_ECN_PERMIT; - tp->t_flags2 |= TF2_ECN_SND_ECE; - TCPSTAT_INC(tcps_ecn_shs); - } - if ((to.to_flags & TOF_SCALE) && - (tp->t_flags & TF_REQ_SCALE)) { - tp->t_flags |= TF_RCVD_SCALE; - tp->snd_scale = to.to_wscale; - } else - tp->t_flags &= ~TF_REQ_SCALE; + if (ae->ack_val_set == ACK_BEHIND) { /* - * Initial send window. It will be updated with the - * next incoming segment to the scaled value. 
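The per-entry timestamp handling above mirrors the normal input path: normalize the echoed value by the connection's ts_offset, refuse echoes that lie in the future, skip entries whose tsval is older than ts_recent, and refresh ts_recent only when the entry covers the last ACK we sent. A compressed sketch of those checks; the helper name and the plain equality test against last_ack_sent are simplifications:

#include <stdbool.h>
#include <stdint.h>

#define TSTMP_GT_(a, b) ((int32_t)((a) - (b)) > 0)
#define TSTMP_LT_(a, b) ((int32_t)((a) - (b)) < 0)

struct ts_state {
        uint32_t ts_offset;     /* syncookie normalization */
        uint32_t ts_recent;     /* newest timestamp accepted so far */
        uint32_t last_ack_sent;
};

/* Returns false when the entry should be skipped on timestamp grounds. */
static bool
sanitize_timestamps(struct ts_state *st, uint32_t now, uint32_t seq,
    uint32_t *tsval, uint32_t *tsecr)
{
        *tsecr -= st->ts_offset;
        if (TSTMP_GT_(*tsecr, now))
                *tsecr = 0;             /* echo from the future: unusable for RTT */
        if (st->ts_recent && TSTMP_LT_(*tsval, st->ts_recent))
                return (false);         /* PAWS-style: older than ts_recent */
        if (seq == st->last_ack_sent)
                st->ts_recent = *tsval; /* entry covers our last ACK */
        return (true);
}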
+ * Case B flag reordering, if window is not closed + * or it could be a keep-alive or persists */ - tp->snd_wnd = th->th_win; - if ((to.to_flags & TOF_TS) && - (tp->t_flags & TF_REQ_TSTMP)) { - tp->t_flags |= TF_RCVD_TSTMP; - tp->ts_recent = to.to_tsval; - tp->ts_recent_age = cts; - } else - tp->t_flags &= ~TF_REQ_TSTMP; - if (to.to_flags & TOF_MSS) - tcp_mss(tp, to.to_mss); - if ((tp->t_flags & TF_SACK_PERMIT) && - (to.to_flags & TOF_SACKPERM) == 0) - tp->t_flags &= ~TF_SACK_PERMIT; - if (IS_FASTOPEN(tp->t_flags)) { - if (to.to_flags & TOF_FASTOPEN) { - uint16_t mss; + if (SEQ_LT(ae->ack, tp->snd_una) && (sbspace(&so->so_rcv) > segsiz)) { + counter_u64_add(rack_reorder_seen, 1); + rack->r_ctl.rc_reorder_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + } + } else if (ae->ack_val_set == ACK_DUPACK) { + /* Case D */ - if (to.to_flags & TOF_MSS) - mss = to.to_mss; - else - if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) - mss = TCP6_MSS; - else - mss = TCP_MSS; - tcp_fastopen_update_cache(tp, mss, - to.to_tfo_len, to.to_tfo_cookie); - } else - tcp_fastopen_disable_path(tp); + rack_strike_dupack(rack); + } else if (ae->ack_val_set == ACK_RWND) { + /* Case C */ + + win_up_req = 1; + win_upd_ack = ae->ack; + win_seq = ae->seq; + the_win = tiwin; + } else { + /* Case A */ + + if (SEQ_GT(ae->ack, tp->snd_max)) { + /* + * We just send an ack since the incoming + * ack is beyond the largest seq we sent. + */ + if ((tp->t_flags & TF_ACKNOW) == 0) { + ctf_ack_war_checks(tp, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt); + if (tp->t_flags && TF_ACKNOW) + rack->r_wanted_output = 1; + } + } else { + nsegs++; + /* If the window changed setup to update */ + if (tiwin != tp->snd_wnd) { + win_up_req = 1; + win_upd_ack = ae->ack; + win_seq = ae->seq; + the_win = tiwin; + } +#ifdef TCP_ACCOUNTING + /* Account for the acks */ + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[CNT_OF_ACKS_IN] += (((ae->ack - high_seq) + segsiz - 1) / segsiz); + } + counter_u64_add(tcp_cnt_counters[CNT_OF_ACKS_IN], + (((ae->ack - high_seq) + segsiz - 1) / segsiz)); +#endif + high_seq = ae->ack; + /* Setup our act_rcv_time */ + if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { + ts.tv_sec = ae->timestamp / 1000000000; + ts.tv_nsec = ae->timestamp % 1000000000; + rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; + rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; + } else { + rack->r_ctl.act_rcv_time = *tv; + } + rack_process_to_cumack(tp, rack, ae->ack, cts, to); } } - /* - * At this point we are at the initial call. Here we decide - * if we are doing RACK or not. We do this by seeing if - * TF_SACK_PERMIT is set and the sack-not-required is clear. - * The code now does do dup-ack counting so if you don't - * switch back you won't get rack & TLP, but you will still - * get this stack. 
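The accounting in the cum-ack branch credits the move in whole segments, rounding a partial trailing segment up. The arithmetic is a plain ceiling division:

#include <stdint.h>

/* Segments covered by a cum-ack advance, partial segments rounded up. */
static uint32_t
acked_segments(uint32_t prev_high_seq, uint32_t new_ack, uint32_t segsiz)
{
        return (((new_ack - prev_high_seq) + segsiz - 1) / segsiz);
}

For example, a 3000-byte advance against a 1500-byte segment size credits two segments, while a 3001-byte advance credits three.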
- */ - - if ((rack_sack_not_required == 0) && - ((tp->t_flags & TF_SACK_PERMIT) == 0)) { - tcp_switch_back_to_default(tp); - (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, - tlen, iptos); - return (1); + /* And lets be sure to commit the rtt measurements for this ack */ + tcp_rack_xmit_timer_commit(rack, tp); +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ae->ack_val_set] , (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ae->ack_val_set] += (rdstc - ts_val); + if (ae->ack_val_set == ACK_CUMACK) + tp->tcp_proc_time[CYC_HANDLE_MAP] += (rdstc - ts_val); + } } - /* Set the flag */ - rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; - tcp_set_hpts(tp->t_inpcb); - sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); +#endif } - if (thflags & TH_FIN) - tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); - us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); - if ((rack->rc_gp_dyn_mul) && - (rack->use_fixed_rate == 0) && - (rack->rc_always_pace)) { - /* Check in on probertt */ - rack_check_probe_rtt(rack, us_cts); +#ifdef TCP_ACCOUNTING + ts_val = get_cyclecount(); +#endif + acked_amount = acked = (high_seq - tp->snd_una); + if (win_up_req) { + rack_do_win_updates(tp, rack, the_win, win_seq, win_upd_ack, cts, high_seq); } - if (rack->forced_ack) { - uint32_t us_rtt; + if (acked) { + if (rack->sack_attack_disable == 0) + rack_do_decay(rack); + if (acked >= segsiz) { + /* + * You only get credit for + * MSS and greater (and you get extra + * credit for larger cum-ack moves). + */ + int ac; + ac = acked / segsiz; + rack->r_ctl.ack_count += ac; + counter_u64_add(rack_ack_total, ac); + } + if (rack->r_ctl.ack_count > 0xfff00000) { + /* + * reduce the number to keep us under + * a uint32_t. + */ + rack->r_ctl.ack_count /= 2; + rack->r_ctl.sack_count /= 2; + } + if (tp->t_flags & TF_NEEDSYN) { + /* + * T/TCP: Connection was half-synchronized, and our SYN has + * been ACK'd (so connection is now fully synchronized). Go + * to non-starred state, increment snd_una for ACK of SYN, + * and check if we can do window scaling. + */ + tp->t_flags &= ~TF_NEEDSYN; + tp->snd_una++; + acked_amount = acked = (high_seq - tp->snd_una); + } + if (acked > sbavail(&so->so_snd)) + acked_amount = sbavail(&so->so_snd); +#ifdef NETFLIX_EXP_DETECTION /* - * A persist or keep-alive was forced out, update our - * min rtt time. Note we do not worry about lost - * retransmissions since KEEP-ALIVES and persists - * are usually way long on times of sending (though - * if we were really paranoid or worried we could - * at least use timestamps if available to validate). + * We only care on a cum-ack move if we are in a sack-disabled + * state. We have already added in to the ack_count, and we never + * would disable on a cum-ack move, so we only care to do the + * detection if it may "undo" it, i.e. we were in disabled already. */ - rack->forced_ack = 0; - us_rtt = us_cts - rack->r_ctl.forced_ack_ts; - if (us_rtt == 0) - us_rtt = 1; - rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); - rack_apply_updated_usrtt(rack, us_rtt, us_cts); - } - /* - * This is the one exception case where we set the rack state - * always. All other times (timers etc) we must have a rack-state - * set (so we assure we have done the checks above for SACK). 
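Window updates seen inside the batch are only recorded, latest sample winning, and applied once after the loop through rack_do_win_updates(). A minimal model of that deferral; note_window_sample(), flush_window_sample() and the callback are illustrative names:

#include <stdbool.h>
#include <stdint.h>

struct win_sample {
        bool pending;
        uint32_t win;   /* already scaled */
        uint32_t seq;   /* SEG.SEQ of the sampling ack */
        uint32_t ack;   /* SEG.ACK of the sampling ack */
};

/* Remember only the newest window sample while scanning a batch. */
static void
note_window_sample(struct win_sample *ws, uint32_t win, uint32_t seq,
    uint32_t ack)
{
        ws->pending = true;
        ws->win = win;
        ws->seq = seq;
        ws->ack = ack;
}

/* After the batch: apply the remembered sample exactly once. */
static void
flush_window_sample(struct win_sample *ws,
    void (*apply)(uint32_t win, uint32_t seq, uint32_t ack))
{
        if (ws->pending) {
                apply(ws->win, ws->seq, ws->ack);
                ws->pending = false;
        }
}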
- */ - rack->r_ctl.rc_rcvtime = cts; - if (rack->r_state != tp->t_state) - rack_set_state(tp, rack); - if (SEQ_GT(th->th_ack, tp->snd_una) && - (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) - kern_prefetch(rsm, &prev_state); - prev_state = rack->r_state; - rack_clear_rate_sample(rack); - retval = (*rack->r_substate) (m, th, so, - tp, &to, drop_hdrlen, - tlen, tiwin, thflags, nxt_pkt, iptos); -#ifdef INVARIANTS - if ((retval == 0) && - (tp->t_inpcb == NULL)) { - panic("retval:%d tp:%p t_inpcb:NULL state:%d", - retval, tp, prev_state); - } + if (rack->sack_attack_disable) + rack_do_detection(tp, rack, acked_amount, segsiz); #endif - if (retval == 0) { - /* - * If retval is 1 the tcb is unlocked and most likely the tp - * is gone. - */ - INP_WLOCK_ASSERT(tp->t_inpcb); - if ((rack->rc_gp_dyn_mul) && - (rack->rc_always_pace) && - (rack->use_fixed_rate == 0) && - rack->in_probe_rtt && - (rack->r_ctl.rc_time_probertt_starts == 0)) { + if (IN_FASTRECOVERY(tp->t_flags) && + (rack->rack_no_prr == 0)) + rack_update_prr(tp, rack, acked_amount, high_seq); + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(high_seq, tp->snd_recover) && + (SEQ_LT(high_seq, tp->snd_max))) { + tcp_rack_partialack(tp); + } else { + rack_post_recovery(tp, high_seq); + recovery = 1; + } + } + /* Handle the rack-log-ack part (sendmap) */ + if ((sbused(&so->so_snd) == 0) && + (acked > acked_amount) && + (tp->t_state >= TCPS_FIN_WAIT_1) && + (tp->t_flags & TF_SENTFIN)) { /* - * If we are going for target, lets recheck before - * we output. + * We must be sure our fin + * was sent and acked (we can be + * in FIN_WAIT_1 without having + * sent the fin). */ - rack_check_probe_rtt(rack, us_cts); + ourfinisacked = 1; + /* + * Lets make sure snd_una is updated + * since most likely acked_amount = 0 (it + * should be). + */ + tp->snd_una = high_seq; + } + /* Did we make a RTO error? */ + if ((tp->t_flags & TF_PREVVALID) && + ((tp->t_flags & TF_RCVD_TSTMP) == 0)) { + tp->t_flags &= ~TF_PREVVALID; + if (tp->t_rxtshift == 1 && + (int)(ticks - tp->t_badrxtwin) < 0) + rack_cong_signal(tp, CC_RTO_ERR, high_seq); + } + /* Handle the data in the socket buffer */ + KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); + KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); + if (acked_amount > 0) { + struct mbuf *mfree; + + rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); + SOCKBUF_LOCK(&so->so_snd); + mfree = sbcut_locked(&so->so_snd, acked); + tp->snd_una = high_seq; + /* Note we want to hold the sb lock through the sendmap adjust */ + rack_adjust_sendmap(rack, &so->so_snd, tp->snd_una); + /* Wake up the socket if we have room to write more */ + rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); + SOCKBUF_UNLOCK(&so->so_snd); + tp->t_flags |= TF_WAKESOW; + m_freem(mfree); } - if (rack->set_pacing_done_a_iw == 0) { - /* How much has been acked? 
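Applying the cum-ack then comes down to trimming the acked bytes from the send buffer and moving snd_una up to high_seq; clamping to what is actually buffered covers sequence space that carries no data, such as the FIN. A byte-counted sketch of that step:

#include <stdint.h>

struct snd_state {
        uint32_t snd_una;       /* oldest unacknowledged sequence number */
        uint32_t buffered;      /* bytes sitting in the send buffer */
};

/* Apply a cumulative ack: return how many bytes to cut from the buffer. */
static uint32_t
apply_cum_ack(struct snd_state *s, uint32_t high_seq)
{
        uint32_t acked = high_seq - s->snd_una;

        if (acked > s->buffered)
                acked = s->buffered;    /* e.g. the FIN consumed sequence space */
        s->buffered -= acked;
        s->snd_una = high_seq;
        return (acked);
}

In the hunk above the ordering matters the same way: the buffer is cut and snd_una advanced while the socket buffer lock is held, so the sendmap adjustment sees a consistent view.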
*/ - if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { - /* We have enough to set in the pacing segment size */ - rack->set_pacing_done_a_iw = 1; - rack_set_pace_segments(tp, rack, __LINE__); + /* update progress */ + tp->t_acktime = ticks; + rack_log_progress_event(rack, tp, tp->t_acktime, + PROGRESS_UPDATE, __LINE__); + /* Clear out shifts and such */ + tp->t_rxtshift = 0; + RACK_TCPT_RANGESET(tp->t_rxtcur, RACK_REXMTVAL(tp), + rack_rto_min, rack_rto_max); + rack->rc_tlp_in_progress = 0; + rack->r_ctl.rc_tlp_cnt_out = 0; + /* Send recover and snd_nxt must be dragged along */ + if (SEQ_GT(tp->snd_una, tp->snd_recover)) + tp->snd_recover = tp->snd_una; + if (SEQ_LT(tp->snd_nxt, tp->snd_una)) + tp->snd_nxt = tp->snd_una; + /* + * If the RXT timer is running we want to + * stop it, so we can restart a TLP (or new RXT). + */ + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); +#ifdef NETFLIX_HTTP_LOGGING + tcp_http_check_for_comp(rack->rc_tp, high_seq); +#endif + tp->snd_wl2 = high_seq; + tp->t_dupacks = 0; + if (under_pacing && + (rack->use_fixed_rate == 0) && + (rack->in_probe_rtt == 0) && + rack->rc_gp_dyn_mul && + rack->rc_always_pace) { + /* Check if we are dragging bottom */ + rack_check_bottom_drag(tp, rack, so, acked); + } + if (tp->snd_una == tp->snd_max) { + tp->t_flags &= ~TF_PREVVALID; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; + rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); + if (rack->r_ctl.rc_went_idle_time == 0) + rack->r_ctl.rc_went_idle_time = 1; + rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + tp->t_acktime = 0; + /* Set so we might enter persists... */ + rack->r_wanted_output = 1; + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + if ((tp->t_state >= TCPS_FIN_WAIT_1) && + (sbavail(&so->so_snd) == 0) && + (tp->t_flags2 & TF2_DROP_AF_DATA)) { + /* + * The socket was gone and the + * peer sent data (not now in the past), time to + * reset him. + */ + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + /* tcp_close will kill the inp pre-log the Reset */ + tcp_log_end_status(tp, TCP_EI_STATUS_SERVER_RST); +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); + } + } +#endif + m_freem(m); + tp = tcp_close(tp); + if (tp == NULL) { +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (1); + } + /* + * We would normally do drop-with-reset which would + * send back a reset. We can't since we don't have + * all the needed bits. Instead lets arrange for + * a call to tcp_output(). That way since we + * are in the closed state we will generate a reset. + * + * Note if tcp_accounting is on we don't unpin since + * we do that after the goto label. + */ + goto send_out_a_rst; + } + if ((sbused(&so->so_snd) == 0) && + (tp->t_state >= TCPS_FIN_WAIT_1) && + (tp->t_flags & TF_SENTFIN)) { + /* + * If we can't receive any more data, then closing user can + * proceed. Starting the timer is contrary to the + * specification, but if we don't get a FIN we'll hang + * forever. 
+ * + */ + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + if (ourfinisacked == 0) { + /* + * We don't change to fin-wait-2 if we have our fin acked + * which means we are probably in TCPS_CLOSING. + */ + tcp_state_change(tp, TCPS_FIN_WAIT_2); + } } } - tcp_rack_xmit_timer_commit(rack, tp); - if (nxt_pkt == 0) { - if (rack->r_wanted_output != 0) { -do_output_now: - did_out = 1; - (void)tp->t_fb->tfb_tcp_output(tp); + /* Wake up the socket if we have room to write more */ + if (sbavail(&so->so_snd)) { + rack->r_wanted_output = 1; + if (ctf_progress_timeout_check(tp, true)) { + rack_log_progress_event((struct tcp_rack *)tp->t_fb_ptr, + tp, tick, PROGRESS_DROP, __LINE__); + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + /* + * We cheat here and don't send a RST, we should send one + * when the pacer drops the connection. + */ +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); + } + } + sched_unpin(); +#endif + INP_WUNLOCK(rack->rc_inp); + m_freem(m); + return (1); } - rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } - if ((nxt_pkt == 0) && - ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && - (SEQ_GT(tp->snd_max, tp->snd_una) || - (tp->t_flags & TF_DELACK) || - ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && - (tp->t_state <= TCPS_CLOSING)))) { - /* We could not send (probably in the hpts but stopped the timer earlier)? */ - if ((tp->snd_max == tp->snd_una) && - ((tp->t_flags & TF_DELACK) == 0) && - (rack->rc_inp->inp_in_hpts) && - (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { - /* keep alive not needed if we are hptsi output yet */ - ; - } else { - int late = 0; - if (rack->rc_inp->inp_in_hpts) { - if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { - us_cts = tcp_get_usecs(NULL); - if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { - rack->r_early = 1; - rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); - } else - late = 1; - rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + if (ourfinisacked) { + switch(tp->t_state) { + case TCPS_CLOSING: +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , + (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); } - tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); } - if (late && (did_out == 0)) { - /* - * We are late in the sending - * and we did not call the output - * (this probably should not happen). 
- */ - goto do_output_now; + sched_unpin(); +#endif + tcp_twstart(tp); + m_freem(m); + return (1); + break; + case TCPS_LAST_ACK: +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , + (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); + } + } + sched_unpin(); +#endif + tp = tcp_close(tp); + ctf_do_drop(m, tp); + return (1); + break; + case TCPS_FIN_WAIT_1: +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , + (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); + } } - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); +#endif + if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { + soisdisconnected(so); + tcp_timer_activate(tp, TT_2MSL, + (tcp_fast_finwait2_recycle ? + tcp_finwait2_timeout : + TP_MAXIDLE(tp))); + } + tcp_state_change(tp, TCPS_FIN_WAIT_2); + break; + default: + break; } - way_out = 1; - } else if (nxt_pkt == 0) { - /* Do we have the correct timer running? */ - rack_timer_audit(tp, rack, &so->so_snd); - way_out = 2; } - done_with_input: - rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); - if (did_out) - rack->r_wanted_output = 0; -#ifdef INVARIANTS - if (tp->t_inpcb == NULL) { - panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", - did_out, - retval, tp, prev_state); + if (rack->r_fast_output) { + /* + * We re doing fast output.. can we expand that? + */ + rack_gain_for_fastoutput(rack, tp, so, acked_amount); + } +#ifdef TCP_ACCOUNTING + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_CUMACK] , (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_CUMACK] += (rdstc - ts_val); + tp->tcp_proc_time[CYC_HANDLE_ACK] += (rdstc - ts_val); + } } -#endif - } - return (retval); -} - -void -rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, - struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) -{ - struct timeval tv; - /* First lets see if we have old packets */ - if (tp->t_in_pkt) { - if (ctf_do_queued_segments(so, tp, 1)) { - m_freem(m); - return; + } else if (win_up_req) { + rdstc = get_cyclecount(); + if (rdstc > ts_val) { + counter_u64_add(tcp_proc_time[ACK_RWND] , (rdstc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ACK_RWND] += (rdstc - ts_val); + } } +#endif } - if (m->m_flags & M_TSTMP_LRO) { - tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; - tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; - } else { - /* Should not be should we kassert instead? 
*/ - tcp_get_usecs(&tv); - } - if(rack_do_segment_nounlock(m, th, so, tp, - drop_hdrlen, tlen, iptos, 0, &tv) == 0) { - tcp_handle_wakeup(tp, so); - INP_WUNLOCK(tp->t_inpcb); + /* Now is there a next packet, if so we are done */ + m_freem(m); + did_out = 0; + if (nxt_pkt) { +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 5, nsegs); + return (0); } + rack_handle_might_revert(tp, rack); + ctf_calc_rwin(so, tp); + if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { + send_out_a_rst: + (void)tp->t_fb->tfb_tcp_output(tp); + did_out = 1; + } + rack_free_trim(rack); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + rack_timer_audit(tp, rack, &so->so_snd); + rack_log_doseg_done(rack, cts, nxt_pkt, did_out, 6, nsegs); + return (0); } -struct rack_sendmap * -tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) -{ - struct rack_sendmap *rsm = NULL; - int32_t idx; - uint32_t srtt = 0, thresh = 0, ts_low = 0; - /* Return the next guy to be re-transmitted */ - if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { - return (NULL); - } - if (tp->t_flags & TF_SENTFIN) { - /* retran the end FIN? */ - return (NULL); - } - /* ok lets look at this one */ - rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { - goto check_it; - } - rsm = rack_find_lowest_rsm(rack); - if (rsm == NULL) { - return (NULL); - } -check_it: - if (rsm->r_flags & RACK_ACKED) { - return (NULL); - } - if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && - (rsm->r_dupack < DUP_ACK_THRESHOLD)) { - /* Its not yet ready */ - return (NULL); - } - srtt = rack_grab_rtt(tp, rack); - idx = rsm->r_rtr_cnt - 1; - ts_low = rsm->r_tim_lastsent[idx]; - thresh = rack_calc_thresh_rack(rack, srtt, tsused); - if ((tsused == ts_low) || - (TSTMP_LT(tsused, ts_low))) { - /* No time since sending */ - return (NULL); +static int +rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, + int32_t nxt_pkt, struct timeval *tv) +{ +#ifdef TCP_ACCOUNTING + uint64_t ts_val; +#endif + int32_t thflags, retval, did_out = 0; + int32_t way_out = 0; + uint32_t cts; + uint32_t tiwin; + struct timespec ts; + struct tcpopt to; + struct tcp_rack *rack; + struct rack_sendmap *rsm; + int32_t prev_state = 0; +#ifdef TCP_ACCOUNTING + int ack_val_set = 0xf; +#endif + uint32_t us_cts; + /* + * tv passed from common code is from either M_TSTMP_LRO or + * tcp_get_usecs() if no LRO m_pkthdr timestamp is present. + */ + if (m->m_flags & M_ACKCMP) { + return (rack_do_compressed_ack_processing(tp, so, m, nxt_pkt, tv)); } - if ((tsused - ts_low) < thresh) { - /* It has not been long enough yet */ - return (NULL); + if (m->m_flags & M_ACKCMP) { + panic("Impossible reach m has ackcmp? 
m:%p tp:%p", m, tp); } - if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || - ((rsm->r_flags & RACK_SACK_PASSED) && - (rack->sack_attack_disable == 0))) { + counter_u64_add(rack_proc_non_comp_ack, 1); + thflags = th->th_flags; +#ifdef TCP_ACCOUNTING + sched_pin(); + if (thflags & TH_ACK) + ts_val = get_cyclecount(); +#endif + cts = tcp_tv_to_usectick(tv); + rack = (struct tcp_rack *)tp->t_fb_ptr; + + if ((m->m_flags & M_TSTMP) || + (m->m_flags & M_TSTMP_LRO)) { + mbuf_tstmp2timespec(m, &ts); + rack->r_ctl.act_rcv_time.tv_sec = ts.tv_sec; + rack->r_ctl.act_rcv_time.tv_usec = ts.tv_nsec/1000; + } else + rack->r_ctl.act_rcv_time = *tv; + kern_prefetch(rack, &prev_state); + prev_state = 0; + /* + * Unscale the window into a 32-bit value. For the SYN_SENT state + * the scale is zero. + */ + tiwin = th->th_win << tp->snd_scale; + /* + * Parse options on any incoming segment. + */ + memset(&to, 0, sizeof(to)); + tcp_dooptions(&to, (u_char *)(th + 1), + (th->th_off << 2) - sizeof(struct tcphdr), + (thflags & TH_SYN) ? TO_SYN : 0); +#ifdef TCP_ACCOUNTING + if (thflags & TH_ACK) { /* - * We have passed the dup-ack threshold - * a SACK has indicated this is missing. - * Note that if you are a declared attacker - * it is only the dup-ack threshold that - * will cause retransmits. + * We have a tradeoff here. We can either do what we are + * doing i.e. pinning to this CPU and then doing the accounting + * we could do a critical enter, setup the rdtsc and cpu + * as in below, and then validate we are on the same CPU on + * exit. I have choosen to not do the critical enter since + * that often will gain you a context switch, and instead lock + * us (line above this if) to the same CPU with sched_pin(). This + * means we may be context switched out for a higher priority + * interupt but we won't be moved to another CPU. + * + * If this occurs (which it won't very often since we most likely + * are running this code in interupt context and only a higher + * priority will bump us ... clock?) we will falsely add in + * to the time the interupt processing time plus the ack processing + * time. This is ok since its a rare event. 
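The accounting bracket explained above samples a cycle counter on entry (ts_val) and again at each exit, charging the difference to the bucket for whatever ack disposition was being handled; the rdstc > ts_val guard simply drops samples where the counter appears to have gone backwards. A reduced sketch of that pattern, with the bucket count and structure names chosen for the example:

#include <stdint.h>

#define ACK_BUCKETS     4       /* one bucket per ack disposition */

struct ack_cycles {
        uint64_t proc_cycles[ACK_BUCKETS];
};

/* Charge elapsed cycles to one disposition bucket, skipping bogus deltas. */
static void
charge_cycles(struct ack_cycles *acct, int bucket,
    uint64_t ts_entry, uint64_t ts_exit)
{
        if (bucket < 0 || bucket >= ACK_BUCKETS)
                return;
        if (ts_exit > ts_entry)
                acct->proc_cycles[bucket] += ts_exit - ts_entry;
}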
*/ - /* log retransmit reason */ - rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); - return (rsm); + ack_val_set = tcp_do_ack_accounting(tp, th, &to, tiwin, + ctf_fixed_maxseg(tp)); } - return (NULL); -} - -static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, - uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, - int line, struct rack_sendmap *rsm) -{ - if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { +#endif + NET_EPOCH_ASSERT(); + INP_WLOCK_ASSERT(tp->t_inpcb); + KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", + __func__)); + KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", + __func__)); + if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; + struct timeval ltv; +#ifdef NETFLIX_HTTP_LOGGING + struct http_sendfile_track *http_req; - memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = slot; - log.u_bbr.flex2 = len; - log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; - log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; - log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; - log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; - log.u_bbr.use_lt_bw = rack->app_limited_needs_set; - log.u_bbr.use_lt_bw <<= 1; - log.u_bbr.use_lt_bw = rack->rc_gp_filled; - log.u_bbr.use_lt_bw <<= 1; - log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; - log.u_bbr.use_lt_bw <<= 1; - log.u_bbr.use_lt_bw |= rack->in_probe_rtt; - log.u_bbr.pkt_epoch = line; - log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; - log.u_bbr.bw_inuse = bw_est; - log.u_bbr.delRate = bw; - if (rack->r_ctl.gp_bw == 0) - log.u_bbr.cur_del_rate = 0; - else - log.u_bbr.cur_del_rate = rack_get_bw(rack); - log.u_bbr.rttProp = len_time; - log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; - log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; - log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); - if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { - /* We are in slow start */ - log.u_bbr.flex7 = 1; + if (SEQ_GT(th->th_ack, tp->snd_una)) { + http_req = tcp_http_find_req_for_seq(tp, (th->th_ack-1)); } else { - /* we are on congestion avoidance */ - log.u_bbr.flex7 = 0; + http_req = tcp_http_find_req_for_seq(tp, th->th_ack); } - log.u_bbr.flex8 = method; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; - log.u_bbr.cwnd_gain <<= 1; - log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; - log.u_bbr.cwnd_gain <<= 1; - log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; - TCP_LOG_EVENTP(rack->rc_tp, NULL, - &rack->rc_inp->inp_socket->so_rcv, - &rack->rc_inp->inp_socket->so_snd, - BBR_LOG_HPTSI_CALC, 0, - 0, &log, false, &tv); +#endif + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + if (rack->rack_no_prr == 0) + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + else + log.u_bbr.flex1 = 0; + log.u_bbr.use_lt_bw = rack->r_ent_rec_ns; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_might_revert; + log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + log.u_bbr.flex3 = m->m_flags; + log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.lost = thflags; + log.u_bbr.pacing_gain = 0x1; +#ifdef TCP_ACCOUNTING + log.u_bbr.cwnd_gain = ack_val_set; +#endif + log.u_bbr.flex7 = 2; + if (m->m_flags & M_TSTMP) 
{ + /* Record the hardware timestamp if present */ + mbuf_tstmp2timespec(m, &ts); + ltv.tv_sec = ts.tv_sec; + ltv.tv_usec = ts.tv_nsec / 1000; + log.u_bbr.lt_epoch = tcp_tv_to_usectick(<v); + } else if (m->m_flags & M_TSTMP_LRO) { + /* Record the LRO the arrival timestamp */ + mbuf_tstmp2timespec(m, &ts); + ltv.tv_sec = ts.tv_sec; + ltv.tv_usec = ts.tv_nsec / 1000; + log.u_bbr.flex5 = tcp_tv_to_usectick(<v); + } + log.u_bbr.timeStamp = tcp_get_usecs(<v); + /* Log the rcv time */ + log.u_bbr.delRate = m->m_pkthdr.rcv_tstmp; +#ifdef NETFLIX_HTTP_LOGGING + log.u_bbr.applimited = tp->t_http_closed; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_open; + log.u_bbr.applimited <<= 8; + log.u_bbr.applimited |= tp->t_http_req; + if (http_req) { + /* Copy out any client req info */ + /* seconds */ + log.u_bbr.pkt_epoch = (http_req->localtime / HPTS_USEC_IN_SEC); + /* useconds */ + log.u_bbr.delivered = (http_req->localtime % HPTS_USEC_IN_SEC); + log.u_bbr.rttProp = http_req->timestamp; + log.u_bbr.cur_del_rate = http_req->start; + if (http_req->flags & TCP_HTTP_TRACK_FLG_OPEN) { + log.u_bbr.flex8 |= 1; + } else { + log.u_bbr.flex8 |= 2; + log.u_bbr.bw_inuse = http_req->end; + } + log.u_bbr.flex6 = http_req->start_seq; + if (http_req->flags & TCP_HTTP_TRACK_FLG_COMP) { + log.u_bbr.flex8 |= 4; + log.u_bbr.epoch = http_req->end_seq; + } + } +#endif + TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, + tlen, &log, true, <v); } -} - -static uint32_t -rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) -{ - uint32_t new_tso, user_max; - - user_max = rack->rc_user_set_max_segs * mss; - if (rack->rc_force_max_seg) { - return (user_max); + if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { + way_out = 4; + retval = 0; + goto done_with_input; } - if (rack->use_fixed_rate && - ((rack->r_ctl.crte == NULL) || - (bw != rack->r_ctl.crte->rate))) { - /* Use the user mss since we are not exactly matched */ - return (user_max); + /* + * If a segment with the ACK-bit set arrives in the SYN-SENT state + * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. + */ + if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && + (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { + tcp_log_end_status(tp, TCP_EI_STATUS_RST_IN_FRONT); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (1); } - new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); - if (new_tso > user_max) - new_tso = user_max; - return(new_tso); -} -static void -rack_log_hdwr_pacing(struct tcp_rack *rack, const struct ifnet *ifp, - uint64_t rate, uint64_t hw_rate, int line, - int error) -{ - if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { - union tcp_log_stackspecific log; - struct timeval tv; + /* + * Parse options on any incoming segment. + */ + tcp_dooptions(&to, (u_char *)(th + 1), + (th->th_off << 2) - sizeof(struct tcphdr), + (thflags & TH_SYN) ? 
TO_SYN : 0); - memset(&log, 0, sizeof(log)); - log.u_bbr.flex1 = ((hw_rate >> 32) & 0x00000000ffffffff); - log.u_bbr.flex2 = (hw_rate & 0x00000000ffffffff); - log.u_bbr.flex3 = (((uint64_t)ifp >> 32) & 0x00000000ffffffff); - log.u_bbr.flex4 = ((uint64_t)ifp & 0x00000000ffffffff); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.bw_inuse = rate; - log.u_bbr.flex5 = line; - log.u_bbr.flex6 = error; - log.u_bbr.applimited = rack->r_ctl.rc_pace_max_segs; - log.u_bbr.flex8 = rack->use_fixed_rate; - log.u_bbr.flex8 <<= 1; - log.u_bbr.flex8 |= rack->rack_hdrw_pacing; - log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; - TCP_LOG_EVENTP(rack->rc_tp, NULL, - &rack->rc_inp->inp_socket->so_rcv, - &rack->rc_inp->inp_socket->so_snd, - BBR_LOG_HDWR_PACE, 0, - 0, &log, false, &tv); + /* + * If timestamps were negotiated during SYN/ACK and a + * segment without a timestamp is received, silently drop + * the segment, unless it is a RST segment or missing timestamps are + * tolerated. + * See section 3.2 of RFC 7323. + */ + if ((tp->t_flags & TF_RCVD_TSTMP) && !(to.to_flags & TOF_TS) && + ((thflags & TH_RST) == 0) && (V_tcp_tolerate_missing_ts == 0)) { + way_out = 5; + retval = 0; + goto done_with_input; } -} - -static int32_t -pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz) -{ - uint64_t lentim, fill_bw; - /* Lets first see if we are full, if so continue with normal rate */ - if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) - return (slot); - if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) - return (slot); - if (rack->r_ctl.rc_last_us_rtt == 0) - return (slot); - if (rack->rc_pace_fill_if_rttin_range && - (rack->r_ctl.rc_last_us_rtt >= - (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { - /* The rtt is huge, N * smallest, lets not fill */ - return (slot); + /* + * Segment received on connection. Reset idle time and keep-alive + * timer. XXX: This should be done after segment validation to + * ignore broken/spoofed segs. + */ + if (tp->t_idle_reduce && + (tp->snd_max == tp->snd_una) && + ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { + counter_u64_add(rack_input_idle_reduces, 1); + rack_cc_after_idle(rack, tp); } + tp->t_rcvtime = ticks; +#ifdef STATS + stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); +#endif + if (tiwin > rack->r_ctl.rc_high_rwnd) + rack->r_ctl.rc_high_rwnd = tiwin; /* - * first lets calculate the b/w based on the last us-rtt - * and the sndwnd. + * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move + * this to occur after we've validated the segment. */ - fill_bw = rack->r_ctl.cwnd_to_use; - /* Take the rwnd if its smaller */ - if (fill_bw > rack->rc_tp->snd_wnd) - fill_bw = rack->rc_tp->snd_wnd; - fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; - fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; - /* We are below the min b/w */ - if (fill_bw < RACK_MIN_BW) - return (slot); + if (tp->t_flags2 & TF2_ECN_PERMIT) { + if (thflags & TH_CWR) { + tp->t_flags2 &= ~TF2_ECN_SND_ECE; + tp->t_flags |= TF_ACKNOW; + } + switch (iptos & IPTOS_ECN_MASK) { + case IPTOS_ECN_CE: + tp->t_flags2 |= TF2_ECN_SND_ECE; + KMOD_TCPSTAT_INC(tcps_ecn_ce); + break; + case IPTOS_ECN_ECT0: + KMOD_TCPSTAT_INC(tcps_ecn_ect0); + break; + case IPTOS_ECN_ECT1: + KMOD_TCPSTAT_INC(tcps_ecn_ect1); + break; + } + + /* Process a packet differently from RFC3168. */ + cc_ecnpkt_handler(tp, th, iptos); + + /* Congestion experienced. 
*/ + if (thflags & TH_ECE) { + rack_cong_signal(tp, CC_ECN, th->th_ack); + } + } + /* - * Ok fill_bw holds our mythical b/w to fill the cwnd - * in a rtt, what does that time wise equate too? + * If echoed timestamp is later than the current time, fall back to + * non RFC1323 RTT calculation. Normalize timestamp if syncookies + * were used when this connection was established. */ - lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; - lentim /= fill_bw; - if (lentim < slot) { - rack_log_pacing_delay_calc(rack, len, slot, fill_bw, - 0, lentim, 12, __LINE__, NULL); - return ((int32_t)lentim); - } else - return (slot); -} + if ((to.to_flags & TOF_TS) && (to.to_tsecr != 0)) { + to.to_tsecr -= tp->ts_offset; + if (TSTMP_GT(to.to_tsecr, cts)) + to.to_tsecr = 0; + } -static int32_t -rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) -{ - struct rack_sendmap *lrsm; - int32_t slot = 0; - int err; + /* + * If its the first time in we need to take care of options and + * verify we can do SACK for rack! + */ + if (rack->r_state == 0) { + /* Should be init'd by rack_init() */ + KASSERT(rack->rc_inp != NULL, + ("%s: rack->rc_inp unexpectedly NULL", __func__)); + if (rack->rc_inp == NULL) { + rack->rc_inp = tp->t_inpcb; + } - if (rack->rc_always_pace == 0) { /* - * We use the most optimistic possible cwnd/srtt for - * sending calculations. This will make our - * calculation anticipate getting more through - * quicker then possible. But thats ok we don't want - * the peer to have a gap in data sending. + * Process options only when we get SYN/ACK back. The SYN + * case for incoming connections is handled in tcp_syncache. + * According to RFC1323 the window field in a SYN (i.e., a + * or ) segment itself is never scaled. XXX + * this is traditional behavior, may need to be cleaned up. */ - uint32_t srtt, cwnd, tr_perms = 0; - int32_t reduce = 0; - - old_method: - /* - * We keep no precise pacing with the old method - * instead we use the pacer to mitigate bursts. - */ - rack->r_ctl.rc_agg_delayed = 0; - rack->r_early = 0; - rack->r_late = 0; - rack->r_ctl.rc_agg_early = 0; - if (rack->r_ctl.rc_rack_min_rtt) - srtt = rack->r_ctl.rc_rack_min_rtt; - else - srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); - if (rack->r_ctl.rc_rack_largest_cwnd) - cwnd = rack->r_ctl.rc_rack_largest_cwnd; - else - cwnd = rack->r_ctl.cwnd_to_use; - tr_perms = cwnd / srtt; - if (tr_perms == 0) { - tr_perms = ctf_fixed_maxseg(tp); - } - /* - * Calculate how long this will take to drain, if - * the calculation comes out to zero, thats ok we - * will use send_a_lot to possibly spin around for - * more increasing tot_len_this_send to the point - * that its going to require a pace, or we hit the - * cwnd. Which in that case we are just waiting for - * a ACK. - */ - slot = len / tr_perms; - /* Now do we reduce the time so we don't run dry? 
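The pacing-delay arithmetic in these helpers reduces to one relation: the time to let len bytes drain at bw bytes per second is len times the microseconds in a second divided by bw, and the result only replaces the caller's slot when it is shorter. A sketch of that core calculation, with the constant written out instead of HPTS_USEC_IN_SEC:

#include <stdint.h>

#define USECS_PER_SECOND        1000000ULL

/* Microseconds needed to pace len bytes at bw bytes/second, capped at cur_slot. */
static uint32_t
pacing_slot_usecs(uint32_t len, uint64_t bw, uint32_t cur_slot)
{
        uint64_t lentim;

        if (bw == 0)
                return (cur_slot);      /* no usable estimate: keep the old delay */
        lentim = (uint64_t)len * USECS_PER_SECOND / bw;
        if (lentim < cur_slot)
                return ((uint32_t)lentim);
        return (cur_slot);
}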
*/ - if (slot && rack_slot_reduction) { - reduce = (slot / rack_slot_reduction); - if (reduce < slot) { - slot -= reduce; + if (tp->t_state == TCPS_SYN_SENT && (thflags & TH_SYN)) { + /* Handle parallel SYN for ECN */ + if (!(thflags & TH_ACK) && + ((thflags & (TH_CWR | TH_ECE)) == (TH_CWR | TH_ECE)) && + ((V_tcp_do_ecn == 1) || (V_tcp_do_ecn == 2))) { + tp->t_flags2 |= TF2_ECN_PERMIT; + tp->t_flags2 |= TF2_ECN_SND_ECE; + TCPSTAT_INC(tcps_ecn_shs); + } + if ((to.to_flags & TOF_SCALE) && + (tp->t_flags & TF_REQ_SCALE)) { + tp->t_flags |= TF_RCVD_SCALE; + tp->snd_scale = to.to_wscale; } else - slot = 0; - } - slot *= HPTS_USEC_IN_MSEC; - if (rsm == NULL) { + tp->t_flags &= ~TF_REQ_SCALE; /* - * We always consider ourselves app limited with old style - * that are not retransmits. This could be the initial - * measurement, but thats ok its all setup and specially - * handled. If another send leaks out, then that too will - * be mark app-limited. + * Initial send window. It will be updated with the + * next incoming segment to the scaled value. */ - lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { - rack->r_ctl.rc_first_appl = lrsm; - lrsm->r_flags |= RACK_APP_LIMITED; - rack->r_ctl.rc_app_limited_cnt++; + tp->snd_wnd = th->th_win; + rack_validate_fo_sendwin_up(tp, rack); + if ((to.to_flags & TOF_TS) && + (tp->t_flags & TF_REQ_TSTMP)) { + tp->t_flags |= TF_RCVD_TSTMP; + tp->ts_recent = to.to_tsval; + tp->ts_recent_age = cts; + } else + tp->t_flags &= ~TF_REQ_TSTMP; + if (to.to_flags & TOF_MSS) { + tcp_mss(tp, to.to_mss); } - } - rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); - } else { - uint64_t bw_est, res, lentim, rate_wanted; - uint32_t orig_val, srtt, segs, oh; + if ((tp->t_flags & TF_SACK_PERMIT) && + (to.to_flags & TOF_SACKPERM) == 0) + tp->t_flags &= ~TF_SACK_PERMIT; + if (IS_FASTOPEN(tp->t_flags)) { + if (to.to_flags & TOF_FASTOPEN) { + uint16_t mss; - if ((rack->r_rr_config == 1) && rsm) { - return (rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC); + if (to.to_flags & TOF_MSS) + mss = to.to_mss; + else + if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) + mss = TCP6_MSS; + else + mss = TCP_MSS; + tcp_fastopen_update_cache(tp, mss, + to.to_tfo_len, to.to_tfo_cookie); + } else + tcp_fastopen_disable_path(tp); + } } - if (rack->use_fixed_rate) { - rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); - } else if ((rack->r_ctl.init_rate == 0) && -#ifdef NETFLIX_PEAKRATE - (rack->rc_tp->t_maxpeakrate == 0) && + /* + * At this point we are at the initial call. Here we decide + * if we are doing RACK or not. We do this by seeing if + * TF_SACK_PERMIT is set and the sack-not-required is clear. + * The code now does do dup-ack counting so if you don't + * switch back you won't get rack & TLP, but you will still + * get this stack. + */ + + if ((rack_sack_not_required == 0) && + ((tp->t_flags & TF_SACK_PERMIT) == 0)) { + tcp_switch_back_to_default(tp); + (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, + tlen, iptos); +#ifdef TCP_ACCOUNTING + sched_unpin(); #endif - (rack->r_ctl.gp_bw == 0)) { - /* no way to yet do an estimate */ - bw_est = rate_wanted = 0; - } else { - bw_est = rack_get_bw(rack); - rate_wanted = rack_get_output_bw(rack, bw_est, rsm); - } - if ((bw_est == 0) || (rate_wanted == 0)) { - /* - * No way yet to make a b/w estimate or - * our raise is set incorrectly. 
- */ - goto old_method; + return (1); } - /* We need to account for all the overheads */ - segs = (len + segsiz - 1) / segsiz; + tcp_set_hpts(tp->t_inpcb); + sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); + } + if (thflags & TH_FIN) + tcp_log_end_status(tp, TCP_EI_STATUS_CLIENT_FIN); + us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + if ((rack->rc_gp_dyn_mul) && + (rack->use_fixed_rate == 0) && + (rack->rc_always_pace)) { + /* Check in on probertt */ + rack_check_probe_rtt(rack, us_cts); + } + if (rack->forced_ack) { + uint32_t us_rtt; + /* - * We need the diff between 1514 bytes (e-mtu with e-hdr) - * and how much data we put in each packet. Yes this - * means we may be off if we are larger than 1500 bytes - * or smaller. But this just makes us more conservative. + * A persist or keep-alive was forced out, update our + * min rtt time. Note we do not worry about lost + * retransmissions since KEEP-ALIVES and persists + * are usually way long on times of sending (though + * if we were really paranoid or worried we could + * at least use timestamps if available to validate). */ - if (ETHERNET_SEGMENT_SIZE > segsiz) - oh = ETHERNET_SEGMENT_SIZE - segsiz; - else - oh = 0; - segs *= oh; - lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; - res = lentim / rate_wanted; - slot = (uint32_t)res; - orig_val = rack->r_ctl.rc_pace_max_segs; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); - /* Did we change the TSO size, if so log it */ - if (rack->r_ctl.rc_pace_max_segs != orig_val) - rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); - if ((rack->rc_pace_to_cwnd) && - (rack->in_probe_rtt == 0) && - (IN_RECOVERY(rack->rc_tp->t_flags) == 0)) { + rack->forced_ack = 0; + us_rtt = us_cts - rack->r_ctl.forced_ack_ts; + if (us_rtt == 0) + us_rtt = 1; + rack_log_rtt_upd(tp, rack, us_rtt, 0, NULL, 3); + rack_apply_updated_usrtt(rack, us_rtt, us_cts); + } + /* + * This is the one exception case where we set the rack state + * always. All other times (timers etc) we must have a rack-state + * set (so we assure we have done the checks above for SACK). + */ + rack->r_ctl.rc_rcvtime = cts; + if (rack->r_state != tp->t_state) + rack_set_state(tp, rack); + if (SEQ_GT(th->th_ack, tp->snd_una) && + (rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree)) != NULL) + kern_prefetch(rsm, &prev_state); + prev_state = rack->r_state; + rack_clear_rate_sample(rack); + retval = (*rack->r_substate) (m, th, so, + tp, &to, drop_hdrlen, + tlen, tiwin, thflags, nxt_pkt, iptos); +#ifdef INVARIANTS + if ((retval == 0) && + (tp->t_inpcb == NULL)) { + panic("retval:%d tp:%p t_inpcb:NULL state:%d", + retval, tp, prev_state); + } +#endif + if (retval == 0) { + /* + * If retval is 1 the tcb is unlocked and most likely the tp + * is gone. + */ + INP_WLOCK_ASSERT(tp->t_inpcb); + if ((rack->rc_gp_dyn_mul) && + (rack->rc_always_pace) && + (rack->use_fixed_rate == 0) && + rack->in_probe_rtt && + (rack->r_ctl.rc_time_probertt_starts == 0)) { /* - * We want to pace at our rate *or* faster to - * fill the cwnd to the max if its not full. + * If we are going for target, lets recheck before + * we output. */ - slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz); + rack_check_probe_rtt(rack, us_cts); } - if ((rack->rc_inp->inp_route.ro_nh != NULL) && - (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { - if ((rack->rack_hdw_pace_ena) && - (rack->rack_hdrw_pacing == 0) && - (rack->rack_attempt_hdwr_pace == 0)) { - /* - * Lets attempt to turn on hardware pacing - * if we can. 
- */ - rack->rack_attempt_hdwr_pace = 1; - rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, - rack->rc_inp->inp_route.ro_nh->nh_ifp, - rate_wanted, - RS_PACING_GEQ, - &err, NULL); - if (rack->r_ctl.crte) { - rack->rack_hdrw_pacing = 1; - rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, segsiz, - 0, rack->r_ctl.crte, - NULL); - rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, - rate_wanted, rack->r_ctl.crte->rate, __LINE__, - err); - } - } else if (rack->rack_hdrw_pacing && - (rack->r_ctl.crte->rate != rate_wanted)) { - /* Do we need to adjust our rate? */ - const struct tcp_hwrate_limit_table *nrte; + if (rack->set_pacing_done_a_iw == 0) { + /* How much has been acked? */ + if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { + /* We have enough to set in the pacing segment size */ + rack->set_pacing_done_a_iw = 1; + rack_set_pace_segments(tp, rack, __LINE__, NULL); + } + } + tcp_rack_xmit_timer_commit(rack, tp); +#ifdef TCP_ACCOUNTING + /* + * If we set the ack_val_se to what ack processing we are doing + * we also want to track how many cycles we burned. Note + * the bits after tcp_output we let be "free". This is because + * we are also tracking the tcp_output times as well. Note the + * use of 0xf here since we only have 11 counter (0 - 0xa) and + * 0xf cannot be returned and is what we initialize it too to + * indicate we are not doing the tabulations. + */ + if (ack_val_set != 0xf) { + uint64_t crtsc; - nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, - rack->rc_tp, - rack->rc_inp->inp_route.ro_nh->nh_ifp, - rate_wanted, - RS_PACING_GEQ, - &err, NULL); - if (nrte == NULL) { - /* Lost the rate */ - rack->rack_hdrw_pacing = 0; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); - } else if (nrte != rack->r_ctl.crte) { - rack->r_ctl.crte = nrte; - rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(rack->rc_tp, rate_wanted, - segsiz, 0, - rack->r_ctl.crte, - NULL); - rack_log_hdwr_pacing(rack, rack->rc_inp->inp_route.ro_nh->nh_ifp, - rate_wanted, rack->r_ctl.crte->rate, __LINE__, - err); - } + crtsc = get_cyclecount(); + counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[ack_val_set] += (crtsc - ts_val); } } - if (rack_limit_time_with_srtt && - (rack->use_fixed_rate == 0) && -#ifdef NETFLIX_PEAKRATE - (rack->rc_tp->t_maxpeakrate == 0) && #endif - (rack->rack_hdrw_pacing == 0)) { + if (nxt_pkt == 0) { + if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { +do_output_now: + did_out = 1; + (void)tp->t_fb->tfb_tcp_output(tp); + } + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); + rack_free_trim(rack); + } + if ((nxt_pkt == 0) && + ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && + (SEQ_GT(tp->snd_max, tp->snd_una) || + (tp->t_flags & TF_DELACK) || + ((V_tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && + (tp->t_state <= TCPS_CLOSING)))) { + /* We could not send (probably in the hpts but stopped the timer earlier)? 
*/ + if ((tp->snd_max == tp->snd_una) && + ((tp->t_flags & TF_DELACK) == 0) && + (rack->rc_inp->inp_in_hpts) && + (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { + /* keep alive not needed if we are hptsi output yet */ + ; + } else { + int late = 0; + if (rack->rc_inp->inp_in_hpts) { + if (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) { + us_cts = tcp_get_usecs(NULL); + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { + rack->r_early = 1; + rack->r_ctl.rc_agg_early += (rack->r_ctl.rc_last_output_to - us_cts); + } else + late = 1; + rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; + } + tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); + } + if (late && (did_out == 0)) { + /* + * We are late in the sending + * and we did not call the output + * (this probably should not happen). + */ + goto do_output_now; + } + rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); + } + way_out = 1; + } else if (nxt_pkt == 0) { + /* Do we have the correct timer running? */ + rack_timer_audit(tp, rack, &so->so_snd); + way_out = 2; + } + done_with_input: + rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out, max(1, m->m_pkthdr.lro_nsegs)); + if (did_out) + rack->r_wanted_output = 0; +#ifdef INVARIANTS + if (tp->t_inpcb == NULL) { + panic("OP:%d retval:%d tp:%p t_inpcb:NULL state:%d", + did_out, + retval, tp, prev_state); + } +#endif +#ifdef TCP_ACCOUNTING + } else { + /* + * Track the time (see above). + */ + if (ack_val_set != 0xf) { + uint64_t crtsc; + + crtsc = get_cyclecount(); + counter_u64_add(tcp_proc_time[ack_val_set] , (crtsc - ts_val)); /* - * Sanity check, we do not allow the pacing delay - * to be longer than the SRTT of the path. If it is - * a slow path, then adding a packet should increase - * the RTT and compensate for this i.e. the srtt will - * be greater so the allowed pacing time will be greater. - * - * Note this restriction is not for where a peak rate - * is set, we are doing fixed pacing or hardware pacing. + * Note we *DO NOT* increment the per-tcb counters since + * in the else the TP may be gone!! */ - if (rack->rc_tp->t_srtt) - srtt = (TICKS_2_USEC(rack->rc_tp->t_srtt) >> TCP_RTT_SHIFT); - else - srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ - if (srtt < slot) { - rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); - slot = srtt; - } } - rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); +#endif } - if (slot) - counter_u64_add(rack_calc_nonzero, 1); - else - counter_u64_add(rack_calc_zero, 1); - return (slot); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (retval); } -static void -rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, - tcp_seq startseq, uint32_t sb_offset) +void +rack_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, + struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { - struct rack_sendmap *my_rsm = NULL; - struct rack_sendmap fe; + struct timeval tv; - if (tp->t_state < TCPS_ESTABLISHED) { + /* First lets see if we have old packets */ + if (tp->t_in_pkt) { + if (ctf_do_queued_segments(so, tp, 1)) { + m_freem(m); + return; + } + } + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be should we kassert instead? 
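Both the LRO branch here and the hardware-stamp branches earlier carry the arrival time as a 64-bit nanosecond count and split it the same way into seconds and microseconds:

#include <stdint.h>
#include <sys/time.h>

/* Split a nanosecond arrival stamp into a struct timeval. */
static struct timeval
nsec_to_timeval(uint64_t ns)
{
        struct timeval tv;

        tv.tv_sec = ns / 1000000000ULL;
        tv.tv_usec = (ns % 1000000000ULL) / 1000;
        return (tv);
}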
*/ + tcp_get_usecs(&tv); + } + if (rack_do_segment_nounlock(m, th, so, tp, + drop_hdrlen, tlen, iptos, 0, &tv) == 0) { + tcp_handle_wakeup(tp, so); + INP_WUNLOCK(tp->t_inpcb); + } +} + +struct rack_sendmap * +tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused) +{ + struct rack_sendmap *rsm = NULL; + int32_t idx; + uint32_t srtt = 0, thresh = 0, ts_low = 0; + + /* Return the next guy to be re-transmitted */ + if (RB_EMPTY(&rack->r_ctl.rc_mtree)) { + return (NULL); + } + if (tp->t_flags & TF_SENTFIN) { + /* retran the end FIN? */ + return (NULL); + } + /* ok lets look at this one */ + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm && ((rsm->r_flags & RACK_ACKED) == 0)) { + goto check_it; + } + rsm = rack_find_lowest_rsm(rack); + if (rsm == NULL) { + return (NULL); + } +check_it: + if (((rack->rc_tp->t_flags & TF_SACK_PERMIT) == 0) && + (rsm->r_dupack >= DUP_ACK_THRESHOLD)) { /* - * We don't start any measurements if we are - * not at least established. + * No sack so we automatically do the 3 strikes and + * retransmit (no rack timer would be started). */ - return; + + return (rsm); } - tp->t_flags |= TF_GPUTINPROG; - rack->r_ctl.rc_gp_lowrtt = 0xffffffff; - rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; - tp->gput_seq = startseq; - rack->app_limited_needs_set = 0; - if (rack->in_probe_rtt) - rack->measure_saw_probe_rtt = 1; - else if ((rack->measure_saw_probe_rtt) && - (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) - rack->measure_saw_probe_rtt = 0; - if (rack->rc_gp_filled) - tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); - else { - /* Special case initial measurement */ - rack->r_ctl.rc_gp_output_ts = tp->gput_ts = tcp_get_usecs(NULL); + if (rsm->r_flags & RACK_ACKED) { + return (NULL); } - /* - * We take a guess out into the future, - * if we have no measurement and no - * initial rate, we measure the first - * initial-windows worth of data to - * speed up getting some GP measurement and - * thus start pacing. - */ - if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { - rack->app_limited_needs_set = 1; - tp->gput_ack = startseq + max(rc_init_window(rack), - (MIN_GP_WIN * ctf_fixed_maxseg(tp))); - rack_log_pacing_delay_calc(rack, - tp->gput_seq, - tp->gput_ack, - 0, - tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, - 9, - __LINE__, NULL); - return; + if (((rsm->r_flags & RACK_SACK_PASSED) == 0) && + (rsm->r_dupack < DUP_ACK_THRESHOLD)) { + /* Its not yet ready */ + return (NULL); } - if (sb_offset) { + srtt = rack_grab_rtt(tp, rack); + idx = rsm->r_rtr_cnt - 1; + ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; + thresh = rack_calc_thresh_rack(rack, srtt, tsused); + if ((tsused == ts_low) || + (TSTMP_LT(tsused, ts_low))) { + /* No time since sending */ + return (NULL); + } + if ((tsused - ts_low) < thresh) { + /* It has not been long enough yet */ + return (NULL); + } + if ((rsm->r_dupack >= DUP_ACK_THRESHOLD) || + ((rsm->r_flags & RACK_SACK_PASSED) && + (rack->sack_attack_disable == 0))) { /* - * We are out somewhere in the sb - * can we use the already outstanding data? + * We have passed the dup-ack threshold + * a SACK has indicated this is missing. + * Note that if you are a declared attacker + * it is only the dup-ack threshold that + * will cause retransmits. 
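tcp_rack_output() above boils down to an eligibility test on the oldest unacked candidate: enough time must have passed since its last transmission, and either the dup-ack threshold was reached or a SACK for later data has passed over it (with the SACK trigger withheld while sack_attack_disable is set). Restated compactly; the parameters stand in for the rack_sendmap fields and the configured thresholds:

#include <stdbool.h>
#include <stdint.h>

static bool
rack_retransmit_eligible(uint32_t now, uint32_t last_sent, uint32_t thresh,
    uint8_t dupacks, uint8_t dupack_thresh, bool sack_passed,
    bool sack_trigger_allowed)
{
        if (now - last_sent < thresh)
                return (false);         /* not enough time since the last send */
        if (dupacks >= dupack_thresh)
                return (true);
        return (sack_passed && sack_trigger_allowed);
}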
*/ + /* log retransmit reason */ + rack_log_retran_reason(rack, rsm, (tsused - ts_low), thresh, 1); + rack->r_fast_output = 0; + return (rsm); + } + return (NULL); +} - if (rack->r_ctl.rc_app_limited_cnt == 0) { - /* - * Yes first one is good and in this case - * the tp->gput_ts is correctly set based on - * the last ack that arrived (no need to - * set things up when an ack comes in). - */ - my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); - if ((my_rsm == NULL) || - (my_rsm->r_rtr_cnt != 1)) { - /* retransmission? */ - goto use_latest; - } - } else { - if (rack->r_ctl.rc_first_appl == NULL) { - /* - * If rc_first_appl is NULL - * then the cnt should be 0. - * This is probably an error, maybe - * a KASSERT would be approprate. - */ - goto use_latest; - } - /* - * If we have a marker pointer to the last one that is - * app limited we can use that, but we need to set - * things up so that when it gets ack'ed we record - * the ack time (if its not already acked). - */ - rack->app_limited_needs_set = 1; - /* - * We want to get to the rsm that is either - * next with space i.e. over 1 MSS or the one - * after that (after the app-limited). - */ - my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, - rack->r_ctl.rc_first_appl); - if (my_rsm) { - if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) - /* Have to use the next one */ +static void +rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, + uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, + int line, struct rack_sendmap *rsm) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = len; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex4 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex5 = rack->r_ctl.rack_per_of_gp_ss; + log.u_bbr.flex6 = rack->r_ctl.rack_per_of_gp_ca; + log.u_bbr.use_lt_bw = rack->rc_ack_can_sendout_data; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_late; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->r_early; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->app_limited_needs_set; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->rc_gp_filled; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->measure_saw_probe_rtt; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->in_probe_rtt; + log.u_bbr.use_lt_bw <<= 1; + log.u_bbr.use_lt_bw |= rack->gp_ready; + log.u_bbr.pkt_epoch = line; + log.u_bbr.epoch = rack->r_ctl.rc_agg_delayed; + log.u_bbr.lt_epoch = rack->r_ctl.rc_agg_early; + log.u_bbr.applimited = rack->r_ctl.rack_per_of_gp_rec; + log.u_bbr.bw_inuse = bw_est; + log.u_bbr.delRate = bw; + if (rack->r_ctl.gp_bw == 0) + log.u_bbr.cur_del_rate = 0; + else + log.u_bbr.cur_del_rate = rack_get_bw(rack); + log.u_bbr.rttProp = len_time; + log.u_bbr.pkts_out = rack->r_ctl.rc_rack_min_rtt; + log.u_bbr.lost = rack->r_ctl.rc_probertt_sndmax_atexit; + log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); + if (rack->r_ctl.cwnd_to_use < rack->rc_tp->snd_ssthresh) { + /* We are in slow start */ + log.u_bbr.flex7 = 1; + } else { + /* we are on congestion avoidance */ + log.u_bbr.flex7 = 0; + } + log.u_bbr.flex8 = method; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.cwnd_gain = rack->rc_gp_saw_rec; + log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ss; 
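+ /*
+ * For illustration only (not part of the logic): the shift-and-or
+ * sequence above packs eight boolean state bits into use_lt_bw,
+ * most-significant-first, so rc_ack_can_sendout_data ends up in
+ * bit 7 and gp_ready in bit 0. A log consumer would unpack them
+ * roughly as:
+ *
+ *	gp_ready       = (use_lt_bw >> 0) & 1;
+ *	in_probe_rtt   = (use_lt_bw >> 1) & 1;
+ *	...
+ *	r_late         = (use_lt_bw >> 6) & 1;
+ *	ack_can_send   = (use_lt_bw >> 7) & 1;
+ *
+ * cwnd_gain is packed with the same idiom from the rec/ss/ca
+ * "saw" flags.
+ */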
+ log.u_bbr.cwnd_gain <<= 1; + log.u_bbr.cwnd_gain |= rack->rc_gp_saw_ca; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_HPTSI_CALC, 0, + 0, &log, false, &tv); + } +} + +static uint32_t +rack_get_pacing_len(struct tcp_rack *rack, uint64_t bw, uint32_t mss) +{ + uint32_t new_tso, user_max; + + user_max = rack->rc_user_set_max_segs * mss; + if (rack->rc_force_max_seg) { + return (user_max); + } + if (rack->use_fixed_rate && + ((rack->r_ctl.crte == NULL) || + (bw != rack->r_ctl.crte->rate))) { + /* Use the user mss since we are not exactly matched */ + return (user_max); + } + new_tso = tcp_get_pacing_burst_size(rack->rc_tp, bw, mss, rack_pace_one_seg, rack->r_ctl.crte, NULL); + if (new_tso > user_max) + new_tso = user_max; + return (new_tso); +} + +static int32_t +pace_to_fill_cwnd(struct tcp_rack *rack, int32_t slot, uint32_t len, uint32_t segsiz, int *capped, uint64_t *rate_wanted, uint8_t non_paced) +{ + uint64_t lentim, fill_bw; + + /* Lets first see if we are full, if so continue with normal rate */ + rack->r_via_fill_cw = 0; + if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) + return (slot); + if ((ctf_outstanding(rack->rc_tp) + (segsiz-1)) > rack->rc_tp->snd_wnd) + return (slot); + if (rack->r_ctl.rc_last_us_rtt == 0) + return (slot); + if (rack->rc_pace_fill_if_rttin_range && + (rack->r_ctl.rc_last_us_rtt >= + (get_filter_value_small(&rack->r_ctl.rc_gp_min_rtt) * rack->rtt_limit_mul))) { + /* The rtt is huge, N * smallest, lets not fill */ + return (slot); + } + /* + * first lets calculate the b/w based on the last us-rtt + * and the sndwnd. + */ + fill_bw = rack->r_ctl.cwnd_to_use; + /* Take the rwnd if its smaller */ + if (fill_bw > rack->rc_tp->snd_wnd) + fill_bw = rack->rc_tp->snd_wnd; + if (rack->r_fill_less_agg) { + /* + * Now take away the inflight (this will reduce our + * aggressiveness and yeah, if we get that much out in 1RTT + * we will have had acks come back and still be behind). + */ + fill_bw -= ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + } + /* Now lets make it into a b/w */ + fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; + fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; + /* We are below the min b/w */ + if (non_paced) + *rate_wanted = fill_bw; + if ((fill_bw < RACK_MIN_BW) || (fill_bw < *rate_wanted)) + return (slot); + if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) + fill_bw = rack->r_ctl.bw_rate_cap; + rack->r_via_fill_cw = 1; + if (rack->r_rack_hw_rate_caps && + (rack->r_ctl.crte != NULL)) { + uint64_t high_rate; + + high_rate = tcp_hw_highest_rate(rack->r_ctl.crte); + if (fill_bw > high_rate) { + /* We are capping bw at the highest rate table entry */ + if (*rate_wanted > high_rate) { + /* The original rate was also capped */ + rack->r_via_fill_cw = 0; + } + rack_log_hdwr_pacing(rack, + fill_bw, high_rate, __LINE__, + 0, 3); + fill_bw = high_rate; + if (capped) + *capped = 1; + } + } else if ((rack->r_ctl.crte == NULL) && + (rack->rack_hdrw_pacing == 0) && + (rack->rack_hdw_pace_ena) && + rack->r_rack_hw_rate_caps && + (rack->rack_attempt_hdwr_pace == 0) && + (rack->rc_inp->inp_route.ro_nh != NULL) && + (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { + /* + * Ok we may have a first attempt that is greater than our top rate + * lets check. 
+ */ + uint64_t high_rate; + + high_rate = tcp_hw_highest_rate_ifp(rack->rc_inp->inp_route.ro_nh->nh_ifp, rack->rc_inp); + if (high_rate) { + if (fill_bw > high_rate) { + fill_bw = high_rate; + if (capped) + *capped = 1; + } + } + } + /* + * Ok fill_bw holds our mythical b/w to fill the cwnd + * in a rtt, what does that time wise equate too? + */ + lentim = (uint64_t)(len) * (uint64_t)HPTS_USEC_IN_SEC; + lentim /= fill_bw; + *rate_wanted = fill_bw; + if (non_paced || (lentim < slot)) { + rack_log_pacing_delay_calc(rack, len, slot, fill_bw, + 0, lentim, 12, __LINE__, NULL); + return ((int32_t)lentim); + } else + return (slot); +} + +static int32_t +rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) +{ + struct rack_sendmap *lrsm; + int32_t slot = 0; + int can_start_hw_pacing = 1; + int err; + + if (rack->rc_always_pace == 0) { + /* + * We use the most optimistic possible cwnd/srtt for + * sending calculations. This will make our + * calculation anticipate getting more through + * quicker then possible. But thats ok we don't want + * the peer to have a gap in data sending. + */ + uint32_t srtt, cwnd, tr_perms = 0; + int32_t reduce = 0; + + old_method: + /* + * We keep no precise pacing with the old method + * instead we use the pacer to mitigate bursts. + */ + if (rack->r_ctl.rc_rack_min_rtt) + srtt = rack->r_ctl.rc_rack_min_rtt; + else + srtt = max(tp->t_srtt, 1); + if (rack->r_ctl.rc_rack_largest_cwnd) + cwnd = rack->r_ctl.rc_rack_largest_cwnd; + else + cwnd = rack->r_ctl.cwnd_to_use; + /* Inflate cwnd by 1000 so srtt of usecs is in ms */ + tr_perms = (cwnd * 1000) / srtt; + if (tr_perms == 0) { + tr_perms = ctf_fixed_maxseg(tp); + } + /* + * Calculate how long this will take to drain, if + * the calculation comes out to zero, thats ok we + * will use send_a_lot to possibly spin around for + * more increasing tot_len_this_send to the point + * that its going to require a pace, or we hit the + * cwnd. Which in that case we are just waiting for + * a ACK. + */ + slot = len / tr_perms; + /* Now do we reduce the time so we don't run dry? */ + if (slot && rack_slot_reduction) { + reduce = (slot / rack_slot_reduction); + if (reduce < slot) { + slot -= reduce; + } else + slot = 0; + } + slot *= HPTS_USEC_IN_MSEC; + if (rsm == NULL) { + /* + * We always consider ourselves app limited with old style + * that are not retransmits. This could be the initial + * measurement, but thats ok its all setup and specially + * handled. If another send leaks out, then that too will + * be mark app-limited. 
+ */ + lrsm = RB_MAX(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if (lrsm && ((lrsm->r_flags & RACK_APP_LIMITED) == 0)) { + rack->r_ctl.rc_first_appl = lrsm; + lrsm->r_flags |= RACK_APP_LIMITED; + rack->r_ctl.rc_app_limited_cnt++; + } + } + if (rack->rc_pace_to_cwnd) { + uint64_t rate_wanted = 0; + + slot = pace_to_fill_cwnd(rack, slot, len, segsiz, NULL, &rate_wanted, 1); + rack->rc_ack_can_sendout_data = 1; + rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, 0, 0, 14, __LINE__, NULL); + } else + rack_log_pacing_delay_calc(rack, len, slot, tr_perms, reduce, 0, 7, __LINE__, NULL); + } else { + uint64_t bw_est, res, lentim, rate_wanted; + uint32_t orig_val, srtt, segs, oh; + int capped = 0; + int prev_fill; + + if ((rack->r_rr_config == 1) && rsm) { + return (rack->r_ctl.rc_min_to); + } + if (rack->use_fixed_rate) { + rate_wanted = bw_est = rack_get_fixed_pacing_bw(rack); + } else if ((rack->r_ctl.init_rate == 0) && +#ifdef NETFLIX_PEAKRATE + (rack->rc_tp->t_maxpeakrate == 0) && +#endif + (rack->r_ctl.gp_bw == 0)) { + /* no way to yet do an estimate */ + bw_est = rate_wanted = 0; + } else { + bw_est = rack_get_bw(rack); + rate_wanted = rack_get_output_bw(rack, bw_est, rsm, &capped); + } + if ((bw_est == 0) || (rate_wanted == 0) || + ((rack->gp_ready == 0) && (rack->use_fixed_rate == 0))) { + /* + * No way yet to make a b/w estimate or + * our raise is set incorrectly. + */ + goto old_method; + } + /* We need to account for all the overheads */ + segs = (len + segsiz - 1) / segsiz; + /* + * We need the diff between 1514 bytes (e-mtu with e-hdr) + * and how much data we put in each packet. Yes this + * means we may be off if we are larger than 1500 bytes + * or smaller. But this just makes us more conservative. + */ + if (rack_hw_rate_min && + (bw_est < rack_hw_rate_min)) + can_start_hw_pacing = 0; + if (ETHERNET_SEGMENT_SIZE > segsiz) + oh = ETHERNET_SEGMENT_SIZE - segsiz; + else + oh = 0; + segs *= oh; + lentim = (uint64_t)(len + segs) * (uint64_t)HPTS_USEC_IN_SEC; + res = lentim / rate_wanted; + slot = (uint32_t)res; + orig_val = rack->r_ctl.rc_pace_max_segs; + if (rack->r_ctl.crte == NULL) { + /* + * Only do this if we are not hardware pacing + * since if we are doing hw-pacing below we will + * set make a call after setting up or changing + * the rate. + */ + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); + } else if (rack->rc_inp->inp_snd_tag == NULL) { + /* + * We lost our rate somehow, this can happen + * if the interface changed underneath us. + */ + tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); + rack->r_ctl.crte = NULL; + /* Lets re-allow attempting to setup pacing */ + rack->rack_hdrw_pacing = 0; + rack->rack_attempt_hdwr_pace = 0; + rack_log_hdwr_pacing(rack, + rate_wanted, bw_est, __LINE__, + 0, 6); + } + /* Did we change the TSO size, if so log it */ + if (rack->r_ctl.rc_pace_max_segs != orig_val) + rack_log_pacing_delay_calc(rack, len, slot, orig_val, 0, 0, 15, __LINE__, NULL); + prev_fill = rack->r_via_fill_cw; + if ((rack->rc_pace_to_cwnd) && + (capped == 0) && + (rack->use_fixed_rate == 0) && + (rack->in_probe_rtt == 0) && + (IN_FASTRECOVERY(rack->rc_tp->t_flags) == 0)) { + /* + * We want to pace at our rate *or* faster to + * fill the cwnd to the max if its not full. 
+ */ + slot = pace_to_fill_cwnd(rack, slot, (len+segs), segsiz, &capped, &rate_wanted, 0); + } + if ((rack->rc_inp->inp_route.ro_nh != NULL) && + (rack->rc_inp->inp_route.ro_nh->nh_ifp != NULL)) { + if ((rack->rack_hdw_pace_ena) && + (can_start_hw_pacing > 0) && + (rack->rack_hdrw_pacing == 0) && + (rack->rack_attempt_hdwr_pace == 0)) { + /* + * Lets attempt to turn on hardware pacing + * if we can. + */ + rack->rack_attempt_hdwr_pace = 1; + rack->r_ctl.crte = tcp_set_pacing_rate(rack->rc_tp, + rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, + RS_PACING_GEQ, + &err, &rack->r_ctl.crte_prev_rate); + if (rack->r_ctl.crte) { + rack->rack_hdrw_pacing = 1; + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, segsiz, + 0, rack->r_ctl.crte, + NULL); + rack_log_hdwr_pacing(rack, + rate_wanted, rack->r_ctl.crte->rate, __LINE__, + err, 0); + rack->r_ctl.last_hw_bw_req = rate_wanted; + } else { + counter_u64_add(rack_hw_pace_init_fail, 1); + } + } else if (rack->rack_hdrw_pacing && + (rack->r_ctl.last_hw_bw_req != rate_wanted)) { + /* Do we need to adjust our rate? */ + const struct tcp_hwrate_limit_table *nrte; + + if (rack->r_up_only && + (rate_wanted < rack->r_ctl.crte->rate)) { + /** + * We have four possible states here + * having to do with the previous time + * and this time. + * previous | this-time + * A) 0 | 0 -- fill_cw not in the picture + * B) 1 | 0 -- we were doing a fill-cw but now are not + * C) 1 | 1 -- all rates from fill_cw + * D) 0 | 1 -- we were doing non-fill and now we are filling + * + * For case A, C and D we don't allow a drop. But for + * case B where we now our on our steady rate we do + * allow a drop. + * + */ + if (!((prev_fill == 1) && (rack->r_via_fill_cw == 0))) + goto done_w_hdwr; + } + if ((rate_wanted > rack->r_ctl.crte->rate) || + (rate_wanted <= rack->r_ctl.crte_prev_rate)) { + if (rack_hw_rate_to_low && + (bw_est < rack_hw_rate_to_low)) { + /* + * The pacing rate is too low for hardware, but + * do allow hardware pacing to be restarted. + */ + rack_log_hdwr_pacing(rack, + bw_est, rack->r_ctl.crte->rate, __LINE__, + 0, 5); + tcp_rel_pacing_rate(rack->r_ctl.crte, rack->rc_tp); + rack->r_ctl.crte = NULL; + rack->rack_attempt_hdwr_pace = 0; + rack->rack_hdrw_pacing = 0; + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); + goto done_w_hdwr; + } + nrte = tcp_chg_pacing_rate(rack->r_ctl.crte, + rack->rc_tp, + rack->rc_inp->inp_route.ro_nh->nh_ifp, + rate_wanted, + RS_PACING_GEQ, + &err, &rack->r_ctl.crte_prev_rate); + if (nrte == NULL) { + /* Lost the rate */ + rack->rack_hdrw_pacing = 0; + rack->r_ctl.crte = NULL; + rack_log_hdwr_pacing(rack, + rate_wanted, 0, __LINE__, + err, 1); + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); + counter_u64_add(rack_hw_pace_lost, 1); + } else if (nrte != rack->r_ctl.crte) { + rack->r_ctl.crte = nrte; + rack->r_ctl.rc_pace_max_segs = tcp_get_pacing_burst_size(tp, rate_wanted, + segsiz, 0, + rack->r_ctl.crte, + NULL); + rack_log_hdwr_pacing(rack, + rate_wanted, rack->r_ctl.crte->rate, __LINE__, + err, 2); + rack->r_ctl.last_hw_bw_req = rate_wanted; + } + } else { + /* We just need to adjust the segment size */ + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, &rate_wanted); + rack_log_hdwr_pacing(rack, + rate_wanted, rack->r_ctl.crte->rate, __LINE__, + 0, 4); + rack->r_ctl.last_hw_bw_req = rate_wanted; + } + } + } + if ((rack->r_ctl.crte != NULL) && + (rack->r_ctl.crte->rate == rate_wanted)) { + /* + * We need to add a extra if the rates + * are exactly matched. 
The idea is + * we want the software to make sure the + * queue is empty before adding more, this + * gives us N MSS extra pace times where + * N is our sysctl + */ + slot += (rack->r_ctl.crte->time_between * rack_hw_pace_extra_slots); + } +done_w_hdwr: + if (rack_limit_time_with_srtt && + (rack->use_fixed_rate == 0) && +#ifdef NETFLIX_PEAKRATE + (rack->rc_tp->t_maxpeakrate == 0) && +#endif + (rack->rack_hdrw_pacing == 0)) { + /* + * Sanity check, we do not allow the pacing delay + * to be longer than the SRTT of the path. If it is + * a slow path, then adding a packet should increase + * the RTT and compensate for this i.e. the srtt will + * be greater so the allowed pacing time will be greater. + * + * Note this restriction is not for where a peak rate + * is set, we are doing fixed pacing or hardware pacing. + */ + if (rack->rc_tp->t_srtt) + srtt = rack->rc_tp->t_srtt; + else + srtt = RACK_INITIAL_RTO * HPTS_USEC_IN_MSEC; /* its in ms convert */ + if (srtt < slot) { + rack_log_pacing_delay_calc(rack, srtt, slot, rate_wanted, bw_est, lentim, 99, __LINE__, NULL); + slot = srtt; + } + } + rack_log_pacing_delay_calc(rack, len, slot, rate_wanted, bw_est, lentim, 2, __LINE__, rsm); + } + if (rack->r_ctl.crte && (rack->r_ctl.crte->rs_num_enobufs > 0)) { + /* + * If this rate is seeing enobufs when it + * goes to send then either the nic is out + * of gas or we are mis-estimating the time + * somehow and not letting the queue empty + * completely. Lets add to the pacing time. + */ + int hw_boost_delay; + + hw_boost_delay = rack->r_ctl.crte->time_between * rack_enobuf_hw_boost_mult; + if (hw_boost_delay > rack_enobuf_hw_max) + hw_boost_delay = rack_enobuf_hw_max; + else if (hw_boost_delay < rack_enobuf_hw_min) + hw_boost_delay = rack_enobuf_hw_min; + slot += hw_boost_delay; + } + if (slot) + counter_u64_add(rack_calc_nonzero, 1); + else + counter_u64_add(rack_calc_zero, 1); + return (slot); +} + +static void +rack_start_gp_measurement(struct tcpcb *tp, struct tcp_rack *rack, + tcp_seq startseq, uint32_t sb_offset) +{ + struct rack_sendmap *my_rsm = NULL; + struct rack_sendmap fe; + + if (tp->t_state < TCPS_ESTABLISHED) { + /* + * We don't start any measurements if we are + * not at least established. + */ + return; + } + tp->t_flags |= TF_GPUTINPROG; + rack->r_ctl.rc_gp_lowrtt = 0xffffffff; + rack->r_ctl.rc_gp_high_rwnd = rack->rc_tp->snd_wnd; + tp->gput_seq = startseq; + rack->app_limited_needs_set = 0; + if (rack->in_probe_rtt) + rack->measure_saw_probe_rtt = 1; + else if ((rack->measure_saw_probe_rtt) && + (SEQ_GEQ(tp->gput_seq, rack->r_ctl.rc_probertt_sndmax_atexit))) + rack->measure_saw_probe_rtt = 0; + if (rack->rc_gp_filled) + tp->gput_ts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); + else { + /* Special case initial measurement */ + struct timeval tv; + + tp->gput_ts = tcp_get_usecs(&tv); + rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); + } + /* + * We take a guess out into the future, + * if we have no measurement and no + * initial rate, we measure the first + * initial-windows worth of data to + * speed up getting some GP measurement and + * thus start pacing. 
+ */ + if ((rack->rc_gp_filled == 0) && (rack->r_ctl.init_rate == 0)) { + rack->app_limited_needs_set = 1; + tp->gput_ack = startseq + max(rc_init_window(rack), + (MIN_GP_WIN * ctf_fixed_maxseg(tp))); + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + 0, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, + __LINE__, NULL); + return; + } + if (sb_offset) { + /* + * We are out somewhere in the sb + * can we use the already outstanding data? + */ + + if (rack->r_ctl.rc_app_limited_cnt == 0) { + /* + * Yes first one is good and in this case + * the tp->gput_ts is correctly set based on + * the last ack that arrived (no need to + * set things up when an ack comes in). + */ + my_rsm = RB_MIN(rack_rb_tree_head, &rack->r_ctl.rc_mtree); + if ((my_rsm == NULL) || + (my_rsm->r_rtr_cnt != 1)) { + /* retransmission? */ + goto use_latest; + } + } else { + if (rack->r_ctl.rc_first_appl == NULL) { + /* + * If rc_first_appl is NULL + * then the cnt should be 0. + * This is probably an error, maybe + * a KASSERT would be approprate. + */ + goto use_latest; + } + /* + * If we have a marker pointer to the last one that is + * app limited we can use that, but we need to set + * things up so that when it gets ack'ed we record + * the ack time (if its not already acked). + */ + rack->app_limited_needs_set = 1; + /* + * We want to get to the rsm that is either + * next with space i.e. over 1 MSS or the one + * after that (after the app-limited). + */ + my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, + rack->r_ctl.rc_first_appl); + if (my_rsm) { + if ((my_rsm->r_end - my_rsm->r_start) <= ctf_fixed_maxseg(tp)) + /* Have to use the next one */ my_rsm = RB_NEXT(rack_rb_tree_head, &rack->r_ctl.rc_mtree, my_rsm); else { @@ -11754,121 +14660,1231 @@ goto start_set; } } - if ((my_rsm == NULL) || - (my_rsm->r_rtr_cnt != 1)) { - /* - * Either its a retransmit or - * the last is the app-limited one. - */ - goto use_latest; + if ((my_rsm == NULL) || + (my_rsm->r_rtr_cnt != 1)) { + /* + * Either its a retransmit or + * the last is the app-limited one. + */ + goto use_latest; + } + } + tp->gput_seq = my_rsm->r_start; +start_set: + if (my_rsm->r_flags & RACK_ACKED) { + /* + * This one has been acked use the arrival ack time + */ + tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; + rack->app_limited_needs_set = 0; + } + rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; + tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + (uint64_t)my_rsm, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, + __LINE__, NULL); + return; + } + +use_latest: + /* + * We don't know how long we may have been + * idle or if this is the first-send. Lets + * setup the flag so we will trim off + * the first ack'd data so we get a true + * measurement. + */ + rack->app_limited_needs_set = 1; + tp->gput_ack = startseq + rack_get_measure_window(tp, rack); + /* Find this guy so we can pull the send time */ + fe.r_start = startseq; + my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); + if (my_rsm) { + rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[(my_rsm->r_rtr_cnt-1)]; + if (my_rsm->r_flags & RACK_ACKED) { + /* + * Unlikely since its probably what was + * just transmitted (but I am paranoid). 
+ */ + tp->gput_ts = (uint32_t)my_rsm->r_ack_arrival; + rack->app_limited_needs_set = 0; + } + if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { + /* This also is unlikely */ + tp->gput_seq = my_rsm->r_start; + } + } else { + /* + * TSNH unless we have some send-map limit, + * and even at that it should not be hitting + * that limit (we should have stopped sending). + */ + struct timeval tv; + + microuptime(&tv); + rack->r_ctl.rc_gp_output_ts = rack_to_usec_ts(&tv); + } + rack_log_pacing_delay_calc(rack, + tp->gput_seq, + tp->gput_ack, + (uint64_t)my_rsm, + tp->gput_ts, + rack->r_ctl.rc_app_limited_cnt, + 9, __LINE__, NULL); +} + +static inline uint32_t +rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, + uint32_t avail, int32_t sb_offset) +{ + uint32_t len; + uint32_t sendwin; + + if (tp->snd_wnd > cwnd_to_use) + sendwin = cwnd_to_use; + else + sendwin = tp->snd_wnd; + if (ctf_outstanding(tp) >= tp->snd_wnd) { + /* We never want to go over our peers rcv-window */ + len = 0; + } else { + uint32_t flight; + + flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); + if (flight >= sendwin) { + /* + * We have in flight what we are allowed by cwnd (if + * it was rwnd blocking it would have hit above out + * >= tp->snd_wnd). + */ + return (0); + } + len = sendwin - flight; + if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { + /* We would send too much (beyond the rwnd) */ + len = tp->snd_wnd - ctf_outstanding(tp); + } + if ((len + sb_offset) > avail) { + /* + * We don't have that much in the SB, how much is + * there? + */ + len = avail - sb_offset; + } + } + return (len); +} + +static void +rack_log_fsb(struct tcp_rack *rack, struct tcpcb *tp, struct socket *so, uint32_t flags, + unsigned ipoptlen, int32_t orig_len, int32_t len, int error, + int rsm_is_null, int optlen, int line, uint16_t mode) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex1 = error; + log.u_bbr.flex2 = flags; + log.u_bbr.flex3 = rsm_is_null; + log.u_bbr.flex4 = ipoptlen; + log.u_bbr.flex5 = tp->rcv_numsacks; + log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; + log.u_bbr.flex7 = optlen; + log.u_bbr.flex8 = rack->r_fsb_inited; + log.u_bbr.applimited = rack->r_fast_output; + log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); + log.u_bbr.cwnd_gain = mode; + log.u_bbr.pkts_out = orig_len; + log.u_bbr.lt_epoch = len; + log.u_bbr.delivered = line; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + tcp_log_event_(tp, NULL, &so->so_rcv, &so->so_snd, TCP_LOG_FSB, 0, + len, &log, false, NULL, NULL, 0, &tv); + } +} + + +static struct mbuf * +rack_fo_base_copym(struct mbuf *the_m, uint32_t the_off, int32_t *plen, + struct rack_fast_send_blk *fsb, + int32_t seglimit, int32_t segsize) +{ +#ifdef KERN_TLS + struct ktls_session *tls, *ntls; + struct mbuf *start; +#endif + struct mbuf *m, *n, **np, *smb; + struct mbuf *top; + int32_t off, soff; + int32_t len = *plen; + int32_t fragsize; + int32_t len_cp = 0; + uint32_t mlen, frags; + + soff = off = the_off; + smb = m = the_m; + np = ⊤ + top = NULL; +#ifdef KERN_TLS + if (hw_tls && (m->m_flags & M_EXTPG)) + tls = m->m_epg_tls; + else + tls = NULL; + start = m; +#endif + while (len > 0) { + if (m == NULL) { + *plen = len_cp; + break; + } 
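+ /*
+ * Rough worked example of the seglimit accounting below (numbers
+ * are illustrative only): with segsize = 1448, seglimit = 10 and
+ * an M_EXTPG mbuf holding mlen = 8000 bytes, fragsize becomes
+ * min(1448, PAGE_SIZE) = 1448 and we pre-charge frags = 3 for the
+ * TLS hdr/trailer plus a possible page crossing. Since
+ * howmany(8000, 1448) = 6 and 3 + 6 = 9 < 10, nothing is clamped;
+ * we then consume 9 of the 10 segments, leaving seglimit = 1. On
+ * the next mbuf (frags + 1) >= 1 is always true, so the copy stops
+ * and *plen is trimmed to what actually fit.
+ */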
+#ifdef KERN_TLS + if (hw_tls) { + if (m->m_flags & M_EXTPG) + ntls = m->m_epg_tls; + else + ntls = NULL; + + /* + * Avoid mixing TLS records with handshake + * data or TLS records from different + * sessions. + */ + if (tls != ntls) { + MPASS(m != start); + *plen = len_cp; + break; + } + } +#endif + mlen = min(len, m->m_len - off); + if (seglimit) { + /* + * For M_EXTPG mbufs, add 3 segments + * + 1 in case we are crossing page boundaries + * + 2 in case the TLS hdr/trailer are used + * It is cheaper to just add the segments + * than it is to take the cache miss to look + * at the mbuf ext_pgs state in detail. + */ + if (m->m_flags & M_EXTPG) { + fragsize = min(segsize, PAGE_SIZE); + frags = 3; + } else { + fragsize = segsize; + frags = 0; + } + + /* Break if we really can't fit anymore. */ + if ((frags + 1) >= seglimit) { + *plen = len_cp; + break; + } + + /* + * Reduce size if you can't copy the whole + * mbuf. If we can't copy the whole mbuf, also + * adjust len so the loop will end after this + * mbuf. + */ + if ((frags + howmany(mlen, fragsize)) >= seglimit) { + mlen = (seglimit - frags - 1) * fragsize; + len = mlen; + *plen = len_cp + len; } + frags += howmany(mlen, fragsize); + if (frags == 0) + frags++; + seglimit -= frags; + KASSERT(seglimit > 0, + ("%s: seglimit went too low", __func__)); + } + n = m_get(M_NOWAIT, m->m_type); + *np = n; + if (n == NULL) + goto nospace; + n->m_len = mlen; + soff += mlen; + len_cp += n->m_len; + if (m->m_flags & (M_EXT|M_EXTPG)) { + n->m_data = m->m_data + off; + mb_dupcl(n, m); + } else { + bcopy(mtod(m, caddr_t)+off, mtod(n, caddr_t), + (u_int)n->m_len); + } + len -= n->m_len; + off = 0; + m = m->m_next; + np = &n->m_next; + if (len || (soff == smb->m_len)) { + /* + * We have more so we move forward or + * we have consumed the entire mbuf and + * len has fell to 0. + */ + soff = 0; + smb = m; } - tp->gput_seq = my_rsm->r_start; -start_set: - if (my_rsm->r_flags & RACK_ACKED) { + + } + if (fsb != NULL) { + fsb->m = smb; + fsb->off = soff; + if (smb) { + /* + * Save off the size of the mbuf. We do + * this so that we can recognize when it + * has been trimmed by sbcut() as acks + * come in. + */ + fsb->o_m_len = smb->m_len; + } else { + /* + * This is the case where the next mbuf went to NULL. This + * means with this copy we have sent everything in the sb. + * In theory we could clear the fast_output flag, but lets + * not since its possible that we could get more added + * and acks that call the extend function which would let + * us send more. + */ + fsb->o_m_len = 0; + } + } + return (top); +nospace: + if (top) + m_freem(top); + return (NULL); + +} + +/* + * This is a copy of m_copym(), taking the TSO segment size/limit + * constraints into account, and advancing the sndptr as it goes. + */ +static struct mbuf * +rack_fo_m_copym(struct tcp_rack *rack, int32_t *plen, + int32_t seglimit, int32_t segsize, struct mbuf **s_mb, int *s_soff) +{ + struct mbuf *m, *n; + int32_t soff; + + soff = rack->r_ctl.fsb.off; + m = rack->r_ctl.fsb.m; + if (rack->r_ctl.fsb.o_m_len != m->m_len) { + /* + * The mbuf had the front of it chopped off by an ack + * we need to adjust the soff/off by that difference. 
+ */ + uint32_t delta; + + delta = rack->r_ctl.fsb.o_m_len - m->m_len; + soff -= delta; + } + KASSERT(soff >= 0, ("%s, negative off %d", __FUNCTION__, soff)); + KASSERT(*plen >= 0, ("%s, negative len %d", __FUNCTION__, *plen)); + KASSERT(soff < m->m_len, ("%s rack:%p len:%u m:%p m->m_len:%u < off?", + __FUNCTION__, + rack, *plen, m, m->m_len)); + /* Save off the right location before we copy and advance */ + *s_soff = soff; + *s_mb = rack->r_ctl.fsb.m; + n = rack_fo_base_copym(m, soff, plen, + &rack->r_ctl.fsb, + seglimit, segsize); + return (n); +} + +static int +rack_fast_rsm_output(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, + uint64_t ts_val, uint32_t cts, uint32_t ms_cts, struct timeval *tv, int len) +{ + /* + * Enter the fast retransmit path. We are given that a sched_pin is + * in place (if accounting is compliled in) and the cycle count taken + * at the entry is in the ts_val. The concept her is that the rsm + * now holds the mbuf offsets and such so we can directly transmit + * without a lot of overhead, the len field is already set for + * us to prohibit us from sending too much (usually its 1MSS). + */ + struct ip *ip = NULL; + struct udphdr *udp = NULL; + struct tcphdr *th = NULL; + struct mbuf *m = NULL; + struct inpcb *inp; + uint8_t *cpto; + struct tcp_log_buffer *lgb; +#ifdef TCP_ACCOUNTING + uint64_t crtsc; + int cnt_thru = 1; +#endif + int doing_tlp = 0; + struct tcpopt to; + u_char opt[TCP_MAXOLEN]; + uint32_t hdrlen, optlen; + int32_t slot, segsiz, max_val, tso = 0, error, flags, ulen = 0; + uint32_t us_cts; + uint32_t if_hw_tsomaxsegcount = 0, startseq; + uint32_t if_hw_tsomaxsegsize; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + + if (rack->r_is_v6) { + ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else +#endif /* INET6 */ + { + ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; + hdrlen = sizeof(struct tcpiphdr); + } + if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { + goto failed; + } + if (rsm->r_flags & RACK_TLP) + doing_tlp = 1; + startseq = rsm->r_start; + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + inp = rack->rc_inp; + to.to_flags = 0; + flags = tcp_outflags[tp->t_state]; + if (flags & (TH_SYN|TH_RST)) { + goto failed; + } + if (rsm->r_flags & RACK_HAS_FIN) { + /* We can't send a FIN here */ + goto failed; + } + if (flags & TH_FIN) { + /* We never send a FIN */ + flags &= ~TH_FIN; + } + if (tp->t_flags & TF_RCVD_TSTMP) { + to.to_tsval = ms_cts + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags = TOF_TS; + } + optlen = tcp_addoptions(&to, opt); + hdrlen += optlen; + udp = rack->r_ctl.fsb.udp; + if (rack->r_ctl.rc_pace_max_segs) + max_val = rack->r_ctl.rc_pace_max_segs; + else if (rack->rc_user_set_max_segs) + max_val = rack->rc_user_set_max_segs * segsiz; + else + max_val = len; + if ((tp->t_flags & TF_TSO) && + V_tcp_do_tso && + (len > segsiz) && + (tp->t_port == 0)) + tso = 1; +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + goto failed; + m->m_data += max_linkhdr; + m->m_len = hdrlen; + th = rack->r_ctl.fsb.th; + /* Establish the len to send */ + if (len > max_val) + len = max_val; + if ((tso) && (len + optlen > tp->t_maxseg)) { + uint32_t if_hw_tsomax; + int32_t max_len; + + /* extract TSO information */ + if_hw_tsomax = tp->t_tsomax; + if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; + if_hw_tsomaxsegsize = 
tp->t_tsomaxsegsize; + /* + * Check if we should limit by maximum payload + * length: + */ + if (if_hw_tsomax != 0) { + /* compute maximum TSO length */ + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); + if (max_len <= 0) { + goto failed; + } else if (len > max_len) { + len = max_len; + } + } + if (len <= segsiz) { + /* + * In case there are too many small fragments don't + * use TSO: + */ + tso = 0; + } + } else { + tso = 0; + } + if ((tso == 0) && (len > segsiz)) + len = segsiz; + us_cts = tcp_get_usecs(tv); + if ((len == 0) || + (len <= MHLEN - hdrlen - max_linkhdr)) { + goto failed; + } + th->th_seq = htonl(rsm->r_start); + th->th_ack = htonl(tp->rcv_nxt); + if(rsm->r_flags & RACK_HAD_PUSH) + flags |= TH_PUSH; + th->th_flags = flags; + th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); + if (th->th_win == 0) { + tp->t_sndzerowin++; + tp->t_flags |= TF_RXWIN0SENT; + } else + tp->t_flags &= ~TF_RXWIN0SENT; + if (rsm->r_flags & RACK_TLP) { + /* + * TLP should not count in retran count, but + * in its own bin + */ + counter_u64_add(rack_tlp_retran, 1); + counter_u64_add(rack_tlp_retran_bytes, len); + } else { + tp->t_sndrexmitpack++; + KMOD_TCPSTAT_INC(tcps_sndrexmitpack); + KMOD_TCPSTAT_ADD(tcps_sndrexmitbyte, len); + } +#ifdef STATS + stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_RETXPB, + len); +#endif + if (rsm->m == NULL) + goto failed; + if (rsm->orig_m_len != rsm->m->m_len) { + /* Fix up the orig_m_len and possibly the mbuf offset */ + rack_adjust_orig_mlen(rsm); + } + m->m_next = rack_fo_base_copym(rsm->m, rsm->soff, &len, NULL, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize); + if (len <= segsiz) { + /* + * Must have ran out of mbufs for the copy + * shorten it to no longer need tso. Lets + * not put on sendalot since we are low on + * mbufs. 
+ */ + tso = 0; + } + if ((m->m_next == NULL) || (len <= 0)){ + goto failed; + } + if (udp) { + if (rack->r_is_v6) + ulen = hdrlen + len - sizeof(struct ip6_hdr); + else + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (rack->r_is_v6) { + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, + 0); + } + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + { + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); + } + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } +#endif + if (tso) { + KASSERT(len > tp->t_maxseg - optlen, + ("%s: len <= tso_segsz tp:%p", __func__, tp)); + m->m_pkthdr.csum_flags |= CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + } +#ifdef INET6 + if (rack->r_is_v6) { + ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_ttl = rack->r_ctl.fsb.hoplimit; + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + if (tp->t_port == 0 || len < V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + } + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + } +#endif + /* Time to copy in our header */ + cpto = mtod(m, uint8_t *); + memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); + th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + } else { + th->th_off = sizeof(struct tcphdr) >> 2; + } + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + if (rack->rack_no_prr) + log.u_bbr.flex1 = 0; + else + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = max_val; + log.u_bbr.flex5 = 0; + /* Save off the early/late values */ + log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; + log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; + log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.flex8 = 1; + 
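+ /*
+ * The flex7/flex8 values set here and just below make it easy to
+ * tell the two fast paths apart in TCP_LOG_OUT records: this
+ * retransmit fast path logs flex8 = 1 and flex7 = 55, while
+ * rack_fast_output() (new data) logs flex8 = 0 and flex7 = 44.
+ */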
log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); + log.u_bbr.flex7 = 55; + log.u_bbr.pkts_out = tp->t_maxseg; + log.u_bbr.timeStamp = cts; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; + log.u_bbr.delivered = 0; + lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, + len, &log, false, NULL, NULL, 0, tv); + } else + lgb = NULL; +#ifdef INET6 + if (rack->r_is_v6) { + error = ip6_output(m, NULL, + &inp->inp_route6, + 0, NULL, NULL, inp); + } +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + error = ip_output(m, NULL, + &inp->inp_route, + 0, 0, inp); + } +#endif + m = NULL; + if (lgb) { + lgb->tlb_errno = error; + lgb = NULL; + } + if (error) { + goto failed; + } + rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), + rsm, RACK_SENT_FP, rsm->m, rsm->soff); + if (doing_tlp && (rack->fast_rsm_hack == 0)) { + rack->rc_tlp_in_progress = 1; + rack->r_ctl.rc_tlp_cnt_out++; + } + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + rack->forced_ack = 0; /* If we send something zap the FA flag */ + if (IN_FASTRECOVERY(tp->t_flags) && rsm) + rack->r_ctl.retran_during_recovery += len; + { + int idx; + + idx = (len / segsiz) + 3; + if (idx >= TCP_MSS_ACCT_ATIMER) + counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); + else + counter_u64_add(rack_out_size[idx], 1); + } + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + KMOD_TCPSTAT_INC(tcps_segstimed); + } + counter_u64_add(rack_fto_rsm_send, 1); + if (error && (error == ENOBUFS)) { + slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + if (rack->rc_enobuf < 0x7f) + rack->rc_enobuf++; + if (slot < (10 * HPTS_USEC_IN_MSEC)) + slot = 10 * HPTS_USEC_IN_MSEC; + } else + slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); + if ((slot == 0) || + (rack->rc_always_pace == 0) || + (rack->r_rr_config == 1)) { + /* + * We have no pacing set or we + * are using old-style rack or + * we are overriden to use the old 1ms pacing. + */ + slot = rack->r_ctl.rc_min_to; + } + rack_start_hpts_timer(rack, tp, cts, slot, len, 0); + if (rack->r_must_retran) { + rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); + if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { /* - * This one has been acked use the arrival ack time + * We have retransmitted all we need. 
*/ - tp->gput_ts = my_rsm->r_ack_arrival; - rack->app_limited_needs_set = 0; + rack->r_must_retran = 0; + rack->r_ctl.rc_out_at_rto = 0; } - rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; - tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); - rack_log_pacing_delay_calc(rack, - tp->gput_seq, - tp->gput_ack, - (uint64_t)my_rsm, - tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, - 9, - __LINE__, NULL); - return; } +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((len + segsiz - 1) / segsiz); + } + counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((len + segsiz - 1) / segsiz)); + sched_unpin(); +#endif + return (0); +failed: + if (m) + m_free(m); + return (-1); +} -use_latest: +static void +rack_sndbuf_autoscale(struct tcp_rack *rack) +{ /* - * We don't know how long we may have been - * idle or if this is the first-send. Lets - * setup the flag so we will trim off - * the first ack'd data so we get a true - * measurement. + * Automatic sizing of send socket buffer. Often the send buffer + * size is not optimally adjusted to the actual network conditions + * at hand (delay bandwidth product). Setting the buffer size too + * small limits throughput on links with high bandwidth and high + * delay (eg. trans-continental/oceanic links). Setting the + * buffer size too big consumes too much real kernel memory, + * especially with many connections on busy servers. + * + * The criteria to step up the send buffer one notch are: + * 1. receive window of remote host is larger than send buffer + * (with a fudge factor of 5/4th); + * 2. send buffer is filled to 7/8th with data (so we actually + * have data to make use of it); + * 3. send buffer fill has not hit maximal automatic size; + * 4. our send window (slow start and cogestion controlled) is + * larger than sent but unacknowledged data in send buffer. + * + * Note that the rack version moves things much faster since + * we want to avoid hitting cache lines in the rack_fast_output() + * path so this is called much less often and thus moves + * the SB forward by a percentage. 
*/ - rack->app_limited_needs_set = 1; - tp->gput_ack = startseq + rack_get_measure_window(tp, rack); - /* Find this guy so we can pull the send time */ - fe.r_start = startseq; - my_rsm = RB_FIND(rack_rb_tree_head, &rack->r_ctl.rc_mtree, &fe); - if (my_rsm) { - rack->r_ctl.rc_gp_output_ts = my_rsm->usec_orig_send; - if (my_rsm->r_flags & RACK_ACKED) { + struct socket *so; + struct tcpcb *tp; + uint32_t sendwin, scaleup; + + tp = rack->rc_tp; + so = rack->rc_inp->inp_socket; + sendwin = min(rack->r_ctl.cwnd_to_use, tp->snd_wnd); + if (V_tcp_do_autosndbuf && so->so_snd.sb_flags & SB_AUTOSIZE) { + if ((tp->snd_wnd / 4 * 5) >= so->so_snd.sb_hiwat && + sbused(&so->so_snd) >= + (so->so_snd.sb_hiwat / 8 * 7) && + sbused(&so->so_snd) < V_tcp_autosndbuf_max && + sendwin >= (sbused(&so->so_snd) - + (tp->snd_nxt - tp->snd_una))) { + if (rack_autosndbuf_inc) + scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; + else + scaleup = V_tcp_autosndbuf_inc; + if (scaleup < V_tcp_autosndbuf_inc) + scaleup = V_tcp_autosndbuf_inc; + scaleup += so->so_snd.sb_hiwat; + if (scaleup > V_tcp_autosndbuf_max) + scaleup = V_tcp_autosndbuf_max; + if (!sbreserve_locked(&so->so_snd, scaleup, so, curthread)) + so->so_snd.sb_flags &= ~SB_AUTOSIZE; + } + } +} + +static int +rack_fast_output(struct tcpcb *tp, struct tcp_rack *rack, uint64_t ts_val, + uint32_t cts, uint32_t ms_cts, struct timeval *tv, long tot_len, int *send_err) +{ + /* + * Enter to do fast output. We are given that the sched_pin is + * in place (if accounting is compiled in) and the cycle count taken + * at entry is in place in ts_val. The idea here is that + * we know how many more bytes needs to be sent (presumably either + * during pacing or to fill the cwnd and that was greater than + * the max-burst). We have how much to send and all the info we + * need to just send. 
+ */ + struct ip *ip = NULL; + struct udphdr *udp = NULL; + struct tcphdr *th = NULL; + struct mbuf *m, *s_mb; + struct inpcb *inp; + uint8_t *cpto; + struct tcp_log_buffer *lgb; +#ifdef TCP_ACCOUNTING + uint64_t crtsc; +#endif + struct tcpopt to; + u_char opt[TCP_MAXOLEN]; + uint32_t hdrlen, optlen; + int cnt_thru = 1; + int32_t slot, segsiz, len, max_val, tso = 0, sb_offset, error, flags, ulen = 0; + uint32_t us_cts, s_soff; + uint32_t if_hw_tsomaxsegcount = 0, startseq; + uint32_t if_hw_tsomaxsegsize; + uint16_t add_flag = RACK_SENT_FP; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; + + if (rack->r_is_v6) { + ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; + hdrlen = sizeof(struct ip6_hdr) + sizeof(struct tcphdr); + } else +#endif /* INET6 */ + { + ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; + hdrlen = sizeof(struct tcpiphdr); + } + if (tp->t_port && (V_tcp_udp_tunneling_port == 0)) { + m = NULL; + goto failed; + } + startseq = tp->snd_max; + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + inp = rack->rc_inp; + len = rack->r_ctl.fsb.left_to_send; + to.to_flags = 0; + flags = rack->r_ctl.fsb.tcp_flags; + if (tp->t_flags & TF_RCVD_TSTMP) { + to.to_tsval = ms_cts + tp->ts_offset; + to.to_tsecr = tp->ts_recent; + to.to_flags = TOF_TS; + } + optlen = tcp_addoptions(&to, opt); + hdrlen += optlen; + udp = rack->r_ctl.fsb.udp; + if (rack->r_ctl.rc_pace_max_segs) + max_val = rack->r_ctl.rc_pace_max_segs; + else if (rack->rc_user_set_max_segs) + max_val = rack->rc_user_set_max_segs * segsiz; + else + max_val = len; + if ((tp->t_flags & TF_TSO) && + V_tcp_do_tso && + (len > segsiz) && + (tp->t_port == 0)) + tso = 1; +again: +#ifdef INET6 + if (MHLEN < hdrlen + max_linkhdr) + m = m_getcl(M_NOWAIT, MT_DATA, M_PKTHDR); + else +#endif + m = m_gethdr(M_NOWAIT, MT_DATA); + if (m == NULL) + goto failed; + m->m_data += max_linkhdr; + m->m_len = hdrlen; + th = rack->r_ctl.fsb.th; + /* Establish the len to send */ + if (len > max_val) + len = max_val; + if ((tso) && (len + optlen > tp->t_maxseg)) { + uint32_t if_hw_tsomax; + int32_t max_len; + + /* extract TSO information */ + if_hw_tsomax = tp->t_tsomax; + if_hw_tsomaxsegcount = tp->t_tsomaxsegcount; + if_hw_tsomaxsegsize = tp->t_tsomaxsegsize; + /* + * Check if we should limit by maximum payload + * length: + */ + if (if_hw_tsomax != 0) { + /* compute maximum TSO length */ + max_len = (if_hw_tsomax - hdrlen - + max_linkhdr); + if (max_len <= 0) { + goto failed; + } else if (len > max_len) { + len = max_len; + } + } + if (len <= segsiz) { /* - * Unlikely since its probably what was - * just transmitted (but I am paranoid). 
+ * In case there are too many small fragments don't + * use TSO: */ - tp->gput_ts = my_rsm->r_ack_arrival; - rack->app_limited_needs_set = 0; - } - if (SEQ_LT(my_rsm->r_start, tp->gput_seq)) { - /* This also is unlikely */ - tp->gput_seq = my_rsm->r_start; + tso = 0; } } else { + tso = 0; + } + if ((tso == 0) && (len > segsiz)) + len = segsiz; + us_cts = tcp_get_usecs(tv); + if ((len == 0) || + (len <= MHLEN - hdrlen - max_linkhdr)) { + goto failed; + } + sb_offset = tp->snd_max - tp->snd_una; + th->th_seq = htonl(tp->snd_max); + th->th_ack = htonl(tp->rcv_nxt); + th->th_flags = flags; + th->th_win = htons((u_short)(rack->r_ctl.fsb.recwin >> tp->rcv_scale)); + if (th->th_win == 0) { + tp->t_sndzerowin++; + tp->t_flags |= TF_RXWIN0SENT; + } else + tp->t_flags &= ~TF_RXWIN0SENT; + tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ + KMOD_TCPSTAT_INC(tcps_sndpack); + KMOD_TCPSTAT_ADD(tcps_sndbyte, len); +#ifdef STATS + stats_voi_update_abs_u64(tp->t_stats, VOI_TCP_TXPB, + len); +#endif + if (rack->r_ctl.fsb.m == NULL) + goto failed; + + /* s_mb and s_soff are saved for rack_log_output */ + m->m_next = rack_fo_m_copym(rack, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, &s_mb, &s_soff); + if (len <= segsiz) { /* - * TSNH unless we have some send-map limit, - * and even at that it should not be hitting - * that limit (we should have stopped sending). + * Must have ran out of mbufs for the copy + * shorten it to no longer need tso. Lets + * not put on sendalot since we are low on + * mbufs. + */ + tso = 0; + } + if (rack->r_ctl.fsb.rfo_apply_push && + (len == rack->r_ctl.fsb.left_to_send)) { + th->th_flags |= TH_PUSH; + add_flag |= RACK_HAD_PUSH; + } + if ((m->m_next == NULL) || (len <= 0)){ + goto failed; + } + if (udp) { + if (rack->r_is_v6) + ulen = hdrlen + len - sizeof(struct ip6_hdr); + else + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + } + m->m_pkthdr.rcvif = (struct ifnet *)0; + if (tp->t_state == TCPS_ESTABLISHED && + (tp->t_flags2 & TF2_ECN_PERMIT)) { + /* + * If the peer has ECN, mark data packets with ECN capable + * transmission (ECT). Ignore pure ack packets, + * retransmissions. */ - rack->r_ctl.rc_gp_output_ts = tcp_get_usecs(NULL); + if (len > 0 && SEQ_GEQ(tp->snd_nxt, tp->snd_max)) { +#ifdef INET6 + if (rack->r_is_v6) + ip6->ip6_flow |= htonl(IPTOS_ECN_ECT0 << 20); + else +#endif + ip->ip_tos |= IPTOS_ECN_ECT0; + KMOD_TCPSTAT_INC(tcps_ecn_ect0); + /* + * Reply with proper ECN notifications. + * Only set CWR on new data segments. 
+ */ + if (tp->t_flags2 & TF2_ECN_SND_CWR) { + flags |= TH_CWR; + tp->t_flags2 &= ~TF2_ECN_SND_CWR; + } + } + if (tp->t_flags2 & TF2_ECN_SND_ECE) + flags |= TH_ECE; + } + m->m_pkthdr.len = hdrlen + len; /* in6_cksum() need this */ +#ifdef INET6 + if (rack->r_is_v6) { + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in6_cksum_pseudo(ip6, + sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, + 0); + } + } +#endif +#if defined(INET6) && defined(INET) + else +#endif +#ifdef INET + { + if (tp->t_port) { + m->m_pkthdr.csum_flags = CSUM_UDP; + m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); + udp->uh_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); + th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); + } else { + m->m_pkthdr.csum_flags = CSUM_TCP; + m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + + IPPROTO_TCP + len + optlen)); + } + /* IP version must be set here for ipv4/ipv6 checking later */ + KASSERT(ip->ip_v == IPVERSION, + ("%s: IP version incorrect: %d", __func__, ip->ip_v)); + } +#endif + if (tso) { + KASSERT(len > tp->t_maxseg - optlen, + ("%s: len <= tso_segsz tp:%p", __func__, tp)); + m->m_pkthdr.csum_flags |= CSUM_TSO; + m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; + } +#ifdef INET6 + if (rack->r_is_v6) { + ip6->ip6_hlim = rack->r_ctl.fsb.hoplimit; + ip6->ip6_plen = htons(m->m_pkthdr.len - sizeof(*ip6)); + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + else + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + ip->ip_len = htons(m->m_pkthdr.len); + ip->ip_ttl = rack->r_ctl.fsb.hoplimit; + if (V_path_mtu_discovery && tp->t_maxseg > V_tcp_minmss) { + tp->t_flags2 |= TF2_PLPMTU_PMTUD; + if (tp->t_port == 0 || len < V_tcp_minmss) { + ip->ip_off |= htons(IP_DF); + } + } else { + tp->t_flags2 &= ~TF2_PLPMTU_PMTUD; + } + } +#endif + /* Time to copy in our header */ + cpto = mtod(m, uint8_t *); + memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); + th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + } else { + th->th_off = sizeof(struct tcphdr) >> 2; + } + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; + log.u_bbr.ininput = rack->rc_inp->inp_in_input; + if (rack->rack_no_prr) + log.u_bbr.flex1 = 0; + else + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = max_val; + log.u_bbr.flex5 = 0; + /* Save off the early/late values */ + log.u_bbr.flex6 = rack->r_ctl.rc_agg_early; + log.u_bbr.applimited = rack->r_ctl.rc_agg_delayed; + log.u_bbr.bw_inuse = rack_get_bw(rack); + log.u_bbr.flex8 = 0; + log.u_bbr.pacing_gain = rack_get_output_gain(rack, NULL); + log.u_bbr.flex7 = 44; + log.u_bbr.pkts_out = tp->t_maxseg; + 
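+ /*
+ * Sizing note for the header/TSO setup above (example numbers
+ * only): when only the timestamp option is in play the options
+ * take TCPOLEN_TSTAMP_APPA (12) bytes, so on a typical
+ * 1500-byte MTU IPv4 path with t_maxseg = 1460 the hardware is
+ * asked for 1448-byte TSO segments (tso_segsz = t_maxseg - optlen)
+ * and th_off becomes (20 + 12) >> 2 = 8 32-bit words.
+ */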
log.u_bbr.timeStamp = cts; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.lt_epoch = rack->r_ctl.cwnd_to_use; + log.u_bbr.delivered = 0; + lgb = tcp_log_event_(tp, th, NULL, NULL, TCP_LOG_OUT, ERRNO_UNK, + len, &log, false, NULL, NULL, 0, tv); + } else + lgb = NULL; +#ifdef INET6 + if (rack->r_is_v6) { + error = ip6_output(m, NULL, + &inp->inp_route6, + 0, NULL, NULL, inp); + } +#endif +#if defined(INET) && defined(INET6) + else +#endif +#ifdef INET + { + error = ip_output(m, NULL, + &inp->inp_route, + 0, 0, inp); + } +#endif + if (lgb) { + lgb->tlb_errno = error; + lgb = NULL; + } + if (error) { + *send_err = error; + m = NULL; + goto failed; + } + rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), + NULL, add_flag, s_mb, s_soff); + m = NULL; + if (tp->snd_una == tp->snd_max) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; + rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); + tp->t_acktime = ticks; } - rack_log_pacing_delay_calc(rack, - tp->gput_seq, - tp->gput_ack, - (uint64_t)my_rsm, - tp->gput_ts, - rack->r_ctl.rc_app_limited_cnt, - 9, __LINE__, NULL); -} - -static inline uint32_t -rack_what_can_we_send(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cwnd_to_use, - uint32_t avail, int32_t sb_offset) -{ - uint32_t len; - uint32_t sendwin; + rack->forced_ack = 0; /* If we send something zap the FA flag */ + tot_len += len; + if ((tp->t_flags & TF_GPUTINPROG) == 0) + rack_start_gp_measurement(tp, rack, tp->snd_max, sb_offset); + tp->snd_max += len; + tp->snd_nxt = tp->snd_max; + { + int idx; - if (tp->snd_wnd > cwnd_to_use) - sendwin = cwnd_to_use; + idx = (len / segsiz) + 3; + if (idx >= TCP_MSS_ACCT_ATIMER) + counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); + else + counter_u64_add(rack_out_size[idx], 1); + } + if (len <= rack->r_ctl.fsb.left_to_send) + rack->r_ctl.fsb.left_to_send -= len; else - sendwin = tp->snd_wnd; - if (ctf_outstanding(tp) >= tp->snd_wnd) { - /* We never want to go over our peers rcv-window */ - len = 0; - } else { - uint32_t flight; - - flight = ctf_flight_size(tp, rack->r_ctl.rc_sacked); - if (flight >= sendwin) { - /* - * We have in flight what we are allowed by cwnd (if - * it was rwnd blocking it would have hit above out - * >= tp->snd_wnd). - */ - return (0); - } - len = sendwin - flight; - if ((len + ctf_outstanding(tp)) > tp->snd_wnd) { - /* We would send too much (beyond the rwnd) */ - len = tp->snd_wnd - ctf_outstanding(tp); - } - if ((len + sb_offset) > avail) { - /* - * We don't have that much in the SB, how much is - * there? 
- */ - len = avail - sb_offset; - } + rack->r_ctl.fsb.left_to_send = 0; + if (rack->r_ctl.fsb.left_to_send < segsiz) { + rack->r_fast_output = 0; + rack->r_ctl.fsb.left_to_send = 0; + /* At the end of fast_output scale up the sb */ + SOCKBUF_LOCK(&rack->rc_inp->inp_socket->so_snd); + rack_sndbuf_autoscale(rack); + SOCKBUF_UNLOCK(&rack->rc_inp->inp_socket->so_snd); + } + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + KMOD_TCPSTAT_INC(tcps_segstimed); + } + if ((rack->r_ctl.fsb.left_to_send >= segsiz) && + (max_val > len) && + (tso == 0)) { + max_val -= len; + len = segsiz; + th = rack->r_ctl.fsb.th; + cnt_thru++; + goto again; } - return (len); + tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); + counter_u64_add(rack_fto_send, 1); + slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); + rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_DATA] += cnt_thru; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], cnt_thru); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len + segsiz - 1) / segsiz); + } + counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len + segsiz - 1) / segsiz)); + sched_unpin(); +#endif + return (0); +failed: + if (m) + m_free(m); + rack->r_fast_output = 0; + return (-1); } static int @@ -11876,15 +15892,17 @@ { struct socket *so; uint32_t recwin; - uint32_t sb_offset; + uint32_t sb_offset, s_moff = 0; int32_t len, flags, error = 0; - struct mbuf *m; + struct mbuf *m, *s_mb = NULL; struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; int32_t segsiz, minseg; long tot_len_this_send = 0; +#ifdef INET struct ip *ip = NULL; +#endif #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif @@ -11910,17 +15928,22 @@ struct tcpopt to; int32_t slot = 0; int32_t sup_rack = 0; - uint32_t cts, us_cts, delayed, early; - uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; - uint32_t cwnd_to_use; - int32_t do_a_prefetch; + uint32_t cts, ms_cts, delayed, early; + uint16_t add_flag = RACK_SENT_SP; + uint8_t hpts_calling, doing_tlp = 0; + uint32_t cwnd_to_use, pace_max_seg; + int32_t do_a_prefetch = 0; int32_t prefetch_rsm = 0; - int32_t orig_len; + int32_t orig_len = 0; struct timeval tv; int32_t prefetch_so_done = 0; - struct tcp_log_buffer *lgb = NULL; + struct tcp_log_buffer *lgb; struct inpcb *inp; struct sockbuf *sb; + uint64_t ts_val = 0; +#ifdef TCP_ACCOUNTING + uint64_t crtsc; +#endif #ifdef INET6 struct ip6_hdr *ip6 = NULL; int32_t isipv6; @@ -11930,19 +15953,20 @@ /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; - inp = rack->rc_inp; - so = inp->inp_socket; - sb = &so->so_snd; - kern_prefetch(sb, &do_a_prefetch); - do_a_prefetch = 1; - hpts_calling = inp->inp_hpts_calls; - hw_tls = (so->so_snd.sb_flags & SB_TLS_IFNET) != 0; - +#ifdef TCP_ACCOUNTING + sched_pin(); + ts_val = get_cyclecount(); +#endif + hpts_calling = rack->rc_inp->inp_hpts_calls; NET_EPOCH_ASSERT(); - INP_WLOCK_ASSERT(inp); + INP_WLOCK_ASSERT(rack->rc_inp); #ifdef TCP_OFFLOAD - if (tp->t_flags & TF_TOE) + if (tp->t_flags & TF_TOE) { +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif return (tcp_offload_output(tp)); + } #endif /* * For TFO connections in SYN_RECEIVED, only allow the 
initial @@ -11951,21 +15975,25 @@ if (IS_FASTOPEN(tp->t_flags) && (tp->t_state == TCPS_SYN_RECEIVED) && SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ - (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ + (rack->r_ctl.rc_resend == NULL)) { /* not a retransmit */ +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif return (0); + } #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ isipv6 = rack->r_is_v6; } else { - isipv6 = (inp->inp_vflag & INP_IPV6) != 0; + isipv6 = (rack->rc_inp->inp_vflag & INP_IPV6) != 0; } #endif early = 0; - us_cts = tcp_get_usecs(&tv); - cts = tcp_tv_to_mssectick(&tv); + cts = tcp_get_usecs(&tv); + ms_cts = tcp_tv_to_mssectick(&tv); if (((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) == 0) && - inp->inp_in_hpts) { + rack->rc_inp->inp_in_hpts) { /* * We are on the hpts for some timer but not hptsi output. * Remove from the hpts unconditionally. @@ -11974,37 +16002,64 @@ } /* Are we pacing and late? */ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && - TSTMP_GEQ(us_cts, rack->r_ctl.rc_last_output_to)) { + TSTMP_GEQ(cts, rack->r_ctl.rc_last_output_to)) { /* We are delayed */ - delayed = us_cts - rack->r_ctl.rc_last_output_to; + delayed = cts - rack->r_ctl.rc_last_output_to; } else { delayed = 0; } - /* Do the timers, which may override the pacer */ + /* Do the timers, which may override the pacer */ if (rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) { if (rack_process_timers(tp, rack, cts, hpts_calling)) { counter_u64_add(rack_out_size[TCP_MSS_ACCT_ATIMER], 1); +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif return (0); } } + if (rack->rc_in_persist) { + if (rack->rc_inp->inp_in_hpts == 0) { + /* Timer is not running */ + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); + } +#ifdef TCP_ACCOUNTING + sched_unpin(); +#endif + return (0); + } if ((rack->r_timer_override) || + (rack->rc_ack_can_sendout_data) || (delayed) || (tp->t_state < TCPS_ESTABLISHED)) { - if (tp->t_inpcb->inp_in_hpts) - tcp_hpts_remove(tp->t_inpcb, HPTS_REMOVE_OUTPUT); - } else if (tp->t_inpcb->inp_in_hpts) { + rack->rc_ack_can_sendout_data = 0; + if (rack->rc_inp->inp_in_hpts) + tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); + } else if (rack->rc_inp->inp_in_hpts) { /* * On the hpts you can't pass even if ACKNOW is on, we will * when the hpts fires. 
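The "are we pacing and late?" bookkeeping above reduces to two one-sided subtractions against the scheduled output time, with the comparison done wrap-safely. A small sketch in plain C (the real code uses the TSTMP_GEQ/TSTMP_GT macros on microsecond timestamps; TS_GEQ below is an illustrative stand-in):

#include <stdint.h>

/* Wrap-safe "a >= b" for 32-bit timestamps, in the spirit of TSTMP_GEQ. */
#define TS_GEQ(a, b)    ((int32_t)((a) - (b)) >= 0)

static void
pacing_drift(uint32_t now, uint32_t scheduled, uint32_t *early, uint32_t *late)
{
        *early = *late = 0;
        if (TS_GEQ(now, scheduled))
                *late = now - scheduled;        /* called after our slot */
        else
                *early = scheduled - now;       /* timer fired ahead of the slot */
}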
*/ +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_BLOCKED] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_BLOCKED], (crtsc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_BLOCKED]++; + } + counter_u64_add(tcp_cnt_counters[SND_BLOCKED], 1); + sched_unpin(); +#endif counter_u64_add(rack_out_size[TCP_MSS_ACCT_INPACE], 1); return (0); } - inp->inp_hpts_calls = 0; + rack->rc_inp->inp_hpts_calls = 0; /* Finish out both pacing early and late accounting */ if ((rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT) && - TSTMP_GT(rack->r_ctl.rc_last_output_to, us_cts)) { - early = rack->r_ctl.rc_last_output_to - us_cts; + TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) { + early = rack->r_ctl.rc_last_output_to - cts; } else early = 0; if (delayed) { @@ -12018,6 +16073,26 @@ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; rack->r_wanted_output = 0; rack->r_timer_override = 0; + if ((tp->t_state != rack->r_state) && + TCPS_HAVEESTABLISHED(tp->t_state)) { + rack_set_state(tp, rack); + } + if ((rack->r_fast_output) && + (tp->rcv_numsacks == 0)) { + int ret; + + error = 0; + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); + if (ret >= 0) + return(ret); + else if (error) { + inp = rack->rc_inp; + so = inp->inp_socket; + sb = &so->so_snd; + goto nomore; + } + } + inp = rack->rc_inp; /* * For TFO connections in SYN_SENT or SYN_RECEIVED, * only allow the initial SYN or SYN|ACK and those sent @@ -12029,6 +16104,8 @@ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ (tp->t_rxtshift == 0)) { /* not a retransmit */ cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; + so = inp->inp_socket; + sb = &so->so_snd; goto just_return_nolock; } /* @@ -12051,21 +16128,23 @@ } if ((tp->snd_una == tp->snd_max) && rack->r_ctl.rc_went_idle_time && - TSTMP_GT(us_cts, rack->r_ctl.rc_went_idle_time)) { - idle = us_cts - rack->r_ctl.rc_went_idle_time; + TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { + idle = cts - rack->r_ctl.rc_went_idle_time; if (idle > rack_min_probertt_hold) { /* Count as a probe rtt */ if (rack->in_probe_rtt == 0) { - rack->r_ctl.rc_lower_rtt_us_cts = us_cts; + rack->r_ctl.rc_lower_rtt_us_cts = cts; rack->r_ctl.rc_time_probertt_entered = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_probertt_starts = rack->r_ctl.rc_lower_rtt_us_cts; rack->r_ctl.rc_time_of_last_probertt = rack->r_ctl.rc_lower_rtt_us_cts; } else { - rack_exit_probertt(rack, us_cts); + rack_exit_probertt(rack, cts); } } idle = 0; } + if (rack_use_fsb && (rack->r_fsb_inited == 0)) + rack_init_fsb_block(tp, rack); again: /* * If we've recently taken a timeout, snd_max will be greater than @@ -12073,45 +16152,18 @@ * resending already delivered data. Adjust snd_nxt accordingly. 
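The TCP_ACCOUNTING blocks that bracket every early return above follow one pattern: snapshot the cycle counter on entry (while pinned to a CPU) and, on each exit path, charge the delta plus an event count to a per-connection bucket and a global counter. A reduced, self-contained sketch of that pattern with the cycle snapshots passed in by the caller (the kernel takes them with get_cyclecount() and uses counter(9) for the global side):

#include <stdint.h>

enum { SND_BLOCKED, SND_OUT_DATA, SND_LIMITED, ACCT_MAX };

struct acct {
        uint64_t cycles[ACCT_MAX];      /* time spent per exit reason */
        uint64_t counts[ACCT_MAX];      /* number of exits per reason */
};

static void
charge_exit(struct acct *a, int bucket, uint64_t entry_cycles,
    uint64_t exit_cycles)
{
        a->cycles[bucket] += exit_cycles - entry_cycles;
        a->counts[bucket]++;
}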
*/ sendalot = 0; - us_cts = tcp_get_usecs(&tv); - cts = tcp_tv_to_mssectick(&tv); + cts = tcp_get_usecs(&tv); + ms_cts = tcp_tv_to_mssectick(&tv); tso = 0; mtu = 0; segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); minseg = segsiz; + if (rack->r_ctl.rc_pace_max_segs == 0) + pace_max_seg = rack->rc_user_set_max_segs * segsiz; + else + pace_max_seg = rack->r_ctl.rc_pace_max_segs; sb_offset = tp->snd_max - tp->snd_una; cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; -#ifdef NETFLIX_SHARED_CWND - if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && - rack->rack_enable_scwnd) { - /* We are doing cwnd sharing */ - if (rack->rc_gp_filled && - (rack->rack_attempted_scwnd == 0) && - (rack->r_ctl.rc_scw == NULL) && - tp->t_lib) { - /* The pcbid is in, lets make an attempt */ - counter_u64_add(rack_try_scwnd, 1); - rack->rack_attempted_scwnd = 1; - rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, - &rack->r_ctl.rc_scw_index, - segsiz); - } - if (rack->r_ctl.rc_scw && - (rack->rack_scwnd_is_idle == 1) && - (rack->rc_in_persist == 0) && - sbavail(sb)) { - /* we are no longer out of data */ - tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); - rack->rack_scwnd_is_idle = 0; - } - if (rack->r_ctl.rc_scw) { - /* First lets update and get the cwnd */ - rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, - rack->r_ctl.rc_scw_index, - tp->snd_cwnd, tp->snd_wnd, segsiz); - } - } -#endif flags = tcp_outflags[tp->t_state]; while (rack->rc_free_cnt < rack_free_cache) { rsm = rack_alloc(rack); @@ -12119,6 +16171,8 @@ if (inp->inp_hpts_calls) /* Retry in a ms */ slot = (1 * HPTS_USEC_IN_MSEC); + so = inp->inp_socket; + sb = &so->so_snd; goto just_return_nolock; } TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_tnext); @@ -12131,7 +16185,9 @@ len = 0; rsm = NULL; if (flags & TH_RST) { - SOCKBUF_LOCK(sb); + SOCKBUF_LOCK(&inp->inp_socket->so_snd); + so = inp->inp_socket; + sb = &so->so_snd; goto send; } if (rack->r_ctl.rc_resend) { @@ -12149,25 +16205,16 @@ sb_offset = rsm->r_start - tp->snd_una; if (len >= segsiz) len = segsiz; - } else if ((rack->rc_in_persist == 0) && - ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { + } else if ((rsm = tcp_rack_output(tp, rack, cts)) != NULL) { /* We have a retransmit that takes precedence */ rsm->r_flags &= ~RACK_TLP; - if ((!IN_RECOVERY(tp->t_flags)) && - ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { + if ((!IN_FASTRECOVERY(tp->t_flags)) && + ((tp->t_flags & TF_WASFRECOVERY) == 0)) { /* Enter recovery if not induced by a time-out */ rack->r_ctl.rc_rsm_start = rsm->r_start; rack->r_ctl.rc_cwnd_at = tp->snd_cwnd; rack->r_ctl.rc_ssthresh_at = tp->snd_ssthresh; - rack_cong_signal(tp, NULL, CC_NDUPACK); - /* - * When we enter recovery we need to assure we send - * one packet. - */ - if (rack->rack_no_prr == 0) { - rack->r_ctl.rc_prr_sndcnt = segsiz; - rack_log_to_prr(rack, 13, 0); - } + rack_cong_signal(tp, CC_NDUPACK, tp->snd_una); } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { @@ -12181,37 +16228,17 @@ __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - /* Can we send it within the PRR boundary? */ - if (rack->rack_no_prr == 0) { - if ((rack->use_rack_rr == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { - /* It does not fit */ - if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && - (rack->r_ctl.rc_prr_sndcnt < segsiz)) { - /* - * prr is less than a segment, we - * have more acks due in besides - * what we need to resend. 
Lets not send - * to avoid sending small pieces of - * what we need to retransmit. - */ - len = 0; - goto just_return_nolock; - } - len = rack->r_ctl.rc_prr_sndcnt; - } - } sendalot = 0; if (len >= segsiz) len = segsiz; if (len > 0) { - sub_from_prr = 1; sack_rxmit = 1; KMOD_TCPSTAT_INC(tcps_sack_rexmits); KMOD_TCPSTAT_ADD(tcps_sack_rexmit_bytes, min(len, segsiz)); counter_u64_add(rack_rtm_prr_retran, 1); } - } else if (rack->r_ctl.rc_tlpsend) { + } else if (rack->r_ctl.rc_tlpsend) { /* Tail loss probe */ long cwin; long tlen; @@ -12225,11 +16252,14 @@ */ rsm = rack->r_ctl.rc_tlpsend; rsm->r_flags |= RACK_TLP; + rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; if (tlen > segsiz) tlen = segsiz; + tp->t_sndtlppack++; + tp->t_sndtlpbyte += tlen; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, @@ -12238,6 +16268,46 @@ cwin = min(tp->snd_wnd, tlen); len = cwin; } + if (rack->r_must_retran && + (rsm == NULL)) { + /* + * Non-Sack and we had a RTO or MTU change, we + * need to retransmit until we reach + * the former snd_max (rack->r_ctl.rc_snd_max_at_rto). + */ + if (SEQ_GT(tp->snd_max, tp->snd_una)) { + int sendwin, flight; + + sendwin = min(tp->snd_wnd, tp->snd_cwnd); + flight = ctf_flight_size(tp, rack->r_ctl.rc_out_at_rto); + if (flight >= sendwin) { + so = inp->inp_socket; + sb = &so->so_snd; + goto just_return_nolock; + } + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + KASSERT(rsm != NULL, ("rsm is NULL rack:%p r_must_retran set", rack)); + if (rsm == NULL) { + /* TSNH */ + rack->r_must_retran = 0; + rack->r_ctl.rc_out_at_rto = 0; + rack->r_must_retran = 0; + so = inp->inp_socket; + sb = &so->so_snd; + goto just_return_nolock; + } + sack_rxmit = 1; + len = rsm->r_end - rsm->r_start; + sendalot = 0; + sb_offset = rsm->r_start - tp->snd_una; + if (len >= segsiz) + len = segsiz; + } else { + /* We must be done if there is nothing outstanding */ + rack->r_must_retran = 0; + rack->r_ctl.rc_out_at_rto = 0; + } + } /* * Enforce a connection sendmap count limit if set * as long as we are not retransmiting. 
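The r_must_retran branch above gates each forced retransmission after an RTO or MTU change on the current flight versus the lesser of the two send windows before pulling the next block off the transmit map. A compact sketch of that admission check (window and flight are plain byte counts here; the real code derives them from ctf_flight_size() and the rsm queue):

#include <stdint.h>
#include <stdbool.h>

/* May we push one more forced retransmission after an RTO/MTU change? */
static bool
must_retran_can_send(uint32_t snd_wnd, uint32_t snd_cwnd,
    uint32_t bytes_in_flight)
{
        uint32_t sendwin = (snd_wnd < snd_cwnd) ? snd_wnd : snd_cwnd;

        /* If what is already outstanding fills the window, wait for ACKs. */
        return (bytes_in_flight < sendwin);
}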
@@ -12251,6 +16321,8 @@ rack->alloc_limit_reported = 1; counter_u64_add(rack_alloc_limited_conns, 1); } + so = inp->inp_socket; + sb = &so->so_snd; goto just_return_nolock; } if (rsm && (rsm->r_flags & RACK_HAS_FIN)) { @@ -12267,6 +16339,50 @@ #ifdef INVARIANTS /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; +#endif + if (rsm && rack->r_fsb_inited && rack_use_rsm_rfo && + ((rsm->r_flags & RACK_HAS_FIN) == 0)) { + int ret; + + ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len); + if (ret == 0) + return (0); + } + so = inp->inp_socket; + sb = &so->so_snd; + if (do_a_prefetch == 0) { + kern_prefetch(sb, &do_a_prefetch); + do_a_prefetch = 1; + } +#ifdef NETFLIX_SHARED_CWND + if ((tp->t_flags2 & TF2_TCP_SCWND_ALLOWED) && + rack->rack_enable_scwnd) { + /* We are doing cwnd sharing */ + if (rack->gp_ready && + (rack->rack_attempted_scwnd == 0) && + (rack->r_ctl.rc_scw == NULL) && + tp->t_lib) { + /* The pcbid is in, lets make an attempt */ + counter_u64_add(rack_try_scwnd, 1); + rack->rack_attempted_scwnd = 1; + rack->r_ctl.rc_scw = tcp_shared_cwnd_alloc(tp, + &rack->r_ctl.rc_scw_index, + segsiz); + } + if (rack->r_ctl.rc_scw && + (rack->rack_scwnd_is_idle == 1) && + sbavail(&so->so_snd)) { + /* we are no longer out of data */ + tcp_shared_cwnd_active(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); + rack->rack_scwnd_is_idle = 0; + } + if (rack->r_ctl.rc_scw) { + /* First lets update and get the cwnd */ + rack->r_ctl.cwnd_to_use = cwnd_to_use = tcp_shared_cwnd_update(rack->r_ctl.rc_scw, + rack->r_ctl.rc_scw_index, + tp->snd_cwnd, tp->snd_wnd, segsiz); + } + } #endif /* * Get standard flags, and add SYN or FIN if requested by 'hidden' @@ -12314,15 +16430,20 @@ if (rack->r_ctl.rc_tlp_new_data > (uint32_t) (avail - sb_offset)) { rack->r_ctl.rc_tlp_new_data = (uint32_t) (avail - sb_offset); } - if (rack->r_ctl.rc_tlp_new_data > tp->snd_wnd) - len = tp->snd_wnd; - else + if ((rack->r_ctl.rc_tlp_new_data + sb_offset) > tp->snd_wnd) { + if (tp->snd_wnd > sb_offset) + len = tp->snd_wnd - sb_offset; + else + len = 0; + } else { len = rack->r_ctl.rc_tlp_new_data; + } rack->r_ctl.rc_tlp_new_data = 0; - new_data_tlp = doing_tlp = 1; - } else + doing_tlp = 1; + } else { len = rack_what_can_we_send(tp, rack, cwnd_to_use, avail, sb_offset); - if (IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { + } + if ((rack->r_ctl.crte == NULL) && IN_FASTRECOVERY(tp->t_flags) && (len > segsiz)) { /* * For prr=off, we need to send only 1 MSS * at a time. We do this because another sack could @@ -12334,12 +16455,14 @@ } } else { uint32_t outstanding; - /* - * We are inside of a SACK recovery episode and are - * sending new data, having retransmitted all the - * data possible so far in the scoreboard. + * We are inside of a Fast recovery episode, this + * is caused by a SACK or 3 dup acks. At this point + * we have sent all the retransmissions and we rely + * on PRR to dictate what we will send in the form of + * new data. 
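The recovery comment above describes how, once all retransmissions are out, PRR dictates the amount of new data; the code that follows clamps that amount twice. A simplified sketch of the clamp (pure arithmetic; the kernel additionally rounds the result up to a full segment when rc_prr_sendalot permits it):

#include <stdint.h>

static uint32_t
prr_new_data_len(uint32_t snd_wnd, uint32_t outstanding,
    uint32_t prr_sndcnt, uint32_t avail_past_offset)
{
        uint32_t len;

        /* Never exceed what the peer's window still has room for. */
        if (outstanding >= snd_wnd)
                return (0);
        len = snd_wnd - outstanding;
        if (len > avail_past_offset)
                len = avail_past_offset;        /* limited by the socket buffer */
        if (len > prr_sndcnt)
                len = prr_sndcnt;               /* PRR dictates the final budget */
        return (len);
}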
*/ + outstanding = tp->snd_max - tp->snd_una; if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { if (tp->snd_wnd > outstanding) { @@ -12352,15 +16475,18 @@ else len = 0; } - } else + } else { len = 0; - } else if (avail > sb_offset) + } + } else if (avail > sb_offset) { len = avail - sb_offset; - else + } else { len = 0; + } if (len > 0) { - if (len > rack->r_ctl.rc_prr_sndcnt) + if (len > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; + } if (len > 0) { sub_from_prr = 1; counter_u64_add(rack_rtm_prr_newdata, 1); @@ -12374,8 +16500,9 @@ * likely the PRR algorithm is not going to * let us send a lot as well :-) */ - if (rack->r_ctl.rc_prr_sendalot == 0) + if (rack->r_ctl.rc_prr_sendalot == 0) { len = segsiz; + } } else if (len < segsiz) { /* * Do we send any? The idea here is if the @@ -12457,6 +16584,11 @@ tp->snd_nxt = tp->iss; len = 0; } + if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { + /* We only send 1 MSS if we have a DSACK block */ + add_flag |= RACK_SENT_W_DSACK; + len = segsiz; + } orig_len = len; if (len <= 0) { /* @@ -12479,18 +16611,17 @@ (TCPS_HAVEESTABLISHED(tp->t_state)) && (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { - tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } } else if ((rsm == NULL) && - ((doing_tlp == 0) || (new_data_tlp == 1)) && - (len < rack->r_ctl.rc_pace_max_segs)) { + (doing_tlp == 0) && + (len < pace_max_seg)) { /* * We are not sending a maximum sized segment for * some reason. Should we not send anything (think * sws or persists)? */ - if ((tp->snd_wnd < min(max(segsiz, (rack->r_ctl.rc_high_rwnd/2)), minseg)) && + if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), minseg)) && (TCPS_HAVEESTABLISHED(tp->t_state)) && (len < minseg) && (len < (int)(sbavail(sb) - sb_offset))) { @@ -12509,9 +16640,8 @@ * go into persists. */ rack_enter_persist(tp, rack, cts); - tp->snd_nxt = tp->snd_una; } - } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && + } else if ((cwnd_to_use >= max(minseg, (segsiz * 4))) && (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * segsiz)) && (len < (int)(sbavail(sb) - sb_offset)) && (len < minseg)) { @@ -12540,11 +16670,41 @@ * its not a full pacing segment. */ len = 0; + } else if ((rack->r_ctl.crte != NULL) && + (tp->snd_wnd >= (pace_max_seg * max(1, rack_hw_rwnd_factor))) && + (cwnd_to_use >= (pace_max_seg + (4 * segsiz))) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) >= (2 * segsiz)) && + (len < (int)(sbavail(sb) - sb_offset))) { + /* + * Here we are doing hardware pacing, this is not a TLP, + * we are not sending a pace max segment size, there is rwnd + * room to send at least N pace_max_seg, the cwnd is greater + * than or equal to a full pacing segments plus 4 mss and we have 2 or + * more segments in flight and its not the tail of the socket buffer. + * + * We don't want to send instead we need to get more ack's in to + * allow us to send a full pacing segment. Normally, if we are pacing + * about the right speed, we should have finished our pacing + * send as most of the acks have come back if we are at the + * right rate. This is a bit fuzzy since return path delay + * can delay the acks, which is why we want to make sure we + * have cwnd space to have a bit more than a max pace segments in flight. + * + * If we have not gotten our acks back we are pacing at too high a + * rate delaying will not hurt and will bring our GP estimate down by + * injecting the delay. 
If we don't do this we will send + * 2 MSS out in response to the acks being clocked in which + * defeats the point of hw-pacing (i.e. to help us get + * larger TSO's out). + */ + len = 0; + } + } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); - tcp_sndbuf_autoscale(tp, so, min(tp->snd_wnd, cwnd_to_use)); + rack_sndbuf_autoscale(rack); /* * Decide if we can use TCP Segmentation Offloading (if supported by * hardware). @@ -12561,17 +16721,7 @@ * the right thing below to provide length of just ip options and thus * checking for ipoptlen is enough to decide if ip options are present. */ - -#ifdef INET6 - if (isipv6) - ipoptlen = ip6_optlen(tp->t_inpcb); - else -#endif - if (tp->t_inpcb->inp_options) - ipoptlen = tp->t_inpcb->inp_options->m_len - - offsetof(struct ipoption, ipopt_list); - else - ipoptlen = 0; + ipoptlen = 0; #if defined(IPSEC) || defined(IPSEC_SUPPORT) /* * Pre-calculate here as we save another lookup into the darknesses @@ -12783,9 +16933,51 @@ if (tot_len_this_send > 0) { /* Make sure snd_nxt is up to max */ + rack->r_ctl.fsb.recwin = recwin; + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); + if ((error == 0) && + rack_use_rfo && + ((flags & (TH_SYN|TH_FIN)) == 0) && + (ipoptlen == 0) && + (tp->snd_nxt == tp->snd_max) && + (tp->rcv_numsacks == 0) && + rack->r_fsb_inited && + TCPS_HAVEESTABLISHED(tp->t_state) && + (rack->r_must_retran == 0) && + ((tp->t_flags & TF_NEEDFIN) == 0) && + (len > 0) && (orig_len > 0) && + (orig_len > len) && + ((orig_len - len) >= segsiz) && + ((optlen == 0) || + ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { + /* We can send at least one more MSS using our fsb */ + + rack->r_fast_output = 1; + rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); + rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; + rack->r_ctl.fsb.tcp_flags = flags; + rack->r_ctl.fsb.left_to_send = orig_len - len; + KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), + ("rack:%p left_to_send:%u sbavail:%u out:%u", + rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), + (tp->snd_max - tp->snd_una))); + if (rack->r_ctl.fsb.left_to_send < segsiz) + rack->r_fast_output = 0; + else { + if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) + rack->r_ctl.fsb.rfo_apply_push = 1; + else + rack->r_ctl.fsb.rfo_apply_push = 0; + } + } else + rack->r_fast_output = 0; + + + rack_log_fsb(rack, tp, so, flags, + ipoptlen, orig_len, len, 0, + 1, optlen, __LINE__, 1); if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); } else { int end_window = 0; uint32_t seq = tp->gput_ack; @@ -12808,9 +17000,13 @@ minseg)) >= tp->snd_wnd) { /* We are limited by the rwnd */ app_limited = CTF_JR_RWND_LIMITED; + if (IN_FASTRECOVERY(tp->t_flags)) + rack->r_ctl.rc_prr_sndcnt = 0; } else if (ctf_outstanding(tp) >= sbavail(sb)) { /* We are limited by whats available -- app limited */ app_limited = CTF_JR_APP_LIMITED; + if (IN_FASTRECOVERY(tp->t_flags)) + rack->r_ctl.rc_prr_sndcnt = 0; } else if ((idle == 0) && ((tp->t_flags & TF_NODELAY) == 0) && ((uint32_t)len + (uint32_t)sb_offset >= sbavail(sb)) && @@ -12833,10 +17029,7 @@ } else if ((ctf_outstanding(tp) + minseg) > cwnd_to_use) { /* Its the cwnd */ app_limited = CTF_JR_CWND_LIMITED; - } else if (rack->rc_in_persist == 1) { - /* We are in persists */ - app_limited = CTF_JR_PERSISTS; - } else 
if (IN_RECOVERY(tp->t_flags) && + } else if (IN_FASTRECOVERY(tp->t_flags) && (rack->rack_no_prr == 0) && (rack->r_ctl.rc_prr_sndcnt < segsiz)) { app_limited = CTF_JR_PRR; @@ -12864,15 +17057,6 @@ * this case. */ end_window = 1; - } else if (app_limited == CTF_JR_PERSISTS) { - /* - * We never end the measurement window - * in persists, though in theory we - * should be only entering after everything - * is acknowledged (so we will probably - * never come here). - */ - end_window = 0; } else if (rack_rwnd_block_ends_measure && (app_limited == CTF_JR_RWND_LIMITED)) { /* @@ -12937,8 +17121,7 @@ counter_u64_add(rack_unpaced_segments, 1); } /* Check if we need to go into persists or not */ - if ((rack->rc_in_persist == 0) && - (tp->snd_max == tp->snd_una) && + if ((tp->snd_max == tp->snd_una) && TCPS_HAVEESTABLISHED(tp->t_state) && sbavail(sb) && (sbavail(sb) > tp->snd_wnd) && @@ -12955,10 +17138,42 @@ tcp_shared_cwnd_idle(rack->r_ctl.rc_scw, rack->r_ctl.rc_scw_index); rack->rack_scwnd_is_idle = 1; } +#endif +#ifdef TCP_ACCOUNTING + if (tot_len_this_send > 0) { + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_DATA]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_DATA] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_DATA], (crtsc - ts_val)); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) / segsiz); + } + counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) / segsiz)); + } else { + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_LIMITED]++; + } + counter_u64_add(tcp_cnt_counters[SND_LIMITED], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_LIMITED] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_LIMITED], (crtsc - ts_val)); + } + sched_unpin(); #endif return (0); send: + if (rsm || sack_rxmit) + counter_u64_add(rack_nfto_resend, 1); + else + counter_u64_add(rack_non_fto_send, 1); if ((flags & TH_FIN) && sbavail(sb)) { /* @@ -13053,7 +17268,7 @@ /* Timestamps. */ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = cts + tp->ts_offset; + to.to_tsval = ms_cts + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; } @@ -13062,13 +17277,15 @@ (so->so_rcv.sb_flags & SB_AUTOSIZE)) tp->rfbuf_ts = tcp_ts_getticks(); /* Selective ACK's. */ - if (flags & TH_SYN) - to.to_flags |= TOF_SACKPERM; - else if (TCPS_HAVEESTABLISHED(tp->t_state) && - tp->rcv_numsacks > 0) { - to.to_flags |= TOF_SACK; - to.to_nsacks = tp->rcv_numsacks; - to.to_sacks = (u_char *)tp->sackblks; + if (tp->t_flags & TF_SACK_PERMIT) { + if (flags & TH_SYN) + to.to_flags |= TOF_SACKPERM; + else if (TCPS_HAVEESTABLISHED(tp->t_state) && + tp->rcv_numsacks > 0) { + to.to_flags |= TOF_SACK; + to.to_nsacks = tp->rcv_numsacks; + to.to_sacks = (u_char *)tp->sackblks; + } } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) /* TCP-MD5 (RFC2385). */ @@ -13090,6 +17307,18 @@ if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ SOCKBUF_UNLOCK(&so->so_snd); +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_FAIL]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); + sched_unpin(); +#endif return (EHOSTUNREACH); } hdrlen += sizeof(struct udphdr); @@ -13154,7 +17383,7 @@ len -= moff; } } - /* + /* * In case there are too many small fragments don't * use TSO: */ @@ -13224,6 +17453,7 @@ * and initialize the header from the template for sends on this * connection. */ + hw_tls = (sb->sb_flags & SB_TLS_IFNET) != 0; if (len) { uint32_t max_val; uint32_t moff; @@ -13262,6 +17492,8 @@ * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); + s_mb = mb; + s_moff = moff; if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); @@ -13332,8 +17564,10 @@ */ if (sb_offset + len == sbused(sb) && sbused(sb) && - !(flags & TH_SYN)) + !(flags & TH_SYN)) { flags |= TH_PUSH; + add_flag |= RACK_HAD_PUSH; + } SOCKBUF_UNLOCK(sb); } else { @@ -13365,38 +17599,54 @@ #ifdef MAC mac_inpcb_create_mbuf(inp, m); #endif + if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { #ifdef INET6 - if (isipv6) { - ip6 = mtod(m, struct ip6_hdr *); - if (tp->t_port) { - udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); - udp->uh_sport = htons(V_tcp_udp_tunneling_port); - udp->uh_dport = tp->t_port; - ulen = hdrlen + len - sizeof(struct ip6_hdr); + if (isipv6) + ip6 = (struct ip6_hdr *)rack->r_ctl.fsb.tcp_ip_hdr; + else +#endif /* INET6 */ + ip = (struct ip *)rack->r_ctl.fsb.tcp_ip_hdr; + th = rack->r_ctl.fsb.th; + udp = rack->r_ctl.fsb.udp; + if (udp) { + if (isipv6) + ulen = hdrlen + len - sizeof(struct ip6_hdr); + else + ulen = hdrlen + len - sizeof(struct ip); udp->uh_ulen = htons(ulen); - th = (struct tcphdr *)(udp + 1); - } else { - th = (struct tcphdr *)(ip6 + 1); } - tcpip_fillheaders(inp, tp->t_port, ip6, th); - } else + } else { +#ifdef INET6 + if (isipv6) { + ip6 = mtod(m, struct ip6_hdr *); + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip6_hdr); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else + th = (struct tcphdr *)(ip6 + 1); + tcpip_fillheaders(inp, tp->t_port, ip6, th); + } else #endif /* INET6 */ - { - ip = mtod(m, struct ip *); + { + ip = mtod(m, struct ip *); #ifdef TCPDEBUG - ipov = (struct ipovly *)ip; + ipov = (struct ipovly *)ip; #endif - if (tp->t_port) { - udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); - udp->uh_sport = htons(V_tcp_udp_tunneling_port); - udp->uh_dport = tp->t_port; - ulen = hdrlen + len - sizeof(struct ip); - udp->uh_ulen = htons(ulen); - th = (struct tcphdr *)(udp + 1); - } else { - th = (struct tcphdr *)(ip + 1); + if (tp->t_port) { + udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); + udp->uh_sport = htons(V_tcp_udp_tunneling_port); + udp->uh_dport = tp->t_port; + ulen = hdrlen + len - sizeof(struct ip); + udp->uh_ulen = htons(ulen); + th = (struct tcphdr *)(udp + 1); + } else + th = (struct tcphdr *)(ip + 1); + tcpip_fillheaders(inp, tp->t_port, ip, th); } - tcpip_fillheaders(inp, tp->t_port, ip, th); } 
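When the prebuilt fsb header template is used above, the per-send fixup for a UDP-tunneled segment is only the datagram length field; everything else was filled in when the template was built. A sketch of that length computation (hdrlen covers IP/IP6 + UDP + TCP + options, as in the surrounding code, and ip_hdr_len is sizeof(struct ip) or sizeof(struct ip6_hdr) depending on the address family):

#include <stdint.h>
#include <arpa/inet.h>          /* htons() */

/* Recompute the UDP length field for a tunneled TCP segment. */
static uint16_t
tunnel_udp_len(uint32_t hdrlen, uint32_t payload_len, uint32_t ip_hdr_len)
{
        /* hdrlen includes the outer IP header; the UDP length must not. */
        return (htons((uint16_t)(hdrlen + payload_len - ip_hdr_len)));
}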
/* * Fill in fields, remembering maximum advertised window for use in @@ -13464,19 +17714,9 @@ * (retransmit and persist are mutually exclusive...) */ if (sack_rxmit == 0) { - if (len || (flags & (TH_SYN | TH_FIN)) || - rack->rc_in_persist) { + if (len || (flags & (TH_SYN | TH_FIN))) { th->th_seq = htonl(tp->snd_nxt); rack_seq = tp->snd_nxt; - } else if (flags & TH_RST) { - /* - * For a Reset send the last cum ack in sequence - * (this like any other choice may still generate a - * challenge ack, if a ack-update packet is in - * flight). - */ - th->th_seq = htonl(tp->snd_una); - rack_seq = tp->snd_una; } else { th->th_seq = htonl(tp->snd_max); rack_seq = tp->snd_max; @@ -13486,10 +17726,6 @@ rack_seq = rsm->r_start; } th->th_ack = htonl(tp->rcv_nxt); - if (optlen) { - bcopy(opt, th + 1, optlen); - th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; - } th->th_flags = flags; /* * Calculate receive window. Don't shrink window, but avoid silly @@ -13500,8 +17736,9 @@ recwin = 0; } else { if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)segsiz) + recwin < (long)segsiz) { recwin = 0; + } if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) recwin = (long)(tp->rcv_adv - tp->rcv_nxt); @@ -13533,8 +17770,35 @@ tp->t_flags |= TF_RXWIN0SENT; } else tp->t_flags &= ~TF_RXWIN0SENT; - tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ + tp->snd_up = tp->snd_una; /* drag it along, its deprecated */ + /* Now are we using fsb?, if so copy the template data to the mbuf */ + if ((ipoptlen == 0) && (rack->r_ctl.fsb.tcp_ip_hdr) && rack->r_fsb_inited) { + uint8_t *cpto; + cpto = mtod(m, uint8_t *); + memcpy(cpto, rack->r_ctl.fsb.tcp_ip_hdr, rack->r_ctl.fsb.tcp_ip_hdr_len); + /* + * We have just copied in: + * IP/IP6 + * + * tcphdr (no options) + * + * We need to grab the correct pointers into the mbuf + * for both the tcp header, and possibly the udp header (if tunneling). + * We do this by using the offset in the copy buffer and adding it + * to the mbuf base pointer (cpto). + */ +#ifdef INET6 + if (isipv6) + ip6 = mtod(m, struct ip6_hdr *); + else +#endif /* INET6 */ + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.th - rack->r_ctl.fsb.tcp_ip_hdr)); + /* If we have a udp header lets set it into the mbuf as well */ + if (udp) + udp = (struct udphdr *)(cpto + ((uint8_t *)rack->r_ctl.fsb.udp - rack->r_ctl.fsb.tcp_ip_hdr)); + } #if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) if (to.to_flags & TOF_SIGNATURE) { /* @@ -13553,7 +17817,10 @@ } } #endif - + if (optlen) { + bcopy(opt, th + 1, optlen); + th->th_off = (sizeof(struct tcphdr) + optlen) >> 2; + } /* * Put TCP length in extended header, and then checksum extended * header and data. @@ -13623,33 +17890,9 @@ /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif -#ifdef TCPDEBUG - /* - * Trace. - */ - if (so->so_options & SO_DEBUG) { - u_short save = 0; - -#ifdef INET6 - if (!isipv6) -#endif - { - save = ipov->ih_len; - ipov->ih_len = htons(m->m_pkthdr.len /* - hdrlen + - * (th->th_off << 2) */ ); - } - tcp_trace(TA_OUTPUT, tp->t_state, tp, mtod(m, void *), th, 0); -#ifdef INET6 - if (!isipv6) -#endif - ipov->ih_len = save; - } -#endif /* TCPDEBUG */ - /* We're getting ready to send; log now. 
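The template copy above re-derives the header pointers in the destination buffer with plain offset arithmetic: memcpy preserves each header's offset within the saved image, so adding that offset to the new base yields the relocated pointer. A standalone sketch of the idea (the struct and field names here are illustrative, not the kernel's):

#include <stdint.h>
#include <string.h>

struct hdr_template {
        uint8_t *base;          /* saved IP/IP6 + TCP (+ UDP) header image */
        size_t   len;           /* length of that image */
        uint8_t *tcp;           /* pointer to the TCP header inside base */
        uint8_t *udp;           /* pointer to the UDP header, or NULL */
};

/* Copy the template into dst and return the relocated TCP header pointer. */
static uint8_t *
template_copy(uint8_t *dst, const struct hdr_template *t, uint8_t **udp_out)
{
        memcpy(dst, t->base, t->len);
        if (udp_out != NULL)
                *udp_out = (t->udp != NULL) ? dst + (t->udp - t->base) : NULL;
        return (dst + (t->tcp - t->base));      /* same offset, new base */
}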
*/ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; - struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -13679,8 +17922,10 @@ } log.u_bbr.pacing_gain = rack_get_output_gain(rack, rsm); log.u_bbr.flex7 = mark; + log.u_bbr.flex7 <<= 8; + log.u_bbr.flex7 |= pass; log.u_bbr.pkts_out = tp->t_maxseg; - log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.timeStamp = cts; log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); log.u_bbr.lt_epoch = cwnd_to_use; log.u_bbr.delivered = sendalot; @@ -13706,7 +17951,7 @@ * desired default hop limit might be changed via Neighbor * Discovery. */ - ip6->ip6_hlim = in6_selecthlim(inp, NULL); + rack->r_ctl.fsb.hoplimit = ip6->ip6_hlim = in6_selecthlim(inp, NULL); /* * Set the packet size here for the benefit of DTrace @@ -13725,7 +17970,12 @@ TCP_PROBE5(send, NULL, tp, ip6, tp, th); /* TODO: IPv6 IP6TOS_ECT bit on */ - error = ip6_output(m, inp->in6p_outputopts, + error = ip6_output(m, +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + inp->in6p_outputopts, +#else + NULL, +#endif &inp->inp_route6, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), NULL, NULL, inp); @@ -13744,6 +17994,7 @@ if (inp->inp_vflag & INP_IPV6PROTO) ip->ip_ttl = in6_selecthlim(inp, NULL); #endif /* INET6 */ + rack->r_ctl.fsb.hoplimit = ip->ip_ttl; /* * If we do path MTU discovery, then we set DF on every * packet. This might not be the best thing to do according @@ -13768,7 +18019,13 @@ TCP_PROBE5(send, NULL, tp, ip, tp, th); - error = ip_output(m, inp->inp_options, &inp->inp_route, + error = ip_output(m, +#if defined(IPSEC) || defined(IPSEC_SUPPORT) + inp->inp_options, +#else + NULL, +#endif + &inp->inp_route, ((rsm || sack_rxmit) ? IP_NO_SND_TAG_RL : 0), 0, inp); if (error == EMSGSIZE && inp->inp_route.ro_nh != NULL) @@ -13818,23 +18075,31 @@ counter_u64_add(rack_out_size[idx], 1); } } - if (rack->rack_no_prr == 0) { - if (sub_from_prr && (error == 0)) { - if (rack->r_ctl.rc_prr_sndcnt >= len) - rack->r_ctl.rc_prr_sndcnt -= len; - else - rack->r_ctl.rc_prr_sndcnt = 0; - } - } + if ((rack->rack_no_prr == 0) && + sub_from_prr && + (error == 0)) { + if (rack->r_ctl.rc_prr_sndcnt >= len) + rack->r_ctl.rc_prr_sndcnt -= len; + else + rack->r_ctl.rc_prr_sndcnt = 0; + } sub_from_prr = 0; - rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, - pass, rsm, us_cts); + if (doing_tlp && (rsm == NULL)) { + /* New send doing a TLP */ + add_flag |= RACK_TLP; + tp->t_sndtlppack++; + tp->t_sndtlpbyte += len; + } + rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, + rack_to_usec_ts(&tv), + rsm, add_flag, s_mb, s_moff); + + if ((error == 0) && (len > 0) && (tp->snd_una == tp->snd_max)) rack->r_ctl.rc_tlp_rxt_last_time = cts; - /* Now are we in persists? */ - if (rack->rc_in_persist == 0) { + { tcp_seq startseq = tp->snd_nxt; /* Track our lost count */ @@ -13903,33 +18168,26 @@ ((tp->t_flags & TF_GPUTINPROG) == 0)) rack_start_gp_measurement(tp, rack, startseq, sb_offset); } - } else { /* - * Persist case, update snd_max but since we are in persist - * mode (no window) we do not update snd_nxt. + * If we are doing FO we need to update the mbuf position and subtract + * this happens when the peer sends us duplicate information and + * we thus want to send a DSACK. + * + * XXXRRS: This brings to mind a ?, when we send a DSACK block is TSO + * turned off? If not then we are going to echo multiple DSACK blocks + * out (with the TSO), which we should not be doing. 
*/ - int32_t xlen = len; - - if (error) - goto nomore; - - if (flags & TH_SYN) - ++xlen; - if (flags & TH_FIN) { - ++xlen; - tp->t_flags |= TF_SENTFIN; - } - /* In the ENOBUFS case we do *not* update snd_max */ - if (SEQ_GT(tp->snd_nxt + xlen, tp->snd_max)) { - if (tp->snd_una == tp->snd_max) { - /* - * Update the time we just added data since - * none was outstanding. - */ - rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); - tp->t_acktime = ticks; + if (rack->r_fast_output && len) { + if (rack->r_ctl.fsb.left_to_send > len) + rack->r_ctl.fsb.left_to_send -= len; + else + rack->r_ctl.fsb.left_to_send = 0; + if (rack->r_ctl.fsb.left_to_send < segsiz) + rack->r_fast_output = 0; + if (rack->r_fast_output) { + rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); + rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; } - tp->snd_max = tp->snd_nxt + len; } } nomore: @@ -13952,24 +18210,34 @@ switch (error) { case EPERM: tp->t_softerror = error; +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_FAIL]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); + sched_unpin(); +#endif return (error); case ENOBUFS: - if (slot == 0) { - /* - * Pace us right away to retry in a some - * time - */ - slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); - if (rack->rc_enobuf < 126) - rack->rc_enobuf++; - if (slot > ((rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC)) { - slot = (rack->rc_rack_rtt / 2) * HPTS_USEC_IN_MSEC; - } - if (slot < (10 * HPTS_USEC_IN_MSEC)) - slot = 10 * HPTS_USEC_IN_MSEC; + /* + * Pace us right away to retry in a some + * time + */ + slot = ((1 + rack->rc_enobuf) * HPTS_USEC_IN_MSEC); + if (rack->rc_enobuf < 0x7f) + rack->rc_enobuf++; + if (slot < (10 * HPTS_USEC_IN_MSEC)) + slot = 10 * HPTS_USEC_IN_MSEC; + if (rack->r_ctl.crte != NULL) { + counter_u64_add(rack_saw_enobuf_hw, 1); + tcp_rl_log_enobuf(rack->r_ctl.crte); } counter_u64_add(rack_saw_enobuf, 1); - error = 0; goto enobufs; case EMSGSIZE: /* @@ -13988,6 +18256,18 @@ } slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_FAIL]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); + sched_unpin(); +#endif return (error); case ENETUNREACH: counter_u64_add(rack_saw_enetunreach, 1); @@ -14001,10 +18281,24 @@ default: slot = 10 * HPTS_USEC_IN_MSEC; rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount(); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_FAIL]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_FAIL], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_FAIL] += (crtsc - ts_val); + } + counter_u64_add(tcp_proc_time[SND_OUT_FAIL], (crtsc - ts_val)); + sched_unpin(); +#endif return (error); } } else { rack->rc_enobuf = 0; + if (IN_FASTRECOVERY(tp->t_flags) && rsm) + rack->r_ctl.retran_during_recovery += len; } KMOD_TCPSTAT_INC(tcps_sndtotal); @@ -14015,12 +18309,10 @@ */ if (recwin > 0 && SEQ_GT(tp->rcv_nxt + recwin, 
tp->rcv_adv)) tp->rcv_adv = tp->rcv_nxt + recwin; + tp->last_ack_sent = tp->rcv_nxt; tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); enobufs: - /* Assure when we leave that snd_nxt will point to top */ - if (SEQ_GT(tp->snd_max, tp->snd_nxt)) - tp->snd_nxt = tp->snd_max; if (sendalot) { /* Do we need to turn off sendalot? */ if (rack->r_ctl.rc_pace_max_segs && @@ -14053,6 +18345,7 @@ slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); } if (rsm && + (rsm->r_flags & RACK_HAS_SYN) == 0 && rack->use_rack_rr) { /* Its a retransmit and we use the rack cheat? */ if ((slot == 0) || @@ -14063,21 +18356,172 @@ * are using old-style rack or * we are overriden to use the old 1ms pacing. */ - slot = rack->r_ctl.rc_min_to * HPTS_USEC_IN_MSEC; + slot = rack->r_ctl.rc_min_to; } } + /* We have sent clear the flag */ + rack->r_ent_rec_ns = 0; + if (rack->r_must_retran) { + if (rsm) { + rack->r_ctl.rc_out_at_rto -= (rsm->r_end - rsm->r_start); + if (SEQ_GEQ(rsm->r_end, rack->r_ctl.rc_snd_max_at_rto)) { + /* + * We have retransmitted all. + */ + rack->r_must_retran = 0; + rack->r_ctl.rc_out_at_rto = 0; + } + } else if (SEQ_GEQ(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { + /* + * Sending new data will also kill + * the loop. + */ + rack->r_must_retran = 0; + rack->r_ctl.rc_out_at_rto = 0; + } + } + rack->r_ctl.fsb.recwin = recwin; + if ((tp->t_flags & (TF_WASCRECOVERY|TF_WASFRECOVERY)) && + SEQ_GT(tp->snd_max, rack->r_ctl.rc_snd_max_at_rto)) { + /* + * We hit an RTO and now have past snd_max at the RTO + * clear all the WAS flags. + */ + tp->t_flags &= ~(TF_WASCRECOVERY|TF_WASFRECOVERY); + } if (slot) { /* set the rack tcb into the slot N */ counter_u64_add(rack_paced_segments, 1); + if ((error == 0) && + rack_use_rfo && + ((flags & (TH_SYN|TH_FIN)) == 0) && + (rsm == NULL) && + (tp->snd_nxt == tp->snd_max) && + (ipoptlen == 0) && + (tp->rcv_numsacks == 0) && + rack->r_fsb_inited && + TCPS_HAVEESTABLISHED(tp->t_state) && + (rack->r_must_retran == 0) && + ((tp->t_flags & TF_NEEDFIN) == 0) && + (len > 0) && (orig_len > 0) && + (orig_len > len) && + ((orig_len - len) >= segsiz) && + ((optlen == 0) || + ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { + /* We can send at least one more MSS using our fsb */ + + rack->r_fast_output = 1; + rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); + rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; + rack->r_ctl.fsb.tcp_flags = flags; + rack->r_ctl.fsb.left_to_send = orig_len - len; + KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), + ("rack:%p left_to_send:%u sbavail:%u out:%u", + rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), + (tp->snd_max - tp->snd_una))); + if (rack->r_ctl.fsb.left_to_send < segsiz) + rack->r_fast_output = 0; + else { + if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) + rack->r_ctl.fsb.rfo_apply_push = 1; + else + rack->r_ctl.fsb.rfo_apply_push = 0; + } + } else + rack->r_fast_output = 0; + rack_log_fsb(rack, tp, so, flags, + ipoptlen, orig_len, len, error, + (rsm == NULL), optlen, __LINE__, 2); } else if (sendalot) { + int ret; + if (len) counter_u64_add(rack_unpaced_segments, 1); sack_rxmit = 0; + if ((error == 0) && + rack_use_rfo && + ((flags & (TH_SYN|TH_FIN)) == 0) && + (rsm == NULL) && + (ipoptlen == 0) && + (tp->rcv_numsacks == 0) && + (tp->snd_nxt == tp->snd_max) && + (rack->r_must_retran == 0) && + rack->r_fsb_inited && + TCPS_HAVEESTABLISHED(tp->t_state) && + ((tp->t_flags & TF_NEEDFIN) == 0) && + (len > 
0) && (orig_len > 0) && + (orig_len > len) && + ((orig_len - len) >= segsiz) && + ((optlen == 0) || + ((optlen == TCPOLEN_TSTAMP_APPA) && (to.to_flags & TOF_TS)))) { + /* we can use fast_output for more */ + + rack->r_fast_output = 1; + rack->r_ctl.fsb.m = sbsndmbuf(sb, (tp->snd_max - tp->snd_una), &rack->r_ctl.fsb.off); + rack->r_ctl.fsb.o_m_len = rack->r_ctl.fsb.m->m_len; + rack->r_ctl.fsb.tcp_flags = flags; + rack->r_ctl.fsb.left_to_send = orig_len - len; + KASSERT((rack->r_ctl.fsb.left_to_send <= (sbavail(sb) - (tp->snd_max - tp->snd_una))), + ("rack:%p left_to_send:%u sbavail:%u out:%u", + rack, rack->r_ctl.fsb.left_to_send, sbavail(sb), + (tp->snd_max - tp->snd_una))); + if (rack->r_ctl.fsb.left_to_send < segsiz) { + rack->r_fast_output = 0; + } + if (rack->r_fast_output) { + if (rack->r_ctl.fsb.left_to_send == (sbavail(sb) - (tp->snd_max - tp->snd_una))) + rack->r_ctl.fsb.rfo_apply_push = 1; + else + rack->r_ctl.fsb.rfo_apply_push = 0; + rack_log_fsb(rack, tp, so, flags, + ipoptlen, orig_len, len, error, + (rsm == NULL), optlen, __LINE__, 3); + error = 0; + ret = rack_fast_output(tp, rack, ts_val, cts, ms_cts, &tv, tot_len_this_send, &error); + if (ret >= 0) + return (ret); + else if (error) + goto nomore; + + } + } goto again; } else if (len) { counter_u64_add(rack_unpaced_segments, 1); } + /* Assure when we leave that snd_nxt will point to top */ + if (SEQ_GT(tp->snd_max, tp->snd_nxt)) + tp->snd_nxt = tp->snd_max; rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); +#ifdef TCP_ACCOUNTING + crtsc = get_cyclecount() - ts_val; + if (tot_len_this_send) { + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_DATA]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_DATA], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_DATA] += crtsc; + } + counter_u64_add(tcp_proc_time[SND_OUT_DATA], crtsc); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[CNT_OF_MSS_OUT] += ((tot_len_this_send + segsiz - 1) /segsiz); + } + counter_u64_add(tcp_cnt_counters[CNT_OF_MSS_OUT], ((tot_len_this_send + segsiz - 1) /segsiz)); + } else { + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_cnt_counters[SND_OUT_ACK]++; + } + counter_u64_add(tcp_cnt_counters[SND_OUT_ACK], 1); + if (tp->t_flags2 & TF2_TCP_ACCOUNTING) { + tp->tcp_proc_time[SND_OUT_ACK] += crtsc; + } + counter_u64_add(tcp_proc_time[SND_OUT_ACK], crtsc); + } + sched_unpin(); +#endif + if (error == ENOBUFS) + error = 0; return (error); } @@ -14087,133 +18531,380 @@ uint32_t orig_val; orig_val = rack->r_ctl.rc_pace_max_segs; - rack_set_pace_segments(rack->rc_tp, rack, __LINE__); + rack_set_pace_segments(rack->rc_tp, rack, __LINE__, NULL); if (orig_val != rack->r_ctl.rc_pace_max_segs) rack_log_pacing_delay_calc(rack, 0, 0, orig_val, 0, 0, 15, __LINE__, NULL); } -/* - * rack_ctloutput() must drop the inpcb lock before performing copyin on - * socket option arguments. When it re-acquires the lock after the copy, it - * has to revalidate that the connection is still valid for the socket - * option. - */ +static void +rack_mtu_change(struct tcpcb *tp) +{ + /* + * The MSS may have changed + */ + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + if (rack->r_ctl.rc_pace_min_segs != ctf_fixed_maxseg(tp)) { + /* + * The MTU has changed we need to resend everything + * since all we have sent is lost. We first fix + * up the mtu though. 
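The MTU-change path described above, and implemented just below, treats a shrinking path MTU like a retransmit timeout: everything in flight is assumed lost and the must-retransmit window is pinned at the current snd_max. A reduced sketch of that bookkeeping (fields are flattened into one illustrative struct; the real function also rebuilds the pacing segment sizes and the fast-send block and clears the SACK filter):

#include <stdint.h>

struct retran_state {
        uint32_t out_at_rto;     /* bytes in flight when the decision was made */
        uint32_t snd_max_at_rto; /* retransmit until we have re-sent up to here */
        uint8_t  must_retran;    /* force the retransmission loop in output */
};

static void
mtu_shrunk(struct retran_state *rs, uint32_t bytes_in_flight, uint32_t snd_max)
{
        /* Everything sent with the old MSS is treated as lost. */
        rs->out_at_rto = bytes_in_flight;
        rs->snd_max_at_rto = snd_max;
        rs->must_retran = 1;
}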
+ */ + rack_set_pace_segments(tp, rack, __LINE__, NULL); + /* We treat this like a full retransmit timeout without the cwnd adjustment */ + rack_remxt_tmr(tp); + rack->r_fast_output = 0; + rack->r_ctl.rc_out_at_rto = ctf_flight_size(tp, + rack->r_ctl.rc_sacked); + rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; + rack->r_must_retran = 1; + + } + sack_filter_clear(&rack->r_ctl.rack_sf, tp->snd_una); + /* We don't use snd_nxt to retransmit */ + tp->snd_nxt = tp->snd_max; +} + static int -rack_set_sockopt(struct socket *so, struct sockopt *sopt, - struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) +rack_set_profile(struct tcp_rack *rack, int prof) +{ + int err = EINVAL; + if (prof == 1) { + /* pace_always=1 */ + if (rack->rc_always_pace == 0) { + if (tcp_can_enable_pacing() == 0) + return (EBUSY); + } + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + rack->rack_attempt_hdwr_pace = 0; + /* cmpack=1 */ + if (rack_use_cmp_acks) + rack->r_use_cmp_ack = 1; + if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && + rack->r_use_cmp_ack) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + /* gp_inc_ca */ + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=1 */ + rack->rc_pace_to_cwnd = 1; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=1 */ + rack->r_limit_scw = 1; + /* gp_inc_rec */ + rack->r_ctl.rack_per_of_gp_rec = 90; + err = 0; + + } else if (prof == 3) { + /* Same as profile one execept fill_cw becomes 2 (less aggressive set) */ + /* pace_always=1 */ + if (rack->rc_always_pace == 0) { + if (tcp_can_enable_pacing() == 0) + return (EBUSY); + } + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + rack->rack_attempt_hdwr_pace = 0; + /* cmpack=1 */ + if (rack_use_cmp_acks) + rack->r_use_cmp_ack = 1; + if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state) && + rack->r_use_cmp_ack) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + /* gp_inc_ca */ + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=2 */ + rack->rc_pace_to_cwnd = 1; + rack->r_fill_less_agg = 1; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=1 */ + rack->r_limit_scw = 1; + /* gp_inc_rec */ + rack->r_ctl.rack_per_of_gp_rec = 90; + err = 0; + + + } else if (prof == 2) { + /* cmpack=1 */ + if (rack->rc_always_pace == 0) { + if (tcp_can_enable_pacing() == 0) + return (EBUSY); + } + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + rack->r_use_cmp_ack = 1; + if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + /* pace_always=1 */ + rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + /* scwnd=1 */ + rack->rack_enable_scwnd = 1; + /* dynamic=100 */ + rack->rc_gp_dyn_mul = 1; + rack->r_ctl.rack_per_of_gp_ca = 100; + /* rrr_conf=3 */ + rack->r_rr_config = 3; + /* npush=2 */ + rack->r_ctl.rc_no_push_at_mrtt = 2; + /* fillcw=1 */ + 
rack->rc_pace_to_cwnd = 1; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + /* noprr=1 */ + rack->rack_no_prr = 1; + /* lscwnd=0 */ + rack->r_limit_scw = 0; + err = 0; + } else if (prof == 0) { + /* This changes things back to the default settings */ + err = 0; + if (rack->rc_always_pace) { + tcp_decrement_paced_conn(); + rack_undo_cc_pacing(rack); + rack->rc_always_pace = 0; + } + if (rack_pace_every_seg && tcp_can_enable_pacing()) { + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + } else + rack->rc_always_pace = 0; + if (rack_use_cmp_acks) + rack->r_use_cmp_ack = 1; + else + rack->r_use_cmp_ack = 0; + if (rack_disable_prr) + rack->rack_no_prr = 1; + else + rack->rack_no_prr = 0; + if (rack_gp_no_rec_chg) + rack->rc_gp_no_rec_chg = 1; + else + rack->rc_gp_no_rec_chg = 0; + if (rack_enable_mqueue_for_nonpaced || rack->r_use_cmp_ack) { + rack->r_mbuf_queue = 1; + if (TCPS_HAVEESTABLISHED(rack->rc_tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; + rack->rc_inp->inp_flags2 |= INP_SUPPORTS_MBUFQ; + } else { + rack->r_mbuf_queue = 0; + rack->rc_inp->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + } + if (rack_enable_shared_cwnd) + rack->rack_enable_scwnd = 1; + else + rack->rack_enable_scwnd = 0; + if (rack_do_dyn_mul) { + /* When dynamic adjustment is on CA needs to start at 100% */ + rack->rc_gp_dyn_mul = 1; + if (rack_do_dyn_mul >= 100) + rack->r_ctl.rack_per_of_gp_ca = rack_do_dyn_mul; + } else { + rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; + rack->rc_gp_dyn_mul = 0; + } + rack->r_rr_config = 0; + rack->r_ctl.rc_no_push_at_mrtt = 0; + rack->rc_pace_to_cwnd = 0; + rack->rc_pace_fill_if_rttin_range = 0; + rack->rtt_limit_mul = 0; + + if (rack_enable_hw_pacing) + rack->rack_hdw_pace_ena = 1; + else + rack->rack_hdw_pace_ena = 0; + if (rack_disable_prr) + rack->rack_no_prr = 1; + else + rack->rack_no_prr = 0; + if (rack_limits_scwnd) + rack->r_limit_scw = 1; + else + rack->r_limit_scw = 0; + err = 0; + } + return (err); +} + +static int +rack_add_deferred_option(struct tcp_rack *rack, int sopt_name, uint64_t loptval) +{ + struct deferred_opt_list *dol; + + dol = malloc(sizeof(struct deferred_opt_list), + M_TCPFSB, M_NOWAIT|M_ZERO); + if (dol == NULL) { + /* + * No space yikes -- fail out.. 
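The deferred-option helper in this hunk simply queues the (name, value) pair so it can be replayed once the connection is ready; the only failure mode is an allocation miss. A user-space sketch of the same idea using the sys/queue.h macros the kernel version also relies on (the list-head type name here is illustrative):

#include <stdint.h>
#include <stdlib.h>
#include <sys/queue.h>

struct deferred_opt {
        int                        optname;
        uint64_t                   optval;
        TAILQ_ENTRY(deferred_opt)  next;
};

TAILQ_HEAD(deferred_opt_head, deferred_opt);

/* Returns 1 when queued, 0 when memory was not available. */
static int
defer_option(struct deferred_opt_head *head, int optname, uint64_t optval)
{
        struct deferred_opt *dol;

        dol = calloc(1, sizeof(*dol));
        if (dol == NULL)
                return (0);
        dol->optname = optname;
        dol->optval = optval;
        TAILQ_INSERT_TAIL(head, dol, next);
        return (1);
}

The caller is expected to have initialized the head with TAILQ_INIT() before the first enqueue.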
+ */ + return (0); + } + dol->optname = sopt_name; + dol->optval = loptval; + TAILQ_INSERT_TAIL(&rack->r_ctl.opt_list, dol, next); + return (1); +} + +static int +rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, + uint32_t optval, uint64_t loptval) { struct epoch_tracker et; + struct sockopt sopt; + struct cc_newreno_opts opt; uint64_t val; - int32_t error = 0, optval; + int error = 0; uint16_t ca, ss; - - switch (sopt->sopt_name) { - case TCP_RACK_PROP_RATE: /* URL:prop_rate */ - case TCP_RACK_PROP : /* URL:prop */ - case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ - case TCP_RACK_EARLY_RECOV: /* URL:early_recov */ - case TCP_RACK_PACE_REDUCE: /* Not used */ - /* Pacing related ones */ - case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ - case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ - case TCP_BBR_IWINTSO: /* URL:tso_iwin */ - case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ - case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ - case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ - case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ - case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ - case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ - case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ - case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ - case TCP_RACK_RR_CONF: /* URL:rrr_conf */ - case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ - /* End pacing related */ - case TCP_DELACK: - case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ - case TCP_RACK_MIN_TO: /* URL:min_to */ - case TCP_RACK_EARLY_SEG: /* URL:early_seg */ - case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ - case TCP_RACK_REORD_FADE: /* URL:reord_fade */ - case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ - case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ - case TCP_RACK_TLP_USE: /* URL:tlp_use */ - case TCP_RACK_TLP_INC_VAR: /* URL:tlp_inc_var */ - case TCP_RACK_IDLE_REDUCE_HIGH: /* URL:idle_reduce_high */ - case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ - case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ - case TCP_RACK_DO_DETECTION: /* URL:detect */ - case TCP_NO_PRR: /* URL:noprr */ - case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ - case TCP_DATA_AFTER_CLOSE: - case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ - case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ - case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ - case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ - case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ - case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ - case TCP_RACK_PROFILE: /* URL:profile */ + + switch (sopt_name) { + + case TCP_RACK_PACING_BETA: + RACK_OPTS_INC(tcp_rack_beta); + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { + /* This only works for newreno. */ + error = EINVAL; + break; + } + if (rack->rc_pacing_cc_set) { + /* + * Set them into the real CC module + * whats in the rack pcb is the old values + * to be used on restoral/ + */ + sopt.sopt_dir = SOPT_SET; + opt.name = CC_NEWRENO_BETA; + opt.val = optval; + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); + else { + error = ENOENT; + break; + } + } else { + /* + * Not pacing yet so set it into our local + * rack pcb storage. + */ + rack->r_ctl.rc_saved_beta.beta = optval; + } break; - default: - return (tcp_default_ctloutput(so, sopt, inp, tp)); + case TCP_RACK_PACING_BETA_ECN: + RACK_OPTS_INC(tcp_rack_beta_ecn); + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) { + /* This only works for newreno. 
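The two pacing-beta options here have two possible destinations: if the pacing copy of newreno is already installed the value goes through the congestion-control module's ctl_output hook, otherwise it is parked in the rack pcb for later restoral. A sketch of that decision with the CC call reduced to a function pointer (the types below are simplified stand-ins for struct sockopt and struct cc_newreno_opts):

#include <stdint.h>
#include <errno.h>

struct cc_opt {
        int      name;          /* e.g. CC_NEWRENO_BETA */
        uint32_t val;
};

struct beta_dest {
        int       pacing_cc_set;                /* pacing newreno installed? */
        uint32_t *saved_beta;                   /* rack-local storage otherwise */
        int     (*ctl_output)(struct cc_opt *); /* CC module hook, may be NULL */
};

static int
set_pacing_beta(struct beta_dest *d, int name, uint32_t val)
{
        struct cc_opt opt = { .name = name, .val = val };

        if (d->pacing_cc_set) {
                if (d->ctl_output == NULL)
                        return (ENOENT);        /* module has no control hook */
                return (d->ctl_output(&opt));
        }
        *d->saved_beta = val;                   /* applied when pacing CC is set */
        return (0);
}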
*/ + error = EINVAL; + break; + } + if (rack->rc_pacing_cc_set) { + /* + * Set them into the real CC module + * whats in the rack pcb is the old values + * to be used on restoral/ + */ + sopt.sopt_dir = SOPT_SET; + opt.name = CC_NEWRENO_BETA_ECN; + opt.val = optval; + if (CC_ALGO(tp)->ctl_output != NULL) + error = CC_ALGO(tp)->ctl_output(tp->ccv, &sopt, &opt); + else + error = ENOENT; + } else { + /* + * Not pacing yet so set it into our local + * rack pcb storage. + */ + rack->r_ctl.rc_saved_beta.beta_ecn = optval; + rack->r_ctl.rc_saved_beta.newreno_flags = CC_NEWRENO_BETA_ECN; + } + break; + case TCP_DEFER_OPTIONS: + RACK_OPTS_INC(tcp_defer_opt); + if (optval) { + if (rack->gp_ready) { + /* Too late */ + error = EINVAL; + break; + } + rack->defer_options = 1; + } else + rack->defer_options = 0; + break; + case TCP_RACK_MEASURE_CNT: + RACK_OPTS_INC(tcp_rack_measure_cnt); + if (optval && (optval <= 0xff)) { + rack->r_ctl.req_measurements = optval; + } else + error = EINVAL; + break; + case TCP_REC_ABC_VAL: + RACK_OPTS_INC(tcp_rec_abc_val); + if (optval > 0) + rack->r_use_labc_for_rec = 1; + else + rack->r_use_labc_for_rec = 0; + break; + case TCP_RACK_ABC_VAL: + RACK_OPTS_INC(tcp_rack_abc_val); + if ((optval > 0) && (optval < 255)) + rack->rc_labc = optval; + else + error = EINVAL; + break; + case TCP_HDWR_UP_ONLY: + RACK_OPTS_INC(tcp_pacing_up_only); + if (optval) + rack->r_up_only = 1; + else + rack->r_up_only = 0; + break; + case TCP_PACING_RATE_CAP: + RACK_OPTS_INC(tcp_pacing_rate_cap); + rack->r_ctl.bw_rate_cap = loptval; break; - } - INP_WUNLOCK(inp); - error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); - if (error) - return (error); - INP_WLOCK(inp); - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { - INP_WUNLOCK(inp); - return (ECONNRESET); - } - tp = intotcpcb(inp); - rack = (struct tcp_rack *)tp->t_fb_ptr; - switch (sopt->sopt_name) { case TCP_RACK_PROFILE: RACK_OPTS_INC(tcp_profile); - if (optval == 1) { - /* pace_always=1 */ - rack->rc_always_pace = 1; - tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; - /* scwnd=1 */ - rack->rack_enable_scwnd = 1; - /* dynamic=100 */ - rack->rc_gp_dyn_mul = 1; - rack->r_ctl.rack_per_of_gp_ca = 100; - /* rrr_conf=3 */ - rack->r_rr_config = 3; - /* npush=2 */ - rack->r_ctl.rc_no_push_at_mrtt = 2; - /* fillcw=1 */ - rack->rc_pace_to_cwnd = 1; - rack->rc_pace_fill_if_rttin_range = 0; - rack->rtt_limit_mul = 0; - /* noprr=1 */ - rack->rack_no_prr = 1; - /* lscwnd=1 */ - rack->r_limit_scw = 1; - } else if (optval == 2) { - /* pace_always=1 */ - rack->rc_always_pace = 1; + error = rack_set_profile(rack, optval); + break; + case TCP_USE_CMP_ACKS: + RACK_OPTS_INC(tcp_use_cmp_acks); + if ((optval == 0) && (rack->rc_inp->inp_flags2 & INP_MBUF_ACKCMP)) { + /* You can't turn it off once its on! 
*/ + error = EINVAL; + } else if ((optval == 1) && (rack->r_use_cmp_ack == 0)) { + rack->r_use_cmp_ack = 1; + rack->r_mbuf_queue = 1; tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; - /* scwnd=1 */ - rack->rack_enable_scwnd = 1; - /* dynamic=100 */ - rack->rc_gp_dyn_mul = 1; - rack->r_ctl.rack_per_of_gp_ca = 100; - /* rrr_conf=3 */ - rack->r_rr_config = 3; - /* npush=2 */ - rack->r_ctl.rc_no_push_at_mrtt = 2; - /* fillcw=1 */ - rack->rc_pace_to_cwnd = 1; - rack->rc_pace_fill_if_rttin_range = 0; - rack->rtt_limit_mul = 0; - /* noprr=1 */ - rack->rack_no_prr = 1; - /* lscwnd=0 */ - rack->r_limit_scw = 0; } + if (rack->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) + rack->rc_inp->inp_flags2 |= INP_MBUF_ACKCMP; break; case TCP_SHARED_CWND_TIME_LIMIT: RACK_OPTS_INC(tcp_lscwnd); @@ -14226,8 +18917,11 @@ RACK_OPTS_INC(tcp_fillcw); if (optval == 0) rack->rc_pace_to_cwnd = 0; - else + else { rack->rc_pace_to_cwnd = 1; + if (optval > 1) + rack->r_fill_less_agg = 1; + } if ((optval >= rack_gp_rtt_maxmul) && rack_gp_rtt_maxmul && (optval < 0xf)) { @@ -14257,11 +18951,11 @@ case TCP_RACK_MBUF_QUEUE: /* Now do we use the LRO mbuf-queue feature */ RACK_OPTS_INC(tcp_rack_mbufq); - if (optval) + if (optval || rack->r_use_cmp_ack) rack->r_mbuf_queue = 1; else rack->r_mbuf_queue = 0; - if (rack->r_mbuf_queue || rack->rc_always_pace) + if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; @@ -14277,8 +18971,12 @@ RACK_OPTS_INC(tcp_rack_noprr); if (optval == 0) rack->rack_no_prr = 0; - else + else if (optval == 1) rack->rack_no_prr = 1; + else if (optval == 2) + rack->no_prr_addback = 1; + else + error = EINVAL; break; case TCP_TIMELY_DYN_ADJ: RACK_OPTS_INC(tcp_timely_dyn); @@ -14302,14 +19000,6 @@ else rack->do_detection = 1; break; - case TCP_RACK_PROP_RATE: - if ((optval <= 0) || (optval >= 100)) { - error = EINVAL; - break; - } - RACK_OPTS_INC(tcp_rack_prop_rate); - rack->r_ctl.rc_prop_rate = optval; - break; case TCP_RACK_TLP_USE: if ((optval < TLP_USE_ID) || (optval > TLP_USE_TWO_TWO)) { error = EINVAL; @@ -14318,34 +19008,39 @@ RACK_OPTS_INC(tcp_tlp_use); rack->rack_tlp_threshold_use = optval; break; - case TCP_RACK_PROP: - /* RACK proportional rate reduction (bool) */ - RACK_OPTS_INC(tcp_rack_prop); - rack->r_ctl.rc_prop_reduce = optval; - break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ RACK_OPTS_INC(tcp_rack_tlp_reduce); rack->r_ctl.rc_tlp_cwnd_reduce = optval; break; - case TCP_RACK_EARLY_RECOV: - /* Should recovery happen early (bool) */ - RACK_OPTS_INC(tcp_rack_early_recov); - rack->r_ctl.rc_early_recovery = optval; - break; - - /* Pacing related ones */ + /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: /* * zero is old rack method, 1 is new * method using a pacing rate. 
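Annotation: two couplings in the option handlers above are easy to miss: TCP_USE_CMP_ACKS is a one-way latch once INP_MBUF_ACKCMP has been set on the inpcb, and enabling it forces mbuf queueing, so the derived INP_SUPPORTS_MBUFQ flag has to be recomputed from all three inputs (mbuf queue, always-pace, compressed acks). A small standalone sketch of that flag discipline follows; the flag bit values and struct conn are illustrative, not the kernel's.

#include <stdint.h>
#include <stdio.h>

#define INP_SUPPORTS_MBUFQ      0x01    /* illustrative bit values only */
#define INP_MBUF_ACKCMP         0x02

struct conn {
        uint32_t inp_flags2;
        int      mbuf_queue;    /* TCP_RACK_MBUF_QUEUE */
        int      always_pace;   /* TCP_RACK_PACE_ALWAYS */
        int      use_cmp_ack;   /* TCP_USE_CMP_ACKS */
};

/* Recompute the derived "LRO may queue mbufs for me" flag after any change. */
static void
update_mbufq(struct conn *c)
{
        if (c->mbuf_queue || c->always_pace || c->use_cmp_ack)
                c->inp_flags2 |= INP_SUPPORTS_MBUFQ;
        else
                c->inp_flags2 &= ~INP_SUPPORTS_MBUFQ;
}

static int
set_cmp_acks(struct conn *c, int on)
{
        if (!on && (c->inp_flags2 & INP_MBUF_ACKCMP))
                return (-1);    /* one-way latch: LRO may already be building ack arrays */
        if (on) {
                c->use_cmp_ack = 1;
                c->mbuf_queue = 1;      /* compressed acks require mbuf queueing */
        }
        update_mbufq(c);
        return (0);
}

int
main(void)
{
        struct conn c = { 0 };

        set_cmp_acks(&c, 1);
        c.inp_flags2 |= INP_MBUF_ACKCMP;        /* set once the connection is established */
        printf("turning it off now %s\n", set_cmp_acks(&c, 0) ? "fails" : "works");
        return (0);
}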
*/ RACK_OPTS_INC(tcp_rack_pace_always); - if (optval > 0) - rack->rc_always_pace = 1; - else - rack->rc_always_pace = 0; - if (rack->r_mbuf_queue || rack->rc_always_pace) + if (optval > 0) { + if (rack->rc_always_pace) { + error = EALREADY; + break; + } else if (tcp_can_enable_pacing()) { + rack->rc_always_pace = 1; + if (rack->use_fixed_rate || rack->gp_ready) + rack_set_cc_pacing(rack); + } + else { + error = ENOSPC; + break; + } + } else { + if (rack->rc_always_pace) { + tcp_decrement_paced_conn(); + rack->rc_always_pace = 0; + rack_undo_cc_pacing(rack); + } + } + if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; else tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; @@ -14426,7 +19121,7 @@ /* Max segments size in a pace in bytes */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_user_set_max_segs = optval; - rack_set_pace_segments(tp, rack, __LINE__); + rack_set_pace_segments(tp, rack, __LINE__, NULL); break; case TCP_RACK_PACE_RATE_REC: /* Set the fixed pacing rate in Bytes per second ca */ @@ -14437,6 +19132,8 @@ if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) rack->r_ctl.rc_fixed_pacing_rate_ss = optval; rack->use_fixed_rate = 1; + if (rack->rc_always_pace) + rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, @@ -14453,6 +19150,8 @@ if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; + if (rack->rc_always_pace) + rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, @@ -14469,6 +19168,8 @@ if (rack->r_ctl.rc_fixed_pacing_rate_rec == 0) rack->r_ctl.rc_fixed_pacing_rate_rec = optval; rack->use_fixed_rate = 1; + if (rack->rc_always_pace) + rack_set_cc_pacing(rack); rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_fixed_pacing_rate_ss, rack->r_ctl.rc_fixed_pacing_rate_ca, @@ -14527,6 +19228,17 @@ else rack->r_rr_config = 0; break; + case TCP_HDWR_RATE_CAP: + RACK_OPTS_INC(tcp_hdwr_rate_cap); + if (optval) { + if (rack->r_rack_hw_rate_caps == 0) + rack->r_rack_hw_rate_caps = 1; + else + error = EALREADY; + } else { + rack->r_rack_hw_rate_caps = 0; + } + break; case TCP_BBR_HDWR_PACE: RACK_OPTS_INC(tcp_hdwr_pacing); if (optval){ @@ -14538,14 +19250,16 @@ } else { rack->rack_hdw_pace_ena = 0; #ifdef RATELIMIT - if (rack->rack_hdrw_pacing) { + if (rack->r_ctl.crte != NULL) { rack->rack_hdrw_pacing = 0; - in_pcbdetach_txrtlmt(rack->rc_inp); + rack->rack_attempt_hdwr_pace = 0; + tcp_rel_pacing_rate(rack->r_ctl.crte, tp); + rack->r_ctl.crte = NULL; } #endif } break; - /* End Pacing related ones */ + /* End Pacing related ones */ case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ RACK_OPTS_INC(tcp_rack_prr_sendalot); @@ -14589,19 +19303,20 @@ else rack->use_rack_rr = 0; break; + case TCP_FAST_RSM_HACK: + RACK_OPTS_INC(tcp_rack_fastrsm_hack); + if (optval) + rack->fast_rsm_hack = 1; + else + rack->fast_rsm_hack = 0; + break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. 
rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); rack->r_ctl.rc_pkt_delay = optval; break; - case TCP_RACK_TLP_INC_VAR: - /* Does TLP include rtt variance in t-o */ - error = EINVAL; - break; - case TCP_RACK_IDLE_REDUCE_HIGH: - error = EINVAL; - break; case TCP_DELACK: + RACK_OPTS_INC(tcp_rack_delayed_ack); if (optval == 0) tp->t_delayed_ack = 0; else @@ -14616,6 +19331,7 @@ break; case TCP_BBR_RACK_RTT_USE: + RACK_OPTS_INC(tcp_rack_rtt_use); if ((optval != USE_RTT_HIGH) && (optval != USE_RTT_LOW) && (optval != USE_RTT_AVG)) @@ -14624,32 +19340,208 @@ rack->r_ctl.rc_rate_sample_method = optval; break; case TCP_DATA_AFTER_CLOSE: + RACK_OPTS_INC(tcp_data_after_close); if (optval) rack->rc_allow_data_af_clo = 1; else rack->rc_allow_data_af_clo = 0; break; - case TCP_RACK_PACE_REDUCE: - /* sysctl only now */ - error = EINVAL; - break; default: - return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } #ifdef NETFLIX_STATS - tcp_log_socket_option(tp, sopt->sopt_name, optval, error); + tcp_log_socket_option(tp, sopt_name, optval, error); #endif + return (error); +} + + +static void +rack_apply_deferred_options(struct tcp_rack *rack) +{ + struct deferred_opt_list *dol, *sdol; + uint32_t s_optval; + + TAILQ_FOREACH_SAFE(dol, &rack->r_ctl.opt_list, next, sdol) { + TAILQ_REMOVE(&rack->r_ctl.opt_list, dol, next); + /* Disadvantage of deferal is you loose the error return */ + s_optval = (uint32_t)dol->optval; + (void)rack_process_option(rack->rc_tp, rack, dol->optname, s_optval, dol->optval); + free(dol, M_TCPDO); + } +} + +/* + * rack_ctloutput() must drop the inpcb lock before performing copyin on + * socket option arguments. When it re-acquires the lock after the copy, it + * has to revalidate that the connection is still valid for the socket + * option. + */ +static int +rack_set_sockopt(struct socket *so, struct sockopt *sopt, + struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) +{ + uint64_t loptval; + int32_t error = 0, optval; + + switch (sopt->sopt_name) { + case TCP_RACK_TLP_REDUCE: /* URL:tlp_reduce */ + /* Pacing related ones */ + case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ + case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ + case TCP_BBR_IWINTSO: /* URL:tso_iwin */ + case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ + case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ + case TCP_RACK_PACE_RATE_CA: /* URL:pr_ca */ + case TCP_RACK_PACE_RATE_SS: /* URL:pr_ss*/ + case TCP_RACK_PACE_RATE_REC: /* URL:pr_rec */ + case TCP_RACK_GP_INCREASE_CA: /* URL:gp_inc_ca */ + case TCP_RACK_GP_INCREASE_SS: /* URL:gp_inc_ss */ + case TCP_RACK_GP_INCREASE_REC: /* URL:gp_inc_rec */ + case TCP_RACK_RR_CONF: /* URL:rrr_conf */ + case TCP_BBR_HDWR_PACE: /* URL:hdwrpace */ + case TCP_HDWR_RATE_CAP: /* URL: hdwrcap boolean */ + case TCP_PACING_RATE_CAP: /* URL:cap-- used by side-channel */ + case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ + /* End pacing related */ + case TCP_FAST_RSM_HACK: /* URL:frsm_hack */ + case TCP_DELACK: /* URL:delack (in base TCP i.e. 
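Annotation: rack_apply_deferred_options() above drains a TAILQ of (optname, optval) pairs and re-feeds each one through rack_process_option() once the connection is ready; as the comment notes, the individual error returns are lost at that point. A minimal userland rendering of the same queue is below, using the <sys/queue.h> macros (FreeBSD's version provides TAILQ_FOREACH_SAFE); struct dopt, opt_defer and opt_apply_all are illustrative names, and the option numbers in main() are arbitrary.

#include <sys/queue.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

struct dopt {
        TAILQ_ENTRY(dopt) next;
        int      optname;
        uint64_t optval;
};
TAILQ_HEAD(defer_queue, dopt);

static int
opt_defer(struct defer_queue *q, int optname, uint64_t optval)
{
        struct dopt *d;

        d = malloc(sizeof(*d));
        if (d == NULL)
                return (0);             /* caller turns this into ENOMEM */
        d->optname = optname;
        d->optval = optval;
        TAILQ_INSERT_TAIL(q, d, next);
        return (1);
}

static void
opt_apply_all(struct defer_queue *q)
{
        struct dopt *d, *tmp;

        TAILQ_FOREACH_SAFE(d, q, next, tmp) {
                TAILQ_REMOVE(q, d, next);
                /* The real code calls rack_process_option() here; its error is dropped. */
                printf("applying opt %d = %llu\n", d->optname,
                    (unsigned long long)d->optval);
                free(d);
        }
}

int
main(void)
{
        struct defer_queue q = TAILQ_HEAD_INITIALIZER(q);

        opt_defer(&q, 1, 12000);
        opt_defer(&q, 2, 24000);
        opt_apply_all(&q);              /* run once the transfer is "gp_ready" */
        return (0);
}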
tcp_hints along with cc etc ) */ + case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ + case TCP_RACK_MIN_TO: /* URL:min_to */ + case TCP_RACK_EARLY_SEG: /* URL:early_seg */ + case TCP_RACK_REORD_THRESH: /* URL:reord_thresh */ + case TCP_RACK_REORD_FADE: /* URL:reord_fade */ + case TCP_RACK_TLP_THRESH: /* URL:tlp_thresh */ + case TCP_RACK_PKT_DELAY: /* URL:pkt_delay */ + case TCP_RACK_TLP_USE: /* URL:tlp_use */ + case TCP_BBR_RACK_RTT_USE: /* URL:rttuse */ + case TCP_BBR_USE_RACK_RR: /* URL:rackrr */ + case TCP_RACK_DO_DETECTION: /* URL:detect */ + case TCP_NO_PRR: /* URL:noprr */ + case TCP_TIMELY_DYN_ADJ: /* URL:dynamic */ + case TCP_DATA_AFTER_CLOSE: /* no URL */ + case TCP_RACK_NONRXT_CFG_RATE: /* URL:nonrxtcr */ + case TCP_SHARED_CWND_ENABLE: /* URL:scwnd */ + case TCP_RACK_MBUF_QUEUE: /* URL:mqueue */ + case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ + case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ + case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ + case TCP_RACK_PROFILE: /* URL:profile */ + case TCP_USE_CMP_ACKS: /* URL:cmpack */ + case TCP_RACK_ABC_VAL: /* URL:labc */ + case TCP_REC_ABC_VAL: /* URL:reclabc */ + case TCP_RACK_MEASURE_CNT: /* URL:measurecnt */ + case TCP_DEFER_OPTIONS: /* URL:defer */ + case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ + case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ + break; + default: + /* Filter off all unknown options to the base stack */ + return (tcp_default_ctloutput(so, sopt, inp, tp)); + break; + } + INP_WUNLOCK(inp); + if (sopt->sopt_name == TCP_PACING_RATE_CAP) { + error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); + /* + * We truncate it down to 32 bits for the socket-option trace this + * means rates > 34Gbps won't show right, but thats probably ok. + */ + optval = (uint32_t)loptval; + } else { + error = sooptcopyin(sopt, &optval, sizeof(optval), sizeof(optval)); + /* Save it in 64 bit form too */ + loptval = optval; + } + if (error) + return (error); + INP_WLOCK(inp); + if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { + INP_WUNLOCK(inp); + return (ECONNRESET); + } + if (rack->defer_options && (rack->gp_ready == 0) && + (sopt->sopt_name != TCP_DEFER_OPTIONS) && + (sopt->sopt_name != TCP_RACK_PACING_BETA) && + (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && + (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { + /* Options are beind deferred */ + if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { + INP_WUNLOCK(inp); + return (0); + } else { + /* No memory to defer, fail */ + INP_WUNLOCK(inp); + return (ENOMEM); + } + } + error = rack_process_option(tp, rack, sopt->sopt_name, optval, loptval); INP_WUNLOCK(inp); return (error); } +static void +rack_fill_info(struct tcpcb *tp, struct tcp_info *ti) +{ + + INP_WLOCK_ASSERT(tp->t_inpcb); + bzero(ti, sizeof(*ti)); + + ti->tcpi_state = tp->t_state; + if ((tp->t_flags & TF_REQ_TSTMP) && (tp->t_flags & TF_RCVD_TSTMP)) + ti->tcpi_options |= TCPI_OPT_TIMESTAMPS; + if (tp->t_flags & TF_SACK_PERMIT) + ti->tcpi_options |= TCPI_OPT_SACK; + if ((tp->t_flags & TF_REQ_SCALE) && (tp->t_flags & TF_RCVD_SCALE)) { + ti->tcpi_options |= TCPI_OPT_WSCALE; + ti->tcpi_snd_wscale = tp->snd_scale; + ti->tcpi_rcv_wscale = tp->rcv_scale; + } + if (tp->t_flags2 & TF2_ECN_PERMIT) + ti->tcpi_options |= TCPI_OPT_ECN; + if (tp->t_flags & TF_FASTOPEN) + ti->tcpi_options |= TCPI_OPT_TFO; + /* still kept in ticks is t_rcvtime */ + ti->tcpi_last_data_recv = ((uint32_t)ticks - tp->t_rcvtime) * tick; + /* Since we hold everything in precise useconds this is easy */ + ti->tcpi_rtt = tp->t_srtt; 
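Annotation: rack_set_sockopt() above copies TCP_PACING_RATE_CAP in as a full uint64_t and only truncates to 32 bits for the socket-option trace; the "rates > 34Gbps won't show right" remark is simply UINT32_MAX bytes per second expressed in gigabits. A quick standalone check of that arithmetic and of the truncation effect:

#include <stdint.h>
#include <stdio.h>

int
main(void)
{
        uint64_t cap_bps = 6ULL * 1000 * 1000 * 1000;   /* a 6 GB/s (48 Gbit/s) cap */
        uint32_t traced = (uint32_t)cap_bps;            /* what the 32-bit option trace records */

        /* Largest byte rate that still fits in 32 bits, shown in Gbit/s. */
        printf("32-bit limit: %.2f Gbit/s\n", (double)UINT32_MAX * 8 / 1e9);
        printf("cap %llu B/s traces as %u B/s\n",
            (unsigned long long)cap_bps, (unsigned)traced);
        return (0);
}

The first line prints 34.36, which is where the 34 Gbps figure in the comment comes from.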
+ ti->tcpi_rttvar = tp->t_rttvar; + ti->tcpi_rto = tp->t_rxtcur; + ti->tcpi_snd_ssthresh = tp->snd_ssthresh; + ti->tcpi_snd_cwnd = tp->snd_cwnd; + /* + * FreeBSD-specific extension fields for tcp_info. + */ + ti->tcpi_rcv_space = tp->rcv_wnd; + ti->tcpi_rcv_nxt = tp->rcv_nxt; + ti->tcpi_snd_wnd = tp->snd_wnd; + ti->tcpi_snd_bwnd = 0; /* Unused, kept for compat. */ + ti->tcpi_snd_nxt = tp->snd_nxt; + ti->tcpi_snd_mss = tp->t_maxseg; + ti->tcpi_rcv_mss = tp->t_maxseg; + ti->tcpi_snd_rexmitpack = tp->t_sndrexmitpack; + ti->tcpi_rcv_ooopack = tp->t_rcvoopack; + ti->tcpi_snd_zerowin = tp->t_sndzerowin; +#ifdef NETFLIX_STATS + ti->tcpi_total_tlp = tp->t_sndtlppack; + ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; + memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); +#endif +#ifdef TCP_OFFLOAD + if (tp->t_flags & TF_TOE) { + ti->tcpi_options |= TCPI_OPT_TOE; + tcp_offload_tcp_info(tp, ti); + } +#endif +} + static int rack_get_sockopt(struct socket *so, struct sockopt *sopt, struct inpcb *inp, struct tcpcb *tp, struct tcp_rack *rack) { int32_t error, optval; - uint64_t val; + uint64_t val, loptval; + struct tcp_info ti; /* * Because all our options are either boolean or an int, we can just * pull everything into optval and then unlock and copy. If we ever @@ -14658,12 +19550,93 @@ */ error = 0; switch (sopt->sopt_name) { + case TCP_INFO: + /* First get the info filled */ + rack_fill_info(tp, &ti); + /* Fix up the rtt related fields if needed */ + INP_WUNLOCK(inp); + error = sooptcopyout(sopt, &ti, sizeof ti); + return (error); + /* + * Beta is the congestion control value for NewReno that influences how + * much of a backoff happens when loss is detected. It is normally set + * to 50 for 50% i.e. the cwnd is reduced to 50% of its previous value + * when you exit recovery. + */ + case TCP_RACK_PACING_BETA: + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) + error = EINVAL; + else if (rack->rc_pacing_cc_set == 0) + optval = rack->r_ctl.rc_saved_beta.beta; + else { + /* + * Reach out into the CC data and report back what + * I have previously set. Yeah it looks hackish but + * we don't want to report the saved values. + */ + if (tp->ccv->cc_data) + optval = ((struct newreno *)tp->ccv->cc_data)->beta; + else + error = EINVAL; + } + break; + /* + * Beta_ecn is the congestion control value for NewReno that influences how + * much of a backoff happens when a ECN mark is detected. It is normally set + * to 80 for 80% i.e. the cwnd is reduced by 20% of its previous value when + * you exit recovery. Note that classic ECN has a beta of 50, it is only + * ABE Ecn that uses this "less" value, but we do too with pacing :) + */ + + case TCP_RACK_PACING_BETA_ECN: + if (strcmp(tp->cc_algo->name, CCALGONAME_NEWRENO) != 0) + error = EINVAL; + else if (rack->rc_pacing_cc_set == 0) + optval = rack->r_ctl.rc_saved_beta.beta_ecn; + else { + /* + * Reach out into the CC data and report back what + * I have previously set. Yeah it looks hackish but + * we don't want to report the saved values. 
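Annotation: the getter comments above spell out the semantics of the two values: beta and beta_ecn are percentages of cwnd kept when leaving recovery, so 50 means a halving and 80 means only a 20% reduction (the ABE-style backoff that the pacing path also opts into). A small worked calculation of both backoffs, assuming the usual new_cwnd = cwnd * beta / 100 form that the comments describe:

#include <stdint.h>
#include <stdio.h>

static uint32_t
backoff(uint32_t cwnd, uint32_t beta)
{
        /* beta is the percentage of the current cwnd kept after the congestion event */
        return ((uint32_t)(((uint64_t)cwnd * beta) / 100));
}

int
main(void)
{
        uint32_t cwnd = 100 * 1448;     /* 100 segments of 1448 bytes */

        printf("loss    (beta=50): cwnd %u -> %u\n", (unsigned)cwnd, (unsigned)backoff(cwnd, 50));
        printf("ECN/ABE (beta=80): cwnd %u -> %u\n", (unsigned)cwnd, (unsigned)backoff(cwnd, 80));
        return (0);
}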
+ */ + if (tp->ccv->cc_data) + optval = ((struct newreno *)tp->ccv->cc_data)->beta_ecn; + else + error = EINVAL; + } + break; + case TCP_FAST_RSM_HACK: + optval = rack->fast_rsm_hack; + break; + case TCP_DEFER_OPTIONS: + optval = rack->defer_options; + break; + case TCP_RACK_MEASURE_CNT: + optval = rack->r_ctl.req_measurements; + break; + case TCP_REC_ABC_VAL: + optval = rack->r_use_labc_for_rec; + break; + case TCP_RACK_ABC_VAL: + optval = rack->rc_labc; + break; + case TCP_HDWR_UP_ONLY: + optval= rack->r_up_only; + break; + case TCP_PACING_RATE_CAP: + loptval = rack->r_ctl.bw_rate_cap; + break; case TCP_RACK_PROFILE: /* You cannot retrieve a profile, its write only */ error = EINVAL; break; + case TCP_USE_CMP_ACKS: + optval = rack->r_use_cmp_ack; + break; case TCP_RACK_PACE_TO_FILL: optval = rack->rc_pace_to_cwnd; + if (optval && rack->r_fill_less_agg) + optval++; break; case TCP_RACK_NO_PUSH_AT_MAX: optval = rack->r_ctl.rc_no_push_at_mrtt; @@ -14675,7 +19648,12 @@ optval = rack->rack_rec_nonrxt_use_cr; break; case TCP_NO_PRR: - optval = rack->rack_no_prr; + if (rack->rack_no_prr == 1) + optval = 1; + else if (rack->no_prr_addback == 1) + optval = 2; + else + optval = 0; break; case TCP_RACK_DO_DETECTION: optval = rack->do_detection; @@ -14690,25 +19668,10 @@ case TCP_BBR_IWINTSO: optval = rack->rc_init_win; break; - case TCP_RACK_PROP_RATE: - optval = rack->r_ctl.rc_prop_rate; - break; - case TCP_RACK_PROP: - /* RACK proportional rate reduction (bool) */ - optval = rack->r_ctl.rc_prop_reduce; - break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ optval = rack->r_ctl.rc_tlp_cwnd_reduce; break; - case TCP_RACK_EARLY_RECOV: - /* Should recovery happen early (bool) */ - optval = rack->r_ctl.rc_early_recovery; - break; - case TCP_RACK_PACE_REDUCE: - /* RACK Hptsi reduction factor (divisor) */ - error = EINVAL; - break; case TCP_BBR_RACK_INIT_RATE: val = rack->r_ctl.init_rate; /* convert to kbits per sec */ @@ -14754,6 +19717,9 @@ case TCP_RACK_RR_CONF: optval = rack->r_rr_config; break; + case TCP_HDWR_RATE_CAP: + optval = rack->r_rack_hw_rate_caps; + break; case TCP_BBR_HDWR_PACE: optval = rack->rack_hdw_pace_ena; break; @@ -14768,13 +19734,6 @@ case TCP_RACK_TLP_USE: optval = rack->rack_tlp_threshold_use; break; - case TCP_RACK_TLP_INC_VAR: - /* Does TLP include rtt variance in t-o */ - error = EINVAL; - break; - case TCP_RACK_IDLE_REDUCE_HIGH: - error = EINVAL; - break; case TCP_RACK_PACE_RATE_CA: optval = rack->r_ctl.rc_fixed_pacing_rate_ca; break; @@ -14808,7 +19767,10 @@ } INP_WUNLOCK(inp); if (error == 0) { - error = sooptcopyout(sopt, &optval, sizeof optval); + if (TCP_PACING_RATE_CAP) + error = sooptcopyout(sopt, &loptval, sizeof loptval); + else + error = sooptcopyout(sopt, &optval, sizeof optval); } return (error); } @@ -14857,7 +19819,9 @@ .tfb_tcp_timer_stop = rack_timer_stop, .tfb_tcp_rexmit_tmr = rack_remxt_tmr, .tfb_tcp_handoff_ok = rack_handoff_ok, + .tfb_tcp_mtu_chg = rack_mtu_change, .tfb_pru_options = rack_pru_options, + }; static const char *rack_stack_names[] = { Index: sys/netinet/tcp_stacks/rack_bbr_common.h =================================================================== --- sys/netinet/tcp_stacks/rack_bbr_common.h +++ sys/netinet/tcp_stacks/rack_bbr_common.h @@ -98,12 +98,20 @@ uint32_t ctf_outstanding(struct tcpcb *tp); uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); int -ctf_drop_checks(struct tcpopt *to, struct mbuf *m, - struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, - int32_t * drop_hdrlen, int32_t * 
ret_val); +_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t *tlenp, + int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val, + uint32_t *ts, uint32_t *cnt); +void ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt); +#define ctf_drop_checks(a, b, c, d, e, f, g, h) _ctf_drop_checks(a, b, c, d, e, f, g, h, NULL, NULL) + void -ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); +__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t thflags, int32_t tlen, + int32_t *ret_val, uint32_t *ts, uint32_t *cnt); + +#define ctf_do_dropafterack(a, b, c, d, e, f) __ctf_do_dropafterack(a, b, c, d, e, f, NULL, NULL) + void ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen); @@ -122,6 +130,9 @@ ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); +int +ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags); + void ctf_calc_rwin(struct socket *so, struct tcpcb *tp); Index: sys/netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- sys/netinet/tcp_stacks/rack_bbr_common.c +++ sys/netinet/tcp_stacks/rack_bbr_common.c @@ -96,6 +96,7 @@ #include #include #include +#include #include #include #ifdef TCPDEBUG @@ -161,6 +162,130 @@ } #endif +static int +ctf_get_enet_type(struct ifnet *ifp, struct mbuf *m) +{ + struct ether_header *eh; + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip = NULL; /* Keep compiler happy. */ +#endif + int32_t tlen; + uint16_t drop_hdrlen; + uint16_t etype; + uint8_t iptos; + + /* Is it the easy way? */ + if (m->m_flags & M_LRO_EHDRSTRP) + return (m->m_pkthdr.lro_etype); + /* + * Ok this is the old style call, the ethernet header is here. + * This also means no checksum or BPF were done. This + * can happen if the race to setup the inp fails and + * LRO sees no INP at packet input, but by the time + * we queue the packets an INP gets there. Its rare + * but it can occur so we will handle it. Note that + * this means duplicated work but with the rarity of it + * its not worth worrying about. 
+ */ + /* Let the BPF see the packet */ + if (bpf_peers_present(ifp->if_bpf)) + ETHER_BPF_MTAP(ifp, m); + /* Now the csum */ + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + m_adj(m, sizeof(*eh)); + switch (etype) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { + m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); + if (m == NULL) { + KMOD_TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + return (-1); + } + } + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + drop_hdrlen = sizeof(*ip6); + tlen = ntohs(ip6->ip6_plen); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_pseudo(ip6, tlen, + IPPROTO_TCP, + m->m_pkthdr.csum_data); + th->th_sum ^= 0xffff; + } else + th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); + if (th->th_sum) { + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + return (-1); + } + return (etype); + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + if (m->m_len < sizeof (struct tcpiphdr)) { + m = m_pullup(m, sizeof (struct tcpiphdr)); + if (m == NULL) { + KMOD_TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + return (-1); + } + } + ip = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip + 1); + drop_hdrlen = sizeof(*ip); + iptos = ip->ip_tos; + tlen = ntohs(ip->ip_len) - sizeof(struct ip); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + tlen + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + int len; + struct ipovly *ipov = (struct ipovly *)ip; + /* + * Checksum extended TCP header and data. + */ + len = drop_hdrlen + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = htons(tlen); + th->th_sum = in_cksum(m, len); + /* Reset length for SDT probes. */ + ip->ip_len = htons(len); + /* Reset TOS bits */ + ip->ip_tos = iptos; + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + } + if (th->th_sum) { + KMOD_TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + return (-1); + } + break; + } +#endif + }; + return (etype); +} + /* * The function ctf_process_inbound_raw() is used by * transport developers to do the steps needed to @@ -170,6 +295,7 @@ * - INP_SUPPORTS_MBUFQ * - INP_MBUF_QUEUE_READY * - INP_DONT_SACK_QUEUE + * - INP_MBUF_ACKCMP * * These flags help control how LRO will deliver * packets to the transport. You first set in inp_flags2 @@ -186,6 +312,18 @@ * In some transport designs this is important since knowing * the actual time we got the packet is useful information. * + * A new special type of mbuf may also be supported by the transport + * if it has set the INP_MBUF_ACKCMP flag. If its set, LRO will + * possibly create a M_ACKCMP type mbuf. This is a mbuf with + * an array of "acks". One thing also to note is that when this + * occurs a subsequent LRO may find at the back of the untouched + * mbuf queue chain a M_ACKCMP and append on to it. This means + * that until the transport pulls in the mbuf chain queued + * for it more ack's may get on the mbufs that were already + * delivered. There currently is a limit of 6 acks condensed + * into 1 mbuf which means often when this is occuring, we + * don't get that effect but it does happen. 
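Annotation: when the NIC has not validated the TCP checksum, the IPv4 branch above falls back to a software in_cksum() over the pseudo-header plus segment (the ipovly overlay trick), and both address families treat a non-zero residual as a bad checksum and drop. The underlying ones'-complement sum (RFC 1071) is easy to reproduce in userland; this is a flat-buffer sketch, not the kernel's mbuf-aware in_cksum(), and the sample bytes are made up.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

/* RFC 1071 style internet checksum over a flat buffer. */
static uint16_t
cksum(const void *buf, size_t len)
{
        const uint8_t *p = buf;
        uint32_t sum = 0;

        while (len > 1) {
                sum += ((uint32_t)p[0] << 8) | p[1];
                p += 2;
                len -= 2;
        }
        if (len == 1)
                sum += (uint32_t)p[0] << 8;             /* pad the odd trailing byte */
        while (sum >> 16)
                sum = (sum & 0xffff) + (sum >> 16);     /* fold the carries back in */
        return ((uint16_t)~sum);
}

int
main(void)
{
        /* bytes 4-5 stand in for the (zeroed) checksum field */
        uint8_t seg[] = { 0x45, 0x00, 0x00, 0x1c, 0x00, 0x00, 0xde, 0xad };
        uint16_t c = cksum(seg, sizeof(seg));

        seg[4] = c >> 8;
        seg[5] = c & 0xff;
        /* A receiver summing the whole segment, checksum included, must see 0. */
        printf("verify residual: 0x%04x\n", (unsigned)cksum(seg, sizeof(seg)));
        return (0);
}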
+ * * Now there are some interesting Caveats that the transport * designer needs to take into account when using this feature. * @@ -247,7 +385,6 @@ * shipped in, the tcb has been destroyed (or about to be destroyed). */ struct mbuf *m_save; - struct ether_header *eh; struct tcphdr *th; #ifdef INET6 struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ @@ -257,20 +394,18 @@ #endif struct ifnet *ifp; struct timeval tv; + struct inpcb *inp; int32_t retval, nxt_pkt, tlen, off; - uint16_t etype; + int etype = 0; uint16_t drop_hdrlen; - uint8_t iptos, no_vn=0, bpf_req=0; + uint8_t iptos, no_vn=0; NET_EPOCH_ASSERT(); - - if (m && m->m_pkthdr.rcvif) - ifp = m->m_pkthdr.rcvif; + if (m) + ifp = m_rcvif(m); else ifp = NULL; - if (ifp) { - bpf_req = bpf_peers_present(ifp->if_bpf); - } else { + if (ifp == NULL) { /* * We probably should not work around * but kassert, since lro alwasy sets rcvif. @@ -280,147 +415,86 @@ } CURVNET_SET(ifp->if_vnet); skip_vnet: + tcp_get_usecs(&tv); while (m) { m_save = m->m_nextpkt; m->m_nextpkt = NULL; - /* Now lets get the ether header */ - eh = mtod(m, struct ether_header *); - etype = ntohs(eh->ether_type); - /* Let the BPF see the packet */ - if (bpf_req && ifp) - ETHER_BPF_MTAP(ifp, m); - m_adj(m, sizeof(*eh)); - /* Trim off the ethernet header */ - switch (etype) { -#ifdef INET6 - case ETHERTYPE_IPV6: - { - if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { - m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); - if (m == NULL) { - KMOD_TCPSTAT_INC(tcps_rcvshort); - m_freem(m); - goto skipped_pkt; - } - } - ip6 = (struct ip6_hdr *)(eh + 1); - th = (struct tcphdr *)(ip6 + 1); - tlen = ntohs(ip6->ip6_plen); - drop_hdrlen = sizeof(*ip6); - if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { - if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) - th->th_sum = m->m_pkthdr.csum_data; - else - th->th_sum = in6_cksum_pseudo(ip6, tlen, - IPPROTO_TCP, m->m_pkthdr.csum_data); - th->th_sum ^= 0xffff; - } else - th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); - if (th->th_sum) { - KMOD_TCPSTAT_INC(tcps_rcvbadsum); - m_freem(m); - goto skipped_pkt; - } - /* - * Be proactive about unspecified IPv6 address in source. - * As we use all-zero to indicate unbounded/unconnected pcb, - * unspecified IPv6 address can be used to confuse us. - * - * Note that packets with unspecified IPv6 destination is - * already dropped in ip6_input. 
- */ - if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { - /* XXX stat */ - m_freem(m); + if ((m->m_flags & M_ACKCMP) == 0) { + /* Now lets get the ether header */ + etype = ctf_get_enet_type(ifp, m); + if (etype == -1) { + /* Skip this packet it was freed by checksum */ goto skipped_pkt; } - iptos = IPV6_TRAFFIC_CLASS(ip6); - break; - } + KASSERT(((etype == ETHERTYPE_IPV6) || (etype == ETHERTYPE_IP)), + ("tp:%p m:%p etype:0x%x -- not IP or IPv6", tp, m, etype)); + /* Trim off the ethernet header */ + switch (etype) { +#ifdef INET6 + case ETHERTYPE_IPV6: + ip6 = mtod(m, struct ip6_hdr *); + th = (struct tcphdr *)(ip6 + 1); + tlen = ntohs(ip6->ip6_plen); + drop_hdrlen = sizeof(*ip6); + iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; #endif #ifdef INET - case ETHERTYPE_IP: - { - if (m->m_len < sizeof (struct tcpiphdr)) { - if ((m = m_pullup(m, sizeof (struct tcpiphdr))) - == NULL) { - KMOD_TCPSTAT_INC(tcps_rcvshort); - m_freem(m); - goto skipped_pkt; - } - } - ip = (struct ip *)(eh + 1); - th = (struct tcphdr *)(ip + 1); - drop_hdrlen = sizeof(*ip); - iptos = ip->ip_tos; - tlen = ntohs(ip->ip_len) - sizeof(struct ip); - if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { - if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) - th->th_sum = m->m_pkthdr.csum_data; - else - th->th_sum = in_pseudo(ip->ip_src.s_addr, - ip->ip_dst.s_addr, - htonl(m->m_pkthdr.csum_data + tlen + - IPPROTO_TCP)); - th->th_sum ^= 0xffff; - } else { - int len; - struct ipovly *ipov = (struct ipovly *)ip; - /* - * Checksum extended TCP header and data. - */ - len = drop_hdrlen + tlen; - bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); - ipov->ih_len = htons(tlen); - th->th_sum = in_cksum(m, len); - /* Reset length for SDT probes. */ - ip->ip_len = htons(len); - /* Reset TOS bits */ - ip->ip_tos = iptos; - /* Re-initialization for later version check */ - ip->ip_v = IPVERSION; - ip->ip_hl = sizeof(*ip) >> 2; - } - if (th->th_sum) { - KMOD_TCPSTAT_INC(tcps_rcvbadsum); - m_freem(m); - goto skipped_pkt; - } - break; - } + case ETHERTYPE_IP: + ip = mtod(m, struct ip *); + th = (struct tcphdr *)(ip + 1); + drop_hdrlen = sizeof(*ip); + iptos = ip->ip_tos; + tlen = ntohs(ip->ip_len) - sizeof(struct ip); + break; #endif - } - /* - * Convert TCP protocol specific fields to host format. - */ - tcp_fields_to_host(th); - - off = th->th_off << 2; - if (off < sizeof (struct tcphdr) || off > tlen) { - KMOD_TCPSTAT_INC(tcps_rcvbadoff); + } /* end switch */ + /* + * Convert TCP protocol specific fields to host format. + */ + tcp_fields_to_host(th); + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + printf("off:%d < hdrlen:%zu || > tlen:%u -- dump\n", + off, + sizeof(struct tcphdr), + tlen); + KMOD_TCPSTAT_INC(tcps_rcvbadoff); m_freem(m); goto skipped_pkt; - } - tlen -= off; - drop_hdrlen += off; - /* - * Now lets setup the timeval to be when we should - * have been called (if we can). - */ - m->m_pkthdr.lro_nsegs = 1; - if (m->m_flags & M_TSTMP_LRO) { - tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; - tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } + tlen -= off; + drop_hdrlen += off; + /* + * Now lets setup the timeval to be when we should + * have been called (if we can). + */ + m->m_pkthdr.lro_nsegs = 1; + /* Now what about next packet? */ } else { - /* Should not be should we kassert instead? */ - tcp_get_usecs(&tv); + /* + * This mbuf is an array of acks that have + * been compressed. We assert the inp has + * the flag set to enable this! 
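Annotation: for the M_ACKCMP case the mbuf data is an array of fixed-size ack records (the earlier caveat comment mentions up to 6 acks condensed into one mbuf), so the count of acks is just m_len divided by the entry size, which is exactly how the tcps_rcvtotal accounting a few lines below does it. The sketch below walks such an array in userland; struct ack_ent here is an illustrative layout, not the kernel's struct tcp_ackent.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Illustrative fixed-size ack record; the real tcp_ackent layout is in the kernel headers. */
struct ack_ent {
        uint32_t ack;           /* cumulative ack */
        uint32_t window;        /* advertised window */
        uint32_t ts_value;      /* peer timestamp */
        uint32_t ts_echo;       /* echoed timestamp */
};

int
main(void)
{
        uint8_t buf[3 * sizeof(struct ack_ent)];        /* stands in for m->m_data / m_len */
        struct ack_ent e = { .ack = 1000, .window = 65535, .ts_value = 7, .ts_echo = 5 };
        size_t len = sizeof(buf);
        int cnt, i;

        for (i = 0; i < 3; i++) {
                e.ack += 1448;
                memcpy(buf + i * sizeof(e), &e, sizeof(e));
        }
        /* Same arithmetic as the stats bump: number of acks = m_len / entry size. */
        cnt = (int)(len / sizeof(struct ack_ent));
        for (i = 0; i < cnt; i++) {
                struct ack_ent cur;

                memcpy(&cur, buf + i * sizeof(cur), sizeof(cur));
                printf("ack %d: cum-ack %u win %u\n", i, (unsigned)cur.ack, (unsigned)cur.window);
        }
        return (0);
}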
+ */ + KASSERT((tp->t_inpcb->inp_flags2 & INP_MBUF_ACKCMP), + ("tp:%p inp:%p no INP_MBUF_ACKCMP flags?", tp, tp->t_inpcb)); + tlen = 0; + drop_hdrlen = 0; + th = NULL; + iptos = 0; } - /* Now what about next packet? */ + tcp_get_usecs(&tv); if (m_save || has_pkt) nxt_pkt = 1; else nxt_pkt = 0; - KMOD_TCPSTAT_INC(tcps_rcvtotal); + if ((m->m_flags & M_ACKCMP) == 0) + KMOD_TCPSTAT_INC(tcps_rcvtotal); + else + KMOD_TCPSTAT_ADD(tcps_rcvtotal, (m->m_len / sizeof(struct tcp_ackent))); + inp = tp->t_inpcb; + INP_WLOCK_ASSERT(inp); retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, iptos, nxt_pkt, &tv); if (retval) { @@ -434,6 +508,7 @@ } if (no_vn == 0) CURVNET_RESTORE(); + INP_UNLOCK_ASSERT(inp); return(retval); } skipped_pkt: @@ -482,11 +557,6 @@ if (rc_sacked <= ctf_outstanding(tp)) return(ctf_outstanding(tp) - rc_sacked); else { - /* TSNH */ -#ifdef INVARIANTS - panic("tp:%p rc_sacked:%d > out:%d", - tp, rc_sacked, ctf_outstanding(tp)); -#endif return (0); } } @@ -502,6 +572,36 @@ tcp_dropwithreset(m, th, NULL, tlen, rstreason); } +void +ctf_ack_war_checks(struct tcpcb *tp, uint32_t *ts, uint32_t *cnt) +{ + if ((ts != NULL) && (cnt != NULL) && + (tcp_ack_war_time_window > 0) && + (tcp_ack_war_cnt > 0)) { + /* We are possibly doing ack war prevention */ + uint32_t cts; + + /* + * We use a msec tick here which gives us + * roughly 49 days. We don't need the + * precision of a microsecond timestamp which + * would only give us hours. + */ + cts = tcp_ts_getticks(); + if (TSTMP_LT((*ts), cts)) { + /* Timestamp is in the past */ + *cnt = 0; + *ts = (cts + tcp_ack_war_time_window); + } + if (*cnt < tcp_ack_war_cnt) { + *cnt = (*cnt + 1); + tp->t_flags |= TF_ACKNOW; + } else + tp->t_flags &= ~TF_ACKNOW; + } else + tp->t_flags |= TF_ACKNOW; +} + /* * ctf_drop_checks returns 1 for you should not proceed. It places * in ret_val what should be returned 1/0 by the caller. The 1 indicates @@ -509,7 +609,10 @@ * TCB is still valid and locked. */ int -ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +_ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t *tlenp, + int32_t *thf, int32_t *drop_hdrlen, int32_t *ret_val, + uint32_t *ts, uint32_t *cnt) { int32_t todrop; int32_t thflags; @@ -543,7 +646,7 @@ * Send an ACK to resynchronize and drop any data. * But keep on processing for RST or ACK. */ - tp->t_flags |= TF_ACKNOW; + ctf_ack_war_checks(tp, ts, cnt); todrop = tlen; KMOD_TCPSTAT_INC(tcps_rcvduppack); KMOD_TCPSTAT_ADD(tcps_rcvdupbyte, todrop); @@ -555,13 +658,14 @@ * DSACK - add SACK block for dropped range */ if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) { - tcp_update_sack_list(tp, th->th_seq, - th->th_seq + todrop); /* * ACK now, as the next in-sequence segment * will clear the DSACK block again */ - tp->t_flags |= TF_ACKNOW; + ctf_ack_war_checks(tp, ts, cnt); + if (tp->t_flags & TF_ACKNOW) + tcp_update_sack_list(tp, th->th_seq, + th->th_seq + todrop); } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; @@ -590,10 +694,10 @@ * ack. */ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { - tp->t_flags |= TF_ACKNOW; + ctf_ack_war_checks(tp, ts, cnt); KMOD_TCPSTAT_INC(tcps_rcvwinprobe); } else { - ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + __ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val, ts, cnt); return (1); } } else @@ -614,7 +718,7 @@ * and valid. 
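Annotation: ctf_ack_war_checks() above rate-limits the forced ACKs that dup-seq and window-probe handling generate: within a millisecond-tick window of tcp_ack_war_time_window it allows at most tcp_ack_war_cnt of them, and when the stored window start has fallen into the past it zeroes the counter and opens a fresh window. A standalone sketch of that throttle follows, using CLOCK_MONOTONIC in place of tcp_ts_getticks() and a signed-difference compare in place of TSTMP_LT(); the function names are illustrative.

#include <stdint.h>
#include <stdio.h>
#include <time.h>

static uint32_t war_window_ms = 1000;   /* mirrors net.inet.tcp.ack_war_timewindow */
static uint32_t war_cnt = 5;            /* mirrors net.inet.tcp.ack_war_cnt */

static uint32_t
now_ms(void)
{
        struct timespec ts;

        clock_gettime(CLOCK_MONOTONIC, &ts);
        return ((uint32_t)(ts.tv_sec * 1000 + ts.tv_nsec / 1000000));
}

/* Returns 1 if the forced ACK may be sent now, 0 if it is suppressed. */
static int
ack_war_ok(uint32_t *ts, uint32_t *cnt)
{
        uint32_t cts = now_ms();

        if ((int32_t)(*ts - cts) < 0) {         /* window start is in the past */
                *cnt = 0;
                *ts = cts + war_window_ms;      /* open a fresh window */
        }
        if (*cnt < war_cnt) {
                (*cnt)++;
                return (1);                     /* TF_ACKNOW in the real code */
        }
        return (0);                             /* clear TF_ACKNOW: stay quiet */
}

int
main(void)
{
        uint32_t ts = 0, cnt = 0;
        int i, sent = 0;

        for (i = 0; i < 20; i++)
                sent += ack_war_ok(&ts, &cnt);
        printf("sent %d of 20 forced acks in this window\n", sent);
        return (0);
}

The signed 32-bit difference keeps the comparison correct across counter wrap, which is the same property the TSTMP_LT() macro provides for the millisecond ticks.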
*/ void -ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) +__ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t *ret_val, uint32_t *ts, uint32_t *cnt) { /* * Generate an ACK dropping incoming segment if it occupies sequence @@ -638,7 +742,7 @@ return; } else *ret_val = 0; - tp->t_flags |= TF_ACKNOW; + ctf_ack_war_checks(tp, ts, cnt); if (m) m_freem(m); } @@ -671,7 +775,7 @@ */ int dropped = 0; - if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && + if ((SEQ_GEQ(th->th_seq, tp->last_ack_sent) && SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { KASSERT(tp->t_state != TCPS_SYN_SENT, @@ -680,8 +784,7 @@ if (V_tcp_insecure_rst || (tp->last_ack_sent == th->th_seq) || - (tp->rcv_nxt == th->th_seq) || - ((tp->last_ack_sent - 1) == th->th_seq)) { + (tp->rcv_nxt == th->th_seq)) { KMOD_TCPSTAT_INC(tcps_drops); /* Drop the connection. */ switch (tp->t_state) { @@ -748,7 +851,7 @@ } /* - * bbr_ts_check returns 1 for you should not proceed, the state + * ctf_ts_check returns 1 for you should not proceed, the state * machine should return. It places in ret_val what should * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates * that the TCB is unlocked and probably dropped. The 0 indicates the @@ -786,6 +889,32 @@ return (0); } +int +ctf_ts_check_ac(struct tcpcb *tp, int32_t thflags) +{ + + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates ts_recent, + * the age will be reset later and ts_recent will get a + * valid value. If it does not, setting ts_recent to zero + * will at least satisfy the requirement that zero be placed + * in the timestamp echo reply when ts_recent isn't valid. + * The age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be dropped + * when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + KMOD_TCPSTAT_INC(tcps_rcvduppack); + KMOD_TCPSTAT_INC(tcps_pawsdrop); + return (1); + } + return (0); +} + + + void ctf_calc_rwin(struct socket *so, struct tcpcb *tp) { @@ -817,45 +946,7 @@ uint32_t ctf_fixed_maxseg(struct tcpcb *tp) { - int optlen; - - if (tp->t_flags & TF_NOOPT) - return (tp->t_maxseg); - - /* - * Here we have a simplified code from tcp_addoptions(), - * without a proper loop, and having most of paddings hardcoded. - * We only consider fixed options that we would send every - * time I.e. SACK is not considered. 
- * - */ -#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) - if (TCPS_HAVEESTABLISHED(tp->t_state)) { - if (tp->t_flags & TF_RCVD_TSTMP) - optlen = TCPOLEN_TSTAMP_APPA; - else - optlen = 0; -#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) - if (tp->t_flags & TF_SIGNATURE) - optlen += PAD(TCPOLEN_SIGNATURE); -#endif - } else { - if (tp->t_flags & TF_REQ_TSTMP) - optlen = TCPOLEN_TSTAMP_APPA; - else - optlen = PAD(TCPOLEN_MAXSEG); - if (tp->t_flags & TF_REQ_SCALE) - optlen += PAD(TCPOLEN_WINDOW); -#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) - if (tp->t_flags & TF_SIGNATURE) - optlen += PAD(TCPOLEN_SIGNATURE); -#endif - if (tp->t_flags & TF_SACK_PERMIT) - optlen += PAD(TCPOLEN_SACK_PERMITTED); - } -#undef PAD - optlen = min(optlen, TCP_MAXOLEN); - return (tp->t_maxseg - optlen); + return (tcp_fixed_maxseg(tp)); } void Index: sys/netinet/tcp_stacks/tcp_bbr.h =================================================================== --- sys/netinet/tcp_stacks/tcp_bbr.h +++ sys/netinet/tcp_stacks/tcp_bbr.h @@ -71,7 +71,7 @@ uint32_t r_del_time; /* The time of the last delivery update */ uint8_t r_rtr_cnt:4, /* Retran count, index this -1 to get time * sent */ - unused_bit:1, + r_rtt_not_allowed:1, /* No rtt measurement allowed */ r_is_drain:1, /* In a draining cycle */ r_app_limited:1,/* We went app limited */ r_ts_valid:1; /* Timestamp field is valid (r_del_ack_ts) */ @@ -588,9 +588,9 @@ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ uint32_t rc_init_rwnd; /* Initial rwnd when we transitioned */ - /*- --- + /*- --- * used only initial and close - */ + */ uint32_t rc_high_rwnd; /* Highest rwnd seen */ uint32_t rc_lowest_rtt; /* Smallest RTT we have seen */ Index: sys/netinet/tcp_stacks/tcp_rack.h =================================================================== --- sys/netinet/tcp_stacks/tcp_rack.h +++ sys/netinet/tcp_stacks/tcp_rack.h @@ -29,7 +29,7 @@ #define _NETINET_TCP_RACK_H_ #define RACK_ACKED 0x0001/* The remote endpoint acked this */ -#define RACK_TO_MIXED 0x0002/* A timeout occurred that mixed the send order - not used */ +#define RACK_TO_REXT 0x0002/* A timeout occured on this sendmap entry */ #define RACK_DEFERRED 0x0004/* We can't use this for RTT calc - not used */ #define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ #define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ @@ -39,37 +39,94 @@ #define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */ #define RACK_APP_LIMITED 0x0200/* We went app limited after this send */ #define RACK_WAS_ACKED 0x0400/* a RTO undid the ack, but it already had a rtt calc done */ -#define RACK_HAS_SIN 0x0800/* SIN is on this guy */ +#define RACK_HAS_SYN 0x0800/* SYN is on this guy */ +#define RACK_SENT_W_DSACK 0x1000/* Sent with a dsack */ +#define RACK_SENT_SP 0x2000/* sent in slow path */ +#define RACK_SENT_FP 0x4000/* sent in fast path */ +#define RACK_HAD_PUSH 0x8000/* Push was sent on original send */ #define RACK_NUM_OF_RETRANS 3 -#define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ +#define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */ -#define RACK_REQ_AVG 4 /* Must be less than 256 */ +#define RACK_REQ_AVG 3 /* Must be less than 256 */ struct rack_sendmap { + TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ uint32_t r_start; /* Sequence number of the segment */ uint32_t r_end; /* End seq, this is 1 beyond actually */ - TAILQ_ENTRY(rack_sendmap) r_tnext; /* Time of transmit based next */ - RB_ENTRY(rack_sendmap) 
r_next; /* RB Tree next */ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ uint16_t r_flags; /* Flags as defined above */ - uint32_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; - uint32_t usec_orig_send; /* time of orginal send in useconds */ + struct mbuf *m; + uint32_t soff; + uint32_t orig_m_len; uint32_t r_nseq_appl; /* If this one is app limited, this is the nxt seq limited */ - uint32_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */ uint8_t r_dupack; /* Dup ack count */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ uint8_t r_limit_type; /* is this entry counted against a limit? */ uint8_t r_just_ret : 1, /* After sending, the next pkt was just returned, i.e. limited */ r_one_out_nr : 1, /* Special case 1 outstanding and not in recovery */ - r_avail : 6; - uint8_t r_resv[36]; + r_no_rtt_allowed : 1, /* No rtt measurement allowed */ + r_avail : 5; + uint64_t r_tim_lastsent[RACK_NUM_OF_RETRANS]; + uint64_t r_ack_arrival; /* This is the time of ack-arrival (if SACK'd) */ + RB_ENTRY(rack_sendmap) r_next; /* RB Tree next */ }; +struct deferred_opt_list { + TAILQ_ENTRY(deferred_opt_list) next; + int optname; + uint64_t optval; +}; + +/* + * Timestamps in the rack sendmap are now moving to be + * uint64_t's. This means that if you want a uint32_t + * usec timestamp (the old usecond timestamp) you simply have + * to cast it to uint32_t. The reason we do this is not for + * wrap, but we need to get back, at times, to the millisecond + * timestamp that is used in the TSTMP option. To do this we + * can use the rack_ts_to_msec() inline below which can take + * the 64bit ts and make into the correct timestamp millisecond + * wise. Thats not possible with the 32bit usecond timestamp since + * the seconds wrap too quickly to cover all bases. + * + * There are quite a few places in rack where I simply cast + * back to uint32_t and then end up using the TSTMP_XX() + * macros. This is ok, but we could do simple compares if + * we ever decided to move all of those variables to 64 bits + * as well. 
+ */ + +inline uint64_t +rack_to_usec_ts(struct timeval *tv) +{ + return ((tv->tv_sec * HPTS_USEC_IN_SEC) + tv->tv_usec); +} + +inline uint32_t +rack_ts_to_msec(uint64_t ts) +{ + return((uint32_t)(ts / HPTS_MSEC_IN_SEC)); +} + + RB_HEAD(rack_rb_tree_head, rack_sendmap); TAILQ_HEAD(rack_head, rack_sendmap); +TAILQ_HEAD(def_opt_head, deferred_opt_list); + +/* Map change logging */ +#define MAP_MERGE 0x01 +#define MAP_SPLIT 0x02 +#define MAP_NEW 0x03 +#define MAP_SACK_M1 0x04 +#define MAP_SACK_M2 0x05 +#define MAP_SACK_M3 0x06 +#define MAP_SACK_M4 0x07 +#define MAP_SACK_M5 0x08 +#define MAP_FREE 0x09 +#define MAP_TRIM_HEAD 0x0a #define RACK_LIMIT_TYPE_SPLIT 1 @@ -128,10 +185,7 @@ #define RACK_TO_FRM_DELACK 6 struct rack_opts_stats { - uint64_t tcp_rack_prop_rate; - uint64_t tcp_rack_prop; uint64_t tcp_rack_tlp_reduce; - uint64_t tcp_rack_early_recov; uint64_t tcp_rack_pace_always; uint64_t tcp_rack_pace_reduce; uint64_t tcp_rack_max_seg; @@ -177,6 +231,20 @@ uint64_t tcp_npush; uint64_t tcp_lscwnd; uint64_t tcp_profile; + uint64_t tcp_hdwr_rate_cap; + uint64_t tcp_pacing_rate_cap; + uint64_t tcp_pacing_up_only; + uint64_t tcp_use_cmp_acks; + uint64_t tcp_rack_abc_val; + uint64_t tcp_rec_abc_val; + uint64_t tcp_rack_measure_cnt; + uint64_t tcp_rack_delayed_ack; + uint64_t tcp_rack_rtt_use; + uint64_t tcp_data_after_close; + uint64_t tcp_defer_opt; + uint64_t tcp_rack_fastrsm_hack; + uint64_t tcp_rack_beta; + uint64_t tcp_rack_beta_ecn; }; /* RTT shrink reasons */ @@ -247,6 +315,23 @@ */ #define RACK_GP_HIST 4 /* How much goodput history do we maintain? */ +#define RACK_NUM_FSB_DEBUG 16 +struct rack_fast_send_blk { + uint32_t left_to_send; + uint16_t tcp_ip_hdr_len; + uint8_t tcp_flags; + uint8_t hoplimit; + uint8_t *tcp_ip_hdr; + uint32_t recwin; + uint32_t off; + struct tcphdr *th; + struct udphdr *udp; + struct mbuf *m; + uint32_t o_m_len; + uint32_t rfo_apply_push : 1, + unused : 31; +}; + struct rack_control { /* Second cache line 0x40 from tcp_rack */ struct rack_rb_tree_head rc_mtree; /* Tree of all segments Lock(a) */ @@ -255,6 +340,7 @@ * tlp_sending Lock(a) */ struct rack_sendmap *rc_resend; /* something we have been asked to * resend */ + struct rack_fast_send_blk fsb; /* The fast-send block */ uint32_t input_pkt; uint32_t saved_input_pkt; uint32_t rc_hpts_flags; @@ -268,6 +354,9 @@ /* Third Cache line 0x80 */ struct rack_head rc_free; /* Allocation array */ + uint64_t last_hw_bw_req; + uint64_t crte_prev_rate; + uint64_t bw_rate_cap; uint32_t rc_time_last_sent; /* Time we last sent some data and * logged it Lock(a). 
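Annotation: the new inlines rack_to_usec_ts() and rack_ts_to_msec() above exist because 32 bits of microseconds wrap in a little over an hour, while 32 bits of milliseconds (what the TSTMP option carries, and what the ack-war throttle uses) last roughly 49 days. A standalone check of those horizons and of the two conversions, assuming HPTS_USEC_IN_SEC and HPTS_MSEC_IN_SEC are 1000000 and 1000:

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

#define USEC_IN_SEC     1000000ULL      /* assumed HPTS_USEC_IN_SEC */
#define MSEC_IN_SEC     1000ULL         /* assumed HPTS_MSEC_IN_SEC */

static uint64_t
to_usec_ts(const struct timeval *tv)
{
        return ((uint64_t)tv->tv_sec * USEC_IN_SEC + tv->tv_usec);
}

static uint32_t
usec_ts_to_msec(uint64_t ts)
{
        return ((uint32_t)(ts / MSEC_IN_SEC));
}

int
main(void)
{
        struct timeval tv;
        uint64_t us;

        gettimeofday(&tv, NULL);
        us = to_usec_ts(&tv);
        printf("64-bit usec ts %llu -> msec ts %u\n",
            (unsigned long long)us, (unsigned)usec_ts_to_msec(us));
        /* Wrap horizons for a 32-bit counter. */
        printf("32-bit usec wraps after %.2f hours\n", (double)UINT32_MAX / 1e6 / 3600);
        printf("32-bit msec wraps after %.2f days\n", (double)UINT32_MAX / 1e3 / 86400);
        return (0);
}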
*/ uint32_t rc_reorder_ts; /* Last time we saw reordering Lock(a) */ @@ -342,8 +431,8 @@ uint32_t rc_agg_delayed; uint32_t rc_tlp_rxt_last_time; uint32_t rc_saved_cwnd; - uint32_t rc_gp_output_ts; - uint32_t rc_gp_cumack_ts; + uint64_t rc_gp_output_ts; /* chg*/ + uint64_t rc_gp_cumack_ts; /* chg*/ struct timeval act_rcv_time; struct timeval rc_last_time_decay; /* SAD time decay happened here */ uint64_t gp_bw; @@ -354,6 +443,7 @@ uint64_t last_gp_comp_bw; uint64_t last_max_bw; /* Our calculated max b/w last */ struct time_filter_small rc_gp_min_rtt; + struct def_opt_head opt_list; int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ uint32_t rc_gp_srtt; /* Current GP srtt */ uint32_t rc_prev_gp_srtt; /* Previous RTT */ @@ -370,21 +460,40 @@ uint32_t rc_time_of_last_probertt; uint32_t rc_target_probertt_flight; uint32_t rc_probertt_sndmax_atexit; /* Highest sent to in probe-rtt */ + uint32_t rc_cwnd_at_erec; + uint32_t rc_ssthresh_at_erec; + uint32_t dsack_byte_cnt; + uint32_t retran_during_recovery; uint32_t rc_gp_lowrtt; /* Lowest rtt seen during GPUT measurement */ uint32_t rc_gp_high_rwnd; /* Highest rwnd seen during GPUT measurement */ + uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occured what was snd-max */ + uint32_t rc_out_at_rto; int32_t rc_scw_index; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ + uint32_t rc_last_timeout_snduna; + uint32_t challenge_ack_ts; + uint32_t challenge_ack_cnt; + uint32_t rc_min_to; /* Socket option value Lock(a) */ + uint32_t rc_pkt_delay; /* Socket option value Lock(a) */ + struct newreno rc_saved_beta; /* + * For newreno cc: + * rc_saved_cc are the values we have had + * set by the user, if pacing is not happening + * (i.e. its early and we have not turned on yet + * or it was turned off). The minute pacing + * is turned on we pull out the values currently + * being used by newreno and replace them with + * these values, then save off the old values here, + * we also set the flag (if ecn_beta is set) to make + * new_reno do less of a backoff for ecn (think abe). + */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ - uint16_t rc_pkt_delay; /* Socket option value Lock(a) */ uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */ - uint8_t num_avg; /* average count before we go to normal decay */ - uint8_t rc_prop_rate; /* Socket option value Lock(a) */ - uint8_t rc_prop_reduce; /* Socket option value Lock(a) */ + uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */ + uint8_t req_measurements; /* How many measurements are required? 
*/ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ - uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ - uint8_t rc_min_to; /* Socket option value Lock(a) */ uint8_t rc_rate_sample_method; uint8_t rc_gp_hist_idx; }; @@ -402,21 +511,57 @@ int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ - uint32_t rc_free_cnt; /* Number of free entries on the rc_free list + uint8_t rc_free_cnt; /* Number of free entries on the rc_free list * Lock(a) */ + uint8_t client_bufferlvl; /* 0 - 5 normaly, less than or at 2 means its real low */ + uint8_t no_prr_addback : 1, + gp_ready : 1, + defer_options: 1, + fast_rsm_hack: 1, + rc_ack_can_sendout_data: 1, /* + * If set it will override pacing restrictions on not sending + * data when the pacing timer is running. I.e. you set this + * and an ACK will send data. Default is off and its only used + * without pacing when we are doing 5G speed up for there + * ack filtering. + */ + rc_pacing_cc_set: 1, /* + * If we are pacing (pace_always=1) and we have reached the + * point where we start pacing (fixed or gp has reached its + * magic gp_ready state) this flag indicates we have set in + * values to effect CC's backoff's. If pacing is turned off + * then we must restore the values saved in rc_saved_beta, + * if its going to gp_ready we need to copy the values into + * the CC module and set our flags. + * + * Note this only happens if the cc name is newreno (CCALGONAME_NEWRENO). + */ + + avail :2; + uint8_t avail_bytes; uint32_t rc_rack_rtt; /* RACK-RTT Lock(a) */ uint16_t r_mbuf_queue : 1, /* Do we do mbuf queue for non-paced */ rtt_limit_mul : 4, /* muliply this by low rtt */ r_limit_scw : 1, - r_avail_bits : 10; /* Available */ - - uint16_t rc_user_set_max_segs; /* Socket option value Lock(a) */ + r_must_retran : 1, /* For non-sack customers we hit an RTO and new data should be resends */ + r_use_cmp_ack: 1, /* Do we use compressed acks */ + r_ent_rec_ns: 1, /* We entered recovery and have not sent */ + r_might_revert: 1, /* Flag to find out if we might need to revert */ + r_fast_output: 1, /* Fast output is in progress we can skip the bulk of rack_output */ + r_fsb_inited: 1, + r_rack_hw_rate_caps: 1, + r_up_only: 1, + r_via_fill_cw : 1, + r_fill_less_agg : 1; + + uint8_t rc_user_set_max_segs; /* Socket option value Lock(a) */ + uint8_t rc_labc; /* Appropriate Byte Counting Value */ uint16_t forced_ack : 1, rc_gp_incr : 1, rc_gp_bwred : 1, rc_gp_timely_inc_cnt : 3, rc_gp_timely_dec_cnt : 3, - rc_not_backing_off: 1, + r_use_labc_for_rec: 1, rc_highly_buffered: 1, /* The path is highly buffered */ rc_dragged_bottom: 1, rc_dack_mode : 1, /* Mac O/S emulation of d-ack */ @@ -435,7 +580,7 @@ rc_always_pace : 1, /* Socket option value Lock(a) */ rc_pace_to_cwnd : 1, rc_pace_fill_if_rttin_range : 1, - xxx_avail_bits : 1; + rc_srtt_measure_made : 1; uint8_t app_limited_needs_set : 1, use_fixed_rate : 1, rc_has_collapsed : 1, Index: sys/netinet/tcp_subr.c =================================================================== --- sys/netinet/tcp_subr.c +++ sys/netinet/tcp_subr.c @@ -193,6 +193,16 @@ &tcp_sad_low_pps, 100, "What is the input pps that below which we do not decay?"); #endif +uint32_t tcp_ack_war_time_window = 1000; +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_timewindow, + CTLFLAG_RW, + &tcp_ack_war_time_window, 1000, + "If the tcp_stack does ack-war prevention how many 
milliseconds are in its time window?"); +uint32_t tcp_ack_war_cnt = 5; +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, ack_war_cnt, + CTLFLAG_RW, + &tcp_ack_war_cnt, 5, + "If the tcp_stack does ack-war prevention how many acks can be sent in its time window?"); struct rwlock tcp_function_lock; @@ -268,6 +278,18 @@ &VNET_NAME(tcp_ts_offset_per_conn), 0, "Initialize TCP timestamps per connection instead of per host pair"); +/* How many connections are pacing */ +static volatile uint32_t number_of_tcp_connections_pacing = 0; +static uint32_t shadow_num_connections = 0; + +static int tcp_pacing_limit = 10000; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, + &tcp_pacing_limit, 1000, + "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); + +SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, + &shadow_num_connections, 0, "Number of TCP connections being paced"); + static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); @@ -3511,6 +3533,54 @@ return (tp->t_maxseg - optlen); } + +u_int +tcp_fixed_maxseg(const struct tcpcb *tp) +{ + int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We only consider fixed options that we would send every + * time I.e. SACK is not considered. This is important + * for cc modules to figure out what the modulo of the + * cwnd should be. + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); +} + + + static int sysctl_drop(SYSCTL_HANDLER_ARGS) { @@ -3972,3 +4042,38 @@ } } } + +int +tcp_can_enable_pacing(void) +{ + + if ((tcp_pacing_limit == -1) || + (tcp_pacing_limit > number_of_tcp_connections_pacing)) { + atomic_fetchadd_int(&number_of_tcp_connections_pacing, 1); + shadow_num_connections = number_of_tcp_connections_pacing; + return (1); + } else { + return (0); + } +} + +static uint8_t tcp_pacing_warning = 0; + +void +tcp_decrement_paced_conn(void) +{ + uint32_t ret; + + ret = atomic_fetchadd_int(&number_of_tcp_connections_pacing, -1); + shadow_num_connections = number_of_tcp_connections_pacing; + KASSERT(ret != 0, ("tcp_paced_connection_exits -1 would cause wrap?")); + if (ret == 0) { + if (tcp_pacing_limit != -1) { + printf("Warning all pacing is now disabled, count decrements invalidly!\n"); + tcp_pacing_limit = 0; + } else if (tcp_pacing_warning == 0) { + printf("Warning pacing count is invalid, invalid decrement\n"); + tcp_pacing_warning = 1; + } + } +} Index: sys/netinet/tcp_var.h =================================================================== --- sys/netinet/tcp_var.h +++ sys/netinet/tcp_var.h @@ -258,6 +258,10 
@@ tcp_seq gput_seq; /* Outbound measurement seq */ tcp_seq gput_ack; /* Inbound measurement ack */ int32_t t_stats_gput_prev; /* XXXLAS: Prev gput measurement */ + uint32_t t_maxpeakrate; /* max peak rate set by user, in bytes/s */ + uint32_t t_sndtlppack; /* tail loss probe packets sent */ + uint64_t t_sndtlpbyte; /* total tail loss probe bytes sent */ + uint8_t t_tfo_client_cookie_len; /* TCP Fast Open client cookie length */ uint32_t t_end_info_status; /* Status flag of end info */ unsigned int *t_tfo_pending; /* TCP Fast Open server pending counter */ @@ -974,6 +978,7 @@ void cc_conn_init(struct tcpcb *tp); void cc_post_recovery(struct tcpcb *tp, struct tcphdr *th); void cc_ecnpkt_handler(struct tcpcb *tp, struct tcphdr *th, uint8_t iptos); +void cc_ecnpkt_handler_flags(struct tcpcb *tp, uint16_t flags, uint8_t iptos); void cc_cong_signal(struct tcpcb *tp, struct tcphdr *th, uint32_t type); #ifdef TCP_HHOOK void hhook_run_tcp_est_in(struct tcpcb *tp, @@ -1022,10 +1027,13 @@ extern int32_t tcp_map_minimum; extern int32_t tcp_attack_on_turns_on_logging; #endif +extern uint32_t tcp_ack_war_time_window; +extern uint32_t tcp_ack_war_cnt; uint32_t tcp_maxmtu(struct in_conninfo *, struct tcp_ifcap *); uint32_t tcp_maxmtu6(struct in_conninfo *, struct tcp_ifcap *); u_int tcp_maxseg(const struct tcpcb *); +u_int tcp_fixed_maxseg(const struct tcpcb *); void tcp_mss_update(struct tcpcb *, int, int, struct hc_metrics_lite *, struct tcp_ifcap *); void tcp_mss(struct tcpcb *, int); @@ -1075,6 +1083,7 @@ tcp_seq tcp_new_isn(struct in_conninfo *); int tcp_sack_doack(struct tcpcb *, struct tcpopt *, tcp_seq); +int tcp_dsack_block_exists(struct tcpcb *); void tcp_update_dsack_list(struct tcpcb *, tcp_seq, tcp_seq); void tcp_update_sack_list(struct tcpcb *tp, tcp_seq rcv_laststart, tcp_seq rcv_lastend); void tcp_clean_dsack_blocks(struct tcpcb *tp); @@ -1090,6 +1099,9 @@ void tcp_sndbuf_autoscale(struct tcpcb *, struct socket *, uint32_t); int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); +int tcp_can_enable_pacing(void); +void tcp_decrement_paced_conn(void); + struct mbuf * tcp_m_copym(struct mbuf *m, int32_t off0, int32_t *plen, int32_t seglimit, int32_t segsize, struct sockbuf *sb, bool hw_tls);
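Annotation: tcp_can_enable_pacing() and tcp_decrement_paced_conn(), declared in the tcp_var.h hunk above and implemented in tcp_subr.c, keep a global count of pacing connections against net.inet.tcp.pacing_limit with atomic adds, and the decrement path warns if the count ever underflows. Below is a minimal userland rendering of that budget with C11 atomics (the kernel uses atomic_fetchadd_int and a KASSERT); the function and variable names are illustrative.

#include <stdatomic.h>
#include <stdio.h>

static int pacing_limit = 3;            /* -1 = unlimited, 0 = no pacing, N = budget */
static atomic_int paced_connections;

static int
can_enable_pacing(void)
{
        if (pacing_limit == -1 ||
            pacing_limit > atomic_load(&paced_connections)) {
                atomic_fetch_add(&paced_connections, 1);
                return (1);
        }
        return (0);                     /* caller reports ENOSPC */
}

static void
decrement_paced_conn(void)
{
        int prev = atomic_fetch_sub(&paced_connections, 1);

        if (prev <= 0)                  /* the kernel KASSERTs and complains here */
                fprintf(stderr, "pacing count underflow\n");
}

int
main(void)
{
        int i, granted = 0;

        for (i = 0; i < 5; i++)
                granted += can_enable_pacing();
        printf("granted %d of 5 requests with a limit of %d\n", granted, pacing_limit);
        while (granted-- > 0)
                decrement_paced_conn();
        return (0);
}

The check-then-add is not a single atomic operation, so concurrent callers could briefly overshoot the limit; that tolerance is inherent to the counter-plus-limit approach and is acceptable because the limit is only an admission hint, not a hard invariant.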