diff --git a/sys/modules/tcp/rack/Makefile b/sys/modules/tcp/rack/Makefile --- a/sys/modules/tcp/rack/Makefile +++ b/sys/modules/tcp/rack/Makefile @@ -5,7 +5,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c +SRCS= rack.c sack_filter.c rack_bbr_common.c tailq_hash.c rack_pcm.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_kern_tls.h diff --git a/sys/netinet/tcp.h b/sys/netinet/tcp.h --- a/sys/netinet/tcp.h +++ b/sys/netinet/tcp.h @@ -334,9 +334,22 @@ #define TCP_RACK_PACING_DIVISOR 1146 /* Pacing divisor given to rate-limit code for burst sizing */ #define TCP_RACK_PACE_MIN_SEG 1147 /* Pacing min seg size rack will use */ #define TCP_RACK_DGP_IN_REC 1148 /* Do we use full DGP in recovery? */ -#define TCP_RXT_CLAMP 1149 /* Do we apply a threshold to rack so if excess rxt clamp cwnd? */ +#define TCP_POLICER_DETECT 1149 /* Do we apply a thresholds to rack to detect and compensate for policers? */ +#define TCP_RXT_CLAMP TCP_POLICER_DETECT #define TCP_HYBRID_PACING 1150 /* Hybrid pacing enablement */ #define TCP_PACING_DND 1151 /* When pacing with rr_config=3 can sacks disturb us */ +#define TCP_SS_EEXIT 1152 /* Do we do early exit from slowtart if no b/w growth */ +#define TCP_DGP_UPPER_BOUNDS 1153 /* SS and CA upper bound in percentage */ +#define TCP_NO_TIMELY 1154 /* Disable/enable Timely */ +#define TCP_HONOR_HPTS_MIN 1155 /* Do we honor hpts min to */ +#define TCP_REC_IS_DYN 1156 /* Do we allow timely to change recovery multiplier? */ +#define TCP_SIDECHAN_DIS 1157 /* Disable/enable the side-channel */ +#define TCP_FILLCW_RATE_CAP 1158 /* Set a cap for DGP's fillcw */ +#define TCP_POLICER_MSS 1159 /* Policer MSS requirement */ +#define TCP_STACK_SPEC_INFO 1160 /* Get stack specific information (if present) */ +#define RACK_CSPR_IS_FCC 1161 +#define TCP_GP_USE_LTBW 1162 /* how we use lt_bw 0=not, 1=min, 2=max */ + /* Start of reserved space for third-party user-settable options. */ #define TCP_VENDOR SO_VENDOR @@ -447,6 +460,7 @@ u_int32_t tcpi_rcv_adv; /* Peer advertised window */ u_int32_t tcpi_dupacks; /* Consecutive dup ACKs recvd */ + u_int32_t tcpi_rttmin; /* Min observed RTT */ /* Padding to grow without breaking ABI. */ u_int32_t __tcpi_pad[14]; /* Padding. 
*/ }; @@ -463,6 +477,20 @@ #define TCP_FUNCTION_NAME_LEN_MAX 32 +struct stack_specific_info { + char stack_name[TCP_FUNCTION_NAME_LEN_MAX]; + uint64_t policer_last_bw; /* Only valid if detection enabled and policer detected */ + uint64_t bytes_transmitted; + uint64_t bytes_retransmitted; + uint32_t policer_detection_enabled: 1, + policer_detected : 1, /* transport thinks a policer is on path */ + highly_buffered : 1, /* transport considers the path highly buffered */ + spare : 29; + uint32_t policer_bucket_size; /* Only valid if detection enabled and policer detected */ + uint32_t current_round; + uint32_t _rack_i_pad[18]; +}; + struct tcp_function_set { char function_set_name[TCP_FUNCTION_NAME_LEN_MAX]; uint32_t pcbcnt; @@ -488,6 +516,7 @@ uint64_t start; uint64_t end; uint32_t flags; + uint32_t playout_ms; }; union tcp_log_userdata { @@ -518,9 +547,12 @@ #define TCP_HYBRID_PACING_H_MS 0x0008 /* A client hint for maxseg is present */ #define TCP_HYBRID_PACING_ENABLE 0x0010 /* We are enabling hybrid pacing else disable */ #define TCP_HYBRID_PACING_S_MSS 0x0020 /* Clent wants us to set the mss overriding gp est in CU */ -#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tellsus we set the mss on this entry */ +#define TCP_HAS_PLAYOUT_MS 0x0040 /* The client included the chunk playout milliseconds: deprecate */ +/* the below are internal only flags */ +#define TCP_HYBRID_PACING_USER_MASK 0x0FFF /* Non-internal flags mask */ +#define TCP_HYBRID_PACING_SETMSS 0x1000 /* Internal flag that tells us we set the mss on this entry */ #define TCP_HYBRID_PACING_WASSET 0x2000 /* We init to this to know if a hybrid command was issued */ - +#define TCP_HYBRID_PACING_SENDTIME 0x4000 /* Duplicate tm to last, use sendtime for catch up mode */ struct tcp_hybrid_req { struct tcp_snd_req req; diff --git a/sys/netinet/tcp_log_buf.h b/sys/netinet/tcp_log_buf.h --- a/sys/netinet/tcp_log_buf.h +++ b/sys/netinet/tcp_log_buf.h @@ -267,7 +267,9 @@ TCP_RACK_TP_TRIGGERED, /* A rack tracepoint is triggered 68 */ TCP_HYBRID_PACING_LOG, /* Hybrid pacing log 69 */ TCP_LOG_PRU, /* TCP protocol user request 70 */ - TCP_LOG_END /* End (keep at end) 71 */ + TCP_POLICER_DET, /* TCP Policer detectionn 71 */ + TCP_PCM_MEASURE, /* TCP Path Capacity Measurement 72 */ + TCP_LOG_END /* End (keep at end) 72 */ }; enum tcp_log_states { @@ -371,10 +373,11 @@ #define TCP_TP_COLLAPSED_RXT 0x00000004 /* When we actually retransmit a collapsed window rsm */ #define TCP_TP_REQ_LOG_FAIL 0x00000005 /* We tried to allocate a Request log but had no space */ #define TCP_TP_RESET_RCV 0x00000006 /* Triggers when we receive a RST */ -#define TCP_TP_EXCESS_RXT 0x00000007 /* When we get excess RXT's clamping the cwnd */ +#define TCP_TP_POLICER_DET 0x00000007 /* When we detect a policer */ +#define TCP_TP_EXCESS_RXT TCP_TP_POLICER_DET /* alias */ #define TCP_TP_SAD_TRIGGERED 0x00000008 /* Sack Attack Detection triggers */ - #define TCP_TP_SAD_SUSPECT 0x0000000a /* A sack has supicious information in it */ +#define TCP_TP_PACED_BOTTOM 0x0000000b /* We have paced at the bottom */ #ifdef _KERNEL diff --git a/sys/netinet/tcp_stacks/bbr.c b/sys/netinet/tcp_stacks/bbr.c --- a/sys/netinet/tcp_stacks/bbr.c +++ b/sys/netinet/tcp_stacks/bbr.c @@ -11529,7 +11529,9 @@ bbr_set_pktepoch(bbr, cts, __LINE__); bbr_check_bbr_for_state(bbr, cts, __LINE__, (bbr->r_ctl.rc_lost - lost)); if (nxt_pkt == 0) { - if (bbr->r_wanted_output != 0) { + if ((bbr->r_wanted_output != 0) || + (tp->t_flags & TF_ACKNOW)) { + bbr->rc_output_starts_timer = 0; did_out = 1; if 
(tcp_output(tp) < 0) diff --git a/sys/netinet/tcp_stacks/rack.c b/sys/netinet/tcp_stacks/rack.c --- a/sys/netinet/tcp_stacks/rack.c +++ b/sys/netinet/tcp_stacks/rack.c @@ -142,9 +142,12 @@ #define V_newreno_beta VNET(newreno_beta) #define V_newreno_beta_ecn VNET(newreno_beta_ecn) +#define M_TCPFSB __CONCAT(M_TCPFSB, STACKNAME) +#define M_TCPDO __CONCAT(M_TCPDO, STACKNAME) -MALLOC_DEFINE(M_TCPFSB, "tcp_fsb", "TCP fast send block"); -MALLOC_DEFINE(M_TCPDO, "tcp_do", "TCP deferred options"); +MALLOC_DEFINE(M_TCPFSB, "tcp_fsb_" __XSTRING(STACKNAME), "TCP fast send block"); +MALLOC_DEFINE(M_TCPDO, "tcp_do_" __XSTRING(STACKNAME), "TCP deferred options"); +MALLOC_DEFINE(M_TCPPCM, "tcp_pcm_" __XSTRING(STACKNAME), "TCP PCM measurement information"); struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; @@ -190,12 +193,24 @@ static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000000; /* 0 - never fade, def 60,000,000 * - 60 seconds */ -static uint32_t rack_clamp_ss_upper = 110; -static uint32_t rack_clamp_ca_upper = 105; -static uint32_t rack_rxt_min_rnds = 10; /* Min rounds if drastic rxt clamp is in place */ -static uint32_t rack_unclamp_round_thresh = 100; /* number of perfect rounds before we unclamp */ -static uint32_t rack_unclamp_rxt_thresh = 5; /* .5% and under */ -static uint64_t rack_rxt_clamp_thresh = 0; /* Do we do the rxt clamp thing */ +static uint16_t rack_policer_rxt_thresh= 0; /* 499 = 49.9%, 0 is off */ +static uint8_t rack_policer_avg_thresh = 0; /* 3.2 */ +static uint8_t rack_policer_med_thresh = 0; /* 1 - 16 */ +static uint16_t rack_policer_bucket_reserve = 20; /* How much % is reserved in the bucket */ +static uint64_t rack_pol_min_bw = 125000; /* 1mbps in Bytes per sec */ +static uint32_t rack_policer_data_thresh = 64000; /* 64,000 bytes must be sent before we engage */ +static uint32_t rack_policing_do_bw_comp = 1; +static uint32_t rack_pcm_every_n_rounds = 100; +static uint32_t rack_pcm_blast = 0; +static uint32_t rack_pcm_is_enabled = 1; +static uint8_t rack_req_del_mss = 18; /* How many segments need to be sent in a recovery episode to do policer_detection */ +static uint8_t rack_ssthresh_rest_rto_rec = 0; /* Do we restore ssthresh when we have rec -> rto -> rec */ + +static uint32_t rack_gp_gain_req = 1200; /* Amount percent wise required to gain to record a round has "gaining" */ +static uint32_t rack_rnd_cnt_req = 0x10005; /* Default number of rounds if we are below rack_gp_gain_req where we exit ss */ + + +static int32_t rack_rxt_scoreboard_clear_thresh = 2; static int32_t rack_dnd_default = 0; /* For rr_conf = 3, what is the default for dnd */ static int32_t rack_rxt_controls = 0; static int32_t rack_fill_cw_state = 0; @@ -217,9 +232,8 @@ static int32_t rack_apply_rtt_with_reduced_conf = 0; static int32_t rack_hibeta_setting = 0; static int32_t rack_default_pacing_divisor = 250; -static int32_t rack_uses_full_dgp_in_rec = 1; static uint16_t rack_pacing_min_seg = 0; - +static int32_t rack_timely_off = 0; static uint32_t sad_seg_size_per = 800; /* 80.0 % */ static int32_t rack_pkt_delay = 1000; @@ -235,7 +249,7 @@ static int32_t rack_max_abc_post_recovery = 2; static int32_t rack_client_low_buf = 0; static int32_t rack_dsack_std_based = 0x3; /* bit field bit 1 sets rc_rack_tmr_std_based and bit 2 sets rc_rack_use_dsack */ -static int32_t rack_bw_multipler = 2; /* Limit on fill cw's jump up to be this x gp_est */ +static int32_t rack_bw_multipler = 0; /* Limit on fill cw's jump up to be this x gp_est */ #ifdef 
TCP_ACCOUNTING static int32_t rack_tcp_accounting = 0; #endif @@ -247,8 +261,9 @@ static int32_t rack_non_rxt_use_cr = 0; /* does a non-rxt in recovery use the configured rate (ss/ca)? */ static int32_t rack_persist_min = 250000; /* 250usec */ static int32_t rack_persist_max = 2000000; /* 2 Second in usec's */ +static int32_t rack_honors_hpts_min_to = 1; /* Do we honor the hpts minimum time out for pacing timers */ +static uint32_t rack_max_reduce = 10; /* Percent we can reduce slot by */ static int32_t rack_sack_not_required = 1; /* set to one to allow non-sack to use rack */ -static int32_t rack_default_init_window = 0; /* Use system default */ static int32_t rack_limit_time_with_srtt = 0; static int32_t rack_autosndbuf_inc = 20; /* In percentage form */ static int32_t rack_enobuf_hw_boost_mult = 0; /* How many times the hw rate we boost slot using time_between */ @@ -282,7 +297,6 @@ static int32_t rack_def_profile = 0; static int32_t rack_lower_cwnd_at_tlp = 0; -static int32_t rack_limited_retran = 0; static int32_t rack_always_send_oldest = 0; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; @@ -356,6 +370,7 @@ static int32_t rack_down_raise_thresh = 100; static int32_t rack_req_segs = 1; static uint64_t rack_bw_rate_cap = 0; +static uint64_t rack_fillcw_bw_cap = 3750000; /* Cap fillcw at 30Mbps */ /* Rack specific counters */ @@ -377,6 +392,7 @@ counter_u64_t rack_tlp_retran_bytes; counter_u64_t rack_to_tot; counter_u64_t rack_hot_alloc; +counter_u64_t tcp_policer_detected; counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; @@ -440,7 +456,7 @@ static int rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, - uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); + uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val, int32_t orig_tlen); static int rack_process_data(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, @@ -454,6 +470,8 @@ static struct rack_sendmap * rack_check_recovery_mode(struct tcpcb *tp, uint32_t tsused); +static uint32_t +rack_grab_rtt(struct tcpcb *tp, struct tcp_rack *rack); static void rack_cong_signal(struct tcpcb *tp, uint32_t type, uint32_t ack, int ); @@ -504,13 +522,14 @@ static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t ts, - struct rack_sendmap *hintrsm, uint16_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); + struct rack_sendmap *hintrsm, uint32_t add_flags, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz); static uint64_t rack_get_gp_est(struct tcp_rack *rack); + static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm); + struct rack_sendmap *rsm, uint32_t cts); static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, struct rack_sendmap *rsm); static int32_t rack_output(struct tcpcb *tp); @@ -526,10 +545,10 @@ static void rack_timer_cancel(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int line); static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint16_t add_flag, int segsiz); + struct rack_sendmap *rsm, uint64_t ts, int32_t * lenp, uint32_t add_flag, int segsiz); static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t 
ts, uint16_t add_flag, int segsiz); + struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz); static int rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type, tcp_seq th_ack); @@ -538,6 +557,10 @@ rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos); + +static void +rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz); + static int rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -720,6 +743,22 @@ rack_swap_beta_values(rack, 4); } +static void +rack_remove_pacing(struct tcp_rack *rack) +{ + if (rack->rc_pacing_cc_set) + rack_undo_cc_pacing(rack); + if (rack->r_ctl.pacing_method & RACK_REG_PACING) + tcp_decrement_paced_conn(); + if (rack->r_ctl.pacing_method & RACK_DGP_PACING) + tcp_dec_dgp_pacing_cnt(); + rack->rc_always_pace = 0; + rack->r_ctl.pacing_method = RACK_PACING_NONE; + rack->dgp_on = 0; + rack->rc_hybrid_mode = 0; + rack->use_fixed_rate = 0; +} + static void rack_log_gpset(struct tcp_rack *rack, uint32_t seq_end, uint32_t ack_end_t, uint32_t send_end_t, int line, uint8_t mode, struct rack_sendmap *rsm) @@ -742,6 +781,8 @@ log.u_bbr.pkts_out = line; log.u_bbr.cwnd_gain = rack->app_limited_needs_set; log.u_bbr.pkt_epoch = rack->r_ctl.rc_app_limited_cnt; + log.u_bbr.epoch = rack->r_ctl.current_round; + log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; if (rsm != NULL) { log.u_bbr.applimited = rsm->r_start; log.u_bbr.delivered = rsm->r_end; @@ -857,6 +898,7 @@ struct sysctl_oid *rack_measure; struct sysctl_oid *rack_probertt; struct sysctl_oid *rack_hw_pacing; + struct sysctl_oid *rack_policing; rack_attack = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -994,11 +1036,36 @@ "pacing", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Pacing related Controls"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "pcm_enabled", CTLFLAG_RW, + &rack_pcm_is_enabled, 1, + "Do we by default do PCM measurements?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "pcm_rnds", CTLFLAG_RW, + &rack_pcm_every_n_rounds, 100, + "How many rounds before we need to do a PCM measurement"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "pcm_blast", CTLFLAG_RW, + &rack_pcm_blast, 0, + "Blast out the full cwnd/rwnd when doing a PCM measurement"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "rnd_gp_gain", CTLFLAG_RW, + &rack_gp_gain_req, 1200, + "How much do we have to increase the GP to record the round 1200 = 120.0"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "dgp_out_of_ss_at", CTLFLAG_RW, + &rack_rnd_cnt_req, 0x10005, + "How many rounds less than rnd_gp_gain will drop us out of SS"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "fulldgpinrec", CTLFLAG_RW, - &rack_uses_full_dgp_in_rec, 1, - "Do we use all DGP features in recovery (fillcw, timely et.al.)?"); + OID_AUTO, "no_timely", CTLFLAG_RW, + &rack_timely_off, 0, + "Do we not use timely in DGP?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "fullbufdisc", CTLFLAG_RW, @@ -1017,13 +1084,13 @@ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, 
"divisor", CTLFLAG_RW, - &rack_default_pacing_divisor, 4, + &rack_default_pacing_divisor, 250, "What is the default divisor given to the rl code?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "fillcw_max_mult", CTLFLAG_RW, - &rack_bw_multipler, 2, - "What is the multiplier of the current gp_est that fillcw can increase the b/w too?"); + &rack_bw_multipler, 0, + "What is the limit multiplier of the current gp_est that fillcw can increase the b/w too, 200 == 200% (0 = off)?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "max_pace_over", CTLFLAG_RW, @@ -1039,11 +1106,6 @@ OID_AUTO, "limit_wsrtt", CTLFLAG_RW, &rack_limit_time_with_srtt, 0, "Do we limit pacing time based on srtt"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_pacing), - OID_AUTO, "init_win", CTLFLAG_RW, - &rack_default_init_window, 0, - "Do we have a rack initial window 0 = system default"); SYSCTL_ADD_U16(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_pacing), OID_AUTO, "gp_per_ss", CTLFLAG_RW, @@ -1079,6 +1141,11 @@ OID_AUTO, "rate_cap", CTLFLAG_RW, &rack_bw_rate_cap, 0, "If set we apply this value to the absolute rate cap used by pacing"); + SYSCTL_ADD_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_pacing), + OID_AUTO, "fillcw_cap", CTLFLAG_RW, + &rack_fillcw_bw_cap, 3750000, + "Do we have an absolute cap on the amount of b/w fillcw can specify (0 = no)?"); SYSCTL_ADD_U8(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "req_measure_cnt", CTLFLAG_RW, @@ -1317,11 +1384,6 @@ OID_AUTO, "send_oldest", CTLFLAG_RW, &rack_always_send_oldest, 0, "Should we always send the oldest TLP and RACK-TLP"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_tlp), - OID_AUTO, "rack_tlimit", CTLFLAG_RW, - &rack_limited_retran, 0, - "How many times can a rack timeout drive out sends"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_tlp), OID_AUTO, "tlp_cwnd_flag", CTLFLAG_RW, @@ -1355,6 +1417,26 @@ "timers", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Timer related controls"); + SYSCTL_ADD_U8(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "reset_ssth_rec_rto", CTLFLAG_RW, + &rack_ssthresh_rest_rto_rec, 0, + "When doing recovery -> rto -> recovery do we reset SSthresh?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "scoreboard_thresh", CTLFLAG_RW, + &rack_rxt_scoreboard_clear_thresh, 2, + "How many RTO's are allowed before we clear the scoreboard"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "honor_hpts_min", CTLFLAG_RW, + &rack_honors_hpts_min_to, 1, + "Do rack pacing timers honor hpts min timeout"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_timers), + OID_AUTO, "hpts_max_reduce", CTLFLAG_RW, + &rack_max_reduce, 10, + "Max percentage we will reduce slot by for pacing when we are behind"); SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_timers), OID_AUTO, "persmin", CTLFLAG_RW, @@ -1434,11 +1516,6 @@ "features", CTLFLAG_RW | CTLFLAG_MPSAFE, 0, "Feature controls"); - SYSCTL_ADD_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_features), - OID_AUTO, "rxt_clamp_thresh", CTLFLAG_RW, - &rack_rxt_clamp_thresh, 0, - "Bit encoded clamping setup bits CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_features), OID_AUTO, "hybrid_set_maxseg", CTLFLAG_RW, @@ -1474,6 +1551,53 @@ OID_AUTO, "hystartplusplus", CTLFLAG_RW, &rack_do_hystart, 0, "Should RACK enable HyStart++ on connections?"); + /* Policer detection */ + rack_policing = 
SYSCTL_ADD_NODE(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, + "policing", + CTLFLAG_RW | CTLFLAG_MPSAFE, 0, + "policer detection"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "rxt_thresh", CTLFLAG_RW, + &rack_policer_rxt_thresh, 0, + "Percentage of retransmits we need to be a possible policer (499 = 49.9 percent)"); + SYSCTL_ADD_U8(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "avg_thresh", CTLFLAG_RW, + &rack_policer_avg_thresh, 0, + "What threshold of average retransmits needed to recover a lost packet (1 - 169 aka 21 = 2.1)?"); + SYSCTL_ADD_U8(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "med_thresh", CTLFLAG_RW, + &rack_policer_med_thresh, 0, + "What threshold of Median retransmits needed to recover a lost packet (1 - 16)?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "data_thresh", CTLFLAG_RW, + &rack_policer_data_thresh, 64000, + "How many bytes must have gotten through before we can start doing policer detection?"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "bwcomp", CTLFLAG_RW, + &rack_policing_do_bw_comp, 1, + "Do we raise up low b/w so that at least pace_max_seg can be sent in the srtt?"); + SYSCTL_ADD_U8(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "recmss", CTLFLAG_RW, + &rack_req_del_mss, 18, + "How many MSS must be delivered during recovery to engage policer detection?"); + SYSCTL_ADD_U16(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "res_div", CTLFLAG_RW, + &rack_policer_bucket_reserve, 20, + "What percentage is reserved in the policer bucket?"); + SYSCTL_ADD_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_policing), + OID_AUTO, "min_comp_bw", CTLFLAG_RW, + &rack_pol_min_bw, 125000, + "Do we have a min b/w for b/w compensation (0 = no)?"); /* Misc rack controls */ rack_misc = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -1578,31 +1702,8 @@ OID_AUTO, "autoscale", CTLFLAG_RW, &rack_autosndbuf_inc, 20, "What percentage should rack scale up its snd buffer by?"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "rnds_for_rxt_clamp", CTLFLAG_RW, - &rack_rxt_min_rnds, 10, - "Number of rounds needed between RTT clamps due to high loss rates"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "rnds_for_unclamp", CTLFLAG_RW, - &rack_unclamp_round_thresh, 100, - "Number of rounds needed with no loss to unclamp"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "rxt_threshs_for_unclamp", CTLFLAG_RW, - &rack_unclamp_rxt_thresh, 5, - "Percentage of retransmits we need to be under to unclamp (5 = .5 percent)\n"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "clamp_ss_upper", CTLFLAG_RW, - &rack_clamp_ss_upper, 110, - "Clamp percentage ceiling in SS?"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_misc), - OID_AUTO, "clamp_ca_upper", CTLFLAG_RW, - &rack_clamp_ca_upper, 110, - "Clamp percentage ceiling in CA?"); + + /* Sack Attacker detection stuff */ SYSCTL_ADD_U32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_attack), @@ -1779,6 +1880,13 @@ OID_AUTO, "alloc_hot", CTLFLAG_RD, &rack_hot_alloc, "Total allocations from the top of our list"); + tcp_policer_detected = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "policer_detected", CTLFLAG_RD, + &tcp_policer_detected, + 
"Total policer_detections"); + rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_counters), @@ -1957,17 +2065,8 @@ static uint32_t rc_init_window(struct tcp_rack *rack) { - uint32_t win; + return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); - if (rack->rc_init_win == 0) { - /* - * Nothing set by the user, use the system stack - * default. - */ - return (tcp_compute_initwnd(tcp_maxseg(rack->rc_tp))); - } - win = ctf_fixed_maxseg(rack->rc_tp) * rack->rc_init_win; - return (win); } static uint64_t @@ -2071,6 +2170,7 @@ off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); log.u_bbr.bbr_substate = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); #endif + log.u_bbr.inhpts = 1; log.u_bbr.flex4 = (uint32_t)(rack->rc_tp->t_sndbytes - cur->sent_at_fs); log.u_bbr.flex5 = (uint32_t)(rack->rc_tp->t_snd_rxt_bytes - cur->rxt_at_fs); log.u_bbr.flex7 = (uint16_t)cur->hybrid_flags; @@ -2116,9 +2216,24 @@ memset(&log, 0, sizeof(log)); log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; log.u_bbr.delRate = cur->sent_at_fs; - log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; + + if ((cur->flags & TCP_TRK_TRACK_FLG_LSND) == 0) { + /* + * We did not get a new Rules Applied to set so + * no overlapping send occured, this means the + * current byte counts are correct. + */ + log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; + log.u_bbr.rttProp = rack->rc_tp->t_snd_rxt_bytes; + } else { + /* + * Overlapping send case, we switched to a new + * send and did a rules applied. + */ + log.u_bbr.cur_del_rate = cur->sent_at_ls; + log.u_bbr.rttProp = cur->rxt_at_ls; + } log.u_bbr.bw_inuse = cur->rxt_at_fs; log.u_bbr.cwnd_gain = line; off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); @@ -2138,6 +2253,7 @@ log.u_bbr.lt_epoch = (uint32_t)((cur->timestamp >> 32) & 0x00000000ffffffff); /* now set all the flags in */ log.u_bbr.pkts_out = cur->hybrid_flags; + log.u_bbr.lost = cur->playout_ms; log.u_bbr.flex6 = cur->flags; /* * Last send time = note we do not distinguish cases @@ -2146,6 +2262,20 @@ */ log.u_bbr.pkt_epoch = (uint32_t)(rack->r_ctl.last_tmit_time_acked & 0x00000000ffffffff); log.u_bbr.flex5 = (uint32_t)((rack->r_ctl.last_tmit_time_acked >> 32) & 0x00000000ffffffff); + /* + * Compose bbr_state to be a bit wise 0000ADHF + * where A is the always_pace flag + * where D is the dgp_on flag + * where H is the hybrid_mode on flag + * where F is the use_fixed_rate flag. 
+ */ + log.u_bbr.bbr_state = rack->rc_always_pace; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->dgp_on; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->rc_hybrid_mode; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->use_fixed_rate; log.u_bbr.flex8 = HYBRID_LOG_SENT_LOST; tcp_log_event(rack->rc_tp, NULL, @@ -2299,6 +2429,7 @@ #ifdef TCP_REQUEST_TRK if (rack->rc_hybrid_mode && rack->rc_catch_up && + (rack->r_ctl.rc_last_sft != NULL) && (rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_S_MSS) && (rack_hybrid_allow_set_maxseg == 1) && ((rack->r_ctl.rc_last_sft->hybrid_flags & TCP_HYBRID_PACING_SETMSS) == 0)) { @@ -2338,7 +2469,10 @@ */ uint64_t srtt; - lt_bw = rack_get_lt_bw(rack); + if (rack->dis_lt_bw == 1) + lt_bw = 0; + else + lt_bw = rack_get_lt_bw(rack); if (lt_bw) { /* * No goodput bw but a long-term b/w does exist @@ -2374,19 +2508,22 @@ /* Still doing initial average must calculate */ bw = rack->r_ctl.gp_bw / max(rack->r_ctl.num_measurements, 1); } + if (rack->dis_lt_bw) { + /* We are not using lt-bw */ + ret_bw = bw; + goto compensate; + } lt_bw = rack_get_lt_bw(rack); if (lt_bw == 0) { /* If we don't have one then equate it to the gp_bw */ lt_bw = rack->r_ctl.gp_bw; } - if ((rack->r_cwnd_was_clamped == 1) && (rack->r_clamped_gets_lower > 0)){ - /* if clamped take the lowest */ + if (rack->use_lesser_lt_bw) { if (lt_bw < bw) ret_bw = lt_bw; else ret_bw = bw; } else { - /* If not set for clamped to get lowest, take the highest */ if (lt_bw > bw) ret_bw = lt_bw; else @@ -2487,6 +2624,8 @@ log.u_bbr.flex7 = rack->r_ctl.dsack_persist; log.u_bbr.flex8 = mod; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.epoch = rack->r_ctl.current_round; + log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2535,6 +2674,8 @@ else log.u_bbr.cur_del_rate = 0; log.u_bbr.rttProp = rack->r_ctl.last_hw_bw_req; + log.u_bbr.epoch = rack->r_ctl.current_round; + log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2552,28 +2693,9 @@ uint64_t bw_est, high_rate; uint64_t gain; - if ((rack->r_pacing_discount == 0) || - (rack_full_buffer_discount == 0)) { - /* - * No buffer level based discount from client buffer - * level is enabled or the feature is disabled. - */ - gain = (uint64_t)rack_get_output_gain(rack, rsm); - bw_est = bw * gain; - bw_est /= (uint64_t)100; - } else { - /* - * We have a discount in place apply it with - * just a 100% gain (we get no boost if the buffer - * is full). 
- */ - uint64_t discount; - - discount = bw * (uint64_t)(rack_full_buffer_discount * rack->r_ctl.pacing_discount_amm); - discount /= 100; - /* What %% of the b/w do we discount */ - bw_est = bw - discount; - } + gain = (uint64_t)rack_get_output_gain(rack, rsm); + bw_est = bw * gain; + bw_est /= (uint64_t)100; /* Never fall below the minimum (def 64kbps) */ if (bw_est < RACK_MIN_BW) bw_est = RACK_MIN_BW; @@ -2659,6 +2781,8 @@ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.epoch = rack->r_ctl.current_round; + log.u_bbr.lt_epoch = rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2698,6 +2822,10 @@ log.u_bbr.lt_epoch = rack->rc_tp->t_rxtshift; log.u_bbr.lost = rack_rto_min; log.u_bbr.epoch = rack->r_ctl.roundends; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; + log.u_bbr.applimited = rack->rc_tp->t_flags2; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2731,6 +2859,9 @@ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2780,6 +2911,9 @@ log.u_bbr.lost = 0; else log.u_bbr.lost = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2927,6 +3061,9 @@ log.u_bbr.flex4 = where; log.u_bbr.flex7 = 2; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2939,7 +3076,7 @@ static void rack_log_rtt_sendmap(struct tcp_rack *rack, uint32_t idx, uint64_t tsv, uint32_t tsecho) { - if (tcp_bblogging_on(rack->rc_tp)) { + if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; struct timeval tv; @@ -2951,6 +3088,9 @@ log.u_bbr.flex7 = 3; log.u_bbr.rttProp = tsv; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -2979,6 +3119,9 @@ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -3051,6 +3194,13 @@ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + 
log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; + log.u_bbr.epoch = rack->rc_inp->inp_socket->so_snd.sb_hiwat; + log.u_bbr.lt_epoch = rack->rc_inp->inp_socket->so_rcv.sb_hiwat; + log.u_bbr.lost = rack->rc_tp->t_srtt; + log.u_bbr.pkt_epoch = rack->rc_tp->rfbuf_cnt; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -3112,6 +3262,9 @@ log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; log.u_bbr.cwnd_gain = rack->rc_has_collapsed; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -3146,6 +3299,9 @@ log.u_bbr.pkts_out = rack->r_ctl.rc_out_at_rto; log.u_bbr.delivered = rack->r_ctl.rc_snd_max_at_rto; log.u_bbr.pacing_gain = rack->r_must_retran; + log.u_bbr.bw_inuse = rack->r_ctl.current_round; + log.u_bbr.bw_inuse <<= 32; + log.u_bbr.bw_inuse |= rack->r_ctl.rc_considered_lost; TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -3314,6 +3470,7 @@ counter_u64_free(rack_saw_enobuf_hw); counter_u64_free(rack_saw_enetunreach); counter_u64_free(rack_hot_alloc); + counter_u64_free(tcp_policer_detected); counter_u64_free(rack_to_alloc); counter_u64_free(rack_to_alloc_hard); counter_u64_free(rack_to_alloc_emerg); @@ -3475,6 +3632,8 @@ rack->r_ctl.rc_num_split_allocs--; } if (rsm == rack->r_ctl.rc_first_appl) { + rack->r_ctl.cleared_app_ack_seq = rsm->r_start + (rsm->r_end - rsm->r_start); + rack->r_ctl.cleared_app_ack = 1; if (rack->r_ctl.rc_app_limited_cnt == 0) rack->r_ctl.rc_first_appl = NULL; else @@ -3490,7 +3649,7 @@ rack->r_ctl.rc_sacklast = NULL; memset(rsm, 0, sizeof(struct rack_sendmap)); /* Make sure we are not going to overrun our count limit of 0xff */ - if ((rack->rc_free_cnt + 1) > 0xff) { + if ((rack->rc_free_cnt + 1) > RACK_FREE_CNT_MAX) { rack_free_trim(rack); } TAILQ_INSERT_HEAD(&rack->r_ctl.rc_free, rsm, r_tnext); @@ -3806,6 +3965,8 @@ logged = 0; + if (rack->rc_skip_timely) + return; if (override) { /* * override is passed when we are @@ -3976,6 +4137,8 @@ uint64_t logvar, logvar2, logvar3; uint32_t logged, new_per, ss_red, ca_red, rec_red, alt, val; + if (rack->rc_skip_timely) + return; if (rack->rc_gp_incr) { /* Turn off increment counting */ rack->rc_gp_incr = 0; @@ -4177,6 +4340,7 @@ */ uint32_t segsiz; + rack->r_ctl.rc_lower_rtt_us_cts = us_cts; if (rack->rc_gp_dyn_mul == 0) return; @@ -4203,7 +4367,6 @@ rack->r_ctl.rc_pace_min_segs); rack->in_probe_rtt = 1; rack->measure_saw_probe_rtt = 1; - rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_probertt_starts = 0; rack->r_ctl.rc_entry_gp_rtt = rack->r_ctl.rc_gp_srtt; if (rack_probertt_use_min_rtt_entry) @@ -4387,6 +4550,7 @@ rack_check_probe_rtt(struct tcp_rack *rack, uint32_t us_cts) { /* Check in on probe-rtt */ + if (rack->rc_gp_filled == 0) { /* We do not do p-rtt unless we have gp measurements */ return; @@ -4431,7 +4595,10 @@ if (calc) { /* Maybe */ calc *= rack_per_of_gp_probertt_reduce; - rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; + if (calc > rack_per_of_gp_probertt) + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; + else + rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt - calc; /* Limit it too */ if (rack->r_ctl.rack_per_of_gp_probertt < rack_per_of_gp_lowthresh) 
rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_lowthresh; @@ -4472,7 +4639,9 @@ rack_exit_probertt(rack, us_cts); } - } else if ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt) { + } else if ((rack->rc_skip_timely == 0) && + (TSTMP_GT(us_cts, rack->r_ctl.rc_lower_rtt_us_cts)) && + ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= rack_time_between_probertt)) { /* Go into probertt, its been too long since we went lower */ rack_enter_probertt(rack, us_cts); } @@ -4831,6 +5000,32 @@ } } +static void +rack_log_gp_calc(struct tcp_rack *rack, uint32_t add_part, uint32_t sub_part, uint32_t srtt, uint64_t meas_bw, uint64_t utim, uint8_t meth, uint32_t line) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = add_part; + log.u_bbr.flex2 = sub_part; + log.u_bbr.flex3 = rack_wma_divisor; + log.u_bbr.flex4 = srtt; + log.u_bbr.flex7 = (uint16_t)line; + log.u_bbr.flex8 = meth; + log.u_bbr.delRate = rack->r_ctl.gp_bw; + log.u_bbr.cur_del_rate = meas_bw; + log.u_bbr.rttProp = utim; + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_THRESH_CALC, 0, + 0, &log, false, &rack->r_ctl.act_rcv_time); + } +} + static void rack_do_goodput_measurement(struct tcpcb *tp, struct tcp_rack *rack, tcp_seq th_ack, int line, uint8_t quality) @@ -5046,6 +5241,8 @@ * other hand if we get a measurement over 1ms with a * 10ms rtt we only want to take a much smaller portion. */ + uint8_t meth; + if (rack->r_ctl.num_measurements < 0xff) { rack->r_ctl.num_measurements++; } @@ -5086,6 +5283,7 @@ */ addpart = bytes_ps * utim; addpart /= (srtt * 8); + meth = 1; } else { /* * Don't allow a single measurement @@ -5098,7 +5296,9 @@ */ subpart = rack->r_ctl.gp_bw / 2; addpart = bytes_ps / 2; + meth = 2; } + rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; did_add = 1; @@ -5116,6 +5316,7 @@ subpart /= (srtt * rack_wma_divisor); addpart = bytes_ps * utim; addpart /= (srtt * rack_wma_divisor); + meth = 3; } else { /* * The scaled measurement was long @@ -5124,6 +5325,7 @@ */ subpart = rack->r_ctl.gp_bw / rack_wma_divisor; addpart = bytes_ps / rack_wma_divisor; + meth = 4; } if ((rack->measure_saw_probe_rtt == 0) || (bytes_ps > rack->r_ctl.gp_bw)) { @@ -5133,12 +5335,83 @@ * add in. */ did_add = 1; + rack_log_gp_calc(rack, addpart, subpart, srtt, bytes_ps, utim, meth, __LINE__); resid_bw = rack->r_ctl.gp_bw - subpart; rack->r_ctl.gp_bw = resid_bw + addpart; } } rack_set_pace_segments(tp, rack, __LINE__, NULL); } + /* + * We only watch the growth of the GP during the initial startup + * or first-slowstart that ensues. If we ever needed to watch + * growth of gp outside of that period all we need to do is + * remove the first clause of this if (rc_initial_ss_comp). 
+ */ + if ((rack->rc_initial_ss_comp == 0) && + (rack->r_ctl.num_measurements >= RACK_REQ_AVG)) { + uint64_t gp_est; + + gp_est = bytes_ps; + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack->r_ctl.current_round; + log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; + log.u_bbr.delRate = gp_est; + log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; + log.u_bbr.flex8 = 41; + (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, __func__, __LINE__,&tv); + } + if ((rack->r_ctl.num_measurements == RACK_REQ_AVG) || + (rack->r_ctl.last_gpest == 0)) { + /* + * The round we get our measurement averaging going + * is the base round so it always is the source point + * for when we had our first increment. From there on + * we only record the round that had a rise. + */ + rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; + rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; + } else if (gp_est >= rack->r_ctl.last_gpest) { + /* + * Test to see if its gone up enough + * to set the round count up to now. Note + * that on the seeding of the 4th measurement we + */ + gp_est *= 1000; + gp_est /= rack->r_ctl.last_gpest; + if ((uint32_t)gp_est > rack->r_ctl.gp_gain_req) { + /* + * We went up enough to record the round. + */ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack->r_ctl.current_round; + log.u_bbr.flex2 = (uint32_t)gp_est; + log.u_bbr.flex3 = rack->r_ctl.gp_gain_req; + log.u_bbr.delRate = gp_est; + log.u_bbr.cur_del_rate = rack->r_ctl.last_gpest; + log.u_bbr.flex8 = 42; + (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, __func__, __LINE__,&tv); + } + rack->r_ctl.last_rnd_of_gp_rise = rack->r_ctl.current_round; + if (rack->r_ctl.use_gp_not_last == 1) + rack->r_ctl.last_gpest = rack->r_ctl.gp_bw; + else + rack->r_ctl.last_gpest = bytes_ps; + } + } + } if ((rack->gp_ready == 0) && (rack->r_ctl.num_measurements >= rack->r_ctl.req_measurements)) { /* We have enough measurements now */ @@ -5152,10 +5425,15 @@ rack_log_pacing_delay_calc(rack, subpart, addpart, bytes_ps, stim, rack_get_bw(rack), 22, did_add, NULL, quality); /* We do not update any multipliers if we are in or have seen a probe-rtt */ - if ((rack->measure_saw_probe_rtt == 0) && rack->rc_gp_rtt_set) - rack_update_multiplier(rack, timely_says, bytes_ps, - rack->r_ctl.rc_gp_srtt, - rack->r_ctl.rc_rtt_diff); + + if ((rack->measure_saw_probe_rtt == 0) && + rack->rc_gp_rtt_set) { + if (rack->rc_skip_timely == 0) { + rack_update_multiplier(rack, timely_says, bytes_ps, + rack->r_ctl.rc_gp_srtt, + rack->r_ctl.rc_rtt_diff); + } + } rack_log_pacing_delay_calc(rack, bytes, tim, bytes_ps, stim, rack_get_bw(rack), 3, line, NULL, quality); rack_log_pacing_delay_calc(rack, @@ -5179,7 +5457,6 @@ rack->rc_gp_saw_ca = 0; rack->rc_gp_saw_ss = 0; rack->rc_dragged_bottom = 0; - if (quality == RACK_QUALITY_HIGH) { /* * Gput in the stats world is in kbps where bytes_ps is @@ -5326,7 +5603,7 @@ */ static void rack_ack_received(struct tcpcb *tp, struct tcp_rack *rack, uint32_t th_ack, uint16_t nsegs, - uint16_t type, int32_t recovery) + uint16_t type, int32_t post_recovery) { uint32_t prior_cwnd, acked; struct tcp_log_buffer *lgb = NULL; @@ -5335,7 +5612,7 @@ INP_WLOCK_ASSERT(tptoinpcb(tp)); 
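/*
 * Editor's note (not part of the patch): the slow-start exit logic added
 * above only records a round as "gaining" when the new goodput estimate
 * exceeds gp_gain_req thousandths of the previous one, so the sysctl
 * default of 1200 means the estimate must grow past 120.0%.  In the patch
 * this pairs with rack_rnd_cnt_req ("dgp_out_of_ss_at"): too many rounds
 * without such a gain drops the connection out of slow start early.  The
 * stand-alone sketch below restates the fixed-point comparison; the
 * function name and parameters are illustrative assumptions, not RACK API.
 */
#include <stdint.h>
#include <stdbool.h>

static bool
round_counts_as_gaining(uint64_t new_gp_est, uint64_t last_gpest,
    uint32_t gp_gain_req)		/* e.g. 1200 == 120.0% */
{
        uint64_t ratio;

        if (last_gpest == 0)
                return (true);		/* no baseline yet; the real code seeds last_gpest here */
        /* Scale to thousandths so 1200 means "more than 1.2x the prior estimate". */
        ratio = (new_gp_est * 1000) / last_gpest;
        return (ratio > gp_gain_req);
}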
tp->t_ccv.nsegs = nsegs; acked = tp->t_ccv.bytes_this_ack = (th_ack - tp->snd_una); - if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { + if ((post_recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); @@ -5348,17 +5625,21 @@ ((int32_t)rack->r_ctl.cwnd_to_use) - tp->snd_wnd); #endif if ((th_ack == tp->snd_max) && rack->lt_bw_up) { - /* We will ack all, time - * to end any lt_bw_up we - * have running until something - * new is sent. + /* + * We will ack all the data, time to end any + * lt_bw_up we have running until something + * new is sent. Note we need to use the actual + * ack_rcv_time which with pacing may be different. */ - struct timeval tv; + uint64_t tmark; rack->r_ctl.lt_bw_bytes += (tp->snd_max - rack->r_ctl.lt_seq); rack->r_ctl.lt_seq = tp->snd_max; - (void)tcp_get_usecs(&tv); - rack->r_ctl.lt_bw_time += (tcp_tv_to_lusectick(&tv) - rack->r_ctl.lt_timemark); + tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + if (tmark >= rack->r_ctl.lt_timemark) { + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + } + rack->r_ctl.lt_timemark = tmark; rack->lt_bw_up = 0; } quality = RACK_QUALITY_NONE; @@ -5385,7 +5666,7 @@ tp->t_bytes_acked = 0; } prior_cwnd = tp->snd_cwnd; - if ((recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || + if ((post_recovery == 0) || (rack_max_abc_post_recovery == 0) || rack->r_use_labc_for_rec || (rack_client_low_buf && rack->client_bufferlvl && (rack->client_bufferlvl < rack_client_low_buf))) labc_to_use = rack->rc_labc; @@ -5446,6 +5727,14 @@ if (rack->r_ctl.rc_rack_largest_cwnd < rack->r_ctl.cwnd_to_use) { rack->r_ctl.rc_rack_largest_cwnd = rack->r_ctl.cwnd_to_use; } + if ((rack->rc_initial_ss_comp == 0) && + (tp->snd_cwnd >= tp->snd_ssthresh)) { + /* + * The cwnd has grown beyond ssthresh we have + * entered ca and completed our first Slowstart. + */ + rack->rc_initial_ss_comp = 1; + } } static void @@ -5467,180 +5756,64 @@ rack->r_wanted_output = 1; } -static inline void -rack_set_most_aggr(struct tcp_rack *rack) -{ - rack->r_fill_less_agg = 0; - /* Once the cwnd as been clamped we don't do fill_cw */ - if (rack->r_cwnd_was_clamped == 0) - rack->rc_pace_to_cwnd = 1; - rack->r_pacing_discount = 0; -} - -static inline void -rack_limit_fillcw(struct tcp_rack *rack) -{ - rack->r_fill_less_agg = 1; - /* Once the cwnd as been clamped we don't do fill_cw */ - if (rack->r_cwnd_was_clamped == 0) - rack->rc_pace_to_cwnd = 1; - rack->r_pacing_discount = 0; -} - -static inline void -rack_disable_fillcw(struct tcp_rack *rack) +static inline uint64_t +rack_get_rxt_per(uint64_t snds, uint64_t rxts) { - rack->r_fill_less_agg = 1; - rack->rc_pace_to_cwnd = 0; - rack->r_pacing_discount = 0; -} + uint64_t rxt_per; -static void -rack_client_buffer_level_set(struct tcp_rack *rack) -{ - /* - * Only if DGP is on do we do anything that - * changes stack behavior. If DGP is off all - * we will do is issue a BB log (if BB logging is - * on) and return. - */ - if (rack->dgp_on == 0) { - rack_log_pacing_delay_calc(rack, 0, rack->client_bufferlvl, - 0, 0, 0, 30, __LINE__, NULL, 0); - return; - } - if (IN_RECOVERY(rack->rc_tp->t_flags) && rack->r_ctl.full_dgp_in_rec) { - goto set_most_agg; - } - /* - * We are in DGP so what setting should we - * apply based on where the client is? 
- */ - switch(rack->r_ctl.rc_dgp_bl_agg) { - default: - case DGP_LEVEL0: -set_most_agg: - rack_set_most_aggr(rack); - break; - case DGP_LEVEL1: - if (rack->client_bufferlvl == 4) - rack_limit_fillcw(rack); - else if (rack->client_bufferlvl == 5) - rack_disable_fillcw(rack); - else - rack_set_most_aggr(rack); - break; - case DGP_LEVEL2: - if (rack->client_bufferlvl == 3) - rack_limit_fillcw(rack); - else if (rack->client_bufferlvl == 4) - rack_disable_fillcw(rack); - else if (rack->client_bufferlvl == 5) { - rack_disable_fillcw(rack); - rack->r_pacing_discount = 1; - rack->r_ctl.pacing_discount_amm = 1; - } else - rack_set_most_aggr(rack); - break; - case DGP_LEVEL3: - if (rack->client_bufferlvl == 2) - rack_limit_fillcw(rack); - else if (rack->client_bufferlvl == 3) - rack_disable_fillcw(rack); - else if (rack->client_bufferlvl == 4) { - rack_disable_fillcw(rack); - rack->r_pacing_discount = 1; - rack->r_ctl.pacing_discount_amm = 1; - } else if (rack->client_bufferlvl == 5) { - rack_disable_fillcw(rack); - rack->r_pacing_discount = 1; - rack->r_ctl.pacing_discount_amm = 2; - } else - rack_set_most_aggr(rack); - break; + if (snds > 0) { + rxt_per = rxts * 1000; + rxt_per /= snds; + } else { + /* This is an unlikely path */ + if (rxts) { + /* Its the max it was all re-transmits */ + rxt_per = 0xffffffffffffffff; + } else { + rxt_per = 0; + } } - rack_log_pacing_delay_calc(rack, rack->r_ctl.rc_dgp_bl_agg, rack->client_bufferlvl, 0, - 0, 0, 30, __LINE__, NULL, 0); + return (rxt_per); } static void -do_rack_check_for_unclamp(struct tcpcb *tp, struct tcp_rack *rack) +policer_detection_log(struct tcp_rack *rack, uint32_t flex1, uint32_t flex2, uint32_t flex3, uint32_t flex4, uint8_t flex8) { - /* - * Can we unclamp. We unclamp if more than - * N rounds have transpired with no loss. - */ - uint64_t snds, rxts, rxt_per; - uint32_t rnds; - - rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; - if ((rack_unclamp_round_thresh > 0) && - (rnds >= rack_unclamp_round_thresh)) { - snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; - KASSERT ((snds > 0), ("rack:%p tp:%p snds:%ju is 0", rack, tp, - (uintmax_t)snds)); - rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; - rxt_per = rxts * 1000; - rxt_per /= snds; - if ((uint32_t)rxt_per <= rack_unclamp_rxt_thresh) { - /* Unclamp */ - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex3 = rnds; - log.u_bbr.flex4 = rack_unclamp_round_thresh; - log.u_bbr.flex5 = (uint32_t)rxt_per; - log.u_bbr.flex8 = 6; - log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; - log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; - log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; - log.u_bbr.applimited = rack->r_ctl.max_clamps; - log.u_bbr.epoch = rack->r_ctl.clamp_options; - log.u_bbr.cur_del_rate = rxts; - log.u_bbr.bw_inuse = rack_get_lt_bw(rack); - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); - log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); - tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - rack->r_ctl.num_of_clamps_applied = 0; - rack->r_cwnd_was_clamped = 0; - rack->excess_rxt_on = 1; - if (rack->r_ctl.clamp_options) { - /* - * We only allow fillcw to be toggled - * if you are setting a max seg too. 
- */ - if (rack->r_ctl.clamp_options & 0x1) { - if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { - /* turn on fill cw for non-dgp*/ - rack->rc_pace_to_cwnd = 0; - } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { - /* For DGP we want it off */ - rack->rc_pace_to_cwnd = 1; - } - } - } - if (rack->dgp_on) { - /* Reset all multipliers to 100.0 so just the measured bw */ - /* Crash any per boosts down to 100% */ - rack->r_ctl.rack_per_of_gp_rec = 100; - rack->r_ctl.rack_per_of_gp_ss = 100; - rack->r_ctl.rack_per_of_gp_ca = 100; - /* Set in an upper bound for ss/ca % increase */ - rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; - rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; - } - } + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + log.u_bbr.flex4 = flex4; + log.u_bbr.flex5 = rack->r_ctl.current_policer_bucket; + log.u_bbr.flex6 = rack->r_ctl.policer_bucket_size; + log.u_bbr.flex7 = 0; + log.u_bbr.flex8 = flex8; + log.u_bbr.bw_inuse = rack->r_ctl.policer_bw; + log.u_bbr.applimited = rack->r_ctl.current_round; + log.u_bbr.epoch = rack->r_ctl.policer_max_seg; + log.u_bbr.delivered = (uint32_t)rack->r_ctl.bytes_acked_in_recovery; + log.u_bbr.cur_del_rate = rack->rc_tp->t_sndbytes; + log.u_bbr.delRate = rack->rc_tp->t_snd_rxt_bytes; + log.u_bbr.rttProp = rack->r_ctl.gp_bw; + log.u_bbr.bbr_state = rack->rc_policer_detected; + log.u_bbr.bbr_substate = 0; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.use_lt_bw = rack->policer_detect_on; + log.u_bbr.lt_epoch = 0; + log.u_bbr.pkts_out = 0; + tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, + 0, &log, false, NULL, NULL, 0, &tv); } + } static void -do_rack_excess_rxt(struct tcpcb *tp, struct tcp_rack *rack) +policer_detection(struct tcpcb *tp, struct tcp_rack *rack, int post_recovery) { /* * Rack excess rxt accounting is turned on. If we @@ -5648,166 +5821,395 @@ * rounds, then back off the cwnd and ssthresh * to fit into the long-term b/w. */ - uint64_t snds, rxts, rxt_per, lt_bw, bdp; - uint32_t rnds, new_cwnd, new_ssthresh, rtt, shared_cwnd_was_enabled = 0; - /* Is it shut off by 0 rounds? */ - if (rack_rxt_min_rnds == 0) - return; - if ((rack->r_ctl.max_clamps > 0) && - (rack->r_ctl.num_of_clamps_applied >= rack->r_ctl.max_clamps)) { - /* - * The idea, if max_clamps is set, is that if clamping it - * N times did not work again, then there is no sense - * clamping it again. The link is just a lossy link and - * our clamps are doing no good. Turn it off so we don't come - * back here again. - */ - rack->excess_rxt_on = 0; - rack->r_cwnd_was_clamped = 0; - rack->r_ctl.num_of_clamps_applied = 0; - return; - } - snds = tp->t_sndbytes - rack->r_ctl.last_sndbytes; - rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_snd_rxt_bytes; - rnds = rack->r_ctl.current_round - rack->r_ctl.last_rnd_rxt_clamped; - /* Has enough rounds progressed for us to re-measure? */ - if ((rnds >= rack_rxt_min_rnds) && - (rack->r_ctl.rxt_threshold > 0)){ - rxt_per = rxts * 1000; - rxt_per /= snds; - if (rxt_per >= rack->r_ctl.rxt_threshold) { - /* - * Action required: - * We are above our excess retransmit level, lets - * cut down the cwnd and ssthresh to match the long-term - * b/w we are getting. 
- */
- /* First disable scwnd if enabled */
-#ifdef NETFLIX_SHARED_CWND
- rack->rack_enable_scwnd = 0;
- if (rack->r_ctl.rc_scw) {
- uint32_t limit;
+ uint32_t pkts, mid, med, alt_med, avg, segsiz, tot_retran_pkt_count = 0;
+ uint32_t cnt_of_mape_rxt = 0;
+ uint64_t snds, rxts, rxt_per, tim, del, del_bw;
+ int i;
+ struct timeval tv;
- shared_cwnd_was_enabled = 1;
- if (rack->r_limit_scw)
- limit = max(1, rack->r_ctl.rc_lowest_us_rtt);
- else
- limit = 0;
- tcp_shared_cwnd_free_full(tp, rack->r_ctl.rc_scw,
- rack->r_ctl.rc_scw_index,
- limit);
- rack->r_ctl.rc_scw = NULL;
- }
-#endif
- /* Calculate what the cwnd and ssthresh should be */
- tcp_trace_point(rack->rc_tp, TCP_TP_EXCESS_RXT);
- lt_bw = rack_get_lt_bw(rack);
- if (lt_bw == 0) {
- /*
- * No lt_bw, lets chop things to one MSS
- * and the ssthresh to the iwnd.
- */
-reset_to_iw:
- new_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
- new_ssthresh = tcp_compute_initwnd(tcp_maxseg(tp));
- } else {
- rtt = rack->rc_rack_rtt;
- if (rtt == 0) {
- /* If we have no rack_rtt drop to the IW situation */
- goto reset_to_iw;
- }
- bdp = lt_bw * (uint64_t)rtt;
- bdp /= HPTS_USEC_IN_SEC;
- new_cwnd = (uint32_t)bdp;
- new_ssthresh = new_cwnd - 1;
- if (new_cwnd < ctf_fixed_maxseg(tp)) {
- /* Rock bottom, goto IW settings */
- goto reset_to_iw;
- }
- }
- rack->r_cwnd_was_clamped = 1;
- rack->r_ctl.num_of_clamps_applied++;
- /* Reset the counter fromn now */
- tp->t_bytes_acked = 0;
+ /*
+ * First, are there enough packets delivered during recovery to make
+ * a determination of b/w?
+ */
+ segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs);
+ if ((rack->rc_policer_detected == 0) &&
+ (rack->r_ctl.policer_del_mss > 0) &&
+ ((uint32_t)rack->r_ctl.policer_del_mss > ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz))) {
+ /*
+ * Not enough data sent in recovery for initial detection. Once
+ * we have detected a policer we allow less than the threshold (policer_del_mss)
+ * amount of data in a recovery to let us fall through and double check
+ * our policer settings and possibly expand or collapse the bucket size and
+ * the policer b/w.
+ *
+ * Once you are declared to be policed, this block of code cannot be
+ * reached; instead blocks further down will re-check the policer detection
+ * triggers and possibly reset the measurements if somehow we have let the
+ * policer bucket size grow too large.
+ */
+ if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) {
+ policer_detection_log(rack, rack->r_ctl.policer_del_mss,
+ ((rack->r_ctl.bytes_acked_in_recovery + segsiz - 1)/segsiz),
+ rack->r_ctl.bytes_acked_in_recovery, segsiz, 18);
+ }
+ return;
+ }
+ tcp_get_usecs(&tv);
+ tim = tcp_tv_to_lusectick(&tv) - rack->r_ctl.time_entered_recovery;
+ del = rack->r_ctl.bytes_acked_in_recovery;
+ if (tim > 0)
+ del_bw = (del * (uint64_t)1000000) / tim;
+ else
+ del_bw = 0;
+ /* B/W compensation? */
+
+ if (rack->r_ctl.pol_bw_comp && ((rack->r_ctl.policer_bw > 0) ||
+ (del_bw > 0))) {
+ /*
+ * Sanity check now that the data is in. How long does it
+ * take for us to pace out two of our policer_max_seg's?
+ *
+ * If it is longer than the RTT then we are set
+ * too slow, maybe because of not enough data
+ * sent during recovery.
+ */ + uint64_t lentime, res, srtt, max_delbw, alt_bw; + + srtt = (uint64_t)rack_grab_rtt(tp, rack); + if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) + srtt = tp->t_srtt; + lentime = rack->r_ctl.policer_max_seg * (uint64_t)HPTS_USEC_IN_SEC * 2; + if (del_bw > rack->r_ctl.policer_bw) { + max_delbw = del_bw; + } else { + max_delbw = rack->r_ctl.policer_bw; + } + res = lentime / max_delbw; + if ((srtt > 0) && (res > srtt)) { /* - * Now what about options? - * We look at the bottom 8 bits: - * F = fill cw bit (toggle it if set) - * S = Segment bits - * M = set max segment bit + * At this rate we can not get two policer_maxsegs + * out before the ack arrives back. * - * SSSS SSMF + * Lets at least get it raised up so that + * we can be a bit faster than that if possible. */ - if (rack->r_ctl.clamp_options) { - if (rack->r_ctl.clamp_options & 0x1) { - if ((rack->rc_pace_to_cwnd == 0) && (rack->dgp_on == 0)) { - /* turn on fill cw for non-dgp*/ - rack->rc_pace_to_cwnd = 1; - } else if ((rack->dgp_on == 1) && (rack->rc_pace_to_cwnd == 1)) { - /* For DGP we want it off */ - rack->rc_pace_to_cwnd = 0; - } + lentime = (rack->r_ctl.policer_max_seg * 2); + tim = srtt; + alt_bw = (lentime * (uint64_t)HPTS_USEC_IN_SEC) / tim; + if (alt_bw > max_delbw) { + uint64_t cap_alt_bw; + + cap_alt_bw = (max_delbw + (max_delbw * rack->r_ctl.pol_bw_comp)); + if ((rack_pol_min_bw > 0) && (cap_alt_bw < rack_pol_min_bw)) { + /* We place a min on the cap which defaults to 1Mbps */ + cap_alt_bw = rack_pol_min_bw; + } + if (alt_bw <= cap_alt_bw) { + /* It should be */ + del_bw = alt_bw; + policer_detection_log(rack, + (uint32_t)tim, + rack->r_ctl.policer_max_seg, + 0, + 0, + 16); + } else { + /* + * This is an odd case where likely the RTT is very very + * low. And yet it is still being policed. We don't want + * to get more than (rack_policing_do_bw_comp+1) x del-rate + * where del-rate is what we got in recovery for either the + * first Policer Detection(PD) or this PD we are on now. 
+ */ + del_bw = cap_alt_bw; + policer_detection_log(rack, + (uint32_t)tim, + rack->r_ctl.policer_max_seg, + (uint32_t)max_delbw, + (rack->r_ctl.pol_bw_comp + 1), + 16); } } - if (rack->dgp_on) { - /* Reset all multipliers to 100.0 so just the measured bw */ - /* Crash any per boosts down to 100% */ - rack->r_ctl.rack_per_of_gp_rec = 100; - rack->r_ctl.rack_per_of_gp_ss = 100; - rack->r_ctl.rack_per_of_gp_ca = 100; - /* Set in an upper bound for ss/ca % increase */ - rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_clamp_ss_upper; - rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_clamp_ca_upper; - /* Now move to the lt_bw */ - rack->r_ctl.gp_bw = lt_bw; - rack->rc_gp_filled = 1; - rack->r_ctl.num_measurements = RACK_REQ_AVG; - } - if (tcp_bblogging_on(rack->rc_tp)) { - union tcp_log_stackspecific log; - struct timeval tv; - - memset(&log.u_bbr, 0, sizeof(log.u_bbr)); - log.u_bbr.timeStamp = tcp_get_usecs(&tv); - log.u_bbr.flex1 = new_cwnd; - log.u_bbr.flex2 = new_ssthresh; - log.u_bbr.flex3 = rnds; - log.u_bbr.flex4 = rack_rxt_min_rnds; - log.u_bbr.flex5 = rtt; - log.u_bbr.flex6 = shared_cwnd_was_enabled; - log.u_bbr.flex8 = 5; - log.u_bbr.pkt_epoch = rack->r_ctl.rc_pace_max_segs; - log.u_bbr.bbr_state = rack->rc_pace_to_cwnd; - log.u_bbr.delivered = rack->r_ctl.num_of_clamps_applied; - log.u_bbr.applimited = rack->r_ctl.max_clamps; - log.u_bbr.epoch = rack->r_ctl.clamp_options; - log.u_bbr.cur_del_rate = rxts; - log.u_bbr.delRate = snds; - log.u_bbr.rttProp = rack->r_ctl.rxt_threshold; - log.u_bbr.bw_inuse = lt_bw; - log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); - log.u_bbr.lt_epoch = (uint32_t)((rack->r_ctl.gp_bw >> 32) & 0x00000000ffffffff); - log.u_bbr.pkts_out = (uint32_t)(rack->r_ctl.gp_bw & 0x00000000ffffffff); - tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, - 0, &log, false, NULL, NULL, 0, &tv); - } - /* Update our point where we did it */ - if (rack->r_ctl.already_had_a_excess == 0) { - rack->r_ctl.already_had_a_excess = 1; - counter_u64_add(rack_rxt_clamps_cwnd_uniq, 1); + } + } + snds = tp->t_sndbytes - rack->r_ctl.last_policer_sndbytes; + rxts = tp->t_snd_rxt_bytes - rack->r_ctl.last_policer_snd_rxt_bytes; + rxt_per = rack_get_rxt_per(snds, rxts); + /* Figure up the average and median */ + for(i = 0; i < RETRAN_CNT_SIZE; i++) { + if (rack->r_ctl.rc_cnt_of_retran[i] > 0) { + tot_retran_pkt_count += (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; + cnt_of_mape_rxt += rack->r_ctl.rc_cnt_of_retran[i]; + } + } + if (cnt_of_mape_rxt) + avg = (tot_retran_pkt_count * 10)/cnt_of_mape_rxt; + else + avg = 0; + alt_med = med = 0; + mid = tot_retran_pkt_count/2; + for(i = 0; i < RETRAN_CNT_SIZE; i++) { + pkts = (i + 1) * rack->r_ctl.rc_cnt_of_retran[i]; + if (mid > pkts) { + mid -= pkts; + continue; + } + med = (i + 1); + break; + } + mid = cnt_of_mape_rxt / 2; + for(i = 0; i < RETRAN_CNT_SIZE; i++) { + if (mid > rack->r_ctl.rc_cnt_of_retran[i]) { + mid -= rack->r_ctl.rc_cnt_of_retran[i]; + continue; + } + alt_med = (i + 1); + break; + } + if (rack->r_ctl.policer_alt_median) { + /* Swap the medians */ + uint32_t swap; + + swap = med; + med = alt_med; + alt_med = swap; + } + if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = avg; + log.u_bbr.flex2 = med; + log.u_bbr.flex3 = (uint32_t)rxt_per; + log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; + log.u_bbr.flex5 = 
rack->r_ctl.policer_med_threshold;
+ log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold;
+ log.u_bbr.flex7 = rack->r_ctl.policer_alt_median;
+ log.u_bbr.flex8 = 1;
+ log.u_bbr.delivered = rack->r_ctl.policer_bucket_size;
+ log.u_bbr.applimited = rack->r_ctl.current_round;
+ log.u_bbr.epoch = rack->r_ctl.policer_max_seg;
+ log.u_bbr.bw_inuse = del_bw;
+ log.u_bbr.cur_del_rate = rxts;
+ log.u_bbr.delRate = snds;
+ log.u_bbr.rttProp = rack->r_ctl.gp_bw;
+ log.u_bbr.bbr_state = rack->rc_policer_detected;
+ log.u_bbr.bbr_substate = 0;
+ log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked);
+ log.u_bbr.use_lt_bw = rack->policer_detect_on;
+ log.u_bbr.lt_epoch = (uint32_t)tim;
+ log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery;
+ tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0,
+ 0, &log, false, NULL, NULL, 0, &tv);
+ }
+ if (med == RETRAN_CNT_SIZE) {
+ /*
+ * If the median is the maximum, then what we
+ * likely have here is a network breakage. Either that
+ * or we are so unlucky that all of our traffic is being
+ * dropped and having to be retransmitted the maximum times
+ * and this just is not how a policer works.
+ *
+ * If it is truly a policer eventually we will come
+ * through and it won't be the maximum.
+ */
+ return;
+ }
+ /* Have enough rounds progressed for us to re-measure? */
+ if ((rxt_per >= (uint64_t)rack->r_ctl.policer_rxt_threshold) &&
+ (avg >= rack->r_ctl.policer_avg_threshold) &&
+ (med >= rack->r_ctl.policer_med_threshold)) {
+ /*
+ * We hit all thresholds that indicate we are
+ * being policed. Now we may be doing this from a rack timeout
+ * which then means the rest of recovery will hopefully go
+ * smoother as we pace. At the end of recovery we will
+ * fall back in here and reset the values using the
+ * results of the entire recovery episode (we could also
+ * hit this as we exit recovery, which means only
+ * one time in here).
+ *
+ * This is done explicitly so that if we hit the thresholds
+ * again in a second recovery we overwrite the values. We do
+ * that because, over time, as we pace the policer_bucket_size may
+ * continue to grow. This then provides more and more times when
+ * we are not pacing to the policer rate. This lets us compensate
+ * for when we hit a false positive and those flows continue to
+ * increase. However if it's a real policer we will then get over its
+ * limit, over time, again and thus end up back here hitting the
+ * thresholds again.
+ *
+ * The alternative to this is to instead, whenever we pace due to
+ * policing in rack_policed_sending, add the amount paced (len) to the
+ * idle_snd_una value (which decreases the amount in last_amount_before_rec
+ * since that is always [th_ack - idle_snd_una]). This would then prevent
+ * the policer_bucket_size from growing in additional recovery episodes,
+ * which would then mean false positives would be pretty much stuck
+ * after things got back to normal (assuming that what caused the
+ * false positive was a small network outage).
+ *
+ */
+ tcp_trace_point(rack->rc_tp, TCP_TP_POLICER_DET);
+ if (rack->rc_policer_detected == 0) {
+ /*
+ * Increment the stat that tells us we identified
+ * a policer only once. Note that if we ever allow
+ * the flag to be cleared (reverted) then we need
+ * to adjust this to not do multi-counting.
+ */ + counter_u64_add(tcp_policer_detected, 1); + } + rack->r_ctl.last_policer_sndbytes = tp->t_sndbytes; + rack->r_ctl.last_policer_snd_rxt_bytes = tp->t_snd_rxt_bytes; + rack->r_ctl.policer_bw = del_bw; + rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, + rack->r_ctl.policer_bw, + min(ctf_fixed_maxseg(rack->rc_tp), + rack->r_ctl.rc_pace_min_segs), + 0, NULL, + NULL, rack->r_ctl.pace_len_divisor); + /* Now what about the policer bucket size */ + rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; + if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { + /* We must be able to send our max-seg or else chaos ensues */ + rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; + } + if (rack->rc_policer_detected == 0) + rack->r_ctl.current_policer_bucket = 0; + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = avg; + log.u_bbr.flex2 = med; + log.u_bbr.flex3 = rxt_per; + log.u_bbr.flex4 = rack->r_ctl.policer_avg_threshold; + log.u_bbr.flex5 = rack->r_ctl.policer_med_threshold; + log.u_bbr.flex6 = rack->r_ctl.policer_rxt_threshold; + log.u_bbr.flex7 = rack->r_ctl.policer_alt_median; + log.u_bbr.flex8 = 2; + log.u_bbr.applimited = rack->r_ctl.current_round; + log.u_bbr.bw_inuse = del_bw; + log.u_bbr.delivered = rack->r_ctl.policer_bucket_size; + log.u_bbr.cur_del_rate = rxts; + log.u_bbr.delRate = snds; + log.u_bbr.rttProp = rack->r_ctl.gp_bw; + log.u_bbr.bbr_state = rack->rc_policer_detected; + log.u_bbr.bbr_substate = 0; + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.use_lt_bw = rack->policer_detect_on; + log.u_bbr.epoch = rack->r_ctl.policer_max_seg; + log.u_bbr.lt_epoch = (uint32_t)tim; + log.u_bbr.pkts_out = rack->r_ctl.bytes_acked_in_recovery; + tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, + 0, &log, false, NULL, NULL, 0, &tv); + /* + * Put out an added log, 19, for the sole purpose + * of getting the txt/rxt so that we can benchmark + * in read-bbrlog the ongoing rxt rate after our + * policer invocation in the HYSTART announcments. + */ + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); + log.u_bbr.flex1 = alt_med; + log.u_bbr.flex8 = 19; + log.u_bbr.cur_del_rate = tp->t_sndbytes; + log.u_bbr.delRate = tp->t_snd_rxt_bytes; + tcp_log_event(tp, NULL, NULL, NULL, TCP_POLICER_DET, 0, + 0, &log, false, NULL, NULL, 0, &tv); + } + /* Turn off any fast output, thats ended */ + rack->r_fast_output = 0; + /* Mark the time for credits */ + rack->r_ctl.last_sendtime = tcp_get_u64_usecs(NULL); + if (rack->r_rr_config < 2) { + /* + * We need to be stricter on the RR config so + * the pacing has priority. + */ + rack->r_rr_config = 2; + } + policer_detection_log(rack, + rack->r_ctl.idle_snd_una, + rack->r_ctl.ack_for_idle, + 0, + (uint32_t)tim, + 14); + rack->rc_policer_detected = 1; + } else if ((rack->rc_policer_detected == 1) && + (post_recovery == 1)) { + /* + * If we are exiting recovery and have already detected + * we need to possibly update the values. + * + * First: Update the idle -> recovery sent value. 
+ */ + uint32_t srtt; + + if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { + rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; + } + srtt = (uint64_t)rack_grab_rtt(tp, rack); + if ((tp->t_srtt > 0) && (srtt > tp->t_srtt)) + srtt = tp->t_srtt; + if ((srtt != 0) && + (tim < (uint64_t)srtt)) { + /* + * Not long enough. + */ + if (rack_verbose_logging) + policer_detection_log(rack, + (uint32_t)tim, + 0, + 0, + 0, + 15); + return; + } + /* + * Finally update the b/w if its grown. + */ + if (del_bw > rack->r_ctl.policer_bw) { + rack->r_ctl.policer_bw = del_bw; + rack->r_ctl.policer_max_seg = tcp_get_pacing_burst_size_w_divisor(rack->rc_tp, + rack->r_ctl.policer_bw, + min(ctf_fixed_maxseg(rack->rc_tp), + rack->r_ctl.rc_pace_min_segs), + 0, NULL, + NULL, rack->r_ctl.pace_len_divisor); + if (rack->r_ctl.policer_bucket_size < rack->r_ctl.policer_max_seg) { + /* We must be able to send our max-seg or else chaos ensues */ + rack->r_ctl.policer_bucket_size = rack->r_ctl.policer_max_seg * 2; } - counter_u64_add(rack_rxt_clamps_cwnd, 1); - rack->r_ctl.last_sndbytes = tp->t_sndbytes; - rack->r_ctl.last_snd_rxt_bytes = tp->t_snd_rxt_bytes; - rack->r_ctl.last_rnd_rxt_clamped = rack->r_ctl.current_round; - if (new_cwnd < tp->snd_cwnd) - tp->snd_cwnd = new_cwnd; - if (new_ssthresh < tp->snd_ssthresh) - tp->snd_ssthresh = new_ssthresh; } + policer_detection_log(rack, + rack->r_ctl.idle_snd_una, + rack->r_ctl.ack_for_idle, + 0, + (uint32_t)tim, + 3); + } +} + +static void +rack_exit_recovery(struct tcpcb *tp, struct tcp_rack *rack, int how) +{ + /* now check with the policer if on */ + if (rack->policer_detect_on == 1) { + policer_detection(tp, rack, 1); } + /* + * Now exit recovery, note we must do the idle set after the policer_detection + * to get the amount acked prior to recovery correct. + */ + rack->r_ctl.idle_snd_una = tp->snd_una; + EXIT_RECOVERY(tp->t_flags); } static void @@ -5882,9 +6284,12 @@ } rack_log_dsack_event(rack, 1, __LINE__, 0, 0); } - EXIT_RECOVERY(tp->t_flags); - if (rack->r_ctl.full_dgp_in_rec) - rack_client_buffer_level_set(rack); + if (rack->rto_from_rec == 1) { + rack->rto_from_rec = 0; + if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) + tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; + } + rack_exit_recovery(tp, rack, 1); } static void @@ -5909,12 +6314,69 @@ tp->t_flags &= ~TF_WASFRECOVERY; tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { - if (rack->dgp_on && rack->r_cwnd_was_clamped) { - /* Reset the gains so that on exit we will be softer longer */ - rack->r_ctl.rack_per_of_gp_rec = 100; - rack->r_ctl.rack_per_of_gp_ss = 98; - rack->r_ctl.rack_per_of_gp_ca = 98; + struct rack_sendmap *rsm; + struct timeval tv; + uint32_t segsiz; + + /* Check if this is the end of the initial Start-up i.e. initial slow-start */ + if (rack->rc_initial_ss_comp == 0) { + /* Yep it is the end of the initial slowstart */ + rack->rc_initial_ss_comp = 1; + } + microuptime(&tv); + rack->r_ctl.time_entered_recovery = tcp_tv_to_lusectick(&tv); + if (SEQ_GEQ(ack, tp->snd_una)) { + /* + * The ack is above snd_una. Lets see + * if we can establish a postive distance from + * our idle mark. + */ + rack->r_ctl.ack_for_idle = ack; + if (SEQ_GT(ack, rack->r_ctl.idle_snd_una)) { + rack->r_ctl.last_amount_before_rec = ack - rack->r_ctl.idle_snd_una; + } else { + /* No data thru yet */ + rack->r_ctl.last_amount_before_rec = 0; + } + } else if (SEQ_GT(tp->snd_una, rack->r_ctl.idle_snd_una)) { + /* + * The ack is out of order and behind the snd_una. 
It may + * have contained SACK information which we processed else + * we would have rejected it. + */ + rack->r_ctl.ack_for_idle = tp->snd_una; + rack->r_ctl.last_amount_before_rec = tp->snd_una - rack->r_ctl.idle_snd_una; + } else { + rack->r_ctl.ack_for_idle = ack; + rack->r_ctl.last_amount_before_rec = 0; + } + if (rack->rc_policer_detected) { + /* + * If we are being policed and we have a loss, it + * means our bucket is now empty. This can happen + * where some other flow on the same host sends + * that this connection is not aware of. + */ + rack->r_ctl.current_policer_bucket = 0; + if (rack_verbose_logging) + policer_detection_log(rack, rack->r_ctl.last_amount_before_rec, 0, 0, 0, 4); + if (rack->r_ctl.last_amount_before_rec > rack->r_ctl.policer_bucket_size) { + rack->r_ctl.policer_bucket_size = rack->r_ctl.last_amount_before_rec; + } + } + memset(rack->r_ctl.rc_cnt_of_retran, 0, sizeof(rack->r_ctl.rc_cnt_of_retran)); + segsiz = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_tmap, r_tnext) { + /* + * Go through the outstanding and re-peg + * any that should have been left in the + * retransmit list (on a double recovery). + */ + if (rsm->r_act_rxt_cnt > 0) { + rack_peg_rxt(rack, rsm, segsiz); + } } + rack->r_ctl.bytes_acked_in_recovery = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_fast_output = 0; @@ -5947,15 +6409,19 @@ tp->t_dupacks = 0; tp->t_bytes_acked = 0; rack->r_fast_output = 0; - EXIT_RECOVERY(tp->t_flags); - if (tp->t_rxtshift == 1) { + if (IN_RECOVERY(tp->t_flags)) + rack_exit_recovery(tp, rack, 2); + rack->r_ctl.bytes_acked_in_recovery = 0; + rack->r_ctl.time_entered_recovery = 0; + orig_cwnd = tp->snd_cwnd; + rack_log_to_prr(rack, 16, orig_cwnd, line); + if (CC_ALGO(tp)->cong_signal == NULL) { + /* TSNH */ tp->snd_ssthresh = max(2, min(tp->snd_wnd, rack->r_ctl.cwnd_to_use) / 2 / ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); + tp->snd_cwnd = ctf_fixed_maxseg(tp); } - orig_cwnd = tp->snd_cwnd; - tp->snd_cwnd = ctf_fixed_maxseg(tp); - rack_log_to_prr(rack, 16, orig_cwnd, line); if (tp->t_flags2 & TF2_ECN_PERMIT) tp->t_flags2 |= TF2_ECN_SND_CWR; break; @@ -5984,8 +6450,6 @@ } if ((in_rec_at_entry == 0) && IN_RECOVERY(tp->t_flags)) { rack_log_to_prr(rack, 15, cwnd_enter, line); - if (rack->r_ctl.full_dgp_in_rec) - rack_client_buffer_level_set(rack); rack->r_ctl.dsack_byte_cnt = 0; rack->r_ctl.retran_during_recovery = 0; rack->r_ctl.rc_cwnd_at_erec = cwnd_enter; @@ -6078,7 +6542,7 @@ } static uint32_t -rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts) +rack_calc_thresh_rack(struct tcp_rack *rack, uint32_t srtt, uint32_t cts, int line, int log_allowed) { int32_t lro; uint32_t thresh; @@ -6149,7 +6613,8 @@ * have seen reordering we have a DSACK count. 
*/ thresh += rack->r_ctl.num_dsack * (srtt >> 2); - rack_log_dsack_event(rack, 4, __LINE__, srtt, thresh); + if (log_allowed) + rack_log_dsack_event(rack, 4, line, srtt, thresh); } /* SRTT * 2 is the ceiling */ if (thresh > (srtt * 2)) { @@ -6159,7 +6624,8 @@ if (thresh > rack_rto_max) { thresh = rack_rto_max; } - rack_log_dsack_event(rack, 6, __LINE__, srtt, thresh); + if (log_allowed) + rack_log_dsack_event(rack, 6, line, srtt, thresh); return (thresh); } @@ -6294,7 +6760,7 @@ } idx = rsm->r_rtr_cnt - 1; srtt = rack_grab_rtt(tp, rack); - thresh = rack_calc_thresh_rack(rack, srtt, tsused); + thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); if (TSTMP_LT(tsused, ((uint32_t)rsm->r_tim_lastsent[idx]))) { return (NULL); } @@ -6456,7 +6922,7 @@ goto activate_tlp; } srtt = rack_grab_rtt(tp, rack); - thresh = rack_calc_thresh_rack(rack, srtt, cts); + thresh = rack_calc_thresh_rack(rack, srtt, cts, __LINE__, 1); idx = rsm->r_rtr_cnt - 1; exp = ((uint32_t)rsm->r_tim_lastsent[idx]) + thresh; if (SEQ_GEQ(exp, cts)) { @@ -6563,8 +7029,6 @@ static void rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, tcp_seq snd_una) { - struct timeval tv; - if (rack->rc_in_persist == 0) { if (tp->t_flags & TF_GPUTINPROG) { /* @@ -6580,21 +7044,23 @@ rack->rack_scwnd_is_idle = 1; } #endif - rack->r_ctl.rc_went_idle_time = tcp_get_usecs(&tv); + rack->r_ctl.rc_went_idle_time = cts; + if (rack->r_ctl.rc_went_idle_time == 0) + rack->r_ctl.rc_went_idle_time = 1; if (rack->lt_bw_up) { /* Suspend our LT BW measurement */ uint64_t tmark; rack->r_ctl.lt_bw_bytes += (snd_una - rack->r_ctl.lt_seq); rack->r_ctl.lt_seq = snd_una; - tmark = tcp_tv_to_lusectick(&tv); - rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + tmark = tcp_tv_to_lusectick(&rack->r_ctl.act_rcv_time); + if (tmark >= rack->r_ctl.lt_timemark) { + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + } rack->r_ctl.lt_timemark = tmark; rack->lt_bw_up = 0; rack->r_persist_lt_bw_off = 1; } - if (rack->r_ctl.rc_went_idle_time == 0) - rack->r_ctl.rc_went_idle_time = 1; rack_timer_cancel(tp, rack, cts, __LINE__); rack->r_ctl.persist_lost_ends = 0; rack->probe_not_answered = 0; @@ -6609,9 +7075,6 @@ static void rack_exit_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { - struct timeval tv; - uint32_t t_time; - if (tcp_in_hpts(rack->rc_tp)) { tcp_hpts_remove(rack->rc_tp); rack->r_ctl.rc_hpts_flags = 0; @@ -6622,7 +7085,6 @@ rack->rack_scwnd_is_idle = 0; } #endif - t_time = tcp_get_usecs(&tv); if (rack->rc_gp_dyn_mul && (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { @@ -6632,7 +7094,7 @@ */ uint32_t time_idle, idle_min; - time_idle = t_time - rack->r_ctl.rc_went_idle_time; + time_idle = cts - rack->r_ctl.rc_went_idle_time; idle_min = rack_min_probertt_hold; if (rack_probertt_gpsrtt_cnt_div) { uint64_t extra; @@ -6658,10 +7120,11 @@ } if (rack->r_persist_lt_bw_off) { /* Continue where we left off */ - rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); + rack->r_ctl.lt_timemark = tcp_get_u64_usecs(NULL); rack->lt_bw_up = 1; rack->r_persist_lt_bw_off = 0; } + rack->r_ctl.idle_snd_una = tp->snd_una; rack->rc_in_persist = 0; rack->r_ctl.rc_went_idle_time = 0; tp->t_rxtshift = 0; @@ -6734,7 +7197,7 @@ } static void -rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, +rack_start_hpts_timer (struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t slot, uint32_t tot_len_this_send, int sup_rack) { struct hpts_diag diag; @@ -6778,7 +7241,8 @@ rack->r_early = 0; 
rack->r_ctl.rc_agg_early = 0; } - if (rack->r_late) { + if ((rack->r_late) && + ((rack->r_use_hpts_min == 0) || (rack->dgp_on == 0))) { /* * This is harder, we can * compensate some but it @@ -6812,6 +7276,32 @@ if (rack->r_ctl.rc_agg_delayed == 0) rack->r_late = 0; } + } else if (rack->r_late) { + /* r_use_hpts_min is on and so is DGP */ + uint32_t max_red; + + max_red = (slot * rack->r_ctl.max_reduction) / 100; + if (max_red >= rack->r_ctl.rc_agg_delayed) { + slot -= rack->r_ctl.rc_agg_delayed; + rack->r_ctl.rc_agg_delayed = 0; + } else { + slot -= max_red; + rack->r_ctl.rc_agg_delayed -= max_red; + } + } + if ((rack->r_use_hpts_min == 1) && + (slot > 0) && + (rack->dgp_on == 1)) { + /* + * We are enforcing a min pacing timer + * based on our hpts min timeout. + */ + uint32_t min; + + min = get_hpts_min_sleep_time(); + if (min > slot) { + slot = min; + } } hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); #ifdef TCP_SAD_DETECTION @@ -7041,6 +7531,34 @@ rack_log_type_bbrsnd(rack, tot_len_this_send, slot, us_cts, &tv, __LINE__); } +static void +rack_mark_lost(struct tcpcb *tp, + struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) +{ + struct rack_sendmap *nrsm; + uint32_t thresh, exp; + + thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); + nrsm = rsm; + TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_tmap, r_tnext) { + if ((nrsm->r_flags & RACK_SACK_PASSED) == 0) { + /* Got up to all that were marked sack-passed */ + break; + } + if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { + exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; + if (TSTMP_LT(exp, cts) || (exp == cts)) { + /* We now consider it lost */ + nrsm->r_flags |= RACK_WAS_LOST; + rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; + } else { + /* Past here it won't be lost so stop */ + break; + } + } + } +} + /* * RACK Timer, here we simply do logging and house keeping. * the normal rack_output() function will call the @@ -7067,6 +7585,8 @@ rsm = rack_check_recovery_mode(tp, cts); rack_log_to_event(rack, RACK_TO_FRM_RACK, rsm); if (rsm) { + /* We need to stroke any lost that are now declared as lost */ + rack_mark_lost(tp, rack, rsm, cts); rack->r_ctl.rc_resend = rsm; rack->r_timer_override = 1; if (rack->use_rack_rr) { @@ -7088,6 +7608,16 @@ 0, 0, 0); return (1); } + if ((rack->policer_detect_on == 1) && + (rack->rc_policer_detected == 0)) { + /* + * We do this early if we have not + * deteceted to attempt to detect + * quicker. Normally we want to do this + * as recovery exits (and we will again). 
+ */ + policer_detection(tp, rack, 0); + } return (0); } @@ -7189,13 +7719,14 @@ nrsm->r_start = start; nrsm->r_end = rsm->r_end; nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_act_rxt_cnt = rsm->r_act_rxt_cnt; nrsm->r_flags = rsm->r_flags; nrsm->r_dupack = rsm->r_dupack; nrsm->r_no_rtt_allowed = rsm->r_no_rtt_allowed; nrsm->r_rtr_bytes = 0; nrsm->r_fas = rsm->r_fas; nrsm->r_bas = rsm->r_bas; - rsm->r_end = nrsm->r_start; + tqhash_update_end(rack->r_ctl.tqh, rsm, nrsm->r_start); nrsm->r_just_ret = rsm->r_just_ret; for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; @@ -7242,7 +7773,7 @@ */ rack_log_map_chg(rack->rc_tp, rack, NULL, l_rsm, r_rsm, MAP_MERGE, r_rsm->r_end, __LINE__); - l_rsm->r_end = r_rsm->r_end; + tqhash_update_end(rack->r_ctl.tqh, l_rsm, r_rsm->r_end); if (l_rsm->r_dupack < r_rsm->r_dupack) l_rsm->r_dupack = r_rsm->r_dupack; if (r_rsm->r_rtr_bytes) @@ -7344,6 +7875,7 @@ */ rack_log_to_event(rack, RACK_TO_FRM_TLP, NULL); rack->r_ctl.retran_during_recovery = 0; + rack->r_might_revert = 0; rack->r_ctl.dsack_byte_cnt = 0; counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) @@ -7517,6 +8049,32 @@ return (0); } +static inline int +rack_send_ack_challange(struct tcp_rack *rack) +{ + struct tcptemp *t_template; + + t_template = tcpip_maketemplate(rack->rc_inp); + if (t_template) { + if (rack->forced_ack == 0) { + rack->forced_ack = 1; + rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); + } else { + rack->probe_not_answered = 1; + } + tcp_respond(rack->rc_tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + rack->rc_tp->rcv_nxt, rack->rc_tp->snd_una - 1, 0); + free(t_template, M_TEMP); + /* This does send an ack so kill any D-ack timer */ + if (rack->rc_tp->t_flags & TF_DELACK) + rack->rc_tp->t_flags &= ~TF_DELACK; + return(1); + } else + return (0); + +} + /* * Persists timer, here we simply send the * same thing as a keepalive will. @@ -7528,7 +8086,6 @@ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { - struct tcptemp *t_template; int32_t retval = 1; if (rack->rc_in_persist == 0) @@ -7575,26 +8132,14 @@ retval = -ETIMEDOUT; /* tcp_drop() */ goto out; } - t_template = tcpip_maketemplate(rack->rc_inp); - if (t_template) { + if (rack_send_ack_challange(rack)) { /* only set it if we were answered */ - if (rack->forced_ack == 0) { - rack->forced_ack = 1; - rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); - } else { - rack->probe_not_answered = 1; + if (rack->probe_not_answered) { counter_u64_add(rack_persists_loss, 1); rack->r_ctl.persist_lost_ends++; } counter_u64_add(rack_persists_sends, 1); counter_u64_add(rack_out_size[TCP_MSS_ACCT_PERSIST], 1); - tcp_respond(tp, t_template->tt_ipgen, - &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0); - /* This sends an ack */ - if (tp->t_flags & TF_DELACK) - tp->t_flags &= ~TF_DELACK; - free(t_template, M_TEMP); } if (tp->t_rxtshift < V_tcp_retries) tp->t_rxtshift++; @@ -7614,7 +8159,6 @@ static int rack_timeout_keepalive(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { - struct tcptemp *t_template; struct inpcb *inp = tptoinpcb(tp); rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; @@ -7641,19 +8185,7 @@ * respond. 
*/ KMOD_TCPSTAT_INC(tcps_keepprobe); - t_template = tcpip_maketemplate(inp); - if (t_template) { - if (rack->forced_ack == 0) { - rack->forced_ack = 1; - rack->r_ctl.forced_ack_ts = tcp_get_usecs(NULL); - } else { - rack->probe_not_answered = 1; - } - tcp_respond(tp, t_template->tt_ipgen, - &t_template->tt_t, (struct mbuf *)NULL, - tp->rcv_nxt, tp->snd_una - 1, 0); - free(t_template, M_TEMP); - } + rack_send_ack_challange(rack); } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (1); @@ -7680,8 +8212,26 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_get_usecs(NULL), __LINE__); rack_log_to_event(rack, RACK_TO_FRM_TMR, NULL); + rack->r_timer_override = 1; + rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; + rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; + rack->r_late = 0; + rack->r_early = 0; + rack->r_ctl.rc_agg_delayed = 0; + rack->r_ctl.rc_agg_early = 0; if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); + if (tp->t_rxtshift <= rack_rxt_scoreboard_clear_thresh) { + /* + * We do not clear the scoreboard until we have had + * more than rack_rxt_scoreboard_clear_thresh time-outs. + */ + rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rack->r_ctl.rc_resend != NULL) + rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; + + return; + } /* * Ideally we would like to be able to * mark SACK-PASS on anything not acked here. @@ -7714,27 +8264,26 @@ trsm = rsm; if (rsm->r_flags & RACK_ACKED) rsm->r_flags |= RACK_WAS_ACKED; - rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED); + rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS | RACK_RWND_COLLAPSED | RACK_WAS_LOST); rsm->r_flags |= RACK_MUST_RXT; } + /* zero the lost since it's all gone */ + rack->r_ctl.rc_considered_lost = 0; /* Clear the count (we just un-acked them) */ - rack->r_ctl.rc_last_timeout_snduna = tp->snd_una; rack->r_ctl.rc_sacked = 0; rack->r_ctl.rc_sacklast = NULL; - rack->r_ctl.rc_agg_delayed = 0; - rack->r_early = 0; - rack->r_ctl.rc_agg_early = 0; - rack->r_late = 0; /* Clear the tlp rtx mark */ rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); if (rack->r_ctl.rc_resend != NULL) rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; rack->r_ctl.rc_prr_sndcnt = 0; rack_log_to_prr(rack, 6, 0, __LINE__); - rack->r_timer_override = 1; + rack->r_ctl.rc_resend = tqhash_min(rack->r_ctl.tqh); + if (rack->r_ctl.rc_resend != NULL) + rack->r_ctl.rc_resend->r_flags |= RACK_TO_REXT; if ((((tp->t_flags & TF_SACK_PERMIT) == 0) #ifdef TCP_SAD_DETECTION - || (rack->sack_attack_disable != 0) + || (rack->sack_attack_disable != 0) #endif ) && ((tp->t_flags & TF_SENTFIN) == 0)) { /* @@ -7744,9 +8293,8 @@ */ rack->r_must_retran = 1; rack->r_ctl.rc_out_at_rto = ctf_flight_size(rack->rc_tp, - rack->r_ctl.rc_sacked); + rack->r_ctl.rc_sacked); } - rack->r_ctl.rc_snd_max_at_rto = tp->snd_max; } static void @@ -7829,6 +8377,17 @@ rack->r_ctl.retran_during_recovery = 0; rack->rc_ack_required = 1; rack->r_ctl.dsack_byte_cnt = 0; + if (IN_RECOVERY(tp->t_flags) && + (rack->rto_from_rec == 0)) { + /* + * Mark that we had a rto while in recovery + * and save the ssthresh so if we go back + * into recovery we will have a chance + * to slowstart back to the level. + */ + rack->rto_from_rec = 1; + rack->r_ctl.rto_ssthresh = tp->snd_ssthresh; + } if (IN_FASTRECOVERY(tp->t_flags)) tp->t_flags |= TF_WASFRECOVERY; else @@ -7877,7 +8436,6 @@ * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. 
 */
- rack_remxt_tmr(tp);
 if ((rack->r_ctl.rc_resend == NULL) ||
 ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) {
 /*
@@ -7888,6 +8446,7 @@
 */
 tp->t_rxtshift++;
 }
+ rack_remxt_tmr(tp);
 if (tp->t_rxtshift > V_tcp_retries) {
 tcp_log_end_status(tp, TCP_EI_STATUS_RETRAN);
drop_it:
@@ -8240,23 +8799,124 @@
 }
 }
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by the number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times etc.
+ *
+ * So for example when we send a 4344 byte transmission with a 1448
+ * byte segsize, and it is the third time we have retransmitted this
+ * segment, we would add to the rc_cnt_of_retran[2] the value of
+ * 3. That represents 3 MSS that were retransmitted 3 times (index is
+ * the number of times retransmitted minus 1).
+ */
+static void
+rack_peg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz)
+{
+ int idx;
+ uint32_t peg;
+
+ peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1;
+ peg /= segsiz;
+ idx = rsm->r_act_rxt_cnt - 1;
+ if (idx >= RETRAN_CNT_SIZE)
+ idx = RETRAN_CNT_SIZE - 1;
+ /* Max of a uint16_t retransmits in a bucket */
+ if ((rack->r_ctl.rc_cnt_of_retran[idx] + peg) < 0xffff)
+ rack->r_ctl.rc_cnt_of_retran[idx] += peg;
+ else
+ rack->r_ctl.rc_cnt_of_retran[idx] = 0xffff;
+}
+
+/*
+ * We maintain an array of 16 (RETRAN_CNT_SIZE) entries. This
+ * array is zeroed at the start of recovery. Each time a segment
+ * is retransmitted, we translate that into a number of packets
+ * (based on segsiz) and, based on how many times it has been retransmitted,
+ * increment by the number of packets the counter that represents
+ * retransmitted N times. Index 0 is retransmitted 1 time, index 1
+ * is retransmitted 2 times etc.
+ *
+ * The rack_unpeg_rxt is used when we go to retransmit a segment
+ * again. Basically if the segment is being retransmitted, say for
+ * the 3rd time (as in our previous example in the comment above
+ * rack_peg_rxt()), then prior to calling that and incrementing
+ * r_act_rxt_cnt we would have called rack_unpeg_rxt(), which would
+ * subtract back the previous add from its last rxt (in this
+ * example r_act_rxt_cnt would have been 2 for 2 retransmissions). So
+ * we would have subtracted 3 from rc_cnt_of_retran[1] to remove
+ * those 3 segments. You will see this in the rack_update_rsm()
+ * below where we do:
+ * if (rsm->r_act_rxt_cnt > 0) {
+ * rack_unpeg_rxt(rack, rsm, segsiz);
+ * }
+ * rsm->r_act_rxt_cnt++;
+ * rack_peg_rxt(rack, rsm, segsiz);
+ *
+ * This effectively moves the count from rc_cnt_of_retran[1] to
+ * rc_cnt_of_retran[2].
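+ *
+ * Worked numbers for that move (editor's sketch reusing the example
+ * above): the 4344 byte rsm with a 1448 byte segsiz pegs
+ * peg = ((4344 + 1448) - 1) / 1448 = 3 packets. Entering the 3rd
+ * retransmission r_act_rxt_cnt is still 2, so rack_unpeg_rxt()
+ * subtracts 3 from rc_cnt_of_retran[1]; after the increment to 3,
+ * rack_peg_rxt() adds the same 3 to rc_cnt_of_retran[2].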
+ */ +static void +rack_unpeg_rxt(struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t segsiz) +{ + int idx; + uint32_t peg; + + idx = rsm->r_act_rxt_cnt - 1; + if (idx >= RETRAN_CNT_SIZE) + idx = RETRAN_CNT_SIZE - 1; + peg = ((rsm->r_end - rsm->r_start) + segsiz) - 1; + peg /= segsiz; + if (peg < rack->r_ctl.rc_cnt_of_retran[idx]) + rack->r_ctl.rc_cnt_of_retran[idx] -= peg; + else { + /* TSNH */ + rack->r_ctl.rc_cnt_of_retran[idx] = 0; + } +} + static void rack_update_rsm(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, uint16_t add_flag, int segsiz) + struct rack_sendmap *rsm, uint64_t ts, uint32_t add_flag, int segsiz) { int32_t idx; rsm->r_rtr_cnt++; - rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); - rsm->r_dupack = 0; if (rsm->r_rtr_cnt > RACK_NUM_OF_RETRANS) { rsm->r_rtr_cnt = RACK_NUM_OF_RETRANS; rsm->r_flags |= RACK_OVERMAX; } + if (rsm->r_act_rxt_cnt > 0) { + /* Drop the count back for this, its retransmitting again */ + rack_unpeg_rxt(rack, rsm, segsiz); + } + rsm->r_act_rxt_cnt++; + /* Peg the count/index */ + rack_peg_rxt(rack, rsm, segsiz); + rack_log_retran_reason(rack, rsm, __LINE__, 0, 2); + rsm->r_dupack = 0; if ((rsm->r_rtr_cnt > 1) && ((rsm->r_flags & RACK_TLP) == 0)) { rack->r_ctl.rc_holes_rxt += (rsm->r_end - rsm->r_start); rsm->r_rtr_bytes += (rsm->r_end - rsm->r_start); } + if (rsm->r_flags & RACK_WAS_LOST) { + /* + * We retransmitted it putting it back in flight + * remove the lost desgination and reduce the + * bytes considered lost. + */ + rsm->r_flags &= ~RACK_WAS_LOST; + KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) + rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; + else + rack->r_ctl.rc_considered_lost = 0; + } idx = rsm->r_rtr_cnt - 1; rsm->r_tim_lastsent[idx] = ts; /* @@ -8304,7 +8964,7 @@ static uint32_t rack_update_entry(struct tcpcb *tp, struct tcp_rack *rack, - struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint16_t add_flag, int segsiz) + struct rack_sendmap *rsm, uint64_t ts, int32_t *lenp, uint32_t add_flag, int segsiz) { /* * We (re-)transmitted starting at rsm->r_start for some length @@ -8381,7 +9041,7 @@ static void rack_log_output(struct tcpcb *tp, struct tcpopt *to, int32_t len, uint32_t seq_out, uint16_t th_flags, int32_t err, uint64_t cts, - struct rack_sendmap *hintrsm, uint16_t add_flag, struct mbuf *s_mb, + struct rack_sendmap *hintrsm, uint32_t add_flag, struct mbuf *s_mb, uint32_t s_moff, int hw_tls, int segsiz) { struct tcp_rack *rack; @@ -8440,13 +9100,6 @@ len++; if (th_flags & TH_FIN) len++; - if (SEQ_LT(snd_max, tp->snd_nxt)) { - /* - * The add/update as not been done for the FIN/SYN - * yet. - */ - snd_max = tp->snd_nxt; - } } if (SEQ_LEQ((seq_out + len), snd_una)) { /* Are sending an old segment to induce an ack (keep-alive)? 
*/ @@ -8492,6 +9145,7 @@ rsm->r_hw_tls = 1; rsm->r_tim_lastsent[0] = cts; rsm->r_rtr_cnt = 1; + rsm->r_act_rxt_cnt = 0; rsm->r_rtr_bytes = 0; if (th_flags & TH_SYN) { /* The data space is one beyond snd_una */ @@ -8515,6 +9169,10 @@ rsm->r_fas = (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) + (rsm->r_end - rsm->r_start)); + if ((rack->rc_initial_ss_comp == 0) && + (rack->r_ctl.ss_hi_fs < rsm->r_fas)) { + rack->r_ctl.ss_hi_fs = rsm->r_fas; + } /* rsm->m will be NULL if RACK_HAS_SYN or RACK_HAS_FIN is set */ if (rsm->m) { if (rsm->m->m_len <= rsm->soff) { @@ -8558,6 +9216,13 @@ #endif TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; + if (rsm->r_flags & RACK_IS_PCM) { + rack->r_ctl.pcm_i.send_time = cts; + rack->r_ctl.pcm_i.eseq = rsm->r_end; + /* First time through we set the start too */ + if (rack->pcm_in_progress == 0) + rack->r_ctl.pcm_i.sseq = rsm->r_start; + } /* * Special case detection, is there just a single * packet outstanding when we are not in recovery? @@ -8886,6 +9551,7 @@ } stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_PATHRTT, imax(0, rack->r_ctl.rack_rs.rs_us_rtt)); #endif + rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); /* * the retransmit should happen at rtt + 4 * rttvar. Because of the * way we do the smoothing, srtt and rttvar will each average +1/2 @@ -8939,6 +9605,7 @@ val = rack_probertt_lower_within * rack_time_between_probertt; val /= 100; if ((rack->in_probe_rtt == 0) && + (rack->rc_skip_timely == 0) && ((us_cts - rack->r_ctl.rc_lower_rtt_us_cts) >= (rack_time_between_probertt - val))) { rack_enter_probertt(rack, us_cts); } @@ -9051,7 +9718,7 @@ (!IN_FASTRECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ if (rack->r_ctl.rc_tlp_cwnd_reduce) { - rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); + rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); } } if ((rack->r_ctl.rc_rack_tmit_time == 0) || @@ -9198,10 +9865,14 @@ */ static void rack_log_sack_passed(struct tcpcb *tp, - struct tcp_rack *rack, struct rack_sendmap *rsm) + struct tcp_rack *rack, struct rack_sendmap *rsm, uint32_t cts) { struct rack_sendmap *nrsm; + uint32_t thresh; + /* Get our rxt threshold for lost consideration */ + thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(tp, rack), cts, __LINE__, 0); + /* Now start looking at rsm's */ nrsm = rsm; TAILQ_FOREACH_REVERSE_FROM(nrsm, &rack->r_ctl.rc_tmap, rack_head, r_tnext) { @@ -9224,6 +9895,17 @@ */ continue; } + /* Check lost state */ + if ((nrsm->r_flags & RACK_WAS_LOST) == 0) { + uint32_t exp; + + exp = ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)]) + thresh; + if (TSTMP_LT(exp, cts) || (exp == cts)) { + /* We consider it lost */ + nrsm->r_flags |= RACK_WAS_LOST; + rack->r_ctl.rc_considered_lost += nrsm->r_end - nrsm->r_start; + } + } if (nrsm->r_flags & RACK_SACK_PASSED) { /* * We found one that is already marked @@ -9407,8 +10089,6 @@ return (1); } - - static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts, @@ -9625,16 +10305,11 @@ (rsm->bindex == next->bindex) && ((rsm->r_flags & RACK_STRADDLE) == 0) && ((next->r_flags & RACK_STRADDLE) == 0) && + ((rsm->r_flags & RACK_IS_PCM) == 0) && + ((next->r_flags & RACK_IS_PCM) == 0) && (rsm->r_flags & RACK_IN_GP_WIN) && (next->r_flags & RACK_IN_GP_WIN)) can_use_hookery = 1; - else if (next && - (rsm->bindex == next->bindex) && - ((rsm->r_flags & RACK_STRADDLE) == 0) && - ((next->r_flags & 
RACK_STRADDLE) == 0) && - ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && - ((next->r_flags & RACK_IN_GP_WIN) == 0)) - can_use_hookery = 1; else can_use_hookery = 0; if (next && can_use_hookery && @@ -9661,7 +10336,7 @@ nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); /* Now adjust our tree blocks */ - rsm->r_end = start; + tqhash_update_end(rack->r_ctl.tqh, rsm, start); next->r_start = start; rsm->r_flags |= RACK_SHUFFLED; next->r_flags |= RACK_SHUFFLED; @@ -9712,6 +10387,17 @@ if ((nrsm->r_end - nrsm->r_start) >= segsiz) rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); + if (rsm->r_flags & RACK_WAS_LOST) { + int my_chg; + + my_chg = (nrsm->r_end - nrsm->r_start); + KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (my_chg <= rack->r_ctl.rc_considered_lost) + rack->r_ctl.rc_considered_lost -= my_chg; + else + rack->r_ctl.rc_considered_lost = 0; + } if (nrsm->r_flags & RACK_SACK_PASSED) { rack->r_ctl.rc_reorder_ts = cts; if (rack->r_ctl.rc_reorder_ts == 0) @@ -9734,7 +10420,7 @@ * one walk backwards from there. */ if (nrsm && nrsm->r_in_tmap) - rack_log_sack_passed(tp, rack, nrsm); + rack_log_sack_passed(tp, rack, nrsm, cts); } /* Now are we done? */ if (SEQ_LT(end, next->r_end) || @@ -9875,9 +10561,21 @@ /* You get a count for acking a whole segment or more */ if ((rsm->r_end - rsm->r_start) >= segsiz) rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); + if (rsm->r_flags & RACK_WAS_LOST) { + int my_chg; + + my_chg = (rsm->r_end - rsm->r_start); + rsm->r_flags &= ~RACK_WAS_LOST; + KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (my_chg <= rack->r_ctl.rc_considered_lost) + rack->r_ctl.rc_considered_lost -= my_chg; + else + rack->r_ctl.rc_considered_lost = 0; + } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); if (rsm->r_in_tmap) /* should be true */ - rack_log_sack_passed(tp, rack, rsm); + rack_log_sack_passed(tp, rack, rsm, cts); /* Is Reordering occuring? 
*/ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; @@ -9889,6 +10587,7 @@ rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; + rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 0; @@ -9968,19 +10667,13 @@ (rsm->bindex == prev->bindex) && ((rsm->r_flags & RACK_STRADDLE) == 0) && ((prev->r_flags & RACK_STRADDLE) == 0) && + ((rsm->r_flags & RACK_IS_PCM) == 0) && + ((prev->r_flags & RACK_IS_PCM) == 0) && (rsm->r_flags & RACK_IN_GP_WIN) && (prev->r_flags & RACK_IN_GP_WIN)) can_use_hookery = 1; - else if (prev && - (rsm->bindex == prev->bindex) && - ((rsm->r_flags & RACK_STRADDLE) == 0) && - ((prev->r_flags & RACK_STRADDLE) == 0) && - ((rsm->r_flags & RACK_IN_GP_WIN) == 0) && - ((prev->r_flags & RACK_IN_GP_WIN) == 0)) - can_use_hookery = 1; else can_use_hookery = 0; - if (prev && can_use_hookery && (prev->r_flags & RACK_ACKED)) { /** @@ -10003,7 +10696,7 @@ noextra++; nrsm = &stack_map; memcpy(nrsm, rsm, sizeof(struct rack_sendmap)); - prev->r_end = end; + tqhash_update_end(rack->r_ctl.tqh, prev, end); rsm->r_start = end; rsm->r_flags |= RACK_SHUFFLED; prev->r_flags |= RACK_SHUFFLED; @@ -10064,6 +10757,17 @@ rack->r_ctl.ack_count += ((nrsm->r_end - nrsm->r_start) / segsiz); rack->r_ctl.rc_sacked += (nrsm->r_end - nrsm->r_start); + if (rsm->r_flags & RACK_WAS_LOST) { + int my_chg; + + my_chg = (nrsm->r_end - nrsm->r_start); + KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (my_chg <= rack->r_ctl.rc_considered_lost) + rack->r_ctl.rc_considered_lost -= my_chg; + else + rack->r_ctl.rc_considered_lost = 0; + } if (nrsm->r_flags & RACK_SACK_PASSED) { rack->r_ctl.rc_reorder_ts = cts; if (rack->r_ctl.rc_reorder_ts == 0) @@ -10160,10 +10864,22 @@ /* You get a count for acking a whole segment or more */ if ((rsm->r_end - rsm->r_start) >= segsiz) rack->r_ctl.ack_count += ((rsm->r_end - rsm->r_start) / segsiz); - + if (rsm->r_flags & RACK_WAS_LOST) { + int my_chg; + + my_chg = (rsm->r_end - rsm->r_start); + rsm->r_flags &= ~RACK_WAS_LOST; + KASSERT((rack->r_ctl.rc_considered_lost >= my_chg), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (my_chg <= rack->r_ctl.rc_considered_lost) + rack->r_ctl.rc_considered_lost -= my_chg; + else + rack->r_ctl.rc_considered_lost = 0; + } rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); + if (rsm->r_in_tmap) /* should be true */ - rack_log_sack_passed(tp, rack, rsm); + rack_log_sack_passed(tp, rack, rsm, cts); /* Is Reordering occuring? 
*/ if (rsm->r_flags & RACK_SACK_PASSED) { rsm->r_flags &= ~RACK_SACK_PASSED; @@ -10175,6 +10891,7 @@ rack_need_set_test(tp, rack, rsm, tp->snd_una, __LINE__, RACK_USE_END); rsm->r_ack_arrival = rack_to_usec_ts(&rack->r_ctl.act_rcv_time); rsm->r_flags |= RACK_ACKED; + rack_update_pcm_ack(rack, 0, rsm->r_start, rsm->r_end); rack_log_map_chg(tp, rack, NULL, rsm, nrsm, MAP_SACK_M5, end, __LINE__); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); @@ -10214,8 +10931,12 @@ break; if (rsm->r_flags & RACK_STRADDLE) break; + if (rsm->r_flags & RACK_IS_PCM) + break; if (next->r_flags & RACK_STRADDLE) break; + if (next->r_flags & RACK_IS_PCM) + break; if (next->r_flags & RACK_ACKED) { /* yep this and next can be merged */ rsm = rack_merge_rsm(rack, rsm, next); @@ -10242,8 +10963,12 @@ break; if (rsm->r_flags & RACK_STRADDLE) break; + if (rsm->r_flags & RACK_IS_PCM) + break; if (prev->r_flags & RACK_STRADDLE) break; + if (prev->r_flags & RACK_IS_PCM) + break; if (prev->r_flags & RACK_ACKED) { /* yep the previous and this can be merged */ rsm = rack_merge_rsm(rack, prev, rsm); @@ -10264,6 +10989,9 @@ /* Pass back the moved. */ *moved_two = moved; *no_extra = noextra; + if (IN_RECOVERY(tp->t_flags)) { + rack->r_ctl.bytes_acked_in_recovery += changed; + } return (changed); } @@ -10464,6 +11192,17 @@ * RTT's. */ + if (sack_filter_blks_used(&rack->r_ctl.rack_sf)) { + /* + * If we have some sack blocks in the filter + * lets prune them out by calling sfb with no blocks. + */ + sack_filter_blks(&rack->r_ctl.rack_sf, NULL, 0, th_ack); + } + if (SEQ_GT(th_ack, tp->snd_una)) { + /* Clear any app ack remembered settings */ + rack->r_ctl.cleared_app_ack = 0; + } rack->r_wanted_output = 1; if (SEQ_GT(th_ack, tp->snd_una)) rack->r_ctl.last_cumack_advance = acktime; @@ -10533,10 +11272,10 @@ return; } #ifdef INVARIANTS - panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u snd_nxt:%u\n", + panic("No rack map tp:%p for state:%d ack:%u rack:%p snd_una:%u snd_max:%u\n", tp, tp->t_state, th_ack, rack, - tp->snd_una, tp->snd_max, tp->snd_nxt); + tp->snd_una, tp->snd_max); #endif return; } @@ -10599,6 +11338,20 @@ uint32_t left; uint8_t newly_acked; + if (rsm->r_flags & RACK_WAS_LOST) { + /* + * This can happen when we marked it as lost + * and yet before retransmitting we get an ack + * which can happen due to reordering. + */ + rsm->r_flags &= ~RACK_WAS_LOST; + KASSERT((rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)), + ("rsm:%p rack:%p rc_considered_lost goes negative", rsm, rack)); + if (rack->r_ctl.rc_considered_lost >= (rsm->r_end - rsm->r_start)) + rack->r_ctl.rc_considered_lost -= rsm->r_end - rsm->r_start; + else + rack->r_ctl.rc_considered_lost = 0; + } rack_log_map_chg(tp, rack, NULL, rsm, NULL, MAP_FREE, rsm->r_end, __LINE__); rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; @@ -10613,6 +11366,10 @@ rsm->r_in_tmap = 0; } newly_acked = 1; + if (((rsm->r_flags & RACK_ACKED) == 0) && + (IN_RECOVERY(tp->t_flags))) { + rack->r_ctl.bytes_acked_in_recovery += (rsm->r_end - rsm->r_start); + } if (rsm->r_flags & RACK_ACKED) { /* * It was acked on the scoreboard -- remove @@ -10639,6 +11396,9 @@ */ rack->r_might_revert = 1; } + rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); + } else { + rack_update_pcm_ack(rack, 1, rsm->r_start, rsm->r_end); } if ((rsm->r_flags & RACK_TO_REXT) && (tp->t_flags & TF_RCVD_TSTMP) && @@ -10691,6 +11451,27 @@ * total for the part being cum-acked. 
*/ rack->r_ctl.rc_sacked -= (th_ack - rsm->r_start); + } else { + if (((rsm->r_flags & RACK_ACKED) == 0) && + (IN_RECOVERY(tp->t_flags))) { + rack->r_ctl.bytes_acked_in_recovery += (th_ack - rsm->r_start); + } + rack_update_pcm_ack(rack, 1, rsm->r_start, th_ack); + } + /* And what about the lost flag? */ + if (rsm->r_flags & RACK_WAS_LOST) { + /* + * This can happen when we marked it as lost + * and yet before retransmitting we get an ack + * which can happen due to reordering. In this + * case its only a partial ack of the send. + */ + KASSERT((rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)), + ("rsm:%p rack:%p rc_considered_lost goes negative th_ack:%u", rsm, rack, th_ack)); + if (rack->r_ctl.rc_considered_lost >= (th_ack - rsm->r_start)) + rack->r_ctl.rc_considered_lost -= th_ack - rsm->r_start; + else + rack->r_ctl.rc_considered_lost = 0; } /* * Clear the dup ack count for @@ -10807,7 +11588,26 @@ tp->snd_ssthresh = rack->r_ctl.rc_ssthresh_at_erec; tp->snd_recover = tp->snd_una; rack_log_to_prr(rack, 14, orig_cwnd, __LINE__); - EXIT_RECOVERY(tp->t_flags); + if (IN_RECOVERY(tp->t_flags)) { + rack_exit_recovery(tp, rack, 3); + if ((rack->rto_from_rec == 1) && (rack_ssthresh_rest_rto_rec != 0) ){ + /* + * We were in recovery, had an RTO + * and then re-entered recovery (more sack's arrived) + * and we have properly recorded the old ssthresh from + * the first recovery. We want to be able to slow-start + * back to this level. The ssthresh from the timeout + * and then back into recovery will end up most likely + * to be min(cwnd=1mss, 2mss). Which makes it basically + * so we get no slow-start after our RTO. + */ + rack->rto_from_rec = 0; + if (rack->r_ctl.rto_ssthresh > tp->snd_ssthresh) + tp->snd_ssthresh = rack->r_ctl.rto_ssthresh; + } + } + rack->r_ctl.bytes_acked_in_recovery = 0; + rack->r_ctl.time_entered_recovery = 0; } rack->r_might_revert = 0; } @@ -11062,7 +11862,8 @@ static uint32_t do_rack_compute_pipe(struct tcpcb *tp, struct tcp_rack *rack, uint32_t snd_una) { - return (((tp->snd_max - snd_una) - rack->r_ctl.rc_sacked) + rack->r_ctl.rc_holes_rxt); + return (((tp->snd_max - snd_una) - + (rack->r_ctl.rc_sacked + rack->r_ctl.rc_considered_lost)) + rack->r_ctl.rc_holes_rxt); } static int32_t @@ -11505,7 +12306,7 @@ ((rsm->r_flags & RACK_MUST_RXT) == 0)) { /* Enter recovery */ entered_recovery = 1; - rack_cong_signal(tp, CC_NDUPACK, tp->snd_una, __LINE__); + rack_cong_signal(tp, CC_NDUPACK, th_ack, __LINE__); /* * When we enter recovery we need to assure we send * one packet. @@ -11547,7 +12348,7 @@ } static void -rack_strike_dupack(struct tcp_rack *rack) +rack_strike_dupack(struct tcp_rack *rack, tcp_seq th_ack) { struct rack_sendmap *rsm; @@ -11581,7 +12382,7 @@ if (rack->r_ctl.rc_resend != NULL) { if (!IN_FASTRECOVERY(rack->rc_tp->t_flags)) { rack_cong_signal(rack->rc_tp, CC_NDUPACK, - rack->rc_tp->snd_una, __LINE__); + th_ack, __LINE__); } rack->r_wanted_output = 1; rack->r_timer_override = 1; @@ -11598,6 +12399,25 @@ struct tcp_rack *rack, struct socket *so) { + /* + * So what is dragging bottom? + * + * Dragging bottom means you were under pacing and had a + * delay in processing inbound acks waiting on our pacing + * timer to expire. While you were waiting all of the acknowledgments + * for the packets you sent have arrived. 
This means we are pacing + * way underneath the bottleneck to the point where our Goodput + * measurements stop working, since they require more than one + * ack (usually at least 8 packets worth with multiple acks so we can + * gauge the inter-ack times). If that occurs we have a real problem + * since we are stuck in a hole that we can't get out of without + * something speeding us up. + * + * We also check to see if we are widdling down to just one segment + * outstanding. If this occurs and we have room to send in our cwnd/rwnd + * then we are adding the delayed ack interval into our measurments and + * we need to speed up slightly. + */ uint32_t segsiz, minseg; segsiz = ctf_fixed_maxseg(tp); @@ -11614,10 +12434,13 @@ */ uint64_t lt_bw; + tcp_trace_point(rack->rc_tp, TCP_TP_PACED_BOTTOM); lt_bw = rack_get_lt_bw(rack); rack->rc_dragged_bottom = 1; rack_validate_multipliers_at_or_above100(rack); if ((rack->r_ctl.rack_rs.rs_flags & RACK_RTT_VALID) && + (rack->dis_lt_bw == 0) && + (rack->use_lesser_lt_bw == 0) && (lt_bw > 0)) { /* * Lets use the long-term b/w we have @@ -11729,7 +12552,7 @@ log.u_bbr.delivered = (uint32_t)((cur->end >> 32) & 0x00000000ffffffff) ; log.u_bbr.epoch = (uint32_t)(cur->deadline & 0x00000000ffffffff); log.u_bbr.lt_epoch = (uint32_t)((cur->deadline >> 32) & 0x00000000ffffffff) ; - log.u_bbr.bbr_state = 1; + log.u_bbr.inhpts = 1; #ifdef TCP_REQUEST_TRK off = (uint64_t)(cur) - (uint64_t)(&rack->rc_tp->t_tcpreq_info[0]); log.u_bbr.use_lt_bw = (uint8_t)(off / sizeof(struct tcp_sendfile_track)); @@ -11745,6 +12568,20 @@ log.u_bbr.flex7 |= rack->rc_hybrid_mode; log.u_bbr.flex7 <<= 1; log.u_bbr.flex7 |= rack->dgp_on; + /* + * Compose bbr_state to be a bit wise 0000ADHF + * where A is the always_pace flag + * where D is the dgp_on flag + * where H is the hybrid_mode on flag + * where F is the use_fixed_rate flag. 
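+ *
+ * For example (editor's note, values assumed): always_pace = 1,
+ * dgp_on = 1, hybrid_mode = 0 and use_fixed_rate = 0 compose to
+ * 0000 1100 (0xc) after the shift/or steps below.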
+ */ + log.u_bbr.bbr_state = rack->rc_always_pace; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->dgp_on; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->rc_hybrid_mode; + log.u_bbr.bbr_state <<= 1; + log.u_bbr.bbr_state |= rack->use_fixed_rate; log.u_bbr.flex8 = mod; log.u_bbr.delRate = rack->r_ctl.bw_rate_cap; log.u_bbr.bbr_substate = rack->r_ctl.client_suggested_maxseg; @@ -11763,12 +12600,13 @@ #ifdef TCP_REQUEST_TRK static void -rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len) +rack_set_dgp_hybrid_mode(struct tcp_rack *rack, tcp_seq seq, uint32_t len, uint64_t cts) { - struct tcp_sendfile_track *rc_cur; + struct tcp_sendfile_track *rc_cur, *orig_ent; struct tcpcb *tp; int err = 0; + orig_ent = rack->r_ctl.rc_last_sft; rc_cur = tcp_req_find_req_for_seq(rack->rc_tp, seq); if (rc_cur == NULL) { /* If not in the beginning what about the end piece */ @@ -11781,11 +12619,17 @@ /* If we find no parameters we are in straight DGP mode */ if(rc_cur == NULL) { /* None found for this seq, just DGP for now */ - rack->r_ctl.client_suggested_maxseg = 0; - rack->rc_catch_up = 0; - rack->r_ctl.bw_rate_cap = 0; - if (rack->rc_hybrid_mode) + if (rack->rc_hybrid_mode) { + rack->r_ctl.client_suggested_maxseg = 0; + rack->rc_catch_up = 0; + if (rack->cspr_is_fcc == 0) + rack->r_ctl.bw_rate_cap = 0; + else + rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; + } + if (rack->rc_hybrid_mode) { rack_log_hybrid(rack, (seq + len - 1), NULL, HYBRID_LOG_NO_RANGE, __LINE__, err); + } if (rack->r_ctl.rc_last_sft) { rack->r_ctl.rc_last_sft = NULL; } @@ -11793,6 +12637,20 @@ } if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_WASSET) == 0) { /* This entry was never setup for hybrid pacing on/off etc */ + if (rack->rc_hybrid_mode) { + rack->r_ctl.client_suggested_maxseg = 0; + rack->rc_catch_up = 0; + rack->r_ctl.bw_rate_cap = 0; + } + if (rack->r_ctl.rc_last_sft) { + rack->r_ctl.rc_last_sft = NULL; + } + if ((rc_cur->flags & TCP_TRK_TRACK_FLG_FSND) == 0) { + rc_cur->flags |= TCP_TRK_TRACK_FLG_FSND; + rc_cur->first_send = cts; + rc_cur->sent_at_fs = rack->rc_tp->t_sndbytes; + rc_cur->rxt_at_fs = rack->rc_tp->t_snd_rxt_bytes; + } return; } /* @@ -11812,18 +12670,40 @@ } if (rack->rc_hybrid_mode == 0) { rack->r_ctl.rc_last_sft = rc_cur; + if (orig_ent) { + orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; + orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; + orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; + } rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); return; } if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CSPR) && rc_cur->cspr){ /* Compensate for all the header overhead's */ - rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); - } else - rack->r_ctl.bw_rate_cap = 0; + if (rack->cspr_is_fcc == 0) + rack->r_ctl.bw_rate_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); + else + rack->r_ctl.fillcw_cap = rack_compensate_for_linerate(rack, rc_cur->cspr); + } else { + if (rack->rc_hybrid_mode) { + if (rack->cspr_is_fcc == 0) + rack->r_ctl.bw_rate_cap = 0; + else + rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; + } + } if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_H_MS) rack->r_ctl.client_suggested_maxseg = rc_cur->hint_maxseg; else rack->r_ctl.client_suggested_maxseg = 0; + if (rc_cur->timestamp == rack->r_ctl.last_tm_mark) { + /* + * It is the same timestamp as the previous one + * add the hybrid flag that will indicate we use + * sendtime not arrival time for catch-up mode. 
+ */ + rc_cur->hybrid_flags |= TCP_HYBRID_PACING_SENDTIME; + } if ((rc_cur->hybrid_flags & TCP_HYBRID_PACING_CU) && (rc_cur->cspr > 0)) { uint64_t len; @@ -11833,7 +12713,20 @@ * Calculate the deadline time, first set the * time to when the request arrived. */ - rc_cur->deadline = rc_cur->localtime; + if (rc_cur->hybrid_flags & TCP_HYBRID_PACING_SENDTIME) { + /* + * For cases where its a duplicate tm (we received more + * than one request for a tm) we want to use now, the point + * where we are just sending the first bit of the request. + */ + rc_cur->deadline = cts; + } else { + /* + * Here we have a different tm from the last request + * so we want to use arrival time as our base. + */ + rc_cur->deadline = rc_cur->localtime; + } /* * Next calculate the length and compensate for * TLS if need be. @@ -11867,9 +12760,15 @@ */ rack_set_pace_segments(tp, rack, __LINE__, NULL); } + if (orig_ent) { + orig_ent->sent_at_ls = rack->rc_tp->t_sndbytes; + orig_ent->rxt_at_ls = rack->rc_tp->t_snd_rxt_bytes; + orig_ent->flags |= TCP_TRK_TRACK_FLG_LSND; + } rack_log_hybrid(rack, seq, rc_cur, HYBRID_LOG_RULES_APP, __LINE__, 0); /* Remember it for next time and for CU mode */ rack->r_ctl.rc_last_sft = rc_cur; + rack->r_ctl.last_tm_mark = rc_cur->timestamp; } #endif @@ -11884,7 +12783,7 @@ (ent->flags == TCP_TRK_TRACK_FLG_EMPTY) || (SEQ_GEQ(seq, ent->end_seq))) { /* Time to update the track. */ - rack_set_dgp_hybrid_mode(rack, seq, len); + rack_set_dgp_hybrid_mode(rack, seq, len, cts); ent = rack->r_ctl.rc_last_sft; } /* Out of all */ @@ -12116,8 +13015,17 @@ * if so be sure to NULL the pointer so we know we are no longer * set to anything. */ - if (ent == rack->r_ctl.rc_last_sft) + if (ent == rack->r_ctl.rc_last_sft) { rack->r_ctl.rc_last_sft = NULL; + if (rack->rc_hybrid_mode) { + rack->rc_catch_up = 0; + if (rack->cspr_is_fcc == 0) + rack->r_ctl.bw_rate_cap = 0; + else + rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; + rack->r_ctl.client_suggested_maxseg = 0; + } + } /* Generate the log that the tcp_netflix call would have */ tcp_req_log_req_info(rack->rc_tp, ent, i, TCP_TRK_REQ_LOG_FREED, 0, 0); @@ -12139,7 +13047,7 @@ rack_process_ack(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, - int32_t * ofia, int32_t thflags, int32_t *ret_val) + int32_t * ofia, int32_t thflags, int32_t *ret_val, int32_t orig_tlen) { int32_t ourfinisacked = 0; int32_t nsegs, acked_amount; @@ -12147,7 +13055,8 @@ struct mbuf *mfree; struct tcp_rack *rack; int32_t under_pacing = 0; - int32_t recovery = 0; + int32_t post_recovery = 0; + uint32_t p_cwnd; INP_WLOCK_ASSERT(tptoinpcb(tp)); @@ -12176,8 +13085,9 @@ if ((th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd) && + (orig_tlen == 0) && ((to->to_flags & TOF_SACK) == 0)) { - rack_strike_dupack(rack); + rack_strike_dupack(rack, th->th_ack); dup_ack_struck = 1; } rack_log_ack(tp, to, th, ((in_rec == 0) && IN_FASTRECOVERY(tp->t_flags)), @@ -12185,6 +13095,7 @@ if ((rack->sack_attack_disable > 0) && (th->th_ack == tp->snd_una) && (tiwin == tp->snd_wnd) && + (orig_tlen == 0) && (dsack_seen == 0) && (sacks_seen > 0)) { /* @@ -12197,7 +13108,7 @@ * were we are ignoring sacks from this guy due to * it being a suspected attacker. 
*/ - rack_strike_dupack(rack); + rack_strike_dupack(rack, th->th_ack); } } @@ -12306,15 +13217,37 @@ tcp_rack_partialack(tp); } else { rack_post_recovery(tp, th->th_ack); - recovery = 1; + post_recovery = 1; + /* + * Grab the segsiz, multiply by 2 and add the snd_cwnd + * that is the max the CC should add if we are exiting + * recovery and doing a late add. + */ + p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + p_cwnd <<= 1; + p_cwnd += tp->snd_cwnd; } + } else if ((rack->rto_from_rec == 1) && + SEQ_GEQ(th->th_ack, tp->snd_recover)) { + /* + * We were in recovery, hit a rxt timeout + * and never re-entered recovery. The timeout(s) + * made up all the lost data. In such a case + * we need to clear the rto_from_rec flag. + */ + rack->rto_from_rec = 0; } /* * Let the congestion control algorithm update congestion control * related information. This typically means increasing the * congestion window. */ - rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, recovery); + rack_ack_received(tp, rack, th->th_ack, nsegs, CC_ACK, post_recovery); + if (post_recovery && + (tp->snd_cwnd > p_cwnd)) { + /* Must be non-newreno (cubic) getting too ahead of itself */ + tp->snd_cwnd = p_cwnd; + } SOCKBUF_LOCK(&so->so_snd); acked_amount = min(acked, (int)sbavail(&so->so_snd)); tp->snd_wnd -= acked_amount; @@ -12338,13 +13271,6 @@ rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); /* NB: sowwakeup_locked() does an implicit unlock. */ sowwakeup_locked(so); - /* now check the rxt clamps */ - if ((recovery == 1) && - (rack->excess_rxt_on) && - (rack->r_cwnd_was_clamped == 0)) { - do_rack_excess_rxt(tp, rack); - } else if (rack->r_cwnd_was_clamped) - do_rack_check_for_unclamp(tp, rack); m_freem(mfree); if (SEQ_GT(tp->snd_una, tp->snd_recover)) tp->snd_recover = tp->snd_una; @@ -12363,11 +13289,12 @@ if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ tp->t_flags &= ~TF_PREVVALID; + rack->r_ctl.idle_snd_una = tp->snd_una; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); - rack->r_ctl.retran_during_recovery = 0; - rack->r_ctl.dsack_byte_cnt = 0; if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; + rack->r_ctl.retran_during_recovery = 0; + rack->r_ctl.dsack_byte_cnt = 0; rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); if (sbavail(&tptosocket(tp)->so_snd) == 0) tp->t_acktime = 0; @@ -12562,7 +13489,6 @@ } } - /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still @@ -12713,12 +13639,20 @@ SOCKBUF_LOCK(&so->so_rcv); if (so->so_rcv.sb_state & SBS_CANTRCVMORE) { m_freem(m); - } else + } else { + int32_t newsize; + + if (tlen > 0) { + newsize = tcp_autorcvbuf(m, th, so, tp, tlen); + if (newsize) + if (!sbreserve_locked(so, SO_RCV, newsize, NULL)) + so->so_rcv.sb_flags &= ~SB_AUTOSIZE; + } #ifdef NETFLIX_SB_LIMITS appended = #endif sbappendstream_locked(&so->so_rcv, m, 0); - + } rack_log_wakeup(tp,rack, &so->so_rcv, tlen, 1); /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); @@ -12877,9 +13811,6 @@ if (__predict_false(th->th_seq != tp->rcv_nxt)) { return (0); } - if (__predict_false(tp->snd_nxt != tp->snd_max)) { - return (0); - } if (tiwin && tiwin != tp->snd_wnd) { return (0); } @@ -13005,10 +13936,6 @@ /* Above what we have sent? 
*/ return (0); } - if (__predict_false(tp->snd_nxt != tp->snd_max)) { - /* We are retransmitting */ - return (0); - } if (__predict_false(tiwin == 0)) { /* zero window */ return (0); @@ -13176,6 +14103,7 @@ rack->r_ctl.retran_during_recovery = 0; rack->rc_suspicious = 0; rack->r_ctl.dsack_byte_cnt = 0; + rack->r_ctl.idle_snd_una = tp->snd_una; rack->r_ctl.rc_went_idle_time = tcp_get_usecs(NULL); if (rack->r_ctl.rc_went_idle_time == 0) rack->r_ctl.rc_went_idle_time = 1; @@ -13203,6 +14131,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; int32_t todrop; int32_t ourfinisacked = 0; struct tcp_rack *rack; @@ -13267,8 +14196,9 @@ */ if (IS_FASTOPEN(tp->t_flags) && (tp->snd_una != tp->snd_max)) { - tp->snd_nxt = th->th_ack; - tfo_partial = 1; + /* Was it a partial ack? */ + if (SEQ_LT(th->th_ack, tp->snd_max)) + tfo_partial = 1; } /* * If there's data, delay ACK; if there's also a FIN ACKNOW @@ -13299,6 +14229,24 @@ * and there is no send_map. */ tp->snd_una++; + if (tfo_partial && (SEQ_GT(tp->snd_max, tp->snd_una))) { + /* + * We sent a SYN with data, and thus have a + * sendmap entry with a SYN set. Lets find it + * and take off the send bit and the byte and + * set it up to be what we send (send it next). + */ + struct rack_sendmap *rsm; + + rsm = tqhash_min(rack->r_ctl.tqh); + if (rsm) { + if (rsm->r_flags & RACK_HAS_SYN) { + rsm->r_flags &= ~RACK_HAS_SYN; + rsm->r_start++; + } + rack->r_ctl.rc_resend = rsm; + } + } } /* * Received in SYN_SENT[*] state. Transitions: @@ -13361,7 +14309,7 @@ tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ if (tp->t_state == TCPS_FIN_WAIT_1) { @@ -13407,6 +14355,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { struct tcp_rack *rack; + int32_t orig_tlen = tlen; int32_t ret_val = 0; int32_t ourfinisacked = 0; @@ -13579,7 +14528,7 @@ tcp_rack_xmit_timer(rack, t + 1, 1, t, 0, NULL, 2); tcp_rack_xmit_timer_commit(rack, tp); } - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (tp->t_state == TCPS_FIN_WAIT_1) { @@ -13624,6 +14573,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; struct tcp_rack *rack; /* @@ -13730,7 +14680,7 @@ /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (sbavail(&so->so_snd)) { @@ -13756,6 +14706,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -13830,7 +14781,7 @@ /* * Ack processing. 
*/ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, NULL, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (sbavail(&so->so_snd)) { @@ -13884,6 +14835,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; struct tcp_rack *rack; @@ -13966,7 +14918,7 @@ /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (ourfinisacked) { @@ -14011,6 +14963,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; struct tcp_rack *rack; @@ -14093,7 +15046,7 @@ /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (ourfinisacked) { @@ -14124,6 +15077,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen; int32_t ourfinisacked = 0; struct tcp_rack *rack; @@ -14152,6 +15106,7 @@ if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } + orig_tlen = tlen; if (_ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val, &rack->r_ctl.challenge_ack_ts, &rack->r_ctl.challenge_ack_cnt)) { @@ -14206,7 +15161,7 @@ /* * case TCPS_LAST_ACK: Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (ourfinisacked) { @@ -14237,6 +15192,7 @@ uint32_t tiwin, int32_t thflags, int32_t nxt_pkt, uint8_t iptos) { int32_t ret_val = 0; + int32_t orig_tlen = tlen; int32_t ourfinisacked = 0; struct tcp_rack *rack; @@ -14320,7 +15276,7 @@ /* * Ack processing. */ - if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { + if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val, orig_tlen)) { return (ret_val); } if (sbavail(&so->so_snd)) { @@ -14919,65 +15875,43 @@ } static void -rack_translate_clamp_value(struct tcp_rack *rack, uint32_t optval) +rack_translate_policer_detect(struct tcp_rack *rack, uint32_t optval) { /* - * P = percent bits - * F = fill cw bit -- Toggle fillcw if this bit is set. - * S = Segment bits - * M = set max segment bit - * U = Unclamined - * C = If set to non-zero override the max number of clamps. - * L = Bit to indicate if clamped gets lower. + * P = Percent of retransmits 499 = 49.9% + * A = Average number 1 (.1%) -> 169 (16.9%) + * M = Median number of retrans 1 - 16 + * MMMM MMMM AAAA AAAA PPPP PPPP PPPP PPPP * - * CCCC CCCCC UUUU UULF PPPP PPPP PPPP PPPP - * - * The lowest 3 nibbles is the perentage .1 - 6553.5% - * where 10.1 = 101, max 6553.5 - * The upper 16 bits holds some options. - * The F bit will turn on fill-cw on if you are - * not pacing, it will turn it off if dgp is on. - * The L bit will change it so when clamped we get - * the min(gp, lt-bw) for dgp. 
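+ * For example (illustrative value only) an optval of 0x0408012C decodes
+ * to M = 4, A = 8 (0.8%) and P = 300 (30.0%); detection is enabled
+ * only when all three fields are non-zero.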
*/ - uint16_t per; + uint16_t per, upp; - rack->r_ctl.saved_rxt_clamp_val = optval; per = optval & 0x0000ffff; - rack->r_ctl.rxt_threshold = (uint64_t)(per & 0xffff); - if (optval > 0) { - uint16_t clamp_opt; - - rack->excess_rxt_on = 1; - clamp_opt = ((optval & 0xffff0000) >> 16); - rack->r_ctl.clamp_options = clamp_opt & 0x00ff; - if (clamp_opt & 0xff00) { - /* A max clamps is also present */ - rack->r_ctl.max_clamps = (clamp_opt >> 8); - } else { - /* No specified clamps means no limit */ - rack->r_ctl.max_clamps = 0; - } - if (rack->r_ctl.clamp_options & 0x0002) { - rack->r_clamped_gets_lower = 1; - } else { - rack->r_clamped_gets_lower = 0; - } + rack->r_ctl.policer_rxt_threshold = (uint32_t)(per & 0xffff); + upp = ((optval & 0xffff0000) >> 16); + rack->r_ctl.policer_avg_threshold = (0x00ff & upp); + rack->r_ctl.policer_med_threshold = ((upp >> 8) & 0x00ff); + if ((rack->r_ctl.policer_rxt_threshold > 0) && + (rack->r_ctl.policer_avg_threshold > 0) && + (rack->r_ctl.policer_med_threshold > 0)) { + rack->policer_detect_on = 1; } else { - /* Turn it off back to default */ - rack->excess_rxt_on = 0; - rack->r_clamped_gets_lower = 0; + rack->policer_detect_on = 0; } - + rack->r_ctl.saved_policer_val = optval; + policer_detection_log(rack, optval, + rack->r_ctl.policer_avg_threshold, + rack->r_ctl.policer_med_threshold, + rack->r_ctl.policer_rxt_threshold, 11); } - static int32_t rack_init(struct tcpcb *tp, void **ptr) { struct inpcb *inp = tptoinpcb(tp); struct tcp_rack *rack = NULL; uint32_t iwin, snt, us_cts; + size_t sz; int err, no_query; tcp_hpts_init(tp); @@ -15036,16 +15970,22 @@ rack->rc_new_rnd_needed = 1; rack->r_ctl.rc_split_limit = V_tcp_map_split_limit; /* We want abe like behavior as well */ + rack->r_ctl.rc_saved_beta.newreno_flags |= CC_NEWRENO_BETA_ECN_ENABLED; rack->r_ctl.rc_reorder_fade = rack_reorder_fade; rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; - if (rack_rxt_clamp_thresh) { - rack_translate_clamp_value(rack, rack_rxt_clamp_thresh); - rack->excess_rxt_on = 1; + rack->r_ctl.policer_del_mss = rack_req_del_mss; + if ((rack_policer_rxt_thresh > 0) && + (rack_policer_avg_thresh > 0) && + (rack_policer_med_thresh > 0)) { + rack->r_ctl.policer_rxt_threshold = rack_policer_rxt_thresh; + rack->r_ctl.policer_avg_threshold = rack_policer_avg_thresh; + rack->r_ctl.policer_med_threshold = rack_policer_med_thresh; + rack->policer_detect_on = 1; + } else { + rack->policer_detect_on = 0; } - if (rack_uses_full_dgp_in_rec) - rack->r_ctl.full_dgp_in_rec = 1; if (rack_fill_cw_state) rack->rc_pace_to_cwnd = 1; if (rack_pacing_min_seg) @@ -15063,6 +16003,15 @@ if (rack_tcp_accounting) { tp->t_flags2 |= TF2_TCP_ACCOUNTING; } +#endif + rack->r_ctl.pcm_i.cnt_alloc = RACK_DEFAULT_PCM_ARRAY; + sz = (sizeof(struct rack_pcm_stats) * rack->r_ctl.pcm_i.cnt_alloc); + rack->r_ctl.pcm_s = malloc(sz,M_TCPPCM, M_NOWAIT); + if (rack->r_ctl.pcm_s == NULL) { + rack->r_ctl.pcm_i.cnt_alloc = 0; + } +#ifdef NETFLIX_STATS + rack->r_ctl.side_chan_dis_mask = tcp_sidechannel_disable_mask; #endif rack->r_ctl.rack_per_upper_bound_ss = (uint8_t)rack_per_upper_bound_ss; rack->r_ctl.rack_per_upper_bound_ca = (uint8_t)rack_per_upper_bound_ca; @@ -15070,6 +16019,7 @@ rack->rack_enable_scwnd = 1; rack->r_ctl.pace_len_divisor = rack_default_pacing_divisor; rack->rc_user_set_max_segs = rack_hptsi_segments; + rack->r_ctl.max_reduction = rack_max_reduce; rack->rc_force_max_seg = 0; TAILQ_INIT(&rack->r_ctl.opt_list); rack->r_ctl.rc_saved_beta.beta = 
V_newreno_beta_ecn; @@ -15084,12 +16034,22 @@ } else { rack->r_ctl.saved_hibeta = 50; } + /* + * We initialize to all ones so we never match 0 + * just in case the client sends in 0, it hopefully + * will never have all 1's in ms :-) + */ + rack->r_ctl.last_tm_mark = 0xffffffffffffffff; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; + rack->r_ctl.pol_bw_comp = rack_policing_do_bw_comp; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_lowest_us_rtt = 0xffffffff; rack->r_ctl.rc_highest_us_rtt = 0; rack->r_ctl.bw_rate_cap = rack_bw_rate_cap; + rack->pcm_enabled = rack_pcm_is_enabled; + if (rack_fillcw_bw_cap) + rack->r_ctl.fillcw_cap = rack_fillcw_bw_cap; rack->r_ctl.timer_slop = TICKS_2_USEC(tcp_rexmit_slop); if (rack_use_cmp_acks) rack->r_use_cmp_ack = 1; @@ -15098,6 +16058,7 @@ if (rack_gp_no_rec_chg) rack->rc_gp_no_rec_chg = 1; if (rack_pace_every_seg && tcp_can_enable_pacing()) { + rack->r_ctl.pacing_method |= RACK_REG_PACING; rack->rc_always_pace = 1; if (rack->rack_hibeta) rack_set_cc_pacing(rack); @@ -15114,13 +16075,31 @@ rack->r_limit_scw = 0; rack_init_retransmit_value(rack, rack_rxt_controls); rack->rc_labc = V_tcp_abc_l_var; + if (rack_honors_hpts_min_to) + rack->r_use_hpts_min = 1; + if (tp->snd_una != 0) { + rack->r_ctl.idle_snd_una = tp->snd_una; + rack->rc_sendvars_notset = 0; + /* + * Make sure any TCP timers are not running. + */ + tcp_timer_stop(tp); + } else { + /* + * Server side, we are called from the + * syn-cache. This means none of the + * snd_una/max are set yet so we have + * to defer this until the first send. + */ + rack->rc_sendvars_notset = 1; + } + rack->r_ctl.rc_rate_sample_method = rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; microuptime(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_last_time_decay = rack->r_ctl.act_rcv_time; - rack->rc_init_win = rack_default_init_window; rack->r_ctl.rack_per_of_gp_ss = rack_per_of_gp_ss; if (rack_hw_up_only) rack->r_up_only = 1; @@ -15132,15 +16111,34 @@ } else rack->r_ctl.rack_per_of_gp_ca = rack_per_of_gp_ca; rack->r_ctl.rack_per_of_gp_rec = rack_per_of_gp_rec; + if (rack_timely_off) { + rack->rc_skip_timely = 1; + } + if (rack->rc_skip_timely) { + rack->r_ctl.rack_per_of_gp_rec = 90; + rack->r_ctl.rack_per_of_gp_ca = 100; + rack->r_ctl.rack_per_of_gp_ss = 250; + } rack->r_ctl.rack_per_of_gp_probertt = rack_per_of_gp_probertt; rack->r_ctl.rc_tlp_rxt_last_time = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); + rack->r_ctl.last_rcv_tstmp_for_rtt = tcp_tv_to_mssectick(&rack->r_ctl.act_rcv_time); + setup_time_filter_small(&rack->r_ctl.rc_gp_min_rtt, FILTER_TYPE_MIN, rack_probertt_filter_life); us_cts = tcp_tv_to_usectick(&rack->r_ctl.act_rcv_time); rack->r_ctl.rc_lower_rtt_us_cts = us_cts; rack->r_ctl.rc_time_of_last_probertt = us_cts; - rack->r_ctl.challenge_ack_ts = tcp_ts_getticks(); + rack->r_ctl.rc_went_idle_time = us_cts; + rack->r_ctl.challenge_ack_ts = tcp_ts_getticks() - (tcp_ack_war_time_window + 1); rack->r_ctl.rc_time_probertt_starts = 0; + + rack->r_ctl.gp_rnd_thresh = rack_rnd_cnt_req & 0xff; + if (rack_rnd_cnt_req & 0x10000) + rack->r_ctl.gate_to_fs = 1; + rack->r_ctl.gp_gain_req = rack_gp_gain_req; + if ((rack_rnd_cnt_req & 0x100) > 0) { + + } if (rack_dsack_std_based & 0x1) { /* Basically this means all rack timers are at least (srtt + 1/4 srtt) */ rack->rc_rack_tmr_std_based = 1; @@ -15449,10 +16447,8 @@ 
rack->r_ctl.fsb.tcp_ip_hdr = NULL; rack->r_ctl.fsb.th = NULL; } - if (rack->rc_always_pace) { - tcp_decrement_paced_conn(); - rack_undo_cc_pacing(rack); - rack->rc_always_pace = 0; + if (rack->rc_always_pace == 1) { + rack_remove_pacing(rack); } /* Clean up any options if they were not applied */ while (!TAILQ_EMPTY(&rack->r_ctl.opt_list)) { @@ -15492,6 +16488,12 @@ uma_zfree(rack_zone, rsm); rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); } + if (rack->r_ctl.pcm_s != NULL) { + free(rack->r_ctl.pcm_s, M_TCPPCM); + rack->r_ctl.pcm_s = NULL; + rack->r_ctl.pcm_i.cnt_alloc = 0; + rack->r_ctl.pcm_i.cnt = 0; + } if ((rack->r_ctl.rc_num_maps_alloced > 0) && (tcp_bblogging_on(tp))) { union tcp_log_stackspecific log; @@ -15593,6 +16595,16 @@ int tmr_up; tmr_up = rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK; + if (tcp_in_hpts(rack->rc_tp) == 0) { + /* + * Ok we probably need some timer up, but no + * matter what the mask we are not in hpts. We + * may have received an old ack and thus did nothing. + */ + rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); + rack_start_hpts_timer(rack, tp, tcp_get_usecs(NULL), 0, 0, 0); + return; + } if (rack->rc_in_persist && (tmr_up == PACE_TMR_PERSIT)) return; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); @@ -15916,6 +16928,134 @@ } } +static void +rack_new_round_starts(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) +{ + /* + * The next send has occurred mark the end of the round + * as when that data gets acknowledged. We can + * also do common things we might need to do when + * a round begins. + */ + rack->r_ctl.roundends = tp->snd_max; + rack->rc_new_rnd_needed = 0; + rack_log_hystart_event(rack, tp->snd_max, 4); +} + + +static void +rack_log_pcm(struct tcp_rack *rack, uint8_t mod, uint32_t flex1, uint32_t flex2, + uint32_t flex3) +{ + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + (void)tcp_get_usecs(&tv); + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_tv_to_usectick(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.flex8 = mod; + log.u_bbr.flex1 = flex1; + log.u_bbr.flex2 = flex2; + log.u_bbr.flex3 = flex3; + log.u_bbr.flex4 = rack_pcm_every_n_rounds; + log.u_bbr.flex5 = rack->r_ctl.pcm_idle_rounds; + log.u_bbr.bbr_substate = rack->pcm_needed; + log.u_bbr.bbr_substate <<= 1; + log.u_bbr.bbr_substate |= rack->pcm_in_progress; + log.u_bbr.bbr_substate <<= 1; + log.u_bbr.bbr_substate |= rack->pcm_enabled; /* bits are NIE for Needed, Inprogress, Enabled */ + (void)tcp_log_event(rack->rc_tp, NULL, NULL, NULL, TCP_PCM_MEASURE, ERRNO_UNK, + 0, &log, false, NULL, NULL, 0, &tv); + } +} + +static void +rack_new_round_setup(struct tcpcb *tp, struct tcp_rack *rack, uint32_t high_seq) +{ + /* + * The round (current_round) has ended. We now + * setup for the next round by incrementing the + * round numnber and doing any round specific + * things. + */ + rack_log_hystart_event(rack, high_seq, 21); + rack->r_ctl.current_round++; + /* New round (current_round) begins at next send */ + rack->rc_new_rnd_needed = 1; + if ((rack->pcm_enabled == 1) && + (rack->pcm_needed == 0) && + (rack->pcm_in_progress == 0)) { + /* + * If we have enabled PCM, then we need to + * check if the round has adanced to the state + * where one is required. 
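+	 * A measurement becomes due once the rounds elapsed since the last
+	 * PCM, plus any idle rounds credited, reach rack_pcm_every_n_rounds.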
+ */ + int rnds; + + rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; + if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { + rack->pcm_needed = 1; + rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); + } else if (rack_verbose_logging) { + rack_log_pcm(rack, 3, rack->r_ctl.last_pcm_round, rack_pcm_every_n_rounds, rack->r_ctl.current_round ); + } + } + if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { + /* We have hystart enabled send the round info in */ + if (CC_ALGO(tp)->newround != NULL) { + CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); + } + } + /* + * For DGP an initial startup check. We want to validate + * that we are not just pushing on slow-start and just + * not gaining.. i.e. filling buffers without getting any + * boost in b/w during the inital slow-start. + */ + if (rack->dgp_on && + (rack->rc_initial_ss_comp == 0) && + (tp->snd_cwnd < tp->snd_ssthresh) && + (rack->r_ctl.num_measurements >= RACK_REQ_AVG) && + (rack->r_ctl.gp_rnd_thresh > 0) && + ((rack->r_ctl.current_round - rack->r_ctl.last_rnd_of_gp_rise) >= rack->r_ctl.gp_rnd_thresh)) { + + /* + * We are in the initial SS and we have hd rack_rnd_cnt_req rounds(def:5) where + * we have not gained the required amount in the gp_est (120.0% aka 1200). Lets + * exit SS. + * + * Pick up the flight size now as we enter slowstart (not the + * cwnd which may be inflated). + */ + rack->rc_initial_ss_comp = 1; + + if (tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack->r_ctl.current_round; + log.u_bbr.flex2 = rack->r_ctl.last_rnd_of_gp_rise; + log.u_bbr.flex3 = rack->r_ctl.gp_rnd_thresh; + log.u_bbr.flex5 = rack->r_ctl.gate_to_fs; + log.u_bbr.flex5 = rack->r_ctl.ss_hi_fs; + log.u_bbr.flex8 = 40; + (void)tcp_log_event(tp, NULL, NULL, NULL, BBR_LOG_CWND, 0, + 0, &log, false, NULL, __func__, __LINE__,&tv); + } + if ((rack->r_ctl.gate_to_fs == 1) && + (tp->snd_cwnd > rack->r_ctl.ss_hi_fs)) { + tp->snd_cwnd = rack->r_ctl.ss_hi_fs; + } + tp->snd_ssthresh = tp->snd_cwnd - 1; + /* Turn off any fast output running */ + rack->r_fast_output = 0; + } +} + static int rack_do_compressed_ack_processing(struct tcpcb *tp, struct socket *so, struct mbuf *m, int nxt_pkt, struct timeval *tv) { @@ -15949,7 +17089,7 @@ #endif int nsegs = 0; int under_pacing = 0; - int recovery = 0; + int post_recovery = 0; #ifdef TCP_ACCOUNTING sched_pin(); #endif @@ -16122,7 +17262,7 @@ } } else if (ae->ack_val_set == ACK_DUPACK) { /* Case D */ - rack_strike_dupack(rack); + rack_strike_dupack(rack, ae->ack); } else if (ae->ack_val_set == ACK_RWND) { /* Case C */ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { @@ -16172,8 +17312,6 @@ } #endif high_seq = ae->ack; - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) - rack_log_hystart_event(rack, high_seq, 8); /* Setup our act_rcv_time */ if ((ae->flags & TSTMP_LRO) || (ae->flags & TSTMP_HDWR)) { ts.tv_sec = ae->timestamp / 1000000000; @@ -16239,13 +17377,11 @@ if (SEQ_GEQ(high_seq, rack->r_ctl.roundends) && (rack->rc_new_rnd_needed == 0) && (nxt_pkt == 0)) { - rack_log_hystart_event(rack, high_seq, 21); - rack->r_ctl.current_round++; - /* Force the next send to setup the next round */ - rack->rc_new_rnd_needed = 1; - if (CC_ALGO(tp)->newround != NULL) { - CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); - } + /* + * We have crossed into a new 
round with + * this th_ack value. + */ + rack_new_round_setup(tp, rack, high_seq); } /* * Clear the probe not answered flag @@ -16306,8 +17442,17 @@ tcp_rack_partialack(tp); } else { rack_post_recovery(tp, high_seq); - recovery = 1; + post_recovery = 1; } + } else if ((rack->rto_from_rec == 1) && + SEQ_GEQ(high_seq, tp->snd_recover)) { + /* + * We were in recovery, hit a rxt timeout + * and never re-entered recovery. The timeout(s) + * made up all the lost data. In such a case + * we need to clear the rto_from_rec flag. + */ + rack->rto_from_rec = 0; } /* Handle the rack-log-ack part (sendmap) */ if ((sbused(&so->so_snd) == 0) && @@ -16340,9 +17485,24 @@ KMOD_TCPSTAT_ADD(tcps_rcvackpack, 1); KMOD_TCPSTAT_ADD(tcps_rcvackbyte, acked); if (acked_amount > 0) { + uint32_t p_cwnd; struct mbuf *mfree; - rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, recovery); + if (post_recovery) { + /* + * Grab the segsiz, multiply by 2 and add the snd_cwnd + * that is the max the CC should add if we are exiting + * recovery and doing a late add. + */ + p_cwnd = min(ctf_fixed_maxseg(tp), rack->r_ctl.rc_pace_min_segs); + p_cwnd <<= 1; + p_cwnd += tp->snd_cwnd; + } + rack_ack_received(tp, rack, high_seq, nsegs, CC_ACK, post_recovery); + if (post_recovery && (tp->snd_cwnd > p_cwnd)) { + /* Must be non-newreno (cubic) getting too ahead of itself */ + tp->snd_cwnd = p_cwnd; + } SOCKBUF_LOCK(&so->so_snd); mfree = sbcut_locked(&so->so_snd, acked_amount); tp->snd_una = high_seq; @@ -16351,12 +17511,6 @@ /* Wake up the socket if we have room to write more */ rack_log_wakeup(tp,rack, &so->so_snd, acked, 2); sowwakeup_locked(so); - if ((recovery == 1) && - (rack->excess_rxt_on) && - (rack->r_cwnd_was_clamped == 0)) { - do_rack_excess_rxt(tp, rack); - } else if (rack->r_cwnd_was_clamped) - do_rack_check_for_unclamp(tp, rack); m_freem(mfree); } /* update progress */ @@ -16587,7 +17741,9 @@ } rack_handle_might_revert(tp, rack); ctf_calc_rwin(so, tp); - if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { + if ((rack->r_wanted_output != 0) || + (rack->r_fast_output != 0) || + (tp->t_flags & TF_ACKNOW )) { send_out_a_rst: if (tcp_output(tp) < 0) { #ifdef TCP_ACCOUNTING @@ -16630,7 +17786,7 @@ * us_cts - is the time that LRO or hardware actually got the packet in microseconds. */ uint32_t cts, us_cts, ms_cts; - uint32_t tiwin, high_seq; + uint32_t tiwin; struct timespec ts; struct tcpopt to; struct tcp_rack *rack; @@ -16818,7 +17974,6 @@ tp->t_flags &= ~TF_GPUTINPROG; } } - high_seq = th->th_ack; if (tcp_bblogging_on(rack->rc_tp)) { union tcp_log_stackspecific log; struct timeval ltv; @@ -16938,7 +18093,6 @@ m_freem(m); goto done_with_input; } - /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to @@ -16975,7 +18129,28 @@ if (TSTMP_GT(to.to_tsecr, ms_cts)) to.to_tsecr = 0; } - + if ((rack->r_rcvpath_rtt_up == 1) && + (to.to_flags & TOF_TS) && + (TSTMP_GEQ(to.to_tsecr, rack->r_ctl.last_rcv_tstmp_for_rtt))) { + uint32_t rtt = 0; + + /* + * We are receiving only and thus not sending + * data to do an RTT. We set a flag when we first + * sent this TS to the peer. We now have it back + * and have an RTT to share. We log it as a conf + * 4, we are not so sure about it.. since we + * may have lost an ack. 
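+		 * The sample is simply now (cts) minus the time the probe was
+		 * armed (last_time_of_arm_rcv), and it is only committed when
+		 * that difference is positive.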
+ */ + if (TSTMP_GT(cts, rack->r_ctl.last_time_of_arm_rcv)) + rtt = (cts - rack->r_ctl.last_time_of_arm_rcv); + rack->r_rcvpath_rtt_up = 0; + /* Submit and commit the timer */ + if (rtt > 0) { + tcp_rack_xmit_timer(rack, rtt, 0, rtt, 4, NULL, 1); + tcp_rack_xmit_timer_commit(rack, tp); + } + } /* * If its the first time in we need to take care of options and * verify we can do SACK for rack! @@ -17069,7 +18244,7 @@ (rack->use_fixed_rate == 0) && (rack->rc_always_pace)) { /* Check in on probertt */ - rack_check_probe_rtt(rack, us_cts); + rack_check_probe_rtt(rack, cts); } rack_clear_rate_sample(rack); if ((rack->forced_ack) && @@ -17113,7 +18288,7 @@ * If we are going for target, lets recheck before * we output. */ - rack_check_probe_rtt(rack, us_cts); + rack_check_probe_rtt(rack, cts); } if (rack->set_pacing_done_a_iw == 0) { /* How much has been acked? */ @@ -17144,7 +18319,10 @@ } #endif if ((nxt_pkt == 0) && (no_output == 0)) { - if ((rack->r_wanted_output != 0) || (rack->r_fast_output != 0)) { + if ((rack->r_wanted_output != 0) || + (tp->t_flags & TF_ACKNOW) || + (rack->r_fast_output != 0)) { + do_output_now: if (tcp_output(tp) < 0) { #ifdef TCP_ACCOUNTING @@ -17156,6 +18334,8 @@ } rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); rack_free_trim(rack); + } else if ((nxt_pkt == 0) && (tp->t_flags & TF_ACKNOW)) { + goto do_output_now; } else if ((no_output == 1) && (nxt_pkt == 0) && (tcp_in_hpts(rack->rc_tp) == 0)) { @@ -17170,9 +18350,6 @@ /* Clear the flag, it may have been cleared by output but we may not have */ if ((nxt_pkt == 0) && (tp->t_flags2 & TF2_HPTS_CALLS)) tp->t_flags2 &= ~TF2_HPTS_CALLS; - /* Update any rounds needed */ - if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) - rack_log_hystart_event(rack, high_seq, 8); /* * The draft (v3) calls for us to use SEQ_GEQ, but that * causes issues when we are just going app limited. Lets @@ -17186,13 +18363,11 @@ if (SEQ_GEQ(tp->snd_una, rack->r_ctl.roundends) && (rack->rc_new_rnd_needed == 0) && (nxt_pkt == 0)) { - rack_log_hystart_event(rack, tp->snd_una, 21); - rack->r_ctl.current_round++; - /* Force the next send to setup the next round */ - rack->rc_new_rnd_needed = 1; - if (CC_ALGO(tp)->newround != NULL) { - CC_ALGO(tp)->newround(&tp->t_ccv, rack->r_ctl.current_round); - } + /* + * We have crossed into a new round with + * the new snd_unae. 
+ */ + rack_new_round_setup(tp, rack, tp->snd_una); } if ((nxt_pkt == 0) && ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && @@ -17242,6 +18417,7 @@ if (did_out) rack->r_wanted_output = 0; } + #ifdef TCP_ACCOUNTING sched_unpin(); #endif @@ -17325,7 +18501,7 @@ srtt = rack_grab_rtt(tp, rack); idx = rsm->r_rtr_cnt - 1; ts_low = (uint32_t)rsm->r_tim_lastsent[idx]; - thresh = rack_calc_thresh_rack(rack, srtt, tsused); + thresh = rack_calc_thresh_rack(rack, srtt, tsused, __LINE__, 1); if ((tsused == ts_low) || (TSTMP_LT(tsused, ts_low))) { /* No time since sending */ @@ -17354,7 +18530,7 @@ } static void -rack_log_pacing_delay_calc(struct tcp_rack *rack, uint32_t len, uint32_t slot, +rack_log_pacing_delay_calc (struct tcp_rack *rack, uint32_t len, uint32_t slot, uint64_t bw_est, uint64_t bw, uint64_t len_time, int method, int line, struct rack_sendmap *rsm, uint8_t quality) { @@ -17370,6 +18546,7 @@ if ((method != 2) && (method != 3) && (method != 7) && + (method != 89) && (method != 14) && (method != 20)) { return; @@ -17429,12 +18606,8 @@ log.u_bbr.bbr_substate = quality; log.u_bbr.bbr_state = rack->dgp_on; log.u_bbr.bbr_state <<= 1; - log.u_bbr.bbr_state |= rack->r_fill_less_agg; - log.u_bbr.bbr_state <<= 1; log.u_bbr.bbr_state |= rack->rc_pace_to_cwnd; log.u_bbr.bbr_state <<= 2; - log.u_bbr.bbr_state |= rack->r_pacing_discount; - log.u_bbr.flex7 = ((rack->r_ctl.pacing_discount_amm << 1) | log.u_bbr.flex7); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -17537,7 +18710,6 @@ { uint64_t lentim, fill_bw; - /* Lets first see if we are full, if so continue with normal rate */ rack->r_via_fill_cw = 0; if (ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked) > rack->r_ctl.cwnd_to_use) return (slot); @@ -17551,6 +18723,8 @@ /* The rtt is huge, N * smallest, lets not fill */ return (slot); } + if (rack->r_ctl.fillcw_cap && *rate_wanted >= rack->r_ctl.fillcw_cap) + return (slot); /* * first lets calculate the b/w based on the last us-rtt * and the the smallest send window. @@ -17570,26 +18744,47 @@ /* Now lets make it into a b/w */ fill_bw *= (uint64_t)HPTS_USEC_IN_SEC; fill_bw /= (uint64_t)rack->r_ctl.rc_last_us_rtt; + /* Adjust to any cap */ + if (rack->r_ctl.fillcw_cap && fill_bw >= rack->r_ctl.fillcw_cap) + fill_bw = rack->r_ctl.fillcw_cap; + at_lt_bw: - if (rack->r_fill_less_agg) { + if (rack_bw_multipler > 0) { /* - * We want the average of the rate_wanted - * and our fill-cw calculated bw. We also want - * to cap any increase to be no more than - * X times the lt_bw (where X is the rack_bw_multipler). + * We want to limit fill-cw to the some multiplier + * of the max(lt_bw, gp_est). The normal default + * is 0 for off, so a sysctl has enabled it. 
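+		 * The value is a percentage, e.g. (illustrative) 300 caps
+		 * fill_bw at 3 x max(lt_bw, gp_est), while 50 would cap it
+		 * at half of that maximum.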
*/ - uint64_t lt_bw, rate; + uint64_t lt_bw, gp, rate; + gp = rack_get_gp_est(rack); lt_bw = rack_get_lt_bw(rack); - if (lt_bw > *rate_wanted) + if (lt_bw > gp) rate = lt_bw; else - rate = *rate_wanted; - fill_bw += rate; - fill_bw /= 2; - if (rack_bw_multipler && (fill_bw > (rate * rack_bw_multipler))) { - fill_bw = rate * rack_bw_multipler; - } + rate = gp; + rate *= rack_bw_multipler; + rate /= 100; + if (rack_verbose_logging && tcp_bblogging_on(rack->rc_tp)) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack_bw_multipler; + log.u_bbr.flex2 = len; + log.u_bbr.cur_del_rate = gp; + log.u_bbr.delRate = lt_bw; + log.u_bbr.bw_inuse = rate; + log.u_bbr.rttProp = fill_bw; + log.u_bbr.flex8 = 44; + tcp_log_event(rack->rc_tp, NULL, NULL, NULL, + BBR_LOG_CWND, 0, + 0, &log, false, NULL, + __func__, __LINE__, &tv); + } + if (fill_bw > rate) + fill_bw = rate; } /* We are below the min b/w */ if (non_paced) @@ -17638,9 +18833,8 @@ } } if (rack->r_ctl.bw_rate_cap && (fill_bw > rack->r_ctl.bw_rate_cap)) { - if (rack->rc_hybrid_mode) - rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, - fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); + rack_log_hybrid_bw(rack, rack->rc_tp->snd_max, + fill_bw, 0, 0, HYBRID_LOG_RATE_CAP, 2, NULL, __LINE__); fill_bw = rack->r_ctl.bw_rate_cap; } /* @@ -17659,11 +18853,121 @@ return (slot); } -static int32_t -rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz) +static uint32_t +rack_policer_check_send(struct tcp_rack *rack, uint32_t len, uint32_t segsiz, uint32_t *needs) { - uint64_t srtt; - int32_t slot = 0; + uint64_t calc; + + rack->rc_policer_should_pace = 0; + calc = rack_policer_bucket_reserve * rack->r_ctl.policer_bucket_size; + calc /= 100; + /* + * Now lets look at if we want more than is in the bucket + * we want more than is reserved in the bucket. + */ + if (rack_verbose_logging > 0) + policer_detection_log(rack, len, segsiz, calc, rack->r_ctl.current_policer_bucket, 8); + if ((calc > rack->r_ctl.current_policer_bucket) || + (len >= (rack->r_ctl.current_policer_bucket - calc))) { + /* + * We may want to pace depending on if we are going + * into the reserve or not. + */ + uint32_t newlen; + + if (calc > rack->r_ctl.current_policer_bucket) { + /* + * This will eat into the reserve if we + * don't have room at all some lines + * below will catch it. + */ + newlen = rack->r_ctl.policer_max_seg; + rack->rc_policer_should_pace = 1; + } else { + /* + * We have all of the reserve plus something in the bucket + * that we can give out. + */ + newlen = rack->r_ctl.current_policer_bucket - calc; + if (newlen < rack->r_ctl.policer_max_seg) { + /* + * Into the reserve to get a full policer_max_seg + * so we set the len to that and eat into + * the reserve. If we go over the code + * below will make us wait. 
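+			 * (The reserve, calc above, is rack_policer_bucket_reserve
+			 * percent of the bucket; e.g. a 64 KB bucket with a 20
+			 * percent reserve keeps roughly 13 KB back -- illustrative
+			 * numbers only.)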
+ */ + newlen = rack->r_ctl.policer_max_seg; + rack->rc_policer_should_pace = 1; + } + } + if (newlen > rack->r_ctl.current_policer_bucket) { + /* We have to wait some */ + *needs = newlen - rack->r_ctl.current_policer_bucket; + return (0); + } + if (rack_verbose_logging > 0) + policer_detection_log(rack, len, segsiz, newlen, 0, 9); + len = newlen; + } /* else we have all len available above the reserve */ + if (rack_verbose_logging > 0) + policer_detection_log(rack, len, segsiz, calc, 0, 10); + return (len); +} + +static uint32_t +rack_policed_sending(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, uint32_t segsiz, int call_line) +{ + /* + * Given a send of len, and a token bucket set at current_policer_bucket_size + * are we close enough to the end of the bucket that we need to pace? If so + * calculate out a time and return it. Otherwise subtract the tokens from + * the bucket. + */ + uint64_t calc; + + if ((rack->r_ctl.policer_bw == 0) || + (rack->r_ctl.policer_bucket_size < segsiz)) { + /* + * We should have an estimate here... + */ + return (0); + } + calc = (uint64_t)rack_policer_bucket_reserve * (uint64_t)rack->r_ctl.policer_bucket_size; + calc /= 100; + if ((rack->r_ctl.current_policer_bucket < len) || + (rack->rc_policer_should_pace == 1) || + ((rack->r_ctl.current_policer_bucket - len) <= (uint32_t)calc)) { + /* we need to pace */ + uint64_t lentim, res; + uint32_t slot; + + lentim = (uint64_t)len * (uint64_t)HPTS_USEC_IN_SEC; + res = lentim / rack->r_ctl.policer_bw; + slot = (uint32_t)res; + if (rack->r_ctl.current_policer_bucket > len) + rack->r_ctl.current_policer_bucket -= len; + else + rack->r_ctl.current_policer_bucket = 0; + policer_detection_log(rack, len, slot, (uint32_t)rack_policer_bucket_reserve, call_line, 5); + rack->rc_policer_should_pace = 0; + return(slot); + } + /* Just take tokens out of the bucket and let rack do whatever it would have */ + policer_detection_log(rack, len, 0, (uint32_t)rack_policer_bucket_reserve, call_line, 6); + if (len < rack->r_ctl.current_policer_bucket) { + rack->r_ctl.current_policer_bucket -= len; + } else { + rack->r_ctl.current_policer_bucket = 0; + } + return (0); +} + + +static int32_t +rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len, struct rack_sendmap *rsm, uint32_t segsiz, int line) +{ + uint64_t srtt; + int32_t slot = 0; int32_t minslot = 0; int can_start_hw_pacing = 1; int err; @@ -17674,6 +18978,25 @@ pace_one = 1; else pace_one = 0; + if (rack->rc_policer_detected == 1) { + /* + * A policer has been detected and we + * have all of our data (policer-bw and + * policer bucket size) calculated. Call + * into the function to find out if we are + * overriding the time. + */ + slot = rack_policed_sending(rack, tp, len, segsiz, line); + if (slot) { + uint64_t logbw; + + logbw = rack->r_ctl.current_policer_bucket; + logbw <<= 32; + logbw |= rack->r_ctl.policer_bucket_size; + rack_log_pacing_delay_calc(rack, len, slot, rack->r_ctl.policer_bw, logbw, 0, 89, __LINE__, NULL, 0); + return(slot); + } + } if (rack->rc_always_pace == 0) { /* * We use the most optimistic possible cwnd/srtt for @@ -18214,6 +19537,16 @@ rack->r_ctl.rc_gp_output_ts = my_rsm->r_tim_lastsent[0]; tp->gput_ack = tp->gput_seq + rack_get_measure_window(tp, rack); rack->r_ctl.rc_gp_cumack_ts = 0; + if ((rack->r_ctl.cleared_app_ack == 1) && + (SEQ_GEQ(rack->r_ctl.cleared_app_ack, tp->gput_seq))) { + /* + * We just cleared an application limited period + * so the next seq out needs to skip the first + * ack. 
+ */ + rack->app_limited_needs_set = 1; + rack->r_ctl.cleared_app_ack = 0; + } rack_log_pacing_delay_calc(rack, tp->gput_seq, tp->gput_ack, @@ -19132,7 +20465,6 @@ rack->r_late = 0; rack->r_ctl.rc_agg_early = 0; } - rack_log_output(tp, &to, len, rsm->r_start, flags, error, rack_to_usec_ts(tv), rsm, RACK_SENT_FP, rsm->m, rsm->soff, rsm->r_hw_tls, segsiz); if (doing_tlp) { @@ -19189,17 +20521,8 @@ tcp_rl_log_enobuf(rack->r_ctl.crte); } counter_u64_add(rack_saw_enobuf, 1); - } else - slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz); - if ((slot == 0) || - (rack->rc_always_pace == 0) || - (rack->r_rr_config == 1)) { - /* - * We have no pacing set or we - * are using old-style rack or - * we are overridden to use the old 1ms pacing. - */ - slot = rack->r_ctl.rc_min_to; + } else { + slot = rack_get_pacing_delay(rack, tp, len, NULL, segsiz, __LINE__); } rack_start_hpts_timer(rack, tp, cts, slot, len, 0); #ifdef TCP_ACCOUNTING @@ -19261,7 +20584,7 @@ (so->so_snd.sb_hiwat / 8 * 7) && sbused(&so->so_snd) < V_tcp_autosndbuf_max && sendwin >= (sbused(&so->so_snd) - - (tp->snd_nxt - tp->snd_una))) { + (tp->snd_max - tp->snd_una))) { if (rack_autosndbuf_inc) scaleup = (rack_autosndbuf_inc * so->so_snd.sb_hiwat) / 100; else @@ -19313,7 +20636,7 @@ uint32_t s_soff; uint32_t if_hw_tsomaxsegcount = 0, startseq; uint32_t if_hw_tsomaxsegsize; - uint16_t add_flag = RACK_SENT_FP; + uint32_t add_flag = RACK_SENT_FP; #ifdef INET6 struct ip6_hdr *ip6 = NULL; @@ -19680,6 +21003,22 @@ rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(tv); rack->r_ctl.lt_seq = tp->snd_una; rack->lt_bw_up = 1; + } else if ((error == 0) && + (((tp->snd_max + len) - rack->r_ctl.lt_seq) > 0x7fffffff)) { + /* + * Need to record what we have since we are + * approaching seq wrap. + */ + struct timeval tv; + uint64_t tmark; + + rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); + rack->r_ctl.lt_seq = tp->snd_una; + tmark = tcp_get_u64_usecs(&tv); + if (tmark > rack->r_ctl.lt_timemark) { + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + rack->r_ctl.lt_timemark = tmark; + } } rack_log_output(tp, &to, len, tp->snd_max, flags, error, rack_to_usec_ts(tv), NULL, add_flag, s_mb, s_soff, rack->r_ctl.fsb.hw_tls, segsiz); @@ -19699,13 +21038,7 @@ tp->snd_max += len; tp->snd_nxt = tp->snd_max; if (rack->rc_new_rnd_needed) { - /* - * Update the rnd to start ticking not - * that from a time perspective all of - * the preceding idle time is "in the round" - */ - rack->rc_new_rnd_needed = 0; - rack->r_ctl.roundends = tp->snd_max; + rack_new_round_starts(tp, rack, tp->snd_max); } { int idx; @@ -19746,7 +21079,7 @@ } tp->t_flags &= ~(TF_ACKNOW | TF_DELACK); counter_u64_add(rack_fto_send, 1); - slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz); + slot = rack_get_pacing_delay(rack, tp, tot_len, NULL, segsiz, __LINE__); rack_start_hpts_timer(rack, tp, cts, slot, tot_len, 0); #ifdef TCP_ACCOUNTING crtsc = get_cyclecount(); @@ -19856,7 +21189,7 @@ goto restart; } /* Now has it been long enough ? 
*/ - thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts); + thresh = rack_calc_thresh_rack(rack, rack_grab_rtt(rack->rc_tp, rack), cts, __LINE__, 1); if ((cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])) > thresh) { rack_log_collapse(rack, rsm->r_start, (cts - ((uint32_t)rsm->r_tim_lastsent[(rsm->r_rtr_cnt-1)])), @@ -19870,6 +21203,25 @@ return (NULL); } +static void +rack_credit_back_policer_idle_time(struct tcp_rack *rack, uint64_t idle_t, int line) +{ + /* + * We were idle some time (idle_t) and so our policer bucket + * needs to grow. It can go no higher than policer_bucket_size. + */ + uint64_t len; + + len = idle_t * rack->r_ctl.policer_bw; + len /= HPTS_USEC_IN_SEC; + rack->r_ctl.current_policer_bucket += (uint32_t)len; + if (rack->r_ctl.policer_bucket_size < rack->r_ctl.current_policer_bucket) { + rack->r_ctl.current_policer_bucket = rack->r_ctl.policer_bucket_size; + } + if (rack_verbose_logging > 0) + policer_detection_log(rack, (uint32_t)len, line, (uint32_t)idle_t, 0, 7); +} + static inline void rack_validate_sizes(struct tcp_rack *rack, int32_t *len, int32_t segsiz, uint32_t pace_max_seg) { @@ -19931,7 +21283,7 @@ unsigned ipsec_optlen = 0; #endif - int32_t idle, sendalot; + int32_t idle, sendalot, tot_idle; int32_t sub_from_prr = 0; volatile int32_t sack_rxmit; struct rack_sendmap *rsm = NULL; @@ -19940,7 +21292,7 @@ int32_t slot = 0; int32_t sup_rack = 0; uint32_t cts, ms_cts, delayed, early; - uint16_t add_flag = RACK_SENT_SP; + uint32_t add_flag = RACK_SENT_SP; /* The doing_tlp flag will be set by the actual rack_timeout_tlp() */ uint8_t doing_tlp = 0; uint32_t cwnd_to_use, pace_max_seg; @@ -20101,12 +21453,16 @@ early = rack->r_ctl.rc_last_output_to - cts; } else early = 0; - if (delayed) { + if (delayed && (rack->rc_always_pace == 1)) { rack->r_ctl.rc_agg_delayed += delayed; rack->r_late = 1; - } else if (early) { + } else if (early && (rack->rc_always_pace == 1)) { rack->r_ctl.rc_agg_early += early; rack->r_early = 1; + } else if (rack->rc_always_pace == 0) { + /* Non-paced we are not late */ + rack->r_ctl.rc_agg_delayed = rack->r_ctl.rc_agg_early = 0; + rack->r_early = rack->r_late = 0; } /* Now that early/late accounting is done turn off the flag */ rack->r_ctl.rc_hpts_flags &= ~PACE_PKT_OUTPUT; @@ -20168,9 +21524,9 @@ } if ((tp->snd_una == tp->snd_max) && rack->r_ctl.rc_went_idle_time && - TSTMP_GT(cts, rack->r_ctl.rc_went_idle_time)) { - idle = cts - rack->r_ctl.rc_went_idle_time; - if (idle > rack_min_probertt_hold) { + (cts > rack->r_ctl.rc_went_idle_time)) { + tot_idle = idle = (cts - rack->r_ctl.rc_went_idle_time); + if (idle > (uint64_t)rack_min_probertt_hold) { /* Count as a probe rtt */ if (rack->in_probe_rtt == 0) { rack->r_ctl.rc_lower_rtt_us_cts = cts; @@ -20183,17 +21539,75 @@ } idle = 0; } + if(rack->policer_detect_on) { + /* + * If we are doing policer detetion we at a minium + * record the time but if possible add back to + * the bucket based on the idle time. 
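+		 * (e.g. 100 ms of idle time at a 1 Mbps policer_bw credits about
+		 * 12.5 KB back, never growing the bucket past
+		 * policer_bucket_size -- illustrative numbers only.)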
+ */ + uint64_t idle_t, u64_cts; + + segsiz = min(ctf_fixed_maxseg(tp), + rack->r_ctl.rc_pace_min_segs); + u64_cts = tcp_tv_to_lusectick(&tv); + if ((rack->rc_policer_detected == 1) && + (rack->r_ctl.policer_bucket_size > segsiz) && + (rack->r_ctl.policer_bw > 0) && + (u64_cts > rack->r_ctl.last_sendtime)) { + /* We are being policed add back the time */ + idle_t = u64_cts - rack->r_ctl.last_sendtime; + rack_credit_back_policer_idle_time(rack, idle_t, __LINE__); + } + rack->r_ctl.last_sendtime = u64_cts; + } if (rack_use_fsb && (rack->r_ctl.fsb.tcp_ip_hdr) && (rack->r_fsb_inited == 0) && (rack->r_state != TCPS_CLOSED)) rack_init_fsb_block(tp, rack, tcp_outflags[tp->t_state]); + if (rack->rc_sendvars_notset == 1) { + rack->r_ctl.idle_snd_una = tp->snd_una; + rack->rc_sendvars_notset = 0; + /* + * Make sure any TCP timers (keep-alive) is not running. + */ + tcp_timer_stop(tp); + } + if ((rack->rack_no_prr == 1) && + (rack->rc_always_pace == 0)) { + /* + * Sanity check before sending, if we have + * no-pacing enabled and prr is turned off that + * is a logistics error. Correct this by turnning + * prr back on. A user *must* set some form of + * pacing in order to turn PRR off. We do this + * in the output path so that we can avoid socket + * option ordering issues that would occur if we + * tried to do it while setting rack_no_prr on. + */ + rack->rack_no_prr = 0; + } + if ((rack->pcm_enabled == 1) && + (rack->pcm_needed == 0) && + (tot_idle > 0)) { + /* + * We have been idle some micro seconds. We need + * to factor this in to see if a PCM is needed. + */ + uint32_t rtts_idle, rnds; + + if (tp->t_srtt) + rtts_idle = tot_idle / tp->t_srtt; + else + rtts_idle = 0; + rnds = rack->r_ctl.current_round - rack->r_ctl.last_pcm_round; + rack->r_ctl.pcm_idle_rounds += rtts_idle; + if ((rnds + rack->r_ctl.pcm_idle_rounds) >= rack_pcm_every_n_rounds) { + rack->pcm_needed = 1; + rack_log_pcm(rack, 8, rack->r_ctl.last_pcm_round, rtts_idle, rack->r_ctl.current_round ); + } + } again: - /* - * If we've recently taken a timeout, snd_max will be greater than - * snd_nxt. There may be SACK information that allows us to avoid - * resending already delivered data. Adjust snd_nxt accordingly. - */ sendalot = 0; cts = tcp_get_usecs(&tv); ms_cts = tcp_tv_to_mssectick(&tv); @@ -20205,6 +21619,44 @@ pace_max_seg = rack->rc_user_set_max_segs * segsiz; else pace_max_seg = rack->r_ctl.rc_pace_max_segs; + if (TCPS_HAVEESTABLISHED(tp->t_state) && + (rack->r_ctl.pcm_max_seg == 0)) { + /* + * We set in our first send so we know that the ctf_fixed_maxseg + * has been fully set. If we do it in rack_init() we most likely + * see 512 bytes so we end up at 5120, not desirable. + */ + rack->r_ctl.pcm_max_seg = rc_init_window(rack); + if (rack->r_ctl.pcm_max_seg < (ctf_fixed_maxseg(tp) * 10)) { + /* + * Assure our initial PCM probe is at least 10 MSS. 
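+			 * (e.g. with a typical 1448 byte MSS that floor works out
+			 * to about 14.5 KB -- illustrative number only.)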
+ */ + rack->r_ctl.pcm_max_seg = ctf_fixed_maxseg(tp) * 10; + } + } + if ((rack->r_ctl.pcm_max_seg != 0) && (rack->pcm_needed == 1)) { + uint32_t rw_avail, cwa; + + if (tp->snd_wnd > ctf_outstanding(tp)) + rw_avail = tp->snd_wnd - ctf_outstanding(tp); + else + rw_avail = 0; + if (tp->snd_cwnd > ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked)) + cwa = tp->snd_cwnd -ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + else + cwa = 0; + if ((cwa >= rack->r_ctl.pcm_max_seg) && + (rw_avail > rack->r_ctl.pcm_max_seg)) { + /* Raise up the max seg for this trip through */ + pace_max_seg = rack->r_ctl.pcm_max_seg; + /* Disable any fast output */ + rack->r_fast_output = 0; + } + if (rack_verbose_logging) { + rack_log_pcm(rack, 4, + cwa, rack->r_ctl.pcm_max_seg, rw_avail); + } + } sb_offset = tp->snd_max - tp->snd_una; cwnd_to_use = rack->r_ctl.cwnd_to_use = tp->snd_cwnd; flags = tcp_outflags[tp->t_state]; @@ -20431,10 +21883,19 @@ ((rsm->r_flags & RACK_HAS_FIN) == 0)) { int ret; + if ((rack->rc_policer_detected == 1) && + (rack->r_ctl.policer_bucket_size > segsiz) && + (rack->r_ctl.policer_bw > 0)) { + /* Check to see if there is room */ + if (rack->r_ctl.current_policer_bucket < len) { + goto skip_fast_output; + } + } ret = rack_fast_rsm_output(tp, rack, rsm, ts_val, cts, ms_cts, &tv, len, doing_tlp); if (ret == 0) return (0); } +skip_fast_output: so = inp->inp_socket; sb = &so->so_snd; if (do_a_prefetch == 0) { @@ -20487,28 +21948,19 @@ prefetch_rsm = 1; } SOCKBUF_LOCK(sb); - /* - * If snd_nxt == snd_max and we have transmitted a FIN, the - * sb_offset will be > 0 even if so_snd.sb_cc is 0, resulting in a - * negative length. This can also occur when TCP opens up its - * congestion window while receiving additional duplicate acks after - * fast-retransmit because TCP will reset snd_nxt to snd_max after - * the fast-retransmit. - * - * In the normal retransmit-FIN-only case, however, snd_nxt will be - * set to snd_una, the sb_offset will be 0, and the length may wind - * up 0. - * - * If sack_rxmit is true we are retransmitting from the scoreboard - * in which case len is already set. - */ if ((sack_rxmit == 0) && (TCPS_HAVEESTABLISHED(tp->t_state) || IS_FASTOPEN(tp->t_flags))) { + /* + * We are not retransmitting (sack_rxmit is 0) so we + * are sending new data. This is always based on snd_max. + * Now in theory snd_max may be equal to snd_una, if so + * then nothing is outstanding and the offset would be 0. + */ uint32_t avail; avail = sbavail(sb); - if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) - sb_offset = tp->snd_nxt - tp->snd_una; + if (SEQ_GT(tp->snd_max, tp->snd_una) && avail) + sb_offset = tp->snd_max - tp->snd_una; else sb_offset = 0; if ((IN_FASTRECOVERY(tp->t_flags) == 0) || rack->rack_no_prr) { @@ -20632,13 +22084,53 @@ kern_prefetch(so, &prefetch_so_done); prefetch_so_done = 1; } + orig_len = len; + if ((rack->rc_policer_detected == 1) && + (rack->r_ctl.policer_bucket_size > segsiz) && + (rack->r_ctl.policer_bw > 0) && + (len > 0)) { + /* + * Ok we believe we have a policer watching + * what we send, can we send len? If not can + * we tune it down to a smaller value? + */ + uint32_t plen, buck_needs; + + plen = rack_policer_check_send(rack, len, segsiz, &buck_needs); + if (plen == 0) { + /* + * We are not allowed to send. How long + * do we need to pace for i.e. how long + * before len is available to send? 
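+			 * The wait is buck_needs / policer_bw, e.g. needing 3000 more
+			 * bytes against a 125000 byte/sec (1 Mbps) policer_bw is
+			 * roughly a 24 ms slot -- illustrative numbers only.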
+ */ + uint64_t lentime; + + lentime = buck_needs; + lentime *= HPTS_USEC_IN_SEC; + lentime /= rack->r_ctl.policer_bw; + slot = (uint32_t)lentime; + tot_len_this_send = 0; + SOCKBUF_UNLOCK(sb); + if (rack_verbose_logging > 0) + policer_detection_log(rack, len, slot, buck_needs, 0, 12); + rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); + rack_log_type_just_return(rack, cts, 0, slot, hpts_calling, 0, cwnd_to_use); + goto just_return_clean; + } + if (plen < len) { + sendalot = 0; + len = plen; + } + } /* * Lop off SYN bit if it has already been sent. However, if this is * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ - if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && - ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { + if ((flags & TH_SYN) && + SEQ_GT(tp->snd_max, tp->snd_una) && + ((sack_rxmit == 0) && + (tp->t_rxtshift == 0))) { /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. @@ -20678,7 +22170,6 @@ } /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) { - tp->snd_nxt = tp->iss; len = 0; } if ((len > segsiz) && (tcp_dsack_block_exists(tp))) { @@ -20686,22 +22177,10 @@ add_flag |= RACK_SENT_W_DSACK; len = segsiz; } - orig_len = len; if (len <= 0) { /* - * If FIN has been sent but not acked, but we haven't been - * called to retransmit, len will be < 0. Otherwise, window - * shrank after we sent into it. If window shrank to 0, - * cancel pending retransmit, pull snd_nxt back to (closed) - * window, and set the persist timer if it isn't already - * going. If the window didn't close completely, just wait - * for an ACK. - * - * We also do a general check here to ensure that we will - * set the persist timer when we have data to send, but a - * 0-byte window. This makes sure the persist timer is set - * even if the packet hits one of the "goto send" lines - * below. + * We have nothing to send, or the window shrank, or + * is closed, do we need to go into persists? */ len = 0; if ((tp->snd_wnd == 0) && @@ -20859,10 +22338,6 @@ if (sack_rxmit) { if ((rsm->r_flags & RACK_HAS_FIN) == 0) flags &= ~TH_FIN; - } else { - if (SEQ_LT(tp->snd_nxt + len, tp->snd_una + - sbused(sb))) - flags &= ~TH_FIN; } } recwin = lmin(lmax(sbspace(&so->so_rcv), 0), @@ -20903,10 +22378,6 @@ pass = 4; goto send; } - if (SEQ_LT(tp->snd_nxt, tp->snd_max)) { /* retransmit case */ - pass = 5; - goto send; - } if (sack_rxmit) { pass = 6; goto send; @@ -21014,7 +22485,7 @@ * yet done so, then we need to send. */ if ((flags & TH_FIN) && - (tp->snd_nxt == tp->snd_una)) { + (tp->snd_max == tp->snd_una)) { pass = 11; goto send; } @@ -21027,15 +22498,32 @@ { int app_limited = CTF_JR_SENT_DATA; + if ((IS_FASTOPEN(tp->t_flags) == 0) && + (flags & TH_FIN) && + (len == 0) && + (sbused(sb) == (tp->snd_max - tp->snd_una)) && + ((tp->snd_max - tp->snd_una) <= segsiz)) { + /* + * Ok less than or right at a MSS is + * outstanding. The original FreeBSD stack would + * have sent a FIN, which can speed things up for + * a transactional application doing a MSG_WAITALL. + * To speed things up since we do *not* send a FIN + * if data is outstanding, we send a "challenge ack". + * The idea behind that is instead of having to have + * the peer wait for the delayed-ack timer to run off + * we send an ack that makes the peer send us an ack. 
+ */ + rack_send_ack_challange(rack); + } if (tot_len_this_send > 0) { - /* Make sure snd_nxt is up to max */ rack->r_ctl.fsb.recwin = recwin; - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz); + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, NULL, segsiz, __LINE__); if ((error == 0) && + (rack->rc_policer_detected == 0) && rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && (ipoptlen == 0) && - (tp->snd_nxt == tp->snd_max) && (tp->rcv_numsacks == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && @@ -21052,11 +22540,10 @@ segsiz, pace_max_seg, hw_tls, flags); } else rack->r_fast_output = 0; - - rack_log_fsb(rack, tp, so, flags, ipoptlen, orig_len, len, 0, 1, optlen, __LINE__, 1); + /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; } else { @@ -21218,6 +22705,7 @@ rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling, app_limited, cwnd_to_use); } +just_return_clean: #ifdef NETFLIX_SHARED_CWND if ((sbavail(sb) == 0) && rack->r_ctl.rc_scw) { @@ -21284,13 +22772,39 @@ * is acked first. */ flags &= ~TH_FIN; + if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && + ((tp->snd_max - tp->snd_una) <= segsiz)) { + /* + * Ok less than or right at a MSS is + * outstanding. The original FreeBSD stack would + * have sent a FIN, which can speed things up for + * a transactional application doing a MSG_WAITALL. + * To speed things up since we do *not* send a FIN + * if data is outstanding, we send a "challenge ack". + * The idea behind that is instead of having to have + * the peer wait for the delayed-ack timer to run off + * we send an ack that makes the peer send us an ack. + */ + rack_send_ack_challange(rack); + } } /* Enforce stack imposed max seg size if we have one */ - if (rack->r_ctl.rc_pace_max_segs && - (len > rack->r_ctl.rc_pace_max_segs)) { + if (pace_max_seg && + (len > pace_max_seg)) { mark = 1; - len = rack->r_ctl.rc_pace_max_segs; + len = pace_max_seg; + } + if ((rsm == NULL) && + (rack->pcm_in_progress == 0) && + (rack->r_ctl.pcm_max_seg > 0) && + (len >= rack->r_ctl.pcm_max_seg)) { + /* It is large enough for a measurement */ + add_flag |= RACK_IS_PCM; + rack_log_pcm(rack, 5, len, rack->r_ctl.pcm_max_seg, add_flag); + } else if (rack_verbose_logging) { + rack_log_pcm(rack, 6, len, rack->r_ctl.pcm_max_seg, add_flag); } + SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { if (len >= segsiz) @@ -21313,6 +22827,24 @@ #endif hdrlen = sizeof(struct tcpiphdr); + /* + * Ok what seq are we sending from. If we have + * no rsm to use, then we look at various bits, + * if we are putting out a SYN it will be ISS. + * If we are retransmitting a FIN it will + * be snd_max-1 else its snd_max. + */ + if (rsm == NULL) { + if (flags & TH_SYN) + rack_seq = tp->iss; + else if ((flags & TH_FIN) && + (tp->t_flags & TF_SENTFIN)) + rack_seq = tp->snd_max - 1; + else + rack_seq = tp->snd_max; + } else { + rack_seq = rsm->r_start; + } /* * Compute options for segment. We only have to care about SYN and * established connection segments. Options for SYN-ACK segments @@ -21322,7 +22854,6 @@ if ((tp->t_flags & TF_NOOPT) == 0) { /* Maximum segment size. */ if (flags & TH_SYN) { - tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; @@ -21369,14 +22900,47 @@ /* Timestamps. 
*/ if ((tp->t_flags & TF_RCVD_TSTMP) || ((flags & TH_SYN) && (tp->t_flags & TF_REQ_TSTMP))) { - to.to_tsval = ms_cts + tp->ts_offset; + uint32_t ts_to_use; + + if ((rack->r_rcvpath_rtt_up == 1) && + (ms_cts == rack->r_ctl.last_rcv_tstmp_for_rtt)) { + /* + * When we are doing a rcv_rtt probe all + * other timestamps use the next msec. This + * is safe since our previous ack is in the + * air and we will just have a few more + * on the next ms. This assures that only + * the one ack has the ms_cts that was on + * our ack-probe. + */ + ts_to_use = ms_cts + 1; + } else { + ts_to_use = ms_cts; + } + to.to_tsval = ts_to_use + tp->ts_offset; to.to_tsecr = tp->ts_recent; to.to_flags |= TOF_TS; + if ((len == 0) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + ((ms_cts - rack->r_ctl.last_rcv_tstmp_for_rtt) > RCV_PATH_RTT_MS) && + (tp->snd_una == tp->snd_max) && + (flags & TH_ACK) && + (sbavail(sb) == 0) && + (rack->r_ctl.current_round != 0) && + ((flags & (TH_SYN|TH_FIN)) == 0) && + (rack->r_rcvpath_rtt_up == 0)) { + rack->r_ctl.last_rcv_tstmp_for_rtt = ms_cts; + rack->r_ctl.last_time_of_arm_rcv = cts; + rack->r_rcvpath_rtt_up = 1; + /* Subtract 1 from seq to force a response */ + rack_seq--; + } } /* Set receive buffer autosizing timestamp. */ if (tp->rfbuf_ts == 0 && - (so->so_rcv.sb_flags & SB_AUTOSIZE)) - tp->rfbuf_ts = tcp_ts_getticks(); + (so->so_rcv.sb_flags & SB_AUTOSIZE)) { + tp->rfbuf_ts = ms_cts; + } /* Selective ACK's. */ if (tp->t_flags & TF_SACK_PERMIT) { if (flags & TH_SYN) @@ -21544,7 +23108,24 @@ (sbused(sb))) { /* * We have outstanding data, don't send a fin by itself!. + * + * Check to see if we need to send a challenge ack. */ + if ((sbused(sb) == (tp->snd_max - tp->snd_una)) && + ((tp->snd_max - tp->snd_una) <= segsiz)) { + /* + * Ok less than or right at a MSS is + * outstanding. The original FreeBSD stack would + * have sent a FIN, which can speed things up for + * a transactional application doing a MSG_WAITALL. + * To speed things up since we do *not* send a FIN + * if data is outstanding, we send a "challenge ack". + * The idea behind that is instead of having to have + * the peer wait for the delayed-ack timer to run off + * we send an ack that makes the peer send us an ack. + */ + rack_send_ack_challange(rack); + } goto just_return; } /* @@ -21557,10 +23138,8 @@ uint32_t max_val; uint32_t moff; - if (rack->r_ctl.rc_pace_max_segs) - max_val = rack->r_ctl.rc_pace_max_segs; - else if (rack->rc_user_set_max_segs) - max_val = rack->rc_user_set_max_segs * segsiz; + if (pace_max_seg) + max_val = pace_max_seg; else max_val = len; /* @@ -21596,16 +23175,28 @@ if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); - if (SEQ_LT(tp->snd_nxt, tp->snd_max)) + /* + * If we are not retransmitting advance the + * sndptr to help remember the next place in + * the sb. + */ + if (rsm == NULL) sbsndptr_adv(sb, mb, len); m->m_len += len; } else { struct sockbuf *msb; - if (SEQ_LT(tp->snd_nxt, tp->snd_max)) - msb = NULL; - else + /* + * If we are not retransmitting pass in msb so + * the socket buffer can be advanced. Otherwise + * set it to NULL if its a retransmission since + * we don't want to change the sb remembered + * location. 
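The timestamp handling above arms a receive-path RTT probe at most once per RCV_PATH_RTT_MS: exactly one pure ACK goes out with the current millisecond tsval (and a sequence one below snd_max to force a reply), while anything else sent in the same millisecond uses tsval+1, so the echoed TSecr can be matched unambiguously to the probe. A hedged sketch of the matching side, assuming the arm time is kept in microseconds as last_time_of_arm_rcv is; this helper is illustrative, not the stack's actual code:

#include <stdint.h>

/*
 * When a segment arrives echoing the probe's tsval, the time since
 * the probe was armed approximates the RTT of the receive path
 * (our ACK out, the peer's response back).
 */
static uint32_t
rcv_path_rtt_usecs(uint32_t echoed_tsecr, uint32_t probe_tsval,
    uint64_t now_usecs, uint64_t armed_at_usecs)
{
	if (echoed_tsecr != probe_tsval)
		return (0);		/* not a reply to the probe */
	if (now_usecs <= armed_at_usecs)
		return (0);		/* clock did not advance, ignore */
	return ((uint32_t)(now_usecs - armed_at_usecs));
}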
+ */ + if (rsm == NULL) msb = sb; + else + msb = NULL; m->m_next = tcp_m_copym( mb, moff, &len, if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb, @@ -21631,7 +23222,7 @@ goto out; } } - if (SEQ_LT(tp->snd_nxt, tp->snd_max) || sack_rxmit) { + if (sack_rxmit) { if (rsm && (rsm->r_flags & RACK_TLP)) { /* * TLP should not count in retran count, but @@ -21750,14 +23341,6 @@ #endif } } - /* - * Fill in fields, remembering maximum advertised window for use in - * delaying messages about window sizes. If resending a FIN, be sure - * not to use a new sequence number. - */ - if (flags & TH_FIN && tp->t_flags & TF_SENTFIN && - tp->snd_nxt == tp->snd_max) - tp->snd_nxt--; /* * If we are starting a connection, send ECN setup SYN packet. If we * are on a retransmit, we may resend those bits a number of times @@ -21787,29 +23370,7 @@ #endif } } - /* - * If we are doing retransmissions, then snd_nxt will not reflect - * the first unsent octet. For ACK only packets, we do not want the - * sequence number of the retransmitted packet, we want the sequence - * number of the next unsent octet. So, if there is no data (and no - * SYN or FIN), use snd_max instead of snd_nxt when filling in - * ti_seq. But if we are in persist state, snd_max might reflect - * one byte beyond the right edge of the window, so use snd_nxt in - * that case, since we know we aren't doing a retransmission. - * (retransmit and persist are mutually exclusive...) - */ - if (sack_rxmit == 0) { - if (len || (flags & (TH_SYN | TH_FIN))) { - th->th_seq = htonl(tp->snd_nxt); - rack_seq = tp->snd_nxt; - } else { - th->th_seq = htonl(tp->snd_max); - rack_seq = tp->snd_max; - } - } else { - th->th_seq = htonl(rsm->r_start); - rack_seq = rsm->r_start; - } + th->th_seq = htonl(rack_seq); th->th_ack = htonl(tp->rcv_nxt); tcp_set_flags(th, flags); /* @@ -22170,6 +23731,13 @@ rack_to_usec_ts(&tv), rsm, add_flag, s_mb, s_moff, hw_tls, segsiz); if (error == 0) { + if (add_flag & RACK_IS_PCM) { + /* We just launched a PCM */ + /* rrs here log */ + rack->pcm_in_progress = 1; + rack->pcm_needed = 0; + rack_log_pcm(rack, 7, len, rack->r_ctl.pcm_max_seg, add_flag); + } if (rsm == NULL) { if (rack->lt_bw_up == 0) { rack->r_ctl.lt_timemark = tcp_tv_to_lusectick(&tv); @@ -22184,9 +23752,11 @@ rack->r_ctl.lt_bw_bytes += (tp->snd_una - rack->r_ctl.lt_seq); rack->r_ctl.lt_seq = tp->snd_una; - tmark = tcp_tv_to_lusectick(&tv); - rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); - rack->r_ctl.lt_timemark = tmark; + tmark = tcp_get_u64_usecs(&tv); + if (tmark > rack->r_ctl.lt_timemark) { + rack->r_ctl.lt_bw_time += (tmark - rack->r_ctl.lt_timemark); + rack->r_ctl.lt_timemark = tmark; + } } } rack->forced_ack = 0; /* If we send something zap the FA flag */ @@ -22256,15 +23826,17 @@ (len > 0) && (tp->snd_una == tp->snd_max)) rack->r_ctl.rc_tlp_rxt_last_time = cts; + { - tcp_seq startseq = tp->snd_nxt; + /* + * This block is not associated with the above error == 0 test. + * It is used to advance snd_max if we have a new transmit. + */ + tcp_seq startseq = tp->snd_max; + - /* Track our lost count */ if (rsm && (doing_tlp == 0)) rack->r_ctl.rc_loss_count += rsm->r_end - rsm->r_start; - /* - * Advance snd_nxt over sequence space of this segment. 
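The long-term bandwidth bookkeeping in this hunk now takes its mark from tcp_get_u64_usecs() and only folds the interval in when the new mark is strictly later, so a stale or equal timestamp can no longer inflate lt_bw_time or push it backwards. A standalone sketch of the accumulator and the resulting bytes-per-second estimate, with illustrative names:

#include <stdint.h>

struct lt_bw_gauge {
	uint64_t bytes;		/* bytes newly acked while sending */
	uint64_t time_usecs;	/* time spent actively sending */
	uint64_t timemark;	/* last sample time, in microseconds */
	uint32_t seq_mark;	/* snd_una at the last sample */
};

static void
lt_bw_sample(struct lt_bw_gauge *g, uint32_t snd_una, uint64_t now_usecs)
{
	g->bytes += (uint32_t)(snd_una - g->seq_mark);
	g->seq_mark = snd_una;
	/* Only credit time that actually moved forward. */
	if (now_usecs > g->timemark) {
		g->time_usecs += now_usecs - g->timemark;
		g->timemark = now_usecs;
	}
}

static uint64_t
lt_bw_bytes_per_sec(const struct lt_bw_gauge *g)
{
	if (g->time_usecs == 0)
		return (0);
	return (g->bytes * 1000000ULL / g->time_usecs);
}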
- */ if (error) /* We don't log or do anything with errors */ goto nomore; @@ -22287,53 +23859,53 @@ rack->rc_tlp_in_progress = 1; rack->r_ctl.rc_tlp_cnt_out++; } - if (flags & (TH_SYN | TH_FIN)) { - if (flags & TH_SYN) - tp->snd_nxt++; - if (flags & TH_FIN) { - tp->snd_nxt++; - tp->t_flags |= TF_SENTFIN; - } - } - /* In the ENOBUFS case we do *not* update snd_max */ + /* + * If we are retransmitting we are done, snd_max + * does not get updated. + */ if (sack_rxmit) goto nomore; - - tp->snd_nxt += len; - if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { - if (tp->snd_una == tp->snd_max) { - /* - * Update the time we just added data since - * none was outstanding. - */ - rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); - tp->t_acktime = ticks; - } - tp->snd_max = tp->snd_nxt; - if (rack->rc_new_rnd_needed) { - /* - * Update the rnd to start ticking not - * that from a time perspective all of - * the preceding idle time is "in the round" - */ - rack->rc_new_rnd_needed = 0; - rack->r_ctl.roundends = tp->snd_max; - } + if ((tp->snd_una == tp->snd_max) && (len > 0)) { /* - * Time this transmission if not a retransmission and - * not currently timing anything. - * This is only relevant in case of switching back to - * the base stack. + * Update the time we just added data since + * nothing was outstanding. */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - KMOD_TCPSTAT_INC(tcps_segstimed); + rack_log_progress_event(rack, tp, ticks, PROGRESS_START, __LINE__); + tp->t_acktime = ticks; + } + /* + * Now for special SYN/FIN handling. + */ + if (flags & (TH_SYN | TH_FIN)) { + if ((flags & TH_SYN) && + ((tp->t_flags & TF_SENTSYN) == 0)) { + tp->snd_max++; + tp->t_flags |= TF_SENTSYN; } - if (len && - ((tp->t_flags & TF_GPUTINPROG) == 0)) - rack_start_gp_measurement(tp, rack, startseq, sb_offset); + if ((flags & TH_FIN) && + ((tp->t_flags & TF_SENTFIN) == 0)) { + tp->snd_max++; + tp->t_flags |= TF_SENTFIN; + } + } + tp->snd_max += len; + if (rack->rc_new_rnd_needed) { + rack_new_round_starts(tp, rack, tp->snd_max); + } + /* + * Time this transmission if not a retransmission and + * not currently timing anything. + * This is only relevant in case of switching back to + * the base stack. + */ + if (tp->t_rtttime == 0) { + tp->t_rtttime = ticks; + tp->t_rtseq = startseq; + KMOD_TCPSTAT_INC(tcps_segstimed); } + if (len && + ((tp->t_flags & TF_GPUTINPROG) == 0)) + rack_start_gp_measurement(tp, rack, startseq, sb_offset); /* * If we are doing FO we need to update the mbuf position and subtract * this happens when the peer sends us duplicate information and @@ -22356,6 +23928,47 @@ rack->r_ctl.fsb.o_t_len = M_TRAILINGROOM(rack->r_ctl.fsb.m); } } + if (rack_pcm_blast == 0) { + if ((orig_len > len) && + (add_flag & RACK_IS_PCM) && + (len < pace_max_seg) && + ((pace_max_seg - len) > segsiz)) { + /* + * We are doing a PCM measurement and we did + * not get enough data in the TSO to meet the + * burst requirement. + */ + uint32_t n_len; + + n_len = (orig_len - len); + orig_len -= len; + pace_max_seg -= len; + len = n_len; + sb_offset = tp->snd_max - tp->snd_una; + /* Re-lock for the next spin */ + SOCKBUF_LOCK(sb); + goto send; + } + } else { + if ((orig_len > len) && + (add_flag & RACK_IS_PCM) && + ((orig_len - len) > segsiz)) { + /* + * We are doing a PCM measurement and we did + * not get enough data in the TSO to meet the + * burst requirement. 
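The rewritten transmit accounting above advances snd_max directly instead of stepping snd_nxt: SYN and FIN each consume one sequence number, and the TF_SENTSYN/TF_SENTFIN guards keep a retransmitted control bit from being counted twice. A compact sketch of that accounting:

#include <stdint.h>
#include <stdbool.h>

struct send_state {
	uint32_t snd_max;	/* highest sequence sent, plus one */
	bool	sent_syn;	/* TF_SENTSYN analogue */
	bool	sent_fin;	/* TF_SENTFIN analogue */
};

/*
 * Account a newly transmitted (not retransmitted) segment: the SYN
 * and FIN bits take one sequence number each, but only the first
 * time they go out; the payload adds its length.
 */
static void
advance_snd_max(struct send_state *s, bool syn, bool fin, uint32_t len)
{
	if (syn && !s->sent_syn) {
		s->snd_max++;
		s->sent_syn = true;
	}
	if (fin && !s->sent_fin) {
		s->snd_max++;
		s->sent_fin = true;
	}
	s->snd_max += len;
}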
+ */ + uint32_t n_len; + + n_len = (orig_len - len); + orig_len -= len; + len = n_len; + sb_offset = tp->snd_max - tp->snd_una; + /* Re-lock for the next spin */ + SOCKBUF_LOCK(sb); + goto send; + } + } } nomore: if (error) { @@ -22488,14 +24101,10 @@ enobufs: if (sendalot) { /* Do we need to turn off sendalot? */ - if (rack->r_ctl.rc_pace_max_segs && - (tot_len_this_send >= rack->r_ctl.rc_pace_max_segs)) { + if (pace_max_seg && + (tot_len_this_send >= pace_max_seg)) { /* We hit our max. */ sendalot = 0; - } else if ((rack->rc_user_set_max_segs) && - (tot_len_this_send >= (rack->rc_user_set_max_segs * segsiz))) { - /* We hit the user defined max */ - sendalot = 0; } } if ((error == 0) && (flags & TH_FIN)) @@ -22515,22 +24124,7 @@ * hit the else if with slot preset. Other * errors return. */ - slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz); - } - if (rsm && - (rsm->r_flags & RACK_HAS_SYN) == 0 && - rack->use_rack_rr) { - /* Its a retransmit and we use the rack cheat? */ - if ((slot == 0) || - (rack->rc_always_pace == 0) || - (rack->r_rr_config == 1)) { - /* - * We have no pacing set or we - * are using old-style rack or - * we are overridden to use the old 1ms pacing. - */ - slot = rack->r_ctl.rc_min_to; - } + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send, rsm, segsiz, __LINE__); } /* We have sent clear the flag */ rack->r_ent_rec_ns = 0; @@ -22568,9 +24162,9 @@ rack_use_rfo && ((flags & (TH_SYN|TH_FIN)) == 0) && (rsm == NULL) && - (tp->snd_nxt == tp->snd_max) && (ipoptlen == 0) && (tp->rcv_numsacks == 0) && + (rack->rc_policer_detected == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && ((IN_RECOVERY(tp->t_flags)) == 0) && @@ -22599,7 +24193,6 @@ (rsm == NULL) && (ipoptlen == 0) && (tp->rcv_numsacks == 0) && - (tp->snd_nxt == tp->snd_max) && (rack->r_must_retran == 0) && rack->r_fsb_inited && TCPS_HAVEESTABLISHED(tp->t_state) && @@ -22625,8 +24218,8 @@ } goto again; } - /* Assure when we leave that snd_nxt will point to top */ skip_all_send: + /* Assure when we leave that snd_nxt will point to top */ if (SEQ_GT(tp->snd_max, tp->snd_nxt)) tp->snd_nxt = tp->snd_max; rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); @@ -22705,14 +24298,26 @@ static int rack_set_dgp(struct tcp_rack *rack) { - /* pace_always=1 */ - if (rack->rc_always_pace == 0) { - if (tcp_can_enable_pacing() == 0) - return (EBUSY); + if (rack->dgp_on == 1) + return(0); + if ((rack->use_fixed_rate == 1) && + (rack->rc_always_pace == 1)) { + /* + * We are already pacing another + * way. 
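Both PCM branches above implement the same idea: a path-capacity measurement only means anything if the whole burst goes out, so when TSO trimming left the send more than a segment short, the remainder is recomputed and the code loops back to send again. A small sketch of that decision, with pcm_blast standing in for the rack_pcm_blast knob and the names otherwise illustrative:

#include <stdint.h>
#include <stdbool.h>

/*
 * After a PCM-tagged transmit, decide whether another pass is
 * needed to complete the burst.  Returns the length to send next,
 * or 0 when the burst is considered complete.
 */
static uint32_t
pcm_remaining(uint32_t wanted, uint32_t sent, uint32_t pace_max_seg,
    uint32_t segsiz, bool pcm_blast)
{
	if (sent >= wanted)
		return (0);
	if (pcm_blast) {
		/* Blast mode: keep going while a full segment is left. */
		return ((wanted - sent) > segsiz ? wanted - sent : 0);
	}
	/* Default: only respin while well short of the pacing maximum. */
	if (sent < pace_max_seg && (pace_max_seg - sent) > segsiz)
		return (wanted - sent);
	return (0);
}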
+ */ + return (EBUSY); + } + if (rack->rc_always_pace == 1) { + rack_remove_pacing(rack); } + if (tcp_incr_dgp_pacing_cnt() == 0) + return (ENOSPC); + rack->r_ctl.pacing_method |= RACK_DGP_PACING; rack->rc_fillcw_apply_discount = 0; rack->dgp_on = 1; rack->rc_always_pace = 1; + rack->rc_pace_dnd = 1; rack->use_fixed_rate = 0; if (rack->gp_ready) rack_set_cc_pacing(rack); @@ -22737,14 +24342,7 @@ /* npush=2 */ rack->r_ctl.rc_no_push_at_mrtt = 2; /* fillcw=1 */ - if (rack->r_cwnd_was_clamped == 0) { - rack->rc_pace_to_cwnd = 1; - } else { - rack->rc_pace_to_cwnd = 0; - /* Reset all multipliers to 100.0 so just the measured bw */ - rack->r_ctl.rack_per_of_gp_ss = 100; - rack->r_ctl.rack_per_of_gp_ca = 100; - } + rack->rc_pace_to_cwnd = 1; rack->rc_pace_fill_if_rttin_range = 0; rack->rtt_limit_mul = 0; /* noprr=1 */ @@ -22753,12 +24351,9 @@ rack->r_limit_scw = 1; /* gp_inc_rec */ rack->r_ctl.rack_per_of_gp_rec = 90; - rack_client_buffer_level_set(rack); return (0); } - - static int rack_set_profile(struct tcp_rack *rack, int prof) { @@ -22768,72 +24363,37 @@ * Profile 1 is "standard" DGP. It ignores * client buffer level. */ - rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL0; err = rack_set_dgp(rack); if (err) return (err); - } else if (prof == 2) { - /* - * Profile 2 is DGP. Less aggressive with - * respect to client buffer level. - */ - rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL1; + } else if (prof == 6) { err = rack_set_dgp(rack); if (err) return (err); - } else if (prof == 3) { /* - * Profile 3 is DGP. Even Less aggressive with - * respect to client buffer level. - */ - rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL2; - err = rack_set_dgp(rack); - if (err) - return (err); - } else if (prof == 4) { - /* - * Profile 4 is DGP with the most responsiveness - * to client buffer level. - */ - rack->r_ctl.rc_dgp_bl_agg = DGP_LEVEL3; - err = rack_set_dgp(rack); - if (err) - return (err); - } else if (prof == 5) { - err = rack_set_dgp(rack); - if (err) - return (err); - /* - * By turning DGP off we change the rate - * picked to be only the one the cwnd and rtt - * get us. - */ - rack->dgp_on = 0; - } else if (prof == 6) { - err = rack_set_dgp(rack); - if (err) - return (err); - /* - * Profile 6 tweaks DGP so that it will apply to - * fill-cw the same settings that profile5 does - * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). + * Profile 6 tweaks DGP so that it will apply to + * fill-cw the same settings that profile5 does + * to replace DGP. It gets then the max(dgp-rate, fillcw(discounted). 
*/ rack->rc_fillcw_apply_discount = 1; } else if (prof == 0) { /* This changes things back to the default settings */ - rack->dgp_on = 0; - rack->rc_hybrid_mode = 0; + if (rack->rc_always_pace == 1) { + rack_remove_pacing(rack); + } else { + /* Make sure any stray flags are off */ + rack->dgp_on = 0; + rack->rc_hybrid_mode = 0; + rack->use_fixed_rate = 0; + } err = 0; if (rack_fill_cw_state) rack->rc_pace_to_cwnd = 1; else rack->rc_pace_to_cwnd = 0; - if (rack->rc_always_pace) { - tcp_decrement_paced_conn(); - rack_undo_cc_pacing(rack); - rack->rc_always_pace = 0; - } + if (rack_pace_every_seg && tcp_can_enable_pacing()) { + rack->r_ctl.pacing_method |= RACK_REG_PACING; rack->rc_always_pace = 1; if (rack->rack_hibeta) rack_set_cc_pacing(rack); @@ -22883,7 +24443,6 @@ } rack->r_rr_config = 0; rack->r_ctl.rc_no_push_at_mrtt = 0; - rack->rc_pace_to_cwnd = 0; rack->rc_pace_fill_if_rttin_range = 0; rack->rtt_limit_mul = 0; @@ -22911,7 +24470,7 @@ struct deferred_opt_list *dol; dol = malloc(sizeof(struct deferred_opt_list), - M_TCPFSB, M_NOWAIT|M_ZERO); + M_TCPDO, M_NOWAIT|M_ZERO); if (dol == NULL) { /* * No space yikes -- fail out.. @@ -22935,19 +24494,6 @@ microuptime(&tv); - /* - * If BB logging is not on we need to look at the DTL flag. - * If its on already then those reasons override the DTL input. - * We do this with any request, you can turn DTL on, but it does - * not turn off at least from hybrid pacing requests. - */ - if (tcp_bblogging_on(rack->rc_tp) == 0) { - if (hybrid->hybrid_flags & TCP_HYBRID_PACING_DTL) { - /* Turn on BB point logging */ - tcp_set_bblog_state(rack->rc_tp, TCP_LOG_VIA_BBPOINTS, - TCP_BBPOINT_REQ_LEVEL_LOGGING); - } - } /* Make sure no fixed rate is on */ rack->use_fixed_rate = 0; rack->r_ctl.rc_fixed_pacing_rate_rec = 0; @@ -22962,6 +24508,8 @@ rack_log_hybrid(rack, seq, NULL, HYBRID_LOG_NO_ROOM, __LINE__, 0); return (ENOSPC); } + /* mask our internal flags */ + hybrid->hybrid_flags &= TCP_HYBRID_PACING_USER_MASK; /* The seq will be snd_una + everything in the buffer */ seq = sft->start_seq; if ((hybrid->hybrid_flags & TCP_HYBRID_PACING_ENABLE) == 0) { @@ -22986,6 +24534,26 @@ return (err); } } + /* + * Now we must switch to hybrid mode as well which also + * means moving to regular pacing. + */ + if (rack->rc_hybrid_mode == 0) { + /* First time */ + if (tcp_can_enable_pacing()) { + rack->r_ctl.pacing_method |= RACK_REG_PACING; + rack->rc_hybrid_mode = 1; + } else { + return (ENOSPC); + } + if (rack->r_ctl.pacing_method & RACK_DGP_PACING) { + /* + * This should be true. + */ + tcp_dec_dgp_pacing_cnt(); + rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; + } + } /* Now set in our flags */ sft->hybrid_flags = hybrid->hybrid_flags | TCP_HYBRID_PACING_WASSET; if (hybrid->hybrid_flags & TCP_HYBRID_PACING_CSPR) @@ -22996,7 +24564,6 @@ sft->hint_maxseg = hybrid->hint_maxseg; else sft->hint_maxseg = 0; - rack->rc_hybrid_mode = 1; rack->rc_tp->tcp_hybrid_start++; rack_log_hybrid(rack, seq, sft, HYBRID_LOG_RULES_SET, __LINE__,0); return (0); @@ -23005,6 +24572,36 @@ #endif } +static int +rack_stack_information(struct tcpcb *tp, struct stack_specific_info *si) +{ + /* + * Gather rack specific information. 
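Enabling hybrid pacing on a connection that was counted against the DGP pacer pool has to move the accounting: take a slot in the regular pacing count first, and only then release the DGP slot and clear RACK_DGP_PACING. The same hand-off appears again below for TCP_PACING_RATE_CAP and TCP_RACK_PACE_MAX_SEG. A sketch of the pattern with illustrative counter helpers (the real ones are tcp_can_enable_pacing() and tcp_dec_dgp_pacing_cnt()):

#include <stdint.h>
#include <stdbool.h>

#define PACING_DGP	0x01	/* counted in the DGP pool */
#define PACING_REG	0x02	/* counted in the regular pool */

static bool
reg_pool_acquire(void)
{
	return (true);		/* pretend the regular pool has room */
}

static void
dgp_pool_release(void)
{
}

/*
 * Move a connection from DGP-pool accounting to regular-pool
 * accounting.  Acquire before release, so a failure leaves the
 * original accounting untouched.
 */
static int
switch_dgp_to_regular(uint8_t *pacing_method)
{
	if ((*pacing_method & PACING_DGP) == 0)
		return (0);		/* nothing to move */
	if (!reg_pool_acquire())
		return (-1);		/* pool full, keep DGP accounting */
	*pacing_method |= PACING_REG;
	dgp_pool_release();
	*pacing_method &= ~PACING_DGP;
	return (0);
}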
+ */ + struct tcp_rack *rack; + + rack = (struct tcp_rack *)tp->t_fb_ptr; + /* We pulled a SSI info log out what was there */ + policer_detection_log(rack, rack->rc_highly_buffered, 0, 0, 0, 20); + if (rack->policer_detect_on) { + si->policer_detection_enabled = 1; + if (rack->rc_policer_detected) { + si->policer_detected = 1; + si->policer_bucket_size = rack->r_ctl.policer_bucket_size; + si->policer_last_bw = rack->r_ctl.policer_bw; + } else { + si->policer_detected = 0; + si->policer_bucket_size = 0; + si->policer_last_bw = 0; + } + si->current_round = rack->r_ctl.current_round; + si->highly_buffered = rack->rc_highly_buffered; + } + si->bytes_transmitted = tp->t_sndbytes; + si->bytes_retransmitted = tp->t_snd_rxt_bytes; + return (0); +} + static int rack_process_option(struct tcpcb *tp, struct tcp_rack *rack, int sopt_name, uint32_t optval, uint64_t loptval, struct tcp_hybrid_req *hybrid) @@ -23077,34 +24674,7 @@ } break; case TCP_RACK_PACING_BETA: - RACK_OPTS_INC(tcp_rack_beta); - if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) { - /* This only works for newreno. */ - error = EINVAL; - break; - } - if (rack->rc_pacing_cc_set) { - /* - * Set them into the real CC module - * whats in the rack pcb is the old values - * to be used on restoral/ - */ - sopt.sopt_dir = SOPT_SET; - opt.name = CC_NEWRENO_BETA; - opt.val = optval; - if (CC_ALGO(tp)->ctl_output != NULL) - error = CC_ALGO(tp)->ctl_output(&tp->t_ccv, &sopt, &opt); - else { - error = ENOENT; - break; - } - } else { - /* - * Not pacing yet so set it into our local - * rack pcb storage. - */ - rack->r_ctl.rc_saved_beta.beta = optval; - } + error = EINVAL; break; case TCP_RACK_TIMER_SLOP: RACK_OPTS_INC(tcp_rack_timer_slop); @@ -23188,8 +24758,29 @@ else rack->r_up_only = 0; break; + case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ + RACK_OPTS_INC(tcp_fillcw_rate_cap); + rack->r_ctl.fillcw_cap = loptval; + break; case TCP_PACING_RATE_CAP: RACK_OPTS_INC(tcp_pacing_rate_cap); + if ((rack->dgp_on == 1) && + (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { + /* + * If we are doing DGP we need to switch + * to using the pacing limit. + */ + if (tcp_can_enable_pacing() == 0) { + error = ENOSPC; + break; + } + /* + * Now change up the flags and counts to be correct. 
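TCP_FILLCW_RATE_CAP above stores loptval, i.e. it is a 64-bit bytes-per-second cap on DGP's fill-cw rate; the option-processing code later in this patch copies in a uint64_t for it, alongside TCP_PACING_RATE_CAP. A minimal userspace sketch of setting it, assuming headers from a tree with this patch applied and a connection already switched to the rack stack:

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>	/* needs the patched header for TCP_FILLCW_RATE_CAP */
#include <stdint.h>
#include <stdio.h>

/*
 * Cap fill-cw at the given rate on an already connected socket
 * 'fd'.  The option takes a 64-bit bytes-per-second value, e.g.
 * 12500000 for roughly 100 Mbit/s.
 */
static int
set_fillcw_cap(int fd, uint64_t bytes_per_sec)
{
	if (setsockopt(fd, IPPROTO_TCP, TCP_FILLCW_RATE_CAP,
	    &bytes_per_sec, sizeof(bytes_per_sec)) == -1) {
		perror("setsockopt(TCP_FILLCW_RATE_CAP)");
		return (-1);
	}
	return (0);
}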
+ */ + rack->r_ctl.pacing_method |= RACK_REG_PACING; + tcp_dec_dgp_pacing_cnt(); + rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; + } rack->r_ctl.bw_rate_cap = loptval; break; case TCP_HYBRID_PACING: @@ -23197,8 +24788,18 @@ error = EINVAL; break; } + if (rack->r_ctl.side_chan_dis_mask & HYBRID_DIS_MASK) { + error = EPERM; + break; + } error = process_hybrid_pacing(rack, hybrid); break; + case TCP_SIDECHAN_DIS: /* URL:scodm */ + if (optval) + rack->r_ctl.side_chan_dis_mask = optval; + else + rack->r_ctl.side_chan_dis_mask = 0; + break; case TCP_RACK_PROFILE: RACK_OPTS_INC(tcp_profile); error = rack_set_profile(rack, optval); @@ -23224,15 +24825,37 @@ rack->r_limit_scw = 0; break; case TCP_RACK_DGP_IN_REC: - RACK_OPTS_INC(tcp_dgp_in_rec); - if (optval) - rack->r_ctl.full_dgp_in_rec = 1; - else - rack->r_ctl.full_dgp_in_rec = 0; + error = EINVAL; + break; + case TCP_POLICER_DETECT: /* URL:pol_det */ + RACK_OPTS_INC(tcp_pol_detect); + rack_translate_policer_detect(rack, optval); break; - case TCP_RXT_CLAMP: - RACK_OPTS_INC(tcp_rxt_clamp); - rack_translate_clamp_value(rack, optval); + case TCP_POLICER_MSS: + RACK_OPTS_INC(tcp_pol_mss); + rack->r_ctl.policer_del_mss = (uint8_t)optval; + if (optval & 0x00000100) { + /* + * Value is setup like so: + * VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM + * Where MMMM MMMM is MSS setting + * I (9th bit) is the Postive value that + * says it is being set (if its 0 then the + * upper bits 11 - 32 have no meaning. + * This allows setting it off with + * 0x000001MM. + * + * The 10th bit is used to turn on the + * alternate median (not the expanded one). + * + */ + rack->r_ctl.pol_bw_comp = (optval >> 10); + } + if (optval & 0x00000200) { + rack->r_ctl.policer_alt_median = 1; + } else { + rack->r_ctl.policer_alt_median = 0; + } break; case TCP_RACK_PACE_TO_FILL: RACK_OPTS_INC(tcp_fillcw); @@ -23240,8 +24863,6 @@ rack->rc_pace_to_cwnd = 0; else { rack->rc_pace_to_cwnd = 1; - if (optval > 1) - rack->r_fill_less_agg = 1; } if ((optval >= rack_gp_rtt_maxmul) && rack_gp_rtt_maxmul && @@ -23299,6 +24920,12 @@ else error = EINVAL; break; + case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ + if (optval > 0) + rack->cspr_is_fcc = 1; + else + rack->cspr_is_fcc = 0; + break; case TCP_TIMELY_DYN_ADJ: RACK_OPTS_INC(tcp_timely_dyn); if (optval == 0) @@ -23341,11 +24968,16 @@ * method using a pacing rate. */ RACK_OPTS_INC(tcp_rack_pace_always); + if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { + error = EPERM; + break; + } if (optval > 0) { if (rack->rc_always_pace) { error = EALREADY; break; } else if (tcp_can_enable_pacing()) { + rack->r_ctl.pacing_method |= RACK_REG_PACING; rack->rc_always_pace = 1; if (rack->rack_hibeta) rack_set_cc_pacing(rack); @@ -23355,10 +24987,8 @@ break; } } else { - if (rack->rc_always_pace) { - tcp_decrement_paced_conn(); - rack->rc_always_pace = 0; - rack_undo_cc_pacing(rack); + if (rack->rc_always_pace == 1) { + rack_remove_pacing(rack); } } if (rack->r_mbuf_queue || rack->rc_always_pace || rack->r_use_cmp_ack) @@ -23375,58 +25005,11 @@ val *= 1000; val /= 8; rack->r_ctl.init_rate = val; - if (rack->rc_init_win != rack_default_init_window) { - uint32_t win, snt; - - /* - * Options don't always get applied - * in the order you think. So in order - * to assure we update a cwnd we need - * to check and see if we are still - * where we should raise the cwnd. 
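The TCP_POLICER_MSS comment above packs three things into one optval: the low byte is the MSS count required during recovery, the 0x100 bit ("I") says the upper bits are meaningful, the 0x200 bit ("A") selects the alternate median, and everything from bit 10 up is the bandwidth-compensation value. A standalone decode sketch mirroring that handling:

#include <stdint.h>

struct policer_mss_opt {
	uint8_t  del_mss;	/* MSS count required during recovery */
	uint8_t  alt_median;	/* use the alternate (non-expanded) median */
	uint32_t bw_comp;	/* bandwidth compensation, bits 10 and up */
};

/*
 * Decode an optval laid out as
 *   VVVV VVVV VVVV VVVV VVVV VVAI MMMM MMMM
 * where M is the MSS byte, I (0x100) marks the upper bits as valid
 * and A (0x200) selects the alternate median.
 */
static struct policer_mss_opt
decode_policer_mss(uint32_t optval)
{
	struct policer_mss_opt o;

	o.del_mss = (uint8_t)optval;
	o.bw_comp = (optval & 0x00000100) ? (optval >> 10) : 0;
	o.alt_median = (optval & 0x00000200) ? 1 : 0;
	return (o);
}
/* Example: 0x0000350a gives del_mss 10, I set, A clear, bw_comp 0xd. */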
- */ - win = rc_init_window(rack); - if (SEQ_GT(tp->snd_max, tp->iss)) - snt = tp->snd_max - tp->iss; - else - snt = 0; - if ((snt < win) && - (tp->snd_cwnd < win)) - tp->snd_cwnd = win; - } if (rack->rc_always_pace) rack_update_seg(rack); break; case TCP_BBR_IWINTSO: - RACK_OPTS_INC(tcp_initial_win); - if (optval && (optval <= 0xff)) { - uint32_t win, snt; - - rack->rc_init_win = optval; - win = rc_init_window(rack); - if (SEQ_GT(tp->snd_max, tp->iss)) - snt = tp->snd_max - tp->iss; - else - snt = 0; - if ((snt < win) && - (tp->t_srtt | - rack->r_ctl.init_rate)) { - /* - * We are not past the initial window - * and we have some bases for pacing, - * so we need to possibly adjust up - * the cwnd. Note even if we don't set - * the cwnd, its still ok to raise the rc_init_win - * which can be used coming out of idle when we - * would have a rate. - */ - if (tp->snd_cwnd < win) - tp->snd_cwnd = win; - } - if (rack->rc_always_pace) - rack_update_seg(rack); - } else - error = EINVAL; + error = EINVAL; break; case TCP_RACK_FORCE_MSEG: RACK_OPTS_INC(tcp_rack_force_max_seg); @@ -23443,6 +25026,24 @@ case TCP_RACK_PACE_MAX_SEG: /* Max segments size in a pace in bytes */ RACK_OPTS_INC(tcp_rack_max_seg); + if ((rack->dgp_on == 1) && + (rack->r_ctl.pacing_method & RACK_DGP_PACING)) { + /* + * If we set a max-seg and are doing DGP then + * we now fall under the pacing limits not the + * DGP ones. + */ + if (tcp_can_enable_pacing() == 0) { + error = ENOSPC; + break; + } + /* + * Now change up the flags and counts to be correct. + */ + rack->r_ctl.pacing_method |= RACK_REG_PACING; + tcp_dec_dgp_pacing_cnt(); + rack->r_ctl.pacing_method &= ~RACK_DGP_PACING; + } if (optval <= MAX_USER_SET_SEG) rack->rc_user_set_max_segs = optval; else @@ -23452,6 +25053,18 @@ case TCP_RACK_PACE_RATE_REC: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_rec); + if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { + error = EPERM; + break; + } + if (rack->dgp_on) { + /* + * We are already pacing another + * way. + */ + error = EBUSY; + break; + } rack->r_ctl.rc_fixed_pacing_rate_rec = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) rack->r_ctl.rc_fixed_pacing_rate_ca = optval; @@ -23470,6 +25083,18 @@ case TCP_RACK_PACE_RATE_SS: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_ss); + if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { + error = EPERM; + break; + } + if (rack->dgp_on) { + /* + * We are already pacing another + * way. + */ + error = EBUSY; + break; + } rack->r_ctl.rc_fixed_pacing_rate_ss = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ca == 0) rack->r_ctl.rc_fixed_pacing_rate_ca = optval; @@ -23488,6 +25113,18 @@ case TCP_RACK_PACE_RATE_CA: /* Set the fixed pacing rate in Bytes per second ca */ RACK_OPTS_INC(tcp_rack_pace_rate_ca); + if (rack->r_ctl.side_chan_dis_mask & CCSP_DIS_MASK) { + error = EPERM; + break; + } + if (rack->dgp_on) { + /* + * We are already pacing another + * way. 
+ */ + error = EBUSY; + break; + } rack->r_ctl.rc_fixed_pacing_rate_ca = optval; if (rack->r_ctl.rc_fixed_pacing_rate_ss == 0) rack->r_ctl.rc_fixed_pacing_rate_ss = optval; @@ -23571,6 +25208,41 @@ rack->r_rack_hw_rate_caps = 0; } break; + case TCP_DGP_UPPER_BOUNDS: + { + uint8_t val; + val = optval & 0x0000ff; + rack->r_ctl.rack_per_upper_bound_ca = val; + val = (optval >> 16) & 0x0000ff; + rack->r_ctl.rack_per_upper_bound_ss = val; + break; + } + case TCP_SS_EEXIT: /* URL:eexit */ + if (optval > 0) { + rack->r_ctl.gp_rnd_thresh = optval & 0x0ff; + if (optval & 0x10000) { + rack->r_ctl.gate_to_fs = 1; + } else { + rack->r_ctl.gate_to_fs = 0; + } + if (optval & 0x20000) { + rack->r_ctl.use_gp_not_last = 1; + } else { + rack->r_ctl.use_gp_not_last = 0; + } + if (optval & 0xfffc0000) { + uint32_t v; + + v = (optval >> 18) & 0x00003fff; + if (v >= 1000) + rack->r_ctl.gp_gain_req = v; + } + } else { + /* We do not do ss early exit at all */ + rack->rc_initial_ss_comp = 1; + rack->r_ctl.gp_rnd_thresh = 0; + } + break; case TCP_RACK_SPLIT_LIMIT: RACK_OPTS_INC(tcp_split_limit); rack->r_ctl.rc_split_limit = optval; @@ -23681,6 +25353,50 @@ else rack->r_ctl.rc_rate_sample_method = optval; break; + case TCP_HONOR_HPTS_MIN: + RACK_OPTS_INC(tcp_honor_hpts); + if (optval) { + rack->r_use_hpts_min = 1; + /* + * Must be between 2 - 80% to be a reduction else + * we keep the default (10%). + */ + if ((optval > 1) && (optval <= 80)) { + rack->r_ctl.max_reduction = optval; + } + } else + rack->r_use_hpts_min = 0; + break; + case TCP_REC_IS_DYN: /* URL:dynrec */ + RACK_OPTS_INC(tcp_dyn_rec); + if (optval) + rack->rc_gp_no_rec_chg = 1; + else + rack->rc_gp_no_rec_chg = 0; + break; + case TCP_NO_TIMELY: + RACK_OPTS_INC(tcp_notimely); + if (optval) { + rack->rc_skip_timely = 1; + rack->r_ctl.rack_per_of_gp_rec = 90; + rack->r_ctl.rack_per_of_gp_ca = 100; + rack->r_ctl.rack_per_of_gp_ss = 250; + } else { + rack->rc_skip_timely = 0; + } + break; + case TCP_GP_USE_LTBW: + if (optval == 0) { + rack->use_lesser_lt_bw = 0; + rack->dis_lt_bw = 1; + } else if (optval == 1) { + rack->use_lesser_lt_bw = 1; + rack->dis_lt_bw = 0; + } else if (optval == 2) { + rack->use_lesser_lt_bw = 0; + rack->dis_lt_bw = 0; + } + break; case TCP_DATA_AFTER_CLOSE: RACK_OPTS_INC(tcp_data_after_close); if (optval) @@ -23695,6 +25411,431 @@ return (error); } +static void +rack_inherit(struct tcpcb *tp, struct inpcb *parent) +{ + /* + * A new connection has been created (tp) and + * the parent is the inpcb given. We want to + * apply a read-lock to the parent (we are already + * holding a write lock on the tp) and copy anything + * out of the rack specific data as long as its tfb is + * the same as ours i.e. we are the same stack. Otherwise + * we just return. + */ + struct tcpcb *par; + struct tcp_rack *dest, *src; + int cnt = 0; + + par = intotcpcb(parent); + if (par->t_fb != tp->t_fb) { + /* Not the same stack */ + tcp_log_socket_option(tp, 0, 0, 1); + return; + } + /* Ok if we reach here lets setup the two rack pointers */ + dest = (struct tcp_rack *)tp->t_fb_ptr; + src = (struct tcp_rack *)par->t_fb_ptr; + if ((src == NULL) || (dest == NULL)) { + /* Huh? */ + tcp_log_socket_option(tp, 0, 0, 2); + return; + } + /* Now copy out anything we wish to inherit i.e. 
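TCP_SS_EEXIT above packs the early slow-start-exit controls into one word: bits 0-7 give the round threshold, 0x10000 sets gate_to_fs, 0x20000 sets use_gp_not_last, and bits 18 and up carry the goodput-gain requirement (only stored when it is 1000 or more); an optval of 0 disables early exit entirely. An encode sketch following the setter shown here (worth noting that the getsockopt side later in the patch shifts the gain by 17 and does not report the 0x20000 bit, so get and set are not exact mirrors):

#include <stdint.h>
#include <stdbool.h>

/*
 * Build a TCP_SS_EEXIT optval:
 *   bits  0-7    round threshold (gp_rnd_thresh)
 *   bit  0x10000 gate_to_fs
 *   bit  0x20000 use_gp_not_last
 *   bits 18-31   goodput gain requirement (only honored when >= 1000)
 * Passing 0 instead disables early slow-start exit.
 */
static uint32_t
encode_ss_eexit(uint8_t rnd_thresh, bool gate_to_fs, bool use_gp_not_last,
    uint32_t gain_req)
{
	uint32_t optval;

	optval = rnd_thresh;
	if (gate_to_fs)
		optval |= 0x10000;
	if (use_gp_not_last)
		optval |= 0x20000;
	optval |= (gain_req & 0x3fff) << 18;
	return (optval);
}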
things in socket-options */ + /* TCP_RACK_PROFILE we can't know but we can set DGP if its on */ + if ((src->dgp_on) && (dest->dgp_on == 0)) { + /* Profile 1 had to be set via sock opt */ + rack_set_dgp(dest); + cnt++; + } + /* TCP_RACK_SET_RXT_OPTIONS */ + if (dest->full_size_rxt != src->full_size_rxt) { + dest->full_size_rxt = src->full_size_rxt; + cnt++; + } + if (dest->shape_rxt_to_pacing_min != src->shape_rxt_to_pacing_min) { + dest->shape_rxt_to_pacing_min = src->shape_rxt_to_pacing_min; + cnt++; + } + /* TCP_RACK_DSACK_OPT */ + if (dest->rc_rack_tmr_std_based != src->rc_rack_tmr_std_based) { + dest->rc_rack_tmr_std_based = src->rc_rack_tmr_std_based; + cnt++; + } + if (dest->rc_rack_use_dsack != src->rc_rack_use_dsack) { + dest->rc_rack_use_dsack = src->rc_rack_use_dsack; + cnt++; + } + /* TCP_RACK_PACING_DIVISOR */ + if (dest->r_ctl.pace_len_divisor != src->r_ctl.pace_len_divisor) { + dest->r_ctl.pace_len_divisor = src->r_ctl.pace_len_divisor; + cnt++; + } + /* TCP_RACK_HI_BETA */ + if (src->rack_hibeta != dest->rack_hibeta) { + cnt++; + if (src->rack_hibeta) { + dest->r_ctl.rc_saved_beta.beta = src->r_ctl.rc_saved_beta.beta; + dest->rack_hibeta = 1; + } else { + dest->rack_hibeta = 0; + } + } + /* TCP_RACK_TIMER_SLOP */ + if (dest->r_ctl.timer_slop != src->r_ctl.timer_slop) { + dest->r_ctl.timer_slop = src->r_ctl.timer_slop; + cnt++; + } + /* TCP_RACK_PACING_BETA_ECN */ + if (dest->r_ctl.rc_saved_beta.beta_ecn != src->r_ctl.rc_saved_beta.beta_ecn) { + dest->r_ctl.rc_saved_beta.beta_ecn = src->r_ctl.rc_saved_beta.beta_ecn; + cnt++; + } + if (dest->r_ctl.rc_saved_beta.newreno_flags != src->r_ctl.rc_saved_beta.newreno_flags) { + dest->r_ctl.rc_saved_beta.newreno_flags = src->r_ctl.rc_saved_beta.newreno_flags; + cnt++; + } + /* We do not do TCP_DEFER_OPTIONS */ + /* TCP_RACK_MEASURE_CNT */ + if (dest->r_ctl.req_measurements != src->r_ctl.req_measurements) { + dest->r_ctl.req_measurements = src->r_ctl.req_measurements; + cnt++; + } + /* TCP_HDWR_UP_ONLY */ + if (dest->r_up_only != src->r_up_only) { + dest->r_up_only = src->r_up_only; + cnt++; + } + /* TCP_FILLCW_RATE_CAP */ + if (dest->r_ctl.fillcw_cap != src->r_ctl.fillcw_cap) { + dest->r_ctl.fillcw_cap = src->r_ctl.fillcw_cap; + cnt++; + } + /* TCP_PACING_RATE_CAP */ + if (dest->r_ctl.bw_rate_cap != src->r_ctl.bw_rate_cap) { + dest->r_ctl.bw_rate_cap = src->r_ctl.bw_rate_cap; + cnt++; + } + /* A listener can't set TCP_HYBRID_PACING */ + /* TCP_SIDECHAN_DIS */ + if (dest->r_ctl.side_chan_dis_mask != src->r_ctl.side_chan_dis_mask) { + dest->r_ctl.side_chan_dis_mask = src->r_ctl.side_chan_dis_mask; + cnt++; + } + /* TCP_SHARED_CWND_TIME_LIMIT */ + if (dest->r_limit_scw != src->r_limit_scw) { + dest->r_limit_scw = src->r_limit_scw; + cnt++; + } + /* TCP_POLICER_DETECT */ + if (dest->r_ctl.policer_rxt_threshold != src->r_ctl.policer_rxt_threshold) { + dest->r_ctl.policer_rxt_threshold = src->r_ctl.policer_rxt_threshold; + cnt++; + } + if (dest->r_ctl.policer_avg_threshold != src->r_ctl.policer_avg_threshold) { + dest->r_ctl.policer_avg_threshold = src->r_ctl.policer_avg_threshold; + cnt++; + } + if (dest->r_ctl.policer_med_threshold != src->r_ctl.policer_med_threshold) { + dest->r_ctl.policer_med_threshold = src->r_ctl.policer_med_threshold; + cnt++; + } + if (dest->policer_detect_on != src->policer_detect_on) { + dest->policer_detect_on = src->policer_detect_on; + cnt++; + } + + if (dest->r_ctl.saved_policer_val != src->r_ctl.saved_policer_val) { + dest->r_ctl.saved_policer_val = src->r_ctl.saved_policer_val; + cnt++; + } + /* 
TCP_POLICER_MSS */ + if (dest->r_ctl.policer_del_mss != src->r_ctl.policer_del_mss) { + dest->r_ctl.policer_del_mss = src->r_ctl.policer_del_mss; + cnt++; + } + + if (dest->r_ctl.pol_bw_comp != src->r_ctl.pol_bw_comp) { + dest->r_ctl.pol_bw_comp = src->r_ctl.pol_bw_comp; + cnt++; + } + + if (dest->r_ctl.policer_alt_median != src->r_ctl.policer_alt_median) { + dest->r_ctl.policer_alt_median = src->r_ctl.policer_alt_median; + cnt++; + } + /* TCP_RACK_PACE_TO_FILL */ + if (dest->rc_pace_to_cwnd != src->rc_pace_to_cwnd) { + dest->rc_pace_to_cwnd = src->rc_pace_to_cwnd; + cnt++; + } + if (dest->rc_pace_fill_if_rttin_range != src->rc_pace_fill_if_rttin_range) { + dest->rc_pace_fill_if_rttin_range = src->rc_pace_fill_if_rttin_range; + cnt++; + } + if (dest->rtt_limit_mul != src->rtt_limit_mul) { + dest->rtt_limit_mul = src->rtt_limit_mul; + cnt++; + } + /* TCP_RACK_NO_PUSH_AT_MAX */ + if (dest->r_ctl.rc_no_push_at_mrtt != src->r_ctl.rc_no_push_at_mrtt) { + dest->r_ctl.rc_no_push_at_mrtt = src->r_ctl.rc_no_push_at_mrtt; + cnt++; + } + /* TCP_SHARED_CWND_ENABLE */ + if (dest->rack_enable_scwnd != src->rack_enable_scwnd) { + dest->rack_enable_scwnd = src->rack_enable_scwnd; + cnt++; + } + /* TCP_USE_CMP_ACKS */ + if (dest->r_use_cmp_ack != src->r_use_cmp_ack) { + dest->r_use_cmp_ack = src->r_use_cmp_ack; + cnt++; + } + + if (dest->r_mbuf_queue != src->r_mbuf_queue) { + dest->r_mbuf_queue = src->r_mbuf_queue; + cnt++; + } + /* TCP_RACK_MBUF_QUEUE */ + if (dest->r_mbuf_queue != src->r_mbuf_queue) { + dest->r_mbuf_queue = src->r_mbuf_queue; + cnt++; + } + if (dest->r_mbuf_queue || dest->rc_always_pace || dest->r_use_cmp_ack) { + tp->t_flags2 |= TF2_SUPPORTS_MBUFQ; + } else { + tp->t_flags2 &= ~TF2_SUPPORTS_MBUFQ; + } + if (dest->r_use_cmp_ack && TCPS_HAVEESTABLISHED(tp->t_state)) { + tp->t_flags2 |= TF2_MBUF_ACKCMP; + } + /* TCP_RACK_NONRXT_CFG_RATE */ + if (dest->rack_rec_nonrxt_use_cr != src->rack_rec_nonrxt_use_cr) { + dest->rack_rec_nonrxt_use_cr = src->rack_rec_nonrxt_use_cr; + cnt++; + } + /* TCP_NO_PRR */ + if (dest->rack_no_prr != src->rack_no_prr) { + dest->rack_no_prr = src->rack_no_prr; + cnt++; + } + if (dest->no_prr_addback != src->no_prr_addback) { + dest->no_prr_addback = src->no_prr_addback; + cnt++; + } + /* RACK_CSPR_IS_FCC */ + if (dest->cspr_is_fcc != src->cspr_is_fcc) { + dest->cspr_is_fcc = src->cspr_is_fcc; + cnt++; + } + /* TCP_TIMELY_DYN_ADJ */ + if (dest->rc_gp_dyn_mul != src->rc_gp_dyn_mul) { + dest->rc_gp_dyn_mul = src->rc_gp_dyn_mul; + cnt++; + } + if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { + dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; + cnt++; + } + /* TCP_RACK_DO_DETECTION */ + if (dest->do_detection != src->do_detection) { + dest->do_detection = src->do_detection; + cnt++; + } + /* TCP_RACK_TLP_USE */ + if (dest->rack_tlp_threshold_use != src->rack_tlp_threshold_use) { + dest->rack_tlp_threshold_use = src->rack_tlp_threshold_use; + cnt++; + } + /* we don't allow inheritence of TCP_RACK_PACE_ALWAYS */ + /* TCP_BBR_RACK_INIT_RATE */ + if (dest->r_ctl.init_rate != src->r_ctl.init_rate) { + dest->r_ctl.init_rate = src->r_ctl.init_rate; + cnt++; + } + /* TCP_RACK_FORCE_MSEG */ + if (dest->rc_force_max_seg != src->rc_force_max_seg) { + dest->rc_force_max_seg = src->rc_force_max_seg; + cnt++; + } + /* TCP_RACK_PACE_MIN_SEG */ + if (dest->r_ctl.rc_user_set_min_segs != src->r_ctl.rc_user_set_min_segs) { + dest->r_ctl.rc_user_set_min_segs = src->r_ctl.rc_user_set_min_segs; + cnt++; + } + /* we don't allow TCP_RACK_PACE_MAX_SEG */ + /* 
TCP_RACK_PACE_RATE_REC, TCP_RACK_PACE_RATE_SS, TCP_RACK_PACE_RATE_CA */ + if (dest->r_ctl.rc_fixed_pacing_rate_ca != src->r_ctl.rc_fixed_pacing_rate_ca) { + dest->r_ctl.rc_fixed_pacing_rate_ca = src->r_ctl.rc_fixed_pacing_rate_ca; + cnt++; + } + if (dest->r_ctl.rc_fixed_pacing_rate_ss != src->r_ctl.rc_fixed_pacing_rate_ss) { + dest->r_ctl.rc_fixed_pacing_rate_ss = src->r_ctl.rc_fixed_pacing_rate_ss; + cnt++; + } + if (dest->r_ctl.rc_fixed_pacing_rate_rec != src->r_ctl.rc_fixed_pacing_rate_rec) { + dest->r_ctl.rc_fixed_pacing_rate_rec = src->r_ctl.rc_fixed_pacing_rate_rec; + cnt++; + } + /* TCP_RACK_GP_INCREASE_REC, TCP_RACK_GP_INCREASE_CA, TCP_RACK_GP_INCREASE_SS */ + if (dest->r_ctl.rack_per_of_gp_rec != src->r_ctl.rack_per_of_gp_rec) { + dest->r_ctl.rack_per_of_gp_rec = src->r_ctl.rack_per_of_gp_rec; + cnt++; + } + if (dest->r_ctl.rack_per_of_gp_ca != src->r_ctl.rack_per_of_gp_ca) { + dest->r_ctl.rack_per_of_gp_ca = src->r_ctl.rack_per_of_gp_ca; + cnt++; + } + + if (dest->r_ctl.rack_per_of_gp_ss != src->r_ctl.rack_per_of_gp_ss) { + dest->r_ctl.rack_per_of_gp_ss = src->r_ctl.rack_per_of_gp_ss; + cnt++; + } + /* TCP_RACK_RR_CONF */ + if (dest->r_rr_config != src->r_rr_config) { + dest->r_rr_config = src->r_rr_config; + cnt++; + } + /* TCP_PACING_DND */ + if (dest->rc_pace_dnd != src->rc_pace_dnd) { + dest->rc_pace_dnd = src->rc_pace_dnd; + cnt++; + } + /* TCP_HDWR_RATE_CAP */ + if (dest->r_rack_hw_rate_caps != src->r_rack_hw_rate_caps) { + dest->r_rack_hw_rate_caps = src->r_rack_hw_rate_caps; + cnt++; + } + /* TCP_DGP_UPPER_BOUNDS */ + if (dest->r_ctl.rack_per_upper_bound_ca != src->r_ctl.rack_per_upper_bound_ca) { + dest->r_ctl.rack_per_upper_bound_ca = src->r_ctl.rack_per_upper_bound_ca; + cnt++; + } + if (dest->r_ctl.rack_per_upper_bound_ss != src->r_ctl.rack_per_upper_bound_ss) { + dest->r_ctl.rack_per_upper_bound_ss = src->r_ctl.rack_per_upper_bound_ss; + cnt++; + } + /* TCP_SS_EEXIT */ + if (dest->r_ctl.gp_rnd_thresh != src->r_ctl.gp_rnd_thresh) { + dest->r_ctl.gp_rnd_thresh = src->r_ctl.gp_rnd_thresh; + cnt++; + } + if (dest->r_ctl.gate_to_fs != src->r_ctl.gate_to_fs) { + dest->r_ctl.gate_to_fs = src->r_ctl.gate_to_fs; + cnt++; + } + if (dest->r_ctl.use_gp_not_last != src->r_ctl.use_gp_not_last) { + dest->r_ctl.use_gp_not_last = src->r_ctl.use_gp_not_last; + cnt++; + } + if (dest->r_ctl.gp_gain_req != src->r_ctl.gp_gain_req) { + dest->r_ctl.gp_gain_req = src->r_ctl.gp_gain_req; + cnt++; + } + /* TCP_BBR_HDWR_PACE */ + if (dest->rack_hdw_pace_ena != src->rack_hdw_pace_ena) { + dest->rack_hdw_pace_ena = src->rack_hdw_pace_ena; + cnt++; + } + if (dest->rack_attempt_hdwr_pace != src->rack_attempt_hdwr_pace) { + dest->rack_attempt_hdwr_pace = src->rack_attempt_hdwr_pace; + cnt++; + } + /* TCP_RACK_PRR_SENDALOT */ + if (dest->r_ctl.rc_prr_sendalot != src->r_ctl.rc_prr_sendalot) { + dest->r_ctl.rc_prr_sendalot = src->r_ctl.rc_prr_sendalot; + cnt++; + } + /* TCP_RACK_MIN_TO */ + if (dest->r_ctl.rc_min_to != src->r_ctl.rc_min_to) { + dest->r_ctl.rc_min_to = src->r_ctl.rc_min_to; + cnt++; + } + /* TCP_RACK_EARLY_SEG */ + if (dest->r_ctl.rc_early_recovery_segs != src->r_ctl.rc_early_recovery_segs) { + dest->r_ctl.rc_early_recovery_segs = src->r_ctl.rc_early_recovery_segs; + cnt++; + } + /* TCP_RACK_ENABLE_HYSTART */ + if (par->t_ccv.flags != tp->t_ccv.flags) { + cnt++; + if (par->t_ccv.flags & CCF_HYSTART_ALLOWED) { + tp->t_ccv.flags |= CCF_HYSTART_ALLOWED; + if (rack_do_hystart > RACK_HYSTART_ON) + tp->t_ccv.flags |= CCF_HYSTART_CAN_SH_CWND; + if (rack_do_hystart > RACK_HYSTART_ON_W_SC) + 
tp->t_ccv.flags |= CCF_HYSTART_CONS_SSTH; + } else { + tp->t_ccv.flags &= ~(CCF_HYSTART_ALLOWED|CCF_HYSTART_CAN_SH_CWND|CCF_HYSTART_CONS_SSTH); + } + } + /* TCP_RACK_REORD_THRESH */ + if (dest->r_ctl.rc_reorder_shift != src->r_ctl.rc_reorder_shift) { + dest->r_ctl.rc_reorder_shift = src->r_ctl.rc_reorder_shift; + cnt++; + } + /* TCP_RACK_REORD_FADE */ + if (dest->r_ctl.rc_reorder_fade != src->r_ctl.rc_reorder_fade) { + dest->r_ctl.rc_reorder_fade = src->r_ctl.rc_reorder_fade; + cnt++; + } + /* TCP_RACK_TLP_THRESH */ + if (dest->r_ctl.rc_tlp_threshold != src->r_ctl.rc_tlp_threshold) { + dest->r_ctl.rc_tlp_threshold = src->r_ctl.rc_tlp_threshold; + cnt++; + } + /* TCP_BBR_USE_RACK_RR */ + if (dest->use_rack_rr != src->use_rack_rr) { + dest->use_rack_rr = src->use_rack_rr; + cnt++; + } + /* TCP_RACK_PKT_DELAY */ + if (dest->r_ctl.rc_pkt_delay != src->r_ctl.rc_pkt_delay) { + dest->r_ctl.rc_pkt_delay = src->r_ctl.rc_pkt_delay; + cnt++; + } + /* TCP_DELACK will get copied via the main code if applicable */ + /* TCP_BBR_RACK_RTT_USE */ + if (dest->r_ctl.rc_rate_sample_method != src->r_ctl.rc_rate_sample_method) { + dest->r_ctl.rc_rate_sample_method = src->r_ctl.rc_rate_sample_method; + cnt++; + } + /* TCP_HONOR_HPTS_MIN */ + if (dest->r_use_hpts_min != src->r_use_hpts_min) { + dest->r_use_hpts_min = src->r_use_hpts_min; + cnt++; + } + if (dest->r_ctl.max_reduction != src->r_ctl.max_reduction) { + dest->r_ctl.max_reduction = src->r_ctl.max_reduction; + cnt++; + } + /* TCP_REC_IS_DYN */ + if (dest->rc_gp_no_rec_chg != src->rc_gp_no_rec_chg) { + dest->rc_gp_no_rec_chg = src->rc_gp_no_rec_chg; + cnt++; + } + if (dest->rc_skip_timely != src->rc_skip_timely) { + dest->rc_skip_timely = src->rc_skip_timely; + cnt++; + } + /* TCP_DATA_AFTER_CLOSE */ + if (dest->rc_allow_data_af_clo != src->rc_allow_data_af_clo) { + dest->rc_allow_data_af_clo = src->rc_allow_data_af_clo; + cnt++; + } + /* TCP_GP_USE_LTBW */ + if (src->use_lesser_lt_bw != dest->use_lesser_lt_bw) { + dest->use_lesser_lt_bw = src->use_lesser_lt_bw; + cnt++; + } + if (dest->dis_lt_bw != src->dis_lt_bw) { + dest->dis_lt_bw = src->dis_lt_bw; + cnt++; + } + tcp_log_socket_option(tp, 0, cnt, 0); +} + static void rack_apply_deferred_options(struct tcp_rack *rack) @@ -23778,7 +25919,10 @@ .tfb_switch_failed = rack_switch_failed, .tfb_early_wake_check = rack_wake_check, .tfb_compute_pipe = rack_compute_pipe, + .tfb_stack_info = rack_stack_information, + .tfb_inherit = rack_inherit, .tfb_flags = TCP_FUNC_OUTPUT_CANDROP, + }; /* @@ -23846,7 +25990,6 @@ /* Already read in and sanity checked in sosetopt(). 
*/ if (inp->inp_socket) { rack->client_bufferlvl = inp->inp_socket->so_peerprio; - rack_client_buffer_level_set(rack); } break; } @@ -23859,7 +26002,6 @@ /* Pacing related ones */ case TCP_RACK_PACE_ALWAYS: /* URL:pace_always */ case TCP_BBR_RACK_INIT_RATE: /* URL:irate */ - case TCP_BBR_IWINTSO: /* URL:tso_iwin */ case TCP_RACK_PACE_MIN_SEG: /* URL:pace_min_seg */ case TCP_RACK_PACE_MAX_SEG: /* URL:pace_max_seg */ case TCP_RACK_FORCE_MSEG: /* URL:force_max_seg */ @@ -23874,12 +26016,12 @@ case TCP_HDWR_RATE_CAP: /* URL:hdwrcap boolean */ case TCP_PACING_RATE_CAP: /* URL:cap -- used by side-channel */ case TCP_HDWR_UP_ONLY: /* URL:uponly -- hardware pacing boolean */ - case TCP_RACK_PACING_BETA: /* URL:pacing_beta */ + case TCP_FILLCW_RATE_CAP: /* URL:fillcw_cap */ case TCP_RACK_PACING_BETA_ECN: /* URL:pacing_beta_ecn */ case TCP_RACK_PACE_TO_FILL: /* URL:fillcw */ - case TCP_RACK_DGP_IN_REC: /* URL:dgpinrec */ /* End pacing related */ - case TCP_RXT_CLAMP: /* URL:rxtclamp */ + case TCP_POLICER_DETECT: /* URL:pol_det */ + case TCP_POLICER_MSS: /* URL:pol_mss */ case TCP_DELACK: /* URL:delack (in base TCP i.e. tcp_hints along with cc etc ) */ case TCP_RACK_PRR_SENDALOT: /* URL:prr_sendalot */ case TCP_RACK_MIN_TO: /* URL:min_to */ @@ -23901,7 +26043,8 @@ case TCP_RACK_NO_PUSH_AT_MAX: /* URL:npush */ case TCP_SHARED_CWND_TIME_LIMIT: /* URL:lscwnd */ case TCP_RACK_PROFILE: /* URL:profile */ - case TCP_HYBRID_PACING: /* URL:hybrid */ + case TCP_SIDECHAN_DIS: /* URL:scodm */ + case TCP_HYBRID_PACING: /* URL:pacing=hybrid */ case TCP_USE_CMP_ACKS: /* URL:cmpack */ case TCP_RACK_ABC_VAL: /* URL:labc */ case TCP_REC_ABC_VAL: /* URL:reclabc */ @@ -23913,8 +26056,15 @@ case TCP_RACK_SET_RXT_OPTIONS: /* URL:rxtsz */ case TCP_RACK_HI_BETA: /* URL:hibeta */ case TCP_RACK_SPLIT_LIMIT: /* URL:split */ + case TCP_SS_EEXIT: /* URL:eexit */ + case TCP_DGP_UPPER_BOUNDS: /* URL:upper */ case TCP_RACK_PACING_DIVISOR: /* URL:divisor */ case TCP_PACING_DND: /* URL:dnd */ + case TCP_NO_TIMELY: /* URL:notimely */ + case RACK_CSPR_IS_FCC: /* URL:csprisfcc */ + case TCP_HONOR_HPTS_MIN: /* URL:hptsmin */ + case TCP_REC_IS_DYN: /* URL:dynrec */ + case TCP_GP_USE_LTBW: /* URL:useltbw */ goto process_opt; break; default: @@ -23922,14 +26072,14 @@ return (tcp_default_ctloutput(tp, sopt)); break; } - default: INP_WUNLOCK(inp); return (0); } process_opt: INP_WUNLOCK(inp); - if (sopt->sopt_name == TCP_PACING_RATE_CAP) { + if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || + (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) { error = sooptcopyin(sopt, &loptval, sizeof(loptval), sizeof(loptval)); /* * We truncate it down to 32 bits for the socket-option trace this @@ -23953,11 +26103,10 @@ if (rack->defer_options && (rack->gp_ready == 0) && (sopt->sopt_name != TCP_DEFER_OPTIONS) && (sopt->sopt_name != TCP_HYBRID_PACING) && - (sopt->sopt_name != TCP_RACK_PACING_BETA) && (sopt->sopt_name != TCP_RACK_SET_RXT_OPTIONS) && (sopt->sopt_name != TCP_RACK_PACING_BETA_ECN) && (sopt->sopt_name != TCP_RACK_MEASURE_CNT)) { - /* Options are beind deferred */ + /* Options are being deferred */ if (rack_add_deferred_option(rack, sopt->sopt_name, loptval)) { INP_WUNLOCK(inp); return (0); @@ -24016,6 +26165,7 @@ ti->tcpi_snd_zerowin = tp->t_sndzerowin; ti->tcpi_total_tlp = tp->t_sndtlppack; ti->tcpi_total_tlp_bytes = tp->t_sndtlpbyte; + ti->tcpi_rttmin = tp->t_rttlow; #ifdef NETFLIX_STATS memcpy(&ti->tcpi_rxsyninfo, &tp->t_rxsyninfo, sizeof(struct tcpsyninfo)); #endif @@ -24062,21 +26212,6 @@ * when you exit recovery. 
*/ case TCP_RACK_PACING_BETA: - if (strcmp(tp->t_cc->name, CCALGONAME_NEWRENO) != 0) - error = EINVAL; - else if (rack->rc_pacing_cc_set == 0) - optval = rack->r_ctl.rc_saved_beta.beta; - else { - /* - * Reach out into the CC data and report back what - * I have previously set. Yeah it looks hackish but - * we don't want to report the saved values. - */ - if (tp->t_ccv.cc_data) - optval = ((struct newreno *)tp->t_ccv.cc_data)->beta; - else - error = EINVAL; - } break; /* * Beta_ecn is the congestion control value for NewReno that influences how @@ -24112,7 +26247,7 @@ optval |= 2; } break; - case TCP_RACK_ENABLE_HYSTART: + case TCP_RACK_ENABLE_HYSTART: { if (tp->t_ccv.flags & CCF_HYSTART_ALLOWED) { optval = RACK_HYSTART_ON; @@ -24126,13 +26261,16 @@ } break; case TCP_RACK_DGP_IN_REC: - optval = rack->r_ctl.full_dgp_in_rec; + error = EINVAL; break; case TCP_RACK_HI_BETA: optval = rack->rack_hibeta; break; - case TCP_RXT_CLAMP: - optval = rack->r_ctl.saved_rxt_clamp_val; + case TCP_POLICER_MSS: + optval = rack->r_ctl.policer_del_mss; + break; + case TCP_POLICER_DETECT: + optval = rack->r_ctl.saved_policer_val; break; case TCP_DEFER_OPTIONS: optval = rack->defer_options; @@ -24149,6 +26287,9 @@ case TCP_HDWR_UP_ONLY: optval= rack->r_up_only; break; + case TCP_FILLCW_RATE_CAP: + loptval = rack->r_ctl.fillcw_cap; + break; case TCP_PACING_RATE_CAP: loptval = rack->r_ctl.bw_rate_cap; break; @@ -24156,6 +26297,9 @@ /* You cannot retrieve a profile, its write only */ error = EINVAL; break; + case TCP_SIDECHAN_DIS: + optval = rack->r_ctl.side_chan_dis_mask; + break; case TCP_HYBRID_PACING: /* You cannot retrieve hybrid pacing information, its write only */ error = EINVAL; @@ -24165,8 +26309,6 @@ break; case TCP_RACK_PACE_TO_FILL: optval = rack->rc_pace_to_cwnd; - if (optval && rack->r_fill_less_agg) - optval++; break; case TCP_RACK_NO_PUSH_AT_MAX: optval = rack->r_ctl.rc_no_push_at_mrtt; @@ -24185,6 +26327,18 @@ else optval = 0; break; + case TCP_GP_USE_LTBW: + if (rack->dis_lt_bw) { + /* It is not used */ + optval = 0; + } else if (rack->use_lesser_lt_bw) { + /* we use min() */ + optval = 1; + } else { + /* we use max() */ + optval = 2; + } + break; case TCP_RACK_DO_DETECTION: optval = rack->do_detection; break; @@ -24192,11 +26346,14 @@ /* Now do we use the LRO mbuf-queue feature */ optval = rack->r_mbuf_queue; break; + case RACK_CSPR_IS_FCC: + optval = rack->cspr_is_fcc; + break; case TCP_TIMELY_DYN_ADJ: optval = rack->rc_gp_dyn_mul; break; case TCP_BBR_IWINTSO: - optval = rack->rc_init_win; + error = EINVAL; break; case TCP_RACK_TLP_REDUCE: /* RACK TLP cwnd reduction (bool) */ @@ -24242,6 +26399,18 @@ /* RACK reorder threshold (shift amount) */ optval = rack->r_ctl.rc_reorder_shift; break; + case TCP_SS_EEXIT: + if (rack->r_ctl.gp_rnd_thresh) { + uint32_t v; + + v = rack->r_ctl.gp_gain_req; + v <<= 17; + optval = v | (rack->r_ctl.gp_rnd_thresh & 0xff); + if (rack->r_ctl.gate_to_fs == 1) + optval |= 0x10000; + } else + optval = 0; + break; case TCP_RACK_REORD_FADE: /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; @@ -24282,6 +26451,11 @@ case TCP_RACK_PACE_RATE_REC: optval = rack->r_ctl.rc_fixed_pacing_rate_rec; break; + case TCP_DGP_UPPER_BOUNDS: + optval = rack->r_ctl.rack_per_upper_bound_ss; + optval <<= 16; + optval |= rack->r_ctl.rack_per_upper_bound_ca; + break; case TCP_RACK_GP_INCREASE_SS: optval = rack->r_ctl.rack_per_of_gp_ca; break; @@ -24303,6 +26477,18 @@ case TCP_SHARED_CWND_TIME_LIMIT: optval = rack->r_limit_scw; break; + case TCP_HONOR_HPTS_MIN: + if 
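The TCP_GP_USE_LTBW getter below and its setter earlier give three behaviors for the long-term bandwidth estimate: 0 ignores it, 1 takes the lesser of lt_bw and the goodput estimate, 2 takes the greater. A hedged sketch of how such a selection would combine the two estimates; the combination itself happens elsewhere in rack.c and is not shown in this hunk:

#include <stdint.h>

#define LTBW_OFF	0	/* lt_bw is not used */
#define LTBW_MIN	1	/* use min(gp_est, lt_bw) */
#define LTBW_MAX	2	/* use max(gp_est, lt_bw) */

static uint64_t
apply_lt_bw(uint64_t gp_est, uint64_t lt_bw, int mode)
{
	if (mode == LTBW_OFF || lt_bw == 0)
		return (gp_est);
	if (mode == LTBW_MIN)
		return (gp_est < lt_bw ? gp_est : lt_bw);
	return (gp_est > lt_bw ? gp_est : lt_bw);
}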
(rack->r_use_hpts_min) + optval = rack->r_ctl.max_reduction; + else + optval = 0; + break; + case TCP_REC_IS_DYN: + optval = rack->rc_gp_no_rec_chg; + break; + case TCP_NO_TIMELY: + optval = rack->rc_skip_timely; + break; case TCP_RACK_TIMER_SLOP: optval = rack->r_ctl.timer_slop; break; @@ -24312,7 +26498,8 @@ } INP_WUNLOCK(inp); if (error == 0) { - if (TCP_PACING_RATE_CAP) + if ((sopt->sopt_name == TCP_PACING_RATE_CAP) || + (sopt->sopt_name == TCP_FILLCW_RATE_CAP)) error = sooptcopyout(sopt, &loptval, sizeof loptval); else error = sooptcopyout(sopt, &optval, sizeof optval); diff --git a/sys/netinet/tcp_stacks/rack_pcm.c b/sys/netinet/tcp_stacks/rack_pcm.c new file mode 100644 diff --git a/sys/netinet/tcp_stacks/sack_filter.h b/sys/netinet/tcp_stacks/sack_filter.h --- a/sys/netinet/tcp_stacks/sack_filter.h +++ b/sys/netinet/tcp_stacks/sack_filter.h @@ -51,5 +51,10 @@ int sack_filter_blks(struct sack_filter *sf, struct sackblk *in, int numblks, tcp_seq th_ack); void sack_filter_reject(struct sack_filter *sf, struct sackblk *in); +static inline uint8_t sack_filter_blks_used(struct sack_filter *sf) +{ + return (sf->sf_used); +} + #endif #endif diff --git a/sys/netinet/tcp_stacks/tailq_hash.h b/sys/netinet/tcp_stacks/tailq_hash.h --- a/sys/netinet/tcp_stacks/tailq_hash.h +++ b/sys/netinet/tcp_stacks/tailq_hash.h @@ -13,10 +13,12 @@ #define MAX_ALLOWED_SEQ_RANGE (SEQ_BUCKET_SIZE * (MAX_HASH_ENTRIES-1)) struct tailq_hash { - struct rack_head ht[MAX_HASH_ENTRIES]; uint32_t min; uint32_t max; uint32_t count; + struct rack_sendmap *rsm_min; + struct rack_sendmap *rsm_max; + struct rack_head ht[MAX_HASH_ENTRIES]; }; struct rack_sendmap * @@ -53,6 +55,10 @@ int tqhash_trim(struct tailq_hash *hs, uint32_t th_ack); +void +tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm, + uint32_t th_ack); + #define TQHASH_FOREACH(var, head) \ for ((var) = tqhash_min((head)); \ diff --git a/sys/netinet/tcp_stacks/tailq_hash.c b/sys/netinet/tcp_stacks/tailq_hash.c --- a/sys/netinet/tcp_stacks/tailq_hash.c +++ b/sys/netinet/tcp_stacks/tailq_hash.c @@ -65,7 +65,6 @@ #include #include #include -#include #include #include #include @@ -100,6 +99,7 @@ #include "sack_filter.h" #include "tcp_rack.h" #include "tailq_hash.h" +#include "opt_global.h" struct rack_sendmap * @@ -107,7 +107,7 @@ { struct rack_sendmap *rsm; - rsm = tqhash_find(hs, hs->min); + rsm = hs->rsm_min; return(rsm); } @@ -116,7 +116,7 @@ { struct rack_sendmap *rsm; - rsm = tqhash_find(hs, (hs->max - 1)); + rsm = hs->rsm_max; return (rsm); } @@ -224,13 +224,19 @@ void tqhash_remove(struct tailq_hash *hs, struct rack_sendmap *rsm, int type) { - TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next); + hs->count--; if (hs->count == 0) { hs->min = hs->max; + hs->rsm_max = hs->rsm_min = NULL; } else if (type == REMOVE_TYPE_CUMACK) { hs->min = rsm->r_end; + hs->rsm_min = tqhash_next(hs, rsm); + } else if (rsm == hs->rsm_max) { + hs->rsm_max = tqhash_prev(hs, rsm); + hs->max = hs->rsm_max->r_end; } + TAILQ_REMOVE(&hs->ht[rsm->bindex], rsm, next); } int @@ -240,6 +246,7 @@ int inserted = 0; uint32_t ebucket; +#ifdef INVARIANTS if (hs->count > 0) { if ((rsm->r_end - hs->min) > MAX_ALLOWED_SEQ_RANGE) { return (-1); @@ -249,6 +256,7 @@ return (-2); } } +#endif rsm->bindex = rsm->r_start / SEQ_BUCKET_SIZE; rsm->bindex %= MAX_HASH_ENTRIES; ebucket = rsm->r_end / SEQ_BUCKET_SIZE; @@ -263,13 +271,17 @@ /* Special case */ hs->min = rsm->r_start; hs->max = rsm->r_end; + hs->rsm_min = hs->rsm_max = rsm; hs->count = 1; } else { hs->count++; - if (SEQ_GT(rsm->r_end, 
hs->max)) + if (SEQ_GEQ(rsm->r_end, hs->max)) { hs->max = rsm->r_end; - if (SEQ_LT(rsm->r_start, hs->min)) + hs->rsm_max = rsm; + } if (SEQ_LEQ(rsm->r_start, hs->min)) { hs->min = rsm->r_start; + hs->rsm_min = rsm; + } } /* Check the common case of inserting at the end */ l = TAILQ_LAST(&hs->ht[rsm->bindex], rack_head); @@ -299,6 +311,7 @@ TAILQ_INIT(&hs->ht[i]); } hs->min = hs->max = 0; + hs->rsm_min = hs->rsm_max = NULL; hs->count = 0; } @@ -339,3 +352,11 @@ return (0); } +void +tqhash_update_end(struct tailq_hash *hs, struct rack_sendmap *rsm, + uint32_t th_ack) +{ + if (hs->max == rsm->r_end) + hs->max = th_ack; + rsm->r_end = th_ack; +} diff --git a/sys/netinet/tcp_stacks/tcp_rack.h b/sys/netinet/tcp_stacks/tcp_rack.h --- a/sys/netinet/tcp_stacks/tcp_rack.h +++ b/sys/netinet/tcp_stacks/tcp_rack.h @@ -48,6 +48,8 @@ #define RACK_MERGED 0x080000/* The RSM was merged */ #define RACK_PMTU_CHG 0x100000/* The path mtu changed on this guy */ #define RACK_STRADDLE 0x200000/* The seq straddles the bucket line */ +#define RACK_WAS_LOST 0x400000/* Is the rsm considered lost */ +#define RACK_IS_PCM 0x800000/* A PCM measurement is being taken */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000000 /* 1 second in microseconds */ @@ -63,6 +65,7 @@ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint32_t r_flags : 24, /* Flags as defined above */ r_rtr_cnt : 8; /* Retran count, index this -1 to get time */ + uint32_t r_act_rxt_cnt; /* The actual total count of transmits */ struct mbuf *m; uint32_t soff; uint32_t orig_m_len; /* The original mbuf len when we sent (can update) */ @@ -174,6 +177,8 @@ #define RACK_TO_FRM_PERSIST 5 #define RACK_TO_FRM_DELACK 6 +#define RCV_PATH_RTT_MS 10 /* How many ms between recv path RTT's */ + struct rack_opts_stats { uint64_t tcp_rack_tlp_reduce; uint64_t tcp_rack_pace_always; @@ -232,7 +237,7 @@ uint64_t tcp_rack_rtt_use; uint64_t tcp_data_after_close; uint64_t tcp_defer_opt; - uint64_t tcp_rxt_clamp; + uint64_t tcp_pol_detect; uint64_t tcp_rack_beta; uint64_t tcp_rack_beta_ecn; uint64_t tcp_rack_timer_slop; @@ -242,6 +247,11 @@ uint64_t tcp_rack_pacing_divisor; uint64_t tcp_rack_min_seg; uint64_t tcp_dgp_in_rec; + uint64_t tcp_notimely; + uint64_t tcp_honor_hpts; + uint64_t tcp_dyn_rec; + uint64_t tcp_fillcw_rate_cap; + uint64_t tcp_pol_mss; }; /* RTT shrink reasons */ @@ -263,6 +273,9 @@ #define TLP_USE_TWO_TWO 3 /* Use 2.2 behavior */ #define RACK_MIN_BW 8000 /* 64kbps in Bps */ +#define CCSP_DIS_MASK 0x0001 +#define HYBRID_DIS_MASK 0x0002 + /* Rack quality indicators for GPUT measurements */ #define RACK_QUALITY_NONE 0 /* No quality stated */ #define RACK_QUALITY_HIGH 1 /* A normal measurement of a GP RTT */ @@ -319,6 +332,7 @@ * */ #define RACK_GP_HIST 4 /* How much goodput history do we maintain? 
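The tailq_hash changes in this hunk replace the hash lookups in tqhash_min() and tqhash_max() with cached endpoint pointers that every insert and remove keeps up to date, so the common min/max queries become O(1). A container-agnostic sketch of the insert-side bookkeeping, with the SEQ_GEQ/SEQ_LEQ comparisons reduced to plain unsigned compares and the bucket linkage omitted:

#include <stdint.h>
#include <stddef.h>

struct entry {
	uint32_t start, end;
};

struct seq_container {
	uint32_t min, max, count;
	struct entry *ent_min, *ent_max;	/* cached endpoints */
};

static void
container_insert_endpoints(struct seq_container *c, struct entry *e)
{
	if (c->count++ == 0) {
		c->min = e->start;
		c->max = e->end;
		c->ent_min = c->ent_max = e;
		return;
	}
	if (e->end >= c->max) {		/* SEQ_GEQ in the real code */
		c->max = e->end;
		c->ent_max = e;
	}
	if (e->start <= c->min) {	/* SEQ_LEQ in the real code */
		c->min = e->start;
		c->ent_min = e;
	}
}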
*/ +#define RETRAN_CNT_SIZE 16 #define RACK_NUM_FSB_DEBUG 16 #ifdef _KERNEL @@ -342,6 +356,26 @@ struct tailq_hash; +struct rack_pcm_info { + /* Base send time and s/e filled in by rack_log_output */ + uint64_t send_time; + uint32_t sseq; + uint32_t eseq; + /* Ack's fill in the rest of the data */ + uint16_t cnt; + /* Maximum acks present */ + uint16_t cnt_alloc; +}; + +#define RACK_DEFAULT_PCM_ARRAY 16 + +struct rack_pcm_stats { + uint32_t sseq; + uint32_t eseq; + uint64_t ack_time; +}; + + struct rack_control { /* Second cache line 0x40 from tcp_rack */ struct tailq_hash *tqh; /* Tree of all segments Lock(a) */ @@ -402,6 +436,7 @@ uint32_t rc_rcvtime; /* When we last received data */ uint32_t rc_num_split_allocs; /* num split map entries allocated */ uint32_t rc_split_limit; /* Limit from control var can be set by socket opt */ + uint32_t rack_avg_rec_sends; uint32_t rc_last_output_to; uint32_t rc_went_idle_time; @@ -452,19 +487,45 @@ struct tcp_sendfile_track *rc_last_sft; uint32_t lt_seq; /* Seq at start of lt_bw gauge */ int32_t rc_rtt_diff; /* Timely style rtt diff of our gp_srtt */ - uint64_t last_sndbytes; - uint64_t last_snd_rxt_bytes; - uint64_t rxt_threshold; uint64_t last_tmit_time_acked; /* Holds the last cumack point's last send time */ - uint32_t last_rnd_rxt_clamped; - uint32_t num_of_clamps_applied; - uint32_t clamp_options; - uint32_t max_clamps; + /* Recovery stats */ + uint64_t time_entered_recovery; + uint64_t bytes_acked_in_recovery; + /* Policer Detection */ + uint64_t last_policer_sndbytes; + uint64_t last_policer_snd_rxt_bytes; + uint64_t policer_bw; + uint64_t last_sendtime; + + uint64_t last_gpest; + uint64_t last_tm_mark; /* Last tm mark used */ + uint64_t fillcw_cap; /* B/W cap on fill cw */ + struct rack_pcm_info pcm_i; + struct rack_pcm_stats *pcm_s; + uint32_t gp_gain_req; /* Percent off gp gain req */ + uint32_t last_rnd_of_gp_rise; + uint32_t gp_rnd_thresh; + uint32_t ss_hi_fs; + uint32_t gate_to_fs; + uint32_t policer_max_seg; + uint32_t pol_bw_comp; + uint16_t policer_rxt_threshold; + uint8_t policer_avg_threshold; + uint8_t policer_med_threshold; + uint32_t pcm_max_seg; + uint32_t last_pcm_round; + uint32_t pcm_idle_rounds; + uint32_t current_policer_bucket; + uint32_t policer_bucket_size; + uint32_t idle_snd_una; + uint32_t ack_for_idle; + uint32_t last_amount_before_rec; uint32_t rc_gp_srtt; /* Current GP srtt */ uint32_t rc_prev_gp_srtt; /* Previous RTT */ uint32_t rc_entry_gp_rtt; /* Entry to PRTT gp-rtt */ uint32_t rc_loss_at_start; /* At measurement window where was our lost value */ + uint32_t rc_considered_lost; /* Count in recovery of non-retransmitted bytes considered lost */ uint32_t dsack_round_end; /* In a round of seeing a DSACK */ uint32_t current_round; /* Starting at zero */ @@ -491,6 +552,8 @@ uint32_t rc_snd_max_at_rto; /* For non-sack when the RTO occurred what was snd-max */ uint32_t rc_out_at_rto; int32_t rc_scw_index; + uint32_t max_reduction; + uint32_t side_chan_dis_mask; /* Bit mask of socket opt's disabled */ uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint32_t rc_last_timeout_snduna; uint32_t last_tlp_acked_start; @@ -503,7 +566,11 @@ uint32_t ack_during_sd; uint32_t input_pkt; uint32_t saved_input_pkt; - uint32_t saved_rxt_clamp_val; /* The encoded value we used to setup clamping */ + uint32_t saved_policer_val; /* The encoded value we used to setup policer detection */ + uint32_t cleared_app_ack_seq; + uint32_t last_rcv_tstmp_for_rtt; + uint32_t last_time_of_arm_rcv; + uint32_t rto_ssthresh; struct newreno 
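The two new structures above describe how a path-capacity measurement is recorded: rack_pcm_info holds the send time and sequence range of the burst, and each arriving ack appends a rack_pcm_stats sample, up to cnt_alloc of them (RACK_DEFAULT_PCM_ARRAY by default). A hedged sketch of that append step, assuming a sample array of cnt_alloc entries allocated when the measurement starts; the actual sizing and the capacity math over the samples live elsewhere in rack.c:

#include <stdint.h>
#include <stddef.h>

struct pcm_sample {		/* mirrors rack_pcm_stats */
	uint32_t sseq, eseq;
	uint64_t ack_time;
};

struct pcm_record {		/* mirrors rack_pcm_info plus its sample array */
	uint64_t send_time;
	uint32_t sseq, eseq;
	uint16_t cnt, cnt_alloc;
	struct pcm_sample *samples;
};

/* Record one ack that covers [ss, es) of the measured burst. */
static int
pcm_add_ack(struct pcm_record *r, uint32_t ss, uint32_t es, uint64_t now)
{
	if (r->samples == NULL || r->cnt >= r->cnt_alloc)
		return (-1);	/* array full, the measurement ends */
	r->samples[r->cnt].sseq = ss;
	r->samples[r->cnt].eseq = es;
	r->samples[r->cnt].ack_time = now;
	r->cnt++;
	return (0);
}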
rc_saved_beta; /* * For newreno cc: * rc_saved_cc are the values we have had @@ -516,10 +583,13 @@ * we also set the flag (if ecn_beta is set) to make * new_reno do less of a backoff for ecn (think abe). */ + uint16_t rc_cnt_of_retran[RETRAN_CNT_SIZE]; uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ + uint8_t policer_del_mss; /* How many mss during recovery for policer detection */ uint8_t rack_per_upper_bound_ss; uint8_t rack_per_upper_bound_ca; + uint8_t cleared_app_ack; uint8_t dsack_persist; uint8_t rc_no_push_at_mrtt; /* No push when we exceed max rtt */ uint8_t num_measurements; /* Number of measurements (up to 0xff, we freeze at 0xff) */ @@ -528,17 +598,19 @@ uint8_t rc_tlp_cwnd_reduce; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_rate_sample_method; - uint8_t rc_dgp_bl_agg; /* Buffer Level aggression during DGP */ + uint8_t policer_alt_median; /* Alternate median for policer detection */ uint8_t full_dgp_in_rec; /* Flag to say if we do full DGP in recovery */ uint8_t client_suggested_maxseg; /* Not sure what to do with this yet */ - uint8_t pacing_discount_amm; /* - * This is a multipler to the base discount that - * can be used to increase the discount. - */ + uint8_t use_gp_not_last; + uint8_t pacing_method; /* If pace_always, what type of pacing */ uint8_t already_had_a_excess; }; #endif +#define RACK_PACING_NONE 0x00 +#define RACK_DGP_PACING 0x01 +#define RACK_REG_PACING 0x02 + /* DGP with no buffer level mitigations */ #define DGP_LEVEL0 0 @@ -578,6 +650,10 @@ #define HYBRID_LOG_EXTEND 14 /* We extended the end */ #define HYBRID_LOG_SENT_LOST 15 /* A closing sent/lost report */ +#define LOST_ZERO 1 /* Zero it out */ +#define LOST_ADD 2 /* Add to it */ +#define LOST_SUB 3 /* Sub from it */ + #define RACK_TIMELY_CNT_BOOST 5 /* At 5th increase boost */ #define RACK_MINRTT_FILTER_TIM 10 /* Seconds */ @@ -590,6 +666,7 @@ */ #define MAX_USER_SET_SEG 0x3f /* The max we can set is 63 which is probably too many */ +#define RACK_FREE_CNT_MAX 0x2f /* Max our counter can do */ #ifdef _KERNEL @@ -601,8 +678,9 @@ int32_t, int32_t, uint32_t, int, int, uint8_t); /* Lock(a) */ struct tcpcb *rc_tp; /* The tcpcb Lock(a) */ struct inpcb *rc_inp; /* The inpcb Lock(a) */ - uint8_t rc_free_cnt; /* Number of free entries on the rc_free list - * Lock(a) */ + uint8_t rc_free_cnt : 6, + rc_skip_timely : 1, + pcm_enabled : 1; /* Is PCM enabled */ uint8_t client_bufferlvl : 3, /* Expected range [0,5]: 0=unset, 1=low/empty */ rack_deferred_inited : 1, /* ******************************************************************** */ @@ -612,11 +690,11 @@ shape_rxt_to_pacing_min : 1, /* ******************************************************************** */ rc_ack_required: 1, - r_pacing_discount : 1; + r_use_hpts_min : 1; uint8_t no_prr_addback : 1, gp_ready : 1, defer_options: 1, - excess_rxt_on: 1, /* Are actions on for excess retransmissions? */ + dis_lt_bw : 1, rc_ack_can_sendout_data: 1, /* * If set it will override pacing restrictions on not sending * data when the pacing timer is running. I.e. 
you set this @@ -659,7 +737,7 @@ r_rack_hw_rate_caps: 1, r_up_only: 1, r_via_fill_cw : 1, - r_fill_less_agg : 1; + r_rcvpath_rtt_up : 1; uint8_t rc_user_set_max_segs : 7, /* Socket option value Lock(a) */ rc_fillcw_apply_discount; @@ -673,7 +751,7 @@ rc_highly_buffered: 1, /* The path is highly buffered */ rc_dragged_bottom: 1, rc_pace_dnd : 1, /* The pace do not disturb bit */ - rc_avali2 : 1, + rc_initial_ss_comp : 1, rc_gp_filled : 1, rc_hw_nobuf : 1; uint8_t r_state : 4, /* Current rack state Lock(a) */ @@ -696,8 +774,8 @@ uint8_t app_limited_needs_set : 1, use_fixed_rate : 1, rc_has_collapsed : 1, - r_cwnd_was_clamped : 1, - r_clamped_gets_lower : 1, + use_lesser_lt_bw : 1, + cspr_is_fcc : 1, rack_hdrw_pacing : 1, /* We are doing Hardware pacing */ rack_hdw_pace_ena : 1, /* Is hardware pacing enabled? */ rack_attempt_hdwr_pace : 1; /* Did we attempt hdwr pacing (if allowed) */ @@ -722,7 +800,14 @@ r_persist_lt_bw_off : 1, r_collapse_point_valid : 1, dgp_on : 1; - uint16_t rc_init_win : 8, + uint16_t rto_from_rec: 1, + avail_bit: 1, + pcm_in_progress: 1, + pcm_needed: 1, + policer_detect_on: 1, /* Are we detecting policers? */ + rc_policer_detected : 1, /* We are being policed */ + rc_policer_should_pace : 1, /* The sizing algo thinks we should pace */ + rc_sendvars_notset : 1, /* Inside rack_init send variables (snd_max/una etc) were not set */ rc_gp_rtt_set : 1, rc_gp_dyn_mul : 1, rc_gp_saw_rec : 1, @@ -735,5 +820,9 @@ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); + +void rack_update_pcm_ack(struct tcp_rack *rack, int was_cumack, + uint32_t ss, uint32_t es); + #endif #endif diff --git a/sys/netinet/tcp_subr.c b/sys/netinet/tcp_subr.c --- a/sys/netinet/tcp_subr.c +++ b/sys/netinet/tcp_subr.c @@ -287,18 +287,29 @@ static volatile uint32_t number_of_tcp_connections_pacing = 0; static uint32_t shadow_num_connections = 0; static counter_u64_t tcp_pacing_failures; +static counter_u64_t tcp_dgp_failures; +static uint32_t shadow_tcp_pacing_dgp = 0; +static volatile uint32_t number_of_dgp_connections = 0; static int tcp_pacing_limit = 10000; SYSCTL_INT(_net_inet_tcp, OID_AUTO, pacing_limit, CTLFLAG_RW, &tcp_pacing_limit, 1000, "If the TCP stack does pacing, is there a limit (-1 = no, 0 = no pacing N = number of connections)"); +static int tcp_dgp_limit = -1; +SYSCTL_INT(_net_inet_tcp, OID_AUTO, dgp_limit, CTLFLAG_RW, + &tcp_dgp_limit, -1, + "If the TCP stack does DGP, is there a limit (-1 = no, 0 = no dgp N = number of connections)"); + SYSCTL_UINT(_net_inet_tcp, OID_AUTO, pacing_count, CTLFLAG_RD, &shadow_num_connections, 0, "Number of TCP connections being paced"); SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, pacing_failures, CTLFLAG_RD, &tcp_pacing_failures, "Number of times we failed to enable pacing to avoid exceeding the limit"); +SYSCTL_COUNTER_U64(_net_inet_tcp, OID_AUTO, dgp_failures, CTLFLAG_RD, + &tcp_dgp_failures, "Number of times we failed to enable dgp to avoid exceeding the limit"); + static int tcp_log_debug = 0; SYSCTL_INT(_net_inet_tcp, OID_AUTO, log_debug, CTLFLAG_RW, &tcp_log_debug, 0, "Log errors caused by incoming TCP segments"); @@ -1571,6 +1582,7 @@ tcp_uncomp_total = counter_u64_alloc(M_WAITOK); tcp_bad_csums = counter_u64_alloc(M_WAITOK); tcp_pacing_failures = counter_u64_alloc(M_WAITOK); + tcp_dgp_failures = counter_u64_alloc(M_WAITOK); #ifdef TCPPCAP tcp_pcap_init(); #endif @@ -4022,6 +4034,43 @@ } } +int +tcp_incr_dgp_pacing_cnt(void) +{ + if ((tcp_dgp_limit == -1) || + (tcp_dgp_limit > number_of_dgp_connections)) {
atomic_fetchadd_int(&number_of_dgp_connections, 1); + shadow_tcp_pacing_dgp = number_of_dgp_connections; + return (1); + } else { + counter_u64_add(tcp_dgp_failures, 1); + return (0); + } +} + +static uint8_t tcp_dgp_warning = 0; + +void +tcp_dec_dgp_pacing_cnt(void) +{ + uint32_t ret; + + ret = atomic_fetchadd_int(&number_of_dgp_connections, -1); + shadow_tcp_pacing_dgp = number_of_dgp_connections; + KASSERT(ret != 0, ("number_of_dgp_connections -1 would cause wrap?")); + if (ret == 0) { + if (tcp_dgp_limit != -1) { + printf("Warning all DGP is now disabled, count decrements invalidly!\n"); + tcp_dgp_limit = 0; + tcp_dgp_warning = 1; + } else if (tcp_dgp_warning == 0) { + printf("Warning DGP pacing is invalid, invalid decrement\n"); + tcp_dgp_warning = 1; + } + } + +} + static uint8_t tcp_pacing_warning = 0; void @@ -4541,7 +4590,7 @@ if (tp->t_tcpreq_req) { for(i = 0, allocated = 0; i < MAX_TCP_TRK_REQ; i++) { fil = &tp->t_tcpreq_info[i]; - if (fil->flags != TCP_TRK_TRACK_FLG_USED) + if ((fil->flags & TCP_TRK_TRACK_FLG_USED) == 0) continue; if ((fil->timestamp == req->timestamp) && (fil->start == req->start) && @@ -4573,6 +4622,7 @@ allocated = 1; fil->flags = TCP_TRK_TRACK_FLG_USED; fil->timestamp = req->timestamp; + fil->playout_ms = req->playout_ms; fil->localtime = ts; fil->start = req->start; if (req->flags & TCP_LOG_HTTPD_RANGE_END) { @@ -4589,7 +4639,10 @@ fil->sbcc_at_s = tptosocket(tp)->so_snd.sb_ccc; fil->start_seq = tp->snd_una + tptosocket(tp)->so_snd.sb_ccc; - fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); + if (req->flags & TCP_LOG_HTTPD_RANGE_END) + fil->end_seq = (fil->start_seq + ((uint32_t)(fil->end - fil->start))); + else + fil->end_seq = 0; if (tptosocket(tp)->so_snd.sb_tls_info) { /* * This session is doing TLS. Take a swag guess diff --git a/sys/netinet/tcp_syncache.c b/sys/netinet/tcp_syncache.c --- a/sys/netinet/tcp_syncache.c +++ b/sys/netinet/tcp_syncache.c @@ -1032,7 +1032,10 @@ if (!solisten_enqueue(so, SS_ISCONNECTED)) tp->t_flags |= TF_SONOTCONN; - + /* Can we inherit anything from the listener? */ + if (tp->t_fb->tfb_inherit != NULL) { + (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(lso)); + } return (so); allocfail: diff --git a/sys/netinet/tcp_usrreq.c b/sys/netinet/tcp_usrreq.c --- a/sys/netinet/tcp_usrreq.c +++ b/sys/netinet/tcp_usrreq.c @@ -179,6 +179,12 @@ goto out; } tp->t_state = TCPS_CLOSED; + /* Can we inherit anything from the listener? 
*/ + if ((so->so_listen != NULL) && + (so->so_listen->so_pcb != NULL) && + (tp->t_fb->tfb_inherit != NULL)) { + (*tp->t_fb->tfb_inherit)(tp, sotoinpcb(so->so_listen)); + } tcp_bblog_pru(tp, PRU_ATTACH, error); INP_WUNLOCK(inp); TCPSTATES_INC(TCPS_CLOSED); @@ -1601,6 +1607,7 @@ ti->tcpi_rcv_numsacks = tp->rcv_numsacks; ti->tcpi_rcv_adv = tp->rcv_adv; ti->tcpi_dupacks = tp->t_dupacks; + ti->tcpi_rttmin = tp->t_rttlow; #ifdef TCP_OFFLOAD if (tp->t_flags & TF_TOE) { ti->tcpi_options |= TCPI_OPT_TOE; diff --git a/sys/netinet/tcp_var.h b/sys/netinet/tcp_var.h --- a/sys/netinet/tcp_var.h +++ b/sys/netinet/tcp_var.h @@ -138,7 +138,8 @@ #define TCP_TRK_TRACK_FLG_OPEN 0x02 /* End is not valid (open range request) */ #define TCP_TRK_TRACK_FLG_SEQV 0x04 /* We had a sendfile that touched it */ #define TCP_TRK_TRACK_FLG_COMP 0x08 /* Sendfile as placed the last bits (range req only) */ -#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ +#define TCP_TRK_TRACK_FLG_FSND 0x10 /* First send has been done into the seq space */ +#define TCP_TRK_TRACK_FLG_LSND 0x20 /* We were able to set the Last Sent */ #define MAX_TCP_TRK_REQ 5 /* Max we will have at once */ struct tcp_sendfile_track { @@ -151,11 +152,14 @@ uint64_t cspr; /* Client suggested pace rate */ uint64_t sent_at_fs; /* What was t_sndbytes as we begun sending */ uint64_t rxt_at_fs; /* What was t_snd_rxt_bytes as we begun sending */ + uint64_t sent_at_ls; /* Sent value at the last send */ + uint64_t rxt_at_ls; /* Retransmit value at the last send */ tcp_seq start_seq; /* First TCP Seq assigned */ tcp_seq end_seq; /* If range req last seq */ uint32_t flags; /* Type of request open etc */ uint32_t sbcc_at_s; /* When we allocate what is the sb_cc */ uint32_t hint_maxseg; /* Client hinted maxseg */ + uint32_t playout_ms; /* Client playout ms */ uint32_t hybrid_flags; /* Hybrid flags on this request */ }; @@ -623,6 +627,8 @@ void (*tfb_switch_failed)(struct tcpcb *); bool (*tfb_early_wake_check)(struct tcpcb *); int (*tfb_compute_pipe)(struct tcpcb *tp); + int (*tfb_stack_info)(struct tcpcb *tp, struct stack_specific_info *); + void (*tfb_inherit)(struct tcpcb *tp, struct inpcb *h_inp); volatile uint32_t tfb_refcnt; uint32_t tfb_flags; uint8_t tfb_id; @@ -788,7 +794,7 @@ #define TF_TSO 0x01000000 /* TSO enabled on this connection */ #define TF_TOE 0x02000000 /* this connection is offloaded */ #define TF_CLOSED 0x04000000 /* close(2) called on socket */ -#define TF_UNUSED1 0x08000000 /* unused */ +#define TF_SENTSYN 0x08000000 /* At least one syn has been sent */ #define TF_LRD 0x10000000 /* Lost Retransmission Detection */ #define TF_CONGRECOVERY 0x20000000 /* congestion recovery mode */ #define TF_WASCRECOVERY 0x40000000 /* was in congestion recovery */ @@ -1501,6 +1507,8 @@ int tcp_stats_sample_rollthedice(struct tcpcb *tp, void *seed_bytes, size_t seed_len); int tcp_can_enable_pacing(void); +int tcp_incr_dgp_pacing_cnt(void); +void tcp_dec_dgp_pacing_cnt(void); void tcp_decrement_paced_conn(void); void tcp_change_time_units(struct tcpcb *, int); void tcp_handle_orphaned_packets(struct tcpcb *);
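
The new rack_pcm_info / rack_pcm_stats pair in rack_control splits a path-capacity measurement into a send-side record (rack_log_output fills send_time and the sseq/eseq range) and an ack-side sample array of up to cnt_alloc entries that rack_update_pcm_ack() appends to as acknowledgments for that range arrive. The measurement logic itself lives in the new rack_pcm.c, which is not part of these hunks, so the following is only a sketch of how one sample could be recorded using the fields the diff adds; the helper name and the overflow policy are illustrative, not taken from the patch.

/*
 * Illustrative only: append one acked-range sample for the current PCM
 * round.  The field layout comes from the diff; the guard and increment
 * policy are assumptions (the real code is in rack_pcm.c / rack.c).
 */
static void
example_pcm_record_sample(struct tcp_rack *rack, uint32_t ss, uint32_t es,
    uint64_t ack_time)
{
	struct rack_pcm_info *pi = &rack->r_ctl.pcm_i;
	struct rack_pcm_stats *ps;

	if ((rack->r_ctl.pcm_s == NULL) || (pi->cnt >= pi->cnt_alloc))
		return;			/* no sample array, or it is full */
	ps = &rack->r_ctl.pcm_s[pi->cnt];
	ps->sseq = ss;
	ps->eseq = es;
	ps->ack_time = ack_time;
	pi->cnt++;
}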
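The DGP accounting added to tcp_subr.c mirrors the existing pacing-limit machinery: tcp_incr_dgp_pacing_cnt() succeeds only while the connection count is below net.inet.tcp.dgp_limit (or the limit is -1), and every successful increment must be paired with exactly one tcp_dec_dgp_pacing_cnt() when DGP is switched off or the connection goes away, otherwise the underflow check in the decrement path trips. Below is a minimal sketch of a conforming caller, assuming the rack stack's dgp_on bit as the bookkeeping flag; the helper names and the ENOBUFS return are illustrative.

/*
 * Sketch of the expected call pattern around the new DGP counters.  Only
 * the pairing rule comes from the patch; the rest is an assumption for
 * illustration.
 */
static int
example_enable_dgp(struct tcp_rack *rack)
{
	if (rack->dgp_on)
		return (0);		/* already counted */
	if (tcp_incr_dgp_pacing_cnt() == 0)
		return (ENOBUFS);	/* would exceed net.inet.tcp.dgp_limit */
	rack->dgp_on = 1;
	return (0);
}

static void
example_disable_dgp(struct tcp_rack *rack)
{
	if (rack->dgp_on) {
		rack->dgp_on = 0;
		tcp_dec_dgp_pacing_cnt();	/* pairs with the increment above */
	}
}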
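The new tfb_inherit hook is invoked from two places with the listener's inpcb: the syncache completion path after solisten_enqueue(), and tcp_usr_attach() when the socket was created through a listening socket. The intent is to let a stack copy listener-scoped configuration onto the child connection; what gets copied is entirely stack specific. A sketch of the callback's shape follows; the function name, the same-stack check, and the commented field copy are illustrative assumptions, not part of the patch.

/*
 * Illustrative tfb_inherit implementation.  The framework only supplies
 * the listening socket's inpcb; whether and what to copy is up to the
 * stack.
 */
static void
example_inherit(struct tcpcb *tp, struct inpcb *h_inp)
{
	struct tcpcb *ltp;

	if (h_inp == NULL)
		return;
	ltp = intotcpcb(h_inp);
	if ((ltp == NULL) || (ltp->t_fb != tp->t_fb))
		return;		/* only inherit from the same stack */
	/*
	 * Copy whatever listener settings make sense for this stack,
	 * e.g. pacing or detection knobs the application set on the
	 * listen socket before accept().
	 */
}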
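On the read side, tfb_stack_info gives a stack a way to export struct stack_specific_info, with TCP_STACK_SPEC_INFO as the user-visible option number. The socket-option plumbing is not in these hunks, so the userland sketch below assumes the usual pattern of a read-only getsockopt() at the IPPROTO_TCP level that fails on stacks which do not provide the callback.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/*
 * Sketch: dump stack-specific counters from a connected TCP socket.
 * Assumes getsockopt(TCP_STACK_SPEC_INFO) fills struct stack_specific_info
 * when the attached stack implements tfb_stack_info.
 */
static void
example_print_stack_info(int fd)
{
	struct stack_specific_info info;
	socklen_t len = sizeof(info);

	memset(&info, 0, sizeof(info));
	if (getsockopt(fd, IPPROTO_TCP, TCP_STACK_SPEC_INFO, &info, &len) == -1) {
		perror("TCP_STACK_SPEC_INFO");
		return;
	}
	printf("%s: sent %ju, retransmitted %ju, round %u, policer %s\n",
	    info.stack_name,
	    (uintmax_t)info.bytes_transmitted,
	    (uintmax_t)info.bytes_retransmitted,
	    info.current_round,
	    info.policer_detected ? "detected" : "not detected");
}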