Index: modules/tcp/rack/Makefile =================================================================== --- modules/tcp/rack/Makefile +++ modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c +SRCS= rack.c sack_filter.c rack_bbr_common.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h Index: netinet/in_pcb.h =================================================================== --- netinet/in_pcb.h +++ netinet/in_pcb.h @@ -759,7 +759,9 @@ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ Index: netinet/tcp.h =================================================================== --- netinet/tcp.h +++ netinet/tcp.h @@ -201,9 +201,8 @@ #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ @@ -211,14 +210,18 @@ #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ +#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ +#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ +#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets 
us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 @@ -227,11 +230,12 @@ #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 @@ -238,6 +242,15 @@ #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ +#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ /* Start of reserved space for third-party user-settable options. */ Index: netinet/tcp_hpts.h =================================================================== --- netinet/tcp_hpts.h +++ netinet/tcp_hpts.h @@ -121,6 +121,16 @@ uint8_t p_on_min_sleep; }; +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { Index: netinet/tcp_log_buf.h =================================================================== --- netinet/tcp_log_buf.h +++ netinet/tcp_log_buf.h @@ -175,7 +175,7 @@ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ - TCP_LOG_PACER, /* Pacer sending a packet 8 */ + TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ @@ -194,31 +194,36 @@ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* 
Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ - UNUSED_32, /* Unused 32 */ - UNUSED_33, /* Unused 33 */ + BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ + BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ - BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ - BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ - TCP_LOG_END /* End (keep at end) 51 */ + TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ + BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ + BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ + TCP_LOG_CONNEND, /* End of connection 54 */ + TCP_LOG_LRO, /* LRO entry 55 */ + TCP_LOG_END /* End (keep at end) 56 */ }; enum tcp_log_states { Index: netinet/tcp_stacks/rack.c =================================================================== --- netinet/tcp_stacks/rack.c +++ netinet/tcp_stacks/rack.c @@ -31,7 +31,8 @@ #include "opt_inet6.h" #include "opt_ipsec.h" #include "opt_tcpdebug.h" - +#include "opt_ratelimit.h" +/*#include "opt_kern_tls.h"*/ #include #include #include @@ -46,10 +47,15 @@ #include /* for proc0 declaration */ #include #include +#ifdef KERN_TLS +#include +#endif #include #include #ifdef NETFLIX_STATS -#include +#include +#include +#include /* Must come after qmath.h and tree.h */ #endif #include #include @@ -84,9 +90,6 @@ #include #include #include -#ifdef NETFLIX_CWV -#include -#endif #include #ifdef TCPDEBUG #include @@ -161,23 +164,21 @@ * must maintain the new rack scoreboard. * */ -static int32_t rack_precache = 1; static int32_t rack_tlp_thresh = 1; static int32_t rack_reorder_thresh = 2; static int32_t rack_reorder_fade = 60000; /* 0 - never fade, def 60,000 * - 60 seconds */ static int32_t rack_pkt_delay = 1; -static int32_t rack_inc_var = 0;/* For TLP */ -static int32_t rack_reduce_largest_on_idle = 0; static int32_t rack_min_pace_time = 0; -static int32_t rack_min_pace_time_seg_req=6; static int32_t rack_early_recovery = 1; -static int32_t rack_early_recovery_max_seg = 6; static int32_t rack_send_a_lot_in_prr = 1; static int32_t rack_min_to = 1; /* Number of ms minimum timeout */ -static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? 
*/ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; +static int32_t use_rack_cheat = 1; +static int32_t rack_persist_min = 250; /* 250ms */ +static int32_t rack_persist_max = 1000; /* 1 Second */ + /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -186,11 +187,11 @@ */ static int32_t rack_tlp_min = 10; static int32_t rack_rto_min = 30; /* 30ms same as main freebsd */ -static int32_t rack_rto_max = 30000; /* 30 seconds */ +static int32_t rack_rto_max = 4000; /* 4 seconds */ static const int32_t rack_free_cache = 2; static int32_t rack_hptsi_segments = 40; static int32_t rack_rate_sample_method = USE_RTT_LOW; -static int32_t rack_pace_every_seg = 1; +static int32_t rack_pace_every_seg = 0; static int32_t rack_delayed_ack_time = 200; /* 200ms */ static int32_t rack_slot_reduction = 4; static int32_t rack_lower_cwnd_at_tlp = 0; @@ -202,6 +203,7 @@ static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; +static int32_t rack_per_of_gp = 50; /* Rack specific counters */ counter_u64_t rack_badfr; @@ -212,8 +214,11 @@ counter_u64_t rack_reorder_seen; counter_u64_t rack_paced_segments; counter_u64_t rack_unpaced_segments; +counter_u64_t rack_calc_zero; +counter_u64_t rack_calc_nonzero; counter_u64_t rack_saw_enobuf; counter_u64_t rack_saw_enetunreach; +counter_u64_t rack_per_timer_hole; /* Tail loss probe counters */ counter_u64_t rack_tlp_tot; @@ -236,8 +241,18 @@ counter_u64_t rack_used_tlpmethod2; counter_u64_t rack_enter_tlp_calc; counter_u64_t rack_input_idle_reduces; +counter_u64_t rack_collapsed_win; counter_u64_t rack_tlp_does_nada; +/* Counters for HW TLS */ +counter_u64_t rack_tls_rwnd; +counter_u64_t rack_tls_cwnd; +counter_u64_t rack_tls_app; +counter_u64_t rack_tls_other; +counter_u64_t rack_tls_filled; +counter_u64_t rack_tls_rxt; +counter_u64_t rack_tls_tlp; + /* Temp CPU counters */ counter_u64_t rack_find_high; @@ -301,12 +316,8 @@ static void rack_log_sack_passed(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm); -static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num); +static void rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int num); static int32_t rack_output(struct tcpcb *tp); -static void -rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, - uint8_t iptos, int32_t nxt_pkt, struct timeval *tv); static uint32_t rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, @@ -335,9 +346,6 @@ rack_update_rtt(struct tcpcb *tp, struct tcp_rack *rack, struct rack_sendmap *rsm, struct tcpopt *to, uint32_t cts, int32_t ack_type); static int32_t tcp_addrack(module_t mod, int32_t type, void *data); -static void -rack_challenge_ack(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t * ret_val); static int rack_do_close_wait(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -346,14 +354,6 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp); -static void -rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); -static void -rack_do_dropwithreset(struct mbuf 
*m, struct tcpcb *tp, - struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -382,13 +382,6 @@ rack_do_syn_sent(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); -static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, - struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, - int32_t * drop_hdrlen, int32_t * ret_val); -static int -rack_process_rst(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp); struct rack_sendmap * tcp_rack_output(struct tcpcb *tp, struct tcp_rack *rack, uint32_t tsused); @@ -396,10 +389,6 @@ static void tcp_rack_partialack(struct tcpcb *tp, struct tcphdr *th); -static int -rack_ts_check(struct mbuf *m, struct tcphdr *th, - struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); - int32_t rack_clear_counter=0; @@ -435,9 +424,12 @@ counter_u64_zero(rack_to_arm_rack); counter_u64_zero(rack_to_arm_tlp); counter_u64_zero(rack_paced_segments); + counter_u64_zero(rack_calc_zero); + counter_u64_zero(rack_calc_nonzero); counter_u64_zero(rack_unpaced_segments); counter_u64_zero(rack_saw_enobuf); counter_u64_zero(rack_saw_enetunreach); + counter_u64_zero(rack_per_timer_hole); counter_u64_zero(rack_to_alloc_hard); counter_u64_zero(rack_to_alloc_emerg); counter_u64_zero(rack_sack_proc_all); @@ -445,6 +437,13 @@ counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); counter_u64_zero(rack_find_high); + counter_u64_zero(rack_tls_rwnd); + counter_u64_zero(rack_tls_cwnd); + counter_u64_zero(rack_tls_app); + counter_u64_zero(rack_tls_other); + counter_u64_zero(rack_tls_filled); + counter_u64_zero(rack_tls_rxt); + counter_u64_zero(rack_tls_tlp); counter_u64_zero(rack_runt_sacks); counter_u64_zero(rack_used_tlpmethod); counter_u64_zero(rack_used_tlpmethod2); @@ -451,6 +450,8 @@ counter_u64_zero(rack_enter_tlp_calc); counter_u64_zero(rack_progress_drops); counter_u64_zero(rack_tlp_does_nada); + counter_u64_zero(rack_collapsed_win); + } rack_clear_counter = 0; return (0); @@ -461,6 +462,8 @@ static void rack_init_sysctls() { + struct sysctl_oid *rack_counters; + SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, @@ -473,26 +476,38 @@ "Do we hold off sending a RST until all pending data is ack'd"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "cheat_rxt", CTLFLAG_RW, + &use_rack_cheat, 1, + "Do we use the rxt cheat for rack?"); + + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "persmin", CTLFLAG_RW, + &rack_persist_min, 250, + "What is the minimum time in milliseconds between persists"); + SYSCTL_ADD_U32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "persmax", CTLFLAG_RW, + &rack_persist_max, 1000, + "What is the largest delay in milliseconds between persists"); + + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "tlpmethod", CTLFLAG_RW, &rack_tlp_threshold_use, TLP_USE_TWO_ONE, "What method do we do for TLP time calc 0=no-de-ack-comp, 1=ID, 2=2.1, 3=2.2"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "gp_percentage", CTLFLAG_RW, + &rack_per_of_gp, 50, + "Do we pace to percentage of goodput (0=old method)?"); + 
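The hunk above adds four run-time knobs to the RACK sysctl tree: cheat_rxt, persmin, persmax and gp_percentage. A minimal userland sketch only (the stack's sysctl root is not visible in this hunk, so the "net.inet.tcp.rack" prefix is an assumption): the new persist floor could be read and adjusted like this.

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdint.h>
#include <stdio.h>

int
main(void)
{
	uint32_t cur, newval = 300;	/* milliseconds */
	size_t len = sizeof(cur);

	/* "persmin" is registered with SYSCTL_ADD_U32 above, hence uint32_t. */
	if (sysctlbyname("net.inet.tcp.rack.persmin", &cur, &len, NULL, 0) == -1) {
		perror("get persmin");
		return (1);
	}
	printf("persmin: %u ms\n", cur);

	/* Writing the knob needs privilege; failure is reported, not fatal. */
	if (sysctlbyname("net.inet.tcp.rack.persmin", NULL, NULL,
	    &newval, sizeof(newval)) == -1)
		perror("set persmin");
	return (0);
}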
SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "min_pace_time", CTLFLAG_RW, &rack_min_pace_time, 0, "Should we enforce a minimum pace time of 1ms"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "min_pace_segs", CTLFLAG_RW, - &rack_min_pace_time_seg_req, 6, - "How many segments have to be in the len to enforce min-pace-time"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "idle_reduce_high", CTLFLAG_RW, - &rack_reduce_largest_on_idle, 0, - "Should we reduce the largest cwnd seen to IW on idle reduction"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "bb_verbose", CTLFLAG_RW, &rack_verbose_logging, 0, "Should RACK black box logging be verbose"); @@ -513,11 +528,6 @@ "TLP minimum timeout per the specification (10ms)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "precache", CTLFLAG_RW, - &rack_precache, 0, - "Where should we precache the mcopy (0 is not at all)"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "sblklimit", CTLFLAG_RW, &rack_sack_block_limit, 128, "When do we start paying attention to small sack blocks"); @@ -528,11 +538,6 @@ "Should we always send the oldest TLP and RACK-TLP"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "rack_tlp_in_recovery", CTLFLAG_RW, - &rack_tlp_in_recovery, 1, - "Can we do a TLP during recovery?"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rack_tlimit", CTLFLAG_RW, &rack_limited_retran, 0, "How many times can a rack timeout drive out sends"); @@ -574,12 +579,12 @@ SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_every_seg", CTLFLAG_RW, - &rack_pace_every_seg, 1, - "Should we pace out every segment hptsi"); + &rack_pace_every_seg, 0, + "Should we use the original pacing mechanism that did not pace much?"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "hptsi_seg_max", CTLFLAG_RW, - &rack_hptsi_segments, 6, + &rack_hptsi_segments, 40, "Should we pace out only a limited size of segments"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -593,11 +598,6 @@ "Minimum rack timeout in milliseconds"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "earlyrecoveryseg", CTLFLAG_RW, - &rack_early_recovery_max_seg, 6, - "Max segments in early recovery"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "earlyrecovery", CTLFLAG_RW, &rack_early_recovery, 1, "Do we do early recovery with rack"); @@ -621,60 +621,64 @@ OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); - SYSCTL_ADD_S32(&rack_sysctl_ctx, + + rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "inc_var", CTLFLAG_RW, - &rack_inc_var, 0, - "Should rack add to the TLP timer the variance in rtt calculation"); + OID_AUTO, + "stats", + CTLFLAG_RW, 0, + "Rack Counters"); + + rack_badfr = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "badfr", CTLFLAG_RD, &rack_badfr, "Total number of bad FRs"); rack_badfr_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, 
"badfr_bytes", CTLFLAG_RD, &rack_badfr_bytes, "Total number of bad FRs"); rack_rtm_prr_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndret", CTLFLAG_RD, &rack_rtm_prr_retran, "Total number of prr based retransmits"); rack_rtm_prr_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prrsndnew", CTLFLAG_RD, &rack_rtm_prr_newdata, "Total number of prr based new transmits"); rack_timestamp_mismatch = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tsnf", CTLFLAG_RD, &rack_timestamp_mismatch, "Total number of timestamps that we could not find the reported ts"); rack_find_high = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "findhigh", CTLFLAG_RD, &rack_find_high, "Total number of FIN causing find-high"); rack_reorder_seen = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "reordering", CTLFLAG_RD, &rack_reorder_seen, "Total number of times we added delay due to reordering"); rack_tlp_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_to_total", CTLFLAG_RD, &rack_tlp_tot, "Total number of tail loss probe expirations"); rack_tlp_newdata = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_new", CTLFLAG_RD, &rack_tlp_newdata, "Total number of tail loss probe sending new data"); @@ -681,85 +685,98 @@ rack_tlp_retran = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran", CTLFLAG_RD, &rack_tlp_retran, "Total number of tail loss probe sending retransmitted data"); rack_tlp_retran_bytes = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_bytes", CTLFLAG_RD, &rack_tlp_retran_bytes, "Total bytes of tail loss probe sending retransmitted data"); rack_tlp_retran_fail = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_retran_fail", CTLFLAG_RD, &rack_tlp_retran_fail, "Total number of tail loss probe sending retransmitted data that failed (wait for t3)"); rack_to_tot = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "rack_to_tot", CTLFLAG_RD, &rack_to_tot, "Total number of times the rack to expired?"); rack_to_arm_rack = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_rack", CTLFLAG_RD, &rack_to_arm_rack, "Total number of times the rack timer armed?"); rack_to_arm_tlp = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "arm_tlp", 
CTLFLAG_RD, &rack_to_arm_tlp, "Total number of times the tlp timer armed?"); + + rack_calc_zero = counter_u64_alloc(M_WAITOK); + rack_calc_nonzero = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "calc_zero", CTLFLAG_RD, + &rack_calc_zero, + "Total number of times pacing time worked out to zero?"); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "calc_nonzero", CTLFLAG_RD, + &rack_calc_nonzero, + "Total number of times pacing time worked out to non-zero?"); rack_paced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "paced", CTLFLAG_RD, &rack_paced_segments, "Total number of times a segment send caused hptsi"); rack_unpaced_segments = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "unpaced", CTLFLAG_RD, &rack_unpaced_segments, "Total number of times a segment did not cause hptsi"); rack_saw_enobuf = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enobufs", CTLFLAG_RD, &rack_saw_enobuf, "Total number of times a segment did not cause hptsi"); rack_saw_enetunreach = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "saw_enetunreach", CTLFLAG_RD, &rack_saw_enetunreach, "Total number of times a segment did not cause hptsi"); rack_to_alloc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocs", CTLFLAG_RD, &rack_to_alloc, "Total allocations of tracking structures"); rack_to_alloc_hard = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allochard", CTLFLAG_RD, &rack_to_alloc_hard, "Total allocations done with sleeping the hard way"); rack_to_alloc_emerg = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total alocations done from emergency cache"); rack_sack_proc_all = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_long", CTLFLAG_RD, &rack_sack_proc_all, "Total times we had to walk whole list for sack processing"); @@ -766,58 +783,120 @@ rack_sack_proc_restart = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_restart", CTLFLAG_RD, &rack_sack_proc_restart, "Total times we had to walk whole list due to a restart"); rack_sack_proc_short = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "sack_short", CTLFLAG_RD, &rack_sack_proc_short, "Total times we took shortcut for sack processing"); rack_enter_tlp_calc = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_calc_entered", CTLFLAG_RD, 
&rack_enter_tlp_calc, "Total times we called calc-tlp"); rack_used_tlpmethod = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method", CTLFLAG_RD, &rack_used_tlpmethod, "Total number of runt sacks"); rack_used_tlpmethod2 = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "hit_tlp_method2", CTLFLAG_RD, &rack_used_tlpmethod2, "Total number of runt sacks 2"); rack_runt_sacks = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "runtsacks", CTLFLAG_RD, &rack_runt_sacks, "Total number of runt sacks"); rack_progress_drops = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "prog_drops", CTLFLAG_RD, &rack_progress_drops, "Total number of progress drops"); rack_input_idle_reduces = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "idle_reduce_oninput", CTLFLAG_RD, &rack_input_idle_reduces, "Total number of idle reductions on input"); + rack_collapsed_win = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "collapsed_win", CTLFLAG_RD, + &rack_collapsed_win, + "Total number of collapsed windows"); rack_tlp_does_nada = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), + SYSCTL_CHILDREN(rack_counters), OID_AUTO, "tlp_nada", CTLFLAG_RD, &rack_tlp_does_nada, "Total number of nada tlp calls"); + + rack_tls_rwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_rwnd", CTLFLAG_RD, + &rack_tls_rwnd, + "Total hdwr tls rwnd limited"); + + rack_tls_cwnd = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_cwnd", CTLFLAG_RD, + &rack_tls_cwnd, + "Total hdwr tls cwnd limited"); + + rack_tls_app = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_app", CTLFLAG_RD, + &rack_tls_app, + "Total hdwr tls app limited"); + + rack_tls_other = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_other", CTLFLAG_RD, + &rack_tls_other, + "Total hdwr tls other limited"); + + rack_tls_filled = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_filled", CTLFLAG_RD, + &rack_tls_filled, + "Total hdwr tls filled"); + + rack_tls_rxt = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_rxt", CTLFLAG_RD, + &rack_tls_rxt, + "Total hdwr rxt"); + + rack_tls_tlp = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "tls_tlp", CTLFLAG_RD, + &rack_tls_tlp, + "Total hdwr tls tlp"); + rack_per_timer_hole = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_counters), + OID_AUTO, "timer_hole", CTLFLAG_RD, + &rack_per_timer_hole, + "Total persists start in timer hole"); + 
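All of the counters above are re-parented from the stack root onto a dedicated "stats" node and follow one registration pattern: allocate a per-CPU counter(9) with counter_u64_alloc(M_WAITOK), then expose it read-only with SYSCTL_ADD_COUNTER_U64. A condensed sketch of that pattern follows; "example" and rack_example_ctr are hypothetical names, and rack_sysctl_ctx/rack_sysctl_root are the file's existing objects, so this mirrors rack_init_sysctls() rather than standing alone.

counter_u64_t rack_example_ctr;		/* hypothetical counter */

static void
rack_example_counter_init(void)
{
	struct sysctl_oid *rack_counters;

	/* One "stats" node keeps the read-only counters apart from the tunables. */
	rack_counters = SYSCTL_ADD_NODE(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_sysctl_root),
	    OID_AUTO, "stats", CTLFLAG_RW, 0, "Rack Counters");

	/* counter(9) is per-CPU, so the increment path never takes a lock. */
	rack_example_ctr = counter_u64_alloc(M_WAITOK);
	SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
	    SYSCTL_CHILDREN(rack_counters),
	    OID_AUTO, "example", CTLFLAG_RD,
	    &rack_example_ctr,
	    "Hypothetical event counter");
}

/*
 * Hot-path increment; teardown pairs each counter with counter_u64_free()
 * as rack_counter_destroy() does further down.
 */
static inline void
rack_example_counter_bump(void)
{
	counter_u64_add(rack_example_ctr, 1);
}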
COUNTER_ARRAY_ALLOC(rack_out_size, TCP_MSS_ACCT_SIZE, M_WAITOK); SYSCTL_ADD_COUNTER_U64_ARRAY(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "outsize", CTLFLAG_RD, @@ -861,6 +940,7 @@ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = TICKS_2_MSEC(rack->rc_tp->t_srtt >> TCP_RTT_SHIFT); @@ -869,22 +949,27 @@ log.u_bbr.flex4 = slot; log.u_bbr.flex5 = rack->rc_inp->inp_hptsslot; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; + log.u_bbr.flex7 = rack->rc_in_persist; log.u_bbr.flex8 = which; + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERSTAR, 0, - 0, &log, false); + 0, &log, false, &tv); } } static void -rack_log_to_event(struct tcp_rack *rack, int32_t to_num) +rack_log_to_event(struct tcp_rack *rack, int32_t to_num, int no) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -892,11 +977,15 @@ log.u_bbr.flex8 = to_num; log.u_bbr.flex1 = rack->r_ctl.rc_rack_min_rtt; log.u_bbr.flex2 = rack->rc_rack_rtt; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.flex3 = no; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_RTO, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -906,6 +995,7 @@ { if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -918,11 +1008,14 @@ log.u_bbr.flex6 = rack->r_ctl.rack_rs.rs_rtt_cnt; log.u_bbr.rttProp = rack->r_ctl.rack_rs.rs_rtt_tot; log.u_bbr.flex8 = rack->r_ctl.rc_rate_sample_method; - TCP_LOG_EVENT(tp, NULL, + log.u_bbr.pkts_out = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRRTT, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -939,8 +1032,10 @@ struct timeval tv; /* Convert our ms to a microsecond */ + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, @@ -955,6 +1050,7 @@ { if (rack_verbose_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -964,11 +1060,13 @@ log.u_bbr.flex3 = tp->t_maxunacktime; log.u_bbr.flex4 = tp->t_acktime; log.u_bbr.flex8 = event; - TCP_LOG_EVENT(tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, 
rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_PROGRESS, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -977,18 +1075,22 @@ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; + log.u_bbr.flex2 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = (0x0000ffff & rack->r_ctl.rc_hpts_flags); log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_BBRSND, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -997,26 +1099,60 @@ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.applimited = rack->r_ctl.rc_pace_min_segs; log.u_bbr.flex7 = rack->r_wanted_output; log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_DOSEG_DONE, 0, - 0, &log, false); + 0, &log, false, &tv); } } +static void +rack_log_type_hrdwtso(struct tcpcb *tp, struct tcp_rack *rack, int len, int mod, int32_t orig_len, int frm) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + uint32_t cts; + memset(&log, 0, sizeof(log)); + cts = tcp_get_usecs(&tv); + log.u_bbr.flex1 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = len; + log.u_bbr.flex5 = orig_len; + log.u_bbr.flex6 = rack->r_ctl.rc_sacked; + log.u_bbr.flex7 = mod; + log.u_bbr.flex8 = frm; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_HDWR_TLS, 0, + 0, &log, false, &tv); + } +} + static void rack_log_type_just_return(struct tcp_rack *rack, uint32_t cts, uint32_t tlen, uint32_t slot, uint8_t hpts_calling) { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -1023,13 +1159,16 @@ log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = slot; log.u_bbr.flex2 = rack->r_ctl.rc_hpts_flags; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex7 = hpts_calling; log.u_bbr.flex8 = rack->rc_in_persist; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_JUSTRET, 0, - tlen, &log, false); + tlen, &log, false, &tv); } } @@ -1038,6 +1177,7 @@ { if (rack->rc_tp->t_logstate != 
TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; @@ -1046,13 +1186,16 @@ log.u_bbr.flex2 = 0; log.u_bbr.flex3 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex4 = 0; + log.u_bbr.flex5 = rack->r_ctl.rc_prr_sndcnt; log.u_bbr.flex6 = rack->rc_tp->t_rxtcur; log.u_bbr.flex8 = hpts_removed; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TIMERCANC, 0, - 0, &log, false); + 0, &log, false, &tv); } } @@ -1061,6 +1204,7 @@ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.flex1 = timers; @@ -1068,15 +1212,44 @@ log.u_bbr.flex3 = rack->r_ctl.rc_timer_exp; log.u_bbr.flex4 = rack->r_ctl.rc_hpts_flags; log.u_bbr.flex5 = cts; - TCP_LOG_EVENT(rack->rc_tp, NULL, + log.u_bbr.flex6 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, &rack->rc_inp->inp_socket->so_rcv, &rack->rc_inp->inp_socket->so_snd, BBR_LOG_TO_PROCESS, 0, - 0, &log, false); + 0, &log, false, &tv); } } + static void +rack_log_to_prr(struct tcp_rack *rack, int frm) +{ + if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = rack->r_ctl.rc_prr_out; + log.u_bbr.flex2 = rack->r_ctl.rc_prr_recovery_fs; + log.u_bbr.flex3 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex4 = rack->r_ctl.rc_prr_delivered; + log.u_bbr.flex5 = rack->r_ctl.rc_sacked; + log.u_bbr.flex6 = rack->r_ctl.rc_holes_rxt; + log.u_bbr.flex8 = frm; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + TCP_LOG_EVENTP(rack->rc_tp, NULL, + &rack->rc_inp->inp_socket->so_rcv, + &rack->rc_inp->inp_socket->so_snd, + BBR_LOG_BBRUPD, 0, + 0, &log, false, &tv); + } +} + +static void rack_counter_destroy() { counter_u64_free(rack_badfr); @@ -1110,6 +1283,7 @@ counter_u64_free(rack_used_tlpmethod2); counter_u64_free(rack_progress_drops); counter_u64_free(rack_input_idle_reduces); + counter_u64_free(rack_collapsed_win); counter_u64_free(rack_tlp_does_nada); COUNTER_ARRAY_FREE(rack_out_size, TCP_MSS_ACCT_SIZE); COUNTER_ARRAY_FREE(rack_opts_arry, RACK_OPTS_SIZE); @@ -1165,9 +1339,6 @@ #ifdef NETFLIX_STATS int32_t gput; #endif -#ifdef NETFLIX_CWV - u_long old_cwnd = tp->snd_cwnd; -#endif INP_WLOCK_ASSERT(tp->t_inpcb); tp->ccv->nsegs = nsegs; @@ -1175,7 +1346,7 @@ if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { uint32_t max; - max = rack->r_ctl.rc_early_recovery_segs * tp->t_maxseg; + max = rack->r_ctl.rc_early_recovery_segs * ctf_fixed_maxseg(tp); if (tp->ccv->bytes_this_ack > max) { tp->ccv->bytes_this_ack = max; } @@ -1193,6 +1364,12 @@ SEQ_GEQ(th->th_ack, tp->gput_ack)) { gput = (((int64_t) (th->th_ack - tp->gput_seq)) << 3) / max(1, tcp_ts_getticks() - tp->gput_ts); + /* We store it in bytes per ms (or kbytes per sec) */ + rack->r_ctl.rc_gp_history[rack->r_ctl.rc_gp_hist_idx] = gput / 8; + rack->r_ctl.rc_gp_hist_idx++; + if (rack->r_ctl.rc_gp_hist_idx >= RACK_GP_HIST) + rack->r_ctl.rc_gp_hist_filled = 1; + rack->r_ctl.rc_gp_hist_idx %= 
RACK_GP_HIST; stats_voi_update_abs_u32(tp->t_stats, VOI_TCP_GPUT, gput); /* @@ -1207,20 +1384,11 @@ tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; -#ifdef NETFLIX_CWV - if (tp->t_maxpeakrate) { - /* - * We update t_peakrate_thr. This gives us roughly - * one update per round trip time. - */ - tcp_update_peakrate_thr(tp); - } -#endif } #endif if (tp->snd_cwnd > tp->snd_ssthresh) { tp->t_bytes_acked += min(tp->ccv->bytes_this_ack, - nsegs * V_tcp_abc_l_var * tp->t_maxseg); + nsegs * V_tcp_abc_l_var * ctf_fixed_maxseg(tp)); if (tp->t_bytes_acked >= tp->snd_cwnd) { tp->t_bytes_acked -= tp->snd_cwnd; tp->ccv->flags |= CCF_ABC_SENTAWND; @@ -1241,39 +1409,10 @@ if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - /* - * Per RFC 7661: The behaviour in the non-validated phase is - * specified as: o A sender determines whether to increase - * the cwnd based upon whether it is cwnd-limited (see - * Section 4.5.3): * A sender that is cwnd-limited MAY use - * the standard TCP method to increase cwnd (i.e., the - * standard method permits a TCP sender that fully utilises - * the cwnd to increase the cwnd each time it receives an - * ACK). * A sender that is not cwnd-limited MUST NOT - * increase the cwnd when ACK packets are received in this - * phase (i.e., needs to avoid growing the cwnd when it has - * not recently sent using the current size of cwnd). - */ - if ((tp->snd_cwnd > old_cwnd) && - (tp->cwv_cwnd_valid == 0) && - (!(tp->ccv->flags & CCF_CWND_LIMITED))) { - tp->snd_cwnd = old_cwnd; - } - /* Try to update pipeAck and NCWV state */ - if (TCPS_HAVEESTABLISHED(tp->t_state) && - !IN_RECOVERY(tp->t_flags)) { - uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); - - tcp_newcwv_update_pipeack(tp, data); - } - } /* we enforce max peak rate if it is set. 
*/ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { tp->snd_cwnd = tp->t_peakrate_thr; } -#endif } static void @@ -1321,17 +1460,11 @@ /* Suck the next prr cnt back into cwnd */ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 1); } EXIT_RECOVERY(tp->t_flags); -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - (tp->snd_cwv.in_recovery)) - tcp_newcwv_end_recovery(tp); - } -#endif } static void @@ -1344,13 +1477,15 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; switch (type) { case CC_NDUPACK: -/* rack->r_ctl.rc_ssthresh_set = 1;*/ + tp->t_flags &= ~TF_WASFRECOVERY; + tp->t_flags &= ~TF_WASCRECOVERY; if (!IN_FASTRECOVERY(tp->t_flags)) { rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_prr_delivered = 0; rack->r_ctl.rc_prr_out = 0; rack->r_ctl.rc_loss_count = 0; - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 2); rack->r_ctl.rc_prr_recovery_fs = tp->snd_max - tp->snd_una; tp->snd_recover = tp->snd_max; if (tp->t_flags & TF_ECN_PERMIT) @@ -1370,8 +1505,8 @@ tp->t_bytes_acked = 0; EXIT_RECOVERY(tp->t_flags); tp->snd_ssthresh = max(2, min(tp->snd_wnd, tp->snd_cwnd) / 2 / - tp->t_maxseg) * tp->t_maxseg; - tp->snd_cwnd = tp->t_maxseg; + ctf_fixed_maxseg(tp)) * ctf_fixed_maxseg(tp); + tp->snd_cwnd = ctf_fixed_maxseg(tp); break; case CC_RTO_ERR: TCPSTAT_INC(tcps_sndrexmitbad); @@ -1379,10 +1514,14 @@ tp->snd_cwnd = tp->snd_cwnd_prev; tp->snd_ssthresh = tp->snd_ssthresh_prev; tp->snd_recover = tp->snd_recover_prev; - if (tp->t_flags & TF_WASFRECOVERY) + if (tp->t_flags & TF_WASFRECOVERY) { ENTER_FASTRECOVERY(tp->t_flags); - if (tp->t_flags & TF_WASCRECOVERY) + tp->t_flags &= ~TF_WASFRECOVERY; + } + if (tp->t_flags & TF_WASCRECOVERY) { ENTER_CONGRECOVERY(tp->t_flags); + tp->t_flags &= ~TF_WASCRECOVERY; + } tp->snd_nxt = tp->snd_max; tp->t_badrxtwin = 0; break; @@ -1393,22 +1532,12 @@ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { - tcp_newcwv_enter_recovery(tp); - } - if (type == CC_RTO) { - tcp_newcwv_reset(tp); - } - } -#endif } static inline void -rack_cc_after_idle(struct tcpcb *tp, int reduce_largest) +rack_cc_after_idle(struct tcpcb *tp) { uint32_t i_cwnd; @@ -1427,14 +1556,6 @@ else i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); - if (reduce_largest) { - /* - * Do we reduce the largest cwnd to make - * rack play nice on restart hptsi wise? - */ - if (((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd > i_cwnd) - ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rack_largest_cwnd = i_cwnd; - } /* * Being idle is no differnt than the initial window. If the cc * clamps it down below the initial window raise it to the initial @@ -1463,308 +1584,6 @@ (tlen <= tp->t_maxseg) && \ (tp->t_delayed_ack || (tp->t_flags & TF_NEEDSYN))) -static inline void -rack_calc_rwin(struct socket *so, struct tcpcb *tp) -{ - int32_t win; - - /* - * Calculate amount of space in receive window, and then do TCP - * input processing. Receive window is amount of space in rcv queue, - * but not less than advertised window. - */ - win = sbspace(&so->so_rcv); - if (win < 0) - win = 0; - tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); -} - -static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp) -{ - /* - * Drop space held by incoming segment and return. 
- */ - if (tp != NULL) - INP_WUNLOCK(tp->t_inpcb); - if (m) - m_freem(m); -} - -static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - int32_t rstreason, int32_t tlen) -{ - if (tp != NULL) { - tcp_dropwithreset(m, th, tp, tlen, rstreason); - INP_WUNLOCK(tp->t_inpcb); - } else - tcp_dropwithreset(m, th, NULL, tlen, rstreason); -} - -/* - * The value in ret_val informs the caller - * if we dropped the tcb (and lock) or not. - * 1 = we dropped it, 0 = the TCB is still locked - * and valid. - */ -static void -rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) -{ - /* - * Generate an ACK dropping incoming segment if it occupies sequence - * space, where the ACK reflects our state. - * - * We can now skip the test for the RST flag since all paths to this - * code happen after packets containing RST have been dropped. - * - * In the SYN-RECEIVED state, don't send an ACK unless the segment - * we received passes the SYN-RECEIVED ACK test. If it fails send a - * RST. This breaks the loop in the "LAND" DoS attack, and also - * prevents an ACK storm between two listening ports that have been - * sent forged SYN segments, each with the source address of the - * other. - */ - struct tcp_rack *rack; - - if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && - (SEQ_GT(tp->snd_una, th->th_ack) || - SEQ_GT(th->th_ack, tp->snd_max))) { - *ret_val = 1; - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return; - } else - *ret_val = 0; - rack = (struct tcp_rack *)tp->t_fb_ptr; - rack->r_wanted_output++; - tp->t_flags |= TF_ACKNOW; - if (m) - m_freem(m); -} - - -static int -rack_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) -{ - /* - * RFC5961 Section 3.2 - * - * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in - * window, we send challenge ACK. - * - * Note: to take into account delayed ACKs, we should test against - * last_ack_sent instead of rcv_nxt. Note 2: we handle special case - * of closed window, not covered by the RFC. - */ - int dropped = 0; - - if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && - SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || - (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { - - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - KASSERT(tp->t_state != TCPS_SYN_SENT, - ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", - __func__, th, tp)); - - if (V_tcp_insecure_rst || - (tp->last_ack_sent == th->th_seq) || - (tp->rcv_nxt == th->th_seq) || - ((tp->last_ack_sent - 1) == th->th_seq)) { - TCPSTAT_INC(tcps_drops); - /* Drop the connection. */ - switch (tp->t_state) { - case TCPS_SYN_RECEIVED: - so->so_error = ECONNREFUSED; - goto close; - case TCPS_ESTABLISHED: - case TCPS_FIN_WAIT_1: - case TCPS_FIN_WAIT_2: - case TCPS_CLOSE_WAIT: - case TCPS_CLOSING: - case TCPS_LAST_ACK: - so->so_error = ECONNRESET; - close: - tcp_state_change(tp, TCPS_CLOSED); - /* FALLTHROUGH */ - default: - tp = tcp_close(tp); - } - dropped = 1; - rack_do_drop(m, tp); - } else { - TCPSTAT_INC(tcps_badrst); - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, - tp->rcv_nxt, tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; - } - } else { - m_freem(m); - } - return (dropped); -} - -/* - * The value in ret_val informs the caller - * if we dropped the tcb (and lock) or not. - * 1 = we dropped it, 0 = the TCB is still locked - * and valid. 
- */ -static void -rack_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) -{ - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - - TCPSTAT_INC(tcps_badsyn); - if (V_tcp_insecure_syn && - SEQ_GEQ(th->th_seq, tp->last_ack_sent) && - SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { - tp = tcp_drop(tp, ECONNRESET); - *ret_val = 1; - rack_do_drop(m, tp); - } else { - /* Send challenge ACK. */ - tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, - tp->snd_nxt, TH_ACK); - tp->last_ack_sent = tp->rcv_nxt; - m = NULL; - *ret_val = 0; - rack_do_drop(m, NULL); - } -} - -/* - * rack_ts_check returns 1 for you should not proceed. It places - * in ret_val what should be returned 1/0 by the caller. The 1 indicates - * that the TCB is unlocked and probably dropped. The 0 indicates the - * TCB is still valid and locked. - */ -static int -rack_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val) -{ - - /* Check to see if ts_recent is over 24 days old. */ - if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { - /* - * Invalidate ts_recent. If this segment updates ts_recent, - * the age will be reset later and ts_recent will get a - * valid value. If it does not, setting ts_recent to zero - * will at least satisfy the requirement that zero be placed - * in the timestamp echo reply when ts_recent isn't valid. - * The age isn't reset until we get a valid ts_recent - * because we don't want out-of-order segments to be dropped - * when ts_recent is old. - */ - tp->ts_recent = 0; - } else { - TCPSTAT_INC(tcps_rcvduppack); - TCPSTAT_ADD(tcps_rcvdupbyte, tlen); - TCPSTAT_INC(tcps_pawsdrop); - *ret_val = 0; - if (tlen) { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); - } else { - rack_do_drop(m, NULL); - } - return (1); - } - return (0); -} - -/* - * rack_drop_checks returns 1 for you should not proceed. It places - * in ret_val what should be returned 1/0 by the caller. The 1 indicates - * that the TCB is unlocked and probably dropped. The 0 indicates the - * TCB is still valid and locked. - */ -static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) -{ - int32_t todrop; - int32_t thflags; - int32_t tlen; - - thflags = *thf; - tlen = *tlenp; - todrop = tp->rcv_nxt - th->th_seq; - if (todrop > 0) { - if (thflags & TH_SYN) { - thflags &= ~TH_SYN; - th->th_seq++; - if (th->th_urp > 1) - th->th_urp--; - else - thflags &= ~TH_URG; - todrop--; - } - /* - * Following if statement from Stevens, vol. 2, p. 960. - */ - if (todrop > tlen - || (todrop == tlen && (thflags & TH_FIN) == 0)) { - /* - * Any valid FIN must be to the left of the window. - * At this point the FIN must be a duplicate or out - * of sequence; drop it. - */ - thflags &= ~TH_FIN; - /* - * Send an ACK to resynchronize and drop any data. - * But keep on processing for RST or ACK. - */ - tp->t_flags |= TF_ACKNOW; - todrop = tlen; - TCPSTAT_INC(tcps_rcvduppack); - TCPSTAT_ADD(tcps_rcvdupbyte, todrop); - } else { - TCPSTAT_INC(tcps_rcvpartduppack); - TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); - } - *drop_hdrlen += todrop; /* drop from the top afterwards */ - th->th_seq += todrop; - tlen -= todrop; - if (th->th_urp > todrop) - th->th_urp -= todrop; - else { - thflags &= ~TH_URG; - th->th_urp = 0; - } - } - /* - * If segment ends after window, drop trailing data (and PUSH and - * FIN); if nothing left, just ACK. 
- */ - todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); - if (todrop > 0) { - TCPSTAT_INC(tcps_rcvpackafterwin); - if (todrop >= tlen) { - TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); - /* - * If window is closed can only take segments at - * window edge, and have to drop data and PUSH from - * incoming segments. Continue processing, but - * remember to ack. Otherwise, drop segment and - * ack. - */ - if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { - tp->t_flags |= TF_ACKNOW; - TCPSTAT_INC(tcps_rcvwinprobe); - } else { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); - return (1); - } - } else - TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); - m_adj(m, -todrop); - tlen -= todrop; - thflags &= ~(TH_PUSH | TH_FIN); - } - *thf = thflags; - *tlenp = tlen; - return (0); -} - static struct rack_sendmap * rack_find_lowest_rsm(struct tcp_rack *rack) { @@ -1896,7 +1715,7 @@ thresh = (srtt * 2); /* Get the previous sent packet, if any */ - maxseg = tcp_maxseg(tp); + maxseg = ctf_fixed_maxseg(tp); counter_u64_add(rack_enter_tlp_calc, 1); len = rsm->r_end - rsm->r_start; if (rack->rack_tlp_threshold_use == TLP_USE_ID) { @@ -2025,7 +1844,7 @@ t = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT) + ((tp->t_rttvar * 4) >> TCP_RTT_SHIFT)); TCPT_RANGESET(tt, t * tcp_backoff[tp->t_rxtshift], - tcp_persmin, tcp_persmax); + rack_persist_min, rack_persist_max); if (tp->t_rxtshift < TCP_MAXRXTSHIFT) tp->t_rxtshift++; rack->r_ctl.rc_hpts_flags |= PACE_TMR_PERSIT; @@ -2034,7 +1853,7 @@ } static uint32_t -rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) +rack_timer_start(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts, int sup_rack) { /* * Start the FR timer, we do this based on getting the first one in @@ -2042,7 +1861,7 @@ * events we need to stop the running timer (if its running) before * starting the new one. */ - uint32_t thresh, exp, to, srtt, time_since_sent; + uint32_t thresh, exp, to, srtt, time_since_sent, tstmp_touse; uint32_t srtt_cur; int32_t idx; int32_t is_tlp_timer = 0; @@ -2059,12 +1878,27 @@ if (tp->t_state < TCPS_ESTABLISHED) goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - if (rsm == NULL) { + if ((rsm == NULL) || sup_rack) { /* Nothing on the send map */ activate_rxt: + time_since_sent = 0; + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + if (rsm) { + idx = rsm->r_rtr_cnt - 1; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; + } if (SEQ_LT(tp->snd_una, tp->snd_max) || sbavail(&(tp->t_inpcb->inp_socket->so_snd))) { rack->r_ctl.rc_hpts_flags |= PACE_TMR_RXT; to = TICKS_2_MSEC(tp->t_rxtcur); + if (to > time_since_sent) + to -= time_since_sent; + else + to = rack->r_ctl.rc_min_to; if (to == 0) to = 1; return (to); @@ -2089,6 +1923,16 @@ */ goto activate_rxt; } + if ((rack->use_rack_cheat == 0) && + (IN_RECOVERY(tp->t_flags)) && + (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { + /* + * We are not cheating, in recovery and + * not enough ack's to yet get our next + * retransmission out. 
+ */ + goto activate_tlp; + } if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); srtt = TICKS_2_MSEC(srtt_cur); @@ -2108,6 +1952,7 @@ } } else { /* Ok we need to do a TLP not RACK */ +activate_tlp: if ((rack->rc_tlp_in_progress != 0) || (rack->r_ctl.rc_tlp_rtx_out != 0)) { /* @@ -2127,10 +1972,13 @@ goto activate_rxt; } idx = rsm->r_rtr_cnt - 1; - if (TSTMP_GT(cts, rsm->r_tim_lastsent[idx])) - time_since_sent = cts - rsm->r_tim_lastsent[idx]; - else - time_since_sent = 0; + time_since_sent = 0; + if (TSTMP_GEQ(rsm->r_tim_lastsent[idx], rack->r_ctl.rc_tlp_rxt_last_time)) + tstmp_touse = rsm->r_tim_lastsent[idx]; + else + tstmp_touse = rack->r_ctl.rc_tlp_rxt_last_time; + if (TSTMP_GT(tstmp_touse, cts)) + time_since_sent = cts - tstmp_touse; is_tlp_timer = 1; if (tp->t_srtt) { srtt_cur = (tp->t_srtt >> TCP_RTT_SHIFT); @@ -2181,10 +2029,6 @@ rack_enter_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { if (rack->rc_in_persist == 0) { - if (((tp->t_flags & TF_SENTFIN) == 0) && - (tp->snd_max - tp->snd_una) >= sbavail(&rack->rc_inp->inp_socket->so_snd)) - /* Must need to send more data to enter persist */ - return; rack->r_ctl.rc_went_idle_time = cts; rack_timer_cancel(tp, rack, cts, __LINE__); tp->t_rxtshift = 0; @@ -2206,8 +2050,8 @@ } static void -rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, int32_t line, - int32_t slot, uint32_t tot_len_this_send, int32_t frm_out_sbavail) +rack_start_hpts_timer(struct tcp_rack *rack, struct tcpcb *tp, uint32_t cts, + int32_t slot, uint32_t tot_len_this_send, int sup_rack) { struct inpcb *inp; uint32_t delayed_ack = 0; @@ -2220,7 +2064,8 @@ /* A previous call is already set up */ return; } - if (tp->t_state == TCPS_CLOSED) { + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { return; } stopped = rack->rc_tmr_stopped; @@ -2227,6 +2072,7 @@ if (stopped && TSTMP_GT(rack->r_ctl.rc_timer_exp, cts)) { left = rack->r_ctl.rc_timer_exp - cts; } + rack->tlp_timer_up = 0; rack->r_ctl.rc_timer_exp = 0; if (rack->rc_inp->inp_in_hpts == 0) { rack->r_ctl.rc_hpts_flags = 0; @@ -2239,28 +2085,12 @@ * We are still left on the hpts when the to goes * it will be for output. */ - if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) - slot = cts - rack->r_ctl.rc_last_output_to; + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) + slot = rack->r_ctl.rc_last_output_to - cts; else slot = 1; } - if ((tp->snd_wnd == 0) && TCPS_HAVEESTABLISHED(tp->t_state)) { - /* No send window.. we must enter persist */ - rack_enter_persist(tp, rack, cts); - } else if ((frm_out_sbavail && - (frm_out_sbavail > (tp->snd_max - tp->snd_una)) && - (tp->snd_wnd < tp->t_maxseg)) && - TCPS_HAVEESTABLISHED(tp->t_state)) { - /* - * If we have no window or we can't send a segment (and have - * data to send.. we cheat here and frm_out_sbavail is - * passed in with the sbavail(sb) only from bbr_output) and - * we are established, then we must enter persits (if not - * already in persits). 
- */ - rack_enter_persist(tp, rack, cts); - } - hpts_timeout = rack_timer_start(tp, rack, cts); + hpts_timeout = rack_timer_start(tp, rack, cts, sup_rack); if (tp->t_flags & TF_DELACK) { delayed_ack = TICKS_2_MSEC(tcp_delacktime); rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; @@ -2317,6 +2147,11 @@ rack->r_ctl.rc_timer_exp = cts + hpts_timeout; } if (slot) { + rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + else + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; rack->r_ctl.rc_last_output_to = cts + slot; if ((hpts_timeout == 0) || (hpts_timeout > slot)) { if (rack->rc_inp->inp_in_hpts == 0) @@ -2332,6 +2167,15 @@ rack_log_to_start(rack, cts, hpts_timeout, slot, 0); } } else if (hpts_timeout) { + if (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK) { + /* For a rack timer, don't wake us */ + rack->rc_inp->inp_flags2 |= INP_MBUF_QUEUE_READY; + inp->inp_flags2 |= INP_DONT_SACK_QUEUE; + } else { + /* All other timers wake us up */ + rack->rc_inp->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + inp->inp_flags2 &= ~INP_DONT_SACK_QUEUE; + } if (rack->rc_inp->inp_in_hpts == 0) tcp_hpts_insert(tp->t_inpcb, HPTS_MS_TO_SLOTS(hpts_timeout)); rack_log_to_start(rack, cts, hpts_timeout, slot, 0); @@ -2367,7 +2211,7 @@ * settings. */ struct rack_sendmap *rsm; - int32_t recovery; + int32_t recovery, ll; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); @@ -2376,12 +2220,16 @@ /* Its not time yet */ return (0); } - rack_log_to_event(rack, RACK_TO_FRM_RACK); recovery = IN_RECOVERY(tp->t_flags); counter_u64_add(rack_to_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); rsm = rack_check_recovery_mode(tp, cts); + if (rsm) + ll = rsm->r_end - rsm->r_start; + else + ll = 0; + rack_log_to_event(rack, RACK_TO_FRM_RACK, ll); if (rsm) { uint32_t rtt; @@ -2389,7 +2237,7 @@ if (rtt == 0) rtt = 1; if ((recovery == 0) && - (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg)) { + (rack->r_ctl.rc_prr_sndcnt < ctf_fixed_maxseg(tp))) { /* * The rack-timeout that enter's us into recovery * will force out one MSS and set us up so that we @@ -2396,16 +2244,16 @@ * can do one more send in 2*rtt (transitioning the * rack timeout into a rack-tlp). */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; - } else if ((rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) && - ((rsm->r_end - rsm->r_start) > rack->r_ctl.rc_prr_sndcnt)) { + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 3); + } else if ((rack->r_ctl.rc_prr_sndcnt < (rsm->r_end - rsm->r_start)) && + rack->use_rack_cheat) { /* - * When a rack timer goes, we have to send at - * least one segment. They will be paced a min of 1ms - * apart via the next rack timer (or further - * if the rack timer dictates it). + * When a rack timer goes, if the rack cheat is + * on, arrange it so we can send a full segment. 
*/ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 4); } } else { /* This is a case that should happen rarely if ever */ @@ -2419,6 +2267,24 @@ return (0); } +static __inline void +rack_clone_rsm(struct tcp_rack *rack, struct rack_sendmap *nrsm, + struct rack_sendmap *rsm, uint32_t start) +{ + int idx; + + nrsm->r_start = start; + nrsm->r_end = rsm->r_end; + nrsm->r_rtr_cnt = rsm->r_rtr_cnt; + nrsm->r_flags = rsm->r_flags; + nrsm->r_sndcnt = rsm->r_sndcnt; + nrsm->r_rtr_bytes = 0; + rsm->r_end = nrsm->r_start; + for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { + nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; + } +} + /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then @@ -2437,6 +2303,7 @@ struct socket *so; uint32_t amm, old_prr_snd = 0; uint32_t out, avail; + int collapsed_win = 0; if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); @@ -2453,14 +2320,28 @@ * A TLP timer has expired. We have been idle for 2 rtts. So we now * need to figure out how to force a full MSS segment out. */ - rack_log_to_event(rack, RACK_TO_FRM_TLP); + rack_log_to_event(rack, RACK_TO_FRM_TLP, 0); counter_u64_add(rack_tlp_tot, 1); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); so = tp->t_inpcb->inp_socket; +#ifdef KERN_TLS + if (rack->rc_inp->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) { + /* + * For hardware TLS we do *not* want to send + * new data, lets instead just do a retransmission. + */ + goto need_retran; + } +#endif avail = sbavail(&so->so_snd); out = tp->snd_max - tp->snd_una; - rack->rc_timer_up = 1; + rack->tlp_timer_up = 1; + if (out > tp->snd_wnd) { + /* special case, we need a retransmission */ + collapsed_win = 1; + goto need_retran; + } /* * If we are in recovery we can jazz out a segment if new data is * present simply by setting rc_prr_sndcnt to a segment. @@ -2469,9 +2350,9 @@ ((rack_always_send_oldest == 0) || (TAILQ_EMPTY(&rack->r_ctl.rc_tmap)))) { /* New data is available */ amm = avail - out; - if (amm > tp->t_maxseg) { - amm = tp->t_maxseg; - } else if ((amm < tp->t_maxseg) && ((tp->t_flags & TF_NODELAY) == 0)) { + if (amm > ctf_fixed_maxseg(tp)) { + amm = ctf_fixed_maxseg(tp); + } else if ((amm < ctf_fixed_maxseg(tp)) && ((tp->t_flags & TF_NODELAY) == 0)) { /* not enough to fill a MTU and no-delay is off */ goto need_retran; } @@ -2478,9 +2359,10 @@ if (IN_RECOVERY(tp->t_flags)) { /* Unlikely */ old_prr_snd = rack->r_ctl.rc_prr_sndcnt; - if (out + amm <= tp->snd_wnd) + if (out + amm <= tp->snd_wnd) { rack->r_ctl.rc_prr_sndcnt = amm; - else + rack_log_to_prr(rack, 4); + } else goto need_retran; } else { /* Set the send-new override */ @@ -2500,26 +2382,49 @@ * Ok we need to arrange the last un-acked segment to be re-sent, or * optionally the first un-acked segment. 
*/ - if (rack_always_send_oldest) - rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); - else { - rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); - if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { - rsm = rack_find_high_nonack(rack, rsm); + if (collapsed_win == 0) { + if (rack_always_send_oldest) + rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); + else { + rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_map, rack_sendmap, r_next); + if (rsm && (rsm->r_flags & (RACK_ACKED | RACK_HAS_FIN))) { + rsm = rack_find_high_nonack(rack, rsm); + } } - } - if (rsm == NULL) { - counter_u64_add(rack_tlp_does_nada, 1); + if (rsm == NULL) { + counter_u64_add(rack_tlp_does_nada, 1); #ifdef TCP_BLACKBOX - tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); #endif - goto out; + goto out; + } + } else { + /* + * We must find the last segment + * that was acceptable by the client. + */ + TAILQ_FOREACH_REVERSE(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { + if ((rsm->r_flags & RACK_RWND_COLLAPSED) == 0) { + /* Found one */ + break; + } + } + if (rsm == NULL) { + /* None? if so send the first */ + rsm = TAILQ_FIRST(&rack->r_ctl.rc_map); + if (rsm == NULL) { + counter_u64_add(rack_tlp_does_nada, 1); +#ifdef TCP_BLACKBOX + tcp_log_dump_tp_logbuf(tp, "nada counter trips", M_NOWAIT, true); +#endif + goto out; + } + } } - if ((rsm->r_end - rsm->r_start) > tp->t_maxseg) { + if ((rsm->r_end - rsm->r_start) > ctf_fixed_maxseg(tp)) { /* * We need to split this the last segment in two. */ - int32_t idx; struct rack_sendmap *nrsm; nrsm = rack_alloc(rack); @@ -2531,16 +2436,8 @@ counter_u64_add(rack_tlp_does_nada, 1); goto out; } - nrsm->r_start = (rsm->r_end - tp->t_maxseg); - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - rsm->r_end = nrsm->r_start; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } + rack_clone_rsm(rack, nrsm, rsm, + (rsm->r_end - ctf_fixed_maxseg(tp))); TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); @@ -2566,11 +2463,12 @@ * peer in max times. We need the retransmit timer to take * over. 
*/ -restore: + restore: rack->r_ctl.rc_tlpsend = NULL; if (rsm) rsm->r_flags &= ~RACK_TLP; rack->r_ctl.rc_prr_sndcnt = old_prr_snd; + rack_log_to_prr(rack, 5); counter_u64_add(rack_tlp_retran_fail, 1); goto out; } else if (rsm) { @@ -2590,7 +2488,7 @@ rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); out: - rack->rc_timer_up = 0; + rack->tlp_timer_up = 0; rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_TLP; return (0); } @@ -2609,7 +2507,7 @@ if (tp->t_timers->tt_flags & TT_STOPPED) { return (1); } - rack_log_to_event(rack, RACK_TO_FRM_DELACK); + rack_log_to_event(rack, RACK_TO_FRM_DELACK, 0); tp->t_flags &= ~TF_DELACK; tp->t_flags |= TF_ACKNOW; TCPSTAT_INC(tcps_delack); @@ -2628,8 +2526,9 @@ static int rack_timeout_persist(struct tcpcb *tp, struct tcp_rack *rack, uint32_t cts) { + struct tcptemp *t_template; struct inpcb *inp; - int32_t retval = 0; + int32_t retval = 1; inp = tp->t_inpcb; @@ -2677,9 +2576,22 @@ tcp_set_inp_to_drop(rack->rc_inp, ETIMEDOUT); goto out; } - tp->t_flags |= TF_FORCEDATA; + t_template = tcpip_maketemplate(rack->rc_inp); + if (t_template) { + tcp_respond(tp, t_template->tt_ipgen, + &t_template->tt_t, (struct mbuf *)NULL, + tp->rcv_nxt, tp->snd_una - 1, 0); + /* This sends an ack */ + if (tp->t_flags & TF_DELACK) + tp->t_flags &= ~TF_DELACK; + free(t_template, M_TEMP); + } + if (tp->t_rxtshift < TCP_MAXRXTSHIFT) + tp->t_rxtshift++; out: - rack_log_to_event(rack, RACK_TO_FRM_PERSIST); + rack_log_to_event(rack, RACK_TO_FRM_PERSIST, 0); + rack_start_hpts_timer(rack, tp, cts, + 0, 0, 0); return (retval); } @@ -2700,7 +2612,7 @@ } rack->r_ctl.rc_hpts_flags &= ~PACE_TMR_KEEP; inp = tp->t_inpcb; - rack_log_to_event(rack, RACK_TO_FRM_KEEP); + rack_log_to_event(rack, RACK_TO_FRM_KEEP, 0); /* * Keep-alive timer went off; send something or drop connection if * idle for too long. @@ -2731,7 +2643,7 @@ free(t_template, M_TEMP); } } - rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); return (1); dropit: TCPSTAT_INC(tcps_keepdrops); @@ -2756,7 +2668,7 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; rack_timer_cancel(tp, rack, tcp_ts_getticks(), __LINE__); - rack_log_to_event(rack, RACK_TO_FRM_TMR); + rack_log_to_event(rack, RACK_TO_FRM_TMR, 0); if (rack->r_state && (rack->r_state != tp->t_state)) rack_set_state(tp, rack); /* @@ -2779,9 +2691,9 @@ TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, trsm, rsm, r_tnext); } rsm->r_in_tmap = 1; - trsm = rsm; } } + trsm = rsm; rsm->r_flags &= ~(RACK_ACKED | RACK_SACK_PASSED | RACK_WAS_SACKPASS); } /* Clear the count (we just un-acked them) */ @@ -2790,9 +2702,8 @@ rack->r_ctl.rc_tlp_rtx_out = 0; rack->r_ctl.rc_tlp_seg_send_cnt = 0; rack->r_ctl.rc_resend = TAILQ_FIRST(&rack->r_ctl.rc_map); - /* Setup so we send one segment */ - if (rack->r_ctl.rc_prr_sndcnt < tp->t_maxseg) - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 6); rack->r_timer_override = 1; } @@ -2826,7 +2737,18 @@ * retransmit interval. Back off to a longer retransmit interval * and retransmit one segment. */ - if (++tp->t_rxtshift > TCP_MAXRXTSHIFT) { + rack_remxt_tmr(tp); + if ((rack->r_ctl.rc_resend == NULL) || + ((rack->r_ctl.rc_resend->r_flags & RACK_RWND_COLLAPSED) == 0)) { + /* + * If the rwnd collapsed on + * the one we are retransmitting + * it does not count against the + * rxt count. + */ + tp->t_rxtshift++; + } + if (tp->t_rxtshift > TCP_MAXRXTSHIFT) { tp->t_rxtshift = TCP_MAXRXTSHIFT; TCPSTAT_INC(tcps_timeoutdrop); retval = 1; @@ -2834,7 +2756,6 @@ (tp->t_softerror ? 
(uint16_t) tp->t_softerror : ETIMEDOUT)); goto out; } - rack_remxt_tmr(tp); if (tp->t_state == TCPS_SYN_SENT) { /* * If the SYN was retransmitted, indicate CWND to be limited @@ -2971,16 +2892,6 @@ } } /* - * Disable RFC1323 and SACK if we haven't got any response to our - * third SYN to work-around some broken terminal servers (most of - * which have hopefully been retired) that have bad VJ header - * compression code which trashes TCP segments containing - * unknown-to-them TCP options. - */ - if (tcp_rexmit_drop_options && (tp->t_state == TCPS_SYN_SENT) && - (tp->t_rxtshift == 3)) - tp->t_flags &= ~(TF_REQ_SCALE | TF_REQ_TSTMP | TF_SACK_PERMIT); - /* * If we backed off this far, our srtt estimate is probably bogus. * Clobber it so we'll take the next rtt measurement as our srtt; * move the current srtt into rttvar to keep the current retransmit @@ -3050,10 +2961,13 @@ if (timers & PACE_TMR_DELACK) { ret = rack_timeout_delack(tp, rack, cts); } else if (timers & PACE_TMR_RACK) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rack(tp, rack, cts); } else if (timers & PACE_TMR_TLP) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_tlp(tp, rack, cts); } else if (timers & PACE_TMR_RXT) { + rack->r_ctl.rc_tlp_rxt_last_time = cts; ret = rack_timeout_rxt(tp, rack, cts); } else if (timers & PACE_TMR_PERSIT) { ret = rack_timeout_persist(tp, rack, cts); @@ -3186,7 +3100,6 @@ struct rack_sendmap *nrsm; uint32_t c_end; int32_t len; - int32_t idx; len = *lenp; c_end = rsm->r_start + len; @@ -3228,16 +3141,7 @@ * we retransmitted 5 bytes i.e. 1, 5. The original piece shrinks to * 1, 6 and the new piece will be 6, 11. */ - nrsm->r_start = c_end; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - rsm->r_end = c_end; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } + rack_clone_rsm(rack, nrsm, rsm, c_end); TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); @@ -3258,7 +3162,6 @@ struct tcp_rack *rack; struct rack_sendmap *rsm, *nrsm; register uint32_t snd_max, snd_una; - int32_t idx; /* * Add to the RACK log of packets in flight or retransmitted. If @@ -3430,16 +3333,7 @@ * copy rsm to nrsm and then trim the front of rsm * to not include this part. */ - nrsm->r_start = seq_out; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - nrsm->r_rtr_bytes = 0; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } - rsm->r_end = nrsm->r_start; + rack_clone_rsm(rack, nrsm, rsm, seq_out); TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); @@ -3694,7 +3588,7 @@ rack->r_ctl.rc_rack_min_rtt = 1; } } - tcp_rack_xmit_timer(rack, TCP_TS_TO_TICKS(t) + 1); + tcp_rack_xmit_timer(rack, t + 1); if ((rsm->r_flags & RACK_TLP) && (!IN_RECOVERY(tp->t_flags))) { /* Segment was a TLP and our retrans matched */ @@ -3707,9 +3601,9 @@ * When we enter recovery we need to assure * we send one packet. 
*/ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; - } else - rack->r_ctl.rc_tlp_rtx_out = 0; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 7); + } } if (SEQ_LT(rack->r_ctl.rc_rack_tmit_time, rsm->r_tim_lastsent[(rsm->r_rtr_cnt - 1)])) { /* New more recent rack_tmit_time */ @@ -3873,7 +3767,6 @@ rack_proc_sack_blk(struct tcpcb *tp, struct tcp_rack *rack, struct sackblk *sack, struct tcpopt *to, struct rack_sendmap **prsm, uint32_t cts) { - int32_t idx; int32_t times = 0; uint32_t start, end, changed = 0; struct rack_sendmap *rsm, *nrsm; @@ -3950,16 +3843,7 @@ */ goto out; } - nrsm->r_start = start; - nrsm->r_rtr_bytes = 0; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } - rsm->r_end = nrsm->r_start; + rack_clone_rsm(rack, nrsm, rsm, start); TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); if (rsm->r_in_tmap) { TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); @@ -3974,6 +3858,8 @@ * at this guy. */ + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; if ((rsm->r_flags & RACK_ACKED) == 0) { rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); @@ -4009,15 +3895,7 @@ goto out; } /* Clone it */ - nrsm->r_start = end; - nrsm->r_end = rsm->r_end; - nrsm->r_rtr_bytes = 0; - nrsm->r_rtr_cnt = rsm->r_rtr_cnt; - nrsm->r_flags = rsm->r_flags; - nrsm->r_sndcnt = rsm->r_sndcnt; - for (idx = 0; idx < nrsm->r_rtr_cnt; idx++) { - nrsm->r_tim_lastsent[idx] = rsm->r_tim_lastsent[idx]; - } + rack_clone_rsm(rack, nrsm, rsm, end); /* The sack block does not cover this guy fully */ rsm->r_flags &= (~RACK_HAS_FIN); rsm->r_end = end; @@ -4030,6 +3908,8 @@ /* Been here done that */ goto out; } + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; rack_update_rtt(tp, rack, rsm, to, cts, SACKED); changed += (rsm->r_end - rsm->r_start); rack->r_ctl.rc_sacked += (rsm->r_end - rsm->r_start); @@ -4172,6 +4052,8 @@ rack->r_ctl.rc_holes_rxt -= rsm->r_rtr_bytes; rsm->r_rtr_bytes = 0; + if (rsm->r_flags & RACK_TLP) + rack->r_ctl.rc_tlp_rtx_out = 0; TAILQ_REMOVE(&rack->r_ctl.rc_map, rsm, r_next); if (rsm->r_in_tmap) { TAILQ_REMOVE(&rack->r_ctl.rc_tmap, rsm, r_tnext); @@ -4391,12 +4273,13 @@ * When we enter recovery we need to assure we send * one packet. 
*/ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 8); rack->r_timer_override = 1; } } if (IN_RECOVERY(tp->t_flags) && (entered_recovery == 0)) { - /* Deal with changed an PRR here (in recovery only) */ + /* Deal with changed and PRR here (in recovery only) */ uint32_t pipe, snd_una; rack->r_ctl.rc_prr_delivered += changed; @@ -4415,6 +4298,7 @@ sndcnt /= (long)rack->r_ctl.rc_prr_recovery_fs; else { rack->r_ctl.rc_prr_sndcnt = 0; + rack_log_to_prr(rack, 9); sndcnt = 0; } sndcnt++; @@ -4423,6 +4307,7 @@ else sndcnt = 0; rack->r_ctl.rc_prr_sndcnt = sndcnt; + rack_log_to_prr(rack, 10); } else { uint32_t limit; @@ -4432,14 +4317,16 @@ limit = 0; if (changed > limit) limit = changed; - limit += tp->t_maxseg; + limit += ctf_fixed_maxseg(tp); if (tp->snd_ssthresh > pipe) { rack->r_ctl.rc_prr_sndcnt = min((tp->snd_ssthresh - pipe), limit); + rack_log_to_prr(rack, 11); } else { rack->r_ctl.rc_prr_sndcnt = min(0, limit); + rack_log_to_prr(rack, 12); } } - if (rack->r_ctl.rc_prr_sndcnt >= tp->t_maxseg) { + if (rack->r_ctl.rc_prr_sndcnt >= ctf_fixed_maxseg(tp)) { rack->r_timer_override = 1; } } @@ -4466,10 +4353,13 @@ rack = (struct tcp_rack *)tp->t_fb_ptr; if (SEQ_GT(th->th_ack, tp->snd_max)) { - rack_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + rack->r_wanted_output++; return (1); } if (SEQ_GEQ(th->th_ack, tp->snd_una) || to->to_nsacks) { + if (rack->rc_in_persist) + tp->t_rxtshift = 0; rack_log_ack(tp, to, th); } if (__predict_false(SEQ_LEQ(th->th_ack, tp->snd_una))) { @@ -4543,9 +4433,6 @@ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); rack->r_wanted_output++; } - /* - * If no data (only SYN) was ACK'd, skip rest of ACK processing. - */ if (acked == 0) { if (ofia) *ofia = ourfinisacked; @@ -4598,7 +4485,8 @@ if (tp->snd_una == tp->snd_max) { /* Nothing left outstanding */ rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); - tp->t_acktime = 0; + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); /* Set need output so persist might get set */ rack->r_wanted_output++; @@ -4614,7 +4502,7 @@ */ *ret_val = 1; tp = tcp_close(tp); - rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, tlen); return (1); } } @@ -4623,7 +4511,123 @@ return (0); } +static void +rack_collapsed_window(struct tcp_rack *rack) +{ + /* + * Now we must walk the + * send map and divide the + * ones left stranded. These + * guys can't cause us to abort + * the connection and are really + * "unsent". However, if a buggy + * client actually did keep some + * of the data, i.e. collapsed the win + * and refused to ack and then opened + * the win and acked that data, we + * would get into an ack war; so the + * simpler method of just pretending we + * did not send those segments + * won't work.
+ */ + struct rack_sendmap *rsm, *nrsm; + tcp_seq max_seq; + uint32_t maxseg; + int fnd = 0, can_split = 0; + max_seq = rack->rc_tp->snd_una + rack->rc_tp->snd_wnd; + maxseg = ctf_fixed_maxseg(rack->rc_tp); + TAILQ_FOREACH(rsm, &rack->r_ctl.rc_map, r_next) { + /* Find the first seq past or at maxseq */ + if (rsm->r_flags & RACK_RWND_COLLAPSED) + rsm->r_flags &= ~RACK_RWND_COLLAPSED; + if (SEQ_GEQ(max_seq, rsm->r_start) && + SEQ_GEQ(rsm->r_end, max_seq)) { + fnd = 1; + break; + } + } + rack->rc_has_collapsed = 0; + if (!fnd) { + /* Nothing to do strange */ + return; + } + /* Now can we split? */ + if ((max_seq != rsm->r_start) && + (max_seq != rsm->r_end)){ + /* can we split? */ + int res1, res2; + + res1 = max_seq - rsm->r_start; + res2 = rsm->r_end - max_seq; + if ((res1 >= (maxseg/8)) && + (res2 >= (maxseg/8))) { + /* No small pieces here */ + can_split = 1; + } else if (rack->r_ctl.rc_num_maps_alloced < rack_sack_block_limit) { + /* We are under the limit */ + can_split = 1; + } + } + /* + * Ok do we need to split this rsm? + * + * We don't want to split if splitting + * would generate too many small segments + * less we let an attacker fragment our + * send_map and leave us out of memory. + */ + if (max_seq == rsm->r_start) { + /* It's this guy */ + nrsm = rsm; + } else if (max_seq == rsm->r_end) { + /* It's the next guy */ + nrsm = TAILQ_NEXT(rsm, r_next); + if (nrsm == NULL) { + /* Huh? */ + return; + } + } else if (can_split && SEQ_GEQ(max_seq, rsm->r_end)) { + /* yep */ + nrsm = rack_alloc(rack); + if (nrsm == NULL) { + /* We can't get a rsm, mark all? */ + nrsm = rsm; + goto no_split; + } + /* Clone it */ + rack_clone_rsm(rack, nrsm, rsm, max_seq); + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_map, rsm, nrsm, r_next); + if (rsm->r_in_tmap) { + TAILQ_INSERT_AFTER(&rack->r_ctl.rc_tmap, rsm, nrsm, r_tnext); + nrsm->r_in_tmap = 1; + } + } else { + /* We can't split so just point to this guy */ + nrsm = rsm; + } +no_split: + counter_u64_add(rack_collapsed_win, 1); + TAILQ_FOREACH_FROM(nrsm, &rack->r_ctl.rc_map, r_next) { + nrsm->r_flags |= RACK_RWND_COLLAPSED; + rack->rc_has_collapsed = 1; + } +} + +static void +rack_un_collapse_window(struct tcp_rack *rack) +{ + struct rack_sendmap *rsm; + + TAILQ_FOREACH_REVERSE(rsm, &rack->r_ctl.rc_map, rack_head, r_next) { + if (rsm->r_flags & RACK_RWND_COLLAPSED) + rsm->r_flags &= ~RACK_RWND_COLLAPSED; + else + break; + } + rack->rc_has_collapsed = 0; +} + /* * Return value of 1, the TCB is unlocked and most * likely gone, return value of 0, the TCP is still @@ -4666,13 +4670,36 @@ tp->snd_wl2 = th->th_ack; } } + if (tp->snd_wnd < ctf_outstanding(tp)) + /* The peer collapsed the window */ + rack_collapsed_window(rack); + else if (rack->rc_has_collapsed) + rack_un_collapse_window(rack); /* Was persist timer active and now we have window space? */ - if ((rack->rc_in_persist != 0) && tp->snd_wnd) { + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack); tp->snd_nxt = tp->snd_max; /* Make sure we output to start the timer */ rack->r_wanted_output++; } + /* Do we enter persists? 
*/ + if ((rack->rc_in_persist == 0) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* + * Here the rwnd is less than + * the pacing size, we are established, + * nothing is outstanding, and there is + * data to send. Enter persists. + */ + tp->snd_nxt = tp->snd_una; + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } if (tp->t_flags2 & TF2_DROP_AF_DATA) { m_freem(m); return (0); @@ -4972,7 +4999,7 @@ so->so_rcv.sb_flags &= ~SB_AUTOSIZE; m_adj(m, drop_hdrlen); /* delayed header drop */ sbappendstream_locked(&so->so_rcv, m, 0); - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); } /* NB: sorwakeup_locked() does an implicit unlock. */ sorwakeup_locked(so); @@ -5053,6 +5080,12 @@ /* Ok if we reach here, we can process a fast-ack */ nsegs = max(1, m->m_pkthdr.lro_nsegs); rack_log_ack(tp, to, th); + /* + * We made progress, clear the tlp + * out flag so we could start a TLP + * again. + */ + rack->r_ctl.rc_tlp_rtx_out = 0; /* Did the window get updated? */ if (tiwin != tp->snd_wnd) { tp->snd_wnd = tiwin; @@ -5060,9 +5093,28 @@ if (tp->snd_wnd > tp->max_sndwnd) tp->max_sndwnd = tp->snd_wnd; } - if ((rack->rc_in_persist != 0) && (tp->snd_wnd >= tp->t_maxseg)) { + /* Do we exit persists? */ + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) { rack_exit_persist(tp, rack); } + /* Do we enter persists? */ + if ((rack->rc_in_persist == 0) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->snd_max == tp->snd_una) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd)) { + /* + * Here the rwnd is less than + * the pacing size, we are established, + * nothing is outstanding, and there is + * data to send. Enter persists. + */ + tp->snd_nxt = tp->snd_una; + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } /* * If last ACK falls within this segment's sequence numbers, record * the timestamp. NOTE that the test is modified according to the @@ -5112,6 +5164,12 @@ rack_ack_received(tp, rack, th, nsegs, CC_ACK, 0); tp->snd_una = th->th_ack; + if (tp->snd_wnd < ctf_outstanding(tp)) { + /* The peer collapsed the window */ + rack_collapsed_window(rack); + } else if (rack->rc_has_collapsed) + rack_un_collapse_window(rack); + /* * Pull snd_wl2 up to prevent seq wrap relative to th_ack. */ @@ -5135,7 +5193,8 @@ #endif if (tp->snd_una == tp->snd_max) { rack_log_progress_event(rack, tp, 0, PROGRESS_CLEAR, __LINE__); - tp->t_acktime = 0; + if (sbavail(&tp->t_inpcb->inp_socket->so_snd) == 0) + tp->t_acktime = 0; rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); } /* Wake up the socket if we have room to write more */ @@ -5159,8 +5218,9 @@ int32_t ret_val = 0; int32_t todrop; int32_t ourfinisacked = 0; + struct tcp_rack *rack; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); /* * If the state is SYN_SENT: if seg contains an ACK, but not for our * SYN, drop the input. 
if seg contains a RST, then drop the @@ -5175,7 +5235,7 @@ if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } if ((thflags & (TH_ACK | TH_RST)) == (TH_ACK | TH_RST)) { @@ -5182,19 +5242,20 @@ TCP_PROBE5(connect__refused, NULL, tp, mtod(m, const char *), tp, th); tp = tcp_drop(tp, ECONNREFUSED); - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (thflags & TH_RST) { - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (!(thflags & TH_SYN)) { - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } tp->irs = th->th_seq; tcp_rcvseqinit(tp); + rack = (struct tcp_rack *)tp->t_fb_ptr; if (thflags & TH_ACK) { int tfo_partial = 0; @@ -5224,11 +5285,11 @@ * will be turned on later. */ if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { - rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, - ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); + rack_timer_cancel(tp, rack, + rack->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; } else { - ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; + rack->r_wanted_output++; tp->t_flags |= TF_ACKNOW; } @@ -5303,6 +5364,16 @@ * of step 5, ack processing. Otherwise, goto step 6. */ if (thflags & TH_ACK) { + /* For syn-sent we need to possibly update the rtt */ + if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + uint32_t t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer_commit(rack, tp); + } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) return (ret_val); /* We may have changed to FIN_WAIT_1 above */ @@ -5348,17 +5419,18 @@ struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt) { + struct tcp_rack *rack; int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); - + ctf_calc_rwin(so, tp); if ((thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->snd_una) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } + rack = (struct tcp_rack *)tp->t_fb_ptr; if (IS_FASTOPEN(tp->t_flags)) { /* * When a TFO connection is in SYN_RECEIVED, the @@ -5368,26 +5440,24 @@ * FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } else if (thflags & TH_SYN) { /* non-initial SYN is ignored */ - struct tcp_rack *rack; - - rack = (struct tcp_rack *)tp->t_fb_ptr; if ((rack->r_ctl.rc_hpts_flags & PACE_TMR_RXT) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_TLP) || (rack->r_ctl.rc_hpts_flags & PACE_TMR_RACK)) { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } else if (!(thflags & (TH_ACK | TH_FIN | TH_RST))) { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. 
@@ -5394,7 +5464,7 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } /* @@ -5405,10 +5475,10 @@ * "LAND" DoS attack. */ if (SEQ_LT(th->th_seq, tp->irs)) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5491,6 +5561,16 @@ (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; + /* For syn-recv we need to possibly update the rtt */ + if ((to->to_flags & TOF_TS) != 0 && to->to_tsecr) { + uint32_t t; + + t = tcp_ts_getticks() - to->to_tsecr; + if (!tp->t_rttlow || tp->t_rttlow > t) + tp->t_rttlow = t; + tcp_rack_xmit_timer(rack, t + 1); + tcp_rack_xmit_timer_commit(rack, tp); + } if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { return (ret_val); } @@ -5570,10 +5650,11 @@ } } } - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in @@ -5580,7 +5661,7 @@ * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5589,10 +5670,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5628,10 +5709,11 @@ tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5644,7 +5726,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -5665,15 +5747,16 @@ { int32_t ret_val = 0; - rack_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + ctf_calc_rwin(so, tp); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5682,10 +5765,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5720,10 +5803,11 @@ tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5736,7 +5820,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -5756,7 +5840,7 @@ close_now: tp = tcp_close(tp); TCPSTAT_INC(tcps_rcvafterclose); - rack_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); + ctf_do_dropwithreset(m, tp, th, BANDLIM_UNLIMITED, (*tlen)); return (1); } if (sbavail(&so->so_snd) == 0) @@ -5782,16 +5866,17 @@ int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5800,10 +5885,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5845,10 +5930,11 @@ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5880,7 +5966,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -5901,16 +5987,17 @@ int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -5919,10 +6006,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -5964,10 +6051,11 @@ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -5986,7 +6074,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6007,16 +6095,17 @@ int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. */ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -6025,10 +6114,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6070,10 +6159,11 @@ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6086,13 +6176,13 @@ if (ourfinisacked) { INP_INFO_RLOCK_ASSERT(&V_tcbinfo); tp = tcp_close(tp); - rack_do_drop(m, tp); + ctf_do_drop(m, tp); return (1); } if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6114,17 +6204,18 @@ int32_t ret_val = 0; int32_t ourfinisacked = 0; - rack_calc_rwin(so, tp); + ctf_calc_rwin(so, tp); /* Reset receive buffer auto scaling when not in bulk receive mode. */ - if (thflags & TH_RST) - return (rack_process_rst(m, th, so, tp)); + if ((thflags & TH_RST) || + (tp->t_fin_is_rst && (thflags & TH_FIN))) + return (ctf_process_rst(m, th, so, tp)); /* * RFC5961 Section 4.2 Send challenge ACK for any SYN in * synchronized state. 
*/ if (thflags & TH_SYN) { - rack_challenge_ack(m, th, tp, &ret_val); + ctf_challenge_ack(m, th, tp, &ret_val); return (ret_val); } /* @@ -6133,10 +6224,10 @@ */ if ((to->to_flags & TOF_TS) != 0 && tp->ts_recent && TSTMP_LT(to->to_tsval, tp->ts_recent)) { - if (rack_ts_check(m, th, tp, tlen, thflags, &ret_val)) + if (ctf_ts_check(m, th, tp, tlen, thflags, &ret_val)) return (ret_val); } - if (rack_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { + if (ctf_drop_checks(to, m, th, tp, &tlen, &thflags, &drop_hdrlen, &ret_val)) { return (ret_val); } /* @@ -6179,10 +6270,11 @@ return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } else if (tp->t_flags & TF_ACKNOW) { - rack_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ctf_do_dropafterack(m, tp, th, thflags, tlen, &ret_val); + ((struct tcp_rack *)tp->t_fb_ptr)->r_wanted_output++; return (ret_val); } else { - rack_do_drop(m, NULL); + ctf_do_drop(m, NULL); return (0); } } @@ -6195,7 +6287,7 @@ if (sbavail(&so->so_snd)) { if (rack_progress_timeout_check(tp)) { tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } } @@ -6212,6 +6304,32 @@ rack->r_ctl.rack_rs.rs_rtt_tot = 0; } +static void +rack_set_pace_segments(struct tcpcb *tp, struct tcp_rack *rack) +{ + uint32_t tls_seg = 0; + +#ifdef KERN_TLS + if (rack->rc_inp->inp_socket->so_snd.sb_tls_flags & SB_TLS_IFNET) { + tls_seg = ctf_get_opt_tls_size(rack->rc_inp->inp_socket, rack->rc_tp->snd_wnd); + rack->r_ctl.rc_pace_min_segs = tls_seg; + } else +#endif + rack->r_ctl.rc_pace_min_segs = ctf_fixed_maxseg(tp); + rack->r_ctl.rc_pace_max_segs = ctf_fixed_maxseg(tp) * rack->rc_pace_max_segs; + if (rack->r_ctl.rc_pace_max_segs > PACE_MAX_IP_BYTES) + rack->r_ctl.rc_pace_max_segs = PACE_MAX_IP_BYTES; +#ifdef KERN_TLS + if (tls_seg != 0) { + rack->r_ctl.rc_pace_max_segs %= tls_seg; + if (rack->r_ctl.rc_pace_max_segs == 0) + rack->r_ctl.rc_pace_max_segs = 1; + rack->r_ctl.rc_pace_max_segs *= tls_seg; + } +#endif + rack_log_type_hrdwtso(tp, rack, tls_seg, /*rack->rc_inp->inp_socket->so_snd.sb_tls_flags*/ 0, 0, 2); +} + static int rack_init(struct tcpcb *tp) { @@ -6237,6 +6355,7 @@ if (tp->t_inpcb) { rack->rc_inp = tp->t_inpcb; } + tp->t_inpcb->inp_flags2 |= INP_SUPPORTS_MBUFQ; /* Probably not needed but lets be sure */ rack_clear_rate_sample(rack); rack->r_cpu = 0; @@ -6244,28 +6363,29 @@ rack->rc_allow_data_af_clo = rack_ignore_data_after_close; rack->r_ctl.rc_tlp_threshold = rack_tlp_thresh; rack->rc_pace_reduce = rack_slot_reduction; + if (use_rack_cheat) + rack->use_rack_cheat = 1; if (V_tcp_delack_enabled) tp->t_delayed_ack = 1; else tp->t_delayed_ack = 0; rack->rc_pace_max_segs = rack_hptsi_segments; - rack->r_ctl.rc_early_recovery_segs = rack_early_recovery_max_seg; rack->r_ctl.rc_reorder_shift = rack_reorder_thresh; rack->r_ctl.rc_pkt_delay = rack_pkt_delay; rack->r_ctl.rc_prop_reduce = rack_use_proportional_reduce; - rack->r_idle_reduce_largest = rack_reduce_largest_on_idle; rack->r_enforce_min_pace = rack_min_pace_time; - rack->r_min_pace_seg_thresh = rack_min_pace_time_seg_req; rack->r_ctl.rc_prop_rate = rack_proportional_rate; rack->r_ctl.rc_tlp_cwnd_reduce = rack_lower_cwnd_at_tlp; rack->r_ctl.rc_early_recovery = rack_early_recovery; rack->rc_always_pace = rack_pace_every_seg; + rack_set_pace_segments(tp, rack); + rack->r_ctl.rc_high_rwnd = tp->snd_wnd; rack->r_ctl.rc_rate_sample_method = 
rack_rate_sample_method; rack->rack_tlp_threshold_use = rack_tlp_threshold_use; rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; - rack->r_ctl.rc_prr_inc_var = rack_inc_var; - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + rack->rack_per_of_gp = rack_per_of_gp; + rack->r_ctl.rc_tlp_rxt_last_time = tcp_ts_getticks(); if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -6277,7 +6397,7 @@ return (ENOMEM); } rsm->r_flags = RACK_OVERMAX; - rsm->r_tim_lastsent[0] = tcp_ts_getticks(); + rsm->r_tim_lastsent[0] = rack->r_ctl.rc_tlp_rxt_last_time; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; rsm->r_start = tp->snd_una; @@ -6287,6 +6407,8 @@ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } + rack_stop_all_timers(tp); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); return (0); } @@ -6322,7 +6444,10 @@ if (tp->t_fb_ptr) { struct tcp_rack *rack; struct rack_sendmap *rsm; - + if (tp->t_inpcb) { + tp->t_inpcb->inp_flags2 &= ~INP_SUPPORTS_MBUFQ; + tp->t_inpcb->inp_flags2 &= ~INP_MBUF_QUEUE_READY; + } rack = (struct tcp_rack *)tp->t_fb_ptr; #ifdef TCP_BLACKBOX tcp_log_flowend(tp); @@ -6345,6 +6470,7 @@ } } + static void rack_set_state(struct tcpcb *tp, struct tcp_rack *rack) { @@ -6358,6 +6484,7 @@ rack->r_substate = rack_do_syn_recv; break; case TCPS_ESTABLISHED: + rack_set_pace_segments(tp, rack); rack->r_state = TCPS_ESTABLISHED; rack->r_substate = rack_do_established; break; @@ -6385,9 +6512,6 @@ case TCPS_CLOSED: case TCPS_TIME_WAIT: default: -#ifdef INVARIANTS - panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); -#endif break; }; } @@ -6435,21 +6559,13 @@ return; } } - if (rsm && (rsm->r_flags & RACK_SACK_PASSED)) { - if ((tp->t_flags & TF_SENTFIN) && - ((tp->snd_max - tp->snd_una) == 1) && - (rsm->r_flags & RACK_HAS_FIN)) { - /* needs to be a RXT */ - if (tmr_up == PACE_TMR_RXT) - return; - } else if (tmr_up == PACE_TMR_RACK) - return; - } else if (SEQ_GT(tp->snd_max,tp->snd_una) && + if (SEQ_GT(tp->snd_max, tp->snd_una) && ((tmr_up == PACE_TMR_TLP) || + (tmr_up == PACE_TMR_RACK) || (tmr_up == PACE_TMR_RXT))) { /* - * Either a TLP or RXT is fine if no sack-passed - * is in place and data is outstanding. + * Either a Rack, TLP or RXT is fine if we + * have outstanding data. */ return; } else if (tmr_up == PACE_TMR_DELACK) { @@ -6468,11 +6584,11 @@ * with the slot set to what was in the saved slot. 
*/ rack_timer_cancel(tp, rack, rack->r_ctl.rc_rcvtime, __LINE__); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } -static void -rack_hpts_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so, +static int +rack_do_segment_nounlock(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos, int32_t nxt_pkt, struct timeval *tv) { @@ -6485,6 +6601,10 @@ struct rack_sendmap *rsm; int32_t prev_state = 0; + if (m->m_flags & M_TSTMP_LRO) { + tv->tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv->tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } cts = tcp_tv_to_mssectick(tv); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -6506,17 +6626,23 @@ __func__)); KASSERT(tp->t_state != TCPS_TIME_WAIT, ("%s: TCPS_TIME_WAIT", __func__)); - { + if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; - TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, - tlen, &log, true); + log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); + log.u_bbr.pkts_out = rack->rc_tp->t_maxseg; + TCP_LOG_EVENTP(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, + tlen, &log, true, &tv); } if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { way_out = 4; + retval = 0; goto done_with_input; } /* @@ -6525,8 +6651,8 @@ */ if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return; + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return(1); } /* * Segment received on connection. Reset idle time and keep-alive @@ -6533,32 +6659,16 @@ * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ - if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { -#ifdef NETFLIX_CWV - if ((tp->cwv_enabled) && - ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { - tcp_newcwv_nvp_closedown(tp); - } else -#endif - if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { - counter_u64_add(rack_input_idle_reduces, 1); - rack_cc_after_idle(tp, - (rack->r_idle_reduce_largest ? 1 :0)); - } + if (tp->t_idle_reduce && + (tp->snd_max == tp->snd_una) && + ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) { + counter_u64_add(rack_input_idle_reduces, 1); + rack_cc_after_idle(tp); + } rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } -#endif /* * Unscale the window into a 32-bit value. For the SYN_SENT state * the scale is zero. @@ -6567,6 +6677,8 @@ #ifdef NETFLIX_STATS stats_voi_update_abs_ulong(tp->t_stats, VOI_TCP_FRWIN, tiwin); #endif + if (tiwin > rack->r_ctl.rc_high_rwnd) + rack->r_ctl.rc_high_rwnd = tiwin; /* * TCP ECN processing. XXXJTL: If we ever use ECN, we need to move * this to occur after we've validated the segment. 
@@ -6676,12 +6788,11 @@ tcp_switch_back_to_default(tp); (*tp->t_fb->tfb_tcp_do_segment) (m, th, so, tp, drop_hdrlen, tlen, iptos); - return; + return (1); } /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); - rack_stop_all_timers(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* @@ -6712,33 +6823,24 @@ * is gone. */ INP_WLOCK_ASSERT(tp->t_inpcb); + if (rack->set_pacing_done_a_iw == 0) { + /* How much has been acked? */ + if ((tp->snd_una - tp->iss) > (ctf_fixed_maxseg(tp) * 10)) { + /* We have enough to set in the pacing segment size */ + rack->set_pacing_done_a_iw = 1; + rack_set_pace_segments(tp, rack); + } + } tcp_rack_xmit_timer_commit(rack, tp); - if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && - (rack->rc_in_persist == 0)){ - /* - * The peer shrunk its window on us to the point - * where we have sent too much. The only thing - * we can do here is stop any timers and - * enter persist. We most likely lost the last - * bytes we sent but oh well, we will have to - * retransmit them after the peer is caught up. - */ - if (rack->rc_inp->inp_in_hpts) - tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack_timer_cancel(tp, rack, cts, __LINE__); - rack_enter_persist(tp, rack, cts); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); - way_out = 3; - goto done_with_input; - } - if (nxt_pkt == 0) { + if ((nxt_pkt == 0) || (IN_RECOVERY(tp->t_flags))) { if (rack->r_wanted_output != 0) { did_out = 1; (void)tp->t_fb->tfb_tcp_output(tp); } - rack_start_hpts_timer(rack, tp, cts, __LINE__, 0, 0, 0); + rack_start_hpts_timer(rack, tp, cts, 0, 0, 0); } - if (((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && + if ((nxt_pkt == 0) && + ((rack->r_ctl.rc_hpts_flags & PACE_TMR_MASK) == 0) && (SEQ_GT(tp->snd_max, tp->snd_una) || (tp->t_flags & TF_DELACK) || ((tcp_always_keepalive || rack->rc_inp->inp_socket->so_options & SO_KEEPALIVE) && @@ -6746,16 +6848,19 @@ /* We could not send (probably in the hpts but stopped the timer earlier)? */ if ((tp->snd_max == tp->snd_una) && ((tp->t_flags & TF_DELACK) == 0) && + (rack->rc_inp->inp_in_hpts) && (rack->r_ctl.rc_hpts_flags & PACE_PKT_OUTPUT)) { /* keep alive not needed if we are hptsi output yet */ ; } else { - if (rack->rc_inp->inp_in_hpts) + if (rack->rc_inp->inp_in_hpts) { tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); + counter_u64_add(rack_per_timer_hole, 1); + } + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), 0, 0, 0); } way_out = 1; - } else { + } else if (nxt_pkt == 0) { /* Do we have the correct timer running? 
*/ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; @@ -6771,8 +6876,8 @@ retval, tp, prev_state); } #endif - INP_WUNLOCK(tp->t_inpcb); } + return (retval); } void @@ -6780,31 +6885,24 @@ struct tcpcb *tp, int32_t drop_hdrlen, int32_t tlen, uint8_t iptos) { struct timeval tv; -#ifdef RSS - struct tcp_function_block *tfb; - struct tcp_rack *rack; - struct epoch_tracker et; - rack = (struct tcp_rack *)tp->t_fb_ptr; - if (rack->r_state == 0) { - /* - * Initial input (ACK to SYN-ACK etc)lets go ahead and get - * it processed - */ - INP_INFO_RLOCK_ET(&V_tcbinfo, et); + /* First let's see if we have old packets */ + if (tp->t_in_pkt) { + if (ctf_do_queued_segments(so, tp, 1)) { + m_freem(m); + return; + } + } + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be; should we kassert instead? */ tcp_get_usecs(&tv); - rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, 0, &tv); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - return; } - tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); - INP_WUNLOCK(tp->t_inpcb); -#else - tcp_get_usecs(&tv); - rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, - tlen, iptos, 0, &tv); -#endif + if(rack_do_segment_nounlock(m, th, so, tp, + drop_hdrlen, tlen, iptos, 0, &tv) == 0) + INP_WUNLOCK(tp->t_inpcb); } struct rack_sendmap * @@ -6855,6 +6953,103 @@ return (NULL); } +static int32_t +rack_get_pacing_delay(struct tcp_rack *rack, struct tcpcb *tp, uint32_t len) +{ + int32_t slot = 0; + + if ((rack->rack_per_of_gp == 0) || + (rack->rc_always_pace == 0)) { + /* + * We use the most optimistic possible cwnd/srtt for + * sending calculations. This will make our + * calculation anticipate getting more through + * quicker than possible. But that's ok; we don't want + * the peer to have a gap in data sending. + */ + uint32_t srtt, cwnd, tr_perms = 0; + +old_method: + if (rack->r_ctl.rc_rack_min_rtt) + srtt = rack->r_ctl.rc_rack_min_rtt; + else + srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); + if (rack->r_ctl.rc_rack_largest_cwnd) + cwnd = rack->r_ctl.rc_rack_largest_cwnd; + else + cwnd = tp->snd_cwnd; + tr_perms = cwnd / srtt; + if (tr_perms == 0) { + tr_perms = ctf_fixed_maxseg(tp); + } + /* + * Calculate how long this will take to drain, if + * the calculation comes out to zero, that's ok; we + * will use send_a_lot to possibly spin around for + * more, increasing tot_len_this_send to the point + * that it's going to require a pace, or we hit the + * cwnd, in which case we are just waiting for + * an ACK. + */ + slot = len / tr_perms; + /* Now do we reduce the time so we don't run dry? */ + if (slot && rack->rc_pace_reduce) { + int32_t reduce; + + reduce = (slot / rack->rc_pace_reduce); + if (reduce < slot) { + slot -= reduce; + } else + slot = 0; + } + } else { + int cnt; + uint64_t bw_est, bw_raise, res, lentim; + + bw_est = 0; + for (cnt=0; cnt<RACK_GP_HIST; cnt++) { + if ((rack->r_ctl.rc_gp_hist_filled == 0) && + (rack->r_ctl.rc_gp_history[cnt] == 0)) + break; + bw_est += rack->r_ctl.rc_gp_history[cnt]; + } + if (bw_est == 0) { + /* + * No way yet to make a b/w estimate + * (no goodput est yet). + */ + goto old_method; + } + /* Convert to bytes per second */ + bw_est *= MSEC_IN_SECOND; + /* + * Now ratchet it up by our percentage. Note + * that the minimum you can do is 1 which would + * get you 101% of the average of the last N goodput estimates. + * The max you can do is 256 which would yield you + * 356% of the last N goodput estimates. 
+ */ + bw_raise = bw_est * (uint64_t)rack->rack_per_of_gp; + bw_est += bw_raise; + /* average by the number we added */ + bw_est /= cnt; + /* Now calculate a rate based on this b/w */ + lentim = (uint64_t) len * (uint64_t)MSEC_IN_SECOND; + res = lentim / bw_est; + slot = (uint32_t)res; + } + if (rack->r_enforce_min_pace && + (slot == 0)) { + /* We are enforcing a minimum pace time of 1ms */ + slot = rack->r_enforce_min_pace; + } + if (slot) + counter_u64_add(rack_calc_nonzero, 1); + else + counter_u64_add(rack_calc_zero, 1); + return (slot); +} + static int rack_output(struct tcpcb *tp) { @@ -6866,6 +7061,7 @@ struct mbuf *mb; uint32_t if_hw_tsomaxsegcount = 0; uint32_t if_hw_tsomaxsegsize; + int32_t maxseg; long tot_len_this_send = 0; struct ip *ip = NULL; #ifdef TCPDEBUG @@ -6891,10 +7087,13 @@ int32_t tso, mtu, would_have_fin = 0; struct tcpopt to; int32_t slot = 0; + int32_t sup_rack = 0; uint32_t cts; - uint8_t hpts_calling, doing_tlp = 0; + uint8_t hpts_calling, new_data_tlp = 0, doing_tlp = 0; int32_t do_a_prefetch; int32_t prefetch_rsm = 0; + int force_tso = 0; + int32_t hw_tls, orig_len; int32_t prefetch_so_done = 0; struct tcp_log_buffer *lgb = NULL; struct inpcb *inp; @@ -6903,11 +7102,19 @@ struct ip6_hdr *ip6 = NULL; int32_t isipv6; #endif + uint8_t filled_all = 0; + /* setup and take the cache hits here */ rack = (struct tcp_rack *)tp->t_fb_ptr; inp = rack->rc_inp; so = inp->inp_socket; sb = &so->so_snd; +#ifdef KERN_TLS + if (sb->sb_tls_flags & SB_TLS_IFNET) + hw_tls = 1; + else +#endif + hw_tls = 0; kern_prefetch(sb, &do_a_prefetch); do_a_prefetch = 1; @@ -6916,6 +7123,16 @@ if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif + maxseg = ctf_fixed_maxseg(tp); + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. + */ + if (IS_FASTOPEN(tp->t_flags) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */ + (rack->r_ctl.rc_resend == NULL)) /* not a retransmit */ + return (0); #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ @@ -6975,18 +7192,9 @@ * further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } else -#endif if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) - rack_cc_after_idle(tp, - (rack->r_idle_reduce_largest ? 1 :0)); + rack_cc_after_idle(tp); } tp->t_flags &= ~TF_LASTIDLE; if (idle) { @@ -7047,12 +7255,20 @@ long tlen; doing_tlp = 1; - rsm = rack->r_ctl.rc_tlpsend; + /* + * Check if we can do a TLP with a RACK'd packet + * this can happen if we are not doing the rack + * cheat and we skipped to a TLP and it + * went off. 
+ */ + rsm = tcp_rack_output(tp, rack, cts); + if (rsm == NULL) + rsm = rack->r_ctl.rc_tlpsend; rack->r_ctl.rc_tlpsend = NULL; sack_rxmit = 1; tlen = rsm->r_end - rsm->r_start; - if (tlen > tp->t_maxseg) - tlen = tp->t_maxseg; + if (tlen > ctf_fixed_maxseg(tp)) + tlen = ctf_fixed_maxseg(tp); KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, @@ -7072,13 +7288,14 @@ __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (len >= tp->t_maxseg) { - len = tp->t_maxseg; + if (len >= ctf_fixed_maxseg(tp)) { + len = ctf_fixed_maxseg(tp); } } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { - long tlen; + int maxseg; + maxseg = ctf_fixed_maxseg(tp); if ((!IN_RECOVERY(tp->t_flags)) && ((tp->t_flags & (TF_WASFRECOVERY | TF_WASCRECOVERY)) == 0)) { /* Enter recovery if not induced by a time-out */ @@ -7090,7 +7307,8 @@ * When we enter recovery we need to assure we send * one packet. */ - rack->r_ctl.rc_prr_sndcnt = tp->t_maxseg; + rack->r_ctl.rc_prr_sndcnt = ctf_fixed_maxseg(tp); + rack_log_to_prr(rack, 13); } #ifdef INVARIANTS if (SEQ_LT(rsm->r_start, tp->snd_una)) { @@ -7098,38 +7316,40 @@ tp, rack, rsm, rsm->r_start, tp->snd_una); } #endif - tlen = rsm->r_end - rsm->r_start; + len = rsm->r_end - rsm->r_start; KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", __func__, __LINE__, rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; - if (tlen > rack->r_ctl.rc_prr_sndcnt) { - len = rack->r_ctl.rc_prr_sndcnt; - } else { - len = tlen; - } - if (len >= tp->t_maxseg) { - sendalot = 1; - len = tp->t_maxseg; - } else { - sendalot = 0; - if ((rack->rc_timer_up == 0) && - (len < tlen)) { + /* Can we send it within the PRR boundary? */ + if ((rack->use_rack_cheat == 0) && (len > rack->r_ctl.rc_prr_sndcnt)) { + /* It does not fit */ + if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) > len) && + (rack->r_ctl.rc_prr_sndcnt < maxseg)) { /* - * If its not a timer don't send a partial - * segment. + * prr is less than a segment, we + * have more acks due in besides + * what we need to resend. Lets not send + * to avoid sending small pieces of + * what we need to retransmit. 
*/ + sup_rack = 1; len = 0; goto just_return_nolock; } + len = rack->r_ctl.rc_prr_sndcnt; } + sendalot = 0; + if (len >= maxseg) { + len = maxseg; + } if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; TCPSTAT_INC(tcps_sack_rexmits); TCPSTAT_ADD(tcps_sack_rexmit_bytes, - min(len, tp->t_maxseg)); + min(len, ctf_fixed_maxseg(tp))); counter_u64_add(rack_rtm_prr_retran, 1); } } @@ -7189,7 +7409,9 @@ flags &= ~TH_FIN; sendwin = 1; } else { - if (rack->rc_in_persist) + if ((rack->rc_in_persist != 0) && + (tp->snd_wnd >= min((rack->r_ctl.rc_high_rwnd/2), + rack->r_ctl.rc_pace_min_segs))) rack_exit_persist(tp, rack); /* * If we are dropping persist mode then we need to @@ -7233,7 +7455,7 @@ else len = rack->r_ctl.rc_tlp_new_data; rack->r_ctl.rc_tlp_new_data = 0; - doing_tlp = 1; + new_data_tlp = doing_tlp = 1; } else { if (sendwin > avail) { /* use the available */ @@ -7274,7 +7496,7 @@ counter_u64_add(rack_rtm_prr_newdata, 1); } } - if (len > tp->t_maxseg) { + if (len > ctf_fixed_maxseg(tp)) { /* * We should never send more than a MSS when * retransmitting or sending new data in prr @@ -7283,8 +7505,8 @@ * let us send a lot as well :-) */ if (rack->r_ctl.rc_prr_sendalot == 0) - len = tp->t_maxseg; - } else if (len < tp->t_maxseg) { + len = ctf_fixed_maxseg(tp); + } else if (len < ctf_fixed_maxseg(tp)) { /* * Do we send any? The idea here is if the * send empty's the socket buffer we want to @@ -7355,6 +7577,7 @@ /* Without fast-open there should never be data sent on a SYN */ if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) len = 0; + orig_len = len; if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been @@ -7374,10 +7597,68 @@ len = 0; if ((tp->snd_wnd == 0) && (TCPS_HAVEESTABLISHED(tp->t_state)) && + (tp->snd_una == tp->snd_max) && (sb_offset < (int)sbavail(sb))) { tp->snd_nxt = tp->snd_una; rack_enter_persist(tp, rack, cts); } + } else if ((rsm == NULL) && + ((doing_tlp == 0) || (new_data_tlp == 1)) && + (len < rack->r_ctl.rc_pace_max_segs)) { + /* + * We are not sending a full segment for + * some reason. Should we not send anything (think + * sws or persists)? + */ + if ((tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + (TCPS_HAVEESTABLISHED(tp->t_state)) && + (len < (int)(sbavail(sb) - sb_offset))) { + /* + * Here the rwnd is less than + * the pacing size, this is not a retransmit, + * we are established and + * the send is not the last in the socket buffer + * we send nothing, and may enter persists. + */ + len = 0; + if (tp->snd_max == tp->snd_una) { + /* + * Nothing out we can + * go into persists. + */ + rack_enter_persist(tp, rack, cts); + tp->snd_nxt = tp->snd_una; + } + } else if ((tp->snd_cwnd >= max(rack->r_ctl.rc_pace_min_segs, (maxseg * 4))) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (len < rack->r_ctl.rc_pace_min_segs)) { + /* + * Here we are not retransmitting, and + * the cwnd is not so small that we could + * not send at least a min size (rxt timer + * not having gone off), We have 2 segments or + * more already in flight, its not the tail end + * of the socket buffer and the cwnd is blocking + * us from sending out a minimum pacing segment size. + * Lets not send anything. 
+ */ + len = 0; + } else if (((tp->snd_wnd - ctf_outstanding(tp)) < + min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs)) && + (ctf_flight_size(tp, rack->r_ctl.rc_sacked) > (2 * maxseg)) && + (len < (int)(sbavail(sb) - sb_offset)) && + (TCPS_HAVEESTABLISHED(tp->t_state))) { + /* + * Here we have a send window but we have + * filled it up and we can't send another pacing segment. + * We also have in flight more than 2 segments + * and we are not completing the sb i.e. we allow + * the last bytes of the sb to go out even if + * its not a full pacing segment. + */ + len = 0; + } } /* len will be >= 0 after this point. */ KASSERT(len >= 0, ("[%s:%d]: len < 0", __func__, __LINE__)); @@ -7430,7 +7711,7 @@ #if defined(IPSEC) || defined(IPSEC_SUPPORT) ipoptlen += ipsec_optlen; #endif - if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && + if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > ctf_fixed_maxseg(tp) && (tp->t_port == 0) && ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && @@ -7478,7 +7759,7 @@ * limited the window size) - we need to retransmit */ if (len) { - if (len >= tp->t_maxseg) { + if (len >= ctf_fixed_maxseg(tp)) { pass = 1; goto send; } @@ -7568,10 +7849,10 @@ if (oldwin >> tp->rcv_scale == (adv + oldwin) >> tp->rcv_scale) goto dontupdate; - if (adv >= (int32_t)(2 * tp->t_maxseg) && + if (adv >= (int32_t)(2 * ctf_fixed_maxseg(tp)) && (adv >= (int32_t)(so->so_rcv.sb_hiwat / 4) || recwin <= (int32_t)(so->so_rcv.sb_hiwat / 8) || - so->so_rcv.sb_hiwat <= 8 * tp->t_maxseg)) { + so->so_rcv.sb_hiwat <= 8 * ctf_fixed_maxseg(tp))) { pass = 7; goto send; } @@ -7613,7 +7894,23 @@ just_return_nolock: if (tot_len_this_send == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_JUSTRET], 1); - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + if (slot) { + /* set the rack tcb into the slot N */ + counter_u64_add(rack_paced_segments, 1); + } else if (tot_len_this_send) { + counter_u64_add(rack_unpaced_segments, 1); + } + /* Check if we need to go into persists or not */ + if ((rack->rc_in_persist == 0) && + (tp->snd_max == tp->snd_una) && + TCPS_HAVEESTABLISHED(tp->t_state) && + sbavail(&tp->t_inpcb->inp_socket->so_snd) && + (sbavail(&tp->t_inpcb->inp_socket->so_snd) > tp->snd_wnd) && + (tp->snd_wnd < min((rack->r_ctl.rc_high_rwnd/2), rack->r_ctl.rc_pace_min_segs))) { + /* Yes lets make sure to move to persist before timer-start */ + rack_enter_persist(tp, rack, rack->r_ctl.rc_rcvtime); + } + rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, sup_rack); rack_log_type_just_return(rack, cts, tot_len_this_send, slot, hpts_calling); tp->t_flags &= ~TF_FORCEDATA; return (0); @@ -7631,7 +7928,7 @@ } SOCKBUF_LOCK_ASSERT(sb); if (len > 0) { - if (len >= tp->t_maxseg) + if (len >= ctf_fixed_maxseg(tp)) tp->t_flags2 |= TF2_PLPMTU_MAXSEGSNT; else tp->t_flags2 &= ~TF2_PLPMTU_MAXSEGSNT; @@ -7757,6 +8054,12 @@ ipoptlen += ipsec_optlen; #endif +#ifdef KERN_TLS + /* force TSO for so TLS offload can get mss */ + if (sb->sb_tls_flags & SB_TLS_IFNET) { + force_tso = 1; + } +#endif /* * Adjust data length if insertion of options will bump the packet * length beyond the t_maxseg length. 
Clear the FIN bit because we @@ -7799,18 +8102,19 @@ * unless the send sockbuf can be emptied: */ max_len = (tp->t_maxseg - optlen); - if ((sb_offset + len) < sbavail(sb)) { + if (((sb_offset + len) < sbavail(sb)) && + (hw_tls == 0)) { moff = len % (u_int)max_len; if (moff != 0) { len -= moff; sendalot = 1; } - } - /* + } + /* * In case there are too many small fragments don't * use TSO: */ - if (len <= max_len) { + if (len <= maxseg) { len = max_len; sendalot = 1; tso = 0; @@ -7865,7 +8169,7 @@ uint32_t moff; if (rack->rc_pace_max_segs) - max_val = rack->rc_pace_max_segs * tp->t_maxseg; + max_val = rack->rc_pace_max_segs * ctf_fixed_maxseg(tp); else max_val = len; /* @@ -7895,7 +8199,7 @@ * sb_offset in the socket buffer chain. */ mb = sbsndptr_noadv(sb, sb_offset, &moff); - if (len <= MHLEN - hdrlen - max_linkhdr) { + if (len <= MHLEN - hdrlen - max_linkhdr && !hw_tls) { m_copydata(mb, moff, (int)len, mtod(m, caddr_t)+hdrlen); if (SEQ_LT(tp->snd_nxt, tp->snd_max)) @@ -7908,8 +8212,8 @@ msb = NULL; else msb = sb; - m->m_next = tcp_m_copym(mb, moff, &len, - if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); + m->m_next = tcp_m_copym(/*tp,*/ mb, moff, &len, + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, hw_tls, &filled_all*/); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -7973,7 +8277,7 @@ flags |= TH_PUSH; /* - * Are we doing hptsi, if so we must calculate the slot. We + * Are we doing pacing, if so we must calculate the slot. We * only do hptsi in ESTABLISHED and with no RESET being * sent where we have data to send. */ @@ -7982,56 +8286,10 @@ ((tp->t_state == TCPS_FIN_WAIT_1) && ((tp->t_flags & TF_SENTFIN) == 0) && ((flags & TH_FIN) == 0))) && - ((flags & TH_RST) == 0) && - (rack->rc_always_pace)) { - /* - * We use the most optimistic possible cwnd/srtt for - * sending calculations. This will make our - * calculation anticipate getting more through - * quicker then possible. But thats ok we don't want - * the peer to have a gap in data sending. - */ - uint32_t srtt, cwnd, tr_perms = 0; - - if (rack->r_ctl.rc_rack_min_rtt) - srtt = rack->r_ctl.rc_rack_min_rtt; - else - srtt = TICKS_2_MSEC((tp->t_srtt >> TCP_RTT_SHIFT)); - if (rack->r_ctl.rc_rack_largest_cwnd) - cwnd = rack->r_ctl.rc_rack_largest_cwnd; - else - cwnd = tp->snd_cwnd; - tr_perms = cwnd / srtt; - if (tr_perms == 0) { - tr_perms = tp->t_maxseg; - } + ((flags & TH_RST) == 0)) { + /* Get our pacing rate */ tot_len_this_send += len; - /* - * Calculate how long this will take to drain, if - * the calculation comes out to zero, thats ok we - * will use send_a_lot to possibly spin around for - * more increasing tot_len_this_send to the point - * that its going to require a pace, or we hit the - * cwnd. Which in that case we are just waiting for - * a ACK. - */ - slot = tot_len_this_send / tr_perms; - /* Now do we reduce the time so we don't run dry? 
*/ - if (slot && rack->rc_pace_reduce) { - int32_t reduce; - - reduce = (slot / rack->rc_pace_reduce); - if (reduce < slot) { - slot -= reduce; - } else - slot = 0; - } - if (rack->r_enforce_min_pace && - (slot == 0) && - (tot_len_this_send >= (rack->r_min_pace_seg_thresh * tp->t_maxseg))) { - /* We are enforcing a minimum pace time of 1ms */ - slot = rack->r_enforce_min_pace; - } + slot = rack_get_pacing_delay(rack, tp, tot_len_this_send); } SOCKBUF_UNLOCK(sb); } else { @@ -8079,7 +8337,7 @@ } else #endif th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, ip6, th); + tcpip_fillheaders(inp, /*tp->t_port,*/ ip6, th); } else #endif /* INET6 */ { @@ -8098,7 +8356,7 @@ } else #endif th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, ip, th); + tcpip_fillheaders(inp, /*tp->t_port,*/ ip, th); } /* * Fill in fields, remembering maximum advertised window for use in @@ -8195,7 +8453,7 @@ recwin = 0; } else { if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)tp->t_maxseg) + recwin < (long)ctf_fixed_maxseg(tp)) recwin = 0; if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) @@ -8274,6 +8532,7 @@ m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); @@ -8294,6 +8553,7 @@ udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); @@ -8306,14 +8566,13 @@ ("%s: IP version incorrect: %d", __func__, ip->ip_v)); } #endif - /* * Enable TSO and specify the size of the segments. The TCP pseudo * header checksum is always provided. XXX: Fixme: This is currently * not the case for IPv6. */ - if (tso) { - KASSERT(len > tp->t_maxseg - optlen, + if (tso || force_tso) { + KASSERT(force_tso || len > tp->t_maxseg - optlen, ("%s: len <= tso_segsz", __func__)); m->m_pkthdr.csum_flags |= CSUM_TSO; m->m_pkthdr.tso_segsz = tp->t_maxseg - optlen; @@ -8332,7 +8591,6 @@ /* Run HHOOK_TCP_ESTABLISHED_OUT helper hooks. */ hhook_run_tcp_est_out(tp, th, &to, len, tso); #endif - #ifdef TCPDEBUG /* * Trace. @@ -8359,18 +8617,29 @@ /* We're getting ready to send; log now. 
*/ if (tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + struct timeval tv; memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; log.u_bbr.flex1 = rack->r_ctl.rc_prr_sndcnt; + log.u_bbr.flex2 = rack->r_ctl.rc_pace_min_segs; + log.u_bbr.flex3 = rack->r_ctl.rc_pace_max_segs; + log.u_bbr.flex4 = orig_len; + if (filled_all) + log.u_bbr.flex5 = 0x80000000; + else + log.u_bbr.flex5 = 0; if (rsm || sack_rxmit) { log.u_bbr.flex8 = 1; } else { log.u_bbr.flex8 = 0; } + log.u_bbr.pkts_out = tp->t_maxseg; + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.inflight = ctf_flight_size(rack->rc_tp, rack->r_ctl.rc_sacked); lgb = tcp_log_event_(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_OUT, ERRNO_UNK, - len, &log, false, NULL, NULL, 0, NULL); + len, &log, false, NULL, NULL, 0, &tv); } else lgb = NULL; @@ -8478,12 +8747,38 @@ } else if (len > 1) { int idx; - idx = (len / tp->t_maxseg) + 3; + idx = (len / ctf_fixed_maxseg(tp)) + 3; if (idx >= TCP_MSS_ACCT_ATIMER) counter_u64_add(rack_out_size[(TCP_MSS_ACCT_ATIMER-1)], 1); else counter_u64_add(rack_out_size[idx], 1); } + if (hw_tls && len > 0) { + if (filled_all) { + counter_u64_add(rack_tls_filled, 1); + rack_log_type_hrdwtso(tp, rack, len, 0, orig_len, 1); + } else { + if (rsm) { + counter_u64_add(rack_tls_rxt, 1); + rack_log_type_hrdwtso(tp, rack, len, 2, orig_len, 1); + } else if (doing_tlp) { + counter_u64_add(rack_tls_tlp, 1); + rack_log_type_hrdwtso(tp, rack, len, 3, orig_len, 1); + } else if ( (ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > sbavail(sb)) { + counter_u64_add(rack_tls_app, 1); + rack_log_type_hrdwtso(tp, rack, len, 4, orig_len, 1); + } else if ((ctf_flight_size(tp, rack->r_ctl.rc_sacked) + rack->r_ctl.rc_pace_min_segs) > tp->snd_cwnd) { + counter_u64_add(rack_tls_cwnd, 1); + rack_log_type_hrdwtso(tp, rack, len, 5, orig_len, 1); + } else if ((ctf_outstanding(tp) + rack->r_ctl.rc_pace_min_segs) > tp->snd_wnd) { + counter_u64_add(rack_tls_rwnd, 1); + rack_log_type_hrdwtso(tp, rack, len, 6, orig_len, 1); + } else { + rack_log_type_hrdwtso(tp, rack, len, 7, orig_len, 1); + counter_u64_add(rack_tls_other, 1); + } + } + } } if (sub_from_prr && (error == 0)) { rack->r_ctl.rc_prr_sndcnt -= len; @@ -8491,6 +8786,10 @@ sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); + if ((error == 0) && + (len > 0) && + (tp->snd_una == tp->snd_max)) + rack->r_ctl.rc_tlp_rxt_last_time = cts; if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { tcp_seq startseq = tp->snd_nxt; @@ -8500,7 +8799,7 @@ */ if (error) /* We don't log or do anything with errors */ - goto timer; + goto nomore; if (flags & (TH_SYN | TH_FIN)) { if (flags & TH_SYN) @@ -8512,7 +8811,7 @@ } /* In the ENOBUFS case we do *not* update snd_max */ if (sack_rxmit) - goto timer; + goto nomore; tp->snd_nxt += len; if (SEQ_GT(tp->snd_nxt, tp->snd_max)) { @@ -8546,26 +8845,6 @@ } #endif } - /* - * Set retransmit timer if not currently set, and not doing - * a pure ack or a keep-alive probe. Initial value for - * retransmit timer is smoothed round-trip time + 2 * - * round-trip time variance. Initialize shift counter which - * is used for backoff of retransmit time. - */ -timer: - if ((tp->snd_wnd == 0) && - TCPS_HAVEESTABLISHED(tp->t_state)) { - /* - * If the persists timer was set above (right before - * the goto send), and still needs to be on. Lets - * make sure all is canceled. 
If the persist timer - * is not running, we want to get it up. - */ - if (rack->rc_in_persist == 0) { - rack_enter_persist(tp, rack, cts); - } - } } else { /* * Persist case, update snd_max but since we are in persist @@ -8647,7 +8926,7 @@ goto again; } slot = 10; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); tp->t_flags &= ~TF_FORCEDATA; return (error); case ENETUNREACH: @@ -8661,7 +8940,7 @@ /* FALLTHROUGH */ default: slot = 10; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, 0, 1); + rack_start_hpts_timer(rack, tp, cts, slot, 0, 0); tp->t_flags &= ~TF_FORCEDATA; return (error); } @@ -8703,7 +8982,7 @@ counter_u64_add(rack_unpaced_segments, 1); } tp->t_flags &= ~TF_FORCEDATA; - rack_start_hpts_timer(rack, tp, cts, __LINE__, slot, tot_len_this_send, 1); + rack_start_hpts_timer(rack, tp, cts, slot, tot_len_this_send, 0); return (error); } @@ -8739,8 +9018,9 @@ case TCP_RACK_TLP_INC_VAR: case TCP_RACK_IDLE_REDUCE_HIGH: case TCP_RACK_MIN_PACE: - case TCP_RACK_MIN_PACE_SEG: + case TCP_RACK_GP_INCREASE: case TCP_BBR_RACK_RTT_USE: + case TCP_BBR_USE_RACK_CHEAT: case TCP_DATA_AFTER_CLOSE: break; default: @@ -8811,6 +9091,7 @@ /* Max segments in a pace */ RACK_OPTS_INC(tcp_rack_max_seg); rack->rc_pace_max_segs = optval; + rack_set_pace_segments(tp, rack); break; case TCP_RACK_PRR_SENDALOT: /* Allow PRR to send more than one seg */ @@ -8848,6 +9129,13 @@ else error = EINVAL; break; + case TCP_BBR_USE_RACK_CHEAT: + RACK_OPTS_INC(tcp_rack_cheat); + if (optval) + rack->use_rack_cheat = 1; + else + rack->use_rack_cheat = 0; + break; case TCP_RACK_PKT_DELAY: /* RACK added ms i.e. rack-rtt + reord + N */ RACK_OPTS_INC(tcp_rack_pkt_delay); @@ -8855,15 +9143,10 @@ break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ - RACK_OPTS_INC(tcp_rack_tlp_inc_var); - rack->r_ctl.rc_prr_inc_var = optval; + return (EINVAL); break; case TCP_RACK_IDLE_REDUCE_HIGH: - RACK_OPTS_INC(tcp_rack_idle_reduce_high); - if (optval) - rack->r_idle_reduce_largest = 1; - else - rack->r_idle_reduce_largest = 0; + return (EINVAL); break; case TCP_DELACK: if (optval == 0) @@ -8883,12 +9166,13 @@ else rack->r_enforce_min_pace = optval; break; - case TCP_RACK_MIN_PACE_SEG: - RACK_OPTS_INC(tcp_rack_min_pace_seg); - if (optval >= 16) - rack->r_min_pace_seg_thresh = 15; + case TCP_RACK_GP_INCREASE: + if ((optval >= 0) && + (optval <= 256)) + rack->rack_per_of_gp = optval; else - rack->r_min_pace_seg_thresh = optval; + error = EINVAL; + break; case TCP_BBR_RACK_RTT_USE: if ((optval != USE_RTT_HIGH) && @@ -8975,6 +9259,10 @@ /* Does reordering fade after ms time */ optval = rack->r_ctl.rc_reorder_fade; break; + case TCP_BBR_USE_RACK_CHEAT: + /* Do we use the rack cheat for rxt */ + optval = rack->use_rack_cheat; + break; case TCP_RACK_TLP_THRESH: /* RACK TLP theshold i.e. 
srtt+(srtt/N) */ optval = rack->r_ctl.rc_tlp_threshold; @@ -8988,16 +9276,16 @@ break; case TCP_RACK_TLP_INC_VAR: /* Does TLP include rtt variance in t-o */ - optval = rack->r_ctl.rc_prr_inc_var; + return (EINVAL); break; case TCP_RACK_IDLE_REDUCE_HIGH: - optval = rack->r_idle_reduce_largest; + return (EINVAL); break; case TCP_RACK_MIN_PACE: optval = rack->r_enforce_min_pace; break; - case TCP_RACK_MIN_PACE_SEG: - optval = rack->r_min_pace_seg_thresh; + case TCP_RACK_GP_INCREASE: + optval = rack->rack_per_of_gp; break; case TCP_BBR_RACK_RTT_USE: optval = rack->r_ctl.rc_rate_sample_method; @@ -9042,8 +9330,9 @@ struct tcp_function_block __tcp_rack = { .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, + .tfb_do_queued_segments = ctf_do_queued_segments, + .tfb_do_segment_nounlock = rack_do_segment_nounlock, .tfb_tcp_do_segment = rack_do_segment, - .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, Index: netinet/tcp_stacks/rack_bbr_common.h =================================================================== --- netinet/tcp_stacks/rack_bbr_common.h +++ netinet/tcp_stacks/rack_bbr_common.h @@ -39,16 +39,6 @@ #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) -/* Magic flags to tell whats cooking on the pacing wheel */ -#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ -#define PACE_TMR_RACK 0x02 /* RACK timer running */ -#define PACE_TMR_TLP 0x04 /* TLP timer running */ -#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ -#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ -#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ -#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ -#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) - /* Magic flags for tracing progress events */ #define PROGRESS_DROP 1 #define PROGRESS_UPDATE 2 @@ -61,8 +51,60 @@ #define USE_RTT_LOW 1 #define USE_RTT_AVG 2 +#define PACE_MAX_IP_BYTES 65536 +#define USECS_IN_SECOND 1000000 +#define MSEC_IN_SECOND 1000 +#define MS_IN_USEC 1000 + +#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */ + #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); +#ifdef KERN_TLS +uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd); #endif +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, + struct mbuf *m, int has_pkt); +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt); +uint32_t ctf_outstanding(struct tcpcb *tp); +uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, + int32_t * drop_hdrlen, int32_t * ret_val); +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); +void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t rstreason, int32_t tlen); +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp); + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp); + +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ret_val); + +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); + +void 
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp); + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen); + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp); + #endif +#endif Index: netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- netinet/tcp_stacks/rack_bbr_common.c +++ netinet/tcp_stacks/rack_bbr_common.c @@ -0,0 +1,788 @@ +/*- + * Copyright (c) 2016-2018 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Author: Randall Stewart + * This work is based on the ACM Queue paper + * BBR - Congestion Based Congestion Control + * and also numerous discussions with Neal, Yuchung and Van. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +/*#include "opt_kern_tls.h"*/ +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif +#include + +#include +#include +#include + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif +#include "rack_bbr_common.h" + +/* + * Common TCP Functions - These are shared by borth + * rack and BBR. 
+ */ + + +#ifdef KERN_TLS +uint32_t +ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) +{ + struct sbtls_info *tls; + uint32_t len; + +again: + tls = so->so_snd.sb_tls_info; + len = tls->sb_params.sb_maxlen; /* max tls payload */ + len += tls->sb_params.sb_tls_hlen; /* tls header len */ + len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ + if ((len * 4) > rwnd) { + /* + * Stroke this will suck counter and what + * else should we do Drew? From the + * TCP perspective I am not sure + * what should be done... + */ + if (tls->sb_params.sb_maxlen > 4096) { + tls->sb_params.sb_maxlen -= 4096; + if (tls->sb_params.sb_maxlen < 4096) + tls->sb_params.sb_maxlen = 4096; + goto again; + } + } + return (len); +} +#endif + +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt) +{ + /* + * We are passed a raw chain of mbuf packets + * that arrived in LRO. They are linked via + * the m_nextpkt link in the pkt-headers. + * + * We process each one by: + * a) saving off the next + * b) stripping off the ether-header + * c) formulating the arguments for + * the tfb_do_segment_nounlock + * d) calling each mbuf to tfb_do_segment_nounlock + * after adjusting the time to match the arrival time. + * Note that the LRO code assures no IP options are present. + * + * The semantics for calling tfb_do_segment_nounlock are the + * following: + * 1) It returns 0 if all went well and you (the caller) need + * to release the lock. + * 2) If nxt_pkt is set, then the function will suppress calls + * to tfb_tcp_output() since you are promising to call again + * with another packet. + * 3) If it returns 1, then you must free all the packets being + * shipped in, the tcb has been destroyed (or about to be destroyed). + */ + struct mbuf *m_save; + struct ether_header *eh; + struct epoch_tracker et; + struct tcphdr *th; +#ifdef INET6 + struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */ +#endif +#ifdef INET + struct ip *ip = NULL; /* Keep compiler happy. */ +#endif + struct ifnet *ifp; + struct timeval tv; + int32_t retval, nxt_pkt, tlen, off; + uint16_t etype; + uint16_t drop_hdrlen; + uint8_t iptos, no_vn=0, bpf_req=0; + + /* + * This is a bit deceptive, we get the + * "info epoch" which is really the network + * epoch. This covers us on both any INP + * type change but also if the ifp goes + * away it covers us as well. + */ + INP_INFO_RLOCK_ET(&V_tcbinfo, et); + if (m && m->m_pkthdr.rcvif) + ifp = m->m_pkthdr.rcvif; + else + ifp = NULL; + if (ifp) { + bpf_req = bpf_peers_present(ifp->if_bpf); + } else { + /* + * We probably should not work around + * but kassert, since lro always sets rcvif.
+ */ + no_vn = 1; + goto skip_vnet; + } + CURVNET_SET(ifp->if_vnet); +skip_vnet: + while (m) { + m_save = m->m_nextpkt; + m->m_nextpkt = NULL; + /* Now lets get the ether header */ + eh = mtod(m, struct ether_header *); + etype = ntohs(eh->ether_type); + /* Let the BPF see the packet */ + if (bpf_req && ifp) + ETHER_BPF_MTAP(ifp, m); + m_adj(m, sizeof(*eh)); + /* Trim off the ethernet header */ + switch (etype) { +#ifdef INET6 + case ETHERTYPE_IPV6: + { + if (m->m_len < (sizeof(*ip6) + sizeof(*th))) { + m = m_pullup(m, sizeof(*ip6) + sizeof(*th)); + if (m == NULL) { + TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + goto skipped_pkt; + } + } + ip6 = (struct ip6_hdr *)(eh + 1); + th = (struct tcphdr *)(ip6 + 1); + tlen = ntohs(ip6->ip6_plen); + drop_hdrlen = sizeof(*ip6); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in6_cksum_pseudo(ip6, tlen, + IPPROTO_TCP, m->m_pkthdr.csum_data); + th->th_sum ^= 0xffff; + } else + th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen); + if (th->th_sum) { + TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + goto skipped_pkt; + } + /* + * Be proactive about unspecified IPv6 address in source. + * As we use all-zero to indicate unbounded/unconnected pcb, + * unspecified IPv6 address can be used to confuse us. + * + * Note that packets with unspecified IPv6 destination is + * already dropped in ip6_input. + */ + if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) { + /* XXX stat */ + m_freem(m); + goto skipped_pkt; + } + iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff; + break; + } +#endif +#ifdef INET + case ETHERTYPE_IP: + { + if (m->m_len < sizeof (struct tcpiphdr)) { + if ((m = m_pullup(m, sizeof (struct tcpiphdr))) + == NULL) { + TCPSTAT_INC(tcps_rcvshort); + m_freem(m); + goto skipped_pkt; + } + } + ip = (struct ip *)(eh + 1); + th = (struct tcphdr *)(ip + 1); + drop_hdrlen = sizeof(*ip); + iptos = ip->ip_tos; + tlen = ntohs(ip->ip_len) - sizeof(struct ip); + if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) { + if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR) + th->th_sum = m->m_pkthdr.csum_data; + else + th->th_sum = in_pseudo(ip->ip_src.s_addr, + ip->ip_dst.s_addr, + htonl(m->m_pkthdr.csum_data + tlen + + IPPROTO_TCP)); + th->th_sum ^= 0xffff; + } else { + int len; + struct ipovly *ipov = (struct ipovly *)ip; + /* + * Checksum extended TCP header and data. + */ + len = drop_hdrlen + tlen; + bzero(ipov->ih_x1, sizeof(ipov->ih_x1)); + ipov->ih_len = htons(tlen); + th->th_sum = in_cksum(m, len); + /* Reset length for SDT probes. */ + ip->ip_len = htons(len); + /* Reset TOS bits */ + ip->ip_tos = iptos; + /* Re-initialization for later version check */ + ip->ip_v = IPVERSION; + ip->ip_hl = sizeof(*ip) >> 2; + } + if (th->th_sum) { + TCPSTAT_INC(tcps_rcvbadsum); + m_freem(m); + goto skipped_pkt; + } + break; + } +#endif + } + /* + * Convert TCP protocol specific fields to host format. + */ + tcp_fields_to_host(th); + + off = th->th_off << 2; + if (off < sizeof (struct tcphdr) || off > tlen) { + TCPSTAT_INC(tcps_rcvbadoff); + m_freem(m); + goto skipped_pkt; + } + tlen -= off; + drop_hdrlen += off; + /* + * Now lets setup the timeval to be when we should + * have been called (if we can). + */ + m->m_pkthdr.lro_nsegs = 1; + if (m->m_flags & M_TSTMP_LRO) { + tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000; + tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000; + } else { + /* Should not be should we kassert instead? 
*/ + tcp_get_usecs(&tv); + } + /* Now what about next packet? */ + if (m_save || has_pkt) + nxt_pkt = 1; + else + nxt_pkt = 0; + retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen, + iptos, nxt_pkt, &tv); + if (retval) { + /* We lost the lock and tcb probably */ + m = m_save; + while(m) { + m_save = m->m_nextpkt; + m->m_nextpkt = NULL; + m_freem(m); + m = m_save; + } + if (no_vn == 0) + CURVNET_RESTORE(); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + return(retval); + } +skipped_pkt: + m = m_save; + } + if (no_vn == 0) + CURVNET_RESTORE(); + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + return(retval); +} + +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt) +{ + struct mbuf *m; + + /* First lets see if we have old packets */ + if (tp->t_in_pkt) { + m = tp->t_in_pkt; + tp->t_in_pkt = NULL; + tp->t_tail_pkt = NULL; + if (ctf_process_inbound_raw(tp, so, m, have_pkt)) { + /* We lost the tcpcb (maybe a RST came in)? */ + return(1); + } + } + return (0); +} + +uint32_t +ctf_outstanding(struct tcpcb *tp) +{ + return(tp->snd_max - tp->snd_una); +} + +uint32_t +ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked) +{ + if (rc_sacked <= ctf_outstanding(tp)) + return(ctf_outstanding(tp) - rc_sacked); + else { + /* TSNH */ +#ifdef INVARIANTS + panic("tp:%p rc_sacked:%d > out:%d", + tp, rc_sacked, ctf_outstanding(tp)); +#endif + return (0); + } +} + +void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) +{ + if (tp != NULL) { + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); + } else + tcp_dropwithreset(m, th, NULL, tlen, rstreason); +} + +/* + * ctf_drop_checks returns 1 for you should not proceed. It places + * in ret_val what should be returned 1/0 by the caller. The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +{ + int32_t todrop; + int32_t thflags; + int32_t tlen; + + thflags = *thf; + tlen = *tlenp; + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + *drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + /* + * If segment ends after window, drop trailing data (and PUSH and + * FIN); if nothing left, just ACK. 
+ */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If window is closed can only take segments at + * window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop segment and + * ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + return (1); + } + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); + tlen -= todrop; + thflags &= ~(TH_PUSH | TH_FIN); + } + *thf = thflags; + *tlenp = tlen; + return (0); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) +{ + /* + * Generate an ACK dropping incoming segment if it occupies sequence + * space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all paths to this + * code happen after packets containing RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the segment + * we received passes the SYN-RECEIVED ACK test. If it fails send a + * RST. This breaks the loop in the "LAND" DoS attack, and also + * prevents an ACK storm between two listening ports that have been + * sent forged SYN segments, each with the source address of the + * other. + */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max))) { + *ret_val = 1; + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return; + } else + *ret_val = 0; + tp->t_flags |= TF_ACKNOW; + if (m) + m_freem(m); +} + +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp) +{ + + /* + * Drop space held by incoming segment and return. + */ + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + if (m) + m_freem(m); +} + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) +{ + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in + * window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should test against + * last_ack_sent instead of rcv_nxt. Note 2: we handle special case + * of closed window, not covered by the RFC. + */ + int dropped = 0; + + if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + (tp->last_ack_sent == th->th_seq) || + (tp->rcv_nxt == th->th_seq) || + ((tp->last_ack_sent - 1) == th->th_seq)) { + TCPSTAT_INC(tcps_drops); + /* Drop the connection. 
*/ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + dropped = 1; + ctf_do_drop(m, tp); + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + } + } else { + m_freem(m); + } + return (dropped); +} + +/* + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) +{ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + *ret_val = 1; + ctf_do_drop(m, tp); + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; + *ret_val = 0; + ctf_do_drop(m, NULL); + } +} + +/* + * bbr_ts_check returns 1 for you should not proceed, the state + * machine should return. It places in ret_val what should + * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, + int32_t tlen, int32_t thflags, int32_t * ret_val) +{ + + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates ts_recent, + * the age will be reset later and ts_recent will get a + * valid value. If it does not, setting ts_recent to zero + * will at least satisfy the requirement that zero be placed + * in the timestamp echo reply when ts_recent isn't valid. + * The age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be dropped + * when ts_recent is old. + */ + tp->ts_recent = 0; + } else { + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + *ret_val = 0; + if (tlen) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + } else { + ctf_do_drop(m, NULL); + } + return (1); + } + return (0); +} + +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp) +{ + int32_t win; + + /* + * Calculate amount of space in receive window, and then do TCP + * input processing. Receive window is amount of space in rcv queue, + * but not less than advertised window. 
+ */ + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); +} + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) +{ + + if (tp->t_inpcb) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + } + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); +} + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp) +{ + int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We only consider fixed options that we would send every + * time I.e. SACK is not considered. + * + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) + if (TCPS_HAVEESTABLISHED(tp->t_state)) { + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + } else { + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); + return (tp->t_maxseg - optlen); +} Index: netinet/tcp_stacks/tcp_rack.h =================================================================== --- netinet/tcp_stacks/tcp_rack.h +++ netinet/tcp_stacks/tcp_rack.h @@ -28,15 +28,15 @@ #ifndef _NETINET_TCP_RACK_H_ #define _NETINET_TCP_RACK_H_ -#define RACK_ACKED 0x0001/* The remote endpoint acked this */ -#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ -#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ -#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ -#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ -#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ -#define RACK_HAS_FIN 0x0040/* segment is sent with fin */ -#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ - +#define RACK_ACKED 0x0001/* The remote endpoint acked this */ +#define RACK_TO_MIXED 0x0002/* A timeout occured that mixed the send order */ +#define RACK_DEFERRED 0x0004/* We can't use this for RTT calc */ +#define RACK_OVERMAX 0x0008/* We have more retran's then we can fit */ +#define RACK_SACK_PASSED 0x0010/* A sack was done above this block */ +#define RACK_WAS_SACKPASS 0x0020/* We retransmitted due to SACK pass */ +#define RACK_HAS_FIN 0x0040/* segment is sent with fin */ +#define RACK_TLP 0x0080/* segment sent as tail-loss-probe */ +#define RACK_RWND_COLLAPSED 0x0100/* The peer collapsed the rwnd on the segment */ #define RACK_NUM_OF_RETRANS 3 #define RACK_INITIAL_RTO 1000 /* 1 second in milli seconds */ @@ -50,11 +50,11 @@ uint32_t r_rtr_bytes; /* How many bytes have been retransmitted */ uint16_t r_rtr_cnt; /* Retran count, index this -1 to get time * sent */ - uint8_t r_flags; /* Flags as defined above */ + uint16_t r_flags; /* Flags as defined above */ uint8_t r_sndcnt; /* Retran count, not limited by * RACK_NUM_OF_RETRANS */ uint8_t r_in_tmap; /* Flag to see if its in the r_tnext array */ - uint8_t r_resv[3]; + uint8_t r_resv[2]; }; TAILQ_HEAD(rack_head, 
rack_sendmap); @@ -134,6 +134,7 @@ uint64_t rack_no_timer_in_hpts; uint64_t tcp_rack_min_pace_seg; uint64_t tcp_rack_min_pace; + uint64_t tcp_rack_cheat; }; #define TLP_USE_ID 1 /* Internet draft behavior */ @@ -186,6 +187,7 @@ * b) Locked by the hpts-mutex * */ +#define RACK_GP_HIST 4 /* How much goodput history do we maintain? */ struct rack_control { /* Second cache line 0x40 from tcp_rack */ @@ -257,7 +259,12 @@ struct sack_filter rack_sf; /* Cache line split 0x140 */ /* Flags for various things */ + uint32_t rc_pace_max_segs; + uint32_t rc_pace_min_segs; + uint32_t rc_high_rwnd; struct rack_rtt_sample rack_rs; + uint32_t rc_tlp_rxt_last_time; + uint32_t rc_gp_history[RACK_GP_HIST]; uint32_t rc_tlp_threshold; /* Socket option value Lock(a) */ uint16_t rc_early_recovery_segs; /* Socket option value Lock(a) */ uint16_t rc_reorder_shift; /* Socket option value Lock(a) */ @@ -268,9 +275,11 @@ uint8_t rc_early_recovery; /* Socket option value Lock(a) */ uint8_t rc_prr_sendalot;/* Socket option value Lock(a) */ uint8_t rc_min_to; /* Socket option value Lock(a) */ - uint8_t rc_prr_inc_var; /* Socket option value Lock(a) */ uint8_t rc_tlp_rtx_out; /* This is TLPRtxOut in the draft */ uint8_t rc_rate_sample_method; + uint8_t rc_gp_hist_idx: 7, + rc_gp_hist_filled: 1; + }; #ifdef _KERNEL @@ -303,15 +312,17 @@ rc_last_pto_set : 1, /* XXX not used */ rc_tlp_in_progress : 1, rc_always_pace : 1, /* Socket option value Lock(a) */ - rc_timer_up : 1; /* The rack timer is up flag Lock(a) */ - uint8_t r_idle_reduce_largest : 1, - r_enforce_min_pace : 2, - r_min_pace_seg_thresh : 5; + tlp_timer_up : 1; /* The tlp timer is up flag Lock(a) */ + uint8_t r_enforce_min_pace : 2, + rc_has_collapsed : 1, + r_xxx_min_pace_seg_thresh : 5; uint8_t rack_tlp_threshold_use; uint8_t rc_allow_data_af_clo: 1, delayed_ack : 1, - rc_avail : 6; - uint8_t r_resv[2]; /* Fill to cache line boundary */ + set_pacing_done_a_iw : 1, + use_rack_cheat : 1, + rc_avail : 4; + uint16_t rack_per_of_gp; /* Cache line 2 0x40 */ struct rack_control r_ctl; } __aligned(CACHE_LINE_SIZE); Index: netinet/tcp_var.h =================================================================== --- netinet/tcp_var.h +++ netinet/tcp_var.h @@ -102,7 +102,8 @@ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ - bits_spare : 4; + t_fin_is_rst: 1, /* Are fin's treated as resets */ + bits_spare : 3; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -271,6 +272,11 @@ void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); + int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); + int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, Index: sys/mbuf.h =================================================================== --- sys/mbuf.h +++ sys/mbuf.h @@ -310,6 +310,7 @@ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ +#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */
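
The new rack_get_pacing_delay() added above derives a pacing slot from the last few goodput measurements. The following is a minimal userspace sketch of that arithmetic only, not the kernel code: it assumes the history entries are bytes-per-millisecond (which is what the MSEC_IN_SECOND conversion in the patch suggests), and all names (pacing_delay_ms, gp_hist, per_of_gp) are made up for illustration.

/*
 * Sketch of the goodput-based pacing-delay arithmetic in
 * rack_get_pacing_delay().  The real code operates on struct tcp_rack
 * fields and hands the slot to the hpts wheel.
 */
#include <stdint.h>
#include <stdio.h>

#define GP_HIST		4	/* mirrors RACK_GP_HIST from the patch */
#define MSEC_IN_SECOND	1000

static uint32_t
pacing_delay_ms(const uint64_t *gp_hist, int hist_filled, uint32_t per_of_gp,
    uint32_t len)
{
	uint64_t bw_est = 0, bw_raise, lentim;
	int cnt;

	for (cnt = 0; cnt < GP_HIST; cnt++) {
		if ((hist_filled == 0) && (gp_hist[cnt] == 0))
			break;
		bw_est += gp_hist[cnt];
	}
	if (bw_est == 0)
		return (0);	/* no estimate yet; the kernel falls back to cwnd/srtt */
	bw_est *= MSEC_IN_SECOND;			/* to bytes per second */
	bw_raise = bw_est * (uint64_t)per_of_gp;	/* configured raise, as in the patch */
	bw_est += bw_raise;
	bw_est /= cnt;					/* average of the entries summed */
	lentim = (uint64_t)len * MSEC_IN_SECOND;
	return ((uint32_t)(lentim / bw_est));		/* ms needed to pace out len */
}

int
main(void)
{
	/* four samples around 1250 bytes/ms (~10 Mbit/s), raise setting of 5 */
	uint64_t hist[GP_HIST] = { 1250, 1300, 1200, 1250 };

	printf("slot = %u ms\n", pacing_delay_ms(hist, 1, 5, 64000));
	return (0);
}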
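
Both rack_do_segment() and ctf_process_inbound_raw() above convert the LRO-supplied mbuf timestamp (nanoseconds in m_pkthdr.rcv_tstmp when M_TSTMP_LRO is set) into a struct timeval before calling tfb_do_segment_nounlock(). A standalone helper showing the same split; the function name is illustrative only.

#include <stdint.h>
#include <stdio.h>
#include <sys/time.h>

/* Split a nanosecond timestamp into seconds and microseconds. */
static void
ns_to_timeval(uint64_t rcv_tstmp, struct timeval *tv)
{
	tv->tv_sec = rcv_tstmp / 1000000000;		/* whole seconds */
	tv->tv_usec = (rcv_tstmp % 1000000000) / 1000;	/* remainder, in usec */
}

int
main(void)
{
	struct timeval tv;

	ns_to_timeval(1562000000123456789ULL, &tv);
	printf("%ld.%06ld\n", (long)tv.tv_sec, (long)tv.tv_usec);
	return (0);
}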
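
ctf_outstanding() and ctf_flight_size(), added in rack_bbr_common.c above, are used throughout the send-path checks (persist entry, PRR limits, pacing-segment gating). A self-contained sketch of the same accounting; the kernel version panics under INVARIANTS instead of clamping to zero.

#include <stdint.h>
#include <stdio.h>

typedef uint32_t tcp_seq;

/* Bytes sent but not yet cumulatively acked. */
static uint32_t
outstanding(tcp_seq snd_max, tcp_seq snd_una)
{
	return (snd_max - snd_una);
}

/* Outstanding bytes minus those already SACKed, guarded against underflow. */
static uint32_t
flight_size(tcp_seq snd_max, tcp_seq snd_una, uint32_t sacked)
{
	uint32_t out = outstanding(snd_max, snd_una);

	return (sacked <= out ? out - sacked : 0);
}

int
main(void)
{
	/* 60000 bytes outstanding, 20000 of them SACKed -> 40000 in flight */
	printf("%u bytes in flight\n", flight_size(160000, 100000, 20000));
	return (0);
}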
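
ctf_fixed_maxseg() above subtracts the fixed per-segment option overhead from t_maxseg so the pacing and PRR math works on a stable payload size. A worked example for an established connection that negotiated timestamps only; the 1460-byte starting point is just an example t_maxseg for a 1500-byte-MTU IPv4 path, and TCPOLEN_TSTAMP_APPA is restated locally rather than pulled from netinet/tcp.h.

#include <stdio.h>

#define PAD(len)		((((len) / 4) + !!((len) % 4)) * 4)
#define TCPOLEN_TSTAMP_APPA	12	/* timestamp option (10) padded to 12 */

int
main(void)
{
	int t_maxseg = 1460;
	int optlen = PAD(TCPOLEN_TSTAMP_APPA);

	/* 1460 - 12 = 1448 bytes of payload per full segment */
	printf("fixed maxseg = %d\n", t_maxseg - optlen);
	return (0);
}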
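
Finally, a hypothetical user-level sketch of the two socket options rack_set_sockopt() now accepts above: TCP_RACK_GP_INCREASE (validated to 0..256 in the patch) and TCP_BBR_USE_RACK_CHEAT (boolean). It assumes headers and a kernel carrying this patch, and a connection that is already using the rack stack (for example switched via TCP_FUNCTION_BLK); the values chosen are purely illustrative and error handling is minimal.

#include <sys/types.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <stdio.h>

static int
tune_rack(int s)
{
	int gp_increase = 25;	/* goodput-based pacing raise, valid range 0..256 */
	int use_cheat = 1;	/* enable the rack "cheat" for retransmissions */

	if (setsockopt(s, IPPROTO_TCP, TCP_RACK_GP_INCREASE,
	    &gp_increase, sizeof(gp_increase)) == -1)
		return (-1);
	if (setsockopt(s, IPPROTO_TCP, TCP_BBR_USE_RACK_CHEAT,
	    &use_cheat, sizeof(use_cheat)) == -1)
		return (-1);
	return (0);
}

int
main(void)
{
	/* In real use this would be a connected socket on the rack stack. */
	int s = socket(AF_INET, SOCK_STREAM, 0);

	if (s != -1 && tune_rack(s) != 0)
		perror("setsockopt");
	return (0);
}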