Index: head/sys/modules/tcp/rack/Makefile =================================================================== --- head/sys/modules/tcp/rack/Makefile +++ head/sys/modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c +SRCS= rack.c sack_filter.c rack_bbr_common.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h Index: head/sys/netinet/in_pcb.h =================================================================== --- head/sys/netinet/in_pcb.h +++ head/sys/netinet/in_pcb.h @@ -759,7 +759,9 @@ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ Index: head/sys/netinet/tcp.h =================================================================== --- head/sys/netinet/tcp.h +++ head/sys/netinet/tcp.h @@ -201,9 +201,8 @@ #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. 
rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ @@ -211,14 +210,18 @@ #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ +#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ +#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ +#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm 
>= 2.3 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 @@ -227,17 +230,27 @@ #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ +#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ /* Start of reserved space for third-party user-settable options. 
*/ Index: head/sys/netinet/tcp_hpts.h =================================================================== --- head/sys/netinet/tcp_hpts.h +++ head/sys/netinet/tcp_hpts.h @@ -45,112 +45,80 @@ /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 -#define HPTS_MS_TO_SLOTS(x) (x * 100) +#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 -#define DEFAULT_HPTS_LOG 3072 -/* - * Log flags consist of - * 7f 7f 1 1 bits - * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE - * - * So for example cpu 10, number 10 would with - * input active would show up as: - * p_flags = 0001010 0001010 1 0 - * - * p_flags = 0x142a - */ -#define HPTS_HPTS_ACTIVE 0x01 -#define HPTS_INPUT_ACTIVE 0x02 - -#define HPTSLOG_IMMEDIATE 1 -#define HPTSLOG_INSERT_NORMAL 2 -#define HPTSLOG_INSERT_SLEEPER 3 -#define HPTSLOG_SLEEP_AFTER 4 -#define HPTSLOG_SLEEP_BEFORE 5 -#define HPTSLOG_INSERTED 6 -#define HPTSLOG_WAKEUP_HPTS 7 -#define HPTSLOG_SETTORUN 8 -#define HPTSLOG_HPTSI 9 -#define HPTSLOG_TOLONG 10 -#define HPTSLOG_AWAKENS 11 -#define HPTSLOG_TIMESOUT 12 -#define HPTSLOG_SLEEPSET 13 -#define HPTSLOG_WAKEUP_INPUT 14 -#define HPTSLOG_RESCHEDULE 15 -#define HPTSLOG_AWAKE 16 -#define HPTSLOG_INP_DONE 17 - -struct hpts_log { - struct inpcb *inp; - int32_t event; - uint32_t cts; - int32_t line; - uint32_t ticknow; - uint32_t t_paceslot; - uint32_t t_hptsreq; - uint32_t p_curtick; - uint32_t p_prevtick; - uint32_t slot_req; - uint32_t p_on_queue_cnt; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t p_hpts_sleep_time; - uint16_t p_flags; - uint8_t p_onhpts; - uint8_t p_oninput; - uint8_t is_notempty; -}; - struct hpts_diag { - uint32_t p_hpts_active; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t slot_req; - uint32_t inp_hptsslot; - uint32_t slot_now; - uint32_t have_slept; - uint32_t hpts_sleep_time; - uint32_t yet_to_sleep; - uint32_t need_new_to; 
- int32_t co_ret; - uint8_t p_on_min_sleep; + uint32_t p_hpts_active; /* bbr->flex7 x */ + uint32_t p_nxt_slot; /* bbr->flex1 x */ + uint32_t p_cur_slot; /* bbr->flex2 x */ + uint32_t p_prev_slot; /* bbr->delivered */ + uint32_t p_runningtick; /* bbr->inflight */ + uint32_t slot_req; /* bbr->flex3 x */ + uint32_t inp_hptsslot; /* bbr->flex4 x */ + uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t have_slept; /* bbr->epoch x */ + uint32_t hpts_sleep_time; /* bbr->applimited x */ + uint32_t yet_to_sleep; /* bbr->lt_epoch x */ + uint32_t need_new_to; /* bbr->flex6 x */ + uint32_t wheel_tick; /* bbr->bw_inuse x */ + uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_cts; /* bbr->rttProp x */ + int32_t co_ret; /* bbr->pkts_out x */ + uint32_t p_curtick; /* upper bbr->cur_del_rate */ + uint32_t p_lasttick; /* lower bbr->cur_del_rate */ + uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ - uint32_t p_hpts_active; /* Flag that says hpts is awake */ - uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ - uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? 
*/ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ + uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. */ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t enobuf_cnt; - uint16_t p_log_at; + uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ - p_log_wrapped :1, /* boolean */ - p_on_min_sleep:1; /* boolean */ - uint8_t p_fill; + p_on_min_sleep:1, /* boolean */ + p_avail:6; + uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; - struct hpts_log *p_log; - uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; uint32_t p_hpts_sleep_time; /* Current sleep interval having a max * of 255ms */ + uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */ + uint32_t saved_lasttick; /* for logging */ + uint32_t saved_curtick; /* for logging */ + uint32_t saved_curslot; /* for logging */ + uint32_t saved_prev_slot; /* for logging */ uint32_t p_delayed_by; /* How much were we delayed by */ /* Cache line 0x80 */ struct sysctl_ctx_list hpts_ctx; @@ -236,13 +204,9 @@ int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t 
drop_hdrlen, uint8_t iptos, int32_t line); -#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) +__tcp_queue_to_input(struct inpcb *inp, int32_t line); +#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); Index: head/sys/netinet/tcp_hpts.c =================================================================== --- head/sys/netinet/tcp_hpts.c +++ head/sys/netinet/tcp_hpts.c @@ -37,7 +37,7 @@ * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * - * First, and probably the main thing its used by Rack and BBR for, it can + * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The @@ -59,42 +59,57 @@ * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. - * - * Now the tcp_hpts system will call tcp_output in one of two forms, - * it will first check to see if the stack as defined a - * tfb_tcp_output_wtime() function, if so that is the routine it - * will call, if that function is not defined then it will call the - * tfb_tcp_output() function. The only difference between these - * two calls is that the former passes the time in to the function - * so the function does not have to access the time (which tcp_hpts - * already has). What these functions do is of course totally up - * to the individual tcp stack. - * + * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort - * a connection (later) or process input on a connection. 
- * Why would you want to do this? To keep processor locality. + * a connection (later) or process input on a connection. + * Why would you want to do this? To keep processor locality + * and/or not have to worry about untangling any recursive + * locks. The input function now is hooked to the new LRO + * system as well. * - * So in order to use the input redirection function the - * stack changes its tcp_do_segment() routine to instead - * of process the data call the function: + * In order to use the input redirection function the + * tcp stack must define an input function for + * tfb_do_queued_segments(). This function understands + * how to dequeue an array of packets that were input and + * knows how to call the correct processing routine. * - * tcp_queue_pkt_to_input() - * - * You will note that the arguments to this function look - * a lot like tcp_do_segments's arguments. This function - * will assure that the tcp_hpts system will - * call the functions tfb_tcp_hpts_do_segment() from the - * correct CPU. Note that multiple calls can get pushed - * into the tcp_hpts system this will be indicated by - * the next to last argument to tfb_tcp_hpts_do_segment() - * (nxt_pkt). If nxt_pkt is a 1 then another packet is - * coming. If nxt_pkt is a 0 then this is the last call - * that the tcp_hpts system has available for the tcp stack. + * Locking in this is important as well so most likely the + * stack will need to define the tfb_do_segment_nounlock() + * splitting tfb_do_segment() into two parts. The main processing + * part that does not unlock the INP and returns a value of 1 or 0. + * It returns 0 if all is well and the lock was not released. It + * returns 1 if we had to destroy the TCB (a reset received etc). + * The remains of tfb_do_segment() then become just a simple call + * to the tfb_do_segment_nounlock() function and check the return + * code and possibly unlock.
* - * The other point of the input system is to be able to safely - * drop a tcp connection without worrying about the recursive - * locking that may be occuring on the INP_WLOCK. So if + * The stack must also set the flag on the INP that it supports this + * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes + * this flag as well and will queue packets when it is set. + * There are other flags as well INP_MBUF_QUEUE_READY and + * INP_DONT_SACK_QUEUE. The first flag tells the LRO code + * that we are in the pacer for output so there is no + * need to wake up the hpts system to get immediate + * input. The second tells the LRO code that it's okay + * if a SACK arrives you can still defer input and let + * the current hpts timer run (this is usually set when + * a rack timer is up so we know SACK's are happening + * on the connection already and don't want to wakeup yet). + * + * There is a common function within the rack_bbr_common code + * version i.e. ctf_do_queued_segments(). This function + * knows how to take the input queue of packets from + * tp->t_in_pkts and process them digging out + * all the arguments, calling any bpf tap and + * calling into tfb_do_segment_nounlock(). The common + * function (ctf_do_queued_segments()) requires that + * you have defined the tfb_do_segment_nounlock() as + * described above. + * + * The second feature of the input side of hpts is the + * dropping of a connection. This is due to the way that + * locking may have occurred on the INP_WLOCK.
So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) @@ -156,6 +171,7 @@ #include #include #include +#include #ifdef tcpdebug #include @@ -168,24 +184,19 @@ MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -#include -#include static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 2; #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); -static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; - -TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); - static struct tcp_hptsi tcp_pace; +static int hpts_does_tp_logging = 0; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); +static void tcp_hptsi(struct tcp_hpts_entry *hpts); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); @@ -204,8 +215,6 @@ } \ } while (0) -static int32_t logging_on = 0; -static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; struct hpts_domain_info { @@ -219,44 +228,75 @@ &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, - &logging_on, 0, - "Turn on logging if compiled in"); +counter_u64_t hpts_hopelessly_behind; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD, + &hpts_hopelessly_behind, + "Number of times hpts could not catch up and was behind hopelessly"); + counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, &hpts_loops, "Number of times hpts had to loop to catch up"); + counter_u64_t back_tosleep; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); -static int32_t in_newts_every_tcb = 0; +counter_u64_t 
combined_wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, - &in_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for input"); -static int32_t in_ts_percision = 0; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, + &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, - &in_ts_percision, 0, - "Do we use percise timestamp for clients on input"); -static int32_t out_newts_every_tcb = 0; +counter_u64_t wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, - &out_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for output"); +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, + &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); + static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, + &hpts_does_tp_logging, 0, + "Do we add to any tp that has logging on pacer logs"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, +static int32_t max_pacer_loops = 10; +SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW, + &max_pacer_loops, 10, + "What is the maximum number of times the pacer will loop trying to catch up"); + +#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2) + +static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED; + + +static int +sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = hpts_sleep_max; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if ((new < (NUM_OF_HPTSI_SLOTS / 4)) || + (new > HPTS_MAX_SLEEP_ALLOWED)) + error = EINVAL; + else + hpts_sleep_max = new; + } + return (error); +} 
+ +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, + CTLTYPE_UINT | CTLFLAG_RW, &hpts_sleep_max, 0, - "The maximum time the hpts will sleep <1 - 254>"); + &sysctl_net_inet_tcp_hpts_max_sleep, "IU", + "Maximum time hpts will sleep"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, @@ -267,55 +307,35 @@ "Do we have the callout call directly to the hpts?"); static void -__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, - uint32_t ticknow, int32_t line) +tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv, + int ticks_to_run, int idx) { - struct hpts_log *pl; - - HPTS_MTX_ASSERT(hpts); - if (hpts->p_log == NULL) - return; - pl = &hpts->p_log[hpts->p_log_at]; - hpts->p_log_at++; - if (hpts->p_log_at >= hpts->p_logsize) { - hpts->p_log_at = 0; - hpts->p_log_wrapped = 1; - } - pl->inp = inp; - if (inp) { - pl->t_paceslot = inp->inp_hptsslot; - pl->t_hptsreq = inp->inp_hpts_request; - pl->p_onhpts = inp->inp_in_hpts; - pl->p_oninput = inp->inp_in_input; - } else { - pl->t_paceslot = 0; - pl->t_hptsreq = 0; - pl->p_onhpts = 0; - pl->p_oninput = 0; - } - pl->is_notempty = 1; - pl->event = event; - pl->line = line; - pl->cts = tcp_get_usecs(NULL); - pl->p_curtick = hpts->p_curtick; - pl->p_prevtick = hpts->p_prevtick; - pl->p_on_queue_cnt = hpts->p_on_queue_cnt; - pl->ticknow = ticknow; - pl->slot_req = slot; - pl->p_nxt_slot = hpts->p_nxt_slot; - pl->p_cur_slot = hpts->p_cur_slot; - pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; - pl->p_flags = (hpts->p_cpu & 0x7f); - pl->p_flags <<= 7; - pl->p_flags |= (hpts->p_num & 0x7f); - pl->p_flags <<= 2; - if (hpts->p_hpts_active) { - pl->p_flags |= HPTS_HPTS_ACTIVE; - } + union tcp_log_stackspecific log; + + memset(&log.u_bbr, 0, sizeof(log.u_bbr)); + log.u_bbr.flex1 = hpts->p_nxt_slot; + log.u_bbr.flex2 = hpts->p_cur_slot; + log.u_bbr.flex3 = hpts->p_prev_slot; + log.u_bbr.flex4 = idx; + log.u_bbr.flex5 = 
hpts->p_curtick; + log.u_bbr.flex6 = hpts->p_on_queue_cnt; + log.u_bbr.use_lt_bw = 1; + log.u_bbr.inflight = ticks_to_run; + log.u_bbr.applimited = hpts->overidden_sleep; + log.u_bbr.delivered = hpts->saved_curtick; + log.u_bbr.timeStamp = tcp_tv_to_usectick(tv); + log.u_bbr.epoch = hpts->saved_curslot; + log.u_bbr.lt_epoch = hpts->saved_prev_slot; + log.u_bbr.pkts_out = hpts->p_delayed_by; + log.u_bbr.lost = hpts->p_hpts_sleep_time; + log.u_bbr.cur_del_rate = hpts->p_runningtick; + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + BBR_LOG_HPTSDIAG, 0, + 0, &log, false, tv); } -#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) - static void hpts_timeout_swi(void *arg) { @@ -347,12 +367,6 @@ /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } - if (TAILQ_EMPTY(head) && - (hpts->p_on_queue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p hpts bucket empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_queue_cnt); - } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; @@ -456,58 +470,13 @@ in_pcbref(inp); } -static int -sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) -{ - struct tcp_hpts_entry *hpts; - size_t sz; - int32_t logging_was, i; - int32_t error = 0; - - /* - * HACK: Turn off logging so no locks are required this really needs - * a memory barrier :) - */ - logging_was = logging_on; - logging_on = 0; - if (!req->oldptr) { - /* How much? 
*/ - sz = 0; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - sz += (sizeof(struct hpts_log) * hpts->p_logsize); - } - error = SYSCTL_OUT(req, 0, sz); - } else { - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - if (hpts->p_log_wrapped) - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - else - sz = (sizeof(struct hpts_log) * hpts->p_log_at); - error = SYSCTL_OUT(req, hpts->p_log, sz); - } - } - logging_on = logging_was; - return error; -} - -SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -515,10 +484,9 @@ tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -648,8 +616,8 @@ * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. - * Note that you can or both values together and get two - * actions. + * Note that you can use one or both values together + * and get two actions. 
*/ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) @@ -670,53 +638,198 @@ } static inline int -hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) +hpts_tick(uint32_t wheel_tick, uint32_t plus) { - return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); + /* + * Given a slot on the wheel, what slot + * is that plus ticks out? + */ + KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); + return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); } +static inline int +tick_to_wheel(uint32_t cts_in_wticks) +{ + /* + * Given a timestamp in wheel ticks (10usec inc's) + * map it to our limited space wheel. + */ + return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +} + +static inline int +hpts_ticks_diff(int prev_tick, int tick_now) +{ + /* + * Given two ticks that are someplace + * on our wheel. How far are they apart? + */ + if (tick_now > prev_tick) + return (tick_now - prev_tick); + else if (tick_now == prev_tick) + /* + * Special case, same means we can go all of our + * wheel less one slot. + */ + return (NUM_OF_HPTSI_SLOTS - 1); + else + return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); +} + +/* + * Given a tick on the wheel that is the current time + * mapped to the wheel (wheel_tick), what is the maximum + * distance forward that can be obtained without + * wrapping past either prev_tick or running_tick + * depending on the htps state? Also if passed + * a uint32_t *, fill it with the tick location. + * + * Note if you do not give this function the current + * time (that you think it is) mapped to the wheel + * then the results will not be what you expect and + * could lead to invalid inserts. 
+ */ +static inline int32_t +max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +{ + uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + + if ((hpts->p_hpts_active == 1) && + (hpts->p_wheel_complete == 0)) { + end_tick = hpts->p_runningtick; + /* Back up one tick */ + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + } else { + /* + * For the case where we are + * not active, or we have + * completed the pass over + * the wheel, we can use the + * prev tick and subtract one from it. This puts us + * as far out as possible on the wheel. + */ + end_tick = hpts->p_prev_slot; + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + /* + * Now we have close to the full wheel left minus the + * time it has been since the pacer went to sleep. Note + * that wheel_tick, passed in, should be the current time + * from the perspective of the caller, mapped to the wheel. + */ + if (hpts->p_prev_slot != wheel_tick) + dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + else + dis_to_travel = 1; + /* + * dis_to_travel in this case is the space from when the + * pacer stopped (p_prev_slot) and where our wheel_tick + * is now. To know how many slots we can put it in we + * subtract from the wheel size. We would not want + * to place something after p_prev_slot or it will + * get ran too soon. + */ + return (NUM_OF_HPTSI_SLOTS - dis_to_travel); + } + /* + * So how many slots are open between p_runningtick -> p_cur_slot + * that is what is currently un-available for insertion. Special + * case when we are at the last slot, this gets 1, so that + * the answer to how many slots are available is all but 1. 
+ */ + if (hpts->p_runningtick == hpts->p_cur_slot) + dis_to_travel = 1; + else + dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + /* + * How long has the pacer been running? + */ + if (hpts->p_cur_slot != wheel_tick) { + /* The pacer is a bit late */ + pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + } else { + /* The pacer is right on time, now == pacers start time */ + pacer_to_now = 0; + } + /* + * To get the number left we can insert into we simply + * subract the distance the pacer has to run from how + * many slots there are. + */ + avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel; + /* + * Now how many of those we will eat due to the pacer's + * time (p_cur_slot) of start being behind the + * real time (wheel_tick)? + */ + if (avail_on_wheel <= pacer_to_now) { + /* + * Wheel wrap, we can't fit on the wheel, that + * is unusual the system must be way overloaded! + * Insert into the assured tick, and return special + * "0". + */ + counter_u64_add(combined_wheel_wrap, 1); + *target_tick = hpts->p_nxt_slot; + return (0); + } else { + /* + * We know how many slots are open + * on the wheel (the reverse of what + * is left to run. Take away the time + * the pacer started to now (wheel_tick) + * and that tells you how many slots are + * open that can be inserted into that won't + * be touched by the pacer until later. 
+ */ + return (avail_on_wheel - pacer_to_now); + } +} + static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { - int32_t need_wake = 0; - uint32_t ticknow = 0; - + uint32_t need_wake = 0; + HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ - if (hpts->p_hpts_active == 0) { - /* A sleeping hpts we want in next slot to run */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, - hpts_tick(hpts, 1)); - } - inp->inp_hptsslot = hpts_tick(hpts, 1); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); - } - need_wake = 1; + inp->inp_hpts_request = 0; + if ((hpts->p_hpts_active == 0) || + (hpts->p_wheel_complete)) { + /* + * A sleeping hpts we want in next slot to run + * note that in this state p_prev_slot == p_cur_slot + */ + inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) + need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* + * The hpts system is running and the caller + * was awoken by the hpts system. * We can't allow you to go into the same slot we - * are in. We must put you out. + * are in (we don't want a loop :-D). */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_cur_slot; + inp->inp_hptsslot = hpts->p_runningtick; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); - } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } @@ -737,141 +850,129 @@ return (ret); } +#ifdef INVARIANTS static void -tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, - struct hpts_diag *diag, int32_t noref) +check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line) { - int32_t need_new_to = 0; - int32_t need_wakeup = 0; - uint32_t largest_slot; - uint32_t ticknow = 0; - uint32_t slot_calc; + /* + * Sanity checks for the pacer with invariants + * on insert. + */ + if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) + panic("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot); + if ((hpts->p_hpts_active) && + (hpts->p_wheel_complete == 0)) { + /* + * If the pacer is processing a arc + * of the wheel, we need to make + * sure we are not inserting within + * that arc. + */ + int distance, yet_to_run; + distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); + if (hpts->p_runningtick != hpts->p_cur_slot) + yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + else + yet_to_run = 0; /* processing last slot */ + if (yet_to_run > distance) { + panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningtick, hpts->p_cur_slot); + } + } +} +#endif + +static void +tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line, + struct hpts_diag *diag, struct timeval *tv) +{ + uint32_t need_new_to = 0; + uint32_t wheel_cts, last_tick; + int32_t wheel_tick, maxticks; + int8_t need_wakeup = 0; + HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; + diag->p_prev_slot = hpts->p_prev_slot; + diag->p_runningtick = hpts->p_runningtick; diag->p_nxt_slot = hpts->p_nxt_slot; 
diag->p_cur_slot = hpts->p_cur_slot; + diag->p_curtick = hpts->p_curtick; + diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; + diag->p_on_min_sleep = hpts->p_on_min_sleep; + diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((inp->inp_in_hpts == 0) || noref) { - inp->inp_hpts_request = slot; + if (inp->inp_in_hpts == 0) { if (slot == 0) { /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); return; } - if (hpts->p_hpts_active) { - /* - * Its slot - 1 since nxt_slot is the next tick that - * will go off since the hpts is awake - */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); - } - /* - * We want to make sure that we don't place a inp in - * the range of p_cur_slot <-> p_nxt_slot. If we - * take from p_nxt_slot to the end, plus p_cur_slot - * and then take away 2, we will know how many is - * the max slots we can use. - */ - if (hpts->p_nxt_slot > hpts->p_cur_slot) { - /* - * Non-wrap case nxt_slot <-> cur_slot we - * don't want to land in. So the diff gives - * us what is taken away from the number of - * slots. + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_tick = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); + if (diag) { + diag->wheel_tick = wheel_tick; + diag->maxticks = maxticks; + diag->wheel_cts = wheel_cts; + } + if (maxticks == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer + * if its true is so far behind we will be + * > 1second late calling anyway. 
*/ - largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); - } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - } else { - /* - * Wrap case so the diff gives us the number - * of slots that we can land in. - */ - largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; + slot--; } - /* - * We take away two so we never have a problem (20 - * usec's) out of 1024000 usecs - */ - largest_slot -= 2; - if (inp->inp_hpts_request > largest_slot) { - /* - * Restrict max jump of slots and remember - * leftover - */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* This one will run when we hit it */ - inp->inp_hpts_request = 0; - } - if (hpts->p_nxt_slot == hpts->p_cur_slot) - slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; - else - slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; - if (slot_calc == hpts->p_cur_slot) { + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request = slot; + } else if (maxticks >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_tick(wheel_tick, slot); + } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxticks; + inp->inp_hptsslot = last_tick; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } #ifdef INVARIANTS - /* TSNH */ - panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", - hpts, slot_calc, slot, largest_slot); + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); #endif - if (slot_calc) - slot_calc--; - else - slot_calc = NUM_OF_HPTSI_SLOTS - 1; - } - inp->inp_hptsslot = slot_calc; - if (diag) { - diag->inp_hptsslot = inp->inp_hptsslot; - } - } else { + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { /* - * The hpts is sleeping, we 
need to figure out where + * The hpts is sleeping and not on a minimum + * sleep time, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; - uint32_t slot_now; - struct timeval tv; - ticknow = tcp_gethptstick(&tv); - slot_now = ticknow % NUM_OF_HPTSI_SLOTS; - /* - * The user wants to be inserted at (slot_now + - * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. - */ - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - if (inp->inp_hpts_request > largest_slot) { - /* Adjust the residual in inp_hpts_request */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* No residual it all fits */ - inp->inp_hpts_request = 0; - } - inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; - if (diag) { - diag->slot_now = slot_now; - diag->inp_hptsslot = inp->inp_hptsslot; - diag->p_on_min_sleep = hpts->p_on_min_sleep; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); - } /* Now do we need to restart the hpts's timer? */ - if (TSTMP_GT(ticknow, hpts->p_curtick)) - have_slept = ticknow - hpts->p_curtick; - else - have_slept = 0; - if (have_slept < hpts->p_hpts_sleep_time) { - /* This should be what happens */ + have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if (have_slept < hpts->p_hpts_sleep_time) yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - } else { + else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; @@ -879,29 +980,22 @@ if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; - diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * We need to reschedule the hptss time-out. + * We need to reschedule the hpts's time-out. 
*/ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); - } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { @@ -944,9 +1038,10 @@ } uint32_t -tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ +tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag) +{ struct tcp_hpts_entry *hpts; - uint32_t slot_on, cts; + uint32_t slot_on; struct timeval tv; /* @@ -956,12 +1051,8 @@ */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); - if (in_ts_percision) - microuptime(&tv); - else - getmicrouptime(&tv); - cts = tcp_tv_to_usectick(&tv); - tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); + microuptime(&tv); + tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); @@ -971,7 +1062,6 @@ __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } - int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { @@ -986,9 +1076,6 @@ /* * Activate the hpts if it is sleeping. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); - } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); @@ -1001,36 +1088,14 @@ return (retval); } -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) -{ - /* Setup packet for input first */ - INP_WLOCK_ASSERT(tp->t_inpcb); - m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); - m->m_pkthdr.pace_tlen = (uint16_t) tlen; - m->m_pkthdr.pace_drphdrlen = drop_hdrlen; - m->m_pkthdr.pace_tos = iptos; - m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); - if (tp->t_in_pkt == NULL) { - tp->t_in_pkt = m; - tp->t_tail_pkt = m; - } else { - tp->t_tail_pkt->m_nextpkt = m; - tp->t_tail_pkt = m; - } -} - - int32_t -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ +__tcp_queue_to_input(struct inpcb *inp, int line) +{ struct tcp_hpts_entry *hpts; int32_t ret; - tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); - hpts = tcp_input_lock(tp->t_inpcb); - ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); + hpts = tcp_input_lock(inp); + ret = __tcp_queue_to_input_locked(inp, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } @@ -1132,6 +1197,25 @@ #endif } +static void +tcp_drop_in_pkts(struct tcpcb *tp) +{ + struct mbuf *m, *n; + + m = tp->t_in_pkt; + if (m) + n = m->m_nextpkt; + else + n = NULL; + tp->t_in_pkt = NULL; + while (m) { + m_freem(m); + m = n; + if (m) + n = m->m_nextpkt; + } +} + /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary @@ -1142,7 +1226,7 @@ * but then while you were processing one of the inp's * some other one that you switch will get a new * packet on the different CPU. It will insert it - * on the new hptss input list. Creating a temporary + * on the new hpts's input list. 
Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. @@ -1155,16 +1239,18 @@ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { - struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; - int32_t ti_locked = TI_UNLOCKED; + int dropped; struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); +#ifndef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); @@ -1177,26 +1263,22 @@ drop_reason = inp->inp_hpts_drop_reas; inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); - CURVNET_SET(inp->inp_vnet); - if (drop_reason) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else { - ti_locked = TI_UNLOCKED; - } INP_WLOCK(inp); +#ifdef VIMAGE + CURVNET_SET(inp->inp_vnet); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } - ti_locked = TI_UNLOCKED; +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); continue; } @@ -1206,26 +1288,17 @@ } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ - m = tp->t_in_pkt; - if (m) - n = m->m_nextpkt; - else - n = NULL; - tp->t_in_pkt = NULL; - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } + tcp_drop_in_pkts(tp); tp = tcp_drop(tp, drop_reason); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); continue; } @@ 
-1246,220 +1319,184 @@ */ tcp_set_hpts(inp); } - m = tp->t_in_pkt; - n = NULL; - if (m != NULL && - (m->m_pkthdr.pace_lock == TI_RLOCKED || - tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - m = tp->t_in_pkt; - } - if (in_newts_every_tcb) { - if (in_ts_percision) - microuptime(tv); - else - getmicrouptime(tv); - } if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - /* Any input work to do, if so do it first */ - if ((m != NULL) && (m == tp->t_in_pkt)) { - struct tcphdr *th; - int32_t tlen, drop_hdrlen, nxt_pkt; - uint8_t iptos; - - n = m->m_nextpkt; - tp->t_in_pkt = tp->t_tail_pkt = NULL; - while (m) { - th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); - tlen = m->m_pkthdr.pace_tlen; - drop_hdrlen = m->m_pkthdr.pace_drphdrlen; - iptos = m->m_pkthdr.pace_tos; - m->m_nextpkt = NULL; - if (n) - nxt_pkt = 1; - else - nxt_pkt = 0; - inp->inp_input_calls = 1; - if (tp->t_fb->tfb_tcp_hpts_do_segment) { - /* Use the hpts specific do_segment */ - (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos, nxt_pkt, tv); - } else { - /* Use the default do_segment */ - (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos); - } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - /* - * Do segment returns unlocked we need the - * lock again but we also need some kasserts - * here. 
- */ - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - INP_UNLOCK_ASSERT(inp); - m = n; - if (m) - n = m->m_nextpkt; - if (m != NULL && - m->m_pkthdr.pace_lock == TI_RLOCKED) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else - ti_locked = TI_UNLOCKED; + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + if (inp->inp_in_input) + tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); + dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (dropped) { + /* Re-acquire the wlock so we can release the reference */ INP_WLOCK(inp); - /* - * Since we have an opening here we must - * re-check if the tcb went away while we - * were getting the lock(s). - */ - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } - goto out; - } - /* - * Now that we hold the INP lock, check if - * we need to upgrade our lock. - */ - if (ti_locked == TI_UNLOCKED && - (tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - } - } /** end while(m) */ - } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ + } + } else if (tp->t_in_pkt) { + /* + * We reach here only if we had a + * stack that supported INP_SUPPORTS_MBUFQ + * and then somehow switched to a stack that + * does not. The packets are basically stranded + * and would hang with the connection until + * cleanup without this code. Its not the + * best way but I know of no other way to + * handle it since the stack needs functions + * it does not have to handle queued packets. 
+ */ + tcp_drop_in_pkts(tp); + } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); - ti_locked = TI_UNLOCKED; +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + CURVNET_RESTORE(); +#endif mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; - CURVNET_RESTORE(); } +#ifndef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); +#endif } -static int -tcp_hpts_est_run(struct tcp_hpts_entry *hpts) -{ - int32_t ticks_to_run; - - if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { - ticks_to_run = hpts->p_curtick - hpts->p_prevtick; - if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { - ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; - } - } else { - if (hpts->p_prevtick == hpts->p_curtick) { - /* This happens when we get woken up right away */ - return (-1); - } - ticks_to_run = 1; - } - /* Set in where we will be when we catch up */ - hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; - if (hpts->p_nxt_slot == hpts->p_cur_slot) { - panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d", - hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); - } - return (ticks_to_run); -} - static void -tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) +tcp_hptsi(struct tcp_hpts_entry *hpts) { + struct epoch_tracker et; struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; - int32_t ticks_to_run, i, error, tick_now, interum_tick; + int32_t ticks_to_run, i, error; int32_t paced_cnt = 0; + int32_t loop_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; - uint32_t cts; + int32_t wrap_loop_cnt = 0; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); - hpts->p_curtick = tcp_tv_to_hptstick(ctick); - cts = tcp_tv_to_usectick(ctick); - memcpy(&tv, ctick, sizeof(struct timeval)); - hpts->p_cur_slot = hpts_tick(hpts, 1); + /* 
record previous info for any logging */ + hpts->saved_lasttick = hpts->p_lasttick; + hpts->saved_curtick = hpts->p_curtick; + hpts->saved_curslot = hpts->p_cur_slot; + hpts->saved_prev_slot = hpts->p_prev_slot; - /* Figure out if we had missed ticks */ + hpts->p_lasttick = hpts->p_curtick; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if ((hpts->p_on_queue_cnt == 0) || + (hpts->p_lasttick == hpts->p_curtick)) { + /* + * No time has yet passed, + * or nothing to do. + */ + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + goto no_run; + } again: + hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); - ticks_to_run = tcp_hpts_est_run(hpts); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); + ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot); + if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) && + (hpts->p_on_queue_cnt != 0)) { + /* + * Wheel wrap is occurring, basically we + * are behind and the distance between + * runs has spread so much it has exceeded + * the time on the wheel (1.024 seconds). This + * is ugly and should NOT be happening. We + * need to run the entire wheel. We last processed + * p_prev_slot, so that needs to be the last slot + * we run. The next slot after that should be our + * reserved first slot for new, and then starts + * the running position. Now the problem is the + * reserved "not to yet" place does not exist + * and there may be inp's in there that need + * running. We can merge those into the + * first slot at the head. 
+ */ + wrap_loop_cnt++; + hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1); + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2); + /* + * Adjust p_cur_slot to be where we are starting from + * hopefully we will catch up (fat chance if something + * is broken this bad :( ) + */ + hpts->p_cur_slot = hpts->p_prev_slot; + /* + * The next slot has guys to run too, and that would + * be where we would normally start, lets move them into + * the next slot (p_prev_slot + 2) so that we will + * run them, the extra 10usecs of late (by being + * put behind) does not really matter in this situation. + */ +#ifdef INVARIANTS + /* + * To prevent a panic we need to update the inpslot to the + * new location. This is safe since it takes both the + * INP lock and the pacer mutex to change the inp_hptsslot. + */ + TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) { + inp->inp_hptsslot = hpts->p_runningtick; + } +#endif + TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick], + &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts); + ticks_to_run = NUM_OF_HPTSI_SLOTS - 1; + counter_u64_add(wheel_wrap, 1); + } else { + /* + * Nxt slot is always one after p_runningtick though + * its not used usually unless we are doing wheel wrap. 
+ */ + hpts->p_nxt_slot = hpts->p_prev_slot; + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); - /* Reset the ticks to run and time if we need too */ - interum_tick = tcp_gethptstick(&tv); - if (interum_tick != hpts->p_curtick) { - /* Save off the new time we execute to */ - *ctick = tv; - hpts->p_curtick = interum_tick; - cts = tcp_tv_to_usectick(&tv); - hpts->p_cur_slot = hpts_tick(hpts, 1); - ticks_to_run = tcp_hpts_est_run(hpts); - } - if (ticks_to_run == -1) { - goto no_run; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); - } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); +#ifndef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there - * was not any + * was not any (i.e. if ticks_to_run == 1, no delay). 
*/ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); - while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* For debugging */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); - } hpts->p_inp = inp; paced_cnt++; - if (hpts->p_cur_slot != inp->inp_hptsslot) { +#ifdef INVARIANTS + if (hpts->p_runningtick != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); + hpts, inp, hpts->p_runningtick, inp->inp_hptsslot); } +#endif /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; } else { set_cpu = 0; } - hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); - if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0); + if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; @@ -1467,25 +1504,36 @@ if (inp->inp_hpts_request) { /* * This guy is deferred out further in time - * then our wheel had on it. Push him back - * on the wheel. + * then our wheel had available on it. + * Push him back on the wheel or run it + * depending. */ - int32_t remaining_slots; - + uint32_t maxticks, last_tick, remaining_slots; + remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* - * Keep INVARIANTS happy by clearing - * the flag + * How far out can we go? 
*/ - tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); + maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick); + if (maxticks >= inp->inp_hpts_request) { + /* we can place it finally to be processed */ + inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request); + inp->inp_hpts_request = 0; + } else { + /* Work off some more time */ + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request-= maxticks; + } + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; + /* Fall through we will so do it now */ } /* - * We clear the hpts flag here after dealing with + * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see its on the hpts until just * before we unlock. @@ -1495,23 +1543,20 @@ INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { -out_now: + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || + (inp->inp_flags2 & INP_FREED)) { + out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } @@ -1539,16 +1584,14 @@ */ tcp_set_hpts(inp); } - if (out_newts_every_tcb) { - struct timeval sv; - - if (out_ts_percision) - microuptime(&sv); - else - getmicrouptime(&sv); - cts = tcp_tv_to_usectick(&sv); - } +#ifdef VIMAGE CURVNET_SET(inp->inp_vnet); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif + /* Lets do any logging that we might want to */ + if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) { + tcp_hpts_log(hpts, tp, 
&tv, ticks_to_run, i); + } /* * There is a hole here, we get the refcnt on the * inp so it will still be preserved but to make @@ -1560,19 +1603,23 @@ #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - inp->inp_hpts_calls = 1; - if (tp->t_fb->tfb_tcp_output_wtime != NULL) { - error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); - } else { - error = tp->t_fb->tfb_tcp_output(tp); + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (error) { + /* The input killed the connection */ + goto skip_pacing; + } } + inp->inp_hpts_calls = 1; + error = tp->t_fb->tfb_tcp_output(tp); + inp->inp_hpts_calls = 0; if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can @@ -1609,74 +1656,112 @@ prefetch_tp = 1; } INP_WUNLOCK(inp); - INP_UNLOCK_ASSERT(inp); + skip_pacing: +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); CURVNET_RESTORE(); +#endif + INP_UNLOCK_ASSERT(inp); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; - hpts->p_cur_slot++; - if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { - hpts->p_cur_slot = 0; + hpts->p_runningtick++; + if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) { + hpts->p_runningtick = 0; } } +#ifndef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); +#endif no_one: HPTS_MTX_ASSERT(hpts); - hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). 
*/ - /* Re-run any input that may be there */ - (void)tcp_gethptstick(&tv); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); - } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif - tick_now = tcp_gethptstick(&tv); - if (SEQ_GT(tick_now, hpts->p_prevtick)) { - struct timeval res; - - /* Did we really spend a full tick or more in here? */ - timersub(&tv, ctick, &res); - if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + if (loop_cnt > max_pacer_loops) { + /* + * Something is seriously slow we have + * looped through processing the wheel + * and by the time we cleared the + * needs to run max_pacer_loops time + * we still needed to run. That means + * the system is hopelessly behind and + * can never catch up :( + * + * We will just lie to this thread + * and let it think p_curtick is + * correct. When it next awakens + * it will find itself further behind. + */ + counter_u64_add(hpts_hopelessly_behind, 1); + goto no_run; + } + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if ((wrap_loop_cnt < 2) && + (hpts->p_lasttick != hpts->p_curtick)) { + counter_u64_add(hpts_loops, 1); + loop_cnt++; + goto again; + } +no_run: + /* + * Set flag to tell that we are done for + * any slot input that happens during + * input. + */ + hpts->p_wheel_complete = 1; + /* + * Run any input that may be there not covered + * in running data. + */ + if (!TAILQ_EMPTY(&hpts->p_input)) { + tcp_input_data(hpts, &tv); + /* + * Now did we spend too long running + * input and need to run more ticks? 
+ */ + KASSERT(hpts->p_prev_slot == hpts->p_cur_slot, + ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, + hpts->p_prev_slot, hpts->p_cur_slot)); + KASSERT(hpts->p_lasttick == hpts->p_curtick, + ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, + hpts->p_lasttick, hpts->p_curtick)); + hpts->p_curtick = tcp_gethptstick(&tv); + if (hpts->p_lasttick != hpts->p_curtick) { counter_u64_add(hpts_loops, 1); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); - } - *ctick = res; - hpts->p_curtick = tick_now; + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } } -no_run: { uint32_t t = 0, i, fnd = 0; - if (hpts->p_on_queue_cnt) { - - + if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) { /* * Find next slot that is occupied and use that to * be the sleep time. */ - for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { + for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; @@ -1684,27 +1769,23 @@ t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { - hpts->p_hpts_sleep_time = i; + hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); } else { - counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS - panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); + panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); #endif + counter_u64_add(back_tosleep, 1); hpts->p_on_queue_cnt = 0; goto non_found; } - t++; + } else if (wrap_loop_cnt >= 2) { + /* Special case handling */ + hpts->p_hpts_sleep_time = tcp_min_hptsi_time; } else { - /* No one on the wheel sleep for all but 2 slots */ -non_found: - if (hpts_sleep_max == 0) - hpts_sleep_max = 1; - hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); - t = 0; + /* No one on the wheel sleep for all but 400 slots or sleep max */ + non_found: + hpts->p_hpts_sleep_time = hpts_sleep_max; } - if (logging_on) { - tcp_hpts_log_it(hpts, 
inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); - } } } @@ -1746,33 +1827,29 @@ mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } + hpts->p_hpts_wake_scheduled = 0; hpts->p_hpts_active = 1; - (void)tcp_gethptstick(&tv); - tcp_hptsi(hpts, &tv); + tcp_hptsi(hpts); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) { + hpts->overidden_sleep = tv.tv_usec; tv.tv_usec = tcp_min_hptsi_time; hpts->p_on_min_sleep = 1; } else { /* Clear the min sleep flag */ + hpts->overidden_sleep = 0; hpts->p_on_min_sleep = 0; } hpts->p_hpts_active = 0; @@ -1809,9 +1886,11 @@ tcp_pace.rp_proc = NULL; tcp_pace.rp_num_hptss = ncpus; + hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK); hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); - + combined_wheel_wrap = counter_u64_alloc(M_WAITOK); + wheel_wrap = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; @@ -1850,7 +1929,7 @@ OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_ADD_U16(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, @@ -1859,29 +1938,23 @@ SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, - "What the current 
slot is if active"); + "What the current running pacers goal"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "curtick", CTLFLAG_RD, - &hpts->p_curtick, 0, - "What the current tick on if active"); + OID_AUTO, "runtick", CTLFLAG_RD, + &hpts->p_runningtick, 0, + "What the running pacers current slot is"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "logsize", CTLFLAG_RD, - &hpts->p_logsize, 0, - "Hpts logging buffer size"); - hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; + OID_AUTO, "curtick", CTLFLAG_RD, + &hpts->p_curtick, 0, + "What the running pacers last tick mapped to the wheel was"); + hpts->p_hpts_sleep_time = hpts_sleep_max; hpts->p_num = i; - hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_prevtick -= 1; - hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); hpts->p_cpu = 0xffff; - hpts->p_nxt_slot = 1; - hpts->p_logsize = tcp_hpts_logging_size; - if (hpts->p_logsize) { - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - } + hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); } Index: head/sys/netinet/tcp_log_buf.h =================================================================== --- head/sys/netinet/tcp_log_buf.h +++ head/sys/netinet/tcp_log_buf.h @@ -175,7 +175,7 @@ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ - TCP_LOG_PACER, /* Pacer sending a packet 8 */ + TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ @@ -194,31 +194,38 @@ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ 
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ - UNUSED_32, /* Unused 32 */ - UNUSED_33, /* Unused 33 */ + BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ + BBR_LOG_STATE_TARGET, /* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ - BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ - BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ - TCP_LOG_END /* End (keep at end) 51 */ + TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ + BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ + BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ + TCP_LOG_CONNEND, /* End of connection 54 */ + TCP_LOG_LRO, /* LRO entry 55 */ + TCP_SACK_FILTER_RES, /* 
Results of SACK Filter 56 */ + TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ + TCP_LOG_END /* End (keep at end) 58 */ }; enum tcp_log_states { @@ -275,8 +282,8 @@ #ifdef _KERNEL -#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 -#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 +#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 +#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always Index: head/sys/netinet/tcp_stacks/rack.c =================================================================== --- head/sys/netinet/tcp_stacks/rack.c +++ head/sys/netinet/tcp_stacks/rack.c @@ -1,5 +1,6 @@ /*- - * Copyright (c) 2016-2019 Netflix, Inc. + * Copyright (c) 2016 + * Netflix Inc. All rights reserved. * * Redistribution and use in source and binary forms, with or without * modification, are permitted provided that the following conditions @@ -44,12 +45,16 @@ #include #include #include /* for proc0 declaration */ +#ifdef NETFLIX_STATS +#include +#endif #include #include #include #include +#include #ifdef NETFLIX_STATS -#include +#include /* Must come after qmath.h and tree.h */ #endif #include #include @@ -74,8 +79,8 @@ #include #include #include -#include #define TCPOUTFLAGS +#include #include #include #include @@ -84,9 +89,6 @@ #include #include #include -#ifdef NETFLIX_CWV -#include -#endif #include #ifdef TCPDEBUG #include @@ -126,6 +128,10 @@ struct sysctl_ctx_list rack_sysctl_ctx; struct sysctl_oid *rack_sysctl_root; +#ifndef TCPHPTS +fatal error missing option TCPHPTS in the build; +#endif + #define CUM_ACKED 1 #define SACKED 2 @@ -178,6 +184,9 @@ static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? 
*/ static int32_t rack_verbose_logging = 0; static int32_t rack_ignore_data_after_close = 1; +static int32_t rack_map_entries_limit = 1024; +static int32_t rack_map_split_limit = 256; + /* * Currently regular tcp has a rto_min of 30ms * the backoff goes 12 times so that ends up @@ -202,7 +211,6 @@ static int32_t rack_sack_block_limit = 128; static int32_t rack_use_sack_filter = 1; static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE; -static uint32_t rack_map_split_limit = 0; /* unlimited by default */ /* Rack specific counters */ counter_u64_t rack_badfr; @@ -228,6 +236,7 @@ counter_u64_t rack_to_alloc; counter_u64_t rack_to_alloc_hard; counter_u64_t rack_to_alloc_emerg; +counter_u64_t rack_to_alloc_limited; counter_u64_t rack_alloc_limited_conns; counter_u64_t rack_split_limited; @@ -248,12 +257,21 @@ counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE]; counter_u64_t rack_opts_arry[RACK_OPTS_SIZE]; +/* + * This was originally defined in tcp_timer.c, but is now reproduced here given + * the unification of the SYN and non-SYN retransmit timer exponents combined + * with wanting to retain previous behaviour for previously deployed stack + * versions. 
+ */ +int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] = + { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 }; + static void rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line); static int rack_process_ack(struct mbuf *m, struct tcphdr *th, - struct socket *so, struct tcpcb *tp, struct tcpopt *to, + struct socket *so, struct tcpcb *tp, struct tcpopt *to, uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val); static int rack_process_data(struct mbuf *m, struct tcphdr *th, @@ -351,14 +369,13 @@ rack_do_closing(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt); +static void rack_do_drop(struct mbuf *m, struct tcpcb *tp); static void -rack_do_drop(struct mbuf *m, struct tcpcb *tp); -static void rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); static void rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, - struct tcphdr *th, int32_t rstreason, int32_t tlen); + struct tcphdr *th, int32_t rstreason, int32_t tlen); static int rack_do_established(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen, @@ -449,6 +466,7 @@ counter_u64_zero(rack_sack_proc_short); counter_u64_zero(rack_sack_proc_restart); counter_u64_zero(rack_to_alloc); + counter_u64_zero(rack_to_alloc_limited); counter_u64_zero(rack_alloc_limited_conns); counter_u64_zero(rack_split_limited); counter_u64_zero(rack_find_high); @@ -470,6 +488,18 @@ { SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "map_limit", CTLFLAG_RW, + &rack_map_entries_limit , 1024, + "Is there a limit on how big the sendmap can grow? 
"); + + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "map_splitlimit", CTLFLAG_RW, + &rack_map_split_limit , 256, + "Is there a limit on how much splitting a peer can do?"); + + SYSCTL_ADD_S32(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "rate_sample_method", CTLFLAG_RW, &rack_rate_sample_method , USE_RTT_LOW, "What method should we use for rate sampling 0=high, 1=low "); @@ -628,11 +658,6 @@ OID_AUTO, "pktdelay", CTLFLAG_RW, &rack_pkt_delay, 1, "Extra RACK time (in ms) besides reordering thresh"); - SYSCTL_ADD_U32(&rack_sysctl_ctx, - SYSCTL_CHILDREN(rack_sysctl_root), - OID_AUTO, "split_limit", CTLFLAG_RW, - &rack_map_split_limit, 0, - "Is there a limit on the number of map split entries (0=unlimited)"); SYSCTL_ADD_S32(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), OID_AUTO, "inc_var", CTLFLAG_RW, @@ -769,6 +794,12 @@ OID_AUTO, "allocemerg", CTLFLAG_RD, &rack_to_alloc_emerg, "Total allocations done from emergency cache"); + rack_to_alloc_limited = counter_u64_alloc(M_WAITOK); + SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, + SYSCTL_CHILDREN(rack_sysctl_root), + OID_AUTO, "alloc_limited", CTLFLAG_RD, + &rack_to_alloc_limited, + "Total allocations dropped due to limit"); rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK); SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx, SYSCTL_CHILDREN(rack_sysctl_root), @@ -859,6 +890,7 @@ static inline int32_t rack_progress_timeout_check(struct tcpcb *tp) { +#ifdef NETFLIX_PROGRESS if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) { if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) { /* @@ -869,13 +901,12 @@ struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; counter_u64_add(rack_progress_drops, 1); -#ifdef NETFLIX_STATS TCPSTAT_INC(tcps_progdrops); -#endif rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__); return (1); } } +#endif return (0); } @@ -962,6 +993,7 @@ union tcp_log_stackspecific log; struct timeval tv; + 
memset(&log, 0, sizeof(log)); /* Convert our ms to a microsecond */ log.u_bbr.flex1 = rtt * 1000; log.u_bbr.timeStamp = tcp_get_usecs(&tv); @@ -1021,6 +1053,8 @@ { if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) { union tcp_log_stackspecific log; + + memset(&log, 0, sizeof(log)); log.u_bbr.flex1 = did_out; log.u_bbr.flex2 = nxt_pkt; log.u_bbr.flex3 = way_out; @@ -1127,6 +1161,8 @@ counter_u64_free(rack_sack_proc_short); counter_u64_free(rack_sack_proc_restart); counter_u64_free(rack_to_alloc); + counter_u64_free(rack_to_alloc_limited); + counter_u64_free(rack_split_limited); counter_u64_free(rack_find_high); counter_u64_free(rack_runt_sacks); counter_u64_free(rack_enter_tlp_calc); @@ -1146,9 +1182,8 @@ rsm = uma_zalloc(rack_zone, M_NOWAIT); if (rsm) { -alloc_done: - counter_u64_add(rack_to_alloc, 1); rack->r_ctl.rc_num_maps_alloced++; + counter_u64_add(rack_to_alloc, 1); return (rsm); } if (rack->rc_free_cnt) { @@ -1156,11 +1191,26 @@ rsm = TAILQ_FIRST(&rack->r_ctl.rc_free); TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next); rack->rc_free_cnt--; - goto alloc_done; + return (rsm); } return (NULL); } +static struct rack_sendmap * +rack_alloc_full_limit(struct tcp_rack *rack) +{ + if ((rack_map_entries_limit > 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { + counter_u64_add(rack_to_alloc_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + return (NULL); + } + return (rack_alloc(rack)); +} + /* wrapper to allocate a sendmap entry, subject to a specific limit */ static struct rack_sendmap * rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type) @@ -1196,7 +1246,6 @@ /* currently there is only one limit type */ rack->r_ctl.rc_num_split_allocs--; } - rack->r_ctl.rc_num_maps_alloced--; if (rack->r_ctl.rc_tlpsend == rsm) rack->r_ctl.rc_tlpsend = NULL; if (rack->r_ctl.rc_next == rsm) @@ -1206,9 +1255,11 @@ if (rack->rc_free_cnt < rack_free_cache) { memset(rsm, 
0, sizeof(struct rack_sendmap)); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next); + rsm->r_limit_type = 0; rack->rc_free_cnt++; return; } + rack->r_ctl.rc_num_maps_alloced--; uma_zfree(rack_zone, rsm); } @@ -1222,11 +1273,9 @@ #ifdef NETFLIX_STATS int32_t gput; #endif -#ifdef NETFLIX_CWV - u_long old_cwnd = tp->snd_cwnd; -#endif INP_WLOCK_ASSERT(tp->t_inpcb); + tp->ccv->nsegs = nsegs; tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th); if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) { @@ -1264,7 +1313,6 @@ tp->t_stats_gput_prev); tp->t_flags &= ~TF_GPUTINPROG; tp->t_stats_gput_prev = gput; -#ifdef NETFLIX_CWV if (tp->t_maxpeakrate) { /* * We update t_peakrate_thr. This gives us roughly @@ -1272,7 +1320,6 @@ */ tcp_update_peakrate_thr(tp); } -#endif } #endif if (tp->snd_cwnd > tp->snd_ssthresh) { @@ -1298,39 +1345,10 @@ if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) { rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd; } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - /* - * Per RFC 7661: The behaviour in the non-validated phase is - * specified as: o A sender determines whether to increase - * the cwnd based upon whether it is cwnd-limited (see - * Section 4.5.3): * A sender that is cwnd-limited MAY use - * the standard TCP method to increase cwnd (i.e., the - * standard method permits a TCP sender that fully utilises - * the cwnd to increase the cwnd each time it receives an - * ACK). * A sender that is not cwnd-limited MUST NOT - * increase the cwnd when ACK packets are received in this - * phase (i.e., needs to avoid growing the cwnd when it has - * not recently sent using the current size of cwnd). 
- */ - if ((tp->snd_cwnd > old_cwnd) && - (tp->cwv_cwnd_valid == 0) && - (!(tp->ccv->flags & CCF_CWND_LIMITED))) { - tp->snd_cwnd = old_cwnd; - } - /* Try to update pipeAck and NCWV state */ - if (TCPS_HAVEESTABLISHED(tp->t_state) && - !IN_RECOVERY(tp->t_flags)) { - uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd)); - - tcp_newcwv_update_pipeack(tp, data); - } - } /* we enforce max peak rate if it is set. */ if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) { tp->snd_cwnd = tp->t_peakrate_thr; } -#endif } static void @@ -1379,16 +1397,8 @@ tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt; rack->r_ctl.rc_prr_sndcnt = 0; } + tp->snd_recover = tp->snd_una; EXIT_RECOVERY(tp->t_flags); - - -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - (tp->snd_cwv.in_recovery)) - tcp_newcwv_end_recovery(tp); - } -#endif } static void @@ -1450,16 +1460,6 @@ tp->ccv->curack = th->th_ack; CC_ALGO(tp)->cong_signal(tp->ccv, type); } -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) { - tcp_newcwv_enter_recovery(tp); - } - if (type == CC_RTO) { - tcp_newcwv_reset(tp); - } - } -#endif } @@ -1479,11 +1479,21 @@ if (CC_ALGO(tp)->after_idle != NULL) CC_ALGO(tp)->after_idle(tp->ccv); - if (tp->snd_cwnd == 1) - i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */ - else - i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp)); - + if (V_tcp_initcwnd_segments) + i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg), + max(2 * tp->t_maxseg, 14600)); + else if (V_tcp_do_rfc3390) + i_cwnd = min(4 * tp->t_maxseg, + max(2 * tp->t_maxseg, 4380)); + else { + /* Per RFC5681 Section 3.1 */ + if (tp->t_maxseg > 2190) + i_cwnd = 2 * tp->t_maxseg; + else if (tp->t_maxseg > 1095) + i_cwnd = 3 * tp->t_maxseg; + else + i_cwnd = 4 * tp->t_maxseg; + } if (reduce_largest) { /* * Do we reduce the largest cwnd to make @@ -1549,8 +1559,7 @@ } static void -rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, - 
int32_t rstreason, int32_t tlen) +rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen) { if (tp != NULL) { tcp_dropwithreset(m, th, tp, tlen, rstreason); @@ -1736,7 +1745,7 @@ * TCB is still valid and locked. */ static int -rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) { int32_t todrop; int32_t thflags; @@ -1778,17 +1787,6 @@ TCPSTAT_INC(tcps_rcvpartduppack); TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); } - /* - * DSACK - add SACK block for dropped range - */ - if (tp->t_flags & TF_SACK_PERMIT) { - tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); - /* - * ACK now, as the next in-sequence segment - * will clear the DSACK block again - */ - tp->t_flags |= TF_ACKNOW; - } *drop_hdrlen += todrop; /* drop from the top afterwards */ th->th_seq += todrop; tlen -= todrop; @@ -2124,8 +2122,6 @@ /* We can't start any timer in persists */ return (rack_get_persists_timer_val(tp, rack)); } - if (tp->t_state < TCPS_ESTABLISHED) - goto activate_rxt; rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap); if (rsm == NULL) { /* Nothing on the send map */ @@ -2184,6 +2180,12 @@ */ goto activate_rxt; } + if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) { + /* + * Peer collapsed rwnd, don't do TLP. + */ + goto activate_rxt; + } rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext); if (rsm == NULL) { /* We found no rsm to TLP with. */ @@ -2288,7 +2290,9 @@ /* A previous call is already set up */ return; } - if (tp->t_state == TCPS_CLOSED) { + + if ((tp->t_state == TCPS_CLOSED) || + (tp->t_state == TCPS_LISTEN)) { return; } stopped = rack->rc_tmr_stopped; @@ -2307,8 +2311,8 @@ * We are still left on the hpts when the to goes * it will be for output. 
*/ - if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to)) - slot = cts - rack->r_ctl.rc_last_output_to; + if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts)) + slot = rack->r_ctl.rc_last_output_to - cts; else slot = 1; } @@ -2330,7 +2334,7 @@ } hpts_timeout = rack_timer_start(tp, rack, cts); if (tp->t_flags & TF_DELACK) { - delayed_ack = TICKS_2_MSEC(tcp_delacktime); + delayed_ack = tcp_delacktime; rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK; } if (delayed_ack && ((hpts_timeout == 0) || @@ -2487,6 +2491,43 @@ return (0); } +static struct rack_sendmap * +rack_merge_rsm(struct tcp_rack *rack, + struct rack_sendmap *l_rsm, + struct rack_sendmap *r_rsm) +{ + /* + * We are merging two ack'd RSM's, + * the l_rsm is on the left (lower seq + * values) and the r_rsm is on the right + * (higher seq value). The simplest way + * to merge these is to move the right + * one into the left. I don't think there + * is any reason we need to try to find + * the oldest (or last oldest retransmitted). + */ + l_rsm->r_end = r_rsm->r_end; + if (r_rsm->r_rtr_bytes) + l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes; + if (r_rsm->r_in_tmap) { + /* This really should not happen */ + TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext); + } + /* Now the flags */ + if (r_rsm->r_flags & RACK_HAS_FIN) + l_rsm->r_flags |= RACK_HAS_FIN; + if (r_rsm->r_flags & RACK_TLP) + l_rsm->r_flags |= RACK_TLP; + TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next); + if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) { + /* Transfer the split limit to the map we free */ + r_rsm->r_limit_type = l_rsm->r_limit_type; + l_rsm->r_limit_type = 0; + } + rack_free(rack, r_rsm); + return(l_rsm); +} + /* * TLP Timer, here we simply setup what segment we want to * have the TLP expire on, the normal rack_output() will then @@ -2590,7 +2631,7 @@ int32_t idx; struct rack_sendmap *nrsm; - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * No memory to split, we will just exit and punt @@ 
-2937,7 +2978,7 @@ TCPSTAT_INC(tcps_rexmttimeo); if ((tp->t_state == TCPS_SYN_SENT) || (tp->t_state == TCPS_SYN_RECEIVED)) - rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]); + rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]); else rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift]; TCPT_RANGESET(tp->t_rxtcur, rexmt, @@ -3281,7 +3322,7 @@ * Here we retransmitted less than the whole thing which means we * have to split this into what was transmitted and what was not. */ - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { /* * We can't get memory, so lets not proceed. @@ -3415,9 +3456,6 @@ * Hmm out of memory and the tcb got destroyed while * we tried to wait. */ -#ifdef INVARIANTS - panic("Out of memory when we should not be rack:%p", rack); -#endif return; } if (th_flags & TH_FIN) { @@ -3428,15 +3466,8 @@ rsm->r_tim_lastsent[0] = ts; rsm->r_rtr_cnt = 1; rsm->r_rtr_bytes = 0; - if (th_flags & TH_SYN) { - /* The data space is one beyond snd_una */ - rsm->r_start = seq_out + 1; - rsm->r_end = rsm->r_start + (len - 1); - } else { - /* Normal case */ - rsm->r_start = seq_out; - rsm->r_end = rsm->r_start + len; - } + rsm->r_start = seq_out; + rsm->r_end = rsm->r_start + len; rsm->r_sndcnt = 0; TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next); TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); @@ -3486,11 +3517,8 @@ * Ok we must split off the front and then let the * update do the rest */ - nrsm = rack_alloc(rack); + nrsm = rack_alloc_full_limit(rack); if (nrsm == NULL) { -#ifdef INVARIANTS - panic("Ran out of memory that was preallocated? rack:%p", rack); -#endif rack_update_rsm(tp, rack, rsm, ts); return; } @@ -3908,6 +3936,14 @@ if (nrsm->r_flags & RACK_ACKED) { /* Skip ack'd segments */ continue; + } + if (nrsm->r_flags & RACK_SACK_PASSED) { + /* + * We found one that is already marked + * passed, we have been here before and + * so all others below this are marked. 
+ */ + break; } idx = nrsm->r_rtr_cnt - 1; if (ts == nrsm->r_tim_lastsent[idx]) { @@ -4114,6 +4150,26 @@ rsm->r_in_tmap = 0; } out: + if (rsm && (rsm->r_flags & RACK_ACKED)) { + /* + * Now can we merge this newly acked + * block with either the previous or + * next block? + */ + nrsm = TAILQ_NEXT(rsm, r_next); + if (nrsm && + (nrsm->r_flags & RACK_ACKED)) { + /* yep this and next can be merged */ + rsm = rack_merge_rsm(rack, rsm, nrsm); + } + /* Now what about the previous? */ + nrsm = TAILQ_PREV(rsm, rack_head, r_next); + if (nrsm && + (nrsm->r_flags & RACK_ACKED)) { + /* yep the previous and this can be merged */ + rsm = rack_merge_rsm(rack, nrsm, rsm); + } + } if (used_ref == 0) { counter_u64_add(rack_sack_proc_all, 1); } else { @@ -4353,16 +4409,13 @@ } sack_blocks[num_sack_blks] = sack; num_sack_blks++; -#ifdef NETFLIX_STATS } else if (SEQ_LEQ(sack.start, th_ack) && SEQ_LEQ(sack.end, th_ack)) { /* * Its a D-SACK block. */ - tcp_record_dsack(sack.start, sack.end); -#endif +/* tcp_record_dsack(sack.start, sack.end); */ } - } if (num_sack_blks == 0) goto out; @@ -4371,7 +4424,9 @@ * just one pass. 
*/ if (rack_use_sack_filter) { - num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack); + num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, + num_sack_blks, th->th_ack); + ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks); } if (num_sack_blks < 2) { goto do_sack_work; @@ -4620,8 +4675,9 @@ return (0); } if (rack->r_ctl.rc_early_recovery) { - if (IN_FASTRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); @@ -4648,8 +4704,9 @@ sowwakeup_locked(so); m_freem(mfree); if (rack->r_ctl.rc_early_recovery == 0) { - if (IN_FASTRECOVERY(tp->t_flags)) { - if (SEQ_LT(th->th_ack, tp->snd_recover)) { + if (IN_RECOVERY(tp->t_flags)) { + if (SEQ_LT(th->th_ack, tp->snd_recover) && + (SEQ_LT(th->th_ack, tp->snd_max))) { tcp_rack_partialack(tp, th); } else { rack_post_recovery(tp, th); @@ -4707,7 +4764,11 @@ * send garbage on first SYN. */ int32_t nsegs; +#ifdef TCP_RFC7413 int32_t tfo_syn; +#else +#define tfo_syn (FALSE) +#endif struct tcp_rack *rack; rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -4816,8 +4877,10 @@ * PRU_RCVD). If a FIN has already been received on this connection * then we just ignore the text. 
*/ +#ifdef TCP_RFC7413 tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) && - IS_FASTOPEN(tp->t_flags)); + (tp->t_flags & TF_FASTOPEN)); +#endif if ((tlen || (thflags & TH_FIN) || tfo_syn) && TCPS_HAVERCVDFIN(tp->t_state) == 0) { tcp_seq save_start = th->th_seq; @@ -5024,8 +5087,9 @@ /* Clean receiver SACK report if present */ - if (tp->rcv_numsacks) - tcp_clean_sackreport(tp); +/* if (tp->rcv_numsacks) + tcp_clean_sackreport(tp); +*/ TCPSTAT_INC(tcps_preddat); tp->rcv_nxt += tlen; /* @@ -5284,8 +5348,6 @@ tp->irs = th->th_seq; tcp_rcvseqinit(tp); if (thflags & TH_ACK) { - int tfo_partial = 0; - TCPSTAT_INC(tcps_connects); soisconnected(so); #ifdef MAC @@ -5299,19 +5361,10 @@ tp->rcv_adv += min(tp->rcv_wnd, TCP_MAXWIN << tp->rcv_scale); /* - * If not all the data that was sent in the TFO SYN - * has been acked, resend the remainder right away. - */ - if (IS_FASTOPEN(tp->t_flags) && - (tp->snd_una != tp->snd_max)) { - tp->snd_nxt = th->th_ack; - tfo_partial = 1; - } - /* * If there's data, delay ACK; if there's also a FIN ACKNOW * will be turned on later. */ - if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) { + if (DELAY_ACK(tp, tlen) && tlen != 0) { rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr, ((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__); tp->t_flags |= TF_DELACK; @@ -5320,26 +5373,10 @@ tp->t_flags |= TF_ACKNOW; } - if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) && - V_tcp_do_ecn) { + if ((thflags & TH_ECE) && V_tcp_do_ecn) { tp->t_flags |= TF_ECN_PERMIT; TCPSTAT_INC(tcps_ecn_shs); } - if (SEQ_GT(th->th_ack, tp->snd_una)) { - /* - * We advance snd_una for the - * fast open case. If th_ack is - * acknowledging data beyond - * snd_una we can't just call - * ack-processing since the - * data stream in our send-map - * will start at snd_una + 1 (one - * beyond the SYN). If its just - * equal we don't need to do that - * and there is no send_map. - */ - tp->snd_una++; - } /* * Received in SYN_SENT[*] state. 
Transitions: * SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1 @@ -5423,7 +5460,7 @@ } } return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, - tiwin, thflags, nxt_pkt)); + tiwin, thflags, nxt_pkt)); } /* @@ -5447,13 +5484,13 @@ rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); return (1); } - if (IS_FASTOPEN(tp->t_flags)) { +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { /* - * When a TFO connection is in SYN_RECEIVED, the - * only valid packets are the initial SYN, a - * retransmit/copy of the initial SYN (possibly with - * a subset of the original data), a valid ACK, a - * FIN, or a RST. + * When a TFO connection is in SYN_RECEIVED, the only valid + * packets are the initial SYN, a retransmit/copy of the + * initial SYN (possibly with a subset of the original + * data), a valid ACK, a FIN, or a RST. */ if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) { rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); @@ -5474,9 +5511,18 @@ return (0); } } +#endif if (thflags & TH_RST) return (rack_process_rst(m, th, so, tp)); /* + * RFC5961 Section 4.2 Send challenge ACK for any SYN in + * synchronized state. + */ + if (thflags & TH_SYN) { + rack_challenge_ack(m, th, tp, &ret_val); + return (ret_val); + } + /* * RFC 1323 PAWS: If we have a timestamp reply on this segment and * it's less than ts_recent, drop it. */ @@ -5520,16 +5566,18 @@ tp->ts_recent_age = tcp_ts_getticks(); tp->ts_recent = to->to_tsval; } - tp->snd_wnd = tiwin; /* * If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag * is on (half-synchronized state), then queue data for later * processing; else drop segment and return. 
*/ if ((thflags & TH_ACK) == 0) { - if (IS_FASTOPEN(tp->t_flags)) { +#ifdef TCP_RFC7413 + if (tp->t_flags & TF_FASTOPEN) { + tp->snd_wnd = tiwin; cc_conn_init(tp); } +#endif return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen, tiwin, thflags, nxt_pkt)); } @@ -5539,22 +5587,13 @@ if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) == (TF_RCVD_SCALE | TF_REQ_SCALE)) { tp->rcv_scale = tp->request_r_scale; + tp->snd_wnd = tiwin; } /* * Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* -> * FIN-WAIT-1 */ tp->t_starttime = ticks; - if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) { - tcp_fastopen_decrement_counter(tp->t_tfo_pending); - tp->t_tfo_pending = NULL; - - /* - * Account for the ACK of our SYN prior to - * regular ACK processing below. - */ - tp->snd_una++; - } if (tp->t_flags & TF_NEEDFIN) { tcp_state_change(tp, TCPS_FIN_WAIT_1); tp->t_flags &= ~TF_NEEDFIN; @@ -5562,13 +5601,25 @@ tcp_state_change(tp, TCPS_ESTABLISHED); TCP_PROBE5(accept__established, NULL, tp, mtod(m, const char *), tp, th); +#ifdef TCP_RFC7413 + if (tp->t_tfo_pending) { + tcp_fastopen_decrement_counter(tp->t_tfo_pending); + tp->t_tfo_pending = NULL; + + /* + * Account for the ACK of our SYN prior to regular + * ACK processing below. + */ + tp->snd_una++; + } /* * TFO connections call cc_conn_init() during SYN * processing. Calling it again here for such connections * is not harmless as it would undo the snd_cwnd reduction * that occurs when a TFO SYN|ACK is retransmitted. */ - if (!IS_FASTOPEN(tp->t_flags)) + if (!(tp->t_flags & TF_FASTOPEN)) +#endif cc_conn_init(tp); } /* @@ -5576,7 +5627,7 @@ * not, do so now to pass queued data to user. 
*/ if (tlen == 0 && (thflags & TH_FIN) == 0) - (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0, + (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0, (struct mbuf *)0); tp->snd_wl1 = th->th_seq - 1; if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) { @@ -5836,7 +5887,7 @@ rack_check_data_after_close(struct mbuf *m, struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so) { - struct tcp_rack *rack; + struct tcp_rack *rack; INP_INFO_RLOCK_ASSERT(&V_tcbinfo); rack = (struct tcp_rack *)tp->t_fb_ptr; @@ -6353,7 +6404,6 @@ rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr; rack->r_ctl.rc_min_to = rack_min_to; rack->r_ctl.rc_prr_inc_var = rack_inc_var; - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); if (tp->snd_una != tp->snd_max) { /* Create a send map for the current outstanding data */ struct rack_sendmap *rsm; @@ -6375,6 +6425,8 @@ TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext); rsm->r_in_tmap = 1; } + rack_stop_all_timers(tp); + rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); return (0); } @@ -6431,6 +6483,8 @@ uma_zfree(rack_pcb_zone, tp->t_fb_ptr); tp->t_fb_ptr = NULL; } + /* Make sure snd_nxt is correctly set */ + tp->snd_nxt = tp->snd_max; } static void @@ -6473,9 +6527,6 @@ case TCPS_CLOSED: case TCPS_TIME_WAIT: default: -#ifdef INVARIANTS - panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state); -#endif break; }; } @@ -6585,10 +6636,6 @@ * allow the tcbinfo to be in either locked or unlocked, as the * caller may have unnecessarily acquired a lock due to a race. 
*/ - if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 || - tp->t_state != TCPS_ESTABLISHED) { - INP_INFO_RLOCK_ASSERT(&V_tcbinfo); - } INP_WLOCK_ASSERT(tp->t_inpcb); KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN", __func__)); @@ -6600,37 +6647,17 @@ memset(&log.u_bbr, 0, sizeof(log.u_bbr)); log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts; log.u_bbr.ininput = rack->rc_inp->inp_in_input; + log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced; TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0, tlen, &log, true); } - if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) { - way_out = 4; - goto done_with_input; - } /* - * If a segment with the ACK-bit set arrives in the SYN-SENT state - * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9. - */ - if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) && - (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) { - rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); - return; - } - /* * Segment received on connection. Reset idle time and keep-alive * timer. XXX: This should be done after segment validation to * ignore broken/spoofed segs. */ if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) { -#ifdef NETFLIX_CWV - if ((tp->cwv_enabled) && - ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) { - tcp_newcwv_nvp_closedown(tp); - } else -#endif - if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { + if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) { counter_u64_add(rack_input_idle_reduces, 1); rack_cc_after_idle(tp, (rack->r_idle_reduce_largest ? 1 :0)); @@ -6639,14 +6666,6 @@ rack->r_ctl.rc_rcvtime = cts; tp->t_rcvtime = ticks; -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } -#endif /* * Unscale the window into a 32-bit value. 
For the SYN_SENT state * the scale is zero. @@ -6737,22 +6756,6 @@ if ((tp->t_flags & TF_SACK_PERMIT) && (to.to_flags & TOF_SACKPERM) == 0) tp->t_flags &= ~TF_SACK_PERMIT; - if (IS_FASTOPEN(tp->t_flags)) { - if (to.to_flags & TOF_FASTOPEN) { - uint16_t mss; - - if (to.to_flags & TOF_MSS) - mss = to.to_mss; - else - if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0) - mss = TCP6_MSS; - else - mss = TCP_MSS; - tcp_fastopen_update_cache(tp, mss, - to.to_tfo_len, to.to_tfo_cookie); - } else - tcp_fastopen_disable_path(tp); - } } /* * At this point we are at the initial call. Here we decide @@ -6769,7 +6772,6 @@ /* Set the flag */ rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0; tcp_set_hpts(tp->t_inpcb); - rack_stop_all_timers(tp); sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack); } /* @@ -6801,24 +6803,6 @@ */ INP_WLOCK_ASSERT(tp->t_inpcb); tcp_rack_xmit_timer_commit(rack, tp); - if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) && - (rack->rc_in_persist == 0)){ - /* - * The peer shrunk its window on us to the point - * where we have sent too much. The only thing - * we can do here is stop any timers and - * enter persist. We most likely lost the last - * bytes we sent but oh well, we will have to - * retransmit them after the peer is caught up. 
- */ - if (rack->rc_inp->inp_in_hpts) - tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT); - rack_timer_cancel(tp, rack, cts, __LINE__); - rack_enter_persist(tp, rack, cts); - rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0); - way_out = 3; - goto done_with_input; - } if (nxt_pkt == 0) { if (rack->r_wanted_output != 0) { did_out = 1; @@ -6848,7 +6832,6 @@ rack_timer_audit(tp, rack, &so->so_snd); way_out = 2; } - done_with_input: rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out); if (did_out) rack->r_wanted_output = 0; @@ -6871,7 +6854,7 @@ #ifdef RSS struct tcp_function_block *tfb; struct tcp_rack *rack; - struct epoch_tracker et; + struct inpcb *inp; rack = (struct tcp_rack *)tp->t_fb_ptr; if (rack->r_state == 0) { @@ -6879,11 +6862,9 @@ * Initial input (ACK to SYN-ACK etc)lets go ahead and get * it processed */ - INP_INFO_RLOCK_ET(&V_tcbinfo, et); tcp_get_usecs(&tv); rack_hpts_do_segment(m, th, so, tp, drop_hdrlen, tlen, iptos, 0, &tv); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); return; } tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos); @@ -6959,13 +6940,17 @@ #ifdef TCPDEBUG struct ipovly *ipov = NULL; #endif +#ifdef NETFLIX_TCP_O_UDP struct udphdr *udp = NULL; +#endif struct tcp_rack *rack; struct tcphdr *th; uint8_t pass = 0; - uint8_t wanted_cookie = 0; u_char opt[TCP_MAXOLEN]; - unsigned ipoptlen, optlen, hdrlen, ulen=0; + unsigned ipoptlen, optlen, hdrlen; +#ifdef NETFLIX_TCP_O_UDP + unsigned ulen; +#endif uint32_t rack_seq; #if defined(IPSEC) || defined(IPSEC_SUPPORT) @@ -7004,6 +6989,18 @@ if (tp->t_flags & TF_TOE) return (tcp_offload_output(tp)); #endif + +#ifdef TCP_RFC7413 + /* + * For TFO connections in SYN_RECEIVED, only allow the initial + * SYN|ACK and those sent by the retransmit timer. 
+ */ + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && + SEQ_GT(tp->snd_max, tp->snd_una) && /* inital SYN|ACK sent */ + (tp->snd_nxt != tp->snd_una)) /* not a retransmit */ + return (0); +#endif #ifdef INET6 if (rack->r_state) { /* Use the cache line loaded if possible */ @@ -7046,31 +7043,12 @@ rack->r_wanted_output = 0; rack->r_timer_override = 0; /* - * For TFO connections in SYN_SENT or SYN_RECEIVED, - * only allow the initial SYN or SYN|ACK and those sent - * by the retransmit timer. - */ - if (IS_FASTOPEN(tp->t_flags) && - ((tp->t_state == TCPS_SYN_RECEIVED) || - (tp->t_state == TCPS_SYN_SENT)) && - SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */ - (tp->t_rxtshift == 0)) /* not a retransmit */ - return (0); - /* * Determine length of data that should be transmitted, and flags * that will be used. If there is some data or critical controls * (SYN, RST) to send, then transmit; otherwise, investigate * further. */ idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una); -#ifdef NETFLIX_CWV - if (tp->cwv_enabled) { - if ((tp->cwv_cwnd_valid == 0) && - TCPS_HAVEESTABLISHED(tp->t_state) && - (tp->snd_cwnd > tp->snd_cwv.init_cwnd)) - tcp_newcwv_nvp_closedown(tp); - } else -#endif if (tp->t_idle_reduce) { if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur)) rack_cc_after_idle(tp, @@ -7141,10 +7119,12 @@ tlen = rsm->r_end - rsm->r_start; if (tlen > tp->t_maxseg) tlen = tp->t_maxseg; - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); +#ifdef INVARIANTS + if (SEQ_GT(tp->snd_una, rsm->r_start)) { + panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u", + tp, rack, tp->snd_una, rsm, rsm->r_start); + } +#endif sb_offset = rsm->r_start - tp->snd_una; cwin = min(tp->snd_wnd, tlen); len = cwin; @@ -7155,14 +7135,12 @@ len = rsm->r_end - rsm->r_start; sack_rxmit = 1; sendalot = 0; - 
KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (len >= tp->t_maxseg) { len = tp->t_maxseg; } + KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); } else if ((rack->rc_in_persist == 0) && ((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) { long tlen; @@ -7187,10 +7165,6 @@ } #endif tlen = rsm->r_end - rsm->r_start; - KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start), - ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p", - __func__, __LINE__, - rsm->r_start, tp->snd_una, tp, rack, rsm)); sb_offset = rsm->r_start - tp->snd_una; if (tlen > rack->r_ctl.rc_prr_sndcnt) { len = rack->r_ctl.rc_prr_sndcnt; @@ -7212,6 +7186,8 @@ goto just_return_nolock; } } + KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d", + __func__, sb_offset)); if (len > 0) { sub_from_prr = 1; sack_rxmit = 1; @@ -7236,6 +7212,20 @@ /* For debugging */ rack->r_ctl.rc_rsm_at_retran = rsm; #endif + /* + * Enforce a connection sendmap count limit if set + * as long as we are not retransmiting. + */ + if ((rsm == NULL) && + (rack_map_entries_limit > 0) && + (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) { + counter_u64_add(rack_to_alloc_limited, 1); + if (!rack->alloc_limit_reported) { + rack->alloc_limit_reported = 1; + counter_u64_add(rack_alloc_limited_conns, 1); + } + goto just_return_nolock; + } /* * Get standard flags, and add SYN or FIN if requested by 'hidden' * state flags. @@ -7306,7 +7296,7 @@ uint32_t avail; avail = sbavail(sb); - if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail) + if (SEQ_GT(tp->snd_nxt, tp->snd_una)) sb_offset = tp->snd_nxt - tp->snd_una; else sb_offset = 0; @@ -7347,9 +7337,18 @@ * data possible so far in the scoreboard. 
*/ outstanding = tp->snd_max - tp->snd_una; - if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) - len = 0; - else if (avail > sb_offset) + if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) { + if (tp->snd_wnd > outstanding) { + len = tp->snd_wnd - outstanding; + /* Check to see if we have the data */ + if (((sb_offset + len) > avail) && + (avail > sb_offset)) + len = avail - sb_offset; + else + len = 0; + } else + len = 0; + } else if (avail > sb_offset) len = avail - sb_offset; else len = 0; @@ -7398,18 +7397,22 @@ * SYN-SENT state and if segment contains data and if we don't know * that foreign host supports TAO, suppress sending segment. */ - if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) && - ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) { - if (tp->t_state != TCPS_SYN_RECEIVED) + if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) { + if ((tp->t_state != TCPS_SYN_RECEIVED) && + (tp->t_state != TCPS_SYN_SENT)) flags &= ~TH_SYN; +#ifdef TCP_RFC7413 /* * When sending additional segments following a TFO SYN|ACK, * do not include the SYN bit. */ - if (IS_FASTOPEN(tp->t_flags) && + if ((tp->t_flags & TF_FASTOPEN) && (tp->t_state == TCPS_SYN_RECEIVED)) flags &= ~TH_SYN; +#endif sb_offset--, len++; + if (sbavail(sb) == 0) + len = 0; } /* * Be careful not to send data and/or FIN on SYN segments. This @@ -7420,29 +7423,16 @@ len = 0; flags &= ~TH_FIN; } +#ifdef TCP_RFC7413 /* - * On TFO sockets, ensure no data is sent in the following cases: - * - * - When retransmitting SYN|ACK on a passively-created socket - * - * - When retransmitting SYN on an actively created socket - * - * - When sending a zero-length cookie (cookie request) on an - * actively created socket - * - * - When the socket is in the CLOSED state (RST is being sent) + * When retransmitting SYN|ACK on a passively-created TFO socket, + * don't include data, as the presence of data may have caused the + * original SYN|ACK to have been dropped by a middlebox. 
*/ - if (IS_FASTOPEN(tp->t_flags) && - (((flags & TH_SYN) && (tp->t_rxtshift > 0)) || - ((tp->t_state == TCPS_SYN_SENT) && - (tp->t_tfo_client_cookie_len == 0)) || - (flags & TH_RST))) { - sack_rxmit = 0; + if ((tp->t_flags & TF_FASTOPEN) && + ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0))) len = 0; - } - /* Without fast-open there should never be data sent on a SYN */ - if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags))) - len = 0; +#endif if (len <= 0) { /* * If FIN has been sent but not acked, but we haven't been @@ -7519,7 +7509,9 @@ ipoptlen += ipsec_optlen; #endif if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg && +#ifdef NETFLIX_TCP_O_UDP (tp->t_port == 0) && +#endif ((tp->t_flags & TF_SIGNATURE) == 0) && tp->rcv_numsacks == 0 && sack_rxmit == 0 && ipoptlen == 0) @@ -7688,10 +7680,13 @@ * If our state indicates that FIN should be sent and we have not * yet done so, then we need to send. */ - if ((flags & TH_FIN) && - (tp->snd_nxt == tp->snd_una)) { - pass = 11; - goto send; + if (flags & TH_FIN) { + if ((tp->t_flags & TF_SENTFIN) || + (((tp->t_flags & TF_SENTFIN) == 0) && + (tp->snd_nxt == tp->snd_una))) { + pass = 11; + goto send; + } } /* * No reason to send a segment, just return. @@ -7750,44 +7745,27 @@ if (flags & TH_SYN) { tp->snd_nxt = tp->iss; to.to_mss = tcp_mssopt(&inp->inp_inc); -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) to.to_mss -= V_tcp_udp_tunneling_overhead; #endif to.to_flags |= TOF_MSS; - +#ifdef TCP_RFC7413 /* - * On SYN or SYN|ACK transmits on TFO connections, - * only include the TFO option if it is not a - * retransmit, as the presence of the TFO option may - * have caused the original SYN or SYN|ACK to have - * been dropped by a middlebox. + * Only include the TFO option on the first + * transmission of the SYN|ACK on a + * passively-created TFO socket, as the presence of + * the TFO option may have caused the original + * SYN|ACK to have been dropped by a middlebox. 
*/ - if (IS_FASTOPEN(tp->t_flags) && + if ((tp->t_flags & TF_FASTOPEN) && + (tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift == 0)) { - if (tp->t_state == TCPS_SYN_RECEIVED) { - to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN; - to.to_tfo_cookie = - (u_int8_t *)&tp->t_tfo_cookie.server; - to.to_flags |= TOF_FASTOPEN; - wanted_cookie = 1; - } else if (tp->t_state == TCPS_SYN_SENT) { - to.to_tfo_len = - tp->t_tfo_client_cookie_len; - to.to_tfo_cookie = - tp->t_tfo_cookie.client; - to.to_flags |= TOF_FASTOPEN; - wanted_cookie = 1; - /* - * If we wind up having more data to - * send with the SYN than can fit in - * one segment, don't send any more - * until the SYN|ACK comes back from - * the other end. - */ - sendalot = 0; - } + to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN; + to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie; + to.to_flags |= TOF_FASTOPEN; } +#endif } /* Window scaling. */ if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) { @@ -7822,15 +7800,8 @@ /* Processing the options. */ hdrlen += optlen = tcp_addoptions(&to, opt); - /* - * If we wanted a TFO option to be added, but it was unable - * to fit, ensure no data is sent. - */ - if (IS_FASTOPEN(tp->t_flags) && wanted_cookie && - !(to.to_flags & TOF_FASTOPEN)) - len = 0; } -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { if (V_tcp_udp_tunneling_port == 0) { /* The port was removed?? 
*/ @@ -7996,8 +7967,8 @@ msb = NULL; else msb = sb; - m->m_next = tcp_m_copym(mb, moff, &len, - if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb); + m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len, + if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/); if (len <= (tp->t_maxseg - optlen)) { /* * Must have ran out of mbufs for the copy @@ -8031,6 +8002,8 @@ * TLP should not count in retran count, but * in its own bin */ +/* tp->t_sndtlppack++;*/ +/* tp->t_sndtlpbyte += len;*/ counter_u64_add(rack_tlp_retran, 1); counter_u64_add(rack_tlp_retran_bytes, len); } else { @@ -8156,7 +8129,7 @@ #ifdef INET6 if (isipv6) { ip6 = mtod(m, struct ip6_hdr *); -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8164,10 +8137,10 @@ ulen = hdrlen + len - sizeof(struct ip6_hdr); udp->uh_ulen = htons(ulen); th = (struct tcphdr *)(udp + 1); - } else + } else #endif th = (struct tcphdr *)(ip6 + 1); - tcpip_fillheaders(inp, ip6, th); + tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th); } else #endif /* INET6 */ { @@ -8175,7 +8148,7 @@ #ifdef TCPDEBUG ipov = (struct ipovly *)ip; #endif -#ifdef NETFLIX_TCPOUDP +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip)); udp->uh_sport = htons(V_tcp_udp_tunneling_port); @@ -8186,7 +8159,7 @@ } else #endif th = (struct tcphdr *)(ip + 1); - tcpip_fillheaders(inp, ip, th); + tcpip_fillheaders(inp,/*tp->t_port, */ ip, th); } /* * Fill in fields, remembering maximum advertised window for use in @@ -8277,20 +8250,15 @@ /* * Calculate receive window. Don't shrink window, but avoid silly * window syndrome. - * If a RST segment is sent, advertise a window of zero. 
*/ - if (flags & TH_RST) { + if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && + recwin < (long)tp->t_maxseg) recwin = 0; - } else { - if (recwin < (long)(so->so_rcv.sb_hiwat / 4) && - recwin < (long)tp->t_maxseg) - recwin = 0; - if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && - recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) - recwin = (long)(tp->rcv_adv - tp->rcv_nxt); - if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) - recwin = (long)TCP_MAXWIN << tp->rcv_scale; - } + if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) && + recwin < (long)(tp->rcv_adv - tp->rcv_nxt)) + recwin = (long)(tp->rcv_adv - tp->rcv_nxt); + if (recwin > (long)TCP_MAXWIN << tp->rcv_scale) + recwin = (long)TCP_MAXWIN << tp->rcv_scale; /* * According to RFC1323 the window field in a SYN (i.e., a or @@ -8357,18 +8325,23 @@ * ip6_plen is not need to be filled now, and will be filled * in ip6_output. */ +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP_IPV6; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { +#endif m->m_pkthdr.csum_flags = CSUM_TCP_IPV6; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in6_cksum_pseudo(ip6, sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP, 0); +#ifdef NETFLIX_TCP_O_UDP } +#endif } #endif #if defined(INET6) && defined(INET) @@ -8376,19 +8349,24 @@ #endif #ifdef INET { +#ifdef NETFLIX_TCP_O_UDP if (tp->t_port) { m->m_pkthdr.csum_flags = CSUM_UDP; m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum); udp->uh_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP)); th->th_sum = htons(0); + UDPSTAT_INC(udps_opackets); } else { +#endif m->m_pkthdr.csum_flags = CSUM_TCP; m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum); th->th_sum = in_pseudo(ip->ip_src.s_addr, ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) + IPPROTO_TCP + len + optlen)); +#ifdef NETFLIX_TCP_O_UDP } +#endif /* IP version 
must be set here for ipv4/ipv6 checking later */ KASSERT(ip->ip_v == IPVERSION, ("%s: IP version incorrect: %d", __func__, ip->ip_v)); @@ -8559,6 +8537,10 @@ * retransmit. In persist state, just set snd_max. */ if (error == 0) { +/* if (TCPS_HAVEESTABLISHED(tp->t_state) && + (tp->t_flags & TF_SACK_PERMIT) && + tp->rcv_numsacks > 0) + tcp_clean_dsack_blocks(tp);*/ if (len == 0) counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1); else if (len == 1) { @@ -8574,15 +8556,19 @@ } } if (sub_from_prr && (error == 0)) { - rack->r_ctl.rc_prr_sndcnt -= len; + if (rack->r_ctl.rc_prr_sndcnt >= len) + rack->r_ctl.rc_prr_sndcnt -= len; + else + rack->r_ctl.rc_prr_sndcnt = 0; } sub_from_prr = 0; rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts, pass, rsm); if ((tp->t_flags & TF_FORCEDATA) == 0 || (rack->rc_in_persist == 0)) { +#ifdef NETFLIX_STATS tcp_seq startseq = tp->snd_nxt; - +#endif /* * Advance snd_nxt over sequence space of this segment. */ @@ -8613,17 +8599,6 @@ tp->t_acktime = ticks; } tp->snd_max = tp->snd_nxt; - /* - * Time this transmission if not a retransmission and - * not currently timing anything. - * This is only relevant in case of switching back to - * the base stack. 
- */ - if (tp->t_rtttime == 0) { - tp->t_rtttime = ticks; - tp->t_rtseq = startseq; - TCPSTAT_INC(tcps_segstimed); - } #ifdef NETFLIX_STATS if (!(tp->t_flags & TF_GPUTINPROG) && len) { tp->t_flags |= TF_GPUTINPROG; @@ -8996,9 +8971,7 @@ return (tcp_default_ctloutput(so, sopt, inp, tp)); break; } -#ifdef NETFLIX_STATS - tcp_log_socket_option(tp, sopt->sopt_name, optval, error); -#endif +/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/ INP_WUNLOCK(inp); return (error); } @@ -9131,7 +9104,6 @@ .tfb_tcp_block_name = __XSTRING(STACKNAME), .tfb_tcp_output = rack_output, .tfb_tcp_do_segment = rack_do_segment, - .tfb_tcp_hpts_do_segment = rack_hpts_do_segment, .tfb_tcp_ctloutput = rack_ctloutput, .tfb_tcp_fb_init = rack_init, .tfb_tcp_fb_fini = rack_fini, @@ -9241,4 +9213,3 @@ MODULE_VERSION(MODNAME, 1); DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY); -MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1); Index: head/sys/netinet/tcp_stacks/rack_bbr_common.h =================================================================== --- head/sys/netinet/tcp_stacks/rack_bbr_common.h +++ head/sys/netinet/tcp_stacks/rack_bbr_common.h @@ -38,17 +38,8 @@ #define TCP_MSS_ACCT_SIZE 70 #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) +#define DUP_ACK_THRESHOLD 3 -/* Magic flags to tell whats cooking on the pacing wheel */ -#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ -#define PACE_TMR_RACK 0x02 /* RACK timer running */ -#define PACE_TMR_TLP 0x04 /* TLP timer running */ -#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ -#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ -#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ -#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ -#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) - /* Magic flags for tracing progress events */ #define PROGRESS_DROP 1 #define PROGRESS_UPDATE 2 @@ -61,8 
+52,66 @@ #define USE_RTT_LOW 1 #define USE_RTT_AVG 2 +#define PACE_MAX_IP_BYTES 65536 +#define USECS_IN_SECOND 1000000 +#define MSEC_IN_SECOND 1000 +#define MS_IN_USEC 1000 +#define USEC_TO_MSEC(x) ((x) / MS_IN_USEC) +#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */ + #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); +#ifdef KERN_TLS +uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd); +#endif +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, + struct mbuf *m, int has_pkt); +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt); +uint32_t ctf_outstanding(struct tcpcb *tp); +uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, + int32_t * drop_hdrlen, int32_t * ret_val); +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val); +void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t rstreason, int32_t tlen); +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp); + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp); + +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ret_val); + +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); + +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp); + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen); + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp); + +void +ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks); + +uint32_t +ctf_decay_count(uint32_t count, uint32_t decay_percentage); + #endif #endif
Index: head/sys/netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- head/sys/netinet/tcp_stacks/rack_bbr_common.c +++ head/sys/netinet/tcp_stacks/rack_bbr_common.c @@ -0,0 +1,859 @@ +/*- + * Copyright (c) 2016-2018 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Author: Randall Stewart + * This work is based on the ACM Queue paper + * BBR - Congestion Based Congestion Control + * and also numerous discussions with Neal, Yuchung and Van. 
+ */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +/*#include "opt_kern_tls.h"*/ +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif +#include + +#include +#include +#include + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif +#include "rack_bbr_common.h" + +/* + * Common TCP Functions - These are shared by borth + * rack and BBR. + */ + + +#ifdef KERN_TLS +uint32_t +ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd) +{ + struct sbtls_info *tls; + uint32_t len; + +again: + tls = so->so_snd.sb_tls_info; + len = tls->sb_params.sb_maxlen; /* max tls payload */ + len += tls->sb_params.sb_tls_hlen; /* tls header len */ + len += tls->sb_params.sb_tls_tlen; /* tls trailer len */ + if ((len * 4) > rwnd) { + /* + * Stroke this will suck counter and what + * else should we do Drew? From the + * TCP perspective I am not sure + * what should be done... 
+		 */
+		if (tls->sb_params.sb_maxlen > 4096) {
+			tls->sb_params.sb_maxlen -= 4096;
+			if (tls->sb_params.sb_maxlen < 4096)
+				tls->sb_params.sb_maxlen = 4096;
+			goto again;
+		}
+	}
+	return (len);
+}
+#endif
+
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+	/*
+	 * We are passed a raw chain of mbuf packets that arrived in
+	 * LRO, linked via m_nextpkt in the pkt-headers. For each one we:
+	 * a) save off the next packet,
+	 * b) strip off the ether-header,
+	 * c) verify the TCP checksum and the TCP header offset,
+	 * d) call tfb_do_segment_nounlock with the time adjusted to
+	 *    match the arrival time.
+	 * Note that the LRO code assures no IP options are present.
+	 *
+	 * The semantics for calling tfb_do_segment_nounlock are:
+	 * 1) It returns 0 if all went well and you (the caller) need
+	 *    to release the lock.
+	 * 2) If nxt_pkt is set, then the function will suppress calls
+	 *    to tfb_tcp_output() since you are promising to call again
+	 *    with another packet.
+	 * 3) If it returns non-zero the tcb has been destroyed (or is
+	 *    about to be); we free the packets not yet processed and
+	 *    hand that value back to our caller.
+	 */
+	struct mbuf *m_save;
+	struct ether_header *eh;
+	struct epoch_tracker et;
+	struct tcphdr *th;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
+#endif
+#ifdef INET
+	struct ip *ip = NULL;		/* Keep compiler happy. */
+#endif
+	struct ifnet *ifp;
+	struct timeval tv;
+	int32_t retval = 0, nxt_pkt, tlen, off;	/* 0: nothing failed yet */
+	uint16_t etype;
+	uint16_t drop_hdrlen;
+	uint8_t iptos, no_vn=0, bpf_req=0;
+
+	/*
+	 * This is a bit deceptive, we get the
+	 * "info epoch" which is really the network
+	 * epoch. This covers us on both any INP
+	 * type change but also if the ifp goes
+	 * away it covers us as well.
+	 */
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+	if (m && m->m_pkthdr.rcvif)
+		ifp = m->m_pkthdr.rcvif;
+	else
+		ifp = NULL;
+	if (ifp) {
+		bpf_req = bpf_peers_present(ifp->if_bpf);
+	} else {
+		/*
+		 * We probably should not work around
+		 * but kassert, since lro always sets rcvif.
+		 */
+		no_vn = 1;
+		goto skip_vnet;
+	}
+	CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+	while (m) {
+		m_save = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		/* Now lets get the ether header */
+		eh = mtod(m, struct ether_header *);
+		etype = ntohs(eh->ether_type);
+		/* Let the BPF see the packet */
+		if (bpf_req && ifp)
+			ETHER_BPF_MTAP(ifp, m);
+		/* Trim off the ethernet header; eh is not used past here */
+		m_adj(m, sizeof(*eh));
+		switch (etype) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+		{
+			if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+				m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+				if (m == NULL) {
+					TCPSTAT_INC(tcps_rcvshort);
+					/* m_pullup() already freed the chain. */
+					goto skipped_pkt;
+				}
+			}
+			ip6 = mtod(m, struct ip6_hdr *);	/* pullup may move data */
+			th = (struct tcphdr *)(ip6 + 1);
+			tlen = ntohs(ip6->ip6_plen);
+			drop_hdrlen = sizeof(*ip6);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in6_cksum_pseudo(ip6, tlen,
+					    IPPROTO_TCP, m->m_pkthdr.csum_data);
+				th->th_sum ^= 0xffff;
+			} else
+				th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			/*
+			 * Be proactive about unspecified IPv6 address in source.
+			 * As we use all-zero to indicate unbounded/unconnected pcb,
+			 * unspecified IPv6 address can be used to confuse us.
+			 *
+			 * Note that packets with unspecified IPv6 destination is
+			 * already dropped in ip6_input.
+			 */
+			if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+				/* XXX stat */
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+			break;
+		}
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+		{
+			if (m->m_len < sizeof (struct tcpiphdr)) {
+				if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+				    == NULL) {
+					TCPSTAT_INC(tcps_rcvshort);
+					/* m_pullup() already freed the chain. */
+					goto skipped_pkt;
+				}
+			}
+			ip = mtod(m, struct ip *);	/* pullup may move data */
+			th = (struct tcphdr *)(ip + 1);
+			drop_hdrlen = sizeof(*ip);
+			iptos = ip->ip_tos;
+			tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in_pseudo(ip->ip_src.s_addr,
+					    ip->ip_dst.s_addr,
+					    htonl(m->m_pkthdr.csum_data + tlen +
+					    IPPROTO_TCP));
+				th->th_sum ^= 0xffff;
+			} else {
+				int len;
+				struct ipovly *ipov = (struct ipovly *)ip;
+				/*
+				 * Checksum extended TCP header and data.
+				 */
+				len = drop_hdrlen + tlen;
+				bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+				ipov->ih_len = htons(tlen);
+				th->th_sum = in_cksum(m, len);
+				/* Reset length for SDT probes. */
+				ip->ip_len = htons(len);
+				/* Reset TOS bits */
+				ip->ip_tos = iptos;
+				/* Re-initialization for later version check */
+				ip->ip_v = IPVERSION;
+				ip->ip_hl = sizeof(*ip) >> 2;
+			}
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			break;
+		}
+#endif
+		default:
+			/* Neither IPv4 nor IPv6: th was never set, drop it. */
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		/*
+		 * Convert TCP protocol specific fields to host format.
+		 */
+		tcp_fields_to_host(th);
+
+		off = th->th_off << 2;
+		if (off < sizeof (struct tcphdr) || off > tlen) {
+			TCPSTAT_INC(tcps_rcvbadoff);
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		tlen -= off;
+		drop_hdrlen += off;
+		/*
+		 * Now lets setup the timeval to be when we should
+		 * have been called (if we can).
+		 */
+		m->m_pkthdr.lro_nsegs = 1;
+		if (m->m_flags & M_TSTMP_LRO) {
+			tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+			tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+		} else {
+			/* Should not be should we kassert instead? */
+			tcp_get_usecs(&tv);
+		}
+		/* Now what about next packet? */
+		if (m_save || has_pkt)
+			nxt_pkt = 1;
+		else
+			nxt_pkt = 0;
+		retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+		    iptos, nxt_pkt, &tv);
+		if (retval) {
+			/* We lost the lock and tcb probably */
+			m = m_save;
+			while (m) {
+				m_save = m->m_nextpkt;
+				m->m_nextpkt = NULL;
+				m_freem(m);
+				m = m_save;
+			}
+			if (no_vn == 0)
+				CURVNET_RESTORE();
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+			return (retval);
+		}
+skipped_pkt:
+		m = m_save;
+	}
+	if (no_vn == 0)
+		CURVNET_RESTORE();
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+	return (retval);
+}
+
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+	struct mbuf *m;
+
+	/* First lets see if we have old packets */
+	if (tp->t_in_pkt) {
+		m = tp->t_in_pkt;
+		tp->t_in_pkt = NULL;
+		tp->t_tail_pkt = NULL;
+		if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+			/* We lost the tcpcb (maybe a RST came in)? */
+			return (1);
+		}
+	}
+	return (0);
+}
+
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+	return (tp->snd_max - tp->snd_una);
+}
+
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+	if (rc_sacked <= ctf_outstanding(tp))
+		return (ctf_outstanding(tp) - rc_sacked);
+	else {
+		/* TSNH */
+#ifdef INVARIANTS
+		panic("tp:%p rc_sacked:%u > out:%u",
+		    tp, rc_sacked, ctf_outstanding(tp));
+#endif
+		return (0);
+	}
+}
+
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen)
+{
+	if (tp != NULL) {
+		tcp_dropwithreset(m, th, tp, tlen, rstreason);
+		INP_WUNLOCK(tp->t_inpcb);
+	} else
+		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks returns 1 for you should not proceed.
It places + * in ret_val what should be returned 1/0 by the caller. The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val) +{ + int32_t todrop; + int32_t thflags; + int32_t tlen; + + thflags = *thf; + tlen = *tlenp; + todrop = tp->rcv_nxt - th->th_seq; + if (todrop > 0) { + if (thflags & TH_SYN) { + thflags &= ~TH_SYN; + th->th_seq++; + if (th->th_urp > 1) + th->th_urp--; + else + thflags &= ~TH_URG; + todrop--; + } + /* + * Following if statement from Stevens, vol. 2, p. 960. + */ + if (todrop > tlen + || (todrop == tlen && (thflags & TH_FIN) == 0)) { + /* + * Any valid FIN must be to the left of the window. + * At this point the FIN must be a duplicate or out + * of sequence; drop it. + */ + thflags &= ~TH_FIN; + /* + * Send an ACK to resynchronize and drop any data. + * But keep on processing for RST or ACK. + */ + tp->t_flags |= TF_ACKNOW; + todrop = tlen; + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, todrop); + } else { + TCPSTAT_INC(tcps_rcvpartduppack); + TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop); + } + /* + * DSACK - add SACK block for dropped range + */ + if (tp->t_flags & TF_SACK_PERMIT) { + tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen); + /* + * ACK now, as the next in-sequence segment + * will clear the DSACK block again + */ + tp->t_flags |= TF_ACKNOW; + } + *drop_hdrlen += todrop; /* drop from the top afterwards */ + th->th_seq += todrop; + tlen -= todrop; + if (th->th_urp > todrop) + th->th_urp -= todrop; + else { + thflags &= ~TH_URG; + th->th_urp = 0; + } + } + /* + * If segment ends after window, drop trailing data (and PUSH and + * FIN); if nothing left, just ACK. 
+ */ + todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd); + if (todrop > 0) { + TCPSTAT_INC(tcps_rcvpackafterwin); + if (todrop >= tlen) { + TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen); + /* + * If the window is closed we can only take segments at + * the window edge, and have to drop data and PUSH from + * incoming segments. Continue processing, but + * remember to ack. Otherwise, drop the segment and + * ack. + */ + if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) { + tp->t_flags |= TF_ACKNOW; + TCPSTAT_INC(tcps_rcvwinprobe); + } else { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + return (1); + } + } else + TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop); + m_adj(m, -todrop); /* drop the trailing todrop bytes */ + tlen -= todrop; + thflags &= ~(TH_PUSH | TH_FIN); + } + *thf = thflags; /* hand the adjusted flags and length back */ + *tlenp = tlen; + return (0); +} + +/* + * Drop the incoming segment and arrange for an ACK (TF_ACKNOW), or answer + * with a reset via ctf_do_dropwithreset() when we are in SYN_RECEIVED + * and the segment's ACK is outside [snd_una, snd_max]. + * + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val) +{ + /* + * Generate an ACK dropping incoming segment if it occupies sequence + * space, where the ACK reflects our state. + * + * We can now skip the test for the RST flag since all paths to this + * code happen after packets containing RST have been dropped. + * + * In the SYN-RECEIVED state, don't send an ACK unless the segment + * we received passes the SYN-RECEIVED ACK test. If it fails send a + * RST. This breaks the loop in the "LAND" DoS attack, and also + * prevents an ACK storm between two listening ports that have been + * sent forged SYN segments, each with the source address of the + * other.
+ */ + if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) && + (SEQ_GT(tp->snd_una, th->th_ack) || + SEQ_GT(th->th_ack, tp->snd_max))) { + *ret_val = 1; /* ctf_do_dropwithreset() unlocks the inpcb */ + ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen); + return; + } else + *ret_val = 0; + tp->t_flags |= TF_ACKNOW; + if (m) + m_freem(m); +} +/* + * INP_WUNLOCK the inpcb of tp (when tp is non-NULL) and free the mbuf + * (when m is non-NULL). + */ +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp) +{ + + /* + * Drop space held by incoming segment and return. + */ + if (tp != NULL) + INP_WUNLOCK(tp->t_inpcb); + if (m) + m_freem(m); +} +/* + * Process a received RST segment. If its sequence number is acceptable + * the connection is closed with tcp_close() and 1 is returned; otherwise + * 0 is returned after either sending a challenge ACK or just freeing + * the segment. + */ +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp) +{ + /* + * RFC5961 Section 3.2 + * + * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in + * window, we send challenge ACK. + * + * Note: to take into account delayed ACKs, we should test against + * last_ack_sent instead of rcv_nxt. Note 2: we handle special case + * of closed window, not covered by the RFC. + */ + int dropped = 0; + + if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) || + (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) { + + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + KASSERT(tp->t_state != TCPS_SYN_SENT, + ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p", + __func__, th, tp)); + + if (V_tcp_insecure_rst || + (tp->last_ack_sent == th->th_seq) || + (tp->rcv_nxt == th->th_seq) || + ((tp->last_ack_sent - 1) == th->th_seq)) { + TCPSTAT_INC(tcps_drops); + /* Drop the connection. */ + switch (tp->t_state) { + case TCPS_SYN_RECEIVED: + so->so_error = ECONNREFUSED; + goto close; + case TCPS_ESTABLISHED: + case TCPS_FIN_WAIT_1: + case TCPS_FIN_WAIT_2: + case TCPS_CLOSE_WAIT: + case TCPS_CLOSING: + case TCPS_LAST_ACK: + so->so_error = ECONNRESET; + close: + tcp_state_change(tp, TCPS_CLOSED); + /* FALLTHROUGH */ + default: + tp = tcp_close(tp); + } + dropped = 1; + ctf_do_drop(m, tp); /* tp is whatever tcp_close() returned */ + } else { + TCPSTAT_INC(tcps_badrst); + /* Send challenge ACK.
*/ + tcp_respond(tp, mtod(m, void *), th, m, + tp->rcv_nxt, tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + } + } else { /* sequence number not acceptable: just free the segment */ + m_freem(m); + } + return (dropped); +} + +/* + * Count a bad SYN and either drop the connection with ECONNRESET (only + * when V_tcp_insecure_syn is set and th_seq lies in + * [last_ack_sent, last_ack_sent + rcv_wnd)) or send a challenge ACK. + * + * The value in ret_val informs the caller + * if we dropped the tcb (and lock) or not. + * 1 = we dropped it, 0 = the TCB is still locked + * and valid. + */ +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val) +{ + INP_INFO_RLOCK_ASSERT(&V_tcbinfo); + + TCPSTAT_INC(tcps_badsyn); + if (V_tcp_insecure_syn && + SEQ_GEQ(th->th_seq, tp->last_ack_sent) && + SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) { + tp = tcp_drop(tp, ECONNRESET); + *ret_val = 1; + ctf_do_drop(m, tp); + } else { + /* Send challenge ACK. */ + tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt, + tp->snd_nxt, TH_ACK); + tp->last_ack_sent = tp->rcv_nxt; + m = NULL; /* m was handed to tcp_respond() above */ + *ret_val = 0; + ctf_do_drop(m, NULL); /* nothing to free or unlock here */ + } +} + +/* + * ctf_ts_check returns 1 when the caller should not proceed and the state + * machine should return. It places in ret_val what should + * be returned 1/0 by the caller (hpts_do_segment). The 1 indicates + * that the TCB is unlocked and probably dropped. The 0 indicates the + * TCB is still valid and locked. + */ +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, + int32_t tlen, int32_t thflags, int32_t * ret_val) +{ + + if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) { + /* + * Invalidate ts_recent. If this segment updates ts_recent, + * the age will be reset later and ts_recent will get a + * valid value. If it does not, setting ts_recent to zero + * will at least satisfy the requirement that zero be placed + * in the timestamp echo reply when ts_recent isn't valid. + * The age isn't reset until we get a valid ts_recent + * because we don't want out-of-order segments to be dropped + * when ts_recent is old.
+ */ + tp->ts_recent = 0; + } else { /* ts_recent is still fresh: count a PAWS drop and discard the segment */ + TCPSTAT_INC(tcps_rcvduppack); + TCPSTAT_ADD(tcps_rcvdupbyte, tlen); + TCPSTAT_INC(tcps_pawsdrop); + *ret_val = 0; + if (tlen) { + ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val); + } else { + ctf_do_drop(m, NULL); /* frees m only; the inpcb stays locked */ + } + return (1); + } + return (0); +} +/* + * Set tp->rcv_wnd from the space available in the socket receive buffer. + */ +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp) +{ + int32_t win; + + /* + * Calculate amount of space in receive window. + * Receive window is amount of space in rcv queue, + * but not less than advertised window. + */ + win = sbspace(&so->so_rcv); + if (win < 0) + win = 0; + tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt)); +} +/* + * Mark the inpcb to be dropped with ETIMEDOUT, pass the segment to + * tcp_dropwithreset() and INP_WUNLOCK the inpcb. + * NOTE(review): t_inpcb is NULL-checked before tcp_set_inp_to_drop() but + * is passed to INP_WUNLOCK unconditionally -- confirm it can never be NULL. + */ +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen) +{ + + if (tp->t_inpcb) { + tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT); + } + tcp_dropwithreset(m, th, tp, tlen, rstreason); + INP_WUNLOCK(tp->t_inpcb); +} +/* + * Return t_maxseg less the length of the TCP options we expect to send on + * every segment (t_maxseg itself when TF_NOOPT is set). + */ +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp) +{ + int optlen; + + if (tp->t_flags & TF_NOOPT) + return (tp->t_maxseg); + + /* + * Here we have a simplified code from tcp_addoptions(), + * without a proper loop, and having most of paddings hardcoded. + * We only consider fixed options that we would send every + * time I.e. SACK is not considered.
+ * + */ +#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4) /* round len up to a multiple of 4 */ + if (TCPS_HAVEESTABLISHED(tp->t_state)) { /* established: timestamp and signature options only */ + if (tp->t_flags & TF_RCVD_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = 0; +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + } else { /* not yet established: presumably the option set sent with a SYN */ + if (tp->t_flags & TF_REQ_TSTMP) + optlen = TCPOLEN_TSTAMP_APPA; + else + optlen = PAD(TCPOLEN_MAXSEG); + if (tp->t_flags & TF_REQ_SCALE) + optlen += PAD(TCPOLEN_WINDOW); +#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE) + if (tp->t_flags & TF_SIGNATURE) + optlen += PAD(TCPOLEN_SIGNATURE); +#endif + if (tp->t_flags & TF_SACK_PERMIT) + optlen += PAD(TCPOLEN_SACK_PERMITTED); + } +#undef PAD + optlen = min(optlen, TCP_MAXOLEN); /* never account for more than TCP_MAXOLEN */ + return (tp->t_maxseg - optlen); +} +/* + * When logging is enabled for this connection, emit a + * TCP_SACK_FILTER_RES log event carrying the number of SACK blocks and + * the start/end of up to the first four of them. + */ +void +ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks) +{ + if (tp->t_logstate != TCP_LOG_STATE_OFF) { + union tcp_log_stackspecific log; + struct timeval tv; + + memset(&log, 0, sizeof(log)); + log.u_bbr.timeStamp = tcp_get_usecs(&tv); + log.u_bbr.flex8 = num_sack_blks; + if (num_sack_blks > 0) { + log.u_bbr.flex1 = sack_blocks[0].start; + log.u_bbr.flex2 = sack_blocks[0].end; + } + if (num_sack_blks > 1) { + log.u_bbr.flex3 = sack_blocks[1].start; + log.u_bbr.flex4 = sack_blocks[1].end; + } + if (num_sack_blks > 2) { + log.u_bbr.flex5 = sack_blocks[2].start; + log.u_bbr.flex6 = sack_blocks[2].end; + } + if (num_sack_blks > 3) { /* the fourth block reuses the applimited and pkts_out fields */ + log.u_bbr.applimited = sack_blocks[3].start; + log.u_bbr.pkts_out = sack_blocks[3].end; + } + TCP_LOG_EVENTP(tp, NULL, + &tp->t_inpcb->inp_socket->so_rcv, + &tp->t_inpcb->inp_socket->so_snd, + TCP_SACK_FILTER_RES, 0, + 0, &log, false, &tv); + } +} +/* + * Return count reduced by decay thousandths of itself (decay = 1000 means + * 100%); count is returned unchanged when decay is greater than 1000. + */ +uint32_t +ctf_decay_count(uint32_t count, uint32_t decay) +{ + /* + * Given a count, decay it by a set percentage. The + * percentage is in thousands i.e. 100% = 1000, + * 19.3% = 193.
+ */ + uint64_t perc_count, decay_per; + uint32_t decayed_count; + if (decay > 1000) { + /* More than 100%: leave count alone (the subtraction below would wrap) */ + return (count); + } + perc_count = count; + decay_per = decay; + perc_count *= decay_per; /* 64-bit so count * decay cannot overflow */ + perc_count /= 1000; + /* + * perc_count now holds the amount + * to subtract from count. + */ + decayed_count = count - (uint32_t)perc_count; + return (decayed_count); +} Index: head/sys/netinet/tcp_var.h =================================================================== --- head/sys/netinet/tcp_var.h +++ head/sys/netinet/tcp_var.h @@ -102,7 +102,8 @@ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ - bits_spare : 4; + t_fin_is_rst: 1, /* Are FINs treated as resets */ + bits_spare : 3; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -271,6 +272,11 @@ void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); + int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); + int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, Index: head/sys/sys/mbuf.h =================================================================== --- head/sys/sys/mbuf.h +++ head/sys/sys/mbuf.h @@ -407,6 +407,7 @@ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ +#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */