Index: modules/tcp/rack/Makefile =================================================================== --- modules/tcp/rack/Makefile +++ modules/tcp/rack/Makefile @@ -6,7 +6,7 @@ STACKNAME= rack KMOD= tcp_${STACKNAME} -SRCS= rack.c sack_filter.c +SRCS= rack.c sack_filter.c rack_bbr_common.c SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h SRCS+= opt_tcpdebug.h Index: netinet/in_pcb.h =================================================================== --- netinet/in_pcb.h +++ netinet/in_pcb.h @@ -759,7 +759,9 @@ #define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */ #define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */ #define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */ - +#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */ +#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */ +#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */ /* * Flags passed to in_pcblookup*() functions. */ Index: netinet/tcp.h =================================================================== --- netinet/tcp.h +++ netinet/tcp.h @@ -201,9 +201,8 @@ #define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */ #define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. 
rack-rtt + reord + N */ #define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */ -#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */ #define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */ -#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */ +#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */ #define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */ #define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */ #define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */ @@ -211,14 +210,18 @@ #define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */ #define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */ #define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */ -#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */ -#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */ -#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */ +#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */ +#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */ +#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */ +#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */ +#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */ +#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */ #define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */ #define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */ #define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */ #define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */ -#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */ +#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm 
>= 2.3 */ +#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */ #define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */ #define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */ #define TCP_BBR_PACE_PER_SEC 1086 @@ -227,11 +230,12 @@ #define TCP_BBR_PACE_SEG_MIN 1089 #define TCP_BBR_PACE_CROSS 1090 #define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ -#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */ #define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */ #define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */ +#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */ #define TCP_RACK_TLP_USE 1095 #define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */ +#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */ #define TCP_BBR_EXTRA_GAIN 1097 #define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */ #define TCP_BBR_RETRAN_WTSO 1099 @@ -238,6 +242,15 @@ #define TCP_DATA_AFTER_CLOSE 1100 #define TCP_BBR_PROBE_RTT_GAIN 1101 #define TCP_BBR_PROBE_RTT_LEN 1102 +#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */ +#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */ +#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */ +#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */ +#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */ +#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */ +#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */ +#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */ +#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */ /* Start of reserved space for third-party user-settable options. 
*/ Index: netinet/tcp_hpts.h =================================================================== --- netinet/tcp_hpts.h +++ netinet/tcp_hpts.h @@ -45,107 +45,70 @@ /* Number of useconds in a hpts tick */ #define HPTS_TICKS_PER_USEC 10 -#define HPTS_MS_TO_SLOTS(x) (x * 100) +#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1) #define HPTS_USEC_TO_SLOTS(x) ((x+9) /10) #define HPTS_USEC_IN_SEC 1000000 #define HPTS_MSEC_IN_SEC 1000 #define HPTS_USEC_IN_MSEC 1000 -#define DEFAULT_HPTS_LOG 3072 -/* - * Log flags consist of - * 7f 7f 1 1 bits - * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE - * - * So for example cpu 10, number 10 would with - * input active would show up as: - * p_flags = 0001010 0001010 1 0 - * - * p_flags = 0x142a - */ -#define HPTS_HPTS_ACTIVE 0x01 -#define HPTS_INPUT_ACTIVE 0x02 - -#define HPTSLOG_IMMEDIATE 1 -#define HPTSLOG_INSERT_NORMAL 2 -#define HPTSLOG_INSERT_SLEEPER 3 -#define HPTSLOG_SLEEP_AFTER 4 -#define HPTSLOG_SLEEP_BEFORE 5 -#define HPTSLOG_INSERTED 6 -#define HPTSLOG_WAKEUP_HPTS 7 -#define HPTSLOG_SETTORUN 8 -#define HPTSLOG_HPTSI 9 -#define HPTSLOG_TOLONG 10 -#define HPTSLOG_AWAKENS 11 -#define HPTSLOG_TIMESOUT 12 -#define HPTSLOG_SLEEPSET 13 -#define HPTSLOG_WAKEUP_INPUT 14 -#define HPTSLOG_RESCHEDULE 15 -#define HPTSLOG_AWAKE 16 -#define HPTSLOG_INP_DONE 17 - -struct hpts_log { - struct inpcb *inp; - int32_t event; - uint32_t cts; - int32_t line; - uint32_t ticknow; - uint32_t t_paceslot; - uint32_t t_hptsreq; - uint32_t p_curtick; - uint32_t p_prevtick; - uint32_t slot_req; - uint32_t p_on_queue_cnt; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t p_hpts_sleep_time; - uint16_t p_flags; - uint8_t p_onhpts; - uint8_t p_oninput; - uint8_t is_notempty; -}; - struct hpts_diag { - uint32_t p_hpts_active; - uint32_t p_nxt_slot; - uint32_t p_cur_slot; - uint32_t slot_req; - uint32_t inp_hptsslot; - uint32_t slot_now; - uint32_t have_slept; - uint32_t hpts_sleep_time; - uint32_t yet_to_sleep; - uint32_t need_new_to; - int32_t co_ret; - 
uint8_t p_on_min_sleep; + uint32_t p_hpts_active; /* bbr->flex7 x */ + uint32_t p_nxt_slot; /* bbr->flex1 x */ + uint32_t p_cur_slot; /* bbr->flex2 x */ + uint32_t p_prev_slot; /* bbr->delivered */ + uint32_t p_runningtick; /* bbr->inflight */ + uint32_t slot_req; /* bbr->flex3 x */ + uint32_t inp_hptsslot; /* bbr->flex4 x */ + uint32_t slot_remaining; /* bbr->flex5 x */ + uint32_t have_slept; /* bbr->epoch x */ + uint32_t hpts_sleep_time; /* bbr->applimited x */ + uint32_t yet_to_sleep; /* bbr->lt_epoch x */ + uint32_t need_new_to; /* bbr->flex6 x */ + uint32_t wheel_tick; /* bbr->bw_inuse x */ + uint32_t maxticks; /* bbr->delRate x */ + uint32_t wheel_cts; /* bbr->rttProp x */ + int32_t co_ret; /* bbr->pkts_out x */ + uint32_t p_curtick; /* upper bbr->cur_del_rate */ + uint32_t p_lasttick; /* lower bbr->cur_del_rate */ + uint8_t p_on_min_sleep; /* bbr->flex8 x */ }; +/* Magic flags to tell whats cooking on the pacing wheel */ +#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */ +#define PACE_TMR_RACK 0x02 /* RACK timer running */ +#define PACE_TMR_TLP 0x04 /* TLP timer running */ +#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ +#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ +#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ +#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */ +#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) + #ifdef _KERNEL /* Each hpts has its own p_mtx which is used for locking */ struct tcp_hpts_entry { /* Cache line 0x00 */ struct mtx p_mtx; /* Mutex for hpts */ - uint32_t p_hpts_active; /* Flag that says hpts is awake */ - uint32_t p_curtick; /* Current tick in 10 us the hpts is at */ - uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */ + uint16_t p_hpts_active; /* Flag that says hpts is awake */ + uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? 
*/ + uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */ + uint32_t p_curtick; /* Tick in 10 us the hpts is going to */ + uint32_t p_runningtick; /* Current tick we are at if we are running */ + uint32_t p_prev_slot; /* Previous slot we were on */ uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */ uint32_t p_nxt_slot; /* The next slot outside the current range of * slots that the hpts is running on. */ int32_t p_on_queue_cnt; /* Count on queue in this hpts */ - uint32_t enobuf_cnt; - uint16_t p_log_at; + uint32_t p_lasttick; /* Last tick before the current one */ uint8_t p_direct_wake :1, /* boolean */ - p_log_wrapped :1, /* boolean */ - p_on_min_sleep:1; /* boolean */ - uint8_t p_fill; + p_on_min_sleep:1, /* boolean */ + p_avail:6; + uint8_t p_fill[3]; /* Fill to 32 bits */ /* Cache line 0x40 */ void *p_inp; struct hptsh p_input; /* For the tcp-input runner */ /* Hptsi wheel */ struct hptsh *p_hptss; - struct hpts_log *p_log; - uint32_t p_logsize; int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */ uint32_t hit_no_enobuf; uint32_t p_dyn_adjust; @@ -236,13 +199,9 @@ int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line); #define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__); -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos); int -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line); -#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__) +__tcp_queue_to_input(struct inpcb *inp, int32_t line); +#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__) uint16_t tcp_hpts_delayedby(struct inpcb *inp); Index: netinet/tcp_hpts.c =================================================================== --- netinet/tcp_hpts.c +++ 
netinet/tcp_hpts.c @@ -37,7 +37,7 @@ * pacing packets out onto the wire. It can be used in two ways * by a given TCP stack (and those two methods can be used simultaneously). * - * First, and probably the main thing its used by Rack and BBR for, it can + * First, and probably the main thing its used by Rack and BBR, it can * be used to call tcp_output() of a transport stack at some time in the future. * The normal way this is done is that tcp_output() of the stack schedules * itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The @@ -59,42 +59,57 @@ * to prevent output processing until the time alotted has gone by. * Of course this is a bare bones example and the stack will probably * have more consideration then just the above. - * - * Now the tcp_hpts system will call tcp_output in one of two forms, - * it will first check to see if the stack as defined a - * tfb_tcp_output_wtime() function, if so that is the routine it - * will call, if that function is not defined then it will call the - * tfb_tcp_output() function. The only difference between these - * two calls is that the former passes the time in to the function - * so the function does not have to access the time (which tcp_hpts - * already has). What these functions do is of course totally up - * to the individual tcp stack. - * + * * Now the second function (actually two functions I guess :D) * the tcp_hpts system provides is the ability to either abort - * a connection (later) or process input on a connection. - * Why would you want to do this? To keep processor locality. + * a connection (later) or process input on a connection. + * Why would you want to do this? To keep processor locality + * and or not have to worry about untangling any recursive + * locks. The input function now is hooked to the new LRO + * system as well. 
* - * So in order to use the input redirection function the - * stack changes its tcp_do_segment() routine to instead - * of process the data call the function: + * In order to use the input redirection function the + * tcp stack must define an input function for + * tfb_do_queued_segments(). This function understands + * how to dequeue an array of packets that were input and + * knows how to call the correct processing routine. * - * tcp_queue_pkt_to_input() + * Locking in this is important as well so most likely the + * stack will need to define the tfb_do_segment_nounlock() + * splitting tfb_do_segment() into two parts. The main processing + * part that does not unlock the INP and returns a value of 1 or 0. + * It returns 0 if all is well and the lock was not released. It + * returns 1 if we had to destroy the TCB (a reset received etc). + * The remains of tfb_do_segment() then become just a simple call + * to the tfb_do_segment_nounlock() function and check the return + * code and possibly unlock. + * + * The stack must also set the flag on the INP that it supports this + * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes + * this flag as well and will queue packets when it is set. + * There are other flags as well INP_MBUF_QUEUE_READY and + * INP_DONT_SACK_QUEUE. The first flag tells the LRO code + * that we are in the pacer for output so there is no + * need to wake up the hpts system to get immediate + * input. The second tells the LRO code that it's okay + * if a SACK arrives you can still defer input and let + * the current hpts timer run (this is usually set when + * a rack timer is up so we know SACK's are happening + * on the connection already and don't want to wakeup yet). * - * You will note that the arguments to this function look - * a lot like tcp_do_segments's arguments. This function - * will assure that the tcp_hpts system will - * call the functions tfb_tcp_hpts_do_segment() from the - * correct CPU. 
Note that multiple calls can get pushed - * into the tcp_hpts system this will be indicated by - * the next to last argument to tfb_tcp_hpts_do_segment() - * (nxt_pkt). If nxt_pkt is a 1 then another packet is - * coming. If nxt_pkt is a 0 then this is the last call - * that the tcp_hpts system has available for the tcp stack. - * - * The other point of the input system is to be able to safely - * drop a tcp connection without worrying about the recursive - * locking that may be occuring on the INP_WLOCK. So if + * There is a common function within the rack_bbr_common code + * version i.e. ctf_do_queued_segments(). This function + * knows how to take the input queue of packets from + * tp->t_in_pkts and process them digging out + * all the arguments, calling any bpf tap and + * calling into tfb_do_segment_nounlock(). The common + * function (ctf_do_queued_segments()) requires that + * you have defined the tfb_do_segment_nounlock() as + * described above. + * + * The second feature of the input side of hpts is the + * dropping of a connection. This is due to the way that + * locking may have occurred on the INP_WLOCK. 
So if * a stack wants to drop a connection it calls: * * tcp_set_inp_to_drop(tp, ETIMEDOUT) @@ -168,8 +183,6 @@ MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts"); #ifdef RSS -#include -#include static int tcp_bind_threads = 1; #else static int tcp_bind_threads = 2; @@ -176,16 +189,12 @@ #endif TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads); -static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG; - -TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size); - static struct tcp_hptsi tcp_pace; static void tcp_wakehpts(struct tcp_hpts_entry *p); static void tcp_wakeinput(struct tcp_hpts_entry *p); static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv); -static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick); +static void tcp_hptsi(struct tcp_hpts_entry *hpts); static void tcp_hpts_thread(void *ctx); static void tcp_init_hptsi(void *st); @@ -204,8 +213,6 @@ } \ } while (0) -static int32_t logging_on = 0; -static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2); static int32_t tcp_hpts_precision = 120; struct hpts_domain_info { @@ -219,10 +226,6 @@ &tcp_hpts_precision, 120, "Value for PRE() precision of callout"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW, - &logging_on, 0, - "Turn on logging if compiled in"); - counter_u64_t hpts_loops; SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD, @@ -233,21 +236,16 @@ SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD, &back_tosleep, "Number of times hpts found no tcbs"); -static int32_t in_newts_every_tcb = 0; +counter_u64_t combined_wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW, - &in_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for input"); -static int32_t in_ts_percision = 0; +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD, + &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); 
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW, - &in_ts_percision, 0, - "Do we use percise timestamp for clients on input"); -static int32_t out_newts_every_tcb = 0; +counter_u64_t wheel_wrap; -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW, - &out_newts_every_tcb, 0, - "Do we have a new cts every tcb we process for output"); +SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD, + &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap"); + static int32_t out_ts_percision = 0; SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW, @@ -254,9 +252,33 @@ &out_ts_percision, 0, "Do we use a percise timestamp for every output cts"); -SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW, +#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2) + +static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED; + +static int +sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS) +{ + int error; + uint32_t new; + + new = hpts_sleep_max; + error = sysctl_handle_int(oidp, &new, 0, req); + if (error == 0 && req->newptr) { + if ((new < (NUM_OF_HPTSI_SLOTS/4)) || + (new > HPTS_MAX_SLEEP_ALLOWED)) + error = EINVAL; + else + hpts_sleep_max = new; + } + return (error); +} + +SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep, + CTLTYPE_UINT | CTLFLAG_RW, &hpts_sleep_max, 0, - "The maximum time the hpts will sleep <1 - 254>"); + &sysctl_net_inet_tcp_hpts_max_sleep, "IU", + "Maximum time hpts will sleep"); SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW, &tcp_min_hptsi_time, 0, @@ -267,56 +289,6 @@ "Do we have the callout call directly to the hpts?"); static void -__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot, - uint32_t ticknow, int32_t line) -{ - struct hpts_log *pl; - - HPTS_MTX_ASSERT(hpts); - if (hpts->p_log == NULL) - return; - pl = &hpts->p_log[hpts->p_log_at]; - hpts->p_log_at++; - if (hpts->p_log_at >= 
hpts->p_logsize) { - hpts->p_log_at = 0; - hpts->p_log_wrapped = 1; - } - pl->inp = inp; - if (inp) { - pl->t_paceslot = inp->inp_hptsslot; - pl->t_hptsreq = inp->inp_hpts_request; - pl->p_onhpts = inp->inp_in_hpts; - pl->p_oninput = inp->inp_in_input; - } else { - pl->t_paceslot = 0; - pl->t_hptsreq = 0; - pl->p_onhpts = 0; - pl->p_oninput = 0; - } - pl->is_notempty = 1; - pl->event = event; - pl->line = line; - pl->cts = tcp_get_usecs(NULL); - pl->p_curtick = hpts->p_curtick; - pl->p_prevtick = hpts->p_prevtick; - pl->p_on_queue_cnt = hpts->p_on_queue_cnt; - pl->ticknow = ticknow; - pl->slot_req = slot; - pl->p_nxt_slot = hpts->p_nxt_slot; - pl->p_cur_slot = hpts->p_cur_slot; - pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time; - pl->p_flags = (hpts->p_cpu & 0x7f); - pl->p_flags <<= 7; - pl->p_flags |= (hpts->p_num & 0x7f); - pl->p_flags <<= 2; - if (hpts->p_hpts_active) { - pl->p_flags |= HPTS_HPTS_ACTIVE; - } -} - -#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__) - -static void hpts_timeout_swi(void *arg) { struct tcp_hpts_entry *hpts; @@ -347,12 +319,6 @@ /* We are not on the hpts? */ panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp); } - if (TAILQ_EMPTY(head) && - (hpts->p_on_queue_cnt != 0)) { - /* We should not be empty with a queue count */ - panic("%s hpts:%p hpts bucket empty but cnt:%d", - __FUNCTION__, hpts, hpts->p_on_queue_cnt); - } #endif TAILQ_REMOVE(head, inp, inp_hpts); hpts->p_on_queue_cnt--; @@ -456,58 +422,13 @@ in_pcbref(inp); } -static int -sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS) -{ - struct tcp_hpts_entry *hpts; - size_t sz; - int32_t logging_was, i; - int32_t error = 0; - - /* - * HACK: Turn off logging so no locks are required this really needs - * a memory barrier :) - */ - logging_was = logging_on; - logging_on = 0; - if (!req->oldptr) { - /* How much? 
*/ - sz = 0; - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - sz += (sizeof(struct hpts_log) * hpts->p_logsize); - } - error = SYSCTL_OUT(req, 0, sz); - } else { - for (i = 0; i < tcp_pace.rp_num_hptss; i++) { - hpts = tcp_pace.rp_ent[i]; - if (hpts->p_log == NULL) - continue; - if (hpts->p_log_wrapped) - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - else - sz = (sizeof(struct hpts_log) * hpts->p_log_at); - error = SYSCTL_OUT(req, hpts->p_log, sz); - } - } - logging_on = logging_was; - return error; -} - -SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE, - 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log"); - - static void tcp_wakehpts(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -515,10 +436,9 @@ tcp_wakeinput(struct tcp_hpts_entry *hpts) { HPTS_MTX_ASSERT(hpts); - swi_sched(hpts->ie_cookie, 0); - if (hpts->p_hpts_active == 2) { - /* Rare sleeping on a ENOBUF */ - wakeup_one(hpts); + if (hpts->p_hpts_wake_scheduled == 0) { + hpts->p_hpts_wake_scheduled = 1; + swi_sched(hpts->ie_cookie, 0); } } @@ -648,8 +568,8 @@ * Valid values in the flags are * HPTS_REMOVE_OUTPUT - remove from the output of the hpts. * HPTS_REMOVE_INPUT - remove from the input of the hpts. - * Note that you can or both values together and get two - * actions. + * Note that you can use one or both values together + * and get two actions. 
*/ void __tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line) @@ -670,53 +590,198 @@ } static inline int -hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus) +hpts_tick(uint32_t wheel_tick, uint32_t plus) { - return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS); + /* + * Given a slot on the wheel, what slot + * is that plus ticks out? + */ + KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick)); + return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS); } +static inline int +tick_to_wheel(uint32_t cts_in_wticks) +{ + /* + * Given a timestamp in wheel ticks (10usec inc's) + * map it to our limited space wheel. + */ + return (cts_in_wticks % NUM_OF_HPTSI_SLOTS); +} + +static inline int +hpts_ticks_diff(int prev_tick, int tick_now) +{ + /* + * Given two ticks that are someplace + * on our wheel. How far are they apart? + */ + if (tick_now > prev_tick) + return(tick_now - prev_tick); + else if (tick_now == prev_tick) + /* + * Special case, same means we can go all of our + * wheel less one slot. + */ + return (NUM_OF_HPTSI_SLOTS-1); + else + return((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now); +} + +/* + * Given a tick on the wheel that is the current time + * mapped to the wheel (wheel_tick), what is the maximum + * distance forward that can be obtained without + * wrapping past either prev_tick or running_tick + * depending on the htps state? Also if passed + * a uint32_t *, fill it with the tick location. + * + * Note if you do not give this function the current + * time (that you think it is) mapped to the wheel + * then the results will not be what you expect and + * could lead to invalid inserts. 
+ */ +static inline int32_t +max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick) +{ + uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel; + + if ((hpts->p_hpts_active == 1) && + (hpts->p_wheel_complete == 0)) { + end_tick = hpts->p_runningtick; + /* Back up one tick */ + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + } else { + /* + * For the case where we are + * not active, or we have + * completed the pass over + * the wheel, we can use the + * prev tick and subtract one from it. This puts us + * as far out as possible on the wheel. + */ + end_tick = hpts->p_prev_slot; + if (end_tick == 0) + end_tick = NUM_OF_HPTSI_SLOTS - 1; + else + end_tick--; + if (target_tick) + *target_tick = end_tick; + /* + * Now we have close to the full wheel left minus the + * time it has been since the pacer went to sleep. Note + * that wheel_tick, passed in, should be the current time + * from the perspective of the caller, mapped to the wheel. + */ + if (hpts->p_prev_slot != wheel_tick) + dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + else + dis_to_travel = 1; + /* + * dis_to_travel in this case is the space from when the + * pacer stopped (p_prev_slot) and where our wheel_tick + * is now. To know how many slots we can put it in we + * subtract from the wheel size. We would not want + * to place something after p_prev_slot or it will + * get ran too soon. + */ + return(NUM_OF_HPTSI_SLOTS - dis_to_travel); + } + /* + * So how many slots are open between p_runningtick -> p_cur_slot + * that is what is currently un-available for insertion. Special + * case when we are at the last slot, this gets 1, so that + * the answer to how many slots are available is all but 1. 
+ */ + if (hpts->p_runningtick == hpts->p_cur_slot) + dis_to_travel = 1; + else + dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + /* + * How long has the pacer been running? + */ + if (hpts->p_cur_slot != wheel_tick) { + /* The pacer is a bit late */ + pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick); + } else { + /* The pacer is right on time, now == pacers start time */ + pacer_to_now = 0; + } + /* + * To get the number left we can insert into we simply + * subract the distance the pacer has to run from how + * many slots there are. + */ + avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel; + /* + * Now how many of those we will eat due to the pacer's + * time (p_cur_slot) of start being behind the + * real time (wheel_tick)? + */ + if (avail_on_wheel <= pacer_to_now) { + /* + * Wheel wrap, we can't fit on the wheel, that + * is unusual the system must be way overloaded! + * Insert into the assured tick, and return special + * "0". + */ + counter_u64_add(combined_wheel_wrap, 1); + *target_tick = hpts->p_nxt_slot; + return (0); + } else { + /* + * We know how many slots are open + * on the wheel (the reverse of what + * is left to run. Take away the time + * the pacer started to now (wheel_tick) + * and that tells you how many slots are + * open that can be inserted into that won't + * be touched by the pacer until later. 
+ */ + return (avail_on_wheel - pacer_to_now); + } +} + static int tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref) { - int32_t need_wake = 0; - uint32_t ticknow = 0; - + uint32_t need_wake = 0; + HPTS_MTX_ASSERT(hpts); if (inp->inp_in_hpts == 0) { /* Ok we need to set it on the hpts in the current slot */ - if (hpts->p_hpts_active == 0) { - /* A sleeping hpts we want in next slot to run */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0, - hpts_tick(hpts, 1)); - } - inp->inp_hptsslot = hpts_tick(hpts, 1); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow); - } - need_wake = 1; + inp->inp_hpts_request = 0; + if ((hpts->p_hpts_active == 0) || + (hpts->p_wheel_complete)) { + /* + * A sleeping hpts we want in next slot to run + * note that in this state p_prev_slot == p_cur_slot + */ + inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1); + if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0)) + need_wake = 1; } else if ((void *)inp == hpts->p_inp) { /* + * The hpts system is running and the caller + * was awoken by the hpts system. * We can't allow you to go into the same slot we - * are in. We must put you out. + * are in (we don't want a loop :-D). */ inp->inp_hptsslot = hpts->p_nxt_slot; } else - inp->inp_hptsslot = hpts->p_cur_slot; + inp->inp_hptsslot = hpts->p_runningtick; hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - inp->inp_hpts_request = 0; - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0); - } if (need_wake) { /* * Activate the hpts if it is sleeping and its * timeout is not 1. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); } @@ -737,141 +802,129 @@ return (ret); } +#ifdef INVARIANTS static void -tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line, - struct hpts_diag *diag, int32_t noref) +check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line) { - int32_t need_new_to = 0; - int32_t need_wakeup = 0; - uint32_t largest_slot; - uint32_t ticknow = 0; - uint32_t slot_calc; + /* + * Sanity checks for the pacer with invariants + * on insert. + */ + if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS) + panic("hpts:%p inp:%p slot:%d > max", + hpts, inp, inp_hptsslot); + if ((hpts->p_hpts_active) && + (hpts->p_wheel_complete == 0)) { + /* + * If the pacer is processing a arc + * of the wheel, we need to make + * sure we are not inserting within + * that arc. + */ + int distance, yet_to_run; + distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot); + if (hpts->p_runningtick != hpts->p_cur_slot) + yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot); + else + yet_to_run = 0; /* processing last slot */ + if (yet_to_run > distance) { + panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d", + hpts, inp, inp_hptsslot, + distance, yet_to_run, + hpts->p_runningtick, hpts->p_cur_slot); + } + } +} +#endif + +static void +tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line, + struct hpts_diag *diag, struct timeval *tv) +{ + uint32_t need_new_to = 0; + uint32_t wheel_cts, last_tick; + int32_t wheel_tick, maxticks; + int8_t need_wakeup = 0; + HPTS_MTX_ASSERT(hpts); if (diag) { memset(diag, 0, sizeof(struct hpts_diag)); diag->p_hpts_active = hpts->p_hpts_active; + diag->p_prev_slot = hpts->p_prev_slot; + diag->p_runningtick = hpts->p_runningtick; diag->p_nxt_slot = hpts->p_nxt_slot; 
diag->p_cur_slot = hpts->p_cur_slot; + diag->p_curtick = hpts->p_curtick; + diag->p_lasttick = hpts->p_lasttick; diag->slot_req = slot; + diag->p_on_min_sleep = hpts->p_on_min_sleep; + diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((inp->inp_in_hpts == 0) || noref) { - inp->inp_hpts_request = slot; + if (inp->inp_in_hpts == 0) { if (slot == 0) { /* Immediate */ - tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref); + tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0); return; } - if (hpts->p_hpts_active) { - /* - * Its slot - 1 since nxt_slot is the next tick that - * will go off since the hpts is awake - */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0); - } - /* - * We want to make sure that we don't place a inp in - * the range of p_cur_slot <-> p_nxt_slot. If we - * take from p_nxt_slot to the end, plus p_cur_slot - * and then take away 2, we will know how many is - * the max slots we can use. - */ - if (hpts->p_nxt_slot > hpts->p_cur_slot) { - /* - * Non-wrap case nxt_slot <-> cur_slot we - * don't want to land in. So the diff gives - * us what is taken away from the number of - * slots. + /* Get the current time relative to the wheel */ + wheel_cts = tcp_tv_to_hptstick(tv); + /* Map it onto the wheel */ + wheel_tick = tick_to_wheel(wheel_cts); + /* Now what's the max we can place it at? */ + maxticks = max_ticks_available(hpts, wheel_tick, &last_tick); + if (diag) { + diag->wheel_tick = wheel_tick; + diag->maxticks = maxticks; + diag->wheel_cts = wheel_cts; + } + if (maxticks == 0) { + /* The pacer is in a wheel wrap behind, yikes! */ + if (slot > 1) { + /* + * Reduce by 1 to prevent a forever loop in + * case something else is wrong. Note this + * probably does not hurt because the pacer + * if its true is so far behind we will be + * > 1second late calling anyway. 
*/ - largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot); - } else if (hpts->p_nxt_slot == hpts->p_cur_slot) { - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - } else { - /* - * Wrap case so the diff gives us the number - * of slots that we can land in. - */ - largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot; + slot--; } - /* - * We take away two so we never have a problem (20 - * usec's) out of 1024000 usecs - */ - largest_slot -= 2; - if (inp->inp_hpts_request > largest_slot) { - /* - * Restrict max jump of slots and remember - * leftover - */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* This one will run when we hit it */ - inp->inp_hpts_request = 0; - } - if (hpts->p_nxt_slot == hpts->p_cur_slot) - slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS; - else - slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS; - if (slot_calc == hpts->p_cur_slot) { + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request = slot; + } else if (maxticks >= slot) { + /* It all fits on the wheel */ + inp->inp_hpts_request = 0; + inp->inp_hptsslot = hpts_tick(wheel_tick, slot); + } else { + /* It does not fit */ + inp->inp_hpts_request = slot - maxticks; + inp->inp_hptsslot = last_tick; + } + if (diag) { + diag->slot_remaining = inp->inp_hpts_request; + diag->inp_hptsslot = inp->inp_hptsslot; + } #ifdef INVARIANTS - /* TSNH */ - panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n", - hpts, slot_calc, slot, largest_slot); + check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line); #endif - if (slot_calc) - slot_calc--; - else - slot_calc = NUM_OF_HPTSI_SLOTS - 1; - } - inp->inp_hptsslot = slot_calc; - if (diag) { - diag->inp_hptsslot = inp->inp_hptsslot; - } - } else { + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0); + if ((hpts->p_hpts_active == 0) && + (inp->inp_hpts_request == 0) && + (hpts->p_on_min_sleep == 0)) { /* - * The hpts is sleeping, we 
need to figure out where + * The hpts is sleeping and not on a minimum + * sleep time, we need to figure out where * it will wake up at and if we need to reschedule * its time-out. */ uint32_t have_slept, yet_to_sleep; - uint32_t slot_now; - struct timeval tv; - ticknow = tcp_gethptstick(&tv); - slot_now = ticknow % NUM_OF_HPTSI_SLOTS; - /* - * The user wants to be inserted at (slot_now + - * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up. - */ - largest_slot = NUM_OF_HPTSI_SLOTS - 2; - if (inp->inp_hpts_request > largest_slot) { - /* Adjust the residual in inp_hpts_request */ - slot = largest_slot; - inp->inp_hpts_request -= largest_slot; - } else { - /* No residual it all fits */ - inp->inp_hpts_request = 0; - } - inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS; - if (diag) { - diag->slot_now = slot_now; - diag->inp_hptsslot = inp->inp_hptsslot; - diag->p_on_min_sleep = hpts->p_on_min_sleep; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow); - } /* Now do we need to restart the hpts's timer? */ - if (TSTMP_GT(ticknow, hpts->p_curtick)) - have_slept = ticknow - hpts->p_curtick; - else - have_slept = 0; - if (have_slept < hpts->p_hpts_sleep_time) { - /* This should be what happens */ + have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick); + if(have_slept < hpts->p_hpts_sleep_time) yet_to_sleep = hpts->p_hpts_sleep_time - have_slept; - } else { + else { /* We are over-due */ yet_to_sleep = 0; need_wakeup = 1; @@ -879,20 +932,16 @@ if (diag) { diag->have_slept = have_slept; diag->yet_to_sleep = yet_to_sleep; - diag->hpts_sleep_time = hpts->p_hpts_sleep_time; } - if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) { + if (yet_to_sleep && + (yet_to_sleep > slot)) { /* - * We need to reschedule the hptss time-out. + * We need to reschedule the hpts's time-out. 
*/ hpts->p_hpts_sleep_time = slot; need_new_to = slot * HPTS_TICKS_PER_USEC; } } - hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow); - } /* * Now how far is the hpts sleeping to? if active is 1, its * up and ticking we do nothing, otherwise we may need to @@ -899,9 +948,6 @@ * reschedule its callout if need_new_to is set from above. */ if (need_wakeup) { - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0); - } hpts->p_direct_wake = 1; tcp_wakehpts(hpts); if (diag) { @@ -944,9 +990,10 @@ } uint32_t -tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){ +tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag) +{ struct tcp_hpts_entry *hpts; - uint32_t slot_on, cts; + uint32_t slot_on; struct timeval tv; /* @@ -956,12 +1003,8 @@ */ INP_WLOCK_ASSERT(inp); hpts = tcp_hpts_lock(inp); - if (in_ts_percision) - microuptime(&tv); - else - getmicrouptime(&tv); - cts = tcp_tv_to_usectick(&tv); - tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0); + microuptime(&tv); + tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv); slot_on = hpts->p_nxt_slot; mtx_unlock(&hpts->p_mtx); return (slot_on); @@ -971,7 +1014,6 @@ __tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){ return (tcp_hpts_insert_diag(inp, slot, line, NULL)); } - int __tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line) { @@ -986,9 +1028,6 @@ /* * Activate the hpts if it is sleeping. 
*/ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0); - } retval = 2; hpts->p_direct_wake = 1; tcp_wakeinput(hpts); @@ -1001,36 +1040,14 @@ return (retval); } -void -tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos) +int32_t +__tcp_queue_to_input(struct inpcb *inp, int line) { - /* Setup packet for input first */ - INP_WLOCK_ASSERT(tp->t_inpcb); - m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t)); - m->m_pkthdr.pace_tlen = (uint16_t) tlen; - m->m_pkthdr.pace_drphdrlen = drop_hdrlen; - m->m_pkthdr.pace_tos = iptos; - m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0); - if (tp->t_in_pkt == NULL) { - tp->t_in_pkt = m; - tp->t_tail_pkt = m; - } else { - tp->t_tail_pkt->m_nextpkt = m; - tp->t_tail_pkt = m; - } -} - - -int32_t -__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th, - int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){ struct tcp_hpts_entry *hpts; int32_t ret; - tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos); - hpts = tcp_input_lock(tp->t_inpcb); - ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line); + hpts = tcp_input_lock(inp); + ret = __tcp_queue_to_input_locked(inp, hpts, line); mtx_unlock(&hpts->p_mtx); return (ret); } @@ -1132,6 +1149,25 @@ #endif } +static void +tcp_drop_in_pkts(struct tcpcb *tp) +{ + struct mbuf *m, *n; + + m = tp->t_in_pkt; + if (m) + n = m->m_nextpkt; + else + n = NULL; + tp->t_in_pkt = NULL; + while (m) { + m_freem(m); + m = n; + if (m) + n = m->m_nextpkt; + } +} + /* * Do NOT try to optimize the processing of inp's * by first pulling off all the inp's into a temporary @@ -1142,7 +1178,7 @@ * but then while you were processing one of the inp's * some other one that you switch will get a new * packet on the different CPU. It will insert it - * on the new hptss input list. Creating a temporary + * on the new hpts's input list. 
Creating a temporary * link in the inp will not fix it either, since * the other hpts will be doing the same thing and * you will both end up using the temporary link. @@ -1155,16 +1191,16 @@ static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv) { - struct mbuf *m, *n; struct tcpcb *tp; struct inpcb *inp; uint16_t drop_reason; int16_t set_cpu; uint32_t did_prefetch = 0; - int32_t ti_locked = TI_UNLOCKED; + int dropped; struct epoch_tracker et; HPTS_MTX_ASSERT(hpts); + INP_INFO_RLOCK_ET(&V_tcbinfo, et); while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) { HPTS_MTX_ASSERT(hpts); hpts_sane_input_remove(hpts, inp, 0); @@ -1178,24 +1214,14 @@ inp->inp_in_input = 0; mtx_unlock(&hpts->p_mtx); CURVNET_SET(inp->inp_vnet); - if (drop_reason) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else { - ti_locked = TI_UNLOCKED; - } INP_WLOCK(inp); if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || (inp->inp_flags2 & INP_FREED)) { out: hpts->p_inp = NULL; - if (ti_locked == TI_RLOCKED) { - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - } if (in_pcbrele_wlocked(inp) == 0) { INP_WUNLOCK(inp); } - ti_locked = TI_UNLOCKED; CURVNET_RESTORE(); mtx_lock(&hpts->p_mtx); continue; @@ -1206,20 +1232,8 @@ } if (drop_reason) { /* This tcb is being destroyed for drop_reason */ - m = tp->t_in_pkt; - if (m) - n = m->m_nextpkt; - else - n = NULL; - tp->t_in_pkt = NULL; - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } + tcp_drop_in_pkts(tp); tp = tcp_drop(tp, drop_reason); - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); if (tp == NULL) { INP_WLOCK(inp); } @@ -1246,212 +1260,162 @@ */ tcp_set_hpts(inp); } - m = tp->t_in_pkt; - n = NULL; - if (m != NULL && - (m->m_pkthdr.pace_lock == TI_RLOCKED || - tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - m = tp->t_in_pkt; - } - if (in_newts_every_tcb) { - if (in_ts_percision) - microuptime(tv); - else - getmicrouptime(tv); - } if (tp->t_fb_ptr != NULL) { 
kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } - /* Any input work to do, if so do it first */ - if ((m != NULL) && (m == tp->t_in_pkt)) { - struct tcphdr *th; - int32_t tlen, drop_hdrlen, nxt_pkt; - uint8_t iptos; - - n = m->m_nextpkt; - tp->t_in_pkt = tp->t_tail_pkt = NULL; - while (m) { - th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff); - tlen = m->m_pkthdr.pace_tlen; - drop_hdrlen = m->m_pkthdr.pace_drphdrlen; - iptos = m->m_pkthdr.pace_tos; - m->m_nextpkt = NULL; - if (n) - nxt_pkt = 1; - else - nxt_pkt = 0; - inp->inp_input_calls = 1; - if (tp->t_fb->tfb_tcp_hpts_do_segment) { - /* Use the hpts specific do_segment */ - (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos, nxt_pkt, tv); - } else { - /* Use the default do_segment */ - (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket, - tp, drop_hdrlen, - tlen, iptos); - } - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - /* - * Do segment returns unlocked we need the - * lock again but we also need some kasserts - * here. - */ - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); - INP_UNLOCK_ASSERT(inp); - m = n; - if (m) - n = m->m_nextpkt; - if (m != NULL && - m->m_pkthdr.pace_lock == TI_RLOCKED) { - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - ti_locked = TI_RLOCKED; - } else - ti_locked = TI_UNLOCKED; + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + if (inp->inp_in_input) + tcp_hpts_remove(inp, HPTS_REMOVE_INPUT); + dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (dropped) { + /* Re-acquire the wlock so we can release the reference */ INP_WLOCK(inp); - /* - * Since we have an opening here we must - * re-check if the tcb went away while we - * were getting the lock(s). 
- */ - if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || - (inp->inp_flags2 & INP_FREED)) { - while (m) { - m_freem(m); - m = n; - if (m) - n = m->m_nextpkt; - } - goto out; - } - /* - * Now that we hold the INP lock, check if - * we need to upgrade our lock. - */ - if (ti_locked == TI_UNLOCKED && - (tp->t_state != TCPS_ESTABLISHED)) { - ti_locked = TI_RLOCKED; - INP_INFO_RLOCK_ET(&V_tcbinfo, et); - } - } /** end while(m) */ - } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */ + } + } else if (tp->t_in_pkt) { + /* + * We reach here only if we had a + * stack that supported INP_SUPPORTS_MBUFQ + * and then somehow switched to a stack that + * does not. The packets are basically stranded + * and would hang with the connection until + * cleanup without this code. Its not the + * best way but I know of no other way to + * handle it since the stack needs functions + * it does not have to handle queued packets. + */ + tcp_drop_in_pkts(tp); + } if (in_pcbrele_wlocked(inp) == 0) INP_WUNLOCK(inp); - if (ti_locked == TI_RLOCKED) - INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); - INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo); INP_UNLOCK_ASSERT(inp); - ti_locked = TI_UNLOCKED; mtx_lock(&hpts->p_mtx); hpts->p_inp = NULL; CURVNET_RESTORE(); } + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); + INP_INFO_UNLOCK_ASSERT(&V_tcbinfo); } -static int -tcp_hpts_est_run(struct tcp_hpts_entry *hpts) -{ - int32_t ticks_to_run; - - if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) { - ticks_to_run = hpts->p_curtick - hpts->p_prevtick; - if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) { - ticks_to_run = NUM_OF_HPTSI_SLOTS - 2; - } - } else { - if (hpts->p_prevtick == hpts->p_curtick) { - /* This happens when we get woken up right away */ - return (-1); - } - ticks_to_run = 1; - } - /* Set in where we will be when we catch up */ - hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS; - if (hpts->p_nxt_slot == hpts->p_cur_slot) { - panic("Impossible math -- hpts:%p p_nxt_slot:%d 
p_cur_slot:%d ticks_to_run:%d", - hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run); - } - return (ticks_to_run); -} - static void -tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick) +tcp_hptsi(struct tcp_hpts_entry *hpts) { + struct epoch_tracker et; struct tcpcb *tp; struct inpcb *inp = NULL, *ninp; struct timeval tv; - int32_t ticks_to_run, i, error, tick_now, interum_tick; + int32_t ticks_to_run, i, error; int32_t paced_cnt = 0; int32_t did_prefetch = 0; int32_t prefetch_ninp = 0; int32_t prefetch_tp = 0; - uint32_t cts; int16_t set_cpu; HPTS_MTX_ASSERT(hpts); - hpts->p_curtick = tcp_tv_to_hptstick(ctick); - cts = tcp_tv_to_usectick(ctick); - memcpy(&tv, ctick, sizeof(struct timeval)); - hpts->p_cur_slot = hpts_tick(hpts, 1); - - /* Figure out if we had missed ticks */ + hpts->p_lasttick = hpts->p_curtick; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + if ((hpts->p_on_queue_cnt == 0) || + (hpts->p_lasttick == hpts->p_curtick)) { + /* + * No time has yet passed, + * or nothing to do. + */ + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + goto no_run; + } again: + hpts->p_wheel_complete = 0; HPTS_MTX_ASSERT(hpts); - ticks_to_run = tcp_hpts_est_run(hpts); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); + ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot); + if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) && + (hpts->p_on_queue_cnt != 0)) { + /* + * Wheel wrap is occuring, basically we + * are behind and the distance between + * run's has spread so much it has exceeded + * the time on the wheel (1.024 seconds). This + * is ugly and should NOT be happening. We + * need to run the entire wheel. We last processed + * p_prev_slot, so that needs to be the last slot + * we run. The next slot after that should be our + * reserved first slot for new, and then starts + * the running postion. 
Now the problem is the + * reserved "not to yet" place does not exist + * and there may be inp's in there that need + * running. We can merge those into the + * first slot at the head. + */ + hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1); + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2); + /* + * Adjust p_cur_slot to be where we are starting from + * hopefully we will catch up (fat chance if something + * is broken this bad :( ) + */ + hpts->p_cur_slot = hpts->p_prev_slot; + /* + * The next slot has guys to run too, and that would + * be where we would normally start, lets move them into + * the next slot (p_prev_slot + 2) so that we will + * run them, the extra 10usecs of late (by being + * put behind) does not really matter in this situation. + */ +#ifdef INVARIANTS + /* + * To prevent a panic we need to update the inpslot to the + * new location. This is safe since it takes both the + * INP lock and the pacer mutex to change the inp_hptsslot. + */ + TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) { + inp->inp_hptsslot = hpts->p_runningtick; + } +#endif + TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick], + &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts); + ticks_to_run = NUM_OF_HPTSI_SLOTS - 1; + counter_u64_add(wheel_wrap, 1); + } else { + /* + * Nxt slot is always one after p_runningtick though + * its not used usually unless we are doing wheel wrap. 
+ */ + hpts->p_nxt_slot = hpts->p_prev_slot; + hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1); } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif HPTS_MTX_ASSERT(hpts); - /* Reset the ticks to run and time if we need too */ - interum_tick = tcp_gethptstick(&tv); - if (interum_tick != hpts->p_curtick) { - /* Save off the new time we execute to */ - *ctick = tv; - hpts->p_curtick = interum_tick; - cts = tcp_tv_to_usectick(&tv); - hpts->p_cur_slot = hpts_tick(hpts, 1); - ticks_to_run = tcp_hpts_est_run(hpts); - } - if (ticks_to_run == -1) { - goto no_run; - } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0); - } if (hpts->p_on_queue_cnt == 0) { goto no_one; } HPTS_MTX_ASSERT(hpts); +#ifndef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif for (i = 0; i < ticks_to_run; i++) { /* * Calculate our delay, if there are no extra ticks there - * was not any + * was not any (i.e. if ticks_to_run == 1, no delay). 
*/ hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC; HPTS_MTX_ASSERT(hpts); - while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* For debugging */ - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i); - } hpts->p_inp = inp; paced_cnt++; - if (hpts->p_cur_slot != inp->inp_hptsslot) { +#ifdef INVARIANTS + if (hpts->p_runningtick != inp->inp_hptsslot) { panic("Hpts:%p inp:%p slot mis-aligned %u vs %u", - hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot); + hpts, inp, hpts->p_runningtick, inp->inp_hptsslot); } +#endif /* Now pull it */ if (inp->inp_hpts_cpu_set == 0) { set_cpu = 1; @@ -1458,8 +1422,8 @@ } else { set_cpu = 0; } - hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0); - if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) { + hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0); + if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) { /* We prefetch the next inp if possible */ kern_prefetch(ninp, &prefetch_ninp); prefetch_ninp = 1; @@ -1467,25 +1431,36 @@ if (inp->inp_hpts_request) { /* * This guy is deferred out further in time - * then our wheel had on it. Push him back - * on the wheel. + * then our wheel had available on it. + * Push him back on the wheel or run it + * depending. */ - int32_t remaining_slots; - + uint32_t maxticks, last_tick, remaining_slots; + remaining_slots = ticks_to_run - (i + 1); if (inp->inp_hpts_request > remaining_slots) { /* - * Keep INVARIANTS happy by clearing - * the flag + * How far out can we go? 
*/ - tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1); + maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick); + if (maxticks >= inp->inp_hpts_request) { + /* we can place it finally to be processed */ + inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request); + inp->inp_hpts_request = 0; + } else { + /* Work off some more time */ + inp->inp_hptsslot = last_tick; + inp->inp_hpts_request-= maxticks; + } + hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1); hpts->p_inp = NULL; continue; } inp->inp_hpts_request = 0; + /* Fall through we will so do it now */ } /* - * We clear the hpts flag here after dealing with + * We clear the hpts flag here after dealing with * remaining slots. This way anyone looking with the * TCB lock will see its on the hpts until just * before we unlock. @@ -1495,23 +1470,20 @@ INP_WLOCK(inp); if (in_pcbrele_wlocked(inp)) { mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1); hpts->p_inp = NULL; continue; } - if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) { -out_now: + if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) || + (inp->inp_flags2 & INP_FREED)) { + out_now: #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif INP_WUNLOCK(inp); mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3); hpts->p_inp = NULL; continue; } @@ -1539,16 +1511,10 @@ */ tcp_set_hpts(inp); } - if (out_newts_every_tcb) { - struct timeval sv; - - if (out_ts_percision) - microuptime(&sv); - else - getmicrouptime(&sv); - cts = tcp_tv_to_usectick(&sv); - } CURVNET_SET(inp->inp_vnet); +#ifdef VIMAGE + INP_INFO_RLOCK_ET(&V_tcbinfo, et); +#endif /* * There is a hole here, we get the refcnt on the * inp so it will still be preserved but to make @@ -1560,7 +1526,7 @@ #ifdef INVARIANTS if 
(mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx before tcp-output:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif if (tp->t_fb_ptr != NULL) { @@ -1567,12 +1533,16 @@ kern_prefetch(tp->t_fb_ptr, &did_prefetch); did_prefetch = 1; } + if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) { + error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0); + if (error) { + /* The input killed the connection */ + goto skip_pacing; + } + } inp->inp_hpts_calls = 1; - if (tp->t_fb->tfb_tcp_output_wtime != NULL) { - error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv); - } else { - error = tp->t_fb->tfb_tcp_output(tp); - } + error = tp->t_fb->tfb_tcp_output(tp); + inp->inp_hpts_calls = 0; if (ninp && ninp->inp_ppcb) { /* * If we have a nxt inp, see if we can @@ -1609,74 +1579,93 @@ prefetch_tp = 1; } INP_WUNLOCK(inp); + skip_pacing: +#ifdef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); +#endif INP_UNLOCK_ASSERT(inp); CURVNET_RESTORE(); #ifdef INVARIANTS if (mtx_owned(&hpts->p_mtx)) { panic("Hpts:%p owns mtx prior-to lock line:%d", - hpts, __LINE__); + hpts, __LINE__); } #endif mtx_lock(&hpts->p_mtx); - if (logging_on) - tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4); hpts->p_inp = NULL; } HPTS_MTX_ASSERT(hpts); hpts->p_inp = NULL; - hpts->p_cur_slot++; - if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) { - hpts->p_cur_slot = 0; + hpts->p_runningtick++; + if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) { + hpts->p_runningtick = 0; } } +#ifndef VIMAGE + INP_INFO_RUNLOCK_ET(&V_tcbinfo, et); +#endif no_one: HPTS_MTX_ASSERT(hpts); - hpts->p_prevtick = hpts->p_curtick; hpts->p_delayed_by = 0; /* * Check to see if we took an excess amount of time and need to run * more ticks (if we did not hit eno-bufs). 
*/ - /* Re-run any input that may be there */ - (void)tcp_gethptstick(&tv); - if (!TAILQ_EMPTY(&hpts->p_input)) { - tcp_input_data(hpts, &tv); - } #ifdef INVARIANTS if (TAILQ_EMPTY(&hpts->p_input) && (hpts->p_on_inqueue_cnt != 0)) { panic("tp:%p in_hpts input empty but cnt:%d", - hpts, hpts->p_on_inqueue_cnt); + hpts, hpts->p_on_inqueue_cnt); } #endif - tick_now = tcp_gethptstick(&tv); - if (SEQ_GT(tick_now, hpts->p_prevtick)) { - struct timeval res; - - /* Did we really spend a full tick or more in here? */ - timersub(&tv, ctick, &res); - if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) { + hpts->p_prev_slot = hpts->p_cur_slot; + hpts->p_lasttick = hpts->p_curtick; + hpts->p_curtick = tcp_gethptstick(&tv); + if (hpts->p_lasttick != hpts->p_curtick) { + counter_u64_add(hpts_loops, 1); + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); + goto again; + } +no_run: + /* + * Set flag to tell that we are done for + * any slot input that happens during + * input. + */ + hpts->p_wheel_complete = 1; + /* + * Run any input that may be there not covered + * in running data. + */ + if (!TAILQ_EMPTY(&hpts->p_input)) { + tcp_input_data(hpts, &tv); + /* + * Now did we spend too long running + * input and need to run more ticks? 
+ */ + KASSERT(hpts->p_prev_slot == hpts->p_cur_slot, + ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts, + hpts->p_prev_slot, hpts->p_cur_slot)); + KASSERT(hpts->p_lasttick == hpts->p_curtick, + ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts, + hpts->p_lasttick, hpts->p_curtick)); + hpts->p_curtick = tcp_gethptstick(&tv); + if (hpts->p_lasttick != hpts->p_curtick) { counter_u64_add(hpts_loops, 1); - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now); - } - *ctick = res; - hpts->p_curtick = tick_now; + hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); goto again; } } -no_run: { uint32_t t = 0, i, fnd = 0; if (hpts->p_on_queue_cnt) { - - /* * Find next slot that is occupied and use that to * be the sleep time. */ - for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { + + for (i = 0, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) { if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) { fnd = 1; break; @@ -1684,27 +1673,20 @@ t = (t + 1) % NUM_OF_HPTSI_SLOTS; } if (fnd) { - hpts->p_hpts_sleep_time = i; + hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max); } else { - counter_u64_add(back_tosleep, 1); #ifdef INVARIANTS - panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt); + panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt); #endif + counter_u64_add(back_tosleep, 1); hpts->p_on_queue_cnt = 0; goto non_found; } - t++; } else { - /* No one on the wheel sleep for all but 2 slots */ -non_found: - if (hpts_sleep_max == 0) - hpts_sleep_max = 1; - hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max); - t = 0; + /* No one on the wheel sleep for all but 400 slots or sleep max */ + non_found: + hpts->p_hpts_sleep_time = hpts_sleep_max; } - if (logging_on) { - tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC)); - } } } @@ -1746,25 +1728,19 @@ mtx_lock(&hpts->p_mtx); if (hpts->p_direct_wake) { /* Signaled by input */ - 
if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1); callout_stop(&hpts->co); } else { /* Timed out */ if (callout_pending(&hpts->co) || !callout_active(&hpts->co)) { - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2); mtx_unlock(&hpts->p_mtx); return; } callout_deactivate(&hpts->co); - if (logging_on) - tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3); } + hpts->p_hpts_wake_scheduled = 0; hpts->p_hpts_active = 1; - (void)tcp_gethptstick(&tv); - tcp_hptsi(hpts, &tv); + tcp_hptsi(hpts); HPTS_MTX_ASSERT(hpts); tv.tv_sec = 0; tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC; @@ -1811,7 +1787,8 @@ tcp_pace.rp_num_hptss = ncpus; hpts_loops = counter_u64_alloc(M_WAITOK); back_tosleep = counter_u64_alloc(M_WAITOK); - + combined_wheel_wrap = counter_u64_alloc(M_WAITOK); + wheel_wrap = counter_u64_alloc(M_WAITOK); sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *)); tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS; @@ -1850,7 +1827,7 @@ OID_AUTO, "out_qcnt", CTLFLAG_RD, &hpts->p_on_queue_cnt, 0, "Count TCB's awaiting output processing"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_ADD_U16(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "active", CTLFLAG_RD, &hpts->p_hpts_active, 0, @@ -1859,29 +1836,23 @@ SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curslot", CTLFLAG_RD, &hpts->p_cur_slot, 0, - "What the current slot is if active"); + "What the current running pacers goal"); SYSCTL_ADD_UINT(&hpts->hpts_ctx, SYSCTL_CHILDREN(hpts->hpts_root), + OID_AUTO, "runtick", CTLFLAG_RD, + &hpts->p_runningtick, 0, + "What the running pacers current slot is"); + SYSCTL_ADD_UINT(&hpts->hpts_ctx, + SYSCTL_CHILDREN(hpts->hpts_root), OID_AUTO, "curtick", CTLFLAG_RD, &hpts->p_curtick, 0, - "What the current tick on if active"); - SYSCTL_ADD_UINT(&hpts->hpts_ctx, - SYSCTL_CHILDREN(hpts->hpts_root), - OID_AUTO, "logsize", CTLFLAG_RD, - &hpts->p_logsize, 0, - "Hpts 
logging buffer size"); - hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2; + "What the running pacers last tick mapped to the wheel was"); + hpts->p_hpts_sleep_time = hpts_sleep_max; hpts->p_num = i; - hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv); - hpts->p_prevtick -= 1; - hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS; + hpts->p_curtick = tcp_gethptstick(&tv); + hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick); hpts->p_cpu = 0xffff; - hpts->p_nxt_slot = 1; - hpts->p_logsize = tcp_hpts_logging_size; - if (hpts->p_logsize) { - sz = (sizeof(struct hpts_log) * hpts->p_logsize); - hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO); - } + hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1); callout_init(&hpts->co, 1); } Index: netinet/tcp_log_buf.h =================================================================== --- netinet/tcp_log_buf.h +++ netinet/tcp_log_buf.h @@ -175,7 +175,7 @@ TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */ TCP_LOG_PRR, /* Doing PRR 6 */ TCP_LOG_REORDER,/* Detected reorder 7 */ - TCP_LOG_PACER, /* Pacer sending a packet 8 */ + TCP_LOG_HPTS, /* Hpts sending a packet 8 */ BBR_LOG_BBRUPD, /* We updated BBR info 9 */ BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */ BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */ @@ -194,31 +194,38 @@ BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */ TCP_LOG_FLOWEND, /* End of a flow 25 */ BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */ - BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */ - BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */ + BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */ + BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */ BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */ BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */ TCP_LOG_USERSEND, /* User level sends data 31 */ - UNUSED_32, /* Unused 32 */ - UNUSED_33, /* Unused 33 */ + BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */ + BBR_LOG_STATE_TARGET, 
/* Log of target at state 33 */ BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */ BBR_LOG_TO_PROCESS, /* A to was processed 35 */ BBR_LOG_BBRTSO, /* TSO update 36 */ - BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */ + BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */ BBR_LOG_LOWGAIN, /* Low gain accounting 38 */ BBR_LOG_PROGRESS, /* Progress timer event 39 */ TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */ BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */ BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */ - BBR_LOG_PACING_CALC, /* calc the pacing time 43 */ + BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */ BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */ BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */ BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/ TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */ BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */ - BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */ + BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */ TCP_LOG_REASS, /* Reassembly buffer logging 50 */ - TCP_LOG_END /* End (keep at end) 51 */ + TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */ + BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */ + BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */ + TCP_LOG_CONNEND, /* End of connection 54 */ + TCP_LOG_LRO, /* LRO entry 55 */ + TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */ + TCP_SAD_DETECTION, /* Sack Attack Detection 57 */ + TCP_LOG_END /* End (keep at end) 58 */ }; enum tcp_log_states { @@ -275,8 +282,8 @@ #ifdef _KERNEL -#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000 -#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000 +#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000 +#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000 /* * TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always Index: netinet/tcp_stacks/rack_bbr_common.h =================================================================== --- 
netinet/tcp_stacks/rack_bbr_common.h +++ netinet/tcp_stacks/rack_bbr_common.h @@ -38,17 +38,8 @@ #define TCP_MSS_ACCT_SIZE 70 #define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF) +#define DUP_ACK_THRESHOLD 3 -/* Magic flags to tell whats cooking on the pacing wheel */ -#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */ -#define PACE_TMR_RACK 0x02 /* RACK timer running */ -#define PACE_TMR_TLP 0x04 /* TLP timer running */ -#define PACE_TMR_RXT 0x08 /* Retransmit timer running */ -#define PACE_TMR_PERSIT 0x10 /* Persists timer running */ -#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */ -#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */ -#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK) - /* Magic flags for tracing progress events */ #define PROGRESS_DROP 1 #define PROGRESS_UPDATE 2 @@ -61,8 +52,66 @@ #define USE_RTT_LOW 1 #define USE_RTT_AVG 2 +#define PACE_MAX_IP_BYTES 65536 +#define USECS_IN_SECOND 1000000 +#define MSEC_IN_SECOND 1000 +#define MS_IN_USEC 1000 +#define USEC_TO_MSEC(x) ((x) / MS_IN_USEC) /* usecs to msecs, truncating; argument parenthesized so expressions convert as a whole */ +#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */ + #ifdef _KERNEL /* We have only 7 bits in rack so assert its true */ CTASSERT((PACE_TMR_MASK & 0x80) == 0); +#ifdef KERN_TLS +uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd); #endif +int +ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, + struct mbuf *m, int has_pkt); +int +ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt); +uint32_t ctf_outstanding(struct tcpcb *tp); +uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked); +int +ctf_drop_checks(struct tcpopt *to, struct mbuf *m, + struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, + int32_t * drop_hdrlen, int32_t * ret_val); +void +ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void +ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, + struct tcphdr *th, int32_t rstreason, int32_t tlen); +void +ctf_do_drop(struct mbuf *m, struct tcpcb *tp); + +int +ctf_process_rst(struct mbuf *m, struct tcphdr *th, + struct socket *so, struct tcpcb *tp); + +void +ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t * ret_val); + +int +ctf_ts_check(struct mbuf *m, struct tcphdr *th, + struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val); + +void +ctf_calc_rwin(struct socket *so, struct tcpcb *tp); + +void +ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, + int32_t rstreason, int32_t tlen); + +uint32_t +ctf_fixed_maxseg(struct tcpcb *tp); + +void +ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks); + +uint32_t +ctf_decay_count(uint32_t count, uint32_t decay_percentage); + #endif +#endif Index: netinet/tcp_stacks/rack_bbr_common.c =================================================================== --- netinet/tcp_stacks/rack_bbr_common.c +++ netinet/tcp_stacks/rack_bbr_common.c @@ -0,0 +1,859 @@ +/*- + * Copyright (c) 2016-2018 + * Netflix Inc. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. 
IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + */ +/* + * Author: Randall Stewart + * This work is based on the ACM Queue paper + * BBR - Congestion Based Congestion Control + * and also numerous discussions with Neal, Yuchung and Van. + */ + +#include +__FBSDID("$FreeBSD$"); + +#include "opt_inet.h" +#include "opt_inet6.h" +#include "opt_ipsec.h" +#include "opt_tcpdebug.h" +#include "opt_ratelimit.h" +/*#include "opt_kern_tls.h"*/ +#include +#include +#include +#ifdef TCP_HHOOK +#include +#endif +#include +#include +#include +#include +#include +#ifdef KERN_TLS +#include +#endif +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#define TCPSTATES /* for logging */ + +#include +#include +#include +#include +#include /* required for icmp_var.h */ +#include /* for ICMP_BANDLIM */ +#include +#include +#include +#include +#define TCPOUTFLAGS +#include +#include +#include +#include +#include +#include +#include +#include +#include +#ifdef TCPDEBUG +#include +#endif /* TCPDEBUG */ +#ifdef TCP_OFFLOAD +#include +#endif +#ifdef INET6 +#include +#endif +#include + +#include +#include +#include + +#if defined(IPSEC) || defined(IPSEC_SUPPORT) +#include +#include +#endif /* IPSEC */ + +#include +#include +#include + +#ifdef MAC +#include +#endif +#include "rack_bbr_common.h" + +/* + * Common TCP Functions - These are shared by borth + * rack and BBR. 
+ */
+
+
+#ifdef KERN_TLS
+/*
+ * Return the size of one full TLS record for the socket's send
+ * buffer: maximum payload plus TLS header plus TLS trailer.
+ *
+ * While four such records do not fit into rwnd, the maximum payload
+ * (sb_params.sb_maxlen) is lowered in 4096 byte steps, but never
+ * below 4096, and the size is recomputed. Note that this writes to
+ * the socket's TLS parameters; it is not a pure query.
+ */
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+	struct sbtls_info *tls;
+	uint32_t len;
+
+again:
+	tls = so->so_snd.sb_tls_info;
+	len = tls->sb_params.sb_maxlen;         /* max tls payload */
+	len += tls->sb_params.sb_tls_hlen;      /* tls header len  */
+	len += tls->sb_params.sb_tls_tlen;      /* tls trailer len */
+	if ((len * 4) > rwnd) {
+		/*
+		 * Stroke this will suck counter and what
+		 * else should we do Drew? From the
+		 * TCP perspective I am not sure
+		 * what should be done...
+		 */
+		if (tls->sb_params.sb_maxlen > 4096) {
+			tls->sb_params.sb_maxlen -= 4096;
+			if (tls->sb_params.sb_maxlen < 4096)
+				tls->sb_params.sb_maxlen = 4096;
+			goto again;
+		}
+	}
+	return (len);
+}
+#endif
+
+/*
+ * Feed a chain of raw, still ethernet framed, packets that LRO queued
+ * for this connection into the stack's tfb_do_segment_nounlock().
+ *
+ * tp      - connection the packets belong to.
+ * so      - socket of that connection.
+ * m       - first packet; packets are linked through m_nextpkt.
+ * has_pkt - when non-zero even the last packet of the chain is passed
+ *           on with nxt_pkt set.
+ *
+ * Returns 0 when the whole chain was consumed (this includes an empty
+ * chain and packets that were dropped by the checks done here).
+ * Otherwise returns the non-zero value handed back by
+ * tfb_do_segment_nounlock(); the packets not yet processed have been
+ * freed in that case.
+ */
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+	/*
+	 * We are passed a raw chain of mbuf packets
+	 * that arrived in LRO. They are linked via
+	 * the m_nextpkt link in the pkt-headers.
+	 *
+	 * We process each one by:
+	 * a) saving off the next
+	 * b) stripping off the ether-header
+	 * c) formulating the arguments for
+	 *    the tfb_do_segment_nounlock
+	 * d) calling each mbuf to tfb_do_segment_nounlock
+	 *    after adjusting the time to match the arrival time.
+	 * Note that the LRO code assures no IP options are present.
+	 *
+	 * The semantics for calling tfb_do_segment_nounlock are the
+	 * following:
+	 * 1) It returns 0 if all went well and you (the caller) need
+	 *    to release the lock.
+	 * 2) If nxt_pkt is set, then the function will suppress calls
+	 *    to tfb_tcp_output() since you are promising to call again
+	 *    with another packet.
+	 * 3) If it returns 1, then you must free all the packets being
+	 *    shipped in, the tcb has been destroyed (or about to be destroyed).
+	 */
+	struct mbuf *m_save;
+	struct ether_header *eh;
+	struct epoch_tracker et;
+	struct tcphdr *th;
+#ifdef INET6
+	struct ip6_hdr *ip6 = NULL;	/* Keep compiler happy. */
+#endif
+#ifdef INET
+	struct ip *ip = NULL;		/* Keep compiler happy. */
+#endif
+	struct ifnet *ifp;
+	struct timeval tv;
+	int32_t retval, nxt_pkt, tlen, off;
+	uint16_t etype;
+	uint16_t drop_hdrlen;
+	uint8_t iptos, no_vn=0, bpf_req=0;
+
+	/*
+	 * Nothing has failed yet. Without this the value returned for
+	 * an empty chain, or for a chain where every packet is
+	 * skipped below, would be indeterminate.
+	 */
+	retval = 0;
+	/*
+	 * This is a bit deceptive, we get the
+	 * "info epoch" which is really the network
+	 * epoch. This covers us on both any INP
+	 * type change but also if the ifp goes
+	 * away it covers us as well.
+	 */
+	INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+	if (m && m->m_pkthdr.rcvif)
+		ifp = m->m_pkthdr.rcvif;
+	else
+		ifp = NULL;
+	if (ifp) {
+		bpf_req = bpf_peers_present(ifp->if_bpf);
+	} else {
+		/*
+		 * We probably should not work around
+		 * but kassert, since lro always sets rcvif.
+		 */
+		no_vn = 1;
+		goto skip_vnet;
+	}
+	CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+	while (m) {
+		m_save = m->m_nextpkt;
+		m->m_nextpkt = NULL;
+		/* Now lets get the ether header */
+		eh = mtod(m, struct ether_header *);
+		etype = ntohs(eh->ether_type);
+		/* Let the BPF see the packet */
+		if (bpf_req && ifp)
+			ETHER_BPF_MTAP(ifp, m);
+		/*
+		 * Trim off the ethernet header. eh must not be used
+		 * past this point: the data pointer has moved and
+		 * m_pullup() below may replace the first mbuf.
+		 */
+		m_adj(m, sizeof(*eh));
+		switch (etype) {
+#ifdef INET6
+		case ETHERTYPE_IPV6:
+		{
+			if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+				m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+				if (m == NULL) {
+					/* m_pullup() already freed the chain. */
+					TCPSTAT_INC(tcps_rcvshort);
+					goto skipped_pkt;
+				}
+			}
+			/* Take the header from the (possibly new) mbuf. */
+			ip6 = mtod(m, struct ip6_hdr *);
+			th = (struct tcphdr *)(ip6 + 1);
+			tlen = ntohs(ip6->ip6_plen);
+			drop_hdrlen = sizeof(*ip6);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in6_cksum_pseudo(ip6, tlen,
+					    IPPROTO_TCP, m->m_pkthdr.csum_data);
+				th->th_sum ^= 0xffff;
+			} else
+				th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			/*
+			 * Be proactive about unspecified IPv6 address in source.
+			 * As we use all-zero to indicate unbounded/unconnected pcb,
+			 * unspecified IPv6 address can be used to confuse us.
+			 *
+			 * Note that packets with unspecified IPv6 destination is
+			 * already dropped in ip6_input.
+			 */
+			if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+				/* XXX stat */
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+			break;
+		}
+#endif
+#ifdef INET
+		case ETHERTYPE_IP:
+		{
+			if (m->m_len < sizeof (struct tcpiphdr)) {
+				if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+				    == NULL) {
+					/* m_pullup() already freed the chain. */
+					TCPSTAT_INC(tcps_rcvshort);
+					goto skipped_pkt;
+				}
+			}
+			/* Take the header from the (possibly new) mbuf. */
+			ip = mtod(m, struct ip *);
+			th = (struct tcphdr *)(ip + 1);
+			drop_hdrlen = sizeof(*ip);
+			iptos = ip->ip_tos;
+			tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+			if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+				if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+					th->th_sum = m->m_pkthdr.csum_data;
+				else
+					th->th_sum = in_pseudo(ip->ip_src.s_addr,
+					    ip->ip_dst.s_addr,
+					    htonl(m->m_pkthdr.csum_data + tlen +
+					    IPPROTO_TCP));
+				th->th_sum ^= 0xffff;
+			} else {
+				int len;
+				struct ipovly *ipov = (struct ipovly *)ip;
+				/*
+				 * Checksum extended TCP header and data.
+				 */
+				len = drop_hdrlen + tlen;
+				bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+				ipov->ih_len = htons(tlen);
+				th->th_sum = in_cksum(m, len);
+				/* Reset length for SDT probes. */
+				ip->ip_len = htons(len);
+				/* Reset TOS bits */
+				ip->ip_tos = iptos;
+				/* Re-initialization for later version check */
+				ip->ip_v = IPVERSION;
+				ip->ip_hl = sizeof(*ip) >> 2;
+			}
+			if (th->th_sum) {
+				TCPSTAT_INC(tcps_rcvbadsum);
+				m_freem(m);
+				goto skipped_pkt;
+			}
+			break;
+		}
+#endif
+		default:
+			/*
+			 * Not an ethertype handled above, so th, tlen,
+			 * iptos and drop_hdrlen were never set up.
+			 * Drop the packet instead of using them.
+			 */
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		/*
+		 * Convert TCP protocol specific fields to host format.
+		 */
+		tcp_fields_to_host(th);
+
+		off = th->th_off << 2;
+		if (off < sizeof (struct tcphdr) || off > tlen) {
+			TCPSTAT_INC(tcps_rcvbadoff);
+			m_freem(m);
+			goto skipped_pkt;
+		}
+		tlen -= off;
+		drop_hdrlen += off;
+		/*
+		 * Now lets setup the timeval to be when we should
+		 * have been called (if we can).
+		 */
+		m->m_pkthdr.lro_nsegs = 1;
+		if (m->m_flags & M_TSTMP_LRO) {
+			/* rcv_tstmp is split as nanoseconds here. */
+			tv.tv_sec = m->m_pkthdr.rcv_tstmp /1000000000;
+			tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000)/1000;
+		} else {
+			/* Should not be should we kassert instead? */
+			tcp_get_usecs(&tv);
+		}
+		/* Now what about next packet? */
+		if (m_save || has_pkt)
+			nxt_pkt = 1;
+		else
+			nxt_pkt = 0;
+		retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+		    iptos, nxt_pkt, &tv);
+		if (retval) {
+			/* We lost the lock and tcb probably */
+			m = m_save;
+			while(m) {
+				m_save = m->m_nextpkt;
+				m->m_nextpkt = NULL;
+				m_freem(m);
+				m = m_save;
+			}
+			if (no_vn == 0)
+				CURVNET_RESTORE();
+			INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+			return(retval);
+		}
+skipped_pkt:
+		m = m_save;
+	}
+	if (no_vn == 0)
+		CURVNET_RESTORE();
+	INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+	return(retval);
+}
+
+/*
+ * Run the packets queued on tp->t_in_pkt, if any, through
+ * ctf_process_inbound_raw(). The queue is detached from the tcpcb
+ * before it is processed. have_pkt is passed on as has_pkt.
+ */
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+	struct mbuf *m;
+
+	/* First lets see if we have old packets */
+	if (tp->t_in_pkt) {
+		m = tp->t_in_pkt;
+		tp->t_in_pkt = NULL;
+		tp->t_tail_pkt = NULL;
+		if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+			/* We lost the tcpcb (maybe a RST came in)?
*/
+			return(1);
+		}
+	}
+	return (0);
+}
+
+/*
+ * Sequence space that has been sent but not yet cumulatively
+ * acknowledged (snd_max - snd_una).
+ */
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
+	return(tp->snd_max - tp->snd_una);
+}
+
+/*
+ * Outstanding sequence space minus rc_sacked. If rc_sacked is larger
+ * than what is outstanding (this should not happen) we panic under
+ * INVARIANTS and otherwise return 0.
+ */
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
+	if (rc_sacked <= ctf_outstanding(tp))
+		return(ctf_outstanding(tp) - rc_sacked);
+	else {
+		/* TSNH */
+#ifdef INVARIANTS
+		/* Both values are uint32_t, so print them unsigned. */
+		panic("tp:%p rc_sacked:%u > out:%u",
+		    tp, rc_sacked, ctf_outstanding(tp));
+#endif
+		return (0);
+	}
+}
+
+/*
+ * Hand the segment to tcp_dropwithreset(). When tp is not NULL the
+ * write lock on its inpcb is released afterwards.
+ */
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen)
+{
+	if (tp != NULL) {
+		tcp_dropwithreset(m, th, tp, tlen, rstreason);
+		INP_WUNLOCK(tp->t_inpcb);
+	} else
+		tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks returns 1 for you should not proceed. It places
+ * in ret_val what should be returned 1/0 by the caller. The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ *
+ * The segment is trimmed to the receive window: data before rcv_nxt
+ * is accounted for through *drop_hdrlen and th->th_seq, data past the
+ * window is cut off the mbuf. The adjusted length and flags are
+ * written back through tlenp and thf when 0 is returned. The to
+ * argument is not used here.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+	int32_t todrop;
+	int32_t thflags;
+	int32_t tlen;
+
+	thflags = *thf;
+	tlen = *tlenp;
+	todrop = tp->rcv_nxt - th->th_seq;
+	if (todrop > 0) {
+		if (thflags & TH_SYN) {
+			thflags &= ~TH_SYN;
+			th->th_seq++;
+			if (th->th_urp > 1)
+				th->th_urp--;
+			else
+				thflags &= ~TH_URG;
+			todrop--;
+		}
+		/*
+		 * Following if statement from Stevens, vol. 2, p. 960.
+		 */
+		if (todrop > tlen
+		    || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+			/*
+			 * Any valid FIN must be to the left of the window.
+			 * At this point the FIN must be a duplicate or out
+			 * of sequence; drop it.
+			 */
+			thflags &= ~TH_FIN;
+			/*
+			 * Send an ACK to resynchronize and drop any data.
+			 * But keep on processing for RST or ACK.
+			 */
+			tp->t_flags |= TF_ACKNOW;
+			todrop = tlen;
+			TCPSTAT_INC(tcps_rcvduppack);
+			TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+		} else {
+			TCPSTAT_INC(tcps_rcvpartduppack);
+			TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+		}
+		/*
+		 * DSACK - add SACK block for dropped range. Only the
+		 * todrop duplicate bytes are reported; the rest of a
+		 * partially duplicate segment is new data. Nothing is
+		 * reported when no byte is duplicate (todrop can be 0
+		 * here once the SYN has been accounted for above).
+		 */
+		if ((todrop > 0) && (tp->t_flags & TF_SACK_PERMIT)) {
+			tcp_update_sack_list(tp, th->th_seq,
+			    th->th_seq + todrop);
+			/*
+			 * ACK now, as the next in-sequence segment
+			 * will clear the DSACK block again
+			 */
+			tp->t_flags |= TF_ACKNOW;
+		}
+		*drop_hdrlen += todrop;	/* drop from the top afterwards */
+		th->th_seq += todrop;
+		tlen -= todrop;
+		if (th->th_urp > todrop)
+			th->th_urp -= todrop;
+		else {
+			thflags &= ~TH_URG;
+			th->th_urp = 0;
+		}
+	}
+	/*
+	 * If segment ends after window, drop trailing data (and PUSH and
+	 * FIN); if nothing left, just ACK.
+	 */
+	todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+	if (todrop > 0) {
+		TCPSTAT_INC(tcps_rcvpackafterwin);
+		if (todrop >= tlen) {
+			TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+			/*
+			 * If window is closed can only take segments at
+			 * window edge, and have to drop data and PUSH from
+			 * incoming segments. Continue processing, but
+			 * remember to ack. Otherwise, drop segment and
+			 * ack.
+			 */
+			if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+				tp->t_flags |= TF_ACKNOW;
+				TCPSTAT_INC(tcps_rcvwinprobe);
+			} else {
+				ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+				return (1);
+			}
+		} else
+			TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+		m_adj(m, -todrop);
+		tlen -= todrop;
+		thflags &= ~(TH_PUSH | TH_FIN);
+	}
+	*thf = thflags;
+	*tlenp = tlen;
+	return (0);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+	/*
+	 * Generate an ACK dropping incoming segment if it occupies sequence
+	 * space, where the ACK reflects our state.
+	 *
+	 * We can now skip the test for the RST flag since all paths to this
+	 * code happen after packets containing RST have been dropped.
+	 *
+	 * In the SYN-RECEIVED state, don't send an ACK unless the segment
+	 * we received passes the SYN-RECEIVED ACK test. If it fails send a
+	 * RST. This breaks the loop in the "LAND" DoS attack, and also
+	 * prevents an ACK storm between two listening ports that have been
+	 * sent forged SYN segments, each with the source address of the
+	 * other.
+	 *
+	 * On the reset path *ret_val is set to 1 and m is handed to
+	 * ctf_do_dropwithreset(). Otherwise *ret_val is set to 0,
+	 * TF_ACKNOW is set on the connection and m is freed here.
+	 */
+	if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+	    (SEQ_GT(tp->snd_una, th->th_ack) ||
+	    SEQ_GT(th->th_ack, tp->snd_max))) {
+		*ret_val = 1;
+		ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+		return;
+	} else
+		*ret_val = 0;
+	tp->t_flags |= TF_ACKNOW;
+	if (m)
+		m_freem(m);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+
+	/*
+	 * Drop space held by incoming segment and return.
+	 *
+	 * When tp is not NULL the write lock on its inpcb is released
+	 * first; callers in this file pass NULL when no unlock is
+	 * wanted. A NULL m is tolerated.
+	 */
+	if (tp != NULL)
+		INP_WUNLOCK(tp->t_inpcb);
+	if (m)
+		m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+	/*
+	 * RFC5961 Section 3.2
+	 *
+	 * - RST drops connection only if SEG.SEQ == RCV.NXT.
+	 * - If RST is in window, we send challenge ACK.
+	 *
+	 * Note: to take into account delayed ACKs, we should test against
+	 * last_ack_sent instead of rcv_nxt.
+	 * Note 2: we handle special case of closed window, not covered by
+	 * the RFC.
+	 *
+	 * Returns 1 when the connection was closed here and 0 otherwise.
+	 * In every case m is consumed: it is freed, or it is passed to
+	 * tcp_respond() for the challenge ACK.
+	 */
+	int dropped = 0;
+
+	if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+	    (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+		INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+		KASSERT(tp->t_state != TCPS_SYN_SENT,
+		    ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+		    __func__, th, tp));
+
+		if (V_tcp_insecure_rst ||
+		    (tp->last_ack_sent == th->th_seq) ||
+		    (tp->rcv_nxt == th->th_seq) ||
+		    ((tp->last_ack_sent - 1) == th->th_seq)) {
+			TCPSTAT_INC(tcps_drops);
+			/* Drop the connection. */
+			switch (tp->t_state) {
+			case TCPS_SYN_RECEIVED:
+				so->so_error = ECONNREFUSED;
+				goto close;
+			case TCPS_ESTABLISHED:
+			case TCPS_FIN_WAIT_1:
+			case TCPS_FIN_WAIT_2:
+			case TCPS_CLOSE_WAIT:
+			case TCPS_CLOSING:
+			case TCPS_LAST_ACK:
+				so->so_error = ECONNRESET;
+			close:
+				tcp_state_change(tp, TCPS_CLOSED);
+				/* FALLTHROUGH */
+			default:
+				tp = tcp_close(tp);
+			}
+			dropped = 1;
+			/* tp is whatever tcp_close() handed back; ctf_do_drop() copes with NULL. */
+			ctf_do_drop(m, tp);
+		} else {
+			TCPSTAT_INC(tcps_badrst);
+			/* Send challenge ACK. */
+			tcp_respond(tp, mtod(m, void *), th, m,
+			    tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+			tp->last_ack_sent = tp->rcv_nxt;
+		}
+	} else {
+		/* Not acceptable under the window test above: silently discard. */
+		m_freem(m);
+	}
+	return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ *
+ * Counts the segment as tcps_badsyn. With V_tcp_insecure_syn set and
+ * th_seq inside [last_ack_sent, last_ack_sent + rcv_wnd) the
+ * connection is dropped with ECONNRESET, otherwise a challenge ACK
+ * is sent.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+	INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+	TCPSTAT_INC(tcps_badsyn);
+	if (V_tcp_insecure_syn &&
+	    SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+	    SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+		tp = tcp_drop(tp, ECONNRESET);
+		*ret_val = 1;
+		ctf_do_drop(m, tp);
+	} else {
+		/* Send challenge ACK.
*/
+		tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+		    tp->snd_nxt, TH_ACK);
+		tp->last_ack_sent = tp->rcv_nxt;
+		m = NULL;	/* m went to tcp_respond() above (presumably it took ownership) */
+		*ret_val = 0;
+		ctf_do_drop(m, NULL);	/* both NULL: nothing is freed, nothing is unlocked */
+	}
+}
+
+/*
+ * ctf_ts_check returns 1 for you should not proceed, the state
+ * machine should return. It places in ret_val what should
+ * be returned 1/0 by the caller. The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ *
+ * NOTE(review): no timestamp comparison is done in here, so the
+ * caller presumably only calls this once the segment has failed its
+ * own timestamp test -- confirm against the callers.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+    int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+	if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+		/*
+		 * Invalidate ts_recent. If this segment updates ts_recent,
+		 * the age will be reset later and ts_recent will get a
+		 * valid value. If it does not, setting ts_recent to zero
+		 * will at least satisfy the requirement that zero be placed
+		 * in the timestamp echo reply when ts_recent isn't valid.
+		 * The age isn't reset until we get a valid ts_recent
+		 * because we don't want out-of-order segments to be dropped
+		 * when ts_recent is old.
+		 */
+		tp->ts_recent = 0;
+	} else {
+		/* ts_recent is still fresh: account the segment as a duplicate / PAWS drop. */
+		TCPSTAT_INC(tcps_rcvduppack);
+		TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+		TCPSTAT_INC(tcps_pawsdrop);
+		*ret_val = 0;
+		if (tlen) {
+			/* Carries data: goes through the drop-after-ACK path, which may rewrite *ret_val. */
+			ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+		} else {
+			/* No data: just free m, the lock is kept (tp passed as NULL). */
+			ctf_do_drop(m, NULL);
+		}
+		return (1);
+	}
+	return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+	int32_t win;
+
+	/*
+	 * Calculate amount of space in receive window, and then do TCP
+	 * input processing. Receive window is amount of space in rcv queue,
+	 * but not less than advertised window.
+	 *
+	 * The result is stored in tp->rcv_wnd; a negative sbspace() is
+	 * treated as zero.
+	 */
+	win = sbspace(&so->so_rcv);
+	if (win < 0)
+		win = 0;
+	tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+    int32_t rstreason, int32_t tlen)
+{
+	/*
+	 * Mark the inpcb to be dropped with ETIMEDOUT, send the reset
+	 * and release the inpcb write lock.
+	 *
+	 * NOTE(review): t_inpcb is tested for NULL here but is used
+	 * unconditionally by INP_WUNLOCK() below; one of the two looks
+	 * unnecessary or wrong -- confirm whether it can be NULL.
+	 */
+	if (tp->t_inpcb) {
+		tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+	}
+	tcp_dropwithreset(m, th, tp, tlen, rstreason);
+	INP_WUNLOCK(tp->t_inpcb);
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+	int optlen;
+
+	if (tp->t_flags & TF_NOOPT)
+		return (tp->t_maxseg);
+
+	/*
+	 * Here we have a simplified code from tcp_addoptions(),
+	 * without a proper loop, and having most of paddings hardcoded.
+	 * We only consider fixed options that we would send every
+	 * time I.e. SACK is not considered.
+	 *
+	 * The value returned is t_maxseg minus the space those options
+	 * take, with the option space capped at TCP_MAXOLEN.
+	 */
+#define	PAD(len)	((((len) / 4) + !!((len) % 4)) * 4)	/* round len up to a multiple of 4 */
+	if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+		if (tp->t_flags & TF_RCVD_TSTMP)
+			optlen = TCPOLEN_TSTAMP_APPA;
+		else
+			optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+		if (tp->t_flags & TF_SIGNATURE)
+			optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+	} else {
+		if (tp->t_flags & TF_REQ_TSTMP)
+			optlen = TCPOLEN_TSTAMP_APPA;
+		else
+			optlen = PAD(TCPOLEN_MAXSEG);
+		if (tp->t_flags & TF_REQ_SCALE)
+			optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+		if (tp->t_flags & TF_SIGNATURE)
+			optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+		if (tp->t_flags & TF_SACK_PERMIT)
+			optlen += PAD(TCPOLEN_SACK_PERMITTED);
+	}
+#undef PAD
+	optlen = min(optlen, TCP_MAXOLEN);
+	return (tp->t_maxseg - optlen);
+}
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+	if (tp->t_logstate != TCP_LOG_STATE_OFF) {	/* nothing is done while logging is off */
+		union tcp_log_stackspecific log;
+		struct timeval tv;
+
+		memset(&log, 0, sizeof(log));
+		log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+		log.u_bbr.flex8 = num_sack_blks;
+		/* Up to four blocks are copied into the record, the first into flex1/flex2. */
+		if (num_sack_blks > 0) {
+			log.u_bbr.flex1 = sack_blocks[0].start;
+			log.u_bbr.flex2 = sack_blocks[0].end;
+		}
+		if
(num_sack_blks > 1) {
+			log.u_bbr.flex3 = sack_blocks[1].start;
+			log.u_bbr.flex4 = sack_blocks[1].end;
+		}
+		if (num_sack_blks > 2) {
+			log.u_bbr.flex5 = sack_blocks[2].start;
+			log.u_bbr.flex6 = sack_blocks[2].end;
+		}
+		if (num_sack_blks > 3) {
+			/* No flex fields are used for the fourth block; it goes into applimited/pkts_out. */
+			log.u_bbr.applimited = sack_blocks[3].start;
+			log.u_bbr.pkts_out = sack_blocks[3].end;
+		}
+		TCP_LOG_EVENTP(tp, NULL,
+		    &tp->t_inpcb->inp_socket->so_rcv,
+		    &tp->t_inpcb->inp_socket->so_snd,
+		    TCP_SACK_FILTER_RES, 0,
+		    0, &log, false, &tv);
+	}
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+	/*
+	 * Given a count, decay it by a set percentage. The
+	 * percentage is in thousands i.e. 100% = 1000,
+	 * 19.3% = 193.
+	 *
+	 * Returns count - (count * decay / 1000). The product is
+	 * computed in 64 bits. A decay above 1000 leaves the count
+	 * unchanged.
+	 */
+	uint64_t perc_count, decay_per;
+	uint32_t decayed_count;
+	if (decay > 1000) {
+		/* We don't raise it */
+		return (count);
+	}
+	perc_count = count;
+	decay_per = decay;
+	perc_count *= decay_per;
+	perc_count /= 1000;
+	/*
+	 * So now perc_count holds the
+	 * count decay value.
+	 */
+	decayed_count = count - (uint32_t)perc_count;
+	return(decayed_count);
+}
Index: netinet/tcp_var.h =================================================================== --- netinet/tcp_var.h +++ netinet/tcp_var.h @@ -102,7 +102,8 @@ t_state:4, /* state of this connection */ t_idle_reduce : 1, t_delayed_ack: 7, /* Delayed ack variable */ - bits_spare : 4; + t_fin_is_rst: 1, /* Are fin's treated as resets */ + bits_spare : 3; u_int t_flags; tcp_seq snd_una; /* sent but unacknowledged */ tcp_seq snd_max; /* highest sequence number sent; @@ -271,6 +272,11 @@ void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t); + int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int); + int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *, + struct socket *, struct tcpcb *, + int, int, uint8_t, + int, struct timeval *); void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *, struct socket *, struct tcpcb *, int, int, uint8_t, Index:
sys/mbuf.h =================================================================== --- sys/mbuf.h +++ sys/mbuf.h @@ -407,6 +407,7 @@ #define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically hw-stamped on port (useful for IEEE 1588 and 802.1AS) */ +#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */ #define M_PROTO1 0x00001000 /* protocol-specific */ #define M_PROTO2 0x00002000 /* protocol-specific */