D20834.diff

Index: head/sys/modules/tcp/rack/Makefile
===================================================================
--- head/sys/modules/tcp/rack/Makefile
+++ head/sys/modules/tcp/rack/Makefile
@@ -6,7 +6,7 @@
STACKNAME= rack
KMOD= tcp_${STACKNAME}
-SRCS= rack.c sack_filter.c
+SRCS= rack.c sack_filter.c rack_bbr_common.c
SRCS+= opt_inet.h opt_inet6.h opt_ipsec.h
SRCS+= opt_tcpdebug.h
Index: head/sys/netinet/in_pcb.h
===================================================================
--- head/sys/netinet/in_pcb.h
+++ head/sys/netinet/in_pcb.h
@@ -759,7 +759,9 @@
#define INP_ORIGDSTADDR 0x00000800 /* receive IP dst address/port */
#define INP_CANNOT_DO_ECN 0x00001000 /* The stack does not do ECN */
#define INP_REUSEPORT_LB 0x00002000 /* SO_REUSEPORT_LB option is set */
-
+#define INP_SUPPORTS_MBUFQ 0x00004000 /* Supports the mbuf queue method of LRO */
+#define INP_MBUF_QUEUE_READY 0x00008000 /* The transport is pacing, inputs can be queued */
+#define INP_DONT_SACK_QUEUE 0x00010000 /* If a sack arrives do not wake me */
/*
* Flags passed to in_pcblookup*() functions.
*/
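
These three new bits live on inp_flags2 (the hpts code later in this diff tests them there). A minimal sketch, with a hypothetical helper name that is not part of this change, of how a transport stack might advertise the mbuf-queue input path on a connection:

/*
 * Hypothetical stack-init fragment: tell LRO/hpts that this connection
 * can take the mbuf-queue input path.  Assumes the INP write lock is
 * held by the caller.
 */
static void
example_enable_mbuf_queueing(struct inpcb *inp)
{
	INP_WLOCK_ASSERT(inp);
	inp->inp_flags2 |= INP_SUPPORTS_MBUFQ;	/* stack can drain tp->t_in_pkt */
	/*
	 * While its pacing timer is armed a stack could additionally set
	 * INP_MBUF_QUEUE_READY, and INP_DONT_SACK_QUEUE when a rack timer
	 * is running, so LRO keeps queueing instead of waking the hpts.
	 */
}
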
Index: head/sys/netinet/tcp.h
===================================================================
--- head/sys/netinet/tcp.h
+++ head/sys/netinet/tcp.h
@@ -201,9 +201,8 @@
#define TCP_RACK_TLP_THRESH 1063 /* RACK TLP theshold i.e. srtt+(srtt/N) */
#define TCP_RACK_PKT_DELAY 1064 /* RACK added ms i.e. rack-rtt + reord + N */
#define TCP_RACK_TLP_INC_VAR 1065 /* Does TLP include rtt variance in t-o */
-#define TCP_RACK_SESS_CWV 1066 /* Enable RFC7611 cwnd validation on sess */
#define TCP_BBR_IWINTSO 1067 /* Initial TSO window for BBRs first sends */
-#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer */
+#define TCP_BBR_RECFORCE 1068 /* Enter recovery force out a segment disregard pacer no longer valid */
#define TCP_BBR_STARTUP_PG 1069 /* Startup pacing gain */
#define TCP_BBR_DRAIN_PG 1070 /* Drain pacing gain */
#define TCP_BBR_RWND_IS_APP 1071 /* Rwnd limited is considered app limited */
@@ -211,14 +210,18 @@
#define TCP_BBR_ONE_RETRAN 1073 /* Is only one segment allowed out during retran */
#define TCP_BBR_STARTUP_LOSS_EXIT 1074 /* Do we exit a loss during startup if not 20% incr */
#define TCP_BBR_USE_LOWGAIN 1075 /* lower the gain in PROBE_BW enable */
-#define TCP_BBR_LOWGAIN_THRESH 1076 /* How many cycles do we stay in lowgain */
-#define TCP_BBR_LOWGAIN_HALF 1077 /* Do we halfstep lowgain down */
-#define TCP_BBR_LOWGAIN_FD 1078 /* Do we force a drain when lowgain in place */
+#define TCP_BBR_LOWGAIN_THRESH 1076 /* Unused after 2.3 morphs to TSLIMITS >= 2.3 */
+#define TCP_BBR_TSLIMITS 1076 /* Do we use experimental Timestamp limiting for our algo */
+#define TCP_BBR_LOWGAIN_HALF 1077 /* Unused after 2.3 */
+#define TCP_BBR_PACE_OH 1077 /* Reused in 4.2 for pacing overhead setting */
+#define TCP_BBR_LOWGAIN_FD 1078 /* Unused after 2.3 */
+#define TCP_BBR_HOLD_TARGET 1078 /* For 4.3 on */
#define TCP_BBR_USEDEL_RATE 1079 /* Enable use of delivery rate for loss recovery */
#define TCP_BBR_MIN_RTO 1080 /* Min RTO in milliseconds */
#define TCP_BBR_MAX_RTO 1081 /* Max RTO in milliseconds */
#define TCP_BBR_REC_OVER_HPTS 1082 /* Recovery override htps settings 0/1/3 */
-#define TCP_BBR_UNLIMITED 1083 /* Does BBR, in non-recovery not use cwnd */
+#define TCP_BBR_UNLIMITED 1083 /* Not used before 2.3 and morphs to algorithm >= 2.3 */
+#define TCP_BBR_ALGORITHM 1083 /* What measurement algo does BBR use netflix=0, google=1 */
#define TCP_BBR_DRAIN_INC_EXTRA 1084 /* Does the 3/4 drain target include the extra gain */
#define TCP_BBR_STARTUP_EXIT_EPOCH 1085 /* what epoch gets us out of startup */
#define TCP_BBR_PACE_PER_SEC 1086
@@ -227,17 +230,27 @@
#define TCP_BBR_PACE_SEG_MIN 1089
#define TCP_BBR_PACE_CROSS 1090
#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
-#define TCP_RACK_IDLE_REDUCE_HIGH 1092 /* Reduce the highest cwnd seen to IW on idle */
#define TCP_RACK_MIN_PACE 1093 /* Do we enforce rack min pace time */
#define TCP_RACK_MIN_PACE_SEG 1094 /* If so what is the seg threshould */
+#define TCP_RACK_GP_INCREASE 1094 /* After 4.1 its the GP increase */
#define TCP_RACK_TLP_USE 1095
#define TCP_BBR_ACK_COMP_ALG 1096 /* Not used */
+#define TCP_BBR_TMR_PACE_OH 1096 /* Recycled in 4.2 */
#define TCP_BBR_EXTRA_GAIN 1097
#define TCP_BBR_RACK_RTT_USE 1098 /* what RTT should we use 0, 1, or 2? */
#define TCP_BBR_RETRAN_WTSO 1099
#define TCP_DATA_AFTER_CLOSE 1100
#define TCP_BBR_PROBE_RTT_GAIN 1101
#define TCP_BBR_PROBE_RTT_LEN 1102
+#define TCP_BBR_SEND_IWND_IN_TSO 1103 /* Do we burst out whole iwin size chunks at start? */
+#define TCP_BBR_USE_RACK_CHEAT 1104 /* Do we use the rack cheat for pacing rxt's */
+#define TCP_BBR_HDWR_PACE 1105 /* Enable/disable hardware pacing */
+#define TCP_BBR_UTTER_MAX_TSO 1106 /* Do we enforce an utter max TSO size */
+#define TCP_BBR_EXTRA_STATE 1107 /* Special exit-persist catch up */
+#define TCP_BBR_FLOOR_MIN_TSO 1108 /* The min tso size */
+#define TCP_BBR_MIN_TOPACEOUT 1109 /* Do we suspend pacing until */
+#define TCP_BBR_TSTMP_RAISES 1110 /* Can a timestamp measurement raise the b/w */
+#define TCP_BBR_POLICER_DETECT 1111 /* Turn on/off google mode policer detection */
/* Start of reserved space for third-party user-settable options. */
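
These option numbers are interpreted by the alternate bbr/rack stacks rather than the base stack. A hedged userland sketch (the helper name and lack of error handling are illustrative only) of setting one of them once the bbr stack is attached to the socket:

#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/*
 * Hypothetical fragment: with the bbr stack selected on fd, pick the
 * measurement algorithm (0 = netflix, 1 = google, per the comment on
 * TCP_BBR_ALGORITHM above).
 */
int
example_set_bbr_algorithm(int fd, int algo)
{
	return (setsockopt(fd, IPPROTO_TCP, TCP_BBR_ALGORITHM,
	    &algo, sizeof(algo)));
}
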
Index: head/sys/netinet/tcp_hpts.h
===================================================================
--- head/sys/netinet/tcp_hpts.h
+++ head/sys/netinet/tcp_hpts.h
@@ -45,112 +45,80 @@
/* Number of useconds in a hpts tick */
#define HPTS_TICKS_PER_USEC 10
-#define HPTS_MS_TO_SLOTS(x) (x * 100)
+#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
#define HPTS_MSEC_IN_SEC 1000
#define HPTS_USEC_IN_MSEC 1000
-#define DEFAULT_HPTS_LOG 3072
-/*
- * Log flags consist of
- * 7f 7f 1 1 bits
- * p_cpu | p_num | INPUT_ACTIVE | HPTS_ACTIVE
- *
- * So for example cpu 10, number 10 would with
- * input active would show up as:
- * p_flags = 0001010 0001010 1 0
- * <or>
- * p_flags = 0x142a
- */
-#define HPTS_HPTS_ACTIVE 0x01
-#define HPTS_INPUT_ACTIVE 0x02
-
-#define HPTSLOG_IMMEDIATE 1
-#define HPTSLOG_INSERT_NORMAL 2
-#define HPTSLOG_INSERT_SLEEPER 3
-#define HPTSLOG_SLEEP_AFTER 4
-#define HPTSLOG_SLEEP_BEFORE 5
-#define HPTSLOG_INSERTED 6
-#define HPTSLOG_WAKEUP_HPTS 7
-#define HPTSLOG_SETTORUN 8
-#define HPTSLOG_HPTSI 9
-#define HPTSLOG_TOLONG 10
-#define HPTSLOG_AWAKENS 11
-#define HPTSLOG_TIMESOUT 12
-#define HPTSLOG_SLEEPSET 13
-#define HPTSLOG_WAKEUP_INPUT 14
-#define HPTSLOG_RESCHEDULE 15
-#define HPTSLOG_AWAKE 16
-#define HPTSLOG_INP_DONE 17
-
-struct hpts_log {
- struct inpcb *inp;
- int32_t event;
- uint32_t cts;
- int32_t line;
- uint32_t ticknow;
- uint32_t t_paceslot;
- uint32_t t_hptsreq;
- uint32_t p_curtick;
- uint32_t p_prevtick;
- uint32_t slot_req;
- uint32_t p_on_queue_cnt;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t p_hpts_sleep_time;
- uint16_t p_flags;
- uint8_t p_onhpts;
- uint8_t p_oninput;
- uint8_t is_notempty;
-};
-
struct hpts_diag {
- uint32_t p_hpts_active;
- uint32_t p_nxt_slot;
- uint32_t p_cur_slot;
- uint32_t slot_req;
- uint32_t inp_hptsslot;
- uint32_t slot_now;
- uint32_t have_slept;
- uint32_t hpts_sleep_time;
- uint32_t yet_to_sleep;
- uint32_t need_new_to;
- int32_t co_ret;
- uint8_t p_on_min_sleep;
+ uint32_t p_hpts_active; /* bbr->flex7 x */
+ uint32_t p_nxt_slot; /* bbr->flex1 x */
+ uint32_t p_cur_slot; /* bbr->flex2 x */
+ uint32_t p_prev_slot; /* bbr->delivered */
+ uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t slot_req; /* bbr->flex3 x */
+ uint32_t inp_hptsslot; /* bbr->flex4 x */
+ uint32_t slot_remaining; /* bbr->flex5 x */
+ uint32_t have_slept; /* bbr->epoch x */
+ uint32_t hpts_sleep_time; /* bbr->applimited x */
+ uint32_t yet_to_sleep; /* bbr->lt_epoch x */
+ uint32_t need_new_to; /* bbr->flex6 x */
+ uint32_t wheel_tick; /* bbr->bw_inuse x */
+ uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_cts; /* bbr->rttProp x */
+ int32_t co_ret; /* bbr->pkts_out x */
+ uint32_t p_curtick; /* upper bbr->cur_del_rate */
+ uint32_t p_lasttick; /* lower bbr->cur_del_rate */
+ uint8_t p_on_min_sleep; /* bbr->flex8 x */
};
+/* Magic flags to tell whats cooking on the pacing wheel */
+#define PACE_TMR_DELACK 0x01 /* Delayed ack timer running */
+#define PACE_TMR_RACK 0x02 /* RACK timer running */
+#define PACE_TMR_TLP 0x04 /* TLP timer running */
+#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
+#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
+#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
+#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
+#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+
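
A small hedged helper showing how these bits might be interpreted. The hpts_flags word is a stack-private field and an assumption here; this diff only defines the bit values:

static inline int
example_timer_or_pacing(uint32_t hpts_flags)
{
	if (hpts_flags & PACE_PKT_OUTPUT)
		return (1);	/* output is being paced on the wheel */
	if (hpts_flags & PACE_TMR_MASK)
		return (2);	/* one of the protocol timers is armed */
	return (0);		/* nothing scheduled */
}
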
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
- uint32_t p_hpts_active; /* Flag that says hpts is awake */
- uint32_t p_curtick; /* Current tick in 10 us the hpts is at */
- uint32_t p_prevtick; /* Previous tick in 10 us the hpts ran */
+ uint16_t p_hpts_active; /* Flag that says hpts is awake */
+ uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
+ uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
+ uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
+ uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
* slots that the hpts is running on. */
int32_t p_on_queue_cnt; /* Count on queue in this hpts */
- uint32_t enobuf_cnt;
- uint16_t p_log_at;
+ uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
- p_log_wrapped :1, /* boolean */
- p_on_min_sleep:1; /* boolean */
- uint8_t p_fill;
+ p_on_min_sleep:1, /* boolean */
+ p_avail:6;
+ uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
struct hptsh p_input; /* For the tcp-input runner */
/* Hptsi wheel */
struct hptsh *p_hptss;
- struct hpts_log *p_log;
- uint32_t p_logsize;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
uint32_t hit_no_enobuf;
uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
+ uint32_t overidden_sleep; /* what was overrided by min-sleep for logging */
+ uint32_t saved_lasttick; /* for logging */
+ uint32_t saved_curtick; /* for logging */
+ uint32_t saved_curslot; /* for logging */
+ uint32_t saved_prev_slot; /* for logging */
uint32_t p_delayed_by; /* How much were we delayed by */
/* Cache line 0x80 */
struct sysctl_ctx_list hpts_ctx;
@@ -236,13 +204,9 @@
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line);
#define tcp_queue_to_input_locked(a, b) __tcp_queue_to_input_locked(a, b, __LINE__);
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos);
int
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line);
-#define tcp_queue_to_input(a, b, c, d, e, f, g) __tcp_queue_to_input(a, b, c, d, e, f, g, __LINE__)
+__tcp_queue_to_input(struct inpcb *inp, int32_t line);
+#define tcp_queue_to_input(a) __tcp_queue_to_input(a, __LINE__)
uint16_t tcp_hpts_delayedby(struct inpcb *inp);
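
With tcp_queue_pkt_to_input() gone, the caller (typically the LRO code) is assumed to have already linked the packets onto tp->t_in_pkt, so only the inpcb is handed to hpts. A minimal sketch of the new call pattern, with a hypothetical wrapper name:

static void
example_schedule_queued_input(struct tcpcb *tp)
{
	INP_WLOCK_ASSERT(tp->t_inpcb);
	if (tp->t_in_pkt != NULL)
		tcp_queue_to_input(tp->t_inpcb);	/* schedule the hpts input runner */
}
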
Index: head/sys/netinet/tcp_hpts.c
===================================================================
--- head/sys/netinet/tcp_hpts.c
+++ head/sys/netinet/tcp_hpts.c
@@ -37,7 +37,7 @@
* pacing packets out onto the wire. It can be used in two ways
* by a given TCP stack (and those two methods can be used simultaneously).
*
- * First, and probably the main thing its used by Rack and BBR for, it can
+ * First, and probably the main thing its used by Rack and BBR, it can
* be used to call tcp_output() of a transport stack at some time in the future.
* The normal way this is done is that tcp_output() of the stack schedules
* itself to be called again by calling tcp_hpts_insert(tcpcb, slot). The
@@ -59,42 +59,57 @@
* to prevent output processing until the time alotted has gone by.
* Of course this is a bare bones example and the stack will probably
* have more consideration then just the above.
- *
- * Now the tcp_hpts system will call tcp_output in one of two forms,
- * it will first check to see if the stack as defined a
- * tfb_tcp_output_wtime() function, if so that is the routine it
- * will call, if that function is not defined then it will call the
- * tfb_tcp_output() function. The only difference between these
- * two calls is that the former passes the time in to the function
- * so the function does not have to access the time (which tcp_hpts
- * already has). What these functions do is of course totally up
- * to the individual tcp stack.
- *
+ *
* Now the second function (actually two functions I guess :D)
* the tcp_hpts system provides is the ability to either abort
- * a connection (later) or process input on a connection.
- * Why would you want to do this? To keep processor locality.
+ * a connection (later) or process input on a connection.
+ * Why would you want to do this? To keep processor locality
+ * and/or not have to worry about untangling any recursive
+ * locks. The input function now is hooked to the new LRO
+ * system as well.
*
- * So in order to use the input redirection function the
- * stack changes its tcp_do_segment() routine to instead
- * of process the data call the function:
+ * In order to use the input redirection function the
+ * tcp stack must define an input function for
+ * tfb_do_queued_segments(). This function understands
+ * how to dequeue an array of packets that were input and
+ * knows how to call the correct processing routine.
*
- * tcp_queue_pkt_to_input()
- *
- * You will note that the arguments to this function look
- * a lot like tcp_do_segments's arguments. This function
- * will assure that the tcp_hpts system will
- * call the functions tfb_tcp_hpts_do_segment() from the
- * correct CPU. Note that multiple calls can get pushed
- * into the tcp_hpts system this will be indicated by
- * the next to last argument to tfb_tcp_hpts_do_segment()
- * (nxt_pkt). If nxt_pkt is a 1 then another packet is
- * coming. If nxt_pkt is a 0 then this is the last call
- * that the tcp_hpts system has available for the tcp stack.
+ * Locking in this is important as well so most likely the
+ * stack will need to define the tfb_do_segment_nounlock()
+ * splitting tfb_do_segment() into two parts. The main processing
+ * part that does not unlock the INP and returns a value of 1 or 0.
+ * It returns 0 if all is well and the lock was not released. It
+ * returns 1 if we had to destroy the TCB (a reset received etc).
+ * The remains of tfb_do_segment() then become just a simple call
+ * to the tfb_do_segment_nounlock() function and check the return
+ * code and possibly unlock.
*
- * The other point of the input system is to be able to safely
- * drop a tcp connection without worrying about the recursive
- * locking that may be occuring on the INP_WLOCK. So if
+ * The stack must also set the flag on the INP that it supports this
+ * feature i.e. INP_SUPPORTS_MBUFQ. The LRO code recognizes
+ * this flag as well and will queue packets when it is set.
+ * There are other flags as well INP_MBUF_QUEUE_READY and
+ * INP_DONT_SACK_QUEUE. The first flag tells the LRO code
+ * that we are in the pacer for output so there is no
+ * need to wake up the hpts system to get immediate
+ * input. The second tells the LRO code that its okay
+ * if a SACK arrives you can still defer input and let
+ * the current hpts timer run (this is usually set when
+ * a rack timer is up so we know SACK's are happening
+ * on the connection already and don't want to wakeup yet).
+ *
+ * There is a common function within the rack_bbr_common code
+ * version i.e. ctf_do_queued_segments(). This function
+ * knows how to take the input queue of packets from
+ * tp->t_in_pkts and process them digging out
+ * all the arguments, calling any bpf tap and
+ * calling into tfb_do_segment_nounlock(). The common
+ * function (ctf_do_queued_segments()) requires that
+ * you have defined the tfb_do_segment_nounlock() as
+ * described above.
+ *
+ * The second feature of the input side of hpts is the
+ * dropping of a connection. This is due to the way that
+ * locking may have occured on the INP_WLOCK. So if
* a stack wants to drop a connection it calls:
*
* tcp_set_inp_to_drop(tp, ETIMEDOUT)
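
A compressed sketch of the tfb_do_segment_nounlock() split described in the comment above. The example_ names and the exact argument list are assumptions (modeled on tfb_tcp_do_segment plus the nxt_pkt and timeval arguments the queued path needs), not code from this change:

static int
example_do_segment_nounlock(struct mbuf *m, struct tcphdr *th,
    struct socket *so, struct tcpcb *tp, int drop_hdrlen, int tlen,
    uint8_t iptos, int nxt_pkt, struct timeval *tv)
{
	/* ... all of the real segment processing, INP stays write-locked ... */
	return (0);	/* 1 would mean the TCB was destroyed and the lock dropped */
}

static void
example_do_segment(struct mbuf *m, struct tcphdr *th, struct socket *so,
    struct tcpcb *tp, int drop_hdrlen, int tlen, uint8_t iptos)
{
	struct timeval tv;

	microuptime(&tv);
	/* Direct (non-queued) path: nxt_pkt is 0; unlock only if the TCB survived. */
	if (example_do_segment_nounlock(m, th, so, tp, drop_hdrlen, tlen,
	    iptos, 0, &tv) == 0)
		INP_WUNLOCK(tp->t_inpcb);
}
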
@@ -156,6 +171,7 @@
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
#include <netinet/tcp_hpts.h>
+#include <netinet/tcp_log_buf.h>
#ifdef tcpdebug
#include <netinet/tcp_debug.h>
@@ -168,24 +184,19 @@
MALLOC_DEFINE(M_TCPHPTS, "tcp_hpts", "TCP hpts");
#ifdef RSS
-#include <net/netisr.h>
-#include <net/rss_config.h>
static int tcp_bind_threads = 1;
#else
static int tcp_bind_threads = 2;
#endif
TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-static uint32_t tcp_hpts_logging_size = DEFAULT_HPTS_LOG;
-
-TUNABLE_INT("net.inet.tcp.hpts_logging_sz", &tcp_hpts_logging_size);
-
static struct tcp_hptsi tcp_pace;
+static int hpts_does_tp_logging = 0;
static void tcp_wakehpts(struct tcp_hpts_entry *p);
static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick);
+static void tcp_hptsi(struct tcp_hpts_entry *hpts);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
@@ -204,8 +215,6 @@
} \
} while (0)
-static int32_t logging_on = 0;
-static int32_t hpts_sleep_max = (NUM_OF_HPTSI_SLOTS - 2);
static int32_t tcp_hpts_precision = 120;
struct hpts_domain_info {
@@ -219,44 +228,75 @@
&tcp_hpts_precision, 120,
"Value for PRE() precision of callout");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
- &logging_on, 0,
- "Turn on logging if compiled in");
+counter_u64_t hpts_hopelessly_behind;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+ &hpts_hopelessly_behind,
+ "Number of times hpts could not catch up and was behind hopelessly");
+
counter_u64_t hpts_loops;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
&hpts_loops, "Number of times hpts had to loop to catch up");
+
counter_u64_t back_tosleep;
SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
-static int32_t in_newts_every_tcb = 0;
+counter_u64_t combined_wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tsperpcb, CTLFLAG_RW,
- &in_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for input");
-static int32_t in_ts_percision = 0;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+ &combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, in_tspercision, CTLFLAG_RW,
- &in_ts_percision, 0,
- "Do we use percise timestamp for clients on input");
-static int32_t out_newts_every_tcb = 0;
+counter_u64_t wheel_wrap;
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tsperpcb, CTLFLAG_RW,
- &out_newts_every_tcb, 0,
- "Do we have a new cts every tcb we process for output");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+ &wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
+
static int32_t out_ts_percision = 0;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
&out_ts_percision, 0,
"Do we use a percise timestamp for every output cts");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
+ &hpts_does_tp_logging, 0,
+ "Do we add to any tp that has logging on pacer logs");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, maxsleep, CTLFLAG_RW,
+static int32_t max_pacer_loops = 10;
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
+ &max_pacer_loops, 10,
+ "What is the maximum number of times the pacer will loop trying to catch up");
+
+#define HPTS_MAX_SLEEP_ALLOWED (NUM_OF_HPTSI_SLOTS/2)
+
+static uint32_t hpts_sleep_max = HPTS_MAX_SLEEP_ALLOWED;
+
+
+static int
+sysctl_net_inet_tcp_hpts_max_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = hpts_sleep_max;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ (new > HPTS_MAX_SLEEP_ALLOWED))
+ error = EINVAL;
+ else
+ hpts_sleep_max = new;
+ }
+ return (error);
+}
+
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
+ CTLTYPE_UINT | CTLFLAG_RW,
&hpts_sleep_max, 0,
- "The maximum time the hpts will sleep <1 - 254>");
+ &sysctl_net_inet_tcp_hpts_max_sleep, "IU",
+ "Maximum time hpts will sleep");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
&tcp_min_hptsi_time, 0,
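
A hedged userland sketch of driving the new maxsleep handler. The OID string assumes the usual net.inet.tcp.hpts prefix for this sysctl tree; values outside NUM_OF_HPTSI_SLOTS/4 .. NUM_OF_HPTSI_SLOTS/2 are rejected with EINVAL as shown above:

#include <sys/types.h>
#include <sys/sysctl.h>

int
example_set_hpts_maxsleep(unsigned int slots)
{
	/* Write-only update; the kernel handler range-checks the new value. */
	return (sysctlbyname("net.inet.tcp.hpts.maxsleep",
	    NULL, NULL, &slots, sizeof(slots)));
}
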
@@ -267,55 +307,35 @@
"Do we have the callout call directly to the hpts?");
static void
-__tcp_hpts_log_it(struct tcp_hpts_entry *hpts, struct inpcb *inp, int event, uint32_t slot,
- uint32_t ticknow, int32_t line)
+tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
+ int ticks_to_run, int idx)
{
- struct hpts_log *pl;
-
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_log == NULL)
- return;
- pl = &hpts->p_log[hpts->p_log_at];
- hpts->p_log_at++;
- if (hpts->p_log_at >= hpts->p_logsize) {
- hpts->p_log_at = 0;
- hpts->p_log_wrapped = 1;
- }
- pl->inp = inp;
- if (inp) {
- pl->t_paceslot = inp->inp_hptsslot;
- pl->t_hptsreq = inp->inp_hpts_request;
- pl->p_onhpts = inp->inp_in_hpts;
- pl->p_oninput = inp->inp_in_input;
- } else {
- pl->t_paceslot = 0;
- pl->t_hptsreq = 0;
- pl->p_onhpts = 0;
- pl->p_oninput = 0;
- }
- pl->is_notempty = 1;
- pl->event = event;
- pl->line = line;
- pl->cts = tcp_get_usecs(NULL);
- pl->p_curtick = hpts->p_curtick;
- pl->p_prevtick = hpts->p_prevtick;
- pl->p_on_queue_cnt = hpts->p_on_queue_cnt;
- pl->ticknow = ticknow;
- pl->slot_req = slot;
- pl->p_nxt_slot = hpts->p_nxt_slot;
- pl->p_cur_slot = hpts->p_cur_slot;
- pl->p_hpts_sleep_time = hpts->p_hpts_sleep_time;
- pl->p_flags = (hpts->p_cpu & 0x7f);
- pl->p_flags <<= 7;
- pl->p_flags |= (hpts->p_num & 0x7f);
- pl->p_flags <<= 2;
- if (hpts->p_hpts_active) {
- pl->p_flags |= HPTS_HPTS_ACTIVE;
- }
+ union tcp_log_stackspecific log;
+
+ memset(&log.u_bbr, 0, sizeof(log.u_bbr));
+ log.u_bbr.flex1 = hpts->p_nxt_slot;
+ log.u_bbr.flex2 = hpts->p_cur_slot;
+ log.u_bbr.flex3 = hpts->p_prev_slot;
+ log.u_bbr.flex4 = idx;
+ log.u_bbr.flex5 = hpts->p_curtick;
+ log.u_bbr.flex6 = hpts->p_on_queue_cnt;
+ log.u_bbr.use_lt_bw = 1;
+ log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.applimited = hpts->overidden_sleep;
+ log.u_bbr.delivered = hpts->saved_curtick;
+ log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
+ log.u_bbr.epoch = hpts->saved_curslot;
+ log.u_bbr.lt_epoch = hpts->saved_prev_slot;
+ log.u_bbr.pkts_out = hpts->p_delayed_by;
+ log.u_bbr.lost = hpts->p_hpts_sleep_time;
+ log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ BBR_LOG_HPTSDIAG, 0,
+ 0, &log, false, tv);
}
-#define tcp_hpts_log_it(a, b, c, d, e) __tcp_hpts_log_it(a, b, c, d, e, __LINE__)
-
static void
hpts_timeout_swi(void *arg)
{
@@ -347,12 +367,6 @@
/* We are not on the hpts? */
panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
}
- if (TAILQ_EMPTY(head) &&
- (hpts->p_on_queue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p hpts bucket empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_queue_cnt);
- }
#endif
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
@@ -456,58 +470,13 @@
in_pcbref(inp);
}
-static int
-sysctl_tcp_hpts_log(SYSCTL_HANDLER_ARGS)
-{
- struct tcp_hpts_entry *hpts;
- size_t sz;
- int32_t logging_was, i;
- int32_t error = 0;
-
- /*
- * HACK: Turn off logging so no locks are required this really needs
- * a memory barrier :)
- */
- logging_was = logging_on;
- logging_on = 0;
- if (!req->oldptr) {
- /* How much? */
- sz = 0;
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- sz += (sizeof(struct hpts_log) * hpts->p_logsize);
- }
- error = SYSCTL_OUT(req, 0, sz);
- } else {
- for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
- hpts = tcp_pace.rp_ent[i];
- if (hpts->p_log == NULL)
- continue;
- if (hpts->p_log_wrapped)
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- else
- sz = (sizeof(struct hpts_log) * hpts->p_log_at);
- error = SYSCTL_OUT(req, hpts->p_log, sz);
- }
- }
- logging_on = logging_was;
- return error;
-}
-
-SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, log, CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE,
- 0, 0, sysctl_tcp_hpts_log, "A", "tcp hptsi log");
-
-
static void
tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -515,10 +484,9 @@
tcp_wakeinput(struct tcp_hpts_entry *hpts)
{
HPTS_MTX_ASSERT(hpts);
- swi_sched(hpts->ie_cookie, 0);
- if (hpts->p_hpts_active == 2) {
- /* Rare sleeping on a ENOBUF */
- wakeup_one(hpts);
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
}
}
@@ -648,8 +616,8 @@
* Valid values in the flags are
* HPTS_REMOVE_OUTPUT - remove from the output of the hpts.
* HPTS_REMOVE_INPUT - remove from the input of the hpts.
- * Note that you can or both values together and get two
- * actions.
+ * Note that you can use one or both values together
+ * and get two actions.
*/
void
__tcp_hpts_remove(struct inpcb *inp, int32_t flags, int32_t line)
@@ -670,53 +638,198 @@
}
static inline int
-hpts_tick(struct tcp_hpts_entry *hpts, int32_t plus)
+hpts_tick(uint32_t wheel_tick, uint32_t plus)
{
- return ((hpts->p_prevtick + plus) % NUM_OF_HPTSI_SLOTS);
+ /*
+ * Given a slot on the wheel, what slot
+ * is that plus ticks out?
+ */
+ KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
+ return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
}
+static inline int
+tick_to_wheel(uint32_t cts_in_wticks)
+{
+ /*
+ * Given a timestamp in wheel ticks (10usec inc's)
+ * map it to our limited space wheel.
+ */
+ return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
+}
+
+static inline int
+hpts_ticks_diff(int prev_tick, int tick_now)
+{
+ /*
+ * Given two ticks that are someplace
+ * on our wheel. How far are they apart?
+ */
+ if (tick_now > prev_tick)
+ return (tick_now - prev_tick);
+ else if (tick_now == prev_tick)
+ /*
+ * Special case, same means we can go all of our
+ * wheel less one slot.
+ */
+ return (NUM_OF_HPTSI_SLOTS - 1);
+ else
+ return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+}
+
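
To make the wheel arithmetic above concrete, a hypothetical set of sanity checks, assuming NUM_OF_HPTSI_SLOTS is 102400 (the 1.024 second wheel of 10 usec ticks that the wheel-wrap comment in tcp_hptsi() below refers to):

static void
example_wheel_math_checks(void)
{
	/* Adding past the end of the wheel wraps back to the front. */
	KASSERT(hpts_tick(102398, 5) == 3, ("wrap failed"));
	/* A raw 10 usec timestamp is folded onto the wheel by modulo. */
	KASSERT(tick_to_wheel(205000) == 200, ("fold failed"));
	/* Distance is measured forward, across the wrap if needed. */
	KASSERT(hpts_ticks_diff(102398, 3) == 5, ("diff failed"));
	/* Identical slots mean "almost a full wheel away". */
	KASSERT(hpts_ticks_diff(7, 7) == NUM_OF_HPTSI_SLOTS - 1, ("same-slot rule"));
}
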
+/*
+ * Given a tick on the wheel that is the current time
+ * mapped to the wheel (wheel_tick), what is the maximum
+ * distance forward that can be obtained without
+ * wrapping past either prev_tick or running_tick
+ * depending on the htps state? Also if passed
+ * a uint32_t *, fill it with the tick location.
+ *
+ * Note if you do not give this function the current
+ * time (that you think it is) mapped to the wheel
+ * then the results will not be what you expect and
+ * could lead to invalid inserts.
+ */
+static inline int32_t
+max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+{
+ uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+
+ if ((hpts->p_hpts_active == 1) &&
+ (hpts->p_wheel_complete == 0)) {
+ end_tick = hpts->p_runningtick;
+ /* Back up one tick */
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ } else {
+ /*
+ * For the case where we are
+ * not active, or we have
+ * completed the pass over
+ * the wheel, we can use the
+ * prev tick and subtract one from it. This puts us
+ * as far out as possible on the wheel.
+ */
+ end_tick = hpts->p_prev_slot;
+ if (end_tick == 0)
+ end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ else
+ end_tick--;
+ if (target_tick)
+ *target_tick = end_tick;
+ /*
+ * Now we have close to the full wheel left minus the
+ * time it has been since the pacer went to sleep. Note
+ * that wheel_tick, passed in, should be the current time
+ * from the perspective of the caller, mapped to the wheel.
+ */
+ if (hpts->p_prev_slot != wheel_tick)
+ dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ else
+ dis_to_travel = 1;
+ /*
+ * dis_to_travel in this case is the space from when the
+ * pacer stopped (p_prev_slot) and where our wheel_tick
+ * is now. To know how many slots we can put it in we
+ * subtract from the wheel size. We would not want
+ * to place something after p_prev_slot or it will
+ * get ran too soon.
+ */
+ return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
+ }
+ /*
+ * So how many slots are open between p_runningtick -> p_cur_slot
+ * that is what is currently un-available for insertion. Special
+ * case when we are at the last slot, this gets 1, so that
+ * the answer to how many slots are available is all but 1.
+ */
+ if (hpts->p_runningtick == hpts->p_cur_slot)
+ dis_to_travel = 1;
+ else
+ dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ /*
+ * How long has the pacer been running?
+ */
+ if (hpts->p_cur_slot != wheel_tick) {
+ /* The pacer is a bit late */
+ pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ } else {
+ /* The pacer is right on time, now == pacers start time */
+ pacer_to_now = 0;
+ }
+ /*
+ * To get the number left we can insert into we simply
+ * subract the distance the pacer has to run from how
+ * many slots there are.
+ */
+ avail_on_wheel = NUM_OF_HPTSI_SLOTS - dis_to_travel;
+ /*
+ * Now how many of those we will eat due to the pacer's
+ * time (p_cur_slot) of start being behind the
+ * real time (wheel_tick)?
+ */
+ if (avail_on_wheel <= pacer_to_now) {
+ /*
+ * Wheel wrap, we can't fit on the wheel, that
+ * is unusual the system must be way overloaded!
+ * Insert into the assured tick, and return special
+ * "0".
+ */
+ counter_u64_add(combined_wheel_wrap, 1);
+ *target_tick = hpts->p_nxt_slot;
+ return (0);
+ } else {
+ /*
+ * We know how many slots are open
+ * on the wheel (the reverse of what
+ * is left to run. Take away the time
+ * the pacer started to now (wheel_tick)
+ * and that tells you how many slots are
+ * open that can be inserted into that won't
+ * be touched by the pacer until later.
+ */
+ return (avail_on_wheel - pacer_to_now);
+ }
+}
+
static int
tcp_queue_to_hpts_immediate_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line, int32_t noref)
{
- int32_t need_wake = 0;
- uint32_t ticknow = 0;
-
+ uint32_t need_wake = 0;
+
HPTS_MTX_ASSERT(hpts);
if (inp->inp_in_hpts == 0) {
/* Ok we need to set it on the hpts in the current slot */
- if (hpts->p_hpts_active == 0) {
- /* A sleeping hpts we want in next slot to run */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, 0,
- hpts_tick(hpts, 1));
- }
- inp->inp_hptsslot = hpts_tick(hpts, 1);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEP_BEFORE, 1, ticknow);
- }
- need_wake = 1;
+ inp->inp_hpts_request = 0;
+ if ((hpts->p_hpts_active == 0) ||
+ (hpts->p_wheel_complete)) {
+ /*
+ * A sleeping hpts we want in next slot to run
+ * note that in this state p_prev_slot == p_cur_slot
+ */
+ inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
+ need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
/*
+ * The hpts system is running and the caller
+ * was awoken by the hpts system.
* We can't allow you to go into the same slot we
- * are in. We must put you out.
+ * are in (we don't want a loop :-D).
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_cur_slot;
+ inp->inp_hptsslot = hpts->p_runningtick;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- inp->inp_hpts_request = 0;
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_IMMEDIATE, 0, 0);
- }
if (need_wake) {
/*
* Activate the hpts if it is sleeping and its
* timeout is not 1.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_HPTS, 0, ticknow);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
}
@@ -737,141 +850,129 @@
return (ret);
}
+#ifdef INVARIANTS
static void
-tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, uint32_t cts, int32_t line,
- struct hpts_diag *diag, int32_t noref)
+check_if_slot_would_be_wrong(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t inp_hptsslot, int line)
{
- int32_t need_new_to = 0;
- int32_t need_wakeup = 0;
- uint32_t largest_slot;
- uint32_t ticknow = 0;
- uint32_t slot_calc;
+ /*
+ * Sanity checks for the pacer with invariants
+ * on insert.
+ */
+ if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
+ panic("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot);
+ if ((hpts->p_hpts_active) &&
+ (hpts->p_wheel_complete == 0)) {
+ /*
+ * If the pacer is processing a arc
+ * of the wheel, we need to make
+ * sure we are not inserting within
+ * that arc.
+ */
+ int distance, yet_to_run;
+ distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
+ if (hpts->p_runningtick != hpts->p_cur_slot)
+ yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ else
+ yet_to_run = 0; /* processing last slot */
+ if (yet_to_run > distance) {
+ panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningtick, hpts->p_cur_slot);
+ }
+ }
+}
+#endif
+
+static void
+tcp_hpts_insert_locked(struct tcp_hpts_entry *hpts, struct inpcb *inp, uint32_t slot, int32_t line,
+ struct hpts_diag *diag, struct timeval *tv)
+{
+ uint32_t need_new_to = 0;
+ uint32_t wheel_cts, last_tick;
+ int32_t wheel_tick, maxticks;
+ int8_t need_wakeup = 0;
+
HPTS_MTX_ASSERT(hpts);
if (diag) {
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
+ diag->p_prev_slot = hpts->p_prev_slot;
+ diag->p_runningtick = hpts->p_runningtick;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
+ diag->p_curtick = hpts->p_curtick;
+ diag->p_lasttick = hpts->p_lasttick;
diag->slot_req = slot;
+ diag->p_on_min_sleep = hpts->p_on_min_sleep;
+ diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((inp->inp_in_hpts == 0) || noref) {
- inp->inp_hpts_request = slot;
+ if (inp->inp_in_hpts == 0) {
if (slot == 0) {
/* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, noref);
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
return;
}
- if (hpts->p_hpts_active) {
- /*
- * Its slot - 1 since nxt_slot is the next tick that
- * will go off since the hpts is awake
- */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_NORMAL, slot, 0);
- }
- /*
- * We want to make sure that we don't place a inp in
- * the range of p_cur_slot <-> p_nxt_slot. If we
- * take from p_nxt_slot to the end, plus p_cur_slot
- * and then take away 2, we will know how many is
- * the max slots we can use.
- */
- if (hpts->p_nxt_slot > hpts->p_cur_slot) {
- /*
- * Non-wrap case nxt_slot <-> cur_slot we
- * don't want to land in. So the diff gives
- * us what is taken away from the number of
- * slots.
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_tick = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
+ if (diag) {
+ diag->wheel_tick = wheel_tick;
+ diag->maxticks = maxticks;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxticks == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+ * probably does not hurt because the pacer
+ * if its true is so far behind we will be
+ * > 1second late calling anyway.
*/
- largest_slot = NUM_OF_HPTSI_SLOTS - (hpts->p_nxt_slot - hpts->p_cur_slot);
- } else if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- } else {
- /*
- * Wrap case so the diff gives us the number
- * of slots that we can land in.
- */
- largest_slot = hpts->p_cur_slot - hpts->p_nxt_slot;
+ slot--;
}
- /*
- * We take away two so we never have a problem (20
- * usec's) out of 1024000 usecs
- */
- largest_slot -= 2;
- if (inp->inp_hpts_request > largest_slot) {
- /*
- * Restrict max jump of slots and remember
- * leftover
- */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* This one will run when we hit it */
- inp->inp_hpts_request = 0;
- }
- if (hpts->p_nxt_slot == hpts->p_cur_slot)
- slot_calc = (hpts->p_nxt_slot + slot) % NUM_OF_HPTSI_SLOTS;
- else
- slot_calc = (hpts->p_nxt_slot + slot - 1) % NUM_OF_HPTSI_SLOTS;
- if (slot_calc == hpts->p_cur_slot) {
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request = slot;
+ } else if (maxticks >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxticks;
+ inp->inp_hptsslot = last_tick;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
#ifdef INVARIANTS
- /* TSNH */
- panic("Hpts:%p impossible slot calculation slot_calc:%u slot:%u largest:%u\n",
- hpts, slot_calc, slot, largest_slot);
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
#endif
- if (slot_calc)
- slot_calc--;
- else
- slot_calc = NUM_OF_HPTSI_SLOTS - 1;
- }
- inp->inp_hptsslot = slot_calc;
- if (diag) {
- diag->inp_hptsslot = inp->inp_hptsslot;
- }
- } else {
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
/*
- * The hpts is sleeping, we need to figure out where
+ * The hpts is sleeping and not on a minimum
+ * sleep time, we need to figure out where
* it will wake up at and if we need to reschedule
* its time-out.
*/
uint32_t have_slept, yet_to_sleep;
- uint32_t slot_now;
- struct timeval tv;
- ticknow = tcp_gethptstick(&tv);
- slot_now = ticknow % NUM_OF_HPTSI_SLOTS;
- /*
- * The user wants to be inserted at (slot_now +
- * slot) % NUM_OF_HPTSI_SLOTS, so lets set that up.
- */
- largest_slot = NUM_OF_HPTSI_SLOTS - 2;
- if (inp->inp_hpts_request > largest_slot) {
- /* Adjust the residual in inp_hpts_request */
- slot = largest_slot;
- inp->inp_hpts_request -= largest_slot;
- } else {
- /* No residual it all fits */
- inp->inp_hpts_request = 0;
- }
- inp->inp_hptsslot = (slot_now + slot) % NUM_OF_HPTSI_SLOTS;
- if (diag) {
- diag->slot_now = slot_now;
- diag->inp_hptsslot = inp->inp_hptsslot;
- diag->p_on_min_sleep = hpts->p_on_min_sleep;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERT_SLEEPER, slot, ticknow);
- }
/* Now do we need to restart the hpts's timer? */
- if (TSTMP_GT(ticknow, hpts->p_curtick))
- have_slept = ticknow - hpts->p_curtick;
- else
- have_slept = 0;
- if (have_slept < hpts->p_hpts_sleep_time) {
- /* This should be what happens */
+ have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (have_slept < hpts->p_hpts_sleep_time)
yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- } else {
+ else {
/* We are over-due */
yet_to_sleep = 0;
need_wakeup = 1;
@@ -879,29 +980,22 @@
if (diag) {
diag->have_slept = have_slept;
diag->yet_to_sleep = yet_to_sleep;
- diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if ((hpts->p_on_min_sleep == 0) && (yet_to_sleep > slot)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * We need to reschedule the hptss time-out.
+ * We need to reschedule the hpts's time-out.
*/
hpts->p_hpts_sleep_time = slot;
need_new_to = slot * HPTS_TICKS_PER_USEC;
}
}
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_INSERTED, slot, ticknow);
- }
/*
* Now how far is the hpts sleeping to? if active is 1, its
* up and ticking we do nothing, otherwise we may need to
* reschedule its callout if need_new_to is set from above.
*/
if (need_wakeup) {
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_RESCHEDULE, 1, 0);
- }
hpts->p_direct_wake = 1;
tcp_wakehpts(hpts);
if (diag) {
@@ -944,9 +1038,10 @@
}
uint32_t
-tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag){
+tcp_hpts_insert_diag(struct inpcb *inp, uint32_t slot, int32_t line, struct hpts_diag *diag)
+{
struct tcp_hpts_entry *hpts;
- uint32_t slot_on, cts;
+ uint32_t slot_on;
struct timeval tv;
/*
@@ -956,12 +1051,8 @@
*/
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
- if (in_ts_percision)
- microuptime(&tv);
- else
- getmicrouptime(&tv);
- cts = tcp_tv_to_usectick(&tv);
- tcp_hpts_insert_locked(hpts, inp, slot, cts, line, diag, 0);
+ microuptime(&tv);
+ tcp_hpts_insert_locked(hpts, inp, slot, line, diag, &tv);
slot_on = hpts->p_nxt_slot;
mtx_unlock(&hpts->p_mtx);
return (slot_on);
@@ -971,7 +1062,6 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
-
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -986,9 +1076,6 @@
/*
* Activate the hpts if it is sleeping.
*/
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_WAKEUP_INPUT, 0, 0);
- }
retval = 2;
hpts->p_direct_wake = 1;
tcp_wakeinput(hpts);
@@ -1001,36 +1088,14 @@
return (retval);
}
-void
-tcp_queue_pkt_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos)
-{
- /* Setup packet for input first */
- INP_WLOCK_ASSERT(tp->t_inpcb);
- m->m_pkthdr.pace_thoff = (uint16_t) ((caddr_t)th - mtod(m, caddr_t));
- m->m_pkthdr.pace_tlen = (uint16_t) tlen;
- m->m_pkthdr.pace_drphdrlen = drop_hdrlen;
- m->m_pkthdr.pace_tos = iptos;
- m->m_pkthdr.pace_lock = (curthread->td_epochnest != 0);
- if (tp->t_in_pkt == NULL) {
- tp->t_in_pkt = m;
- tp->t_tail_pkt = m;
- } else {
- tp->t_tail_pkt->m_nextpkt = m;
- tp->t_tail_pkt = m;
- }
-}
-
-
int32_t
-__tcp_queue_to_input(struct tcpcb *tp, struct mbuf *m, struct tcphdr *th,
- int32_t tlen, int32_t drop_hdrlen, uint8_t iptos, int32_t line){
+__tcp_queue_to_input(struct inpcb *inp, int line)
+{
struct tcp_hpts_entry *hpts;
int32_t ret;
- tcp_queue_pkt_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
- hpts = tcp_input_lock(tp->t_inpcb);
- ret = __tcp_queue_to_input_locked(tp->t_inpcb, hpts, line);
+ hpts = tcp_input_lock(inp);
+ ret = __tcp_queue_to_input_locked(inp, hpts, line);
mtx_unlock(&hpts->p_mtx);
return (ret);
}
@@ -1132,6 +1197,25 @@
#endif
}
+static void
+tcp_drop_in_pkts(struct tcpcb *tp)
+{
+ struct mbuf *m, *n;
+
+ m = tp->t_in_pkt;
+ if (m)
+ n = m->m_nextpkt;
+ else
+ n = NULL;
+ tp->t_in_pkt = NULL;
+ while (m) {
+ m_freem(m);
+ m = n;
+ if (m)
+ n = m->m_nextpkt;
+ }
+}
+
/*
* Do NOT try to optimize the processing of inp's
* by first pulling off all the inp's into a temporary
@@ -1142,7 +1226,7 @@
* but then while you were processing one of the inp's
* some other one that you switch will get a new
* packet on the different CPU. It will insert it
- * on the new hptss input list. Creating a temporary
+ * on the new hpts's input list. Creating a temporary
* link in the inp will not fix it either, since
* the other hpts will be doing the same thing and
* you will both end up using the temporary link.
@@ -1155,16 +1239,18 @@
static void
tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv)
{
- struct mbuf *m, *n;
struct tcpcb *tp;
struct inpcb *inp;
uint16_t drop_reason;
int16_t set_cpu;
uint32_t did_prefetch = 0;
- int32_t ti_locked = TI_UNLOCKED;
+ int dropped;
struct epoch_tracker et;
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
while ((inp = TAILQ_FIRST(&hpts->p_input)) != NULL) {
HPTS_MTX_ASSERT(hpts);
hpts_sane_input_remove(hpts, inp, 0);
@@ -1177,26 +1263,22 @@
drop_reason = inp->inp_hpts_drop_reas;
inp->inp_in_input = 0;
mtx_unlock(&hpts->p_mtx);
- CURVNET_SET(inp->inp_vnet);
- if (drop_reason) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else {
- ti_locked = TI_UNLOCKED;
- }
INP_WLOCK(inp);
+#ifdef VIMAGE
+ CURVNET_SET(inp->inp_vnet);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out:
hpts->p_inp = NULL;
- if (ti_locked == TI_RLOCKED) {
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- }
if (in_pcbrele_wlocked(inp) == 0) {
INP_WUNLOCK(inp);
}
- ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
continue;
}
@@ -1206,26 +1288,17 @@
}
if (drop_reason) {
/* This tcb is being destroyed for drop_reason */
- m = tp->t_in_pkt;
- if (m)
- n = m->m_nextpkt;
- else
- n = NULL;
- tp->t_in_pkt = NULL;
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
+ tcp_drop_in_pkts(tp);
tp = tcp_drop(tp, drop_reason);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
if (tp == NULL) {
INP_WLOCK(inp);
}
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
continue;
}
@@ -1246,220 +1319,184 @@
*/
tcp_set_hpts(inp);
}
- m = tp->t_in_pkt;
- n = NULL;
- if (m != NULL &&
- (m->m_pkthdr.pace_lock == TI_RLOCKED ||
- tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- m = tp->t_in_pkt;
- }
- if (in_newts_every_tcb) {
- if (in_ts_percision)
- microuptime(tv);
- else
- getmicrouptime(tv);
- }
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- /* Any input work to do, if so do it first */
- if ((m != NULL) && (m == tp->t_in_pkt)) {
- struct tcphdr *th;
- int32_t tlen, drop_hdrlen, nxt_pkt;
- uint8_t iptos;
-
- n = m->m_nextpkt;
- tp->t_in_pkt = tp->t_tail_pkt = NULL;
- while (m) {
- th = (struct tcphdr *)(mtod(m, caddr_t)+m->m_pkthdr.pace_thoff);
- tlen = m->m_pkthdr.pace_tlen;
- drop_hdrlen = m->m_pkthdr.pace_drphdrlen;
- iptos = m->m_pkthdr.pace_tos;
- m->m_nextpkt = NULL;
- if (n)
- nxt_pkt = 1;
- else
- nxt_pkt = 0;
- inp->inp_input_calls = 1;
- if (tp->t_fb->tfb_tcp_hpts_do_segment) {
- /* Use the hpts specific do_segment */
- (*tp->t_fb->tfb_tcp_hpts_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos, nxt_pkt, tv);
- } else {
- /* Use the default do_segment */
- (*tp->t_fb->tfb_tcp_do_segment) (m, th, inp->inp_socket,
- tp, drop_hdrlen,
- tlen, iptos);
- }
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- /*
- * Do segment returns unlocked we need the
- * lock again but we also need some kasserts
- * here.
- */
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
- INP_UNLOCK_ASSERT(inp);
- m = n;
- if (m)
- n = m->m_nextpkt;
- if (m != NULL &&
- m->m_pkthdr.pace_lock == TI_RLOCKED) {
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- ti_locked = TI_RLOCKED;
- } else
- ti_locked = TI_UNLOCKED;
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if (inp->inp_in_input)
+ tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
+ dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (dropped) {
+ /* Re-acquire the wlock so we can release the reference */
INP_WLOCK(inp);
- /*
- * Since we have an opening here we must
- * re-check if the tcb went away while we
- * were getting the lock(s).
- */
- if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
- (inp->inp_flags2 & INP_FREED)) {
- while (m) {
- m_freem(m);
- m = n;
- if (m)
- n = m->m_nextpkt;
- }
- goto out;
- }
- /*
- * Now that we hold the INP lock, check if
- * we need to upgrade our lock.
- */
- if (ti_locked == TI_UNLOCKED &&
- (tp->t_state != TCPS_ESTABLISHED)) {
- ti_locked = TI_RLOCKED;
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
- }
- } /** end while(m) */
- } /** end if ((m != NULL) && (m == tp->t_in_pkt)) */
+ }
+ } else if (tp->t_in_pkt) {
+ /*
+ * We reach here only if we had a
+ * stack that supported INP_SUPPORTS_MBUFQ
+ * and then somehow switched to a stack that
+ * does not. The packets are basically stranded
+ * and would hang with the connection until
+ * cleanup without this code. Its not the
+ * best way but I know of no other way to
+ * handle it since the stack needs functions
+ * it does not have to handle queued packets.
+ */
+ tcp_drop_in_pkts(tp);
+ }
if (in_pcbrele_wlocked(inp) == 0)
INP_WUNLOCK(inp);
- if (ti_locked == TI_RLOCKED)
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
- INP_INFO_WUNLOCK_ASSERT(&V_tcbinfo);
INP_UNLOCK_ASSERT(inp);
- ti_locked = TI_UNLOCKED;
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ CURVNET_RESTORE();
+#endif
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
- CURVNET_RESTORE();
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ INP_INFO_UNLOCK_ASSERT(&V_tcbinfo);
+#endif
}
-static int
-tcp_hpts_est_run(struct tcp_hpts_entry *hpts)
-{
- int32_t ticks_to_run;
-
- if (hpts->p_prevtick && (SEQ_GT(hpts->p_curtick, hpts->p_prevtick))) {
- ticks_to_run = hpts->p_curtick - hpts->p_prevtick;
- if (ticks_to_run >= (NUM_OF_HPTSI_SLOTS - 1)) {
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 2;
- }
- } else {
- if (hpts->p_prevtick == hpts->p_curtick) {
- /* This happens when we get woken up right away */
- return (-1);
- }
- ticks_to_run = 1;
- }
- /* Set in where we will be when we catch up */
- hpts->p_nxt_slot = (hpts->p_cur_slot + ticks_to_run) % NUM_OF_HPTSI_SLOTS;
- if (hpts->p_nxt_slot == hpts->p_cur_slot) {
- panic("Impossible math -- hpts:%p p_nxt_slot:%d p_cur_slot:%d ticks_to_run:%d",
- hpts, hpts->p_nxt_slot, hpts->p_cur_slot, ticks_to_run);
- }
- return (ticks_to_run);
-}
-
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts, struct timeval *ctick)
+tcp_hptsi(struct tcp_hpts_entry *hpts)
{
+ struct epoch_tracker et;
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error, tick_now, interum_tick;
+ int32_t ticks_to_run, i, error;
int32_t paced_cnt = 0;
+ int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
- uint32_t cts;
+ int32_t wrap_loop_cnt = 0;
int16_t set_cpu;
HPTS_MTX_ASSERT(hpts);
- hpts->p_curtick = tcp_tv_to_hptstick(ctick);
- cts = tcp_tv_to_usectick(ctick);
- memcpy(&tv, ctick, sizeof(struct timeval));
- hpts->p_cur_slot = hpts_tick(hpts, 1);
+ /* record previous info for any logging */
+ hpts->saved_lasttick = hpts->p_lasttick;
+ hpts->saved_curtick = hpts->p_curtick;
+ hpts->saved_curslot = hpts->p_cur_slot;
+ hpts->saved_prev_slot = hpts->p_prev_slot;
- /* Figure out if we had missed ticks */
+ hpts->p_lasttick = hpts->p_curtick;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((hpts->p_on_queue_cnt == 0) ||
+ (hpts->p_lasttick == hpts->p_curtick)) {
+ /*
+ * No time has yet passed,
+ * or nothing to do.
+ */
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ goto no_run;
+ }
again:
+ hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = tcp_hpts_est_run(hpts);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
+ ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ (hpts->p_on_queue_cnt != 0)) {
+ /*
+ * Wheel wrap is occurring, basically we
+ * are behind and the distance between
+ * run's has spread so much it has exceeded
+ * the time on the wheel (1.024 seconds). This
+ * is ugly and should NOT be happening. We
+ * need to run the entire wheel. We last processed
+ * p_prev_slot, so that needs to be the last slot
+ * we run. The next slot after that should be our
+ * reserved first slot for new, and then starts
+ * the running position. Now the problem is the
+ * reserved "not to yet" place does not exist
+ * and there may be inp's in there that need
+ * running. We can merge those into the
+ * first slot at the head.
+ */
+ wrap_loop_cnt++;
+ hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ /*
+ * Adjust p_cur_slot to be where we are starting from
+ * hopefully we will catch up (fat chance if something
+ * is broken this bad :( )
+ */
+ hpts->p_cur_slot = hpts->p_prev_slot;
+ /*
+ * The next slot has guys to run too, and that would
+ * be where we would normally start, lets move them into
+ * the next slot (p_prev_slot + 2) so that we will
+ * run them, the extra 10usecs of late (by being
+ * put behind) does not really matter in this situation.
+ */
+#ifdef INVARIANTS
+ /*
+ * To prevent a panic we need to update the inpslot to the
+ * new location. This is safe since it takes both the
+ * INP lock and the pacer mutex to change the inp_hptsslot.
+ */
+ TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
+ inp->inp_hptsslot = hpts->p_runningtick;
+ }
+#endif
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
+ ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ counter_u64_add(wheel_wrap, 1);
+ } else {
+ /*
+ * Nxt slot is always one after p_runningtick though
+ * its not used usually unless we are doing wheel wrap.
+ */
+ hpts->p_nxt_slot = hpts->p_prev_slot;
+ hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
}
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
HPTS_MTX_ASSERT(hpts);
- /* Reset the ticks to run and time if we need too */
- interum_tick = tcp_gethptstick(&tv);
- if (interum_tick != hpts->p_curtick) {
- /* Save off the new time we execute to */
- *ctick = tv;
- hpts->p_curtick = interum_tick;
- cts = tcp_tv_to_usectick(&tv);
- hpts->p_cur_slot = hpts_tick(hpts, 1);
- ticks_to_run = tcp_hpts_est_run(hpts);
- }
- if (ticks_to_run == -1) {
- goto no_run;
- }
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SETTORUN, ticks_to_run, 0);
- }
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
+#ifndef VIMAGE
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
for (i = 0; i < ticks_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any
+ * was not any (i.e. if ticks_to_run == 1, no delay).
*/
hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* For debugging */
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_HPTSI, ticks_to_run, i);
- }
hpts->p_inp = inp;
paced_cnt++;
- if (hpts->p_cur_slot != inp->inp_hptsslot) {
+#ifdef INVARIANTS
+ if (hpts->p_runningtick != inp->inp_hptsslot) {
panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_cur_slot, inp->inp_hptsslot);
+ hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
}
+#endif
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_cur_slot], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_cur_slot])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1467,25 +1504,36 @@
if (inp->inp_hpts_request) {
/*
* This guy is deferred out further in time
- * then our wheel had on it. Push him back
- * on the wheel.
+ * then our wheel had available on it.
+ * Push him back on the wheel or run it
+ * depending.
*/
- int32_t remaining_slots;
-
+ uint32_t maxticks, last_tick, remaining_slots;
+
remaining_slots = ticks_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
- * Keep INVARIANTS happy by clearing
- * the flag
+ * How far out can we go?
*/
- tcp_hpts_insert_locked(hpts, inp, inp->inp_hpts_request, cts, __LINE__, NULL, 1);
+ maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
+ if (maxticks >= inp->inp_hpts_request) {
+ /* we can place it finally to be processed */
+ inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hpts_request = 0;
+ } else {
+ /* Work off some more time */
+ inp->inp_hptsslot = last_tick;
+ inp->inp_hpts_request -= maxticks;
+ }
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
continue;
}
inp->inp_hpts_request = 0;
+ /* Fall through, we will do it now */
}
/*
- * We clear the hpts flag here after dealing with
+ * We clear the hpts flag here after dealing with
* remaining slots. This way anyone looking with the
* TCB lock will see its on the hpts until just
* before we unlock.
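
A small sketch of the re-queue decision in the inp_hpts_request block above, with the helpers flattened into plain parameters (max_ticks_available() and hpts_tick() are the patch's real routines; everything else here is illustrative only): if the wheel can now reach the requested slot the request is satisfied, otherwise the inp is parked at the furthest reachable tick and the remainder is carried forward.

	#include <stdint.h>

	#define SKETCH_NUM_SLOTS	2048	/* assumed wheel size */

	/* Returns the new wheel slot; *remaining holds any request left over. */
	static uint32_t
	sketch_requeue(uint32_t runningtick, uint32_t request,
	    uint32_t maxticks, uint32_t last_tick, uint32_t *remaining)
	{
		if (maxticks >= request) {
			/* The wheel can now reach the target slot directly. */
			*remaining = 0;
			return ((runningtick + request) % SKETCH_NUM_SLOTS);
		}
		/* Park at the furthest reachable tick, work off the rest later. */
		*remaining = request - maxticks;
		return (last_tick);
	}
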
@@ -1495,23 +1543,20 @@
INP_WLOCK(inp);
if (in_pcbrele_wlocked(inp)) {
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 1);
hpts->p_inp = NULL;
continue;
}
- if (inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) {
-out_now:
+ if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
+ (inp->inp_flags2 & INP_FREED)) {
+ out_now:
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 3);
hpts->p_inp = NULL;
continue;
}
@@ -1539,16 +1584,14 @@
*/
tcp_set_hpts(inp);
}
- if (out_newts_every_tcb) {
- struct timeval sv;
-
- if (out_ts_percision)
- microuptime(&sv);
- else
- getmicrouptime(&sv);
- cts = tcp_tv_to_usectick(&sv);
- }
+#ifdef VIMAGE
CURVNET_SET(inp->inp_vnet);
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+#endif
+ /* Let's do any logging that we might want to */
+ if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
+ tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+ }
/*
* There is a hole here, we get the refcnt on the
* inp so it will still be preserved but to make
@@ -1560,19 +1603,23 @@
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- inp->inp_hpts_calls = 1;
- if (tp->t_fb->tfb_tcp_output_wtime != NULL) {
- error = (*tp->t_fb->tfb_tcp_output_wtime) (tp, &tv);
- } else {
- error = tp->t_fb->tfb_tcp_output(tp);
+ if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ error = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
+ if (error) {
+ /* The input killed the connection */
+ goto skip_pacing;
+ }
}
+ inp->inp_hpts_calls = 1;
+ error = tp->t_fb->tfb_tcp_output(tp);
+ inp->inp_hpts_calls = 0;
if (ninp && ninp->inp_ppcb) {
/*
* If we have a nxt inp, see if we can
@@ -1609,74 +1656,112 @@
prefetch_tp = 1;
}
INP_WUNLOCK(inp);
- INP_UNLOCK_ASSERT(inp);
+ skip_pacing:
+#ifdef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
CURVNET_RESTORE();
+#endif
+ INP_UNLOCK_ASSERT(inp);
#ifdef INVARIANTS
if (mtx_owned(&hpts->p_mtx)) {
panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
+ hpts, __LINE__);
}
#endif
mtx_lock(&hpts->p_mtx);
- if (logging_on)
- tcp_hpts_log_it(hpts, hpts->p_inp, HPTSLOG_INP_DONE, 0, 4);
hpts->p_inp = NULL;
}
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_cur_slot++;
- if (hpts->p_cur_slot >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_cur_slot = 0;
+ hpts->p_runningtick++;
+ if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningtick = 0;
}
}
+#ifndef VIMAGE
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+#endif
no_one:
HPTS_MTX_ASSERT(hpts);
- hpts->p_prevtick = hpts->p_curtick;
hpts->p_delayed_by = 0;
/*
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
- /* Re-run any input that may be there */
- (void)tcp_gethptstick(&tv);
- if (!TAILQ_EMPTY(&hpts->p_input)) {
- tcp_input_data(hpts, &tv);
- }
#ifdef INVARIANTS
if (TAILQ_EMPTY(&hpts->p_input) &&
(hpts->p_on_inqueue_cnt != 0)) {
panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts, hpts->p_on_inqueue_cnt);
}
#endif
- tick_now = tcp_gethptstick(&tv);
- if (SEQ_GT(tick_now, hpts->p_prevtick)) {
- struct timeval res;
-
- /* Did we really spend a full tick or more in here? */
- timersub(&tv, ctick, &res);
- if (res.tv_sec || (res.tv_usec >= HPTS_TICKS_PER_USEC)) {
+ hpts->p_prev_slot = hpts->p_cur_slot;
+ hpts->p_lasttick = hpts->p_curtick;
+ if (loop_cnt > max_pacer_loops) {
+ /*
+ * Something is seriously slow: we have
+ * looped through processing the wheel
+ * max_pacer_loops times and by the time
+ * we were done we still needed to run.
+ * That means the system is hopelessly
+ * behind and can never catch up :(
+ *
+ * We will just lie to this thread
+ * and let it think p_curtick is
+ * correct. When it next awakens
+ * it will find itself further behind.
+ */
+ counter_u64_add(hpts_hopelessly_behind, 1);
+ goto no_run;
+ }
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if ((wrap_loop_cnt < 2) &&
+ (hpts->p_lasttick != hpts->p_curtick)) {
+ counter_u64_add(hpts_loops, 1);
+ loop_cnt++;
+ goto again;
+ }
+no_run:
+ /*
+ * Set the flag so that any slot input arriving
+ * while we process input knows the wheel pass
+ * is complete.
+ */
+ hpts->p_wheel_complete = 1;
+ /*
+ * Run any input that may be queued and was not
+ * covered while we were running the data wheel.
+ */
+ if (!TAILQ_EMPTY(&hpts->p_input)) {
+ tcp_input_data(hpts, &tv);
+ /*
+ * Now did we spend too long running
+ * input and need to run more ticks?
+ */
+ KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ ("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
+ hpts->p_prev_slot, hpts->p_cur_slot));
+ KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ ("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
+ hpts->p_lasttick, hpts->p_curtick));
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ if (hpts->p_lasttick != hpts->p_curtick) {
counter_u64_add(hpts_loops, 1);
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_TOLONG, (uint32_t) res.tv_usec, tick_now);
- }
- *ctick = res;
- hpts->p_curtick = tick_now;
+ hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
-no_run:
{
uint32_t t = 0, i, fnd = 0;
- if (hpts->p_on_queue_cnt) {
-
-
+ if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
/*
* Find next slot that is occupied and use that to
* be the sleep time.
*/
- for (i = 1, t = hpts->p_nxt_slot; i < NUM_OF_HPTSI_SLOTS; i++) {
+ for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
fnd = 1;
break;
@@ -1684,27 +1769,23 @@
t = (t + 1) % NUM_OF_HPTSI_SLOTS;
}
if (fnd) {
- hpts->p_hpts_sleep_time = i;
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
} else {
- counter_u64_add(back_tosleep, 1);
#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but non found", hpts, hpts->p_on_queue_cnt);
+ panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
#endif
+ counter_u64_add(back_tosleep, 1);
hpts->p_on_queue_cnt = 0;
goto non_found;
}
- t++;
+ } else if (wrap_loop_cnt >= 2) {
+ /* Special case handling */
+ hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
} else {
- /* No one on the wheel sleep for all but 2 slots */
-non_found:
- if (hpts_sleep_max == 0)
- hpts_sleep_max = 1;
- hpts->p_hpts_sleep_time = min((NUM_OF_HPTSI_SLOTS - 2), hpts_sleep_max);
- t = 0;
+ /* No one on the wheel; sleep for all but 400 slots, or sleep max */
+ non_found:
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
}
- if (logging_on) {
- tcp_hpts_log_it(hpts, inp, HPTSLOG_SLEEPSET, t, (hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC));
- }
}
}
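
The sleep-time search above walks the ring starting one slot past p_cur_slot and sleeps until the first occupied bucket, capped at hpts_sleep_max. A self-contained sketch of that scan (an array stands in for the TAILQ buckets; the constant is an assumption):

	#include <stdint.h>

	#define SKETCH_NUM_SLOTS	2048	/* assumed wheel size */

	static uint32_t
	sketch_sleep_slots(const int occupied[SKETCH_NUM_SLOTS], uint32_t cur_slot,
	    uint32_t sleep_max)
	{
		uint32_t i, t;

		for (i = 0, t = (cur_slot + 1) % SKETCH_NUM_SLOTS;
		    i < SKETCH_NUM_SLOTS; i++) {
			if (occupied[t]) {
				/* Sleep up to, but not past, the occupied slot. */
				return ((i + 1) < sleep_max ? (i + 1) : sleep_max);
			}
			t = (t + 1) % SKETCH_NUM_SLOTS;
		}
		return (sleep_max);	/* nothing queued: sleep the maximum */
	}
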
@@ -1746,33 +1827,29 @@
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
/* Signaled by input */
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 1, 1);
callout_stop(&hpts->co);
} else {
/* Timed out */
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 2, 2);
mtx_unlock(&hpts->p_mtx);
return;
}
callout_deactivate(&hpts->co);
- if (logging_on)
- tcp_hpts_log_it(hpts, NULL, HPTSLOG_AWAKE, 3, 3);
}
+ hpts->p_hpts_wake_scheduled = 0;
hpts->p_hpts_active = 1;
- (void)tcp_gethptstick(&tv);
- tcp_hptsi(hpts, &tv);
+ tcp_hptsi(hpts);
HPTS_MTX_ASSERT(hpts);
tv.tv_sec = 0;
tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
+ hpts->overidden_sleep = tv.tv_usec;
tv.tv_usec = tcp_min_hptsi_time;
hpts->p_on_min_sleep = 1;
} else {
/* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
hpts->p_on_min_sleep = 0;
}
hpts->p_hpts_active = 0;
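
The conversion above turns the chosen number of sleep slots into a timeval and lets a configured floor (tcp_min_hptsi_time) override anything shorter, remembering the overridden value. A hedged sketch of just the arithmetic:

	#include <stdint.h>
	#include <sys/time.h>

	static void
	sketch_sleep_tv(struct timeval *tv, uint32_t sleep_slots,
	    uint32_t usec_per_slot, uint32_t min_usec)
	{
		tv->tv_sec = 0;
		tv->tv_usec = sleep_slots * usec_per_slot;
		if (min_usec != 0 && tv->tv_usec < min_usec)
			tv->tv_usec = min_usec;	/* enforce the minimum sleep */
	}
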
@@ -1809,9 +1886,11 @@
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
+ hpts_hopelessly_behind = counter_u64_alloc(M_WAITOK);
hpts_loops = counter_u64_alloc(M_WAITOK);
back_tosleep = counter_u64_alloc(M_WAITOK);
-
+ combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
+ wheel_wrap = counter_u64_alloc(M_WAITOK);
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
@@ -1850,7 +1929,7 @@
OID_AUTO, "out_qcnt", CTLFLAG_RD,
&hpts->p_on_queue_cnt, 0,
"Count TCB's awaiting output processing");
- SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_ADD_U16(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "active", CTLFLAG_RD,
&hpts->p_hpts_active, 0,
@@ -1859,29 +1938,23 @@
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curslot", CTLFLAG_RD,
&hpts->p_cur_slot, 0,
- "What the current slot is if active");
+ "What the current running pacer's goal is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "curtick", CTLFLAG_RD,
- &hpts->p_curtick, 0,
- "What the current tick on if active");
+ OID_AUTO, "runtick", CTLFLAG_RD,
+ &hpts->p_runningtick, 0,
+ "What the running pacer's current slot is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
- OID_AUTO, "logsize", CTLFLAG_RD,
- &hpts->p_logsize, 0,
- "Hpts logging buffer size");
- hpts->p_hpts_sleep_time = NUM_OF_HPTSI_SLOTS - 2;
+ OID_AUTO, "curtick", CTLFLAG_RD,
+ &hpts->p_curtick, 0,
+ "What the running pacer's last tick mapped to the wheel was");
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
- hpts->p_prevtick = hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_prevtick -= 1;
- hpts->p_prevtick %= NUM_OF_HPTSI_SLOTS;
+ hpts->p_curtick = tcp_gethptstick(&tv);
+ hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = 1;
- hpts->p_logsize = tcp_hpts_logging_size;
- if (hpts->p_logsize) {
- sz = (sizeof(struct hpts_log) * hpts->p_logsize);
- hpts->p_log = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
- }
+ hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
Index: head/sys/netinet/tcp_log_buf.h
===================================================================
--- head/sys/netinet/tcp_log_buf.h
+++ head/sys/netinet/tcp_log_buf.h
@@ -175,7 +175,7 @@
TCP_LOG_BAD_RETRAN, /* Detected bad retransmission 5 */
TCP_LOG_PRR, /* Doing PRR 6 */
TCP_LOG_REORDER,/* Detected reorder 7 */
- TCP_LOG_PACER, /* Pacer sending a packet 8 */
+ TCP_LOG_HPTS, /* Hpts sending a packet 8 */
BBR_LOG_BBRUPD, /* We updated BBR info 9 */
BBR_LOG_BBRSND, /* We did a slot calculation and sending is done 10 */
BBR_LOG_ACKCLEAR, /* A ack clears all outstanding 11 */
@@ -194,31 +194,38 @@
BBR_LOG_PERSIST, /* BBR changed to/from a persists 24 */
TCP_LOG_FLOWEND, /* End of a flow 25 */
BBR_LOG_RTO, /* BBR's timeout includes BBR info 26 */
- BBR_LOG_DOSEG_DONE, /* pacer do_segment completes 27 */
- BBR_LOG_EXIT_GAIN, /* pacer do_segment completes 28 */
+ BBR_LOG_DOSEG_DONE, /* hpts do_segment completes 27 */
+ BBR_LOG_EXIT_GAIN, /* hpts do_segment completes 28 */
BBR_LOG_THRESH_CALC, /* Doing threshold calculation 29 */
BBR_LOG_EXTRACWNDGAIN, /* Removed 30 */
TCP_LOG_USERSEND, /* User level sends data 31 */
- UNUSED_32, /* Unused 32 */
- UNUSED_33, /* Unused 33 */
+ BBR_RSM_CLEARED, /* RSM cleared of ACK flags 32 */
+ BBR_LOG_STATE_TARGET, /* Log of target at state 33 */
BBR_LOG_TIME_EPOCH, /* A timed based Epoch occured 34 */
BBR_LOG_TO_PROCESS, /* A to was processed 35 */
BBR_LOG_BBRTSO, /* TSO update 36 */
- BBR_LOG_PACERDIAG, /* Pacer diag insert 37 */
+ BBR_LOG_HPTSDIAG, /* Hpts diag insert 37 */
BBR_LOG_LOWGAIN, /* Low gain accounting 38 */
BBR_LOG_PROGRESS, /* Progress timer event 39 */
TCP_LOG_SOCKET_OPT, /* A socket option is set 40 */
BBR_LOG_TIMERPREP, /* A BBR var to debug out TLP issues 41 */
BBR_LOG_ENOBUF_JMP, /* We had a enobuf jump 42 */
- BBR_LOG_PACING_CALC, /* calc the pacing time 43 */
+ BBR_LOG_HPTSI_CALC, /* calc the hptsi time 43 */
BBR_LOG_RTT_SHRINKS, /* We had a log reduction of rttProp 44 */
BBR_LOG_BW_RED_EV, /* B/W reduction events 45 */
BBR_LOG_REDUCE, /* old bbr log reduce for 4.1 and earlier 46*/
TCP_LOG_RTT, /* A rtt (in useconds) is being sampled and applied to the srtt algo 47 */
BBR_LOG_SETTINGS_CHG, /* Settings changed for loss response 48 */
- BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining 49 */
+ BBR_LOG_SRTT_GAIN_EVENT, /* SRTT gaining -- now not used 49 */
TCP_LOG_REASS, /* Reassembly buffer logging 50 */
- TCP_LOG_END /* End (keep at end) 51 */
+ TCP_HDWR_TLS, /* TCP Hardware TLS logs 51 */
+ BBR_LOG_HDWR_PACE, /* TCP Hardware pacing log 52 */
+ BBR_LOG_TSTMP_VAL, /* Temp debug timestamp validation 53 */
+ TCP_LOG_CONNEND, /* End of connection 54 */
+ TCP_LOG_LRO, /* LRO entry 55 */
+ TCP_SACK_FILTER_RES, /* Results of SACK Filter 56 */
+ TCP_SAD_DETECTION, /* Sack Attack Detection 57 */
+ TCP_LOG_END /* End (keep at end) 58 */
};
enum tcp_log_states {
@@ -275,8 +282,8 @@
#ifdef _KERNEL
-#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 10000
-#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 1000000
+#define TCP_LOG_BUF_DEFAULT_SESSION_LIMIT 5000
+#define TCP_LOG_BUF_DEFAULT_GLOBAL_LIMIT 5000000
/*
* TCP_LOG_EVENT_VERBOSE: The same as TCP_LOG_EVENT, except it always
Index: head/sys/netinet/tcp_stacks/rack.c
===================================================================
--- head/sys/netinet/tcp_stacks/rack.c
+++ head/sys/netinet/tcp_stacks/rack.c
@@ -1,5 +1,6 @@
/*-
- * Copyright (c) 2016-2019 Netflix, Inc.
+ * Copyright (c) 2016
+ * Netflix Inc. All rights reserved.
*
* Redistribution and use in source and binary forms, with or without
* modification, are permitted provided that the following conditions
@@ -44,12 +45,16 @@
#include <sys/mutex.h>
#include <sys/mbuf.h>
#include <sys/proc.h> /* for proc0 declaration */
+#ifdef NETFLIX_STATS
+#include <sys/qmath.h>
+#endif
#include <sys/socket.h>
#include <sys/socketvar.h>
#include <sys/sysctl.h>
#include <sys/systm.h>
+#include <sys/tree.h>
#ifdef NETFLIX_STATS
-#include <sys/stats.h>
+#include <sys/stats.h> /* Must come after qmath.h and tree.h */
#endif
#include <sys/refcount.h>
#include <sys/queue.h>
@@ -74,8 +79,8 @@
#include <netinet/ip6.h>
#include <netinet6/in6_pcb.h>
#include <netinet6/ip6_var.h>
-#include <netinet/tcp.h>
#define TCPOUTFLAGS
+#include <netinet/tcp.h>
#include <netinet/tcp_fsm.h>
#include <netinet/tcp_log_buf.h>
#include <netinet/tcp_seq.h>
@@ -84,9 +89,6 @@
#include <netinet/tcp_hpts.h>
#include <netinet/tcpip.h>
#include <netinet/cc/cc.h>
-#ifdef NETFLIX_CWV
-#include <netinet/tcp_newcwv.h>
-#endif
#include <netinet/tcp_fastopen.h>
#ifdef TCPDEBUG
#include <netinet/tcp_debug.h>
@@ -126,6 +128,10 @@
struct sysctl_ctx_list rack_sysctl_ctx;
struct sysctl_oid *rack_sysctl_root;
+#ifndef TCPHPTS
+fatal error missing option TCPHPTS in the build;
+#endif
+
#define CUM_ACKED 1
#define SACKED 2
@@ -178,6 +184,9 @@
static int32_t rack_tlp_in_recovery = 1; /* Can we do TLP in recovery? */
static int32_t rack_verbose_logging = 0;
static int32_t rack_ignore_data_after_close = 1;
+static int32_t rack_map_entries_limit = 1024;
+static int32_t rack_map_split_limit = 256;
+
/*
* Currently regular tcp has a rto_min of 30ms
* the backoff goes 12 times so that ends up
@@ -202,7 +211,6 @@
static int32_t rack_sack_block_limit = 128;
static int32_t rack_use_sack_filter = 1;
static int32_t rack_tlp_threshold_use = TLP_USE_TWO_ONE;
-static uint32_t rack_map_split_limit = 0; /* unlimited by default */
/* Rack specific counters */
counter_u64_t rack_badfr;
@@ -228,6 +236,7 @@
counter_u64_t rack_to_alloc;
counter_u64_t rack_to_alloc_hard;
counter_u64_t rack_to_alloc_emerg;
+counter_u64_t rack_to_alloc_limited;
counter_u64_t rack_alloc_limited_conns;
counter_u64_t rack_split_limited;
@@ -248,12 +257,21 @@
counter_u64_t rack_out_size[TCP_MSS_ACCT_SIZE];
counter_u64_t rack_opts_arry[RACK_OPTS_SIZE];
+/*
+ * This was originally defined in tcp_timer.c, but is now reproduced here given
+ * the unification of the SYN and non-SYN retransmit timer exponents combined
+ * with wanting to retain previous behaviour for previously deployed stack
+ * versions.
+ */
+int tcp_syn_backoff[TCP_MAXRXTSHIFT + 1] =
+ { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };
+
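A sketch (not from the patch) of how a backoff table like tcp_syn_backoff above is consumed for SYN retransmits: the base RTO is scaled by the entry for the current retransmit shift and then clamped to the configured range, roughly what the TCPT_RANGESET() call later in this file does.

	#include <stdint.h>

	static const int sketch_syn_backoff[13] =
	    { 1, 1, 1, 1, 1, 2, 4, 8, 16, 32, 64, 64, 64 };

	/* rxtshift is assumed to be bounded to 0..12 by the caller. */
	static uint32_t
	sketch_syn_rexmt_ms(uint32_t base_rto_ms, int rxtshift,
	    uint32_t rto_min_ms, uint32_t rto_max_ms)
	{
		uint32_t rexmt = base_rto_ms * sketch_syn_backoff[rxtshift];

		if (rexmt < rto_min_ms)
			rexmt = rto_min_ms;
		if (rexmt > rto_max_ms)
			rexmt = rto_max_ms;
		return (rexmt);
	}
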
static void
rack_log_progress_event(struct tcp_rack *rack, struct tcpcb *tp, uint32_t tick, int event, int line);
static int
rack_process_ack(struct mbuf *m, struct tcphdr *th,
- struct socket *so, struct tcpcb *tp, struct tcpopt *to,
+ struct socket *so, struct tcpcb *tp, struct tcpopt *to,
uint32_t tiwin, int32_t tlen, int32_t * ofia, int32_t thflags, int32_t * ret_val);
static int
rack_process_data(struct mbuf *m, struct tcphdr *th,
@@ -351,14 +369,13 @@
rack_do_closing(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
int32_t tlen, uint32_t tiwin, int32_t thflags, int32_t nxt_pkt);
+static void rack_do_drop(struct mbuf *m, struct tcpcb *tp);
static void
-rack_do_drop(struct mbuf *m, struct tcpcb *tp);
-static void
rack_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
static void
rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
- struct tcphdr *th, int32_t rstreason, int32_t tlen);
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
static int
rack_do_established(struct mbuf *m, struct tcphdr *th,
struct socket *so, struct tcpcb *tp, struct tcpopt *to, int32_t drop_hdrlen,
@@ -449,6 +466,7 @@
counter_u64_zero(rack_sack_proc_short);
counter_u64_zero(rack_sack_proc_restart);
counter_u64_zero(rack_to_alloc);
+ counter_u64_zero(rack_to_alloc_limited);
counter_u64_zero(rack_alloc_limited_conns);
counter_u64_zero(rack_split_limited);
counter_u64_zero(rack_find_high);
@@ -470,6 +488,18 @@
{
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "map_limit", CTLFLAG_RW,
+ &rack_map_entries_limit, 1024,
+ "Is there a limit on how big the sendmap can grow?");
+
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "map_splitlimit", CTLFLAG_RW,
+ &rack_map_split_limit, 256,
+ "Is there a limit on how much splitting a peer can do?");
+
+ SYSCTL_ADD_S32(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "rate_sample_method", CTLFLAG_RW,
&rack_rate_sample_method , USE_RTT_LOW,
"What method should we use for rate sampling 0=high, 1=low ");
@@ -628,11 +658,6 @@
OID_AUTO, "pktdelay", CTLFLAG_RW,
&rack_pkt_delay, 1,
"Extra RACK time (in ms) besides reordering thresh");
- SYSCTL_ADD_U32(&rack_sysctl_ctx,
- SYSCTL_CHILDREN(rack_sysctl_root),
- OID_AUTO, "split_limit", CTLFLAG_RW,
- &rack_map_split_limit, 0,
- "Is there a limit on the number of map split entries (0=unlimited)");
SYSCTL_ADD_S32(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
OID_AUTO, "inc_var", CTLFLAG_RW,
@@ -769,6 +794,12 @@
OID_AUTO, "allocemerg", CTLFLAG_RD,
&rack_to_alloc_emerg,
"Total allocations done from emergency cache");
+ rack_to_alloc_limited = counter_u64_alloc(M_WAITOK);
+ SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
+ SYSCTL_CHILDREN(rack_sysctl_root),
+ OID_AUTO, "alloc_limited", CTLFLAG_RD,
+ &rack_to_alloc_limited,
+ "Total allocations dropped due to limit");
rack_alloc_limited_conns = counter_u64_alloc(M_WAITOK);
SYSCTL_ADD_COUNTER_U64(&rack_sysctl_ctx,
SYSCTL_CHILDREN(rack_sysctl_root),
@@ -859,6 +890,7 @@
static inline int32_t
rack_progress_timeout_check(struct tcpcb *tp)
{
+#ifdef NETFLIX_PROGRESS
if (tp->t_maxunacktime && tp->t_acktime && TSTMP_GT(ticks, tp->t_acktime)) {
if ((ticks - tp->t_acktime) >= tp->t_maxunacktime) {
/*
@@ -869,13 +901,12 @@
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
counter_u64_add(rack_progress_drops, 1);
-#ifdef NETFLIX_STATS
TCPSTAT_INC(tcps_progdrops);
-#endif
rack_log_progress_event(rack, tp, ticks, PROGRESS_DROP, __LINE__);
return (1);
}
}
+#endif
return (0);
}
@@ -962,6 +993,7 @@
union tcp_log_stackspecific log;
struct timeval tv;
+ memset(&log, 0, sizeof(log));
/* Convert our ms to a microsecond */
log.u_bbr.flex1 = rtt * 1000;
log.u_bbr.timeStamp = tcp_get_usecs(&tv);
@@ -1021,6 +1053,8 @@
{
if (rack->rc_tp->t_logstate != TCP_LOG_STATE_OFF) {
union tcp_log_stackspecific log;
+
+ memset(&log, 0, sizeof(log));
log.u_bbr.flex1 = did_out;
log.u_bbr.flex2 = nxt_pkt;
log.u_bbr.flex3 = way_out;
@@ -1127,6 +1161,8 @@
counter_u64_free(rack_sack_proc_short);
counter_u64_free(rack_sack_proc_restart);
counter_u64_free(rack_to_alloc);
+ counter_u64_free(rack_to_alloc_limited);
+ counter_u64_free(rack_split_limited);
counter_u64_free(rack_find_high);
counter_u64_free(rack_runt_sacks);
counter_u64_free(rack_enter_tlp_calc);
@@ -1146,9 +1182,8 @@
rsm = uma_zalloc(rack_zone, M_NOWAIT);
if (rsm) {
-alloc_done:
- counter_u64_add(rack_to_alloc, 1);
rack->r_ctl.rc_num_maps_alloced++;
+ counter_u64_add(rack_to_alloc, 1);
return (rsm);
}
if (rack->rc_free_cnt) {
@@ -1156,11 +1191,26 @@
rsm = TAILQ_FIRST(&rack->r_ctl.rc_free);
TAILQ_REMOVE(&rack->r_ctl.rc_free, rsm, r_next);
rack->rc_free_cnt--;
- goto alloc_done;
+ return (rsm);
}
return (NULL);
}
+static struct rack_sendmap *
+rack_alloc_full_limit(struct tcp_rack *rack)
+{
+ if ((rack_map_entries_limit > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+ counter_u64_add(rack_to_alloc_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ return (NULL);
+ }
+ return (rack_alloc(rack));
+}
+
/* wrapper to allocate a sendmap entry, subject to a specific limit */
static struct rack_sendmap *
rack_alloc_limit(struct tcp_rack *rack, uint8_t limit_type)
@@ -1196,7 +1246,6 @@
/* currently there is only one limit type */
rack->r_ctl.rc_num_split_allocs--;
}
- rack->r_ctl.rc_num_maps_alloced--;
if (rack->r_ctl.rc_tlpsend == rsm)
rack->r_ctl.rc_tlpsend = NULL;
if (rack->r_ctl.rc_next == rsm)
@@ -1206,9 +1255,11 @@
if (rack->rc_free_cnt < rack_free_cache) {
memset(rsm, 0, sizeof(struct rack_sendmap));
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_free, rsm, r_next);
+ rsm->r_limit_type = 0;
rack->rc_free_cnt++;
return;
}
+ rack->r_ctl.rc_num_maps_alloced--;
uma_zfree(rack_zone, rsm);
}
@@ -1222,11 +1273,9 @@
#ifdef NETFLIX_STATS
int32_t gput;
#endif
-#ifdef NETFLIX_CWV
- u_long old_cwnd = tp->snd_cwnd;
-#endif
INP_WLOCK_ASSERT(tp->t_inpcb);
+
tp->ccv->nsegs = nsegs;
tp->ccv->bytes_this_ack = BYTES_THIS_ACK(tp, th);
if ((recovery) && (rack->r_ctl.rc_early_recovery_segs)) {
@@ -1264,7 +1313,6 @@
tp->t_stats_gput_prev);
tp->t_flags &= ~TF_GPUTINPROG;
tp->t_stats_gput_prev = gput;
-#ifdef NETFLIX_CWV
if (tp->t_maxpeakrate) {
/*
* We update t_peakrate_thr. This gives us roughly
@@ -1272,7 +1320,6 @@
*/
tcp_update_peakrate_thr(tp);
}
-#endif
}
#endif
if (tp->snd_cwnd > tp->snd_ssthresh) {
@@ -1298,39 +1345,10 @@
if (rack->r_ctl.rc_rack_largest_cwnd < tp->snd_cwnd) {
rack->r_ctl.rc_rack_largest_cwnd = tp->snd_cwnd;
}
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- /*
- * Per RFC 7661: The behaviour in the non-validated phase is
- * specified as: o A sender determines whether to increase
- * the cwnd based upon whether it is cwnd-limited (see
- * Section 4.5.3): * A sender that is cwnd-limited MAY use
- * the standard TCP method to increase cwnd (i.e., the
- * standard method permits a TCP sender that fully utilises
- * the cwnd to increase the cwnd each time it receives an
- * ACK). * A sender that is not cwnd-limited MUST NOT
- * increase the cwnd when ACK packets are received in this
- * phase (i.e., needs to avoid growing the cwnd when it has
- * not recently sent using the current size of cwnd).
- */
- if ((tp->snd_cwnd > old_cwnd) &&
- (tp->cwv_cwnd_valid == 0) &&
- (!(tp->ccv->flags & CCF_CWND_LIMITED))) {
- tp->snd_cwnd = old_cwnd;
- }
- /* Try to update pipeAck and NCWV state */
- if (TCPS_HAVEESTABLISHED(tp->t_state) &&
- !IN_RECOVERY(tp->t_flags)) {
- uint32_t data = sbavail(&(tp->t_inpcb->inp_socket->so_snd));
-
- tcp_newcwv_update_pipeack(tp, data);
- }
- }
/* we enforce max peak rate if it is set. */
if (tp->t_peakrate_thr && tp->snd_cwnd > tp->t_peakrate_thr) {
tp->snd_cwnd = tp->t_peakrate_thr;
}
-#endif
}
static void
@@ -1379,16 +1397,8 @@
tp->snd_cwnd += rack->r_ctl.rc_prr_sndcnt;
rack->r_ctl.rc_prr_sndcnt = 0;
}
+ tp->snd_recover = tp->snd_una;
EXIT_RECOVERY(tp->t_flags);
-
-
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- (tp->snd_cwv.in_recovery))
- tcp_newcwv_end_recovery(tp);
- }
-#endif
}
static void
@@ -1450,16 +1460,6 @@
tp->ccv->curack = th->th_ack;
CC_ALGO(tp)->cong_signal(tp->ccv, type);
}
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if (tp->snd_cwv.in_recovery == 0 && IN_RECOVERY(tp->t_flags)) {
- tcp_newcwv_enter_recovery(tp);
- }
- if (type == CC_RTO) {
- tcp_newcwv_reset(tp);
- }
- }
-#endif
}
@@ -1479,11 +1479,21 @@
if (CC_ALGO(tp)->after_idle != NULL)
CC_ALGO(tp)->after_idle(tp->ccv);
- if (tp->snd_cwnd == 1)
- i_cwnd = tp->t_maxseg; /* SYN(-ACK) lost */
- else
- i_cwnd = tcp_compute_initwnd(tcp_maxseg(tp));
-
+ if (V_tcp_initcwnd_segments)
+ i_cwnd = min((V_tcp_initcwnd_segments * tp->t_maxseg),
+ max(2 * tp->t_maxseg, 14600));
+ else if (V_tcp_do_rfc3390)
+ i_cwnd = min(4 * tp->t_maxseg,
+ max(2 * tp->t_maxseg, 4380));
+ else {
+ /* Per RFC5681 Section 3.1 */
+ if (tp->t_maxseg > 2190)
+ i_cwnd = 2 * tp->t_maxseg;
+ else if (tp->t_maxseg > 1095)
+ i_cwnd = 3 * tp->t_maxseg;
+ else
+ i_cwnd = 4 * tp->t_maxseg;
+ }
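A standalone sketch of the initial-window selection above, with the sysctl knobs passed as plain parameters. With a 1460-byte MSS and neither knob set it falls into the RFC5681 branch and yields 3 * 1460 = 4380 bytes.

	#include <stdint.h>

	static uint32_t
	sketch_initial_cwnd(uint32_t maxseg, uint32_t initcwnd_segments, int do_rfc3390)
	{
		uint32_t i_cwnd, cap;

		if (initcwnd_segments) {
			/* Administrator-chosen segment count, capped as above. */
			cap = (2 * maxseg > 14600) ? 2 * maxseg : 14600;
			i_cwnd = initcwnd_segments * maxseg;
			if (i_cwnd > cap)
				i_cwnd = cap;
		} else if (do_rfc3390) {
			cap = (2 * maxseg > 4380) ? 2 * maxseg : 4380;
			i_cwnd = 4 * maxseg;
			if (i_cwnd > cap)
				i_cwnd = cap;
		} else if (maxseg > 2190)	/* RFC5681 Section 3.1 */
			i_cwnd = 2 * maxseg;
		else if (maxseg > 1095)
			i_cwnd = 3 * maxseg;
		else
			i_cwnd = 4 * maxseg;
		return (i_cwnd);
	}
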
if (reduce_largest) {
/*
* Do we reduce the largest cwnd to make
@@ -1549,8 +1559,7 @@
}
static void
-rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
- int32_t rstreason, int32_t tlen)
+rack_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t rstreason, int32_t tlen)
{
if (tp != NULL) {
tcp_dropwithreset(m, th, tp, tlen, rstreason);
@@ -1736,7 +1745,7 @@
* TCB is still valid and locked.
*/
static int
-rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+rack_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
{
int32_t todrop;
int32_t thflags;
@@ -1778,17 +1787,6 @@
TCPSTAT_INC(tcps_rcvpartduppack);
TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
}
- /*
- * DSACK - add SACK block for dropped range
- */
- if (tp->t_flags & TF_SACK_PERMIT) {
- tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
- /*
- * ACK now, as the next in-sequence segment
- * will clear the DSACK block again
- */
- tp->t_flags |= TF_ACKNOW;
- }
*drop_hdrlen += todrop; /* drop from the top afterwards */
th->th_seq += todrop;
tlen -= todrop;
@@ -2124,8 +2122,6 @@
/* We can't start any timer in persists */
return (rack_get_persists_timer_val(tp, rack));
}
- if (tp->t_state < TCPS_ESTABLISHED)
- goto activate_rxt;
rsm = TAILQ_FIRST(&rack->r_ctl.rc_tmap);
if (rsm == NULL) {
/* Nothing on the send map */
@@ -2184,6 +2180,12 @@
*/
goto activate_rxt;
}
+ if ((tp->snd_max - tp->snd_una) > tp->snd_wnd) {
+ /*
+ * Peer collapsed rwnd, don't do TLP.
+ */
+ goto activate_rxt;
+ }
rsm = TAILQ_LAST_FAST(&rack->r_ctl.rc_tmap, rack_sendmap, r_tnext);
if (rsm == NULL) {
/* We found no rsm to TLP with. */
@@ -2288,7 +2290,9 @@
/* A previous call is already set up */
return;
}
- if (tp->t_state == TCPS_CLOSED) {
+
+ if ((tp->t_state == TCPS_CLOSED) ||
+ (tp->t_state == TCPS_LISTEN)) {
return;
}
stopped = rack->rc_tmr_stopped;
@@ -2307,8 +2311,8 @@
* We are still left on the hpts when the to goes
* it will be for output.
*/
- if (TSTMP_GT(cts, rack->r_ctl.rc_last_output_to))
- slot = cts - rack->r_ctl.rc_last_output_to;
+ if (TSTMP_GT(rack->r_ctl.rc_last_output_to, cts))
+ slot = rack->r_ctl.rc_last_output_to - cts;
else
slot = 1;
}
@@ -2330,7 +2334,7 @@
}
hpts_timeout = rack_timer_start(tp, rack, cts);
if (tp->t_flags & TF_DELACK) {
- delayed_ack = TICKS_2_MSEC(tcp_delacktime);
+ delayed_ack = tcp_delacktime;
rack->r_ctl.rc_hpts_flags |= PACE_TMR_DELACK;
}
if (delayed_ack && ((hpts_timeout == 0) ||
@@ -2487,6 +2491,43 @@
return (0);
}
+static struct rack_sendmap *
+rack_merge_rsm(struct tcp_rack *rack,
+ struct rack_sendmap *l_rsm,
+ struct rack_sendmap *r_rsm)
+{
+ /*
+ * We are merging two acked RSMs:
+ * the l_rsm is on the left (lower seq
+ * values) and the r_rsm is on the right
+ * (higher seq value). The simplest way
+ * to merge these is to move the right
+ * one into the left. I don't think there
+ * is any reason we need to try to find
+ * the oldest (or last oldest retransmitted).
+ */
+ l_rsm->r_end = r_rsm->r_end;
+ if (r_rsm->r_rtr_bytes)
+ l_rsm->r_rtr_bytes += r_rsm->r_rtr_bytes;
+ if (r_rsm->r_in_tmap) {
+ /* This really should not happen */
+ TAILQ_REMOVE(&rack->r_ctl.rc_tmap, r_rsm, r_tnext);
+ }
+ /* Now the flags */
+ if (r_rsm->r_flags & RACK_HAS_FIN)
+ l_rsm->r_flags |= RACK_HAS_FIN;
+ if (r_rsm->r_flags & RACK_TLP)
+ l_rsm->r_flags |= RACK_TLP;
+ TAILQ_REMOVE(&rack->r_ctl.rc_map, r_rsm, r_next);
+ if ((r_rsm->r_limit_type == 0) && (l_rsm->r_limit_type != 0)) {
+ /* Transfer the split limit to the map we free */
+ r_rsm->r_limit_type = l_rsm->r_limit_type;
+ l_rsm->r_limit_type = 0;
+ }
+ rack_free(rack, r_rsm);
+ return(l_rsm);
+}
+
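A minimal sketch of the merge above using a plain range struct instead of rack_sendmap entries: the right (higher-sequence) block is folded into the left one and its retransmit byte count is carried along; the flag and TAILQ handling is omitted.

	#include <stdint.h>

	struct sketch_range {
		uint32_t start;		/* first sequence number covered */
		uint32_t end;		/* one past the last sequence covered */
		uint32_t rtr_bytes;	/* bytes retransmitted within the range */
	};

	/* Fold the right (higher-sequence) range into the left one. */
	static void
	sketch_merge(struct sketch_range *l, const struct sketch_range *r)
	{
		l->end = r->end;
		l->rtr_bytes += r->rtr_bytes;
	}
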
/*
* TLP Timer, here we simply setup what segment we want to
* have the TLP expire on, the normal rack_output() will then
@@ -2590,7 +2631,7 @@
int32_t idx;
struct rack_sendmap *nrsm;
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
/*
* No memory to split, we will just exit and punt
@@ -2937,7 +2978,7 @@
TCPSTAT_INC(tcps_rexmttimeo);
if ((tp->t_state == TCPS_SYN_SENT) ||
(tp->t_state == TCPS_SYN_RECEIVED))
- rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_backoff[tp->t_rxtshift]);
+ rexmt = MSEC_2_TICKS(RACK_INITIAL_RTO * tcp_syn_backoff[tp->t_rxtshift]);
else
rexmt = TCP_REXMTVAL(tp) * tcp_backoff[tp->t_rxtshift];
TCPT_RANGESET(tp->t_rxtcur, rexmt,
@@ -3281,7 +3322,7 @@
* Here we retransmitted less than the whole thing which means we
* have to split this into what was transmitted and what was not.
*/
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
/*
* We can't get memory, so lets not proceed.
@@ -3415,9 +3456,6 @@
* Hmm out of memory and the tcb got destroyed while
* we tried to wait.
*/
-#ifdef INVARIANTS
- panic("Out of memory when we should not be rack:%p", rack);
-#endif
return;
}
if (th_flags & TH_FIN) {
@@ -3428,15 +3466,8 @@
rsm->r_tim_lastsent[0] = ts;
rsm->r_rtr_cnt = 1;
rsm->r_rtr_bytes = 0;
- if (th_flags & TH_SYN) {
- /* The data space is one beyond snd_una */
- rsm->r_start = seq_out + 1;
- rsm->r_end = rsm->r_start + (len - 1);
- } else {
- /* Normal case */
- rsm->r_start = seq_out;
- rsm->r_end = rsm->r_start + len;
- }
+ rsm->r_start = seq_out;
+ rsm->r_end = rsm->r_start + len;
rsm->r_sndcnt = 0;
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_map, rsm, r_next);
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
@@ -3486,11 +3517,8 @@
* Ok we must split off the front and then let the
* update do the rest
*/
- nrsm = rack_alloc(rack);
+ nrsm = rack_alloc_full_limit(rack);
if (nrsm == NULL) {
-#ifdef INVARIANTS
- panic("Ran out of memory that was preallocated? rack:%p", rack);
-#endif
rack_update_rsm(tp, rack, rsm, ts);
return;
}
@@ -3908,6 +3936,14 @@
if (nrsm->r_flags & RACK_ACKED) {
/* Skip ack'd segments */
continue;
+ }
+ if (nrsm->r_flags & RACK_SACK_PASSED) {
+ /*
+ * We found one that is already marked
+ * passed, we have been here before and
+ * so all others below this are marked.
+ */
+ break;
}
idx = nrsm->r_rtr_cnt - 1;
if (ts == nrsm->r_tim_lastsent[idx]) {
@@ -4114,6 +4150,26 @@
rsm->r_in_tmap = 0;
}
out:
+ if (rsm && (rsm->r_flags & RACK_ACKED)) {
+ /*
+ * Now can we merge this newly acked
+ * block with either the previous or
+ * next block?
+ */
+ nrsm = TAILQ_NEXT(rsm, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & RACK_ACKED)) {
+ /* yep this and next can be merged */
+ rsm = rack_merge_rsm(rack, rsm, nrsm);
+ }
+ /* Now what about the previous? */
+ nrsm = TAILQ_PREV(rsm, rack_head, r_next);
+ if (nrsm &&
+ (nrsm->r_flags & RACK_ACKED)) {
+ /* yep the previous and this can be merged */
+ rsm = rack_merge_rsm(rack, nrsm, rsm);
+ }
+ }
if (used_ref == 0) {
counter_u64_add(rack_sack_proc_all, 1);
} else {
@@ -4353,16 +4409,13 @@
}
sack_blocks[num_sack_blks] = sack;
num_sack_blks++;
-#ifdef NETFLIX_STATS
} else if (SEQ_LEQ(sack.start, th_ack) &&
SEQ_LEQ(sack.end, th_ack)) {
/*
* Its a D-SACK block.
*/
- tcp_record_dsack(sack.start, sack.end);
-#endif
+/* tcp_record_dsack(sack.start, sack.end); */
}
-
}
if (num_sack_blks == 0)
goto out;
@@ -4371,7 +4424,9 @@
* just one pass.
*/
if (rack_use_sack_filter) {
- num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks, num_sack_blks, th->th_ack);
+ num_sack_blks = sack_filter_blks(&rack->r_ctl.rack_sf, sack_blocks,
+ num_sack_blks, th->th_ack);
+ ctf_log_sack_filter(rack->rc_tp, num_sack_blks, sack_blocks);
}
if (num_sack_blks < 2) {
goto do_sack_work;
@@ -4620,8 +4675,9 @@
return (0);
}
if (rack->r_ctl.rc_early_recovery) {
- if (IN_FASTRECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
tcp_rack_partialack(tp, th);
} else {
rack_post_recovery(tp, th);
@@ -4648,8 +4704,9 @@
sowwakeup_locked(so);
m_freem(mfree);
if (rack->r_ctl.rc_early_recovery == 0) {
- if (IN_FASTRECOVERY(tp->t_flags)) {
- if (SEQ_LT(th->th_ack, tp->snd_recover)) {
+ if (IN_RECOVERY(tp->t_flags)) {
+ if (SEQ_LT(th->th_ack, tp->snd_recover) &&
+ (SEQ_LT(th->th_ack, tp->snd_max))) {
tcp_rack_partialack(tp, th);
} else {
rack_post_recovery(tp, th);
@@ -4707,7 +4764,11 @@
* send garbage on first SYN.
*/
int32_t nsegs;
+#ifdef TCP_RFC7413
int32_t tfo_syn;
+#else
+#define tfo_syn (FALSE)
+#endif
struct tcp_rack *rack;
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -4816,8 +4877,10 @@
* PRU_RCVD). If a FIN has already been received on this connection
* then we just ignore the text.
*/
+#ifdef TCP_RFC7413
tfo_syn = ((tp->t_state == TCPS_SYN_RECEIVED) &&
- IS_FASTOPEN(tp->t_flags));
+ (tp->t_flags & TF_FASTOPEN));
+#endif
if ((tlen || (thflags & TH_FIN) || tfo_syn) &&
TCPS_HAVERCVDFIN(tp->t_state) == 0) {
tcp_seq save_start = th->th_seq;
@@ -5024,8 +5087,9 @@
/* Clean receiver SACK report if present */
- if (tp->rcv_numsacks)
- tcp_clean_sackreport(tp);
+/* if (tp->rcv_numsacks)
+ tcp_clean_sackreport(tp);
+*/
TCPSTAT_INC(tcps_preddat);
tp->rcv_nxt += tlen;
/*
@@ -5284,8 +5348,6 @@
tp->irs = th->th_seq;
tcp_rcvseqinit(tp);
if (thflags & TH_ACK) {
- int tfo_partial = 0;
-
TCPSTAT_INC(tcps_connects);
soisconnected(so);
#ifdef MAC
@@ -5299,19 +5361,10 @@
tp->rcv_adv += min(tp->rcv_wnd,
TCP_MAXWIN << tp->rcv_scale);
/*
- * If not all the data that was sent in the TFO SYN
- * has been acked, resend the remainder right away.
- */
- if (IS_FASTOPEN(tp->t_flags) &&
- (tp->snd_una != tp->snd_max)) {
- tp->snd_nxt = th->th_ack;
- tfo_partial = 1;
- }
- /*
* If there's data, delay ACK; if there's also a FIN ACKNOW
* will be turned on later.
*/
- if (DELAY_ACK(tp, tlen) && tlen != 0 && (tfo_partial == 0)) {
+ if (DELAY_ACK(tp, tlen) && tlen != 0) {
rack_timer_cancel(tp, (struct tcp_rack *)tp->t_fb_ptr,
((struct tcp_rack *)tp->t_fb_ptr)->r_ctl.rc_rcvtime, __LINE__);
tp->t_flags |= TF_DELACK;
@@ -5320,26 +5373,10 @@
tp->t_flags |= TF_ACKNOW;
}
- if (((thflags & (TH_CWR | TH_ECE)) == TH_ECE) &&
- V_tcp_do_ecn) {
+ if ((thflags & TH_ECE) && V_tcp_do_ecn) {
tp->t_flags |= TF_ECN_PERMIT;
TCPSTAT_INC(tcps_ecn_shs);
}
- if (SEQ_GT(th->th_ack, tp->snd_una)) {
- /*
- * We advance snd_una for the
- * fast open case. If th_ack is
- * acknowledging data beyond
- * snd_una we can't just call
- * ack-processing since the
- * data stream in our send-map
- * will start at snd_una + 1 (one
- * beyond the SYN). If its just
- * equal we don't need to do that
- * and there is no send_map.
- */
- tp->snd_una++;
- }
/*
* Received <SYN,ACK> in SYN_SENT[*] state. Transitions:
* SYN_SENT --> ESTABLISHED SYN_SENT* --> FIN_WAIT_1
@@ -5423,7 +5460,7 @@
}
}
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
- tiwin, thflags, nxt_pkt));
+ tiwin, thflags, nxt_pkt));
}
/*
@@ -5447,13 +5484,13 @@
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
return (1);
}
- if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
/*
- * When a TFO connection is in SYN_RECEIVED, the
- * only valid packets are the initial SYN, a
- * retransmit/copy of the initial SYN (possibly with
- * a subset of the original data), a valid ACK, a
- * FIN, or a RST.
+ * When a TFO connection is in SYN_RECEIVED, the only valid
+ * packets are the initial SYN, a retransmit/copy of the
+ * initial SYN (possibly with a subset of the original
+ * data), a valid ACK, a FIN, or a RST.
*/
if ((thflags & (TH_SYN | TH_ACK)) == (TH_SYN | TH_ACK)) {
rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
@@ -5474,9 +5511,18 @@
return (0);
}
}
+#endif
if (thflags & TH_RST)
return (rack_process_rst(m, th, so, tp));
/*
+ * RFC5961 Section 4.2 Send challenge ACK for any SYN in
+ * synchronized state.
+ */
+ if (thflags & TH_SYN) {
+ rack_challenge_ack(m, th, tp, &ret_val);
+ return (ret_val);
+ }
+ /*
* RFC 1323 PAWS: If we have a timestamp reply on this segment and
* it's less than ts_recent, drop it.
*/
@@ -5520,16 +5566,18 @@
tp->ts_recent_age = tcp_ts_getticks();
tp->ts_recent = to->to_tsval;
}
- tp->snd_wnd = tiwin;
/*
* If the ACK bit is off: if in SYN-RECEIVED state or SENDSYN flag
* is on (half-synchronized state), then queue data for later
* processing; else drop segment and return.
*/
if ((thflags & TH_ACK) == 0) {
- if (IS_FASTOPEN(tp->t_flags)) {
+#ifdef TCP_RFC7413
+ if (tp->t_flags & TF_FASTOPEN) {
+ tp->snd_wnd = tiwin;
cc_conn_init(tp);
}
+#endif
return (rack_process_data(m, th, so, tp, drop_hdrlen, tlen,
tiwin, thflags, nxt_pkt));
}
@@ -5539,22 +5587,13 @@
if ((tp->t_flags & (TF_RCVD_SCALE | TF_REQ_SCALE)) ==
(TF_RCVD_SCALE | TF_REQ_SCALE)) {
tp->rcv_scale = tp->request_r_scale;
+ tp->snd_wnd = tiwin;
}
/*
* Make transitions: SYN-RECEIVED -> ESTABLISHED SYN-RECEIVED* ->
* FIN-WAIT-1
*/
tp->t_starttime = ticks;
- if (IS_FASTOPEN(tp->t_flags) && tp->t_tfo_pending) {
- tcp_fastopen_decrement_counter(tp->t_tfo_pending);
- tp->t_tfo_pending = NULL;
-
- /*
- * Account for the ACK of our SYN prior to
- * regular ACK processing below.
- */
- tp->snd_una++;
- }
if (tp->t_flags & TF_NEEDFIN) {
tcp_state_change(tp, TCPS_FIN_WAIT_1);
tp->t_flags &= ~TF_NEEDFIN;
@@ -5562,13 +5601,25 @@
tcp_state_change(tp, TCPS_ESTABLISHED);
TCP_PROBE5(accept__established, NULL, tp,
mtod(m, const char *), tp, th);
+#ifdef TCP_RFC7413
+ if (tp->t_tfo_pending) {
+ tcp_fastopen_decrement_counter(tp->t_tfo_pending);
+ tp->t_tfo_pending = NULL;
+
+ /*
+ * Account for the ACK of our SYN prior to regular
+ * ACK processing below.
+ */
+ tp->snd_una++;
+ }
/*
* TFO connections call cc_conn_init() during SYN
* processing. Calling it again here for such connections
* is not harmless as it would undo the snd_cwnd reduction
* that occurs when a TFO SYN|ACK is retransmitted.
*/
- if (!IS_FASTOPEN(tp->t_flags))
+ if (!(tp->t_flags & TF_FASTOPEN))
+#endif
cc_conn_init(tp);
}
/*
@@ -5576,7 +5627,7 @@
* not, do so now to pass queued data to user.
*/
if (tlen == 0 && (thflags & TH_FIN) == 0)
- (void) tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
+ (void)tcp_reass(tp, (struct tcphdr *)0, NULL, 0,
(struct mbuf *)0);
tp->snd_wl1 = th->th_seq - 1;
if (rack_process_ack(m, th, so, tp, to, tiwin, tlen, &ourfinisacked, thflags, &ret_val)) {
@@ -5836,7 +5887,7 @@
rack_check_data_after_close(struct mbuf *m,
struct tcpcb *tp, int32_t *tlen, struct tcphdr *th, struct socket *so)
{
- struct tcp_rack *rack;
+ struct tcp_rack *rack;
INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
rack = (struct tcp_rack *)tp->t_fb_ptr;
@@ -6353,7 +6404,6 @@
rack->r_ctl.rc_prr_sendalot = rack_send_a_lot_in_prr;
rack->r_ctl.rc_min_to = rack_min_to;
rack->r_ctl.rc_prr_inc_var = rack_inc_var;
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
if (tp->snd_una != tp->snd_max) {
/* Create a send map for the current outstanding data */
struct rack_sendmap *rsm;
@@ -6375,6 +6425,8 @@
TAILQ_INSERT_TAIL(&rack->r_ctl.rc_tmap, rsm, r_tnext);
rsm->r_in_tmap = 1;
}
+ rack_stop_all_timers(tp);
+ rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
return (0);
}
@@ -6431,6 +6483,8 @@
uma_zfree(rack_pcb_zone, tp->t_fb_ptr);
tp->t_fb_ptr = NULL;
}
+ /* Make sure snd_nxt is correctly set */
+ tp->snd_nxt = tp->snd_max;
}
static void
@@ -6473,9 +6527,6 @@
case TCPS_CLOSED:
case TCPS_TIME_WAIT:
default:
-#ifdef INVARIANTS
- panic("tcp tp:%p state:%d sees impossible state?", tp, tp->t_state);
-#endif
break;
};
}
@@ -6585,10 +6636,6 @@
* allow the tcbinfo to be in either locked or unlocked, as the
* caller may have unnecessarily acquired a lock due to a race.
*/
- if ((thflags & (TH_SYN | TH_FIN | TH_RST)) != 0 ||
- tp->t_state != TCPS_ESTABLISHED) {
- INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
- }
INP_WLOCK_ASSERT(tp->t_inpcb);
KASSERT(tp->t_state > TCPS_LISTEN, ("%s: TCPS_LISTEN",
__func__));
@@ -6600,37 +6647,17 @@
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.inhpts = rack->rc_inp->inp_in_hpts;
log.u_bbr.ininput = rack->rc_inp->inp_in_input;
+ log.u_bbr.flex2 = rack->r_ctl.rc_num_maps_alloced;
TCP_LOG_EVENT(tp, th, &so->so_rcv, &so->so_snd, TCP_LOG_IN, 0,
tlen, &log, true);
}
- if ((thflags & TH_SYN) && (thflags & TH_FIN) && V_drop_synfin) {
- way_out = 4;
- goto done_with_input;
- }
/*
- * If a segment with the ACK-bit set arrives in the SYN-SENT state
- * check SEQ.ACK first as described on page 66 of RFC 793, section 3.9.
- */
- if ((tp->t_state == TCPS_SYN_SENT) && (thflags & TH_ACK) &&
- (SEQ_LEQ(th->th_ack, tp->iss) || SEQ_GT(th->th_ack, tp->snd_max))) {
- rack_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
- return;
- }
- /*
* Segment received on connection. Reset idle time and keep-alive
* timer. XXX: This should be done after segment validation to
* ignore broken/spoofed segs.
*/
if (tp->t_idle_reduce && (tp->snd_max == tp->snd_una)) {
-#ifdef NETFLIX_CWV
- if ((tp->cwv_enabled) &&
- ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))) {
- tcp_newcwv_nvp_closedown(tp);
- } else
-#endif
- if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
+ if ((ticks - tp->t_rcvtime) >= tp->t_rxtcur) {
counter_u64_add(rack_input_idle_reduces, 1);
rack_cc_after_idle(tp,
(rack->r_idle_reduce_largest ? 1 :0));
@@ -6639,14 +6666,6 @@
rack->r_ctl.rc_rcvtime = cts;
tp->t_rcvtime = ticks;
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
- tcp_newcwv_nvp_closedown(tp);
- }
-#endif
/*
* Unscale the window into a 32-bit value. For the SYN_SENT state
* the scale is zero.
@@ -6737,22 +6756,6 @@
if ((tp->t_flags & TF_SACK_PERMIT) &&
(to.to_flags & TOF_SACKPERM) == 0)
tp->t_flags &= ~TF_SACK_PERMIT;
- if (IS_FASTOPEN(tp->t_flags)) {
- if (to.to_flags & TOF_FASTOPEN) {
- uint16_t mss;
-
- if (to.to_flags & TOF_MSS)
- mss = to.to_mss;
- else
- if ((tp->t_inpcb->inp_vflag & INP_IPV6) != 0)
- mss = TCP6_MSS;
- else
- mss = TCP_MSS;
- tcp_fastopen_update_cache(tp, mss,
- to.to_tfo_len, to.to_tfo_cookie);
- } else
- tcp_fastopen_disable_path(tp);
- }
}
/*
* At this point we are at the initial call. Here we decide
@@ -6769,7 +6772,6 @@
/* Set the flag */
rack->r_is_v6 = (tp->t_inpcb->inp_vflag & INP_IPV6) != 0;
tcp_set_hpts(tp->t_inpcb);
- rack_stop_all_timers(tp);
sack_filter_clear(&rack->r_ctl.rack_sf, th->th_ack);
}
/*
@@ -6801,24 +6803,6 @@
*/
INP_WLOCK_ASSERT(tp->t_inpcb);
tcp_rack_xmit_timer_commit(rack, tp);
- if (((tp->snd_max - tp->snd_una) > tp->snd_wnd) &&
- (rack->rc_in_persist == 0)){
- /*
- * The peer shrunk its window on us to the point
- * where we have sent too much. The only thing
- * we can do here is stop any timers and
- * enter persist. We most likely lost the last
- * bytes we sent but oh well, we will have to
- * retransmit them after the peer is caught up.
- */
- if (rack->rc_inp->inp_in_hpts)
- tcp_hpts_remove(rack->rc_inp, HPTS_REMOVE_OUTPUT);
- rack_timer_cancel(tp, rack, cts, __LINE__);
- rack_enter_persist(tp, rack, cts);
- rack_start_hpts_timer(rack, tp, tcp_ts_getticks(), __LINE__, 0, 0, 0);
- way_out = 3;
- goto done_with_input;
- }
if (nxt_pkt == 0) {
if (rack->r_wanted_output != 0) {
did_out = 1;
@@ -6848,7 +6832,6 @@
rack_timer_audit(tp, rack, &so->so_snd);
way_out = 2;
}
- done_with_input:
rack_log_doseg_done(rack, cts, nxt_pkt, did_out, way_out);
if (did_out)
rack->r_wanted_output = 0;
@@ -6871,7 +6854,7 @@
#ifdef RSS
struct tcp_function_block *tfb;
struct tcp_rack *rack;
- struct epoch_tracker et;
+ struct inpcb *inp;
rack = (struct tcp_rack *)tp->t_fb_ptr;
if (rack->r_state == 0) {
@@ -6879,11 +6862,9 @@
* Initial input (ACK to SYN-ACK etc)lets go ahead and get
* it processed
*/
- INP_INFO_RLOCK_ET(&V_tcbinfo, et);
tcp_get_usecs(&tv);
rack_hpts_do_segment(m, th, so, tp, drop_hdrlen,
tlen, iptos, 0, &tv);
- INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
return;
}
tcp_queue_to_input(tp, m, th, tlen, drop_hdrlen, iptos);
@@ -6959,13 +6940,17 @@
#ifdef TCPDEBUG
struct ipovly *ipov = NULL;
#endif
+#ifdef NETFLIX_TCP_O_UDP
struct udphdr *udp = NULL;
+#endif
struct tcp_rack *rack;
struct tcphdr *th;
uint8_t pass = 0;
- uint8_t wanted_cookie = 0;
u_char opt[TCP_MAXOLEN];
- unsigned ipoptlen, optlen, hdrlen, ulen=0;
+ unsigned ipoptlen, optlen, hdrlen;
+#ifdef NETFLIX_TCP_O_UDP
+ unsigned ulen;
+#endif
uint32_t rack_seq;
#if defined(IPSEC) || defined(IPSEC_SUPPORT)
@@ -7004,6 +6989,18 @@
if (tp->t_flags & TF_TOE)
return (tcp_offload_output(tp));
#endif
+
+#ifdef TCP_RFC7413
+ /*
+ * For TFO connections in SYN_RECEIVED, only allow the initial
+ * SYN|ACK and those sent by the retransmit timer.
+ */
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
+ SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN|ACK sent */
+ (tp->snd_nxt != tp->snd_una)) /* not a retransmit */
+ return (0);
+#endif
#ifdef INET6
if (rack->r_state) {
/* Use the cache line loaded if possible */
@@ -7046,31 +7043,12 @@
rack->r_wanted_output = 0;
rack->r_timer_override = 0;
/*
- * For TFO connections in SYN_SENT or SYN_RECEIVED,
- * only allow the initial SYN or SYN|ACK and those sent
- * by the retransmit timer.
- */
- if (IS_FASTOPEN(tp->t_flags) &&
- ((tp->t_state == TCPS_SYN_RECEIVED) ||
- (tp->t_state == TCPS_SYN_SENT)) &&
- SEQ_GT(tp->snd_max, tp->snd_una) && /* initial SYN or SYN|ACK sent */
- (tp->t_rxtshift == 0)) /* not a retransmit */
- return (0);
- /*
* Determine length of data that should be transmitted, and flags
* that will be used. If there is some data or critical controls
* (SYN, RST) to send, then transmit; otherwise, investigate
* further.
*/
idle = (tp->t_flags & TF_LASTIDLE) || (tp->snd_max == tp->snd_una);
-#ifdef NETFLIX_CWV
- if (tp->cwv_enabled) {
- if ((tp->cwv_cwnd_valid == 0) &&
- TCPS_HAVEESTABLISHED(tp->t_state) &&
- (tp->snd_cwnd > tp->snd_cwv.init_cwnd))
- tcp_newcwv_nvp_closedown(tp);
- } else
-#endif
if (tp->t_idle_reduce) {
if (idle && ((ticks - tp->t_rcvtime) >= tp->t_rxtcur))
rack_cc_after_idle(tp,
@@ -7141,10 +7119,12 @@
tlen = rsm->r_end - rsm->r_start;
if (tlen > tp->t_maxseg)
tlen = tp->t_maxseg;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
+#ifdef INVARIANTS
+ if (SEQ_GT(tp->snd_una, rsm->r_start)) {
+ panic("tp:%p rack:%p snd_una:%u rsm:%p r_start:%u",
+ tp, rack, tp->snd_una, rsm, rsm->r_start);
+ }
+#endif
sb_offset = rsm->r_start - tp->snd_una;
cwin = min(tp->snd_wnd, tlen);
len = cwin;
@@ -7155,14 +7135,12 @@
len = rsm->r_end - rsm->r_start;
sack_rxmit = 1;
sendalot = 0;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
if (len >= tp->t_maxseg) {
len = tp->t_maxseg;
}
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
} else if ((rack->rc_in_persist == 0) &&
((rsm = tcp_rack_output(tp, rack, cts)) != NULL)) {
long tlen;
@@ -7187,10 +7165,6 @@
}
#endif
tlen = rsm->r_end - rsm->r_start;
- KASSERT(SEQ_LEQ(tp->snd_una, rsm->r_start),
- ("%s:%d: r.start:%u < SND.UNA:%u; tp:%p, rack:%p, rsm:%p",
- __func__, __LINE__,
- rsm->r_start, tp->snd_una, tp, rack, rsm));
sb_offset = rsm->r_start - tp->snd_una;
if (tlen > rack->r_ctl.rc_prr_sndcnt) {
len = rack->r_ctl.rc_prr_sndcnt;
@@ -7212,6 +7186,8 @@
goto just_return_nolock;
}
}
+ KASSERT(sb_offset >= 0, ("%s: sack block to the left of una : %d",
+ __func__, sb_offset));
if (len > 0) {
sub_from_prr = 1;
sack_rxmit = 1;
@@ -7236,6 +7212,20 @@
/* For debugging */
rack->r_ctl.rc_rsm_at_retran = rsm;
#endif
+ /*
+ * Enforce a connection sendmap count limit, if set,
+ * as long as we are not retransmitting.
+ */
+ if ((rsm == NULL) &&
+ (rack_map_entries_limit > 0) &&
+ (rack->r_ctl.rc_num_maps_alloced >= rack_map_entries_limit)) {
+ counter_u64_add(rack_to_alloc_limited, 1);
+ if (!rack->alloc_limit_reported) {
+ rack->alloc_limit_reported = 1;
+ counter_u64_add(rack_alloc_limited_conns, 1);
+ }
+ goto just_return_nolock;
+ }
/*
* Get standard flags, and add SYN or FIN if requested by 'hidden'
* state flags.
@@ -7306,7 +7296,7 @@
uint32_t avail;
avail = sbavail(sb);
- if (SEQ_GT(tp->snd_nxt, tp->snd_una) && avail)
+ if (SEQ_GT(tp->snd_nxt, tp->snd_una))
sb_offset = tp->snd_nxt - tp->snd_una;
else
sb_offset = 0;
@@ -7347,9 +7337,18 @@
* data possible so far in the scoreboard.
*/
outstanding = tp->snd_max - tp->snd_una;
- if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd)
- len = 0;
- else if (avail > sb_offset)
+ if ((rack->r_ctl.rc_prr_sndcnt + outstanding) > tp->snd_wnd) {
+ if (tp->snd_wnd > outstanding) {
+ len = tp->snd_wnd - outstanding;
+ /* Check to see if we have the data */
+ if (((sb_offset + len) > avail) &&
+ (avail > sb_offset))
+ len = avail - sb_offset;
+ else
+ len = 0;
+ } else
+ len = 0;
+ } else if (avail > sb_offset)
len = avail - sb_offset;
else
len = 0;
@@ -7398,18 +7397,22 @@
* SYN-SENT state and if segment contains data and if we don't know
* that foreign host supports TAO, suppress sending segment.
*/
- if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una) &&
- ((sack_rxmit == 0) && (tp->t_rxtshift == 0))) {
- if (tp->t_state != TCPS_SYN_RECEIVED)
+ if ((flags & TH_SYN) && SEQ_GT(tp->snd_nxt, tp->snd_una)) {
+ if ((tp->t_state != TCPS_SYN_RECEIVED) &&
+ (tp->t_state != TCPS_SYN_SENT))
flags &= ~TH_SYN;
+#ifdef TCP_RFC7413
/*
* When sending additional segments following a TFO SYN|ACK,
* do not include the SYN bit.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
+ if ((tp->t_flags & TF_FASTOPEN) &&
(tp->t_state == TCPS_SYN_RECEIVED))
flags &= ~TH_SYN;
+#endif
sb_offset--, len++;
+ if (sbavail(sb) == 0)
+ len = 0;
}
/*
* Be careful not to send data and/or FIN on SYN segments. This
@@ -7420,29 +7423,16 @@
len = 0;
flags &= ~TH_FIN;
}
+#ifdef TCP_RFC7413
/*
- * On TFO sockets, ensure no data is sent in the following cases:
- *
- * - When retransmitting SYN|ACK on a passively-created socket
- *
- * - When retransmitting SYN on an actively created socket
- *
- * - When sending a zero-length cookie (cookie request) on an
- * actively created socket
- *
- * - When the socket is in the CLOSED state (RST is being sent)
+ * When retransmitting SYN|ACK on a passively-created TFO socket,
+ * don't include data, as the presence of data may have caused the
+ * original SYN|ACK to have been dropped by a middlebox.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
- (((flags & TH_SYN) && (tp->t_rxtshift > 0)) ||
- ((tp->t_state == TCPS_SYN_SENT) &&
- (tp->t_tfo_client_cookie_len == 0)) ||
- (flags & TH_RST))) {
- sack_rxmit = 0;
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ ((tp->t_state == TCPS_SYN_RECEIVED) && (tp->t_rxtshift > 0)))
len = 0;
- }
- /* Without fast-open there should never be data sent on a SYN */
- if ((flags & TH_SYN) && (!IS_FASTOPEN(tp->t_flags)))
- len = 0;
+#endif
if (len <= 0) {
/*
* If FIN has been sent but not acked, but we haven't been
@@ -7519,7 +7509,9 @@
ipoptlen += ipsec_optlen;
#endif
if ((tp->t_flags & TF_TSO) && V_tcp_do_tso && len > tp->t_maxseg &&
+#ifdef NETFLIX_TCP_O_UDP
(tp->t_port == 0) &&
+#endif
((tp->t_flags & TF_SIGNATURE) == 0) &&
tp->rcv_numsacks == 0 && sack_rxmit == 0 &&
ipoptlen == 0)
@@ -7688,10 +7680,13 @@
* If our state indicates that FIN should be sent and we have not
* yet done so, then we need to send.
*/
- if ((flags & TH_FIN) &&
- (tp->snd_nxt == tp->snd_una)) {
- pass = 11;
- goto send;
+ if (flags & TH_FIN) {
+ if ((tp->t_flags & TF_SENTFIN) ||
+ (((tp->t_flags & TF_SENTFIN) == 0) &&
+ (tp->snd_nxt == tp->snd_una))) {
+ pass = 11;
+ goto send;
+ }
}
/*
* No reason to send a segment, just return.
@@ -7750,44 +7745,27 @@
if (flags & TH_SYN) {
tp->snd_nxt = tp->iss;
to.to_mss = tcp_mssopt(&inp->inp_inc);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port)
to.to_mss -= V_tcp_udp_tunneling_overhead;
#endif
to.to_flags |= TOF_MSS;
-
+#ifdef TCP_RFC7413
/*
- * On SYN or SYN|ACK transmits on TFO connections,
- * only include the TFO option if it is not a
- * retransmit, as the presence of the TFO option may
- * have caused the original SYN or SYN|ACK to have
- * been dropped by a middlebox.
+ * Only include the TFO option on the first
+ * transmission of the SYN|ACK on a
+ * passively-created TFO socket, as the presence of
+ * the TFO option may have caused the original
+ * SYN|ACK to have been dropped by a middlebox.
*/
- if (IS_FASTOPEN(tp->t_flags) &&
+ if ((tp->t_flags & TF_FASTOPEN) &&
+ (tp->t_state == TCPS_SYN_RECEIVED) &&
(tp->t_rxtshift == 0)) {
- if (tp->t_state == TCPS_SYN_RECEIVED) {
- to.to_tfo_len = TCP_FASTOPEN_COOKIE_LEN;
- to.to_tfo_cookie =
- (u_int8_t *)&tp->t_tfo_cookie.server;
- to.to_flags |= TOF_FASTOPEN;
- wanted_cookie = 1;
- } else if (tp->t_state == TCPS_SYN_SENT) {
- to.to_tfo_len =
- tp->t_tfo_client_cookie_len;
- to.to_tfo_cookie =
- tp->t_tfo_cookie.client;
- to.to_flags |= TOF_FASTOPEN;
- wanted_cookie = 1;
- /*
- * If we wind up having more data to
- * send with the SYN than can fit in
- * one segment, don't send any more
- * until the SYN|ACK comes back from
- * the other end.
- */
- sendalot = 0;
- }
+ to.to_tfo_len = TCP_FASTOPEN_MAX_COOKIE_LEN;
+ to.to_tfo_cookie = (u_char *)&tp->t_tfo_cookie;
+ to.to_flags |= TOF_FASTOPEN;
}
+#endif
}
/* Window scaling. */
if ((flags & TH_SYN) && (tp->t_flags & TF_REQ_SCALE)) {
@@ -7822,15 +7800,8 @@
/* Processing the options. */
hdrlen += optlen = tcp_addoptions(&to, opt);
- /*
- * If we wanted a TFO option to be added, but it was unable
- * to fit, ensure no data is sent.
- */
- if (IS_FASTOPEN(tp->t_flags) && wanted_cookie &&
- !(to.to_flags & TOF_FASTOPEN))
- len = 0;
}
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
if (V_tcp_udp_tunneling_port == 0) {
/* The port was removed?? */
@@ -7996,8 +7967,8 @@
msb = NULL;
else
msb = sb;
- m->m_next = tcp_m_copym(mb, moff, &len,
- if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb);
+ m->m_next = tcp_m_copym(/*tp, */ mb, moff, &len,
+ if_hw_tsomaxsegcount, if_hw_tsomaxsegsize, msb /*, 0, NULL*/);
if (len <= (tp->t_maxseg - optlen)) {
/*
* Must have ran out of mbufs for the copy
@@ -8031,6 +8002,8 @@
* TLP should not count in retran count, but
* in its own bin
*/
+/* tp->t_sndtlppack++;*/
+/* tp->t_sndtlpbyte += len;*/
counter_u64_add(rack_tlp_retran, 1);
counter_u64_add(rack_tlp_retran_bytes, len);
} else {
@@ -8156,7 +8129,7 @@
#ifdef INET6
if (isipv6) {
ip6 = mtod(m, struct ip6_hdr *);
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip6 + ipoptlen + sizeof(struct ip6_hdr));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8164,10 +8137,10 @@
ulen = hdrlen + len - sizeof(struct ip6_hdr);
udp->uh_ulen = htons(ulen);
th = (struct tcphdr *)(udp + 1);
- } else
+ } else
#endif
th = (struct tcphdr *)(ip6 + 1);
- tcpip_fillheaders(inp, ip6, th);
+ tcpip_fillheaders(inp, /*tp->t_port, */ ip6, th);
} else
#endif /* INET6 */
{
@@ -8175,7 +8148,7 @@
#ifdef TCPDEBUG
ipov = (struct ipovly *)ip;
#endif
-#ifdef NETFLIX_TCPOUDP
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
udp = (struct udphdr *)((caddr_t)ip + ipoptlen + sizeof(struct ip));
udp->uh_sport = htons(V_tcp_udp_tunneling_port);
@@ -8186,7 +8159,7 @@
} else
#endif
th = (struct tcphdr *)(ip + 1);
- tcpip_fillheaders(inp, ip, th);
+ tcpip_fillheaders(inp,/*tp->t_port, */ ip, th);
}
/*
* Fill in fields, remembering maximum advertised window for use in
@@ -8277,20 +8250,15 @@
/*
* Calculate receive window. Don't shrink window, but avoid silly
* window syndrome.
- * If a RST segment is sent, advertise a window of zero.
*/
- if (flags & TH_RST) {
+ if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
+ recwin < (long)tp->t_maxseg)
recwin = 0;
- } else {
- if (recwin < (long)(so->so_rcv.sb_hiwat / 4) &&
- recwin < (long)tp->t_maxseg)
- recwin = 0;
- if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
- recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
- recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
- if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
- recwin = (long)TCP_MAXWIN << tp->rcv_scale;
- }
+ if (SEQ_GT(tp->rcv_adv, tp->rcv_nxt) &&
+ recwin < (long)(tp->rcv_adv - tp->rcv_nxt))
+ recwin = (long)(tp->rcv_adv - tp->rcv_nxt);
+ if (recwin > (long)TCP_MAXWIN << tp->rcv_scale)
+ recwin = (long)TCP_MAXWIN << tp->rcv_scale;
/*
* According to RFC1323 the window field in a SYN (i.e., a <SYN> or
@@ -8357,18 +8325,23 @@
* ip6_plen is not need to be filled now, and will be filled
* in ip6_output.
*/
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in6_cksum_pseudo(ip6, ulen, IPPROTO_UDP, 0);
th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
} else {
+#endif
m->m_pkthdr.csum_flags = CSUM_TCP_IPV6;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in6_cksum_pseudo(ip6,
sizeof(struct tcphdr) + optlen + len, IPPROTO_TCP,
0);
+#ifdef NETFLIX_TCP_O_UDP
}
+#endif
}
#endif
#if defined(INET6) && defined(INET)
@@ -8376,19 +8349,24 @@
#endif
#ifdef INET
{
+#ifdef NETFLIX_TCP_O_UDP
if (tp->t_port) {
m->m_pkthdr.csum_flags = CSUM_UDP;
m->m_pkthdr.csum_data = offsetof(struct udphdr, uh_sum);
udp->uh_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(ulen + IPPROTO_UDP));
th->th_sum = htons(0);
+ UDPSTAT_INC(udps_opackets);
} else {
+#endif
m->m_pkthdr.csum_flags = CSUM_TCP;
m->m_pkthdr.csum_data = offsetof(struct tcphdr, th_sum);
th->th_sum = in_pseudo(ip->ip_src.s_addr,
ip->ip_dst.s_addr, htons(sizeof(struct tcphdr) +
IPPROTO_TCP + len + optlen));
+#ifdef NETFLIX_TCP_O_UDP
}
+#endif
/* IP version must be set here for ipv4/ipv6 checking later */
KASSERT(ip->ip_v == IPVERSION,
("%s: IP version incorrect: %d", __func__, ip->ip_v));
@@ -8559,6 +8537,10 @@
* retransmit. In persist state, just set snd_max.
*/
if (error == 0) {
+/* if (TCPS_HAVEESTABLISHED(tp->t_state) &&
+ (tp->t_flags & TF_SACK_PERMIT) &&
+ tp->rcv_numsacks > 0)
+ tcp_clean_dsack_blocks(tp);*/
if (len == 0)
counter_u64_add(rack_out_size[TCP_MSS_ACCT_SNDACK], 1);
else if (len == 1) {
@@ -8574,15 +8556,19 @@
}
}
if (sub_from_prr && (error == 0)) {
- rack->r_ctl.rc_prr_sndcnt -= len;
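+		/* Never let the PRR send count underflow (it is unsigned). */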
+ if (rack->r_ctl.rc_prr_sndcnt >= len)
+ rack->r_ctl.rc_prr_sndcnt -= len;
+ else
+ rack->r_ctl.rc_prr_sndcnt = 0;
}
sub_from_prr = 0;
rack_log_output(tp, &to, len, rack_seq, (uint8_t) flags, error, cts,
pass, rsm);
if ((tp->t_flags & TF_FORCEDATA) == 0 ||
(rack->rc_in_persist == 0)) {
+#ifdef NETFLIX_STATS
tcp_seq startseq = tp->snd_nxt;
-
+#endif
/*
* Advance snd_nxt over sequence space of this segment.
*/
@@ -8613,17 +8599,6 @@
tp->t_acktime = ticks;
}
tp->snd_max = tp->snd_nxt;
- /*
- * Time this transmission if not a retransmission and
- * not currently timing anything.
- * This is only relevant in case of switching back to
- * the base stack.
- */
- if (tp->t_rtttime == 0) {
- tp->t_rtttime = ticks;
- tp->t_rtseq = startseq;
- TCPSTAT_INC(tcps_segstimed);
- }
#ifdef NETFLIX_STATS
if (!(tp->t_flags & TF_GPUTINPROG) && len) {
tp->t_flags |= TF_GPUTINPROG;
@@ -8996,9 +8971,7 @@
return (tcp_default_ctloutput(so, sopt, inp, tp));
break;
}
-#ifdef NETFLIX_STATS
- tcp_log_socket_option(tp, sopt->sopt_name, optval, error);
-#endif
+/* tcp_log_socket_option(tp, sopt->sopt_name, optval, error);*/
INP_WUNLOCK(inp);
return (error);
}
@@ -9131,7 +9104,6 @@
.tfb_tcp_block_name = __XSTRING(STACKNAME),
.tfb_tcp_output = rack_output,
.tfb_tcp_do_segment = rack_do_segment,
- .tfb_tcp_hpts_do_segment = rack_hpts_do_segment,
.tfb_tcp_ctloutput = rack_ctloutput,
.tfb_tcp_fb_init = rack_init,
.tfb_tcp_fb_fini = rack_fini,
@@ -9241,4 +9213,3 @@
MODULE_VERSION(MODNAME, 1);
DECLARE_MODULE(MODNAME, tcp_rack, SI_SUB_PROTO_DOMAIN, SI_ORDER_ANY);
-MODULE_DEPEND(MODNAME, tcphpts, 1, 1, 1);
Index: head/sys/netinet/tcp_stacks/rack_bbr_common.h
===================================================================
--- head/sys/netinet/tcp_stacks/rack_bbr_common.h
+++ head/sys/netinet/tcp_stacks/rack_bbr_common.h
@@ -38,17 +38,8 @@
#define TCP_MSS_ACCT_SIZE 70
#define TCP_MSS_SMALL_MAX_SIZE_DIV (TCP_MSS_ACCT_SIZE - TCP_MSS_SMALL_SIZE_OFF)
+#define DUP_ACK_THRESHOLD 3	/* Number of duplicate ACKs that triggers loss recovery */
-/* Magic flags to tell whats cooking on the pacing wheel */
-#define PACE_PKT_OUTPUT 0x01 /* Output Packets being paced */
-#define PACE_TMR_RACK 0x02 /* RACK timer running */
-#define PACE_TMR_TLP 0x04 /* TLP timer running */
-#define PACE_TMR_RXT 0x08 /* Retransmit timer running */
-#define PACE_TMR_PERSIT 0x10 /* Persists timer running */
-#define PACE_TMR_KEEP 0x20 /* Keep alive timer running */
-#define PACE_TMR_DELACK 0x40 /* Delayed ack timer running */
-#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
-
/* Magic flags for tracing progress events */
#define PROGRESS_DROP 1
#define PROGRESS_UPDATE 2
@@ -61,8 +52,66 @@
#define USE_RTT_LOW 1
#define USE_RTT_AVG 2
+#define PACE_MAX_IP_BYTES 65536
+#define USECS_IN_SECOND 1000000
+#define MSEC_IN_SECOND 1000
+#define MS_IN_USEC 1000
+#define USEC_TO_MSEC(x) (x / MS_IN_USEC)
+#define TCP_TS_OVERHEAD 12 /* Overhead of having Timestamps on */
+
#ifdef _KERNEL
/* We have only 7 bits in rack so assert its true */
CTASSERT((PACE_TMR_MASK & 0x80) == 0);
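+/* Prototypes for the common TCP functions (ctf_*) shared by the rack and BBR stacks. */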
+#ifdef KERN_TLS
+uint32_t ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd);
+#endif
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so,
+ struct mbuf *m, int has_pkt);
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt);
+uint32_t ctf_outstanding(struct tcpcb *tp);
+uint32_t ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked);
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m,
+ struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf,
+ int32_t * drop_hdrlen, int32_t * ret_val);
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val);
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp,
+ struct tcphdr *th, int32_t rstreason, int32_t tlen);
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp);
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th,
+ struct socket *so, struct tcpcb *tp);
+
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t * ret_val);
+
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th,
+ struct tcpcb *tp, int32_t tlen, int32_t thflags, int32_t * ret_val);
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp);
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen);
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp);
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks);
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay_percentage);
+
#endif
#endif
Index: head/sys/netinet/tcp_stacks/rack_bbr_common.c
===================================================================
--- head/sys/netinet/tcp_stacks/rack_bbr_common.c
+++ head/sys/netinet/tcp_stacks/rack_bbr_common.c
@@ -0,0 +1,859 @@
+/*-
+ * Copyright (c) 2016-2018
+ * Netflix Inc.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ * notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+ * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
+ * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+ * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
+ * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
+ * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
+ * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
+ * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
+ * SUCH DAMAGE.
+ *
+ */
+/*
+ * Author: Randall Stewart <rrs@netflix.com>
+ * This work is based on the ACM Queue paper
+ * BBR - Congestion Based Congestion Control
+ * and also numerous discussions with Neal, Yuchung and Van.
+ */
+
+#include <sys/cdefs.h>
+__FBSDID("$FreeBSD$");
+
+#include "opt_inet.h"
+#include "opt_inet6.h"
+#include "opt_ipsec.h"
+#include "opt_tcpdebug.h"
+#include "opt_ratelimit.h"
+/*#include "opt_kern_tls.h"*/
+#include <sys/param.h>
+#include <sys/module.h>
+#include <sys/kernel.h>
+#ifdef TCP_HHOOK
+#include <sys/hhook.h>
+#endif
+#include <sys/malloc.h>
+#include <sys/mbuf.h>
+#include <sys/proc.h>
+#include <sys/socket.h>
+#include <sys/socketvar.h>
+#ifdef KERN_TLS
+#include <sys/sockbuf_tls.h>
+#endif
+#include <sys/sysctl.h>
+#include <sys/systm.h>
+#include <sys/tree.h>
+#include <sys/refcount.h>
+#include <sys/queue.h>
+#include <sys/smp.h>
+#include <sys/kthread.h>
+#include <sys/lock.h>
+#include <sys/mutex.h>
+#include <sys/time.h>
+#include <vm/uma.h>
+#include <sys/kern_prefetch.h>
+
+#include <net/route.h>
+#include <net/vnet.h>
+#include <net/ethernet.h>
+#include <net/bpf.h>
+
+#define TCPSTATES /* for logging */
+
+#include <netinet/in.h>
+#include <netinet/in_kdtrace.h>
+#include <netinet/in_pcb.h>
+#include <netinet/ip.h>
+#include <netinet/ip_icmp.h> /* required for icmp_var.h */
+#include <netinet/icmp_var.h> /* for ICMP_BANDLIM */
+#include <netinet/ip_var.h>
+#include <netinet/ip6.h>
+#include <netinet6/in6_pcb.h>
+#include <netinet6/ip6_var.h>
+#define TCPOUTFLAGS
+#include <netinet/tcp.h>
+#include <netinet/tcp_fsm.h>
+#include <netinet/tcp_seq.h>
+#include <netinet/tcp_timer.h>
+#include <netinet/tcp_var.h>
+#include <netinet/tcpip.h>
+#include <netinet/tcp_hpts.h>
+#include <netinet/cc/cc.h>
+#include <netinet/tcp_log_buf.h>
+#ifdef TCPDEBUG
+#include <netinet/tcp_debug.h>
+#endif /* TCPDEBUG */
+#ifdef TCP_OFFLOAD
+#include <netinet/tcp_offload.h>
+#endif
+#ifdef INET6
+#include <netinet6/tcp6_var.h>
+#endif
+#include <netinet/tcp_fastopen.h>
+
+#include <netipsec/ipsec_support.h>
+#include <net/if.h>
+#include <net/if_var.h>
+
+#if defined(IPSEC) || defined(IPSEC_SUPPORT)
+#include <netipsec/ipsec.h>
+#include <netipsec/ipsec6.h>
+#endif /* IPSEC */
+
+#include <netinet/udp.h>
+#include <netinet/udp_var.h>
+#include <machine/in_cksum.h>
+
+#ifdef MAC
+#include <security/mac/mac_framework.h>
+#endif
+#include "rack_bbr_common.h"
+
+/*
+ * Common TCP Functions - These are shared by both
+ * rack and BBR.
+ */
+
+
+#ifdef KERN_TLS
+uint32_t
+ctf_get_opt_tls_size(struct socket *so, uint32_t rwnd)
+{
+ struct sbtls_info *tls;
+ uint32_t len;
+
+again:
+ tls = so->so_snd.sb_tls_info;
+ len = tls->sb_params.sb_maxlen; /* max tls payload */
+ len += tls->sb_params.sb_tls_hlen; /* tls header len */
+ len += tls->sb_params.sb_tls_tlen; /* tls trailer len */
+ if ((len * 4) > rwnd) {
+ /*
+		 * Stroke a "this will suck" counter? And what
+		 * else should we do, Drew? From the
+ * TCP perspective I am not sure
+ * what should be done...
+ */
+ if (tls->sb_params.sb_maxlen > 4096) {
+ tls->sb_params.sb_maxlen -= 4096;
+ if (tls->sb_params.sb_maxlen < 4096)
+ tls->sb_params.sb_maxlen = 4096;
+ goto again;
+ }
+ }
+ return (len);
+}
+#endif
+
+int
+ctf_process_inbound_raw(struct tcpcb *tp, struct socket *so, struct mbuf *m, int has_pkt)
+{
+ /*
+	 * We are passed a raw chain of mbuf packets
+ * that arrived in LRO. They are linked via
+ * the m_nextpkt link in the pkt-headers.
+ *
+ * We process each one by:
+ * a) saving off the next
+ * b) stripping off the ether-header
+ * c) formulating the arguments for
+	 *     the tfb_do_segment_nounlock call
+	 *  d) calling tfb_do_segment_nounlock for each mbuf,
+	 *     after adjusting the time to match the arrival time.
+ * Note that the LRO code assures no IP options are present.
+ *
+	 * The semantics for calling tfb_do_segment_nounlock are the
+ * following:
+ * 1) It returns 0 if all went well and you (the caller) need
+ * to release the lock.
+	 *  2) If nxt_pkt is set, then the function will suppress calls
+ * to tfb_tcp_output() since you are promising to call again
+ * with another packet.
+ * 3) If it returns 1, then you must free all the packets being
+ * shipped in, the tcb has been destroyed (or about to be destroyed).
+ */
+ struct mbuf *m_save;
+ struct ether_header *eh;
+ struct epoch_tracker et;
+ struct tcphdr *th;
+#ifdef INET6
+ struct ip6_hdr *ip6 = NULL; /* Keep compiler happy. */
+#endif
+#ifdef INET
+ struct ip *ip = NULL; /* Keep compiler happy. */
+#endif
+ struct ifnet *ifp;
+ struct timeval tv;
+ int32_t retval, nxt_pkt, tlen, off;
+ uint16_t etype;
+ uint16_t drop_hdrlen;
+ uint8_t iptos, no_vn=0, bpf_req=0;
+
+ /*
+ * This is a bit deceptive, we get the
+ * "info epoch" which is really the network
+	 * epoch. This covers us both against any INP
+	 * type change and also against the ifp going
+	 * away.
+ */
+ INP_INFO_RLOCK_ET(&V_tcbinfo, et);
+ if (m && m->m_pkthdr.rcvif)
+ ifp = m->m_pkthdr.rcvif;
+ else
+ ifp = NULL;
+ if (ifp) {
+ bpf_req = bpf_peers_present(ifp->if_bpf);
+ } else {
+ /*
+		 * We should probably not work around this
+		 * but rather KASSERT, since LRO always sets rcvif.
+ */
+ no_vn = 1;
+ goto skip_vnet;
+ }
+ CURVNET_SET(ifp->if_vnet);
+skip_vnet:
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ /* Now lets get the ether header */
+ eh = mtod(m, struct ether_header *);
+ etype = ntohs(eh->ether_type);
+ /* Let the BPF see the packet */
+ if (bpf_req && ifp)
+ ETHER_BPF_MTAP(ifp, m);
+ m_adj(m, sizeof(*eh));
+ /* Trim off the ethernet header */
+ switch (etype) {
+#ifdef INET6
+ case ETHERTYPE_IPV6:
+ {
+ if (m->m_len < (sizeof(*ip6) + sizeof(*th))) {
+ m = m_pullup(m, sizeof(*ip6) + sizeof(*th));
+ if (m == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip6 = (struct ip6_hdr *)(eh + 1);
+ th = (struct tcphdr *)(ip6 + 1);
+ tlen = ntohs(ip6->ip6_plen);
+ drop_hdrlen = sizeof(*ip6);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID_IPV6) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in6_cksum_pseudo(ip6, tlen,
+ IPPROTO_TCP, m->m_pkthdr.csum_data);
+ th->th_sum ^= 0xffff;
+ } else
+ th->th_sum = in6_cksum(m, IPPROTO_TCP, drop_hdrlen, tlen);
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ /*
+ * Be proactive about unspecified IPv6 address in source.
+ * As we use all-zero to indicate unbounded/unconnected pcb,
+ * unspecified IPv6 address can be used to confuse us.
+ *
+			 * Note that packets with unspecified IPv6 destination are
+			 * already dropped in ip6_input.
+ */
+ if (IN6_IS_ADDR_UNSPECIFIED(&ip6->ip6_src)) {
+ /* XXX stat */
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ iptos = (ntohl(ip6->ip6_flow) >> 20) & 0xff;
+ break;
+ }
+#endif
+#ifdef INET
+ case ETHERTYPE_IP:
+ {
+ if (m->m_len < sizeof (struct tcpiphdr)) {
+ if ((m = m_pullup(m, sizeof (struct tcpiphdr)))
+ == NULL) {
+ TCPSTAT_INC(tcps_rcvshort);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ }
+ ip = (struct ip *)(eh + 1);
+ th = (struct tcphdr *)(ip + 1);
+ drop_hdrlen = sizeof(*ip);
+ iptos = ip->ip_tos;
+ tlen = ntohs(ip->ip_len) - sizeof(struct ip);
+ if (m->m_pkthdr.csum_flags & CSUM_DATA_VALID) {
+ if (m->m_pkthdr.csum_flags & CSUM_PSEUDO_HDR)
+ th->th_sum = m->m_pkthdr.csum_data;
+ else
+ th->th_sum = in_pseudo(ip->ip_src.s_addr,
+ ip->ip_dst.s_addr,
+ htonl(m->m_pkthdr.csum_data + tlen +
+ IPPROTO_TCP));
+ th->th_sum ^= 0xffff;
+ } else {
+ int len;
+ struct ipovly *ipov = (struct ipovly *)ip;
+ /*
+ * Checksum extended TCP header and data.
+ */
+ len = drop_hdrlen + tlen;
+ bzero(ipov->ih_x1, sizeof(ipov->ih_x1));
+ ipov->ih_len = htons(tlen);
+ th->th_sum = in_cksum(m, len);
+ /* Reset length for SDT probes. */
+ ip->ip_len = htons(len);
+ /* Reset TOS bits */
+ ip->ip_tos = iptos;
+ /* Re-initialization for later version check */
+ ip->ip_v = IPVERSION;
+ ip->ip_hl = sizeof(*ip) >> 2;
+ }
+ if (th->th_sum) {
+ TCPSTAT_INC(tcps_rcvbadsum);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ break;
+ }
+#endif
+ }
+ /*
+ * Convert TCP protocol specific fields to host format.
+ */
+ tcp_fields_to_host(th);
+
+ off = th->th_off << 2;
+ if (off < sizeof (struct tcphdr) || off > tlen) {
+ TCPSTAT_INC(tcps_rcvbadoff);
+ m_freem(m);
+ goto skipped_pkt;
+ }
+ tlen -= off;
+ drop_hdrlen += off;
+ /*
+ * Now lets setup the timeval to be when we should
+ * have been called (if we can).
+ */
+ m->m_pkthdr.lro_nsegs = 1;
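+			/* Prefer the arrival time recorded by LRO (M_TSTMP_LRO); rcv_tstmp is in nanoseconds. */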
+ if (m->m_flags & M_TSTMP_LRO) {
+ tv.tv_sec = m->m_pkthdr.rcv_tstmp / 1000000000;
+ tv.tv_usec = (m->m_pkthdr.rcv_tstmp % 1000000000) / 1000;
+ } else {
+			/* Should not happen; should we KASSERT instead? */
+ tcp_get_usecs(&tv);
+ }
+ /* Now what about next packet? */
+ if (m_save || has_pkt)
+ nxt_pkt = 1;
+ else
+ nxt_pkt = 0;
+ retval = (*tp->t_fb->tfb_do_segment_nounlock)(m, th, so, tp, drop_hdrlen, tlen,
+ iptos, nxt_pkt, &tv);
+ if (retval) {
+ /* We lost the lock and tcb probably */
+ m = m_save;
+ while (m) {
+ m_save = m->m_nextpkt;
+ m->m_nextpkt = NULL;
+ m_freem(m);
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+ }
+skipped_pkt:
+ m = m_save;
+ }
+ if (no_vn == 0)
+ CURVNET_RESTORE();
+ INP_INFO_RUNLOCK_ET(&V_tcbinfo, et);
+ return (retval);
+}
+
+int
+ctf_do_queued_segments(struct socket *so, struct tcpcb *tp, int have_pkt)
+{
+ struct mbuf *m;
+
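+	/* t_in_pkt/t_tail_pkt are the head and tail of the LRO-queued inbound packet chain. */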
+	/* First let's see if we have old packets */
+ if (tp->t_in_pkt) {
+ m = tp->t_in_pkt;
+ tp->t_in_pkt = NULL;
+ tp->t_tail_pkt = NULL;
+ if (ctf_process_inbound_raw(tp, so, m, have_pkt)) {
+ /* We lost the tcpcb (maybe a RST came in)? */
+ return (1);
+ }
+ }
+ return (0);
+}
+
+uint32_t
+ctf_outstanding(struct tcpcb *tp)
+{
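+	/* Bytes sent but not yet cumulatively acked. */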
+ return (tp->snd_max - tp->snd_una);
+}
+
+uint32_t
+ctf_flight_size(struct tcpcb *tp, uint32_t rc_sacked)
+{
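+	/* Flight size is the outstanding data (snd_max - snd_una) minus the bytes already SACKed. */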
+ if (rc_sacked <= ctf_outstanding(tp))
+ return (ctf_outstanding(tp) - rc_sacked);
+ else {
+ /* TSNH */
+#ifdef INVARIANTS
+ panic("tp:%p rc_sacked:%d > out:%d",
+ tp, rc_sacked, ctf_outstanding(tp));
+#endif
+ return (0);
+ }
+}
+
+void
+ctf_do_dropwithreset(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+ if (tp != NULL) {
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+ } else
+ tcp_dropwithreset(m, th, NULL, tlen, rstreason);
+}
+
+/*
+ * ctf_drop_checks returns 1 if the caller should not proceed. It places
+ * in ret_val what the caller should return (1/0). A 1 indicates
+ * that the TCB is unlocked and probably dropped. A 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_drop_checks(struct tcpopt *to, struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * tlenp, int32_t * thf, int32_t * drop_hdrlen, int32_t * ret_val)
+{
+ int32_t todrop;
+ int32_t thflags;
+ int32_t tlen;
+
+ thflags = *thf;
+ tlen = *tlenp;
+ todrop = tp->rcv_nxt - th->th_seq;
+ if (todrop > 0) {
+ if (thflags & TH_SYN) {
+ thflags &= ~TH_SYN;
+ th->th_seq++;
+ if (th->th_urp > 1)
+ th->th_urp--;
+ else
+ thflags &= ~TH_URG;
+ todrop--;
+ }
+ /*
+ * Following if statement from Stevens, vol. 2, p. 960.
+ */
+ if (todrop > tlen
+ || (todrop == tlen && (thflags & TH_FIN) == 0)) {
+ /*
+ * Any valid FIN must be to the left of the window.
+ * At this point the FIN must be a duplicate or out
+ * of sequence; drop it.
+ */
+ thflags &= ~TH_FIN;
+ /*
+ * Send an ACK to resynchronize and drop any data.
+ * But keep on processing for RST or ACK.
+ */
+ tp->t_flags |= TF_ACKNOW;
+ todrop = tlen;
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, todrop);
+ } else {
+ TCPSTAT_INC(tcps_rcvpartduppack);
+ TCPSTAT_ADD(tcps_rcvpartdupbyte, todrop);
+ }
+ /*
+ * DSACK - add SACK block for dropped range
+ */
+ if (tp->t_flags & TF_SACK_PERMIT) {
+ tcp_update_sack_list(tp, th->th_seq, th->th_seq + tlen);
+ /*
+ * ACK now, as the next in-sequence segment
+ * will clear the DSACK block again
+ */
+ tp->t_flags |= TF_ACKNOW;
+ }
+ *drop_hdrlen += todrop; /* drop from the top afterwards */
+ th->th_seq += todrop;
+ tlen -= todrop;
+ if (th->th_urp > todrop)
+ th->th_urp -= todrop;
+ else {
+ thflags &= ~TH_URG;
+ th->th_urp = 0;
+ }
+ }
+ /*
+ * If segment ends after window, drop trailing data (and PUSH and
+ * FIN); if nothing left, just ACK.
+ */
+ todrop = (th->th_seq + tlen) - (tp->rcv_nxt + tp->rcv_wnd);
+ if (todrop > 0) {
+ TCPSTAT_INC(tcps_rcvpackafterwin);
+ if (todrop >= tlen) {
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, tlen);
+ /*
+ * If window is closed can only take segments at
+ * window edge, and have to drop data and PUSH from
+ * incoming segments. Continue processing, but
+ * remember to ack. Otherwise, drop segment and
+ * ack.
+ */
+ if (tp->rcv_wnd == 0 && th->th_seq == tp->rcv_nxt) {
+ tp->t_flags |= TF_ACKNOW;
+ TCPSTAT_INC(tcps_rcvwinprobe);
+ } else {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ return (1);
+ }
+ } else
+ TCPSTAT_ADD(tcps_rcvbyteafterwin, todrop);
+ m_adj(m, -todrop);
+ tlen -= todrop;
+ thflags &= ~(TH_PUSH | TH_FIN);
+ }
+ *thf = thflags;
+ *tlenp = tlen;
+ return (0);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_do_dropafterack(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th, int32_t thflags, int32_t tlen, int32_t * ret_val)
+{
+ /*
+ * Generate an ACK dropping incoming segment if it occupies sequence
+ * space, where the ACK reflects our state.
+ *
+ * We can now skip the test for the RST flag since all paths to this
+ * code happen after packets containing RST have been dropped.
+ *
+ * In the SYN-RECEIVED state, don't send an ACK unless the segment
+ * we received passes the SYN-RECEIVED ACK test. If it fails send a
+ * RST. This breaks the loop in the "LAND" DoS attack, and also
+ * prevents an ACK storm between two listening ports that have been
+ * sent forged SYN segments, each with the source address of the
+ * other.
+ */
+ if (tp->t_state == TCPS_SYN_RECEIVED && (thflags & TH_ACK) &&
+ (SEQ_GT(tp->snd_una, th->th_ack) ||
+ SEQ_GT(th->th_ack, tp->snd_max))) {
+ *ret_val = 1;
+ ctf_do_dropwithreset(m, tp, th, BANDLIM_RST_OPENPORT, tlen);
+ return;
+ } else
+ *ret_val = 0;
+ tp->t_flags |= TF_ACKNOW;
+ if (m)
+ m_freem(m);
+}
+
+void
+ctf_do_drop(struct mbuf *m, struct tcpcb *tp)
+{
+
+ /*
+ * Drop space held by incoming segment and return.
+ */
+ if (tp != NULL)
+ INP_WUNLOCK(tp->t_inpcb);
+ if (m)
+ m_freem(m);
+}
+
+int
+ctf_process_rst(struct mbuf *m, struct tcphdr *th, struct socket *so, struct tcpcb *tp)
+{
+ /*
+ * RFC5961 Section 3.2
+ *
+ * - RST drops connection only if SEG.SEQ == RCV.NXT. - If RST is in
+ * window, we send challenge ACK.
+ *
+ * Note: to take into account delayed ACKs, we should test against
+ * last_ack_sent instead of rcv_nxt. Note 2: we handle special case
+ * of closed window, not covered by the RFC.
+ */
+ int dropped = 0;
+
+ if ((SEQ_GEQ(th->th_seq, (tp->last_ack_sent - 1)) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) ||
+ (tp->rcv_wnd == 0 && tp->last_ack_sent == th->th_seq)) {
+
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+ KASSERT(tp->t_state != TCPS_SYN_SENT,
+ ("%s: TH_RST for TCPS_SYN_SENT th %p tp %p",
+ __func__, th, tp));
+
+ if (V_tcp_insecure_rst ||
+ (tp->last_ack_sent == th->th_seq) ||
+ (tp->rcv_nxt == th->th_seq) ||
+ ((tp->last_ack_sent - 1) == th->th_seq)) {
+ TCPSTAT_INC(tcps_drops);
+ /* Drop the connection. */
+ switch (tp->t_state) {
+ case TCPS_SYN_RECEIVED:
+ so->so_error = ECONNREFUSED;
+ goto close;
+ case TCPS_ESTABLISHED:
+ case TCPS_FIN_WAIT_1:
+ case TCPS_FIN_WAIT_2:
+ case TCPS_CLOSE_WAIT:
+ case TCPS_CLOSING:
+ case TCPS_LAST_ACK:
+ so->so_error = ECONNRESET;
+ close:
+ tcp_state_change(tp, TCPS_CLOSED);
+ /* FALLTHROUGH */
+ default:
+ tp = tcp_close(tp);
+ }
+ dropped = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ TCPSTAT_INC(tcps_badrst);
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m,
+ tp->rcv_nxt, tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ }
+ } else {
+ m_freem(m);
+ }
+ return (dropped);
+}
+
+/*
+ * The value in ret_val informs the caller
+ * if we dropped the tcb (and lock) or not.
+ * 1 = we dropped it, 0 = the TCB is still locked
+ * and valid.
+ */
+void
+ctf_challenge_ack(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp, int32_t * ret_val)
+{
+ INP_INFO_RLOCK_ASSERT(&V_tcbinfo);
+
+ TCPSTAT_INC(tcps_badsyn);
+ if (V_tcp_insecure_syn &&
+ SEQ_GEQ(th->th_seq, tp->last_ack_sent) &&
+ SEQ_LT(th->th_seq, tp->last_ack_sent + tp->rcv_wnd)) {
+ tp = tcp_drop(tp, ECONNRESET);
+ *ret_val = 1;
+ ctf_do_drop(m, tp);
+ } else {
+ /* Send challenge ACK. */
+ tcp_respond(tp, mtod(m, void *), th, m, tp->rcv_nxt,
+ tp->snd_nxt, TH_ACK);
+ tp->last_ack_sent = tp->rcv_nxt;
+ m = NULL;
+ *ret_val = 0;
+ ctf_do_drop(m, NULL);
+ }
+}
+
+/*
+ * ctf_ts_check returns 1 if the caller should not proceed and the state
+ * machine should return. It places in ret_val what should
+ * be returned (1/0) by the caller. The 1 indicates
+ * that the TCB is unlocked and probably dropped. The 0 indicates the
+ * TCB is still valid and locked.
+ */
+int
+ctf_ts_check(struct mbuf *m, struct tcphdr *th, struct tcpcb *tp,
+ int32_t tlen, int32_t thflags, int32_t * ret_val)
+{
+
+ if (tcp_ts_getticks() - tp->ts_recent_age > TCP_PAWS_IDLE) {
+ /*
+ * Invalidate ts_recent. If this segment updates ts_recent,
+ * the age will be reset later and ts_recent will get a
+ * valid value. If it does not, setting ts_recent to zero
+ * will at least satisfy the requirement that zero be placed
+ * in the timestamp echo reply when ts_recent isn't valid.
+ * The age isn't reset until we get a valid ts_recent
+ * because we don't want out-of-order segments to be dropped
+ * when ts_recent is old.
+ */
+ tp->ts_recent = 0;
+ } else {
+ TCPSTAT_INC(tcps_rcvduppack);
+ TCPSTAT_ADD(tcps_rcvdupbyte, tlen);
+ TCPSTAT_INC(tcps_pawsdrop);
+ *ret_val = 0;
+ if (tlen) {
+ ctf_do_dropafterack(m, tp, th, thflags, tlen, ret_val);
+ } else {
+ ctf_do_drop(m, NULL);
+ }
+ return (1);
+ }
+ return (0);
+}
+
+void
+ctf_calc_rwin(struct socket *so, struct tcpcb *tp)
+{
+ int32_t win;
+
+ /*
+ * Calculate amount of space in receive window, and then do TCP
+ * input processing. Receive window is amount of space in rcv queue,
+ * but not less than advertised window.
+ */
+ win = sbspace(&so->so_rcv);
+ if (win < 0)
+ win = 0;
+ tp->rcv_wnd = imax(win, (int)(tp->rcv_adv - tp->rcv_nxt));
+}
+
+void
+ctf_do_dropwithreset_conn(struct mbuf *m, struct tcpcb *tp, struct tcphdr *th,
+ int32_t rstreason, int32_t tlen)
+{
+
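+	/* Like ctf_do_dropwithreset(), but also marks the connection itself to be dropped. */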
+ if (tp->t_inpcb) {
+ tcp_set_inp_to_drop(tp->t_inpcb, ETIMEDOUT);
+ }
+ tcp_dropwithreset(m, th, tp, tlen, rstreason);
+ INP_WUNLOCK(tp->t_inpcb);
+}
+
+uint32_t
+ctf_fixed_maxseg(struct tcpcb *tp)
+{
+ int optlen;
+
+ if (tp->t_flags & TF_NOOPT)
+ return (tp->t_maxseg);
+
+ /*
+ * Here we have a simplified code from tcp_addoptions(),
+ * without a proper loop, and having most of paddings hardcoded.
+	 * We only consider the fixed options that we would send every
+	 * time, i.e. SACK is not considered.
+ *
+ */
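+/* PAD() rounds an option length up to the next multiple of 4 bytes. */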
+#define PAD(len) ((((len) / 4) + !!((len) % 4)) * 4)
+ if (TCPS_HAVEESTABLISHED(tp->t_state)) {
+ if (tp->t_flags & TF_RCVD_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = 0;
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ } else {
+ if (tp->t_flags & TF_REQ_TSTMP)
+ optlen = TCPOLEN_TSTAMP_APPA;
+ else
+ optlen = PAD(TCPOLEN_MAXSEG);
+ if (tp->t_flags & TF_REQ_SCALE)
+ optlen += PAD(TCPOLEN_WINDOW);
+#if defined(IPSEC_SUPPORT) || defined(TCP_SIGNATURE)
+ if (tp->t_flags & TF_SIGNATURE)
+ optlen += PAD(TCPOLEN_SIGNATURE);
+#endif
+ if (tp->t_flags & TF_SACK_PERMIT)
+ optlen += PAD(TCPOLEN_SACK_PERMITTED);
+ }
+#undef PAD
+ optlen = min(optlen, TCP_MAXOLEN);
+ return (tp->t_maxseg - optlen);
+}
+
+void
+ctf_log_sack_filter(struct tcpcb *tp, int num_sack_blks, struct sackblk *sack_blocks)
+{
+ if (tp->t_logstate != TCP_LOG_STATE_OFF) {
+ union tcp_log_stackspecific log;
+ struct timeval tv;
+
+ memset(&log, 0, sizeof(log));
+ log.u_bbr.timeStamp = tcp_get_usecs(&tv);
+ log.u_bbr.flex8 = num_sack_blks;
+ if (num_sack_blks > 0) {
+ log.u_bbr.flex1 = sack_blocks[0].start;
+ log.u_bbr.flex2 = sack_blocks[0].end;
+ }
+ if (num_sack_blks > 1) {
+ log.u_bbr.flex3 = sack_blocks[1].start;
+ log.u_bbr.flex4 = sack_blocks[1].end;
+ }
+ if (num_sack_blks > 2) {
+ log.u_bbr.flex5 = sack_blocks[2].start;
+ log.u_bbr.flex6 = sack_blocks[2].end;
+ }
+ if (num_sack_blks > 3) {
+ log.u_bbr.applimited = sack_blocks[3].start;
+ log.u_bbr.pkts_out = sack_blocks[3].end;
+ }
+ TCP_LOG_EVENTP(tp, NULL,
+ &tp->t_inpcb->inp_socket->so_rcv,
+ &tp->t_inpcb->inp_socket->so_snd,
+ TCP_SACK_FILTER_RES, 0,
+ 0, &log, false, &tv);
+ }
+}
+
+uint32_t
+ctf_decay_count(uint32_t count, uint32_t decay)
+{
+ /*
+ * Given a count, decay it by a set percentage. The
+ * percentage is in thousands i.e. 100% = 1000,
+ * 19.3% = 193.
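+	 * For example, count = 200 with decay = 250 (25.0%) returns 200 - 50 = 150.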
+ */
+ uint64_t perc_count, decay_per;
+ uint32_t decayed_count;
+ if (decay > 1000) {
+ /* We don't raise it */
+ return (count);
+ }
+ perc_count = count;
+ decay_per = decay;
+ perc_count *= decay_per;
+ perc_count /= 1000;
+ /*
+ * So now perc_count holds the
+ * count decay value.
+ */
+ decayed_count = count - (uint32_t)perc_count;
+ return (decayed_count);
+}
Index: head/sys/netinet/tcp_var.h
===================================================================
--- head/sys/netinet/tcp_var.h
+++ head/sys/netinet/tcp_var.h
@@ -102,7 +102,8 @@
t_state:4, /* state of this connection */
t_idle_reduce : 1,
t_delayed_ack: 7, /* Delayed ack variable */
- bits_spare : 4;
+ t_fin_is_rst: 1, /* Are fin's treated as resets */
+ bits_spare : 3;
u_int t_flags;
tcp_seq snd_una; /* sent but unacknowledged */
tcp_seq snd_max; /* highest sequence number sent;
@@ -271,6 +272,11 @@
void (*tfb_tcp_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t);
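+	/* Optional methods used by the LRO mbuf-queue input path: drain queued packets and process one segment without dropping the lock. */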
+ int (*tfb_do_queued_segments)(struct socket *, struct tcpcb *, int);
+ int (*tfb_do_segment_nounlock)(struct mbuf *, struct tcphdr *,
+ struct socket *, struct tcpcb *,
+ int, int, uint8_t,
+ int, struct timeval *);
void (*tfb_tcp_hpts_do_segment)(struct mbuf *, struct tcphdr *,
struct socket *, struct tcpcb *,
int, int, uint8_t,
Index: head/sys/sys/mbuf.h
===================================================================
--- head/sys/sys/mbuf.h
+++ head/sys/sys/mbuf.h
@@ -407,6 +407,7 @@
#define M_TSTMP_HPREC 0x00000800 /* rcv_tstmp is high-prec, typically
hw-stamped on port (useful for IEEE 1588
and 802.1AS) */
+#define M_TSTMP_LRO 0x00001000 /* Time LRO pushed in pkt is valid in (PH_loc) */
#define M_PROTO1 0x00001000 /* protocol-specific */
#define M_PROTO2 0x00002000 /* protocol-specific */