D31083.id.diff
Index: sys/kern/subr_trap.c
===================================================================
--- sys/kern/subr_trap.c
+++ sys/kern/subr_trap.c
@@ -140,6 +140,16 @@
#ifdef HWPMC_HOOKS
if (PMC_THREAD_HAS_SAMPLES(td))
PMC_CALL_HOOK(td, PMC_FN_THR_USERRET, NULL);
+#endif
+#ifdef TCPHPTS
+ /*
+	 * @gallatin is adamant that this needs to go here, I
+ * am not so sure. Running hpts is a lot like
+ * a lro_flush() that happens while a user process
+ * is running. But he may know best so I will go
+ * with his view of accounting. :-)
+ */
+ tcp_run_hpts();
#endif
/*
* Let the scheduler adjust our priority etc.
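
Note: the hunk above makes every return to userspace an opportunity to drive the pacer, with the hpts callout demoted to a backstop. A minimal sketch of the resulting call path, assuming tcp_run_hpts() simply dispatches to the helpers added later in this diff (the body shown is illustrative, not the literal patch):

	void
	tcp_run_hpts(void)
	{
		struct tcp_hpts_entry *hpts;
		struct epoch_tracker et;

		NET_EPOCH_ENTER(et);	/* tcp_hptsi() asserts the net epoch */
		hpts = tcp_choose_hpts_to_run();	/* this cpu's wheel, or the longest-idle one */
		__tcp_run_hpts(hpts);	/* trylock and drain: tcp_hptsi(hpts, 0) */
		NET_EPOCH_EXIT(et);
	}
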
Index: sys/netinet/in_pcb.h
===================================================================
--- sys/netinet/in_pcb.h
+++ sys/netinet/in_pcb.h
@@ -258,6 +258,7 @@
volatile uint32_t inp_in_input; /* on input hpts (lock b) */
#endif
volatile uint16_t inp_hpts_cpu; /* Lock (i) */
+	volatile uint16_t  inp_irq_cpu;	/* Set by LRO or the driver */
u_int inp_refcount; /* (i) refcount */
int inp_flags; /* (i) generic IP/datagram flags */
int inp_flags2; /* (i) generic IP/datagram flags #2*/
@@ -266,7 +267,8 @@
inp_input_cpu_set : 1, /* on input hpts (i) */
inp_hpts_calls :1, /* (i) from output hpts */
inp_input_calls :1, /* (i) from input hpts */
- inp_spare_bits2 : 4;
+ inp_irq_cpu_set :1, /* (i) from LRO/Driver */
+ inp_spare_bits2 : 3;
uint8_t inp_numa_domain; /* numa domain */
void *inp_ppcb; /* (i) pointer to per-protocol pcb */
struct socket *inp_socket; /* (i) back pointer to socket */
Index: sys/netinet/tcp_hpts.h
===================================================================
--- sys/netinet/tcp_hpts.h
+++ sys/netinet/tcp_hpts.h
@@ -44,7 +44,7 @@
TAILQ_HEAD(hptsh, inpcb);
/* Number of useconds in a hpts tick */
-#define HPTS_TICKS_PER_USEC 10
+#define HPTS_TICKS_PER_SLOT 10
#define HPTS_MS_TO_SLOTS(x) ((x * 100) + 1)
#define HPTS_USEC_TO_SLOTS(x) ((x+9) /10)
#define HPTS_USEC_IN_SEC 1000000
@@ -56,7 +56,7 @@
uint32_t p_nxt_slot; /* bbr->flex1 x */
uint32_t p_cur_slot; /* bbr->flex2 x */
uint32_t p_prev_slot; /* bbr->delivered */
- uint32_t p_runningtick; /* bbr->inflight */
+ uint32_t p_runningslot; /* bbr->inflight */
uint32_t slot_req; /* bbr->flex3 x */
uint32_t inp_hptsslot; /* bbr->flex4 x */
uint32_t slot_remaining; /* bbr->flex5 x */
@@ -64,8 +64,8 @@
uint32_t hpts_sleep_time; /* bbr->applimited x */
uint32_t yet_to_sleep; /* bbr->lt_epoch x */
uint32_t need_new_to; /* bbr->flex6 x */
- uint32_t wheel_tick; /* bbr->bw_inuse x */
- uint32_t maxticks; /* bbr->delRate x */
+ uint32_t wheel_slot; /* bbr->bw_inuse x */
+ uint32_t maxslots; /* bbr->delRate x */
uint32_t wheel_cts; /* bbr->rttProp x */
int32_t co_ret; /* bbr->pkts_out x */
uint32_t p_curtick; /* upper bbr->cur_del_rate */
@@ -83,16 +83,20 @@
#define PACE_PKT_OUTPUT 0x40 /* Output Packets being paced */
#define PACE_TMR_MASK (PACE_TMR_KEEP|PACE_TMR_PERSIT|PACE_TMR_RXT|PACE_TMR_TLP|PACE_TMR_RACK|PACE_TMR_DELACK)
+#define DEFAULT_CONNECTION_THESHOLD 100
+
#ifdef _KERNEL
/* Each hpts has its own p_mtx which is used for locking */
struct tcp_hpts_entry {
/* Cache line 0x00 */
struct mtx p_mtx; /* Mutex for hpts */
+ struct timeval p_mysleep; /* Our min sleep time */
+ uint64_t syscall_cnt;
+ uint64_t sleeping; /* What the actual sleep was (if sleeping) */
uint16_t p_hpts_active; /* Flag that says hpts is awake */
- uint8_t p_hpts_wake_scheduled; /* Have we scheduled a wakeup? */
uint8_t p_wheel_complete; /* have we completed the wheel arc walk? */
uint32_t p_curtick; /* Tick in 10 us the hpts is going to */
- uint32_t p_runningtick; /* Current tick we are at if we are running */
+ uint32_t p_runningslot; /* Current tick we are at if we are running */
uint32_t p_prev_slot; /* Previous slot we were on */
uint32_t p_cur_slot; /* Current slot in wheel hpts is draining */
uint32_t p_nxt_slot; /* The next slot outside the current range of
@@ -101,7 +105,8 @@
uint32_t p_lasttick; /* Last tick before the current one */
uint8_t p_direct_wake :1, /* boolean */
p_on_min_sleep:1, /* boolean */
- p_avail:6;
+ p_hpts_wake_scheduled:1, /* boolean */
+ p_avail:5;
uint8_t p_fill[3]; /* Fill to 32 bits */
/* Cache line 0x40 */
void *p_inp;
@@ -109,8 +114,6 @@
/* Hptsi wheel */
struct hptsh *p_hptss;
int32_t p_on_inqueue_cnt; /* Count on input queue in this hpts */
- uint32_t hit_no_enobuf;
- uint32_t p_dyn_adjust;
uint32_t p_hpts_sleep_time; /* Current sleep interval having a max
* of 255ms */
 	uint32_t overidden_sleep;	/* what was overridden by min-sleep for logging */
@@ -134,6 +137,7 @@
struct tcp_hptsi {
struct proc *rp_proc; /* Process structure for hpts */
struct tcp_hpts_entry **rp_ent; /* Array of hptss */
+ uint32_t *cts_last_ran;
uint32_t rp_num_hptss; /* Number of hpts threads */
};
@@ -155,10 +159,37 @@
* be sent when a TCB is still around must be
* sent from a routine like tcp_respond().
*/
+#define LOWEST_SLEEP_ALLOWED 50
#define DEFAULT_MIN_SLEEP 250 /* How many usec's is default for hpts sleep
* this determines min granularity of the
- * hpts. If 0, granularity is 10useconds at
- * the cost of more CPU (context switching). */
+					 * hpts. If 1, granularity is 10 microseconds at
+					 * the cost of more CPU (context switching).
+					 * Note: do not set this to 0.
+					 */
+#define DYNAMIC_MIN_SLEEP DEFAULT_MIN_SLEEP
+#define DYNAMIC_MAX_SLEEP 100000 /* 100ms */
+/* Number of connections when we start aligning to the cpu from syscalls */
+#define OLDEST_THRESHOLD 1200
+/* Thresholds for raising/lowering sleep */
+#define TICKS_INDICATE_MORE_SLEEP 100 /* This would be 1ms */
+#define TICKS_INDICATE_LESS_SLEEP 1000 /* This would indicate 10ms */
+/**
+ *
+ * Dynamic adjustment of sleeping times is done in "new" mode
+ * where we depend on syscall returns and lro returns to push
+ * hpts forward mainly, and the timer is only a backstop.
+ *
+ * When we are in the "new" mode, i.e. conn_cnt > conn_cnt_thresh,
+ * we make a dynamic adjustment on the time we sleep.
+ * If the number of slots the wheel covered on a run (ticks_ran)
+ * is greater than ticks_indicate_less_sleep (1000 ticks, i.e.
+ * 10ms worth of slots), we woke too late and the actual sleep
+ * time is cut in half, bounded below by dynamic_min_sleep. If
+ * ticks_ran is less than ticks_indicate_more_sleep (100 ticks,
+ * or 1ms worth of slots), there was little to do and the sleep
+ * time is doubled, bounded above by dynamic_max_sleep.
+ *
+ */
+
+
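
Concretely, the adjustment described above reduces to the following sketch of what __tcp_run_hpts() does in tcp_hpts.c later in this diff (hpts_adjust_sleep is an illustrative name, not a function in the patch):

	static void
	hpts_adjust_sleep(struct tcp_hpts_entry *hpts, int ticks_ran)
	{
		if (ticks_ran > ticks_indicate_less_sleep) {
			/* Woken too late: sleep half as long next time. */
			hpts->p_mysleep.tv_usec /= 2;
			if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
				hpts->p_mysleep.tv_usec = dynamic_min_sleep;
		} else if (ticks_ran < ticks_indicate_more_sleep) {
			/* Little to do: back off and let syscalls drive us. */
			hpts->p_mysleep.tv_usec *= 2;
			if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
				hpts->p_mysleep.tv_usec = dynamic_max_sleep;
		}
	}
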
#ifdef _KERNEL
#define HPTS_MTX_ASSERT(hpts) mtx_assert(&(hpts)->p_mtx, MA_OWNED)
struct tcp_hpts_entry *tcp_hpts_lock(struct inpcb *inp);
@@ -215,43 +246,61 @@
void __tcp_set_inp_to_drop(struct inpcb *inp, uint16_t reason, int32_t line);
#define tcp_set_inp_to_drop(a, b) __tcp_set_inp_to_drop(a, b, __LINE__)
+void tcp_run_hpts(void);
+
+uint16_t hpts_random_cpu(struct inpcb *inp);
+
extern int32_t tcp_min_hptsi_time;
-static __inline uint32_t
-tcp_tv_to_hptstick(struct timeval *sv)
-{
- return ((sv->tv_sec * 100000) + (sv->tv_usec / 10));
-}
+#endif /* _KERNEL */
+/*
+ * The following functions should also be available
+ * to userspace as well.
+ */
static __inline uint32_t
-tcp_gethptstick(struct timeval *sv)
+tcp_tv_to_hptstick(const struct timeval *sv)
{
- struct timeval tv;
-
- if (sv == NULL)
- sv = &tv;
- microuptime(sv);
- return (tcp_tv_to_hptstick(sv));
+ return ((sv->tv_sec * 100000) + (sv->tv_usec / HPTS_TICKS_PER_SLOT));
}
static __inline uint32_t
-tcp_tv_to_usectick(struct timeval *sv)
+tcp_tv_to_usectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
}
static __inline uint32_t
-tcp_tv_to_mssectick(struct timeval *sv)
+tcp_tv_to_mssectick(const struct timeval *sv)
{
return ((uint32_t) ((sv->tv_sec * HPTS_MSEC_IN_SEC) + (sv->tv_usec/HPTS_USEC_IN_MSEC)));
}
+static __inline uint64_t
+tcp_tv_to_lusectick(const struct timeval *sv)
+{
+ return ((uint64_t)((sv->tv_sec * HPTS_USEC_IN_SEC) + sv->tv_usec));
+}
+
+#ifdef _KERNEL
+
static __inline void
tcp_hpts_unlock(struct tcp_hpts_entry *hpts)
{
mtx_unlock(&hpts->p_mtx);
}
+static __inline uint32_t
+tcp_gethptstick(struct timeval *sv)
+{
+ struct timeval tv;
+
+ if (sv == NULL)
+ sv = &tv;
+ microuptime(sv);
+ return (tcp_tv_to_hptstick(sv));
+}
+
static __inline uint32_t
tcp_get_usecs(struct timeval *tv)
{
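
Because the hunk above deliberately exposes the conversion helpers outside #ifdef _KERNEL, they can be exercised from a userland unit test. A small sketch, assuming the header can be included from userspace (the include path is illustrative):

	#include <assert.h>
	#include <sys/time.h>
	#include <netinet/tcp_hpts.h>	/* path assumed for illustration */

	int
	main(void)
	{
		/* One slot is 10 usec, so one second is 100000 wheel ticks. */
		struct timeval tv = { .tv_sec = 1, .tv_usec = 250 };

		assert(tcp_tv_to_hptstick(&tv) == 100025);	/* 100000 + 250/10 */
		assert(tcp_tv_to_usectick(&tv) == 1000250);
		assert(tcp_tv_to_lusectick(&tv) == 1000250);
		return (0);
	}
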
Index: sys/netinet/tcp_hpts.c
===================================================================
--- sys/netinet/tcp_hpts.c
+++ sys/netinet/tcp_hpts.c
@@ -193,23 +193,29 @@
#else
static int tcp_bind_threads = 2;
#endif
-TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
-
+static int tcp_use_irq_cpu = 0;
static struct tcp_hptsi tcp_pace;
+static uint32_t *cts_last_ran;
static int hpts_does_tp_logging = 0;
+static int hpts_use_assigned_cpu = 1;
+static int32_t hpts_uses_oldest = OLDEST_THRESHOLD;
-static void tcp_wakehpts(struct tcp_hpts_entry *p);
-static void tcp_wakeinput(struct tcp_hpts_entry *p);
static void tcp_input_data(struct tcp_hpts_entry *hpts, struct timeval *tv);
-static void tcp_hptsi(struct tcp_hpts_entry *hpts);
+static int32_t tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout);
static void tcp_hpts_thread(void *ctx);
static void tcp_init_hptsi(void *st);
int32_t tcp_min_hptsi_time = DEFAULT_MIN_SLEEP;
-static int32_t tcp_hpts_callout_skip_swi = 0;
+static int conn_cnt_thresh = DEFAULT_CONNECTION_THESHOLD;
+static int32_t dynamic_min_sleep = DYNAMIC_MIN_SLEEP;
+static int32_t dynamic_max_sleep = DYNAMIC_MAX_SLEEP;
+
+
SYSCTL_NODE(_net_inet_tcp, OID_AUTO, hpts, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
"TCP Hpts controls");
+SYSCTL_NODE(_net_inet_tcp_hpts, OID_AUTO, stats, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
+ "TCP Hpts statistics");
#define timersub(tvp, uvp, vvp) \
do { \
@@ -230,44 +236,92 @@
struct hpts_domain_info hpts_domains[MAXMEMDOM];
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
- &tcp_hpts_precision, 120,
- "Value for PRE() precision of callout");
-
counter_u64_t hpts_hopelessly_behind;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, hopeless, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, hopeless, CTLFLAG_RD,
&hpts_hopelessly_behind,
"Number of times hpts could not catch up and was behind hopelessly");
counter_u64_t hpts_loops;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, loops, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, loops, CTLFLAG_RD,
&hpts_loops, "Number of times hpts had to loop to catch up");
counter_u64_t back_tosleep;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, no_tcbsfound, CTLFLAG_RD,
&back_tosleep, "Number of times hpts found no tcbs");
counter_u64_t combined_wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, comb_wheel_wrap, CTLFLAG_RD,
&combined_wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
counter_u64_t wheel_wrap;
-SYSCTL_COUNTER_U64(_net_inet_tcp_hpts, OID_AUTO, wheel_wrap, CTLFLAG_RD,
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, wheel_wrap, CTLFLAG_RD,
&wheel_wrap, "Number of times the wheel lagged enough to have an insert see wrap");
-static int32_t out_ts_percision = 0;
+counter_u64_t hpts_direct_call;
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_call, CTLFLAG_RD,
+ &hpts_direct_call, "Number of times hpts was called by syscall/trap or other entry");
+
+counter_u64_t hpts_wake_timeout;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, timeout_wakeup, CTLFLAG_RD,
+ &hpts_wake_timeout, "Number of times hpts threads woke up via the callout expiring");
+
+counter_u64_t hpts_direct_awakening;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, direct_awakening, CTLFLAG_RD,
+ &hpts_direct_awakening, "Number of times hpts threads woke up via the callout expiring");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, out_tspercision, CTLFLAG_RW,
- &out_ts_percision, 0,
- "Do we use a percise timestamp for every output cts");
+counter_u64_t hpts_back_tosleep;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, back_tosleep, CTLFLAG_RD,
+ &hpts_back_tosleep, "Number of times hpts threads woke up via the callout expiring and went back to sleep no work");
+
+counter_u64_t cpu_uses_flowid;
+counter_u64_t cpu_uses_random;
+
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_flowid, CTLFLAG_RD,
+ &cpu_uses_flowid, "Number of times when setting cpuid we used the flowid field");
+SYSCTL_COUNTER_U64(_net_inet_tcp_hpts_stats, OID_AUTO, cpusel_random, CTLFLAG_RD,
+    &cpu_uses_random, "Number of times when setting cpuid we used a random value");
+
+TUNABLE_INT("net.inet.tcp.bind_hptss", &tcp_bind_threads);
+TUNABLE_INT("net.inet.tcp.use_irq", &tcp_use_irq_cpu);
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, bind_hptss, CTLFLAG_RD,
+ &tcp_bind_threads, 2,
+ "Thread Binding tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_irq, CTLFLAG_RD,
+ &tcp_use_irq_cpu, 0,
+ "Use of irq CPU tunable");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, precision, CTLFLAG_RW,
+ &tcp_hpts_precision, 120,
+ "Value for PRE() precision of callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, cnt_thresh, CTLFLAG_RW,
+ &conn_cnt_thresh, 0,
+ "How many connections (below) make us use the callout based mechanism");
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, logging, CTLFLAG_RW,
&hpts_does_tp_logging, 0,
"Do we add to any tp that has logging on pacer logs");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_assigned_cpu, CTLFLAG_RW,
+ &hpts_use_assigned_cpu, 0,
+ "Do we start any hpts timer on the assigned cpu?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, use_oldest, CTLFLAG_RW,
+ &hpts_uses_oldest, OLDEST_THRESHOLD,
+    "Do syscalls look for the hpts that has gone the longest without running (or just use the cpu number if 0)?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_minsleep, CTLFLAG_RW,
+ &dynamic_min_sleep, 250,
+ "What is the dynamic minsleep value?");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, dyn_maxsleep, CTLFLAG_RW,
+ &dynamic_max_sleep, 5000,
+ "What is the dynamic maxsleep value?");
+
+
+
+
static int32_t max_pacer_loops = 10;
SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, loopmax, CTLFLAG_RW,
@@ -287,7 +341,7 @@
new = hpts_sleep_max;
error = sysctl_handle_int(oidp, &new, 0, req);
if (error == 0 && req->newptr) {
- if ((new < (NUM_OF_HPTSI_SLOTS / 4)) ||
+ if ((new < dynamic_min_sleep) ||
(new > HPTS_MAX_SLEEP_ALLOWED))
error = EINVAL;
else
@@ -296,26 +350,60 @@
return (error);
}
+static int
+sysctl_net_inet_tcp_hpts_min_sleep(SYSCTL_HANDLER_ARGS)
+{
+ int error;
+ uint32_t new;
+
+ new = tcp_min_hptsi_time;
+ error = sysctl_handle_int(oidp, &new, 0, req);
+ if (error == 0 && req->newptr) {
+ if (new < LOWEST_SLEEP_ALLOWED)
+ error = EINVAL;
+ else
+ tcp_min_hptsi_time = new;
+ }
+ return (error);
+}
+
SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, maxsleep,
CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&hpts_sleep_max, 0,
&sysctl_net_inet_tcp_hpts_max_sleep, "IU",
"Maximum time hpts will sleep");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, minsleep, CTLFLAG_RW,
+SYSCTL_PROC(_net_inet_tcp_hpts, OID_AUTO, minsleep,
+ CTLTYPE_UINT | CTLFLAG_RW | CTLFLAG_NEEDGIANT,
&tcp_min_hptsi_time, 0,
+ &sysctl_net_inet_tcp_hpts_min_sleep, "IU",
"The minimum time the hpts must sleep before processing more slots");
-SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, skip_swi, CTLFLAG_RW,
- &tcp_hpts_callout_skip_swi, 0,
- "Do we have the callout call directly to the hpts?");
+static int ticks_indicate_more_sleep = TICKS_INDICATE_MORE_SLEEP;
+static int ticks_indicate_less_sleep = TICKS_INDICATE_LESS_SLEEP;
+static int tcp_hpts_no_wake_over_thresh = 1;
+
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, more_sleep, CTLFLAG_RW,
+ &ticks_indicate_more_sleep, 0,
+ "If we only process this many or less on a timeout, we need longer sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, less_sleep, CTLFLAG_RW,
+ &ticks_indicate_less_sleep, 0,
+ "If we process this many or more on a timeout, we need less sleep on the next callout");
+SYSCTL_INT(_net_inet_tcp_hpts, OID_AUTO, nowake_over_thresh, CTLFLAG_RW,
+ &tcp_hpts_no_wake_over_thresh, 0,
+ "When we are over the threshold on the pacer do we prohibit wakeups?");
static void
tcp_hpts_log(struct tcp_hpts_entry *hpts, struct tcpcb *tp, struct timeval *tv,
- int ticks_to_run, int idx)
+ int slots_to_run, int idx, int from_callout)
{
union tcp_log_stackspecific log;
-
+ /*
+ * Unused logs are
+ * 64 bit - delRate, rttProp, bw_inuse
+ * 16 bit - cwnd_gain
+ * 8 bit - bbr_state, bbr_substate, inhpts, ininput;
+ */
memset(&log.u_bbr, 0, sizeof(log.u_bbr));
log.u_bbr.flex1 = hpts->p_nxt_slot;
log.u_bbr.flex2 = hpts->p_cur_slot;
@@ -323,8 +411,9 @@
log.u_bbr.flex4 = idx;
log.u_bbr.flex5 = hpts->p_curtick;
log.u_bbr.flex6 = hpts->p_on_queue_cnt;
- log.u_bbr.use_lt_bw = 1;
- log.u_bbr.inflight = ticks_to_run;
+ log.u_bbr.flex7 = hpts->p_cpu;
+ log.u_bbr.flex8 = (uint8_t)from_callout;
+ log.u_bbr.inflight = slots_to_run;
log.u_bbr.applimited = hpts->overidden_sleep;
log.u_bbr.delivered = hpts->saved_curtick;
log.u_bbr.timeStamp = tcp_tv_to_usectick(tv);
@@ -332,7 +421,9 @@
log.u_bbr.lt_epoch = hpts->saved_prev_slot;
log.u_bbr.pkts_out = hpts->p_delayed_by;
log.u_bbr.lost = hpts->p_hpts_sleep_time;
- log.u_bbr.cur_del_rate = hpts->p_runningtick;
+ log.u_bbr.pacing_gain = hpts->p_cpu;
+ log.u_bbr.pkt_epoch = hpts->p_runningslot;
+ log.u_bbr.use_lt_bw = 1;
TCP_LOG_EVENTP(tp, NULL,
&tp->t_inpcb->inp_socket->so_rcv,
&tp->t_inpcb->inp_socket->so_snd,
@@ -341,47 +432,40 @@
}
static void
-hpts_timeout_swi(void *arg)
+tcp_wakehpts(struct tcp_hpts_entry *hpts)
{
- struct tcp_hpts_entry *hpts;
+ HPTS_MTX_ASSERT(hpts);
- hpts = (struct tcp_hpts_entry *)arg;
- swi_sched(hpts->ie_cookie, 0);
+ if (tcp_hpts_no_wake_over_thresh && (hpts->p_on_queue_cnt >= conn_cnt_thresh)) {
+ hpts->p_direct_wake = 0;
+ return;
+ }
+ if (hpts->p_hpts_wake_scheduled == 0) {
+ hpts->p_hpts_wake_scheduled = 1;
+ swi_sched(hpts->ie_cookie, 0);
+ }
}
static void
-hpts_timeout_dir(void *arg)
+hpts_timeout_swi(void *arg)
{
- tcp_hpts_thread(arg);
+ struct tcp_hpts_entry *hpts;
+
+ hpts = (struct tcp_hpts_entry *)arg;
+ swi_sched(hpts->ie_cookie, 0);
}
static inline void
hpts_sane_pace_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_hpts == 0) {
- /* We are not on the hpts? */
- panic("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu, ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_hpts != 0, ("%s: hpts:%p inp:%p not on the hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(head, inp, inp_hpts);
hpts->p_on_queue_cnt--;
- if (hpts->p_on_queue_cnt < 0) {
- /* Count should not go negative .. */
-#ifdef INVARIANTS
- panic("Hpts goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_queue_cnt = 0;
- }
+ KASSERT(hpts->p_on_queue_cnt >= 0,
+ ("Hpts goes negative inp:%p hpts:%p",
+ inp, hpts));
if (clear) {
inp->inp_hpts_request = 0;
inp->inp_in_hpts = 0;
@@ -391,20 +475,13 @@
static inline void
hpts_sane_pace_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, struct hptsh *head, int line, int noref)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_hpts_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if ((noref == 0) && (inp->inp_in_hpts == 1)) {
- /* We are already on the hpts? */
- panic("%s: hpts:%p inp:%p already on the hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(((noref == 1) && (inp->inp_in_hpts == 1)) ||
+ ((noref == 0) && (inp->inp_in_hpts == 0)),
+ ("%s: hpts:%p inp:%p already on the hpts?",
+ __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(head, inp, inp_hpts);
inp->inp_in_hpts = 1;
hpts->p_on_queue_cnt++;
@@ -416,37 +493,20 @@
static inline void
hpts_sane_input_remove(struct tcp_hpts_entry *hpts, struct inpcb *inp, int clear)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 0) {
- /* We are not on the input hpts? */
- panic("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input != 0,
+ ("%s: hpts:%p inp:%p not on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_REMOVE(&hpts->p_input, inp, inp_input);
hpts->p_on_inqueue_cnt--;
- if (hpts->p_on_inqueue_cnt < 0) {
-#ifdef INVARIANTS
- panic("Hpts in goes negative inp:%p hpts:%p",
- inp, hpts);
-#endif
- hpts->p_on_inqueue_cnt = 0;
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- /* We should not be empty with a queue count */
- panic("%s hpts:%p in_hpts input empty but cnt:%d",
- __FUNCTION__, hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT(hpts->p_on_inqueue_cnt >= 0,
+ ("Hpts in goes negative inp:%p hpts:%p",
+ inp, hpts));
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p input cnt (p_on_inqueue):%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
if (clear)
inp->inp_in_input = 0;
}
@@ -454,46 +514,17 @@
static inline void
hpts_sane_input_insert(struct tcp_hpts_entry *hpts, struct inpcb *inp, int line)
{
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx) == 0) {
- /* We don't own the mutex? */
- panic("%s: hpts:%p inp:%p no hpts mutex", __FUNCTION__, hpts, inp);
- }
- if (hpts->p_cpu != inp->inp_input_cpu) {
- /* It is not the right cpu/mutex? */
- panic("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp);
- }
- if (inp->inp_in_input == 1) {
- /* We are already on the input hpts? */
- panic("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp);
- }
-#endif
+ HPTS_MTX_ASSERT(hpts);
+ KASSERT(hpts->p_cpu == inp->inp_hpts_cpu,
+ ("%s: hpts:%p inp:%p incorrect CPU", __FUNCTION__, hpts, inp));
+ KASSERT(inp->inp_in_input == 0,
+ ("%s: hpts:%p inp:%p already on the input hpts?", __FUNCTION__, hpts, inp));
TAILQ_INSERT_TAIL(&hpts->p_input, inp, inp_input);
inp->inp_in_input = 1;
hpts->p_on_inqueue_cnt++;
in_pcbref(inp);
}
-static void
-tcp_wakehpts(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
-static void
-tcp_wakeinput(struct tcp_hpts_entry *hpts)
-{
- HPTS_MTX_ASSERT(hpts);
- if (hpts->p_hpts_wake_scheduled == 0) {
- hpts->p_hpts_wake_scheduled = 1;
- swi_sched(hpts->ie_cookie, 0);
- }
-}
-
struct tcp_hpts_entry *
tcp_cur_hpts(struct inpcb *inp)
{
@@ -514,12 +545,9 @@
again:
hpts_num = inp->inp_hpts_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_hpts_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -537,12 +565,9 @@
again:
hpts_num = inp->inp_input_cpu;
hpts = tcp_pace.rp_ent[hpts_num];
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
if (hpts_num != inp->inp_input_cpu) {
mtx_unlock(&hpts->p_mtx);
@@ -555,6 +580,7 @@
tcp_remove_hpts_ref(struct inpcb *inp, struct tcp_hpts_entry *hpts, int line)
{
int32_t add_freed;
+ int32_t ret;
if (inp->inp_flags2 & INP_FREED) {
/*
@@ -567,26 +593,11 @@
add_freed = 0;
}
#ifndef INP_REF_DEBUG
- if (in_pcbrele_wlocked(inp)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = in_pcbrele_wlocked(inp);
#else
- if (__in_pcbrele_wlocked(inp, line)) {
- /*
- * This should not happen. We have the inpcb referred to by
- * the main socket (why we are called) and the hpts. It
- * should always return 0.
- */
- panic("inpcb:%p release ret 1",
- inp);
- }
+ ret = __in_pcbrele_wlocked(inp, line);
#endif
+ KASSERT(ret != 1, ("inpcb:%p release ret 1", inp));
if (add_freed) {
inp->inp_flags2 |= INP_FREED;
}
@@ -642,73 +653,76 @@
}
static inline int
-hpts_tick(uint32_t wheel_tick, uint32_t plus)
+hpts_slot(uint32_t wheel_slot, uint32_t plus)
{
/*
* Given a slot on the wheel, what slot
* is that plus ticks out?
*/
- KASSERT(wheel_tick < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_tick));
- return ((wheel_tick + plus) % NUM_OF_HPTSI_SLOTS);
+ KASSERT(wheel_slot < NUM_OF_HPTSI_SLOTS, ("Invalid tick %u not on wheel", wheel_slot));
+ return ((wheel_slot + plus) % NUM_OF_HPTSI_SLOTS);
}
static inline int
tick_to_wheel(uint32_t cts_in_wticks)
{
/*
- * Given a timestamp in wheel ticks (10usec inc's)
- * map it to our limited space wheel.
+	 * Given a timestamp in ticks (each tick is 10 usec, so
+	 * multiply by HPTS_TICKS_PER_SLOT to convert back to real
+	 * time), map it onto our limited-space wheel.
*/
return (cts_in_wticks % NUM_OF_HPTSI_SLOTS);
}
static inline int
-hpts_ticks_diff(int prev_tick, int tick_now)
+hpts_slots_diff(int prev_slot, int slot_now)
{
/*
- * Given two ticks that are someplace
+ * Given two slots that are someplace
* on our wheel. How far are they apart?
*/
- if (tick_now > prev_tick)
- return (tick_now - prev_tick);
- else if (tick_now == prev_tick)
+ if (slot_now > prev_slot)
+ return (slot_now - prev_slot);
+ else if (slot_now == prev_slot)
/*
* Special case, same means we can go all of our
* wheel less one slot.
*/
return (NUM_OF_HPTSI_SLOTS - 1);
else
- return ((NUM_OF_HPTSI_SLOTS - prev_tick) + tick_now);
+ return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}
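
A worked example of the wrap-around arithmetic above (slot numbers chosen for illustration; the wheel size is NUM_OF_HPTSI_SLOTS):

	/*
	 * hpts_slots_diff(10, 14) == 4			simple forward distance
	 * hpts_slots_diff(14, 14) == NUM_OF_HPTSI_SLOTS - 1
	 *						same slot: nearly a full wheel
	 * hpts_slots_diff(14, 10) == (NUM_OF_HPTSI_SLOTS - 14) + 10
	 *						slot_now wrapped past slot 0
	 */
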
/*
- * Given a tick on the wheel that is the current time
- * mapped to the wheel (wheel_tick), what is the maximum
+ * Given a slot on the wheel that is the current time
+ * mapped to the wheel (wheel_slot), what is the maximum
* distance forward that can be obtained without
- * wrapping past either prev_tick or running_tick
+ * wrapping past either prev_slot or running_slot
 * depending on the hpts state? Also if passed
- * a uint32_t *, fill it with the tick location.
+ * a uint32_t *, fill it with the slot location.
*
* Note if you do not give this function the current
- * time (that you think it is) mapped to the wheel
+ * time (that you think it is) mapped to the wheel slot
* then the results will not be what you expect and
* could lead to invalid inserts.
*/
static inline int32_t
-max_ticks_available(struct tcp_hpts_entry *hpts, uint32_t wheel_tick, uint32_t *target_tick)
+max_slots_available(struct tcp_hpts_entry *hpts, uint32_t wheel_slot, uint32_t *target_slot)
{
- uint32_t dis_to_travel, end_tick, pacer_to_now, avail_on_wheel;
+ uint32_t dis_to_travel, end_slot, pacer_to_now, avail_on_wheel;
if ((hpts->p_hpts_active == 1) &&
(hpts->p_wheel_complete == 0)) {
- end_tick = hpts->p_runningtick;
+ end_slot = hpts->p_runningslot;
/* Back up one tick */
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
} else {
/*
* For the case where we are
@@ -718,26 +732,26 @@
* prev tick and subtract one from it. This puts us
* as far out as possible on the wheel.
*/
- end_tick = hpts->p_prev_slot;
- if (end_tick == 0)
- end_tick = NUM_OF_HPTSI_SLOTS - 1;
+ end_slot = hpts->p_prev_slot;
+ if (end_slot == 0)
+ end_slot = NUM_OF_HPTSI_SLOTS - 1;
else
- end_tick--;
- if (target_tick)
- *target_tick = end_tick;
+ end_slot--;
+ if (target_slot)
+ *target_slot = end_slot;
/*
* Now we have close to the full wheel left minus the
* time it has been since the pacer went to sleep. Note
* that wheel_tick, passed in, should be the current time
* from the perspective of the caller, mapped to the wheel.
*/
- if (hpts->p_prev_slot != wheel_tick)
- dis_to_travel = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
+ if (hpts->p_prev_slot != wheel_slot)
+ dis_to_travel = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
else
dis_to_travel = 1;
/*
* dis_to_travel in this case is the space from when the
- * pacer stopped (p_prev_slot) and where our wheel_tick
+ * pacer stopped (p_prev_slot) and where our wheel_slot
* is now. To know how many slots we can put it in we
* subtract from the wheel size. We would not want
* to place something after p_prev_slot or it will
@@ -746,21 +760,21 @@
return (NUM_OF_HPTSI_SLOTS - dis_to_travel);
}
/*
- * So how many slots are open between p_runningtick -> p_cur_slot
+ * So how many slots are open between p_runningslot -> p_cur_slot
* that is what is currently un-available for insertion. Special
* case when we are at the last slot, this gets 1, so that
* the answer to how many slots are available is all but 1.
*/
- if (hpts->p_runningtick == hpts->p_cur_slot)
+ if (hpts->p_runningslot == hpts->p_cur_slot)
dis_to_travel = 1;
else
- dis_to_travel = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ dis_to_travel = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
/*
* How long has the pacer been running?
*/
- if (hpts->p_cur_slot != wheel_tick) {
+ if (hpts->p_cur_slot != wheel_slot) {
/* The pacer is a bit late */
- pacer_to_now = hpts_ticks_diff(hpts->p_cur_slot, wheel_tick);
+ pacer_to_now = hpts_slots_diff(hpts->p_cur_slot, wheel_slot);
} else {
/* The pacer is right on time, now == pacers start time */
pacer_to_now = 0;
@@ -774,24 +788,24 @@
/*
* Now how many of those we will eat due to the pacer's
* time (p_cur_slot) of start being behind the
- * real time (wheel_tick)?
+ * real time (wheel_slot)?
*/
if (avail_on_wheel <= pacer_to_now) {
/*
* Wheel wrap, we can't fit on the wheel, that
* is unusual the system must be way overloaded!
- * Insert into the assured tick, and return special
+ * Insert into the assured slot, and return special
* "0".
*/
counter_u64_add(combined_wheel_wrap, 1);
- *target_tick = hpts->p_nxt_slot;
+ *target_slot = hpts->p_nxt_slot;
return (0);
} else {
/*
* We know how many slots are open
* on the wheel (the reverse of what
* is left to run. Take away the time
- * the pacer started to now (wheel_tick)
+ * the pacer started to now (wheel_slot)
* and that tells you how many slots are
* open that can be inserted into that won't
* be touched by the pacer until later.
@@ -815,7 +829,7 @@
* A sleeping hpts we want in next slot to run
* note that in this state p_prev_slot == p_cur_slot
*/
- inp->inp_hptsslot = hpts_tick(hpts->p_prev_slot, 1);
+ inp->inp_hptsslot = hpts_slot(hpts->p_prev_slot, 1);
if ((hpts->p_on_min_sleep == 0) && (hpts->p_hpts_active == 0))
need_wake = 1;
} else if ((void *)inp == hpts->p_inp) {
@@ -827,7 +841,7 @@
*/
inp->inp_hptsslot = hpts->p_nxt_slot;
} else
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, noref);
if (need_wake) {
/*
@@ -862,9 +876,9 @@
* Sanity checks for the pacer with invariants
* on insert.
*/
- if (inp_hptsslot >= NUM_OF_HPTSI_SLOTS)
- panic("hpts:%p inp:%p slot:%d > max",
- hpts, inp, inp_hptsslot);
+ KASSERT(inp_hptsslot < NUM_OF_HPTSI_SLOTS,
+ ("hpts:%p inp:%p slot:%d > max",
+ hpts, inp, inp_hptsslot));
if ((hpts->p_hpts_active) &&
(hpts->p_wheel_complete == 0)) {
/*
@@ -875,17 +889,16 @@
*/
int distance, yet_to_run;
- distance = hpts_ticks_diff(hpts->p_runningtick, inp_hptsslot);
- if (hpts->p_runningtick != hpts->p_cur_slot)
- yet_to_run = hpts_ticks_diff(hpts->p_runningtick, hpts->p_cur_slot);
+ distance = hpts_slots_diff(hpts->p_runningslot, inp_hptsslot);
+ if (hpts->p_runningslot != hpts->p_cur_slot)
+ yet_to_run = hpts_slots_diff(hpts->p_runningslot, hpts->p_cur_slot);
else
yet_to_run = 0; /* processing last slot */
- if (yet_to_run > distance) {
- panic("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
- hpts, inp, inp_hptsslot,
- distance, yet_to_run,
- hpts->p_runningtick, hpts->p_cur_slot);
- }
+ KASSERT(yet_to_run <= distance,
+ ("hpts:%p inp:%p slot:%d distance:%d yet_to_run:%d rs:%d cs:%d",
+ hpts, inp, inp_hptsslot,
+ distance, yet_to_run,
+ hpts->p_runningslot, hpts->p_cur_slot));
}
}
#endif
@@ -895,8 +908,9 @@
struct hpts_diag *diag, struct timeval *tv)
{
uint32_t need_new_to = 0;
- uint32_t wheel_cts, last_tick;
- int32_t wheel_tick, maxticks;
+ uint32_t wheel_cts;
+ int32_t wheel_slot, maxslots, last_slot;
+ int cpu;
int8_t need_wakeup = 0;
HPTS_MTX_ASSERT(hpts);
@@ -904,7 +918,7 @@
memset(diag, 0, sizeof(struct hpts_diag));
diag->p_hpts_active = hpts->p_hpts_active;
diag->p_prev_slot = hpts->p_prev_slot;
- diag->p_runningtick = hpts->p_runningtick;
+ diag->p_runningslot = hpts->p_runningslot;
diag->p_nxt_slot = hpts->p_nxt_slot;
diag->p_cur_slot = hpts->p_cur_slot;
diag->p_curtick = hpts->p_curtick;
@@ -913,131 +927,120 @@
diag->p_on_min_sleep = hpts->p_on_min_sleep;
diag->hpts_sleep_time = hpts->p_hpts_sleep_time;
}
- if (inp->inp_in_hpts == 0) {
- if (slot == 0) {
- /* Immediate */
- tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
- return;
- }
- /* Get the current time relative to the wheel */
- wheel_cts = tcp_tv_to_hptstick(tv);
- /* Map it onto the wheel */
- wheel_tick = tick_to_wheel(wheel_cts);
- /* Now what's the max we can place it at? */
- maxticks = max_ticks_available(hpts, wheel_tick, &last_tick);
- if (diag) {
- diag->wheel_tick = wheel_tick;
- diag->maxticks = maxticks;
- diag->wheel_cts = wheel_cts;
+ KASSERT(inp->inp_in_hpts == 0, ("Hpts:%p tp:%p already on hpts and add?", hpts, inp));
+ if (slot == 0) {
+ /* Immediate */
+ tcp_queue_to_hpts_immediate_locked(inp, hpts, line, 0);
+ return;
+ }
+ /* Get the current time relative to the wheel */
+ wheel_cts = tcp_tv_to_hptstick(tv);
+ /* Map it onto the wheel */
+ wheel_slot = tick_to_wheel(wheel_cts);
+ /* Now what's the max we can place it at? */
+ maxslots = max_slots_available(hpts, wheel_slot, &last_slot);
+ if (diag) {
+ diag->wheel_slot = wheel_slot;
+ diag->maxslots = maxslots;
+ diag->wheel_cts = wheel_cts;
+ }
+ if (maxslots == 0) {
+ /* The pacer is in a wheel wrap behind, yikes! */
+ if (slot > 1) {
+ /*
+ * Reduce by 1 to prevent a forever loop in
+ * case something else is wrong. Note this
+		 * probably does not hurt because if this is
+		 * true the pacer is so far behind that we
+		 * will be > 1 second late calling anyway.
+ */
+ slot--;
}
- if (maxticks == 0) {
- /* The pacer is in a wheel wrap behind, yikes! */
- if (slot > 1) {
- /*
- * Reduce by 1 to prevent a forever loop in
- * case something else is wrong. Note this
- * probably does not hurt because the pacer
- * if its true is so far behind we will be
- * > 1second late calling anyway.
- */
- slot--;
- }
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request = slot;
- } else if (maxticks >= slot) {
- /* It all fits on the wheel */
- inp->inp_hpts_request = 0;
- inp->inp_hptsslot = hpts_tick(wheel_tick, slot);
- } else {
- /* It does not fit */
- inp->inp_hpts_request = slot - maxticks;
- inp->inp_hptsslot = last_tick;
+ inp->inp_hptsslot = last_slot;
+ inp->inp_hpts_request = slot;
+ } else if (maxslots >= slot) {
+ /* It all fits on the wheel */
+ inp->inp_hpts_request = 0;
+ inp->inp_hptsslot = hpts_slot(wheel_slot, slot);
+ } else {
+ /* It does not fit */
+ inp->inp_hpts_request = slot - maxslots;
+ inp->inp_hptsslot = last_slot;
+ }
+ if (diag) {
+ diag->slot_remaining = inp->inp_hpts_request;
+ diag->inp_hptsslot = inp->inp_hptsslot;
+ }
+#ifdef INVARIANTS
+ check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
+#endif
+ hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
+ if ((hpts->p_hpts_active == 0) &&
+ (inp->inp_hpts_request == 0) &&
+ (hpts->p_on_min_sleep == 0)) {
+ /*
+ * The hpts is sleeping and NOT on a minimum
+ * sleep time, we need to figure out where
+ * it will wake up at and if we need to reschedule
+ * its time-out.
+ */
+ uint32_t have_slept, yet_to_sleep;
+
+ /* Now do we need to restart the hpts's timer? */
+ have_slept = hpts_slots_diff(hpts->p_prev_slot, wheel_slot);
+ if (have_slept < hpts->p_hpts_sleep_time)
+ yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
+ else {
+ /* We are over-due */
+ yet_to_sleep = 0;
+ need_wakeup = 1;
}
if (diag) {
- diag->slot_remaining = inp->inp_hpts_request;
- diag->inp_hptsslot = inp->inp_hptsslot;
+ diag->have_slept = have_slept;
+ diag->yet_to_sleep = yet_to_sleep;
}
-#ifdef INVARIANTS
- check_if_slot_would_be_wrong(hpts, inp, inp->inp_hptsslot, line);
-#endif
- hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], line, 0);
- if ((hpts->p_hpts_active == 0) &&
- (inp->inp_hpts_request == 0) &&
- (hpts->p_on_min_sleep == 0)) {
+ if (yet_to_sleep &&
+ (yet_to_sleep > slot)) {
/*
- * The hpts is sleeping and not on a minimum
- * sleep time, we need to figure out where
- * it will wake up at and if we need to reschedule
- * its time-out.
+ * We need to reschedule the hpts's time-out.
*/
- uint32_t have_slept, yet_to_sleep;
-
- /* Now do we need to restart the hpts's timer? */
- have_slept = hpts_ticks_diff(hpts->p_prev_slot, wheel_tick);
- if (have_slept < hpts->p_hpts_sleep_time)
- yet_to_sleep = hpts->p_hpts_sleep_time - have_slept;
- else {
- /* We are over-due */
- yet_to_sleep = 0;
- need_wakeup = 1;
- }
- if (diag) {
- diag->have_slept = have_slept;
- diag->yet_to_sleep = yet_to_sleep;
- }
- if (yet_to_sleep &&
- (yet_to_sleep > slot)) {
- /*
- * We need to reschedule the hpts's time-out.
- */
- hpts->p_hpts_sleep_time = slot;
- need_new_to = slot * HPTS_TICKS_PER_USEC;
- }
+ hpts->p_hpts_sleep_time = slot;
+ need_new_to = slot * HPTS_TICKS_PER_SLOT;
}
- /*
- * Now how far is the hpts sleeping to? if active is 1, its
- * up and ticking we do nothing, otherwise we may need to
- * reschedule its callout if need_new_to is set from above.
- */
- if (need_wakeup) {
- hpts->p_direct_wake = 1;
- tcp_wakehpts(hpts);
- if (diag) {
- diag->need_new_to = 0;
- diag->co_ret = 0xffff0000;
- }
- } else if (need_new_to) {
- int32_t co_ret;
- struct timeval tv;
- sbintime_t sb;
+ }
+ /*
+ * Now how far is the hpts sleeping to? if active is 1, its
+ * up and ticking we do nothing, otherwise we may need to
+ * reschedule its callout if need_new_to is set from above.
+ */
+ if (need_wakeup) {
+ hpts->p_direct_wake = 1;
+ tcp_wakehpts(hpts);
+ if (diag) {
+ diag->need_new_to = 0;
+ diag->co_ret = 0xffff0000;
+ }
+ } else if (need_new_to) {
+ int32_t co_ret;
+ struct timeval tv;
+ sbintime_t sb;
- tv.tv_sec = 0;
- tv.tv_usec = 0;
- while (need_new_to > HPTS_USEC_IN_SEC) {
- tv.tv_sec++;
- need_new_to -= HPTS_USEC_IN_SEC;
- }
- tv.tv_usec = need_new_to;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
- if (diag) {
- diag->need_new_to = need_new_to;
- diag->co_ret = co_ret;
- }
+ tv.tv_sec = 0;
+ tv.tv_usec = 0;
+ while (need_new_to > HPTS_USEC_IN_SEC) {
+ tv.tv_sec++;
+ need_new_to -= HPTS_USEC_IN_SEC;
+ }
+ tv.tv_usec = need_new_to;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ co_ret = callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ if (diag) {
+ diag->need_new_to = need_new_to;
+ diag->co_ret = co_ret;
}
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p tp:%p already on hpts and add?", hpts, inp);
-#endif
}
}
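
A worked example of the placement logic in tcp_hpts_insert_diag() above (the numbers are illustrative): suppose a connection asks to be paced 5000 slots (50 ms) out, but only 3000 slots of the wheel are insertable without passing the pacer:

	/*
	 * maxslots = 3000, slot = 5000
	 *   -> inp_hpts_request = 5000 - 3000 = 2000	deferred remainder
	 *   -> inp_hptsslot     = last_slot		parked at the far edge
	 *
	 * When the pacer reaches last_slot, tcp_hptsi() re-inserts the inp
	 * another 2000 slots out (see the remaining_slots handling below).
	 */
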
@@ -1066,6 +1069,7 @@
__tcp_hpts_insert(struct inpcb *inp, uint32_t slot, int32_t line){
return (tcp_hpts_insert_diag(inp, slot, line, NULL));
}
+
int
__tcp_queue_to_input_locked(struct inpcb *inp, struct tcp_hpts_entry *hpts, int32_t line)
{
@@ -1076,18 +1080,20 @@
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
retval = 1;
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
/*
* Activate the hpts if it is sleeping.
*/
retval = 2;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
retval = 4;
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
return (retval);
}
@@ -1115,22 +1121,24 @@
if (inp->inp_in_input == 0) {
/* Ok we need to set it on the hpts in the current slot */
hpts_sane_input_insert(hpts, inp, line);
- if (hpts->p_hpts_active == 0) {
+ if ((hpts->p_hpts_active == 0) &&
+		    (hpts->p_on_min_sleep == 0)) {
/*
* Activate the hpts if it is sleeping.
*/
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
- } else if (hpts->p_hpts_active == 0) {
+ } else if ((hpts->p_hpts_active == 0) &&
+	} else if ((hpts->p_hpts_active == 0) &&
+	    (hpts->p_on_min_sleep == 0)) {
hpts->p_direct_wake = 1;
- tcp_wakeinput(hpts);
+ tcp_wakehpts(hpts);
}
inp->inp_hpts_drop_reas = reason;
mtx_unlock(&hpts->p_mtx);
}
-static uint16_t
+uint16_t
hpts_random_cpu(struct inpcb *inp){
/*
* No flow type set distribute the load randomly.
@@ -1149,18 +1157,19 @@
}
/* Nothing set use a random number */
ran = arc4random();
- cpuid = (ran & 0xffff) % mp_ncpus;
+ cpuid = (((ran & 0xffff) % mp_ncpus) % tcp_pace.rp_num_hptss);
return (cpuid);
}
static uint16_t
-hpts_cpuid(struct inpcb *inp)
+hpts_cpuid(struct inpcb *inp, int *failed)
{
u_int cpuid;
#if !defined(RSS) && defined(NUMA)
struct hpts_domain_info *di;
#endif
+ *failed = 0;
/*
* If one has been set use it i.e. we want both in and out on the
* same hpts.
@@ -1170,6 +1179,17 @@
} else if (inp->inp_hpts_cpu_set) {
return (inp->inp_hpts_cpu);
}
+ /*
+ * If we are using the irq cpu set by LRO or
+ * the driver then it overrides all other domains.
+ */
+ if (tcp_use_irq_cpu) {
+ if (inp->inp_irq_cpu_set == 0) {
+ *failed = 1;
+			return (0);
+		}
+		return (inp->inp_irq_cpu);
+ }
/* If one is set the other must be the same */
#ifdef RSS
cpuid = rss_hash2cpuid(inp->inp_flowid, inp->inp_flowtype);
@@ -1183,9 +1203,10 @@
* unknown cpuids to curcpu. Not the best, but apparently better
* than defaulting to swi 0.
*/
-
- if (inp->inp_flowtype == M_HASHTYPE_NONE)
+ if (inp->inp_flowtype == M_HASHTYPE_NONE) {
+ counter_u64_add(cpu_uses_random, 1);
return (hpts_random_cpu(inp));
+ }
/*
* Hash to a thread based on the flowid. If we are using numa,
* then restrict the hash to the numa domain where the inp lives.
@@ -1197,7 +1218,7 @@
} else
#endif
cpuid = inp->inp_flowid % mp_ncpus;
-
+ counter_u64_add(cpu_uses_flowid, 1);
return (cpuid);
#endif
}
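
To summarize the precedence the rewritten hpts_cpuid() applies (a reading of the code above, highest priority first):

	/*
	 * 1. inp_input_cpu_set / inp_hpts_cpu_set -> reuse the stored cpu.
	 * 2. tcp_use_irq_cpu -> inp_irq_cpu as set by LRO or the driver;
	 *    if it was never set, *failed = 1 and the caller must not
	 *    latch a cpu yet (see __tcp_set_hpts() below).
	 * 3. RSS, if compiled in -> rss_hash2cpuid().
	 * 4. A valid flowid (NUMA-restricted when enabled) -> hash,
	 *    counted in cpusel_flowid.
	 * 5. No flowtype -> hpts_random_cpu(), counted in cpusel_random.
	 */
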
@@ -1323,7 +1344,7 @@
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
}
- if ((inp->inp_flags2 & INP_SUPPORTS_MBUFQ) && tp->t_in_pkt) {
+ if ((tp->t_fb->tfb_do_queued_segments != NULL) && tp->t_in_pkt) {
if (inp->inp_in_input)
tcp_hpts_remove(inp, HPTS_REMOVE_INPUT);
dropped = (*tp->t_fb->tfb_do_queued_segments)(inp->inp_socket, tp, 0);
@@ -1357,23 +1378,51 @@
}
static void
-tcp_hptsi(struct tcp_hpts_entry *hpts)
+tcp_hpts_set_max_sleep(struct tcp_hpts_entry *hpts, int wrap_loop_cnt)
+{
+ uint32_t t = 0, i, fnd = 0;
+
+ if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
+ /*
+ * Find next slot that is occupied and use that to
+ * be the sleep time.
+ */
+ for (i = 0, t = hpts_slot(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
+ if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
+ fnd = 1;
+ break;
+ }
+ t = (t + 1) % NUM_OF_HPTSI_SLOTS;
+ }
+ KASSERT(fnd != 0, ("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt));
+ hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
+ } else {
+		/* No one on the wheel; sleep for all but 400 slots, or sleep max */
+ hpts->p_hpts_sleep_time = hpts_sleep_max;
+ }
+}
+
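
A quick worked example of the sleep-time selection above (numbers illustrative):

	/*
	 * If the nearest occupied slot is 37 positions ahead of p_cur_slot,
	 * the loop exits with i == 36, so p_hpts_sleep_time becomes
	 * min(37, hpts_sleep_max) slots; callers convert that to a timeout
	 * of 37 * HPTS_TICKS_PER_SLOT = 370 usec (see __tcp_run_hpts()).
	 */
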
+static int32_t
+tcp_hptsi(struct tcp_hpts_entry *hpts, int from_callout)
{
struct tcpcb *tp;
struct inpcb *inp = NULL, *ninp;
struct timeval tv;
- int32_t ticks_to_run, i, error;
+ uint64_t total_slots_processed = 0;
+ int32_t slots_to_run, i, error;
int32_t paced_cnt = 0;
int32_t loop_cnt = 0;
int32_t did_prefetch = 0;
int32_t prefetch_ninp = 0;
int32_t prefetch_tp = 0;
int32_t wrap_loop_cnt = 0;
+ int32_t slot_pos_of_endpoint = 0;
+ int32_t orig_exit_slot;
int16_t set_cpu;
+ int8_t completed_measure = 0, seen_endpoint = 0;
HPTS_MTX_ASSERT(hpts);
NET_EPOCH_ASSERT();
-
/* record previous info for any logging */
hpts->saved_lasttick = hpts->p_lasttick;
hpts->saved_curtick = hpts->p_curtick;
@@ -1382,7 +1431,8 @@
hpts->p_lasttick = hpts->p_curtick;
hpts->p_curtick = tcp_gethptstick(&tv);
- hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
+ orig_exit_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
if ((hpts->p_on_queue_cnt == 0) ||
(hpts->p_lasttick == hpts->p_curtick)) {
/*
@@ -1396,8 +1446,9 @@
again:
hpts->p_wheel_complete = 0;
HPTS_MTX_ASSERT(hpts);
- ticks_to_run = hpts_ticks_diff(hpts->p_prev_slot, hpts->p_cur_slot);
- if (((hpts->p_curtick - hpts->p_lasttick) > ticks_to_run) &&
+ slots_to_run = hpts_slots_diff(hpts->p_prev_slot, hpts->p_cur_slot);
+ if (((hpts->p_curtick - hpts->p_lasttick) >
+ ((NUM_OF_HPTSI_SLOTS-1) * HPTS_TICKS_PER_SLOT)) &&
(hpts->p_on_queue_cnt != 0)) {
/*
		 * Wheel wrap is occurring, basically we
@@ -1416,8 +1467,8 @@
* first slot at the head.
*/
wrap_loop_cnt++;
- hpts->p_nxt_slot = hpts_tick(hpts->p_prev_slot, 1);
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 2);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_prev_slot, 1);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 2);
/*
* Adjust p_cur_slot to be where we are starting from
* hopefully we will catch up (fat chance if something
@@ -1438,58 +1489,61 @@
* INP lock and the pacer mutex to change the inp_hptsslot.
*/
TAILQ_FOREACH(inp, &hpts->p_hptss[hpts->p_nxt_slot], inp_hpts) {
- inp->inp_hptsslot = hpts->p_runningtick;
+ inp->inp_hptsslot = hpts->p_runningslot;
}
#endif
- TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningtick],
+ TAILQ_CONCAT(&hpts->p_hptss[hpts->p_runningslot],
&hpts->p_hptss[hpts->p_nxt_slot], inp_hpts);
- ticks_to_run = NUM_OF_HPTSI_SLOTS - 1;
+ slots_to_run = NUM_OF_HPTSI_SLOTS - 1;
counter_u64_add(wheel_wrap, 1);
} else {
/*
- * Nxt slot is always one after p_runningtick though
+ * Nxt slot is always one after p_runningslot though
* its not used usually unless we are doing wheel wrap.
*/
hpts->p_nxt_slot = hpts->p_prev_slot;
- hpts->p_runningtick = hpts_tick(hpts->p_prev_slot, 1);
- }
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
+ hpts->p_runningslot = hpts_slot(hpts->p_prev_slot, 1);
}
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d and queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
HPTS_MTX_ASSERT(hpts);
if (hpts->p_on_queue_cnt == 0) {
goto no_one;
}
HPTS_MTX_ASSERT(hpts);
- for (i = 0; i < ticks_to_run; i++) {
+ for (i = 0; i < slots_to_run; i++) {
/*
* Calculate our delay, if there are no extra ticks there
- * was not any (i.e. if ticks_to_run == 1, no delay).
+ * was not any (i.e. if slots_to_run == 1, no delay).
*/
- hpts->p_delayed_by = (ticks_to_run - (i + 1)) * HPTS_TICKS_PER_USEC;
+ hpts->p_delayed_by = (slots_to_run - (i + 1)) * HPTS_TICKS_PER_SLOT;
HPTS_MTX_ASSERT(hpts);
- while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ while ((inp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
+ HPTS_MTX_ASSERT(hpts);
/* For debugging */
+ if (seen_endpoint == 0) {
+ seen_endpoint = 1;
+ orig_exit_slot = slot_pos_of_endpoint = hpts->p_runningslot;
+ } else if (completed_measure == 0) {
+ /* Record the new position */
+ orig_exit_slot = hpts->p_runningslot;
+ }
+ total_slots_processed++;
hpts->p_inp = inp;
paced_cnt++;
-#ifdef INVARIANTS
- if (hpts->p_runningtick != inp->inp_hptsslot) {
- panic("Hpts:%p inp:%p slot mis-aligned %u vs %u",
- hpts, inp, hpts->p_runningtick, inp->inp_hptsslot);
- }
-#endif
+ KASSERT(hpts->p_runningslot == inp->inp_hptsslot,
+ ("Hpts:%p inp:%p slot mis-aligned %u vs %u",
+ hpts, inp, hpts->p_runningslot, inp->inp_hptsslot));
/* Now pull it */
if (inp->inp_hpts_cpu_set == 0) {
set_cpu = 1;
} else {
set_cpu = 0;
}
- hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningtick], 0);
- if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningtick])) != NULL) {
+ hpts_sane_pace_remove(hpts, inp, &hpts->p_hptss[hpts->p_runningslot], 0);
+ if ((ninp = TAILQ_FIRST(&hpts->p_hptss[hpts->p_runningslot])) != NULL) {
/* We prefetch the next inp if possible */
kern_prefetch(ninp, &prefetch_ninp);
prefetch_ninp = 1;
@@ -1501,22 +1555,22 @@
* Push him back on the wheel or run it
* depending.
*/
- uint32_t maxticks, last_tick, remaining_slots;
+ uint32_t maxslots, last_slot, remaining_slots;
- remaining_slots = ticks_to_run - (i + 1);
+ remaining_slots = slots_to_run - (i + 1);
if (inp->inp_hpts_request > remaining_slots) {
/*
* How far out can we go?
*/
- maxticks = max_ticks_available(hpts, hpts->p_cur_slot, &last_tick);
- if (maxticks >= inp->inp_hpts_request) {
+ maxslots = max_slots_available(hpts, hpts->p_cur_slot, &last_slot);
+ if (maxslots >= inp->inp_hpts_request) {
/* we can place it finally to be processed */
- inp->inp_hptsslot = hpts_tick(hpts->p_runningtick, inp->inp_hpts_request);
+ inp->inp_hptsslot = hpts_slot(hpts->p_runningslot, inp->inp_hpts_request);
inp->inp_hpts_request = 0;
} else {
/* Work off some more time */
- inp->inp_hptsslot = last_tick;
- inp->inp_hpts_request-= maxticks;
+ inp->inp_hptsslot = last_slot;
+					inp->inp_hpts_request -= maxslots;
}
hpts_sane_pace_insert(hpts, inp, &hpts->p_hptss[inp->inp_hptsslot], __LINE__, 1);
hpts->p_inp = NULL;
@@ -1542,12 +1596,9 @@
if ((inp->inp_flags & (INP_TIMEWAIT | INP_DROPPED)) ||
(inp->inp_flags2 & INP_FREED)) {
out_now:
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
INP_WUNLOCK(inp);
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
@@ -1582,7 +1633,7 @@
#endif
/* Lets do any logging that we might want to */
if (hpts_does_tp_logging && (tp->t_logstate != TCP_LOG_STATE_OFF)) {
- tcp_hpts_log(hpts, tp, &tv, ticks_to_run, i);
+ tcp_hpts_log(hpts, tp, &tv, slots_to_run, i, from_callout);
}
/*
* There is a hole here, we get the refcnt on the
@@ -1592,12 +1643,10 @@
* fini gets the lock first we are assured of having
* a sane INP we can lock and test.
*/
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx before tcp-output:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to tcp_output call line:%d",
+ hpts, __LINE__));
+
if (tp->t_fb_ptr != NULL) {
kern_prefetch(tp->t_fb_ptr, &did_prefetch);
did_prefetch = 1;
@@ -1653,20 +1702,27 @@
CURVNET_RESTORE();
#endif
INP_UNLOCK_ASSERT(inp);
-#ifdef INVARIANTS
- if (mtx_owned(&hpts->p_mtx)) {
- panic("Hpts:%p owns mtx prior-to lock line:%d",
- hpts, __LINE__);
- }
-#endif
+ KASSERT(mtx_owned(&hpts->p_mtx) == 0,
+ ("Hpts:%p owns mtx prior-to lock line:%d",
+ hpts, __LINE__));
mtx_lock(&hpts->p_mtx);
hpts->p_inp = NULL;
}
+ if (seen_endpoint) {
+ /*
+		 * We now have an accurate distance between
+ * slot_pos_of_endpoint <-> orig_exit_slot
+ * to tell us how late we were, orig_exit_slot
+ * is where we calculated the end of our cycle to
+ * be when we first entered.
+ */
+ completed_measure = 1;
+ }
HPTS_MTX_ASSERT(hpts);
hpts->p_inp = NULL;
- hpts->p_runningtick++;
- if (hpts->p_runningtick >= NUM_OF_HPTSI_SLOTS) {
- hpts->p_runningtick = 0;
+ hpts->p_runningslot++;
+ if (hpts->p_runningslot >= NUM_OF_HPTSI_SLOTS) {
+ hpts->p_runningslot = 0;
}
}
no_one:
@@ -1676,16 +1732,13 @@
* Check to see if we took an excess amount of time and need to run
* more ticks (if we did not hit eno-bufs).
*/
-#ifdef INVARIANTS
- if (TAILQ_EMPTY(&hpts->p_input) &&
- (hpts->p_on_inqueue_cnt != 0)) {
- panic("tp:%p in_hpts input empty but cnt:%d",
- hpts, hpts->p_on_inqueue_cnt);
- }
-#endif
+ KASSERT((((TAILQ_EMPTY(&hpts->p_input) != 0) && (hpts->p_on_inqueue_cnt == 0)) ||
+ ((TAILQ_EMPTY(&hpts->p_input) == 0) && (hpts->p_on_inqueue_cnt > 0))),
+ ("%s hpts:%p in_hpts cnt:%d queue state mismatch",
+ __FUNCTION__, hpts, hpts->p_on_inqueue_cnt));
hpts->p_prev_slot = hpts->p_cur_slot;
hpts->p_lasttick = hpts->p_curtick;
- if (loop_cnt > max_pacer_loops) {
+ if ((from_callout == 0) || (loop_cnt > max_pacer_loops)) {
/*
		 * Something is seriously slow; we have
* looped through processing the wheel
@@ -1700,11 +1753,16 @@
* correct. When it next awakens
* it will find itself further behind.
*/
- counter_u64_add(hpts_hopelessly_behind, 1);
+ if (from_callout)
+ counter_u64_add(hpts_hopelessly_behind, 1);
goto no_run;
}
hpts->p_curtick = tcp_gethptstick(&tv);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
+ if (seen_endpoint == 0) {
+ /* We saw no endpoint but we may be looping */
+ orig_exit_slot = hpts->p_cur_slot;
+ }
if ((wrap_loop_cnt < 2) &&
(hpts->p_lasttick != hpts->p_curtick)) {
counter_u64_add(hpts_loops, 1);
@@ -1712,6 +1770,7 @@
goto again;
}
no_run:
+ cts_last_ran[hpts->p_num] = tcp_tv_to_usectick(&tv);
/*
* Set flag to tell that we are done for
* any slot input that happens during
@@ -1725,76 +1784,58 @@
if (!TAILQ_EMPTY(&hpts->p_input)) {
tcp_input_data(hpts, &tv);
/*
- * Now did we spend too long running
- * input and need to run more ticks?
+ * Now did we spend too long running input and need to run more ticks?
+ * Note that if wrap_loop_cnt < 2 then we should have the conditions
+		 * in the KASSERTs true. But if the wheel is behind, i.e. wrap_loop_cnt
+		 * is greater than 2, then the conditions most likely are *not* true. Also,
+		 * if we are not called from the callout, we don't run the wheel multiple
+		 * times, so the slots may not align either.
*/
- KASSERT(hpts->p_prev_slot == hpts->p_cur_slot,
+ KASSERT(((hpts->p_prev_slot == hpts->p_cur_slot) ||
+ (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_prev_slot:%u not equal to p_cur_slot:%u", hpts,
hpts->p_prev_slot, hpts->p_cur_slot));
- KASSERT(hpts->p_lasttick == hpts->p_curtick,
+ KASSERT(((hpts->p_lasttick == hpts->p_curtick)
+ || (wrap_loop_cnt >= 2) || (from_callout == 0)),
("H:%p p_lasttick:%u not equal to p_curtick:%u", hpts,
hpts->p_lasttick, hpts->p_curtick));
- hpts->p_curtick = tcp_gethptstick(&tv);
- if (hpts->p_lasttick != hpts->p_curtick) {
+ if (from_callout && (hpts->p_lasttick != hpts->p_curtick)) {
+ hpts->p_curtick = tcp_gethptstick(&tv);
counter_u64_add(hpts_loops, 1);
hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
goto again;
}
}
- {
- uint32_t t = 0, i, fnd = 0;
-
- if ((hpts->p_on_queue_cnt) && (wrap_loop_cnt < 2)) {
- /*
- * Find next slot that is occupied and use that to
- * be the sleep time.
- */
- for (i = 0, t = hpts_tick(hpts->p_cur_slot, 1); i < NUM_OF_HPTSI_SLOTS; i++) {
- if (TAILQ_EMPTY(&hpts->p_hptss[t]) == 0) {
- fnd = 1;
- break;
- }
- t = (t + 1) % NUM_OF_HPTSI_SLOTS;
- }
- if (fnd) {
- hpts->p_hpts_sleep_time = min((i + 1), hpts_sleep_max);
- } else {
-#ifdef INVARIANTS
- panic("Hpts:%p cnt:%d but none found", hpts, hpts->p_on_queue_cnt);
-#endif
- counter_u64_add(back_tosleep, 1);
- hpts->p_on_queue_cnt = 0;
- goto non_found;
- }
- } else if (wrap_loop_cnt >= 2) {
- /* Special case handling */
- hpts->p_hpts_sleep_time = tcp_min_hptsi_time;
- } else {
- /* No one on the wheel sleep for all but 400 slots or sleep max */
- non_found:
- hpts->p_hpts_sleep_time = hpts_sleep_max;
- }
+ if (from_callout) {
+ tcp_hpts_set_max_sleep(hpts, wrap_loop_cnt);
}
+ if (seen_endpoint)
+ return (hpts_slots_diff(slot_pos_of_endpoint, orig_exit_slot));
+ else
+ return (0);
}
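tcp_hptsi() now reports how much of the wheel a run covered: the return value is the slot distance from where the first endpoint was processed to the slot at exit. The hpts_slots_diff() helper lives in an earlier hunk of this diff; as a rough sketch of what a wrap-aware slot distance looks like (the name suffix and edge-case handling here are illustrative, not the committed helper):

static int32_t
hpts_slots_diff_sketch(int32_t prev_slot, int32_t slot_now)
{
	/* Wrap-aware distance walking forward from prev_slot to slot_now. */
	if (slot_now >= prev_slot)
		return (slot_now - prev_slot);
	/* We wrapped past the end of the wheel; add the wheel size back. */
	return ((NUM_OF_HPTSI_SLOTS - prev_slot) + slot_now);
}

The callers below use this count (ticks_ran) to decide whether the pacer should sleep more or less the next time around.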
void
__tcp_set_hpts(struct inpcb *inp, int32_t line)
{
struct tcp_hpts_entry *hpts;
+ int failed;
INP_WLOCK_ASSERT(inp);
hpts = tcp_hpts_lock(inp);
if ((inp->inp_in_hpts == 0) &&
(inp->inp_hpts_cpu_set == 0)) {
- inp->inp_hpts_cpu = hpts_cpuid(inp);
- inp->inp_hpts_cpu_set = 1;
+ inp->inp_hpts_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_hpts_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
hpts = tcp_input_lock(inp);
if ((inp->inp_input_cpu_set == 0) &&
(inp->inp_in_input == 0)) {
- inp->inp_input_cpu = hpts_cpuid(inp);
- inp->inp_input_cpu_set = 1;
+ inp->inp_input_cpu = hpts_cpuid(inp, &failed);
+ if (failed == 0)
+ inp->inp_input_cpu_set = 1;
}
mtx_unlock(&hpts->p_mtx);
}
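Note the new failed out-parameter: the CPU choice is only latched (inp_hpts_cpu_set / inp_input_cpu_set) when hpts_cpuid() actually had enough information to pick one. The selector itself is in an earlier hunk; a hedged sketch of the shape suggested by the inp_irq_cpu fields this diff adds (the preference order and the fallback are assumptions):

static uint16_t
hpts_cpuid_sketch(struct inpcb *inp, int *failed)
{
	*failed = 0;
	/* Assumption: prefer the CPU LRO/the driver recorded interrupts on. */
	if (inp->inp_irq_cpu_set)
		return (inp->inp_irq_cpu);
	/* No affinity information yet; tell the caller we punted. */
	*failed = 1;
	return (inp->inp_hpts_cpu);
}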
@@ -1804,6 +1845,127 @@
return (tcp_pace.rp_ent[inp->inp_hpts_cpu]->p_delayed_by);
}
+static void
+__tcp_run_hpts(struct tcp_hpts_entry *hpts)
+{
+ int ticks_ran;
+
+ if (hpts->p_hpts_active) {
+ /* Already active */
+ return;
+ }
+ if (mtx_trylock(&hpts->p_mtx) == 0) {
+ /* Someone else got the lock */
+ return;
+ }
+ if (hpts->p_hpts_active)
+ goto out_with_mtx;
+ hpts->syscall_cnt++;
+ counter_u64_add(hpts_direct_call, 1);
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 0);
+ /* We may want to adjust the sleep values here */
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (ticks_ran > ticks_indicate_less_sleep) {
+ struct timeval tv;
+ sbintime_t sb;
+ int cpu;
+
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ /* Reschedule with new to value */
+ tcp_hpts_set_max_sleep(hpts, 0);
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ /* Validate it's in the right range */
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make the actual sleep time visible */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ } else if (ticks_ran < ticks_indicate_more_sleep) {
+ /* For a longer sleep, don't reschedule the hpts callout */
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ }
+ hpts->p_on_min_sleep = 1;
+ }
+ hpts->p_hpts_active = 0;
+out_with_mtx:
+ HPTS_MTX_ASSERT(hpts);
+ mtx_unlock(&hpts->p_mtx);
+}
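The sleep adjustment above is multiplicative in both directions: halve p_mysleep when a run covered more than ticks_indicate_less_sleep slots, double it when it covered fewer than ticks_indicate_more_sleep, clamped to [dynamic_min_sleep, dynamic_max_sleep]. The same policy recurs in tcp_hpts_thread() below; isolated as a sketch (names taken from this diff, the helper itself is illustrative):

static void
hpts_adapt_sleep_sketch(struct timeval *mysleep, int ticks_ran)
{
	if (ticks_ran > ticks_indicate_less_sleep) {
		/* Busy: cut the sleep time in half, respecting the floor. */
		mysleep->tv_usec /= 2;
		if (mysleep->tv_usec < dynamic_min_sleep)
			mysleep->tv_usec = dynamic_min_sleep;
	} else if (ticks_ran < ticks_indicate_more_sleep) {
		/* Idle: double the sleep time, respecting the ceiling. */
		mysleep->tv_usec *= 2;
		if (mysleep->tv_usec > dynamic_max_sleep)
			mysleep->tv_usec = dynamic_max_sleep;
	}
}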
+
+static struct tcp_hpts_entry *
+tcp_choose_hpts_to_run(void)
+{
+ int i, oldest_idx;
+ uint32_t cts, time_since_ran, calc;
+
+ if ((hpts_uses_oldest == 0) ||
+ ((hpts_uses_oldest > 1) &&
+ (tcp_pace.rp_ent[(tcp_pace.rp_num_hptss-1)]->p_on_queue_cnt >= hpts_uses_oldest))) {
+ /*
+ * We have either disabled the feature (0), or
+ * we have crossed over the oldest threshold on the
+ * last hpts. We use the last one for simplicity
+ * since we don't want to use the first one (it may
+ * have starting connections that have not settled
+ * on the cpu yet).
+ */
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+ }
+ /* Let's find the oldest hpts to attempt to run */
+ cts = tcp_get_usecs(NULL);
+ time_since_ran = 0;
+ oldest_idx = -1;
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
+ if (TSTMP_GT(cts, cts_last_ran[i]))
+ calc = cts - cts_last_ran[i];
+ else
+ calc = 0;
+ if (calc > time_since_ran) {
+ oldest_idx = i;
+ time_since_ran = calc;
+ }
+ }
+ if (oldest_idx >= 0)
+ return (tcp_pace.rp_ent[oldest_idx]);
+ else
+ return (tcp_pace.rp_ent[(curcpu % tcp_pace.rp_num_hptss)]);
+}
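The scan above ranks pacers by cts - cts_last_ran[i], using TSTMP_GT to keep the comparison well defined across 32-bit wrap. TSTMP_GT is the standard macro from tcp_seq.h, not part of this diff:

#define	TSTMP_GT(a, b)	((int)((a) - (b)) > 0)	/* wrap-safe "a after b" */

For example, with cts = 5 just after a wrap and cts_last_ran[i] = 0xfffffff0, the signed difference is 21, so that pacer is still correctly ranked as the stalest.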
+
+void
+tcp_run_hpts(void)
+{
+ struct tcp_hpts_entry *hpts;
+ struct epoch_tracker et;
+
+ NET_EPOCH_ENTER(et);
+ hpts = tcp_choose_hpts_to_run();
+ __tcp_run_hpts(hpts);
+ NET_EPOCH_EXIT(et);
+}
+
static void
tcp_hpts_thread(void *ctx)
{
@@ -1811,51 +1973,142 @@
struct epoch_tracker et;
struct timeval tv;
sbintime_t sb;
+ int cpu, ticks_ran;
hpts = (struct tcp_hpts_entry *)ctx;
mtx_lock(&hpts->p_mtx);
if (hpts->p_direct_wake) {
- /* Signaled by input */
+ /* Signaled by input or output with low occupancy count. */
callout_stop(&hpts->co);
+ counter_u64_add(hpts_direct_awakening, 1);
} else {
- /* Timed out */
+ /* Timed out, the normal case. */
+ counter_u64_add(hpts_wake_timeout, 1);
if (callout_pending(&hpts->co) ||
!callout_active(&hpts->co)) {
mtx_unlock(&hpts->p_mtx);
return;
}
- callout_deactivate(&hpts->co);
}
+ callout_deactivate(&hpts->co);
hpts->p_hpts_wake_scheduled = 0;
- hpts->p_hpts_active = 1;
NET_EPOCH_ENTER(et);
- tcp_hptsi(hpts);
- NET_EPOCH_EXIT(et);
- HPTS_MTX_ASSERT(hpts);
+ if (hpts->p_hpts_active) {
+ /*
+ * We are active already. This means that a syscall
+ * trap or LRO is running on behalf of hpts. In that case
+ * we need to double our timeout since there seems to be
+ * enough activity in the system that we don't need to
+ * run as often (if we were not directly woken).
+ */
+ tv.tv_sec = 0;
+ if (hpts->p_direct_wake == 0) {
+ counter_u64_add(hpts_back_tosleep, 1);
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /*
+ * Here we have low count on the wheel, but
+ * somehow we still collided with one of the
+ * connections. Let's go back to sleep for a
+ * min sleep time, but clear the flag so we
+ * can be awoken by insert.
+ */
+ hpts->p_on_min_sleep = 0;
+ tv.tv_usec = tcp_min_hptsi_time;
+ }
+ } else {
+ /*
+ * Directly woken most likely to reset the
+ * callout time.
+ */
+ tv.tv_sec = 0;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ }
+ goto back_to_sleep;
+ }
+ hpts->sleeping = 0;
+ hpts->p_hpts_active = 1;
+ ticks_ran = tcp_hptsi(hpts, 1);
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
- if (tcp_min_hptsi_time && (tv.tv_usec < tcp_min_hptsi_time)) {
- hpts->overidden_sleep = tv.tv_usec;
- tv.tv_usec = tcp_min_hptsi_time;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ if (hpts->p_on_queue_cnt >= conn_cnt_thresh) {
+ if (hpts->p_direct_wake == 0) {
+ /*
+ * Only adjust sleep time if we were
+ * called from the callout i.e. direct_wake == 0.
+ */
+ if (ticks_ran < ticks_indicate_more_sleep) {
+ hpts->p_mysleep.tv_usec *= 2;
+ if (hpts->p_mysleep.tv_usec > dynamic_max_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_max_sleep;
+ } else if (ticks_ran > ticks_indicate_less_sleep) {
+ hpts->p_mysleep.tv_usec /= 2;
+ if (hpts->p_mysleep.tv_usec < dynamic_min_sleep)
+ hpts->p_mysleep.tv_usec = dynamic_min_sleep;
+ }
+ }
+ if (tv.tv_usec < hpts->p_mysleep.tv_usec) {
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = hpts->p_mysleep.tv_usec;
+ } else if (tv.tv_usec > dynamic_max_sleep) {
+ /* Let's not let sleep get above this value */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = dynamic_max_sleep;
+ }
+ /*
+ * In this mode the timer is a backstop to
+ * all the userret/lro_flushes so we use
+ * the dynamic value and set the on_min_sleep
+ * flag so we will not be awoken.
+ */
hpts->p_on_min_sleep = 1;
- } else {
- /* Clear the min sleep flag */
- hpts->overidden_sleep = 0;
+ } else if (hpts->p_on_queue_cnt == 0) {
+ /*
+ * No one on the wheel, please wake us up
+ * if you insert on the wheel.
+ */
hpts->p_on_min_sleep = 0;
- }
- hpts->p_hpts_active = 0;
- sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ hpts->overidden_sleep = 0;
} else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
+ /*
+ * We hit here when we have a low number of
+ * clients on the wheel (our else clause).
+ * We may need to go on min sleep; if we set
+ * the flag we will not be awoken if someone
+ * is inserted ahead of us. Clearing the flag
+ * means we can be awoken. This is "old mode"
+ * where the timer is what runs hpts mainly.
+ */
+ if (tv.tv_usec < tcp_min_hptsi_time) {
+ /*
+ * Yes on min sleep, which means
+ * we cannot be awoken.
+ */
+ hpts->overidden_sleep = tv.tv_usec;
+ tv.tv_usec = tcp_min_hptsi_time;
+ hpts->p_on_min_sleep = 1;
+ } else {
+ /* Clear the min sleep flag */
+ hpts->overidden_sleep = 0;
+ hpts->p_on_min_sleep = 0;
+ }
}
+ HPTS_MTX_ASSERT(hpts);
+ hpts->p_hpts_active = 0;
+back_to_sleep:
hpts->p_direct_wake = 0;
+ sb = tvtosbt(tv);
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ /* Store off to make the actual sleep time visible */
+ hpts->sleeping = tv.tv_usec;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
+ NET_EPOCH_EXIT(et);
mtx_unlock(&hpts->p_mtx);
}
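After a run, the thread lands in one of three sleep regimes; condensed as a comment (a paraphrase of the branches above, not a behavioral change):

/*
 * Sleep regimes when tcp_hpts_thread() finishes a run:
 *
 * 1) p_on_queue_cnt >= conn_cnt_thresh: dynamic mode. Clamp the
 *    sleep into [p_mysleep, dynamic_max_sleep] and set
 *    p_on_min_sleep; the userret/lro_flush paths drive the wheel
 *    and the timer is only a backstop.
 * 2) p_on_queue_cnt == 0: empty wheel. Clear p_on_min_sleep so
 *    the first insert wakes us immediately.
 * 3) Low occupancy: legacy mode. Enforce tcp_min_hptsi_time,
 *    setting or clearing p_on_min_sleep accordingly.
 */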
@@ -1873,7 +2126,7 @@
cpuset_t cs;
char unit[16];
uint32_t ncpus = mp_ncpus ? mp_ncpus : MAXCPU;
- int count, domain;
+ int count, domain, cpu;
tcp_pace.rp_proc = NULL;
tcp_pace.rp_num_hptss = ncpus;
@@ -1882,8 +2135,18 @@
back_tosleep = counter_u64_alloc(M_WAITOK);
combined_wheel_wrap = counter_u64_alloc(M_WAITOK);
wheel_wrap = counter_u64_alloc(M_WAITOK);
+ hpts_wake_timeout = counter_u64_alloc(M_WAITOK);
+ hpts_direct_awakening = counter_u64_alloc(M_WAITOK);
+ hpts_back_tosleep = counter_u64_alloc(M_WAITOK);
+ hpts_direct_call = counter_u64_alloc(M_WAITOK);
+ cpu_uses_flowid = counter_u64_alloc(M_WAITOK);
+ cpu_uses_random = counter_u64_alloc(M_WAITOK);
+
sz = (tcp_pace.rp_num_hptss * sizeof(struct tcp_hpts_entry *));
tcp_pace.rp_ent = malloc(sz, M_TCPHPTS, M_WAITOK | M_ZERO);
+ sz = (sizeof(uint32_t) * tcp_pace.rp_num_hptss);
+ cts_last_ran = malloc(sz, M_TCPHPTS, M_WAITOK);
asz = sizeof(struct hptsh) * NUM_OF_HPTSI_SLOTS;
for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
tcp_pace.rp_ent[i] = malloc(sizeof(struct tcp_hpts_entry),
@@ -1933,19 +2196,41 @@
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "runtick", CTLFLAG_RD,
- &hpts->p_runningtick, 0,
+ &hpts->p_runningslot, 0,
"What the running pacers current slot is");
SYSCTL_ADD_UINT(&hpts->hpts_ctx,
SYSCTL_CHILDREN(hpts->hpts_root),
OID_AUTO, "curtick", CTLFLAG_RD,
&hpts->p_curtick, 0,
"What the running pacers last tick mapped to the wheel was");
+ SYSCTL_ADD_UINT(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "lastran", CTLFLAG_RD,
+ &cts_last_ran[i], 0,
+ "The last usec tick that this hpts ran");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "cur_min_sleep", CTLFLAG_RD,
+ &hpts->p_mysleep.tv_usec, 0,
+ "What the running pacers is using for p_mysleep.tv_usec");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "now_sleeping", CTLFLAG_RD,
+ &hpts->sleeping, 0,
+ "What the running pacers is actually sleeping for");
+ SYSCTL_ADD_U64(&hpts->hpts_ctx,
+ SYSCTL_CHILDREN(hpts->hpts_root),
+ OID_AUTO, "syscall_cnt", CTLFLAG_RD,
+ &hpts->syscall_cnt, 0,
+ "How many times we had syscalls on this hpts");
+
hpts->p_hpts_sleep_time = hpts_sleep_max;
hpts->p_num = i;
hpts->p_curtick = tcp_gethptstick(&tv);
+ cts_last_ran[i] = tcp_tv_to_usectick(&tv);
hpts->p_prev_slot = hpts->p_cur_slot = tick_to_wheel(hpts->p_curtick);
hpts->p_cpu = 0xffff;
- hpts->p_nxt_slot = hpts_tick(hpts->p_cur_slot, 1);
+ hpts->p_nxt_slot = hpts_slot(hpts->p_cur_slot, 1);
callout_init(&hpts->co, 1);
}
@@ -1956,17 +2241,18 @@
/*
* Now lets start ithreads to handle the hptss.
*/
- CPU_FOREACH(i) {
+ for (i = 0; i < tcp_pace.rp_num_hptss; i++) {
hpts = tcp_pace.rp_ent[i];
hpts->p_cpu = i;
error = swi_add(&hpts->ie, "hpts",
tcp_hpts_thread, (void *)hpts,
SWI_NET, INTR_MPSAFE, &hpts->ie_cookie);
- if (error) {
- panic("Can't add hpts:%p i:%d err:%d",
- hpts, i, error);
- }
+ KASSERT(error == 0,
+ ("Can't add hpts:%p i:%d err:%d",
+ hpts, i, error));
created++;
+ hpts->p_mysleep.tv_sec = 0;
+ hpts->p_mysleep.tv_usec = tcp_min_hptsi_time;
if (tcp_bind_threads == 1) {
if (intr_event_bind(hpts->ie, i) == 0)
bound++;
@@ -1983,18 +2269,13 @@
}
}
tv.tv_sec = 0;
- tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_USEC;
+ tv.tv_usec = hpts->p_hpts_sleep_time * HPTS_TICKS_PER_SLOT;
+ hpts->sleeping = tv.tv_usec;
sb = tvtosbt(tv);
- if (tcp_hpts_callout_skip_swi == 0) {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_swi, hpts, hpts->p_cpu,
- (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
- } else {
- callout_reset_sbt_on(&hpts->co, sb, 0,
- hpts_timeout_dir, hpts,
- hpts->p_cpu,
- C_PREL(tcp_hpts_precision));
- }
+ cpu = (tcp_bind_threads || hpts_use_assigned_cpu) ? hpts->p_cpu : curcpu;
+ callout_reset_sbt_on(&hpts->co, sb, 0,
+ hpts_timeout_swi, hpts, cpu,
+ (C_DIRECT_EXEC | C_PREL(tcp_hpts_precision)));
}
/*
* If we somehow have an empty domain, fall back to choosing
@@ -2006,11 +2287,13 @@
break;
}
}
-
printf("TCP Hpts created %d swi interrupt threads and bound %d to %s\n",
created, bound,
tcp_bind_threads == 2 ? "NUMA domains" : "cpus");
+#ifdef INVARIANTS
+ printf("HPTS is in INVARIANT mode!!\n");
+#endif
}
-SYSINIT(tcphptsi, SI_SUB_KTHREAD_IDLE, SI_ORDER_ANY, tcp_init_hptsi, NULL);
+SYSINIT(tcphptsi, SI_SUB_SOFTINTR, SI_ORDER_ANY, tcp_init_hptsi, NULL);
MODULE_VERSION(tcphpts, 1);
Index: sys/netinet/tcp_lro.h
===================================================================
--- sys/netinet/tcp_lro.h
+++ sys/netinet/tcp_lro.h
@@ -56,6 +56,11 @@
#define TSTMP_LRO 0x0100
#define TSTMP_HDWR 0x0200
#define HAS_TSTMP 0x0400
+/*
+ * Default number of interrupts on the same cpu in a row
+ * that will cause us to declare an "affinity cpu".
+ */
+#define TCP_LRO_CPU_DECLARATION_THRESH 50
struct inpcb;
@@ -162,12 +167,15 @@
unsigned lro_mbuf_count;
unsigned lro_mbuf_max;
unsigned short lro_ackcnt_lim; /* max # of aggregated ACKs */
+ unsigned short lro_cpu; /* Guess at the cpu we have affinity to */
unsigned lro_length_lim; /* max len of aggregated data */
-
u_long lro_hashsz;
+ uint32_t lro_last_cpu;
+ uint32_t lro_cnt_of_same_cpu;
struct lro_head *lro_hash;
struct lro_head lro_active;
struct lro_head lro_free;
+ uint8_t lro_cpu_is_set; /* Flag to say it's ok to set the CPU on the inp */
};
struct tcp_ackent {
Index: sys/netinet/tcp_lro.c
===================================================================
--- sys/netinet/tcp_lro.c
+++ sys/netinet/tcp_lro.c
@@ -107,6 +107,11 @@
CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_entries, 0,
"default number of LRO entries");
+static uint32_t tcp_lro_cpu_set_thresh = TCP_LRO_CPU_DECLARATION_THRESH;
+SYSCTL_UINT(_net_inet_tcp_lro, OID_AUTO, lro_cpu_threshold,
+ CTLFLAG_RDTUN | CTLFLAG_MPSAFE, &tcp_lro_cpu_set_thresh, 0,
+ "Number of interrups in a row on the same CPU that will make us declare an 'affinity' cpu?");
+
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, fullqueue, CTLFLAG_RD,
&tcp_inp_lro_direct_queue, "Number of lro's fully queued to transport");
SYSCTL_COUNTER_U64(_net_inet_tcp_lro, OID_AUTO, wokeup, CTLFLAG_RD,
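Since lro_cpu_threshold is declared CTLFLAG_RDTUN, it is a boot-time tunable rather than a runtime knob; for example (the value shown is illustrative, the default being TCP_LRO_CPU_DECLARATION_THRESH, i.e. 50):

# /boot/loader.conf
net.inet.tcp.lro.lro_cpu_threshold="100"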
@@ -631,12 +636,13 @@
log.u_bbr.rttProp = le->m_head->m_pkthdr.rcv_tstmp;
}
log.u_bbr.inflight = th_seq;
+ log.u_bbr.delivered = th_ack;
log.u_bbr.timeStamp = cts;
log.u_bbr.epoch = le->next_seq;
- log.u_bbr.delivered = th_ack;
log.u_bbr.lt_epoch = le->ack_seq;
log.u_bbr.pacing_gain = th_win;
log.u_bbr.cwnd_gain = le->window;
+ log.u_bbr.lost = curcpu;
log.u_bbr.cur_del_rate = (uintptr_t)m;
log.u_bbr.bw_inuse = (uintptr_t)le->m_head;
bintime2timeval(&lc->lro_last_queue_time, &btv);
@@ -1273,7 +1279,10 @@
INP_WUNLOCK(inp);
return (TCP_LRO_CANNOT);
}
-
+ if ((inp->inp_irq_cpu_set == 0) && (lc->lro_cpu_is_set == 1)) {
+ inp->inp_irq_cpu = lc->lro_last_cpu;
+ inp->inp_irq_cpu_set = 1;
+ }
/* Check if the transport doesn't support the needed optimizations. */
if ((inp->inp_flags2 & (INP_SUPPORTS_MBUFQ | INP_MBUF_ACKCMP)) == 0) {
INP_WUNLOCK(inp);
@@ -1445,7 +1454,17 @@
/* check if no mbufs to flush */
if (lc->lro_mbuf_count == 0)
goto done;
-
+ if (lc->lro_cpu_is_set == 0) {
+ if (lc->lro_last_cpu == curcpu) {
+ lc->lro_cnt_of_same_cpu++;
+ /* Have we reached the threshold to declare a cpu? */
+ if (lc->lro_cnt_of_same_cpu > tcp_lro_cpu_set_thresh)
+ lc->lro_cpu_is_set = 1;
+ } else {
+ lc->lro_last_cpu = curcpu;
+ lc->lro_cnt_of_same_cpu = 0;
+ }
+ }
CURVNET_SET(lc->ifp->if_vnet);
/* get current time */
@@ -1486,6 +1505,9 @@
/* flush active streams */
tcp_lro_rx_done(lc);
+#ifdef TCPHPTS
+ tcp_run_hpts();
+#endif
lc->lro_mbuf_count = 0;
}
Index: sys/netinet/tcp_stacks/bbr.c
===================================================================
--- sys/netinet/tcp_stacks/bbr.c
+++ sys/netinet/tcp_stacks/bbr.c
@@ -2429,10 +2429,10 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
Index: sys/netinet/tcp_stacks/rack.c
===================================================================
--- sys/netinet/tcp_stacks/rack.c
+++ sys/netinet/tcp_stacks/rack.c
@@ -5609,11 +5609,11 @@
log.u_bbr.pkts_out = diag->co_ret;
log.u_bbr.applimited = diag->hpts_sleep_time;
log.u_bbr.delivered = diag->p_prev_slot;
- log.u_bbr.inflight = diag->p_runningtick;
- log.u_bbr.bw_inuse = diag->wheel_tick;
+ log.u_bbr.inflight = diag->p_runningslot;
+ log.u_bbr.bw_inuse = diag->wheel_slot;
log.u_bbr.rttProp = diag->wheel_cts;
log.u_bbr.timeStamp = cts;
- log.u_bbr.delRate = diag->maxticks;
+ log.u_bbr.delRate = diag->maxslots;
log.u_bbr.cur_del_rate = diag->p_curtick;
log.u_bbr.cur_del_rate <<= 32;
log.u_bbr.cur_del_rate |= diag->p_lasttick;
@@ -5707,22 +5707,22 @@
* on the clock. We always have a min
* 10 slots (10 x 10 i.e. 100 usecs).
*/
- if (slot <= HPTS_TICKS_PER_USEC) {
+ if (slot <= HPTS_TICKS_PER_SLOT) {
/* We gain delay */
- rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_USEC - slot);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed += (HPTS_TICKS_PER_SLOT - slot);
+ slot = HPTS_TICKS_PER_SLOT;
} else {
/* We take off some */
- rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_USEC);
- slot = HPTS_TICKS_PER_USEC;
+ rack->r_ctl.rc_agg_delayed -= (slot - HPTS_TICKS_PER_SLOT);
+ slot = HPTS_TICKS_PER_SLOT;
}
} else {
slot -= rack->r_ctl.rc_agg_delayed;
rack->r_ctl.rc_agg_delayed = 0;
/* Make sure we have 100 useconds at minimum */
- if (slot < HPTS_TICKS_PER_USEC) {
- rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_USEC - slot;
- slot = HPTS_TICKS_PER_USEC;
+ if (slot < HPTS_TICKS_PER_SLOT) {
+ rack->r_ctl.rc_agg_delayed = HPTS_TICKS_PER_SLOT - slot;
+ slot = HPTS_TICKS_PER_SLOT;
}
if (rack->r_ctl.rc_agg_delayed == 0)
rack->r_late = 0;
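A worked example of the delay accounting above (the outer condition that selects between these two branches sits outside the visible hunk):

/*
 * With HPTS_TICKS_PER_SLOT == 10:
 *
 *   Carrying delay, request slot = 6:  6 <= 10, so we bank
 *   (10 - 6) = 4 more delay and pace at the floor of 10.
 *
 *   Carrying delay = 30, request slot = 25:  25 > 10, so we work
 *   (25 - 10) = 15 off the debt (rc_agg_delayed becomes 15) and
 *   still pace at 10.
 *
 *   Carrying delay = 4, request slot = 25 (else branch): slot
 *   becomes 21 and the debt clears; 21 >= 10, so no floor
 *   adjustment is needed and r_late can be cleared.
 */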